1
0
Fork 0
remarkable-linux/net/rxrpc/ar-internal.h

1211 lines
38 KiB
C
Raw Normal View History

/* AF_RXRPC internal definitions
*
* Copyright (C) 2007 Red Hat, Inc. All Rights Reserved.
* Written by David Howells (dhowells@redhat.com)
*
* This program is free software; you can redistribute it and/or
* modify it under the terms of the GNU General Public License
* as published by the Free Software Foundation; either version
* 2 of the License, or (at your option) any later version.
*/
#include <linux/atomic.h>
#include <linux/seqlock.h>
#include <net/net_namespace.h>
#include <net/netns/generic.h>
#include <net/sock.h>
#include <net/af_rxrpc.h>
#include "protocol.h"
#if 0
#define CHECK_SLAB_OKAY(X) \
BUG_ON(atomic_read((X)) >> (sizeof(atomic_t) - 2) == \
(POISON_FREE << 8 | POISON_FREE))
#else
#define CHECK_SLAB_OKAY(X) do {} while (0)
#endif
#define FCRYPT_BSIZE 8
struct rxrpc_crypt {
union {
u8 x[FCRYPT_BSIZE];
__be32 n[2];
};
} __attribute__((aligned(8)));
[AF_RXRPC]: Add an interface to the AF_RXRPC module for the AFS filesystem to use Add an interface to the AF_RXRPC module so that the AFS filesystem module can more easily make use of the services available. AFS still opens a socket but then uses the action functions in lieu of sendmsg() and registers an intercept functions to grab messages before they're queued on the socket Rx queue. This permits AFS (or whatever) to: (1) Avoid the overhead of using the recvmsg() call. (2) Use different keys directly on individual client calls on one socket rather than having to open a whole slew of sockets, one for each key it might want to use. (3) Avoid calling request_key() at the point of issue of a call or opening of a socket. This is done instead by AFS at the point of open(), unlink() or other VFS operation and the key handed through. (4) Request the use of something other than GFP_KERNEL to allocate memory. Furthermore: (*) The socket buffer markings used by RxRPC are made available for AFS so that it can interpret the cooked RxRPC messages itself. (*) rxgen (un)marshalling abort codes are made available. The following documentation for the kernel interface is added to Documentation/networking/rxrpc.txt: ========================= AF_RXRPC KERNEL INTERFACE ========================= The AF_RXRPC module also provides an interface for use by in-kernel utilities such as the AFS filesystem. This permits such a utility to: (1) Use different keys directly on individual client calls on one socket rather than having to open a whole slew of sockets, one for each key it might want to use. (2) Avoid having RxRPC call request_key() at the point of issue of a call or opening of a socket. Instead the utility is responsible for requesting a key at the appropriate point. AFS, for instance, would do this during VFS operations such as open() or unlink(). The key is then handed through when the call is initiated. (3) Request the use of something other than GFP_KERNEL to allocate memory. (4) Avoid the overhead of using the recvmsg() call. RxRPC messages can be intercepted before they get put into the socket Rx queue and the socket buffers manipulated directly. To use the RxRPC facility, a kernel utility must still open an AF_RXRPC socket, bind an addess as appropriate and listen if it's to be a server socket, but then it passes this to the kernel interface functions. The kernel interface functions are as follows: (*) Begin a new client call. struct rxrpc_call * rxrpc_kernel_begin_call(struct socket *sock, struct sockaddr_rxrpc *srx, struct key *key, unsigned long user_call_ID, gfp_t gfp); This allocates the infrastructure to make a new RxRPC call and assigns call and connection numbers. The call will be made on the UDP port that the socket is bound to. The call will go to the destination address of a connected client socket unless an alternative is supplied (srx is non-NULL). If a key is supplied then this will be used to secure the call instead of the key bound to the socket with the RXRPC_SECURITY_KEY sockopt. Calls secured in this way will still share connections if at all possible. The user_call_ID is equivalent to that supplied to sendmsg() in the control data buffer. It is entirely feasible to use this to point to a kernel data structure. If this function is successful, an opaque reference to the RxRPC call is returned. The caller now holds a reference on this and it must be properly ended. (*) End a client call. void rxrpc_kernel_end_call(struct rxrpc_call *call); This is used to end a previously begun call. The user_call_ID is expunged from AF_RXRPC's knowledge and will not be seen again in association with the specified call. (*) Send data through a call. int rxrpc_kernel_send_data(struct rxrpc_call *call, struct msghdr *msg, size_t len); This is used to supply either the request part of a client call or the reply part of a server call. msg.msg_iovlen and msg.msg_iov specify the data buffers to be used. msg_iov may not be NULL and must point exclusively to in-kernel virtual addresses. msg.msg_flags may be given MSG_MORE if there will be subsequent data sends for this call. The msg must not specify a destination address, control data or any flags other than MSG_MORE. len is the total amount of data to transmit. (*) Abort a call. void rxrpc_kernel_abort_call(struct rxrpc_call *call, u32 abort_code); This is used to abort a call if it's still in an abortable state. The abort code specified will be placed in the ABORT message sent. (*) Intercept received RxRPC messages. typedef void (*rxrpc_interceptor_t)(struct sock *sk, unsigned long user_call_ID, struct sk_buff *skb); void rxrpc_kernel_intercept_rx_messages(struct socket *sock, rxrpc_interceptor_t interceptor); This installs an interceptor function on the specified AF_RXRPC socket. All messages that would otherwise wind up in the socket's Rx queue are then diverted to this function. Note that care must be taken to process the messages in the right order to maintain DATA message sequentiality. The interceptor function itself is provided with the address of the socket and handling the incoming message, the ID assigned by the kernel utility to the call and the socket buffer containing the message. The skb->mark field indicates the type of message: MARK MEANING =============================== ======================================= RXRPC_SKB_MARK_DATA Data message RXRPC_SKB_MARK_FINAL_ACK Final ACK received for an incoming call RXRPC_SKB_MARK_BUSY Client call rejected as server busy RXRPC_SKB_MARK_REMOTE_ABORT Call aborted by peer RXRPC_SKB_MARK_NET_ERROR Network error detected RXRPC_SKB_MARK_LOCAL_ERROR Local error encountered RXRPC_SKB_MARK_NEW_CALL New incoming call awaiting acceptance The remote abort message can be probed with rxrpc_kernel_get_abort_code(). The two error messages can be probed with rxrpc_kernel_get_error_number(). A new call can be accepted with rxrpc_kernel_accept_call(). Data messages can have their contents extracted with the usual bunch of socket buffer manipulation functions. A data message can be determined to be the last one in a sequence with rxrpc_kernel_is_data_last(). When a data message has been used up, rxrpc_kernel_data_delivered() should be called on it.. Non-data messages should be handled to rxrpc_kernel_free_skb() to dispose of. It is possible to get extra refs on all types of message for later freeing, but this may pin the state of a call until the message is finally freed. (*) Accept an incoming call. struct rxrpc_call * rxrpc_kernel_accept_call(struct socket *sock, unsigned long user_call_ID); This is used to accept an incoming call and to assign it a call ID. This function is similar to rxrpc_kernel_begin_call() and calls accepted must be ended in the same way. If this function is successful, an opaque reference to the RxRPC call is returned. The caller now holds a reference on this and it must be properly ended. (*) Reject an incoming call. int rxrpc_kernel_reject_call(struct socket *sock); This is used to reject the first incoming call on the socket's queue with a BUSY message. -ENODATA is returned if there were no incoming calls. Other errors may be returned if the call had been aborted (-ECONNABORTED) or had timed out (-ETIME). (*) Record the delivery of a data message and free it. void rxrpc_kernel_data_delivered(struct sk_buff *skb); This is used to record a data message as having been delivered and to update the ACK state for the call. The socket buffer will be freed. (*) Free a message. void rxrpc_kernel_free_skb(struct sk_buff *skb); This is used to free a non-DATA socket buffer intercepted from an AF_RXRPC socket. (*) Determine if a data message is the last one on a call. bool rxrpc_kernel_is_data_last(struct sk_buff *skb); This is used to determine if a socket buffer holds the last data message to be received for a call (true will be returned if it does, false if not). The data message will be part of the reply on a client call and the request on an incoming call. In the latter case there will be more messages, but in the former case there will not. (*) Get the abort code from an abort message. u32 rxrpc_kernel_get_abort_code(struct sk_buff *skb); This is used to extract the abort code from a remote abort message. (*) Get the error number from a local or network error message. int rxrpc_kernel_get_error_number(struct sk_buff *skb); This is used to extract the error number from a message indicating either a local error occurred or a network error occurred. Signed-off-by: David Howells <dhowells@redhat.com> Signed-off-by: David S. Miller <davem@davemloft.net>
2007-04-26 16:50:17 -06:00
#define rxrpc_queue_work(WS) queue_work(rxrpc_workqueue, (WS))
#define rxrpc_queue_delayed_work(WS,D) \
queue_delayed_work(rxrpc_workqueue, (WS), (D))
struct rxrpc_connection;
rxrpc: Don't expose skbs to in-kernel users [ver #2] Don't expose skbs to in-kernel users, such as the AFS filesystem, but instead provide a notification hook the indicates that a call needs attention and another that indicates that there's a new call to be collected. This makes the following possibilities more achievable: (1) Call refcounting can be made simpler if skbs don't hold refs to calls. (2) skbs referring to non-data events will be able to be freed much sooner rather than being queued for AFS to pick up as rxrpc_kernel_recv_data will be able to consult the call state. (3) We can shortcut the receive phase when a call is remotely aborted because we don't have to go through all the packets to get to the one cancelling the operation. (4) It makes it easier to do encryption/decryption directly between AFS's buffers and sk_buffs. (5) Encryption/decryption can more easily be done in the AFS's thread contexts - usually that of the userspace process that issued a syscall - rather than in one of rxrpc's background threads on a workqueue. (6) AFS will be able to wait synchronously on a call inside AF_RXRPC. To make this work, the following interface function has been added: int rxrpc_kernel_recv_data( struct socket *sock, struct rxrpc_call *call, void *buffer, size_t bufsize, size_t *_offset, bool want_more, u32 *_abort_code); This is the recvmsg equivalent. It allows the caller to find out about the state of a specific call and to transfer received data into a buffer piecemeal. afs_extract_data() and rxrpc_kernel_recv_data() now do all the extraction logic between them. They don't wait synchronously yet because the socket lock needs to be dealt with. Five interface functions have been removed: rxrpc_kernel_is_data_last() rxrpc_kernel_get_abort_code() rxrpc_kernel_get_error_number() rxrpc_kernel_free_skb() rxrpc_kernel_data_consumed() As a temporary hack, sk_buffs going to an in-kernel call are queued on the rxrpc_call struct (->knlrecv_queue) rather than being handed over to the in-kernel user. To process the queue internally, a temporary function, temp_deliver_data() has been added. This will be replaced with common code between the rxrpc_recvmsg() path and the kernel_rxrpc_recv_data() path in a future patch. Signed-off-by: David Howells <dhowells@redhat.com> Signed-off-by: David S. Miller <davem@davemloft.net>
2016-08-30 13:42:14 -06:00
/*
* Mark applied to socket buffers.
*/
enum rxrpc_skb_mark {
RXRPC_SKB_MARK_DATA, /* data message */
RXRPC_SKB_MARK_FINAL_ACK, /* final ACK received message */
RXRPC_SKB_MARK_BUSY, /* server busy message */
RXRPC_SKB_MARK_REMOTE_ABORT, /* remote abort message */
RXRPC_SKB_MARK_LOCAL_ABORT, /* local abort message */
RXRPC_SKB_MARK_NET_ERROR, /* network error message */
RXRPC_SKB_MARK_LOCAL_ERROR, /* local error message */
RXRPC_SKB_MARK_NEW_CALL, /* local error message */
};
/*
* sk_state for RxRPC sockets
*/
enum {
RXRPC_UNBOUND = 0,
RXRPC_CLIENT_UNBOUND, /* Unbound socket used as client */
RXRPC_CLIENT_BOUND, /* client local address bound */
RXRPC_SERVER_BOUND, /* server local address bound */
RXRPC_SERVER_BOUND2, /* second server local address bound */
RXRPC_SERVER_LISTENING, /* server listening for connections */
RXRPC_SERVER_LISTEN_DISABLED, /* server listening disabled */
RXRPC_CLOSE, /* socket is being closed */
};
/*
* Per-network namespace data.
*/
struct rxrpc_net {
struct proc_dir_entry *proc_net; /* Subdir in /proc/net */
u32 epoch; /* Local epoch for detecting local-end reset */
struct list_head calls; /* List of calls active in this namespace */
rwlock_t call_lock; /* Lock for ->calls */
struct list_head conn_proc_list; /* List of conns in this namespace for proc */
struct list_head service_conns; /* Service conns in this namespace */
rwlock_t conn_lock; /* Lock for ->conn_proc_list, ->service_conns */
struct delayed_work service_conn_reaper;
unsigned int nr_client_conns;
unsigned int nr_active_client_conns;
bool kill_all_client_conns;
bool live;
spinlock_t client_conn_cache_lock; /* Lock for ->*_client_conns */
spinlock_t client_conn_discard_lock; /* Prevent multiple discarders */
struct list_head waiting_client_conns;
struct list_head active_client_conns;
struct list_head idle_client_conns;
struct delayed_work client_conn_reaper;
struct list_head local_endpoints;
struct mutex local_mutex; /* Lock for ->local_endpoints */
spinlock_t peer_hash_lock; /* Lock for ->peer_hash */
DECLARE_HASHTABLE (peer_hash, 10);
};
rxrpc: Preallocate peers, conns and calls for incoming service requests Make it possible for the data_ready handler called from the UDP transport socket to completely instantiate an rxrpc_call structure and make it immediately live by preallocating all the memory it might need. The idea is to cut out the background thread usage as much as possible. [Note that the preallocated structs are not actually used in this patch - that will be done in a future patch.] If insufficient resources are available in the preallocation buffers, it will be possible to discard the DATA packet in the data_ready handler or schedule a BUSY packet without the need to schedule an attempt at allocation in a background thread. To this end: (1) Preallocate rxrpc_peer, rxrpc_connection and rxrpc_call structs to a maximum number each of the listen backlog size. The backlog size is limited to a maxmimum of 32. Only this many of each can be in the preallocation buffer. (2) For userspace sockets, the preallocation is charged initially by listen() and will be recharged by accepting or rejecting pending new incoming calls. (3) For kernel services {,re,dis}charging of the preallocation buffers is handled manually. Two notifier callbacks have to be provided before kernel_listen() is invoked: (a) An indication that a new call has been instantiated. This can be used to trigger background recharging. (b) An indication that a call is being discarded. This is used when the socket is being released. A function, rxrpc_kernel_charge_accept() is called by the kernel service to preallocate a single call. It should be passed the user ID to be used for that call and a callback to associate the rxrpc call with the kernel service's side of the ID. (4) Discard the preallocation when the socket is closed. (5) Temporarily bump the refcount on the call allocated in rxrpc_incoming_call() so that rxrpc_release_call() can ditch the preallocation ref on service calls unconditionally. This will no longer be necessary once the preallocation is used. Note that this does not yet control the number of active service calls on a client - that will come in a later patch. A future development would be to provide a setsockopt() call that allows a userspace server to manually charge the preallocation buffer. This would allow user call IDs to be provided in advance and the awkward manual accept stage to be bypassed. Signed-off-by: David Howells <dhowells@redhat.com>
2016-09-08 04:10:12 -06:00
/*
* Service backlog preallocation.
*
* This contains circular buffers of preallocated peers, connections and calls
* for incoming service calls and their head and tail pointers. This allows
* calls to be set up in the data_ready handler, thereby avoiding the need to
* shuffle packets around so much.
*/
struct rxrpc_backlog {
unsigned short peer_backlog_head;
unsigned short peer_backlog_tail;
unsigned short conn_backlog_head;
unsigned short conn_backlog_tail;
unsigned short call_backlog_head;
unsigned short call_backlog_tail;
#define RXRPC_BACKLOG_MAX 32
struct rxrpc_peer *peer_backlog[RXRPC_BACKLOG_MAX];
struct rxrpc_connection *conn_backlog[RXRPC_BACKLOG_MAX];
struct rxrpc_call *call_backlog[RXRPC_BACKLOG_MAX];
};
/*
* RxRPC socket definition
*/
struct rxrpc_sock {
/* WARNING: sk has to be the first member */
struct sock sk;
rxrpc: Don't expose skbs to in-kernel users [ver #2] Don't expose skbs to in-kernel users, such as the AFS filesystem, but instead provide a notification hook the indicates that a call needs attention and another that indicates that there's a new call to be collected. This makes the following possibilities more achievable: (1) Call refcounting can be made simpler if skbs don't hold refs to calls. (2) skbs referring to non-data events will be able to be freed much sooner rather than being queued for AFS to pick up as rxrpc_kernel_recv_data will be able to consult the call state. (3) We can shortcut the receive phase when a call is remotely aborted because we don't have to go through all the packets to get to the one cancelling the operation. (4) It makes it easier to do encryption/decryption directly between AFS's buffers and sk_buffs. (5) Encryption/decryption can more easily be done in the AFS's thread contexts - usually that of the userspace process that issued a syscall - rather than in one of rxrpc's background threads on a workqueue. (6) AFS will be able to wait synchronously on a call inside AF_RXRPC. To make this work, the following interface function has been added: int rxrpc_kernel_recv_data( struct socket *sock, struct rxrpc_call *call, void *buffer, size_t bufsize, size_t *_offset, bool want_more, u32 *_abort_code); This is the recvmsg equivalent. It allows the caller to find out about the state of a specific call and to transfer received data into a buffer piecemeal. afs_extract_data() and rxrpc_kernel_recv_data() now do all the extraction logic between them. They don't wait synchronously yet because the socket lock needs to be dealt with. Five interface functions have been removed: rxrpc_kernel_is_data_last() rxrpc_kernel_get_abort_code() rxrpc_kernel_get_error_number() rxrpc_kernel_free_skb() rxrpc_kernel_data_consumed() As a temporary hack, sk_buffs going to an in-kernel call are queued on the rxrpc_call struct (->knlrecv_queue) rather than being handed over to the in-kernel user. To process the queue internally, a temporary function, temp_deliver_data() has been added. This will be replaced with common code between the rxrpc_recvmsg() path and the kernel_rxrpc_recv_data() path in a future patch. Signed-off-by: David Howells <dhowells@redhat.com> Signed-off-by: David S. Miller <davem@davemloft.net>
2016-08-30 13:42:14 -06:00
rxrpc_notify_new_call_t notify_new_call; /* Func to notify of new call */
rxrpc: Preallocate peers, conns and calls for incoming service requests Make it possible for the data_ready handler called from the UDP transport socket to completely instantiate an rxrpc_call structure and make it immediately live by preallocating all the memory it might need. The idea is to cut out the background thread usage as much as possible. [Note that the preallocated structs are not actually used in this patch - that will be done in a future patch.] If insufficient resources are available in the preallocation buffers, it will be possible to discard the DATA packet in the data_ready handler or schedule a BUSY packet without the need to schedule an attempt at allocation in a background thread. To this end: (1) Preallocate rxrpc_peer, rxrpc_connection and rxrpc_call structs to a maximum number each of the listen backlog size. The backlog size is limited to a maxmimum of 32. Only this many of each can be in the preallocation buffer. (2) For userspace sockets, the preallocation is charged initially by listen() and will be recharged by accepting or rejecting pending new incoming calls. (3) For kernel services {,re,dis}charging of the preallocation buffers is handled manually. Two notifier callbacks have to be provided before kernel_listen() is invoked: (a) An indication that a new call has been instantiated. This can be used to trigger background recharging. (b) An indication that a call is being discarded. This is used when the socket is being released. A function, rxrpc_kernel_charge_accept() is called by the kernel service to preallocate a single call. It should be passed the user ID to be used for that call and a callback to associate the rxrpc call with the kernel service's side of the ID. (4) Discard the preallocation when the socket is closed. (5) Temporarily bump the refcount on the call allocated in rxrpc_incoming_call() so that rxrpc_release_call() can ditch the preallocation ref on service calls unconditionally. This will no longer be necessary once the preallocation is used. Note that this does not yet control the number of active service calls on a client - that will come in a later patch. A future development would be to provide a setsockopt() call that allows a userspace server to manually charge the preallocation buffer. This would allow user call IDs to be provided in advance and the awkward manual accept stage to be bypassed. Signed-off-by: David Howells <dhowells@redhat.com>
2016-09-08 04:10:12 -06:00
rxrpc_discard_new_call_t discard_new_call; /* Func to discard a new call */
struct rxrpc_local *local; /* local endpoint */
rxrpc: Preallocate peers, conns and calls for incoming service requests Make it possible for the data_ready handler called from the UDP transport socket to completely instantiate an rxrpc_call structure and make it immediately live by preallocating all the memory it might need. The idea is to cut out the background thread usage as much as possible. [Note that the preallocated structs are not actually used in this patch - that will be done in a future patch.] If insufficient resources are available in the preallocation buffers, it will be possible to discard the DATA packet in the data_ready handler or schedule a BUSY packet without the need to schedule an attempt at allocation in a background thread. To this end: (1) Preallocate rxrpc_peer, rxrpc_connection and rxrpc_call structs to a maximum number each of the listen backlog size. The backlog size is limited to a maxmimum of 32. Only this many of each can be in the preallocation buffer. (2) For userspace sockets, the preallocation is charged initially by listen() and will be recharged by accepting or rejecting pending new incoming calls. (3) For kernel services {,re,dis}charging of the preallocation buffers is handled manually. Two notifier callbacks have to be provided before kernel_listen() is invoked: (a) An indication that a new call has been instantiated. This can be used to trigger background recharging. (b) An indication that a call is being discarded. This is used when the socket is being released. A function, rxrpc_kernel_charge_accept() is called by the kernel service to preallocate a single call. It should be passed the user ID to be used for that call and a callback to associate the rxrpc call with the kernel service's side of the ID. (4) Discard the preallocation when the socket is closed. (5) Temporarily bump the refcount on the call allocated in rxrpc_incoming_call() so that rxrpc_release_call() can ditch the preallocation ref on service calls unconditionally. This will no longer be necessary once the preallocation is used. Note that this does not yet control the number of active service calls on a client - that will come in a later patch. A future development would be to provide a setsockopt() call that allows a userspace server to manually charge the preallocation buffer. This would allow user call IDs to be provided in advance and the awkward manual accept stage to be bypassed. Signed-off-by: David Howells <dhowells@redhat.com>
2016-09-08 04:10:12 -06:00
struct rxrpc_backlog *backlog; /* Preallocation for services */
rxrpc: Rewrite the data and ack handling code Rewrite the data and ack handling code such that: (1) Parsing of received ACK and ABORT packets and the distribution and the filing of DATA packets happens entirely within the data_ready context called from the UDP socket. This allows us to process and discard ACK and ABORT packets much more quickly (they're no longer stashed on a queue for a background thread to process). (2) We avoid calling skb_clone(), pskb_pull() and pskb_trim(). We instead keep track of the offset and length of the content of each packet in the sk_buff metadata. This means we don't do any allocation in the receive path. (3) Jumbo DATA packet parsing is now done in data_ready context. Rather than cloning the packet once for each subpacket and pulling/trimming it, we file the packet multiple times with an annotation for each indicating which subpacket is there. From that we can directly calculate the offset and length. (4) A call's receive queue can be accessed without taking locks (memory barriers do have to be used, though). (5) Incoming calls are set up from preallocated resources and immediately made live. They can than have packets queued upon them and ACKs generated. If insufficient resources exist, DATA packet #1 is given a BUSY reply and other DATA packets are discarded). (6) sk_buffs no longer take a ref on their parent call. To make this work, the following changes are made: (1) Each call's receive buffer is now a circular buffer of sk_buff pointers (rxtx_buffer) rather than a number of sk_buff_heads spread between the call and the socket. This permits each sk_buff to be in the buffer multiple times. The receive buffer is reused for the transmit buffer. (2) A circular buffer of annotations (rxtx_annotations) is kept parallel to the data buffer. Transmission phase annotations indicate whether a buffered packet has been ACK'd or not and whether it needs retransmission. Receive phase annotations indicate whether a slot holds a whole packet or a jumbo subpacket and, if the latter, which subpacket. They also note whether the packet has been decrypted in place. (3) DATA packet window tracking is much simplified. Each phase has just two numbers representing the window (rx_hard_ack/rx_top and tx_hard_ack/tx_top). The hard_ack number is the sequence number before base of the window, representing the last packet the other side says it has consumed. hard_ack starts from 0 and the first packet is sequence number 1. The top number is the sequence number of the highest-numbered packet residing in the buffer. Packets between hard_ack+1 and top are soft-ACK'd to indicate they've been received, but not yet consumed. Four macros, before(), before_eq(), after() and after_eq() are added to compare sequence numbers within the window. This allows for the top of the window to wrap when the hard-ack sequence number gets close to the limit. Two flags, RXRPC_CALL_RX_LAST and RXRPC_CALL_TX_LAST, are added also to indicate when rx_top and tx_top point at the packets with the LAST_PACKET bit set, indicating the end of the phase. (4) Calls are queued on the socket 'receive queue' rather than packets. This means that we don't need have to invent dummy packets to queue to indicate abnormal/terminal states and we don't have to keep metadata packets (such as ABORTs) around (5) The offset and length of a (sub)packet's content are now passed to the verify_packet security op. This is currently expected to decrypt the packet in place and validate it. However, there's now nowhere to store the revised offset and length of the actual data within the decrypted blob (there may be a header and padding to skip) because an sk_buff may represent multiple packets, so a locate_data security op is added to retrieve these details from the sk_buff content when needed. (6) recvmsg() now has to handle jumbo subpackets, where each subpacket is individually secured and needs to be individually decrypted. The code to do this is broken out into rxrpc_recvmsg_data() and shared with the kernel API. It now iterates over the call's receive buffer rather than walking the socket receive queue. Additional changes: (1) The timers are condensed to a single timer that is set for the soonest of three timeouts (delayed ACK generation, DATA retransmission and call lifespan). (2) Transmission of ACK and ABORT packets is effected immediately from process-context socket ops/kernel API calls that cause them instead of them being punted off to a background work item. The data_ready handler still has to defer to the background, though. (3) A shutdown op is added to the AF_RXRPC socket so that the AFS filesystem can shut down the socket and flush its own work items before closing the socket to deal with any in-progress service calls. Future additional changes that will need to be considered: (1) Make sure that a call doesn't hog the front of the queue by receiving data from the network as fast as userspace is consuming it to the exclusion of other calls. (2) Transmit delayed ACKs from within recvmsg() when we've consumed sufficiently more packets to avoid the background work item needing to run. Signed-off-by: David Howells <dhowells@redhat.com>
2016-09-08 04:10:12 -06:00
spinlock_t incoming_lock; /* Incoming call vs service shutdown lock */
struct list_head sock_calls; /* List of calls owned by this socket */
struct list_head to_be_accepted; /* calls awaiting acceptance */
struct list_head recvmsg_q; /* Calls awaiting recvmsg's attention */
rwlock_t recvmsg_lock; /* Lock for recvmsg_q */
struct key *key; /* security for this socket */
struct key *securities; /* list of server security descriptors */
rxrpc: Preallocate peers, conns and calls for incoming service requests Make it possible for the data_ready handler called from the UDP transport socket to completely instantiate an rxrpc_call structure and make it immediately live by preallocating all the memory it might need. The idea is to cut out the background thread usage as much as possible. [Note that the preallocated structs are not actually used in this patch - that will be done in a future patch.] If insufficient resources are available in the preallocation buffers, it will be possible to discard the DATA packet in the data_ready handler or schedule a BUSY packet without the need to schedule an attempt at allocation in a background thread. To this end: (1) Preallocate rxrpc_peer, rxrpc_connection and rxrpc_call structs to a maximum number each of the listen backlog size. The backlog size is limited to a maxmimum of 32. Only this many of each can be in the preallocation buffer. (2) For userspace sockets, the preallocation is charged initially by listen() and will be recharged by accepting or rejecting pending new incoming calls. (3) For kernel services {,re,dis}charging of the preallocation buffers is handled manually. Two notifier callbacks have to be provided before kernel_listen() is invoked: (a) An indication that a new call has been instantiated. This can be used to trigger background recharging. (b) An indication that a call is being discarded. This is used when the socket is being released. A function, rxrpc_kernel_charge_accept() is called by the kernel service to preallocate a single call. It should be passed the user ID to be used for that call and a callback to associate the rxrpc call with the kernel service's side of the ID. (4) Discard the preallocation when the socket is closed. (5) Temporarily bump the refcount on the call allocated in rxrpc_incoming_call() so that rxrpc_release_call() can ditch the preallocation ref on service calls unconditionally. This will no longer be necessary once the preallocation is used. Note that this does not yet control the number of active service calls on a client - that will come in a later patch. A future development would be to provide a setsockopt() call that allows a userspace server to manually charge the preallocation buffer. This would allow user call IDs to be provided in advance and the awkward manual accept stage to be bypassed. Signed-off-by: David Howells <dhowells@redhat.com>
2016-09-08 04:10:12 -06:00
struct rb_root calls; /* User ID -> call mapping */
unsigned long flags;
#define RXRPC_SOCK_CONNECTED 0 /* connect_srx is set */
rwlock_t call_lock; /* lock for calls */
u32 min_sec_level; /* minimum security level */
#define RXRPC_SECURITY_MAX RXRPC_SECURITY_ENCRYPT
bool exclusive; /* Exclusive connection for a client socket */
u16 second_service; /* Additional service bound to the endpoint */
rxrpc: Implement service upgrade Implement AuriStor's service upgrade facility. There are three problems that this is meant to deal with: (1) Various of the standard AFS RPC calls have IPv4 addresses in their requests and/or replies - but there's no room for including IPv6 addresses. (2) Definition of IPv6-specific RPC operations in the standard operation sets has not yet been achieved. (3) One could envision the creation a new service on the same port that as the original service. The new service could implement improved operations - and the client could try this first, falling back to the original service if it's not there. Unfortunately, certain servers ignore packets addressed to a service they don't implement and don't respond in any way - not even with an ABORT. This means that the client must then wait for the call timeout to occur. What service upgrade does is to see if the connection is marked as being 'upgradeable' and if so, change the service ID in the server and thus the request and reply formats. Note that the upgrade isn't mandatory - a server that supports only the original call set will ignore the upgrade request. In the protocol, the procedure is then as follows: (1) To request an upgrade, the first DATA packet in a new connection must have the userStatus set to 1 (this is normally 0). The userStatus value is normally ignored by the server. (2) If the server doesn't support upgrading, the reply packets will contain the same service ID as for the first request packet. (3) If the server does support upgrading, all future reply packets on that connection will contain the new service ID and the new service ID will be applied to *all* further calls on that connection as well. (4) The RPC op used to probe the upgrade must take the same request data as the shadow call in the upgrade set (but may return a different reply). GetCapability RPC ops were added to all standard sets for just this purpose. Ops where the request formats differ cannot be used for probing. (5) The client must wait for completion of the probe before sending any further RPC ops to the same destination. It should then use the service ID that recvmsg() reported back in all future calls. (6) The shadow service must have call definitions for all the operation IDs defined by the original service. To support service upgrading, a server should: (1) Call bind() twice on its AF_RXRPC socket before calling listen(). Each bind() should supply a different service ID, but the transport addresses must be the same. This allows the server to receive requests with either service ID. (2) Enable automatic upgrading by calling setsockopt(), specifying RXRPC_UPGRADEABLE_SERVICE and passing in a two-member array of unsigned shorts as the argument: unsigned short optval[2]; This specifies a pair of service IDs. They must be different and must match the service IDs bound to the socket. Member 0 is the service ID to upgrade from and member 1 is the service ID to upgrade to. Signed-off-by: David Howells <dhowells@redhat.com>
2017-06-05 07:30:49 -06:00
struct {
/* Service upgrade information */
u16 from; /* Service ID to upgrade (if not 0) */
u16 to; /* service ID to upgrade to */
} service_upgrade;
sa_family_t family; /* Protocol family created with */
rxrpc: Implement service upgrade Implement AuriStor's service upgrade facility. There are three problems that this is meant to deal with: (1) Various of the standard AFS RPC calls have IPv4 addresses in their requests and/or replies - but there's no room for including IPv6 addresses. (2) Definition of IPv6-specific RPC operations in the standard operation sets has not yet been achieved. (3) One could envision the creation a new service on the same port that as the original service. The new service could implement improved operations - and the client could try this first, falling back to the original service if it's not there. Unfortunately, certain servers ignore packets addressed to a service they don't implement and don't respond in any way - not even with an ABORT. This means that the client must then wait for the call timeout to occur. What service upgrade does is to see if the connection is marked as being 'upgradeable' and if so, change the service ID in the server and thus the request and reply formats. Note that the upgrade isn't mandatory - a server that supports only the original call set will ignore the upgrade request. In the protocol, the procedure is then as follows: (1) To request an upgrade, the first DATA packet in a new connection must have the userStatus set to 1 (this is normally 0). The userStatus value is normally ignored by the server. (2) If the server doesn't support upgrading, the reply packets will contain the same service ID as for the first request packet. (3) If the server does support upgrading, all future reply packets on that connection will contain the new service ID and the new service ID will be applied to *all* further calls on that connection as well. (4) The RPC op used to probe the upgrade must take the same request data as the shadow call in the upgrade set (but may return a different reply). GetCapability RPC ops were added to all standard sets for just this purpose. Ops where the request formats differ cannot be used for probing. (5) The client must wait for completion of the probe before sending any further RPC ops to the same destination. It should then use the service ID that recvmsg() reported back in all future calls. (6) The shadow service must have call definitions for all the operation IDs defined by the original service. To support service upgrading, a server should: (1) Call bind() twice on its AF_RXRPC socket before calling listen(). Each bind() should supply a different service ID, but the transport addresses must be the same. This allows the server to receive requests with either service ID. (2) Enable automatic upgrading by calling setsockopt(), specifying RXRPC_UPGRADEABLE_SERVICE and passing in a two-member array of unsigned shorts as the argument: unsigned short optval[2]; This specifies a pair of service IDs. They must be different and must match the service IDs bound to the socket. Member 0 is the service ID to upgrade from and member 1 is the service ID to upgrade to. Signed-off-by: David Howells <dhowells@redhat.com>
2017-06-05 07:30:49 -06:00
struct sockaddr_rxrpc srx; /* Primary Service/local addresses */
struct sockaddr_rxrpc connect_srx; /* Default client address from connect() */
};
#define rxrpc_sk(__sk) container_of((__sk), struct rxrpc_sock, sk)
/*
* CPU-byteorder normalised Rx packet header.
*/
struct rxrpc_host_header {
u32 epoch; /* client boot timestamp */
u32 cid; /* connection and channel ID */
u32 callNumber; /* call ID (0 for connection-level packets) */
u32 seq; /* sequence number of pkt in call stream */
u32 serial; /* serial number of pkt sent to network */
u8 type; /* packet type */
u8 flags; /* packet flags */
u8 userStatus; /* app-layer defined status */
u8 securityIndex; /* security protocol ID */
union {
u16 _rsvd; /* reserved */
u16 cksum; /* kerberos security checksum */
};
u16 serviceId; /* service ID */
} __packed;
/*
* RxRPC socket buffer private variables
* - max 48 bytes (struct sk_buff::cb)
*/
struct rxrpc_skb_priv {
rxrpc: Rewrite the data and ack handling code Rewrite the data and ack handling code such that: (1) Parsing of received ACK and ABORT packets and the distribution and the filing of DATA packets happens entirely within the data_ready context called from the UDP socket. This allows us to process and discard ACK and ABORT packets much more quickly (they're no longer stashed on a queue for a background thread to process). (2) We avoid calling skb_clone(), pskb_pull() and pskb_trim(). We instead keep track of the offset and length of the content of each packet in the sk_buff metadata. This means we don't do any allocation in the receive path. (3) Jumbo DATA packet parsing is now done in data_ready context. Rather than cloning the packet once for each subpacket and pulling/trimming it, we file the packet multiple times with an annotation for each indicating which subpacket is there. From that we can directly calculate the offset and length. (4) A call's receive queue can be accessed without taking locks (memory barriers do have to be used, though). (5) Incoming calls are set up from preallocated resources and immediately made live. They can than have packets queued upon them and ACKs generated. If insufficient resources exist, DATA packet #1 is given a BUSY reply and other DATA packets are discarded). (6) sk_buffs no longer take a ref on their parent call. To make this work, the following changes are made: (1) Each call's receive buffer is now a circular buffer of sk_buff pointers (rxtx_buffer) rather than a number of sk_buff_heads spread between the call and the socket. This permits each sk_buff to be in the buffer multiple times. The receive buffer is reused for the transmit buffer. (2) A circular buffer of annotations (rxtx_annotations) is kept parallel to the data buffer. Transmission phase annotations indicate whether a buffered packet has been ACK'd or not and whether it needs retransmission. Receive phase annotations indicate whether a slot holds a whole packet or a jumbo subpacket and, if the latter, which subpacket. They also note whether the packet has been decrypted in place. (3) DATA packet window tracking is much simplified. Each phase has just two numbers representing the window (rx_hard_ack/rx_top and tx_hard_ack/tx_top). The hard_ack number is the sequence number before base of the window, representing the last packet the other side says it has consumed. hard_ack starts from 0 and the first packet is sequence number 1. The top number is the sequence number of the highest-numbered packet residing in the buffer. Packets between hard_ack+1 and top are soft-ACK'd to indicate they've been received, but not yet consumed. Four macros, before(), before_eq(), after() and after_eq() are added to compare sequence numbers within the window. This allows for the top of the window to wrap when the hard-ack sequence number gets close to the limit. Two flags, RXRPC_CALL_RX_LAST and RXRPC_CALL_TX_LAST, are added also to indicate when rx_top and tx_top point at the packets with the LAST_PACKET bit set, indicating the end of the phase. (4) Calls are queued on the socket 'receive queue' rather than packets. This means that we don't need have to invent dummy packets to queue to indicate abnormal/terminal states and we don't have to keep metadata packets (such as ABORTs) around (5) The offset and length of a (sub)packet's content are now passed to the verify_packet security op. This is currently expected to decrypt the packet in place and validate it. However, there's now nowhere to store the revised offset and length of the actual data within the decrypted blob (there may be a header and padding to skip) because an sk_buff may represent multiple packets, so a locate_data security op is added to retrieve these details from the sk_buff content when needed. (6) recvmsg() now has to handle jumbo subpackets, where each subpacket is individually secured and needs to be individually decrypted. The code to do this is broken out into rxrpc_recvmsg_data() and shared with the kernel API. It now iterates over the call's receive buffer rather than walking the socket receive queue. Additional changes: (1) The timers are condensed to a single timer that is set for the soonest of three timeouts (delayed ACK generation, DATA retransmission and call lifespan). (2) Transmission of ACK and ABORT packets is effected immediately from process-context socket ops/kernel API calls that cause them instead of them being punted off to a background work item. The data_ready handler still has to defer to the background, though. (3) A shutdown op is added to the AF_RXRPC socket so that the AFS filesystem can shut down the socket and flush its own work items before closing the socket to deal with any in-progress service calls. Future additional changes that will need to be considered: (1) Make sure that a call doesn't hog the front of the queue by receiving data from the network as fast as userspace is consuming it to the exclusion of other calls. (2) Transmit delayed ACKs from within recvmsg() when we've consumed sufficiently more packets to avoid the background work item needing to run. Signed-off-by: David Howells <dhowells@redhat.com>
2016-09-08 04:10:12 -06:00
union {
u8 nr_jumbo; /* Number of jumbo subpackets */
rxrpc: Rewrite the data and ack handling code Rewrite the data and ack handling code such that: (1) Parsing of received ACK and ABORT packets and the distribution and the filing of DATA packets happens entirely within the data_ready context called from the UDP socket. This allows us to process and discard ACK and ABORT packets much more quickly (they're no longer stashed on a queue for a background thread to process). (2) We avoid calling skb_clone(), pskb_pull() and pskb_trim(). We instead keep track of the offset and length of the content of each packet in the sk_buff metadata. This means we don't do any allocation in the receive path. (3) Jumbo DATA packet parsing is now done in data_ready context. Rather than cloning the packet once for each subpacket and pulling/trimming it, we file the packet multiple times with an annotation for each indicating which subpacket is there. From that we can directly calculate the offset and length. (4) A call's receive queue can be accessed without taking locks (memory barriers do have to be used, though). (5) Incoming calls are set up from preallocated resources and immediately made live. They can than have packets queued upon them and ACKs generated. If insufficient resources exist, DATA packet #1 is given a BUSY reply and other DATA packets are discarded). (6) sk_buffs no longer take a ref on their parent call. To make this work, the following changes are made: (1) Each call's receive buffer is now a circular buffer of sk_buff pointers (rxtx_buffer) rather than a number of sk_buff_heads spread between the call and the socket. This permits each sk_buff to be in the buffer multiple times. The receive buffer is reused for the transmit buffer. (2) A circular buffer of annotations (rxtx_annotations) is kept parallel to the data buffer. Transmission phase annotations indicate whether a buffered packet has been ACK'd or not and whether it needs retransmission. Receive phase annotations indicate whether a slot holds a whole packet or a jumbo subpacket and, if the latter, which subpacket. They also note whether the packet has been decrypted in place. (3) DATA packet window tracking is much simplified. Each phase has just two numbers representing the window (rx_hard_ack/rx_top and tx_hard_ack/tx_top). The hard_ack number is the sequence number before base of the window, representing the last packet the other side says it has consumed. hard_ack starts from 0 and the first packet is sequence number 1. The top number is the sequence number of the highest-numbered packet residing in the buffer. Packets between hard_ack+1 and top are soft-ACK'd to indicate they've been received, but not yet consumed. Four macros, before(), before_eq(), after() and after_eq() are added to compare sequence numbers within the window. This allows for the top of the window to wrap when the hard-ack sequence number gets close to the limit. Two flags, RXRPC_CALL_RX_LAST and RXRPC_CALL_TX_LAST, are added also to indicate when rx_top and tx_top point at the packets with the LAST_PACKET bit set, indicating the end of the phase. (4) Calls are queued on the socket 'receive queue' rather than packets. This means that we don't need have to invent dummy packets to queue to indicate abnormal/terminal states and we don't have to keep metadata packets (such as ABORTs) around (5) The offset and length of a (sub)packet's content are now passed to the verify_packet security op. This is currently expected to decrypt the packet in place and validate it. However, there's now nowhere to store the revised offset and length of the actual data within the decrypted blob (there may be a header and padding to skip) because an sk_buff may represent multiple packets, so a locate_data security op is added to retrieve these details from the sk_buff content when needed. (6) recvmsg() now has to handle jumbo subpackets, where each subpacket is individually secured and needs to be individually decrypted. The code to do this is broken out into rxrpc_recvmsg_data() and shared with the kernel API. It now iterates over the call's receive buffer rather than walking the socket receive queue. Additional changes: (1) The timers are condensed to a single timer that is set for the soonest of three timeouts (delayed ACK generation, DATA retransmission and call lifespan). (2) Transmission of ACK and ABORT packets is effected immediately from process-context socket ops/kernel API calls that cause them instead of them being punted off to a background work item. The data_ready handler still has to defer to the background, though. (3) A shutdown op is added to the AF_RXRPC socket so that the AFS filesystem can shut down the socket and flush its own work items before closing the socket to deal with any in-progress service calls. Future additional changes that will need to be considered: (1) Make sure that a call doesn't hog the front of the queue by receiving data from the network as fast as userspace is consuming it to the exclusion of other calls. (2) Transmit delayed ACKs from within recvmsg() when we've consumed sufficiently more packets to avoid the background work item needing to run. Signed-off-by: David Howells <dhowells@redhat.com>
2016-09-08 04:10:12 -06:00
};
union {
int remain; /* amount of space remaining for next write */
};
struct rxrpc_host_header hdr; /* RxRPC packet header from this packet */
};
#define rxrpc_skb(__skb) ((struct rxrpc_skb_priv *) &(__skb)->cb)
/*
* RxRPC security module interface
*/
struct rxrpc_security {
const char *name; /* name of this service */
u8 security_index; /* security type provided */
/* Initialise a security service */
int (*init)(void);
/* Clean up a security service */
void (*exit)(void);
/* initialise a connection's security */
int (*init_connection_security)(struct rxrpc_connection *);
/* prime a connection's packet security */
int (*prime_packet_security)(struct rxrpc_connection *);
/* impose security on a packet */
int (*secure_packet)(struct rxrpc_call *,
struct sk_buff *,
size_t,
void *);
/* verify the security on a received packet */
int (*verify_packet)(struct rxrpc_call *, struct sk_buff *,
rxrpc: Rewrite the data and ack handling code Rewrite the data and ack handling code such that: (1) Parsing of received ACK and ABORT packets and the distribution and the filing of DATA packets happens entirely within the data_ready context called from the UDP socket. This allows us to process and discard ACK and ABORT packets much more quickly (they're no longer stashed on a queue for a background thread to process). (2) We avoid calling skb_clone(), pskb_pull() and pskb_trim(). We instead keep track of the offset and length of the content of each packet in the sk_buff metadata. This means we don't do any allocation in the receive path. (3) Jumbo DATA packet parsing is now done in data_ready context. Rather than cloning the packet once for each subpacket and pulling/trimming it, we file the packet multiple times with an annotation for each indicating which subpacket is there. From that we can directly calculate the offset and length. (4) A call's receive queue can be accessed without taking locks (memory barriers do have to be used, though). (5) Incoming calls are set up from preallocated resources and immediately made live. They can than have packets queued upon them and ACKs generated. If insufficient resources exist, DATA packet #1 is given a BUSY reply and other DATA packets are discarded). (6) sk_buffs no longer take a ref on their parent call. To make this work, the following changes are made: (1) Each call's receive buffer is now a circular buffer of sk_buff pointers (rxtx_buffer) rather than a number of sk_buff_heads spread between the call and the socket. This permits each sk_buff to be in the buffer multiple times. The receive buffer is reused for the transmit buffer. (2) A circular buffer of annotations (rxtx_annotations) is kept parallel to the data buffer. Transmission phase annotations indicate whether a buffered packet has been ACK'd or not and whether it needs retransmission. Receive phase annotations indicate whether a slot holds a whole packet or a jumbo subpacket and, if the latter, which subpacket. They also note whether the packet has been decrypted in place. (3) DATA packet window tracking is much simplified. Each phase has just two numbers representing the window (rx_hard_ack/rx_top and tx_hard_ack/tx_top). The hard_ack number is the sequence number before base of the window, representing the last packet the other side says it has consumed. hard_ack starts from 0 and the first packet is sequence number 1. The top number is the sequence number of the highest-numbered packet residing in the buffer. Packets between hard_ack+1 and top are soft-ACK'd to indicate they've been received, but not yet consumed. Four macros, before(), before_eq(), after() and after_eq() are added to compare sequence numbers within the window. This allows for the top of the window to wrap when the hard-ack sequence number gets close to the limit. Two flags, RXRPC_CALL_RX_LAST and RXRPC_CALL_TX_LAST, are added also to indicate when rx_top and tx_top point at the packets with the LAST_PACKET bit set, indicating the end of the phase. (4) Calls are queued on the socket 'receive queue' rather than packets. This means that we don't need have to invent dummy packets to queue to indicate abnormal/terminal states and we don't have to keep metadata packets (such as ABORTs) around (5) The offset and length of a (sub)packet's content are now passed to the verify_packet security op. This is currently expected to decrypt the packet in place and validate it. However, there's now nowhere to store the revised offset and length of the actual data within the decrypted blob (there may be a header and padding to skip) because an sk_buff may represent multiple packets, so a locate_data security op is added to retrieve these details from the sk_buff content when needed. (6) recvmsg() now has to handle jumbo subpackets, where each subpacket is individually secured and needs to be individually decrypted. The code to do this is broken out into rxrpc_recvmsg_data() and shared with the kernel API. It now iterates over the call's receive buffer rather than walking the socket receive queue. Additional changes: (1) The timers are condensed to a single timer that is set for the soonest of three timeouts (delayed ACK generation, DATA retransmission and call lifespan). (2) Transmission of ACK and ABORT packets is effected immediately from process-context socket ops/kernel API calls that cause them instead of them being punted off to a background work item. The data_ready handler still has to defer to the background, though. (3) A shutdown op is added to the AF_RXRPC socket so that the AFS filesystem can shut down the socket and flush its own work items before closing the socket to deal with any in-progress service calls. Future additional changes that will need to be considered: (1) Make sure that a call doesn't hog the front of the queue by receiving data from the network as fast as userspace is consuming it to the exclusion of other calls. (2) Transmit delayed ACKs from within recvmsg() when we've consumed sufficiently more packets to avoid the background work item needing to run. Signed-off-by: David Howells <dhowells@redhat.com>
2016-09-08 04:10:12 -06:00
unsigned int, unsigned int, rxrpc_seq_t, u16);
/* Locate the data in a received packet that has been verified. */
void (*locate_data)(struct rxrpc_call *, struct sk_buff *,
unsigned int *, unsigned int *);
/* issue a challenge */
int (*issue_challenge)(struct rxrpc_connection *);
/* respond to a challenge */
int (*respond_to_challenge)(struct rxrpc_connection *,
struct sk_buff *,
u32 *);
/* verify a response */
int (*verify_response)(struct rxrpc_connection *,
struct sk_buff *,
u32 *);
/* clear connection security */
void (*clear)(struct rxrpc_connection *);
};
/*
rxrpc: Rework local endpoint management Rework the local RxRPC endpoint management. Local endpoint objects are maintained in a flat list as before. This should be okay as there shouldn't be more than one per open AF_RXRPC socket (there can be fewer as local endpoints can be shared if their local service ID is 0 and they share the same local transport parameters). Changes: (1) Local endpoints may now only be shared if they have local service ID 0 (ie. they're not being used for listening). This prevents a scenario where process A is listening of the Cache Manager port and process B contacts a fileserver - which may then attempt to send CM requests back to B. But if A and B are sharing a local endpoint, A will get the CM requests meant for B. (2) We use a mutex to handle lookups and don't provide RCU-only lookups since we only expect to access the list when opening a socket or destroying an endpoint. The local endpoint object is pointed to by the transport socket's sk_user_data for the life of the transport socket - allowing us to refer to it directly from the sk_data_ready and sk_error_report callbacks. (3) atomic_inc_not_zero() now exists and can be used to only share a local endpoint if the last reference hasn't yet gone. (4) We can remove rxrpc_local_lock - a spinlock that had to be taken with BH processing disabled given that we assume sk_user_data won't change under us. (5) The transport socket is shut down before we clear the sk_user_data pointer so that we can be sure that the transport socket's callbacks won't be invoked once the RCU destruction is scheduled. (6) Local endpoints have a work item that handles both destruction and event processing. The means that destruction doesn't then need to wait for event processing. The event queues can then be cleared after the transport socket is shut down. (7) Local endpoints are no longer available for resurrection beyond the life of the sockets that had them open. As soon as their last ref goes, they are scheduled for destruction and may not have their usage count moved from 0. Signed-off-by: David Howells <dhowells@redhat.com>
2016-04-04 07:00:35 -06:00
* RxRPC local transport endpoint description
* - owned by a single AF_RXRPC socket
* - pointed to by transport socket struct sk_user_data
*/
struct rxrpc_local {
rxrpc: Rework local endpoint management Rework the local RxRPC endpoint management. Local endpoint objects are maintained in a flat list as before. This should be okay as there shouldn't be more than one per open AF_RXRPC socket (there can be fewer as local endpoints can be shared if their local service ID is 0 and they share the same local transport parameters). Changes: (1) Local endpoints may now only be shared if they have local service ID 0 (ie. they're not being used for listening). This prevents a scenario where process A is listening of the Cache Manager port and process B contacts a fileserver - which may then attempt to send CM requests back to B. But if A and B are sharing a local endpoint, A will get the CM requests meant for B. (2) We use a mutex to handle lookups and don't provide RCU-only lookups since we only expect to access the list when opening a socket or destroying an endpoint. The local endpoint object is pointed to by the transport socket's sk_user_data for the life of the transport socket - allowing us to refer to it directly from the sk_data_ready and sk_error_report callbacks. (3) atomic_inc_not_zero() now exists and can be used to only share a local endpoint if the last reference hasn't yet gone. (4) We can remove rxrpc_local_lock - a spinlock that had to be taken with BH processing disabled given that we assume sk_user_data won't change under us. (5) The transport socket is shut down before we clear the sk_user_data pointer so that we can be sure that the transport socket's callbacks won't be invoked once the RCU destruction is scheduled. (6) Local endpoints have a work item that handles both destruction and event processing. The means that destruction doesn't then need to wait for event processing. The event queues can then be cleared after the transport socket is shut down. (7) Local endpoints are no longer available for resurrection beyond the life of the sockets that had them open. As soon as their last ref goes, they are scheduled for destruction and may not have their usage count moved from 0. Signed-off-by: David Howells <dhowells@redhat.com>
2016-04-04 07:00:35 -06:00
struct rcu_head rcu;
atomic_t usage;
struct rxrpc_net *rxnet; /* The network ns in which this resides */
rxrpc: Rework local endpoint management Rework the local RxRPC endpoint management. Local endpoint objects are maintained in a flat list as before. This should be okay as there shouldn't be more than one per open AF_RXRPC socket (there can be fewer as local endpoints can be shared if their local service ID is 0 and they share the same local transport parameters). Changes: (1) Local endpoints may now only be shared if they have local service ID 0 (ie. they're not being used for listening). This prevents a scenario where process A is listening of the Cache Manager port and process B contacts a fileserver - which may then attempt to send CM requests back to B. But if A and B are sharing a local endpoint, A will get the CM requests meant for B. (2) We use a mutex to handle lookups and don't provide RCU-only lookups since we only expect to access the list when opening a socket or destroying an endpoint. The local endpoint object is pointed to by the transport socket's sk_user_data for the life of the transport socket - allowing us to refer to it directly from the sk_data_ready and sk_error_report callbacks. (3) atomic_inc_not_zero() now exists and can be used to only share a local endpoint if the last reference hasn't yet gone. (4) We can remove rxrpc_local_lock - a spinlock that had to be taken with BH processing disabled given that we assume sk_user_data won't change under us. (5) The transport socket is shut down before we clear the sk_user_data pointer so that we can be sure that the transport socket's callbacks won't be invoked once the RCU destruction is scheduled. (6) Local endpoints have a work item that handles both destruction and event processing. The means that destruction doesn't then need to wait for event processing. The event queues can then be cleared after the transport socket is shut down. (7) Local endpoints are no longer available for resurrection beyond the life of the sockets that had them open. As soon as their last ref goes, they are scheduled for destruction and may not have their usage count moved from 0. Signed-off-by: David Howells <dhowells@redhat.com>
2016-04-04 07:00:35 -06:00
struct list_head link;
struct socket *socket; /* my UDP socket */
rxrpc: Rework local endpoint management Rework the local RxRPC endpoint management. Local endpoint objects are maintained in a flat list as before. This should be okay as there shouldn't be more than one per open AF_RXRPC socket (there can be fewer as local endpoints can be shared if their local service ID is 0 and they share the same local transport parameters). Changes: (1) Local endpoints may now only be shared if they have local service ID 0 (ie. they're not being used for listening). This prevents a scenario where process A is listening of the Cache Manager port and process B contacts a fileserver - which may then attempt to send CM requests back to B. But if A and B are sharing a local endpoint, A will get the CM requests meant for B. (2) We use a mutex to handle lookups and don't provide RCU-only lookups since we only expect to access the list when opening a socket or destroying an endpoint. The local endpoint object is pointed to by the transport socket's sk_user_data for the life of the transport socket - allowing us to refer to it directly from the sk_data_ready and sk_error_report callbacks. (3) atomic_inc_not_zero() now exists and can be used to only share a local endpoint if the last reference hasn't yet gone. (4) We can remove rxrpc_local_lock - a spinlock that had to be taken with BH processing disabled given that we assume sk_user_data won't change under us. (5) The transport socket is shut down before we clear the sk_user_data pointer so that we can be sure that the transport socket's callbacks won't be invoked once the RCU destruction is scheduled. (6) Local endpoints have a work item that handles both destruction and event processing. The means that destruction doesn't then need to wait for event processing. The event queues can then be cleared after the transport socket is shut down. (7) Local endpoints are no longer available for resurrection beyond the life of the sockets that had them open. As soon as their last ref goes, they are scheduled for destruction and may not have their usage count moved from 0. Signed-off-by: David Howells <dhowells@redhat.com>
2016-04-04 07:00:35 -06:00
struct work_struct processor;
struct rxrpc_sock __rcu *service; /* Service(s) listening on this endpoint */
struct rw_semaphore defrag_sem; /* control re-enablement of IP DF bit */
struct sk_buff_head reject_queue; /* packets awaiting rejection */
struct sk_buff_head event_queue; /* endpoint event packets awaiting processing */
struct rb_root client_conns; /* Client connections by socket params */
spinlock_t client_conns_lock; /* Lock for client_conns */
spinlock_t lock; /* access lock */
rwlock_t services_lock; /* lock for services list */
int debug_id; /* debug ID for printks */
rxrpc: Rework local endpoint management Rework the local RxRPC endpoint management. Local endpoint objects are maintained in a flat list as before. This should be okay as there shouldn't be more than one per open AF_RXRPC socket (there can be fewer as local endpoints can be shared if their local service ID is 0 and they share the same local transport parameters). Changes: (1) Local endpoints may now only be shared if they have local service ID 0 (ie. they're not being used for listening). This prevents a scenario where process A is listening of the Cache Manager port and process B contacts a fileserver - which may then attempt to send CM requests back to B. But if A and B are sharing a local endpoint, A will get the CM requests meant for B. (2) We use a mutex to handle lookups and don't provide RCU-only lookups since we only expect to access the list when opening a socket or destroying an endpoint. The local endpoint object is pointed to by the transport socket's sk_user_data for the life of the transport socket - allowing us to refer to it directly from the sk_data_ready and sk_error_report callbacks. (3) atomic_inc_not_zero() now exists and can be used to only share a local endpoint if the last reference hasn't yet gone. (4) We can remove rxrpc_local_lock - a spinlock that had to be taken with BH processing disabled given that we assume sk_user_data won't change under us. (5) The transport socket is shut down before we clear the sk_user_data pointer so that we can be sure that the transport socket's callbacks won't be invoked once the RCU destruction is scheduled. (6) Local endpoints have a work item that handles both destruction and event processing. The means that destruction doesn't then need to wait for event processing. The event queues can then be cleared after the transport socket is shut down. (7) Local endpoints are no longer available for resurrection beyond the life of the sockets that had them open. As soon as their last ref goes, they are scheduled for destruction and may not have their usage count moved from 0. Signed-off-by: David Howells <dhowells@redhat.com>
2016-04-04 07:00:35 -06:00
bool dead;
bool service_closed; /* Service socket closed */
struct sockaddr_rxrpc srx; /* local address */
};
/*
* RxRPC remote transport endpoint definition
* - matched by local endpoint, remote port, address and protocol type
*/
struct rxrpc_peer {
struct rcu_head rcu; /* This must be first */
atomic_t usage;
unsigned long hash_key;
struct hlist_node hash_link;
struct rxrpc_local *local;
struct hlist_head error_targets; /* targets for net error distribution */
struct work_struct error_distributor;
struct rb_root service_conns; /* Service connections */
seqlock_t service_conn_lock;
spinlock_t lock; /* access lock */
unsigned int if_mtu; /* interface MTU for this peer */
unsigned int mtu; /* network MTU for this peer */
unsigned int maxdata; /* data size (MTU - hdrsize) */
unsigned short hdrsize; /* header size (IP + UDP + RxRPC) */
int debug_id; /* debug ID for printks */
int error_report; /* Net (+0) or local (+1000000) to distribute */
#define RXRPC_LOCAL_ERROR_OFFSET 1000000
struct sockaddr_rxrpc srx; /* remote address */
/* calculated RTT cache */
#define RXRPC_RTT_CACHE_SIZE 32
ktime_t rtt_last_req; /* Time of last RTT request */
u64 rtt; /* Current RTT estimate (in nS) */
u64 rtt_sum; /* Sum of cache contents */
u64 rtt_cache[RXRPC_RTT_CACHE_SIZE]; /* Determined RTT cache */
u8 rtt_cursor; /* next entry at which to insert */
u8 rtt_usage; /* amount of cache actually used */
u8 cong_cwnd; /* Congestion window size */
};
/*
* Keys for matching a connection.
*/
struct rxrpc_conn_proto {
union {
struct {
u32 epoch; /* epoch of this connection */
u32 cid; /* connection ID */
};
u64 index_key;
};
};
struct rxrpc_conn_parameters {
struct rxrpc_local *local; /* Representation of local endpoint */
struct rxrpc_peer *peer; /* Remote endpoint */
struct key *key; /* Security details */
bool exclusive; /* T if conn is exclusive */
bool upgrade; /* T if service ID can be upgraded */
u16 service_id; /* Service ID for this connection */
u32 security_level; /* Security level selected */
};
/*
* Bits in the connection flags.
*/
enum rxrpc_conn_flag {
RXRPC_CONN_HAS_IDR, /* Has a client conn ID assigned */
rxrpc: Maintain an extra ref on a conn for the cache list Overhaul the usage count accounting for the rxrpc_connection struct to make it easier to implement RCU access from the data_ready handler. The problem is that currently we're using a lock to prevent the garbage collector from trying to clean up a connection that we're contemplating unidling. We could just stick incoming packets on the connection we find, but we've then got a problem that we may race when dispatching a work item to process it as we need to give that a ref to prevent the rxrpc_connection struct from disappearing in the meantime. Further, incoming packets may get discarded if attached to an rxrpc_connection struct that is going away. Whilst this is not a total disaster - the client will presumably resend - it would delay processing of the call. This would affect the AFS client filesystem's service manager operation. To this end: (1) We now maintain an extra count on the connection usage count whilst it is on the connection list. This mean it is not in use when its refcount is 1. (2) When trying to reuse an old connection, we only increment the refcount if it is greater than 0. If it is 0, we replace it in the tree with a new candidate connection. (3) Two connection flags are added to indicate whether or not a connection is in the local's client connection tree (used by sendmsg) or the peer's service connection tree (used by data_ready). This makes sure that we don't try and remove a connection if it got replaced. The flags are tested under lock with the removal operation to prevent the reaper from killing the rxrpc_connection struct whilst someone else is trying to effect a replacement. This could probably be alleviated by using memory barriers between the flag set/test and the rb_tree ops. The rb_tree op would still need to be under the lock, however. (4) When trying to reap an old connection, we try to flip the usage count from 1 to 0. If it's not 1 at that point, then it must've come back to life temporarily and we ignore it. Signed-off-by: David Howells <dhowells@redhat.com>
2016-06-30 03:45:22 -06:00
RXRPC_CONN_IN_SERVICE_CONNS, /* Conn is in peer->service_conns */
RXRPC_CONN_IN_CLIENT_CONNS, /* Conn is in local->client_conns */
rxrpc: Improve management and caching of client connection objects Improve the management and caching of client rxrpc connection objects. From this point, client connections will be managed separately from service connections because AF_RXRPC controls the creation and re-use of client connections but doesn't have that luxury with service connections. Further, there will be limits on the numbers of client connections that may be live on a machine. No direct restriction will be placed on the number of client calls, excepting that each client connection can support a maximum of four concurrent calls. Note that, for a number of reasons, we don't want to simply discard a client connection as soon as the last call is apparently finished: (1) Security is negotiated per-connection and the context is then shared between all calls on that connection. The context can be negotiated again if the connection lapses, but that involves holding up calls whilst at least two packets are exchanged and various crypto bits are performed - so we'd ideally like to cache it for a little while at least. (2) If a packet goes astray, we will need to retransmit a final ACK or ABORT packet. To make this work, we need to keep around the connection details for a little while. (3) The locally held structures represent some amount of setup time, to be weighed against their occupation of memory when idle. To this end, the client connection cache is managed by a state machine on each connection. There are five states: (1) INACTIVE - The connection is not held in any list and may not have been exposed to the world. If it has been previously exposed, it was discarded from the idle list after expiring. (2) WAITING - The connection is waiting for the number of client conns to drop below the maximum capacity. Calls may be in progress upon it from when it was active and got culled. The connection is on the rxrpc_waiting_client_conns list which is kept in to-be-granted order. Culled conns with waiters go to the back of the queue just like new conns. (3) ACTIVE - The connection has at least one call in progress upon it, it may freely grant available channels to new calls and calls may be waiting on it for channels to become available. The connection is on the rxrpc_active_client_conns list which is kept in activation order for culling purposes. (4) CULLED - The connection got summarily culled to try and free up capacity. Calls currently in progress on the connection are allowed to continue, but new calls will have to wait. There can be no waiters in this state - the conn would have to go to the WAITING state instead. (5) IDLE - The connection has no calls in progress upon it and must have been exposed to the world (ie. the EXPOSED flag must be set). When it expires, the EXPOSED flag is cleared and the connection transitions to the INACTIVE state. The connection is on the rxrpc_idle_client_conns list which is kept in order of how soon they'll expire. A connection in the ACTIVE or CULLED state must have at least one active call upon it; if in the WAITING state it may have active calls upon it; other states may not have active calls. As long as a connection remains active and doesn't get culled, it may continue to process calls - even if there are connections on the wait queue. This simplifies things a bit and reduces the amount of checking we need do. There are a couple flags of relevance to the cache: (1) EXPOSED - The connection ID got exposed to the world. If this flag is set, an extra ref is added to the connection preventing it from being reaped when it has no calls outstanding. This flag is cleared and the ref dropped when a conn is discarded from the idle list. (2) DONT_REUSE - The connection should be discarded as soon as possible and should not be reused. This commit also provides a number of new settings: (*) /proc/net/rxrpc/max_client_conns The maximum number of live client connections. Above this number, new connections get added to the wait list and must wait for an active conn to be culled. Culled connections can be reused, but they will go to the back of the wait list and have to wait. (*) /proc/net/rxrpc/reap_client_conns If the number of desired connections exceeds the maximum above, the active connection list will be culled until there are only this many left in it. (*) /proc/net/rxrpc/idle_conn_expiry The normal expiry time for a client connection, provided there are fewer than reap_client_conns of them around. (*) /proc/net/rxrpc/idle_conn_fast_expiry The expedited expiry time, used when there are more than reap_client_conns of them around. Note that I combined the Tx wait queue with the channel grant wait queue to save space as only one of these should be in use at once. Note also that, for the moment, the service connection cache still uses the old connection management code. Signed-off-by: David Howells <dhowells@redhat.com>
2016-08-24 00:30:52 -06:00
RXRPC_CONN_EXPOSED, /* Conn has extra ref for exposure */
RXRPC_CONN_DONT_REUSE, /* Don't reuse this connection */
RXRPC_CONN_COUNTED, /* Counted by rxrpc_nr_client_conns */
RXRPC_CONN_PROBING_FOR_UPGRADE, /* Probing for service upgrade */
};
/*
* Events that can be raised upon a connection.
*/
enum rxrpc_conn_event {
RXRPC_CONN_EV_CHALLENGE, /* Send challenge packet */
};
rxrpc: Improve management and caching of client connection objects Improve the management and caching of client rxrpc connection objects. From this point, client connections will be managed separately from service connections because AF_RXRPC controls the creation and re-use of client connections but doesn't have that luxury with service connections. Further, there will be limits on the numbers of client connections that may be live on a machine. No direct restriction will be placed on the number of client calls, excepting that each client connection can support a maximum of four concurrent calls. Note that, for a number of reasons, we don't want to simply discard a client connection as soon as the last call is apparently finished: (1) Security is negotiated per-connection and the context is then shared between all calls on that connection. The context can be negotiated again if the connection lapses, but that involves holding up calls whilst at least two packets are exchanged and various crypto bits are performed - so we'd ideally like to cache it for a little while at least. (2) If a packet goes astray, we will need to retransmit a final ACK or ABORT packet. To make this work, we need to keep around the connection details for a little while. (3) The locally held structures represent some amount of setup time, to be weighed against their occupation of memory when idle. To this end, the client connection cache is managed by a state machine on each connection. There are five states: (1) INACTIVE - The connection is not held in any list and may not have been exposed to the world. If it has been previously exposed, it was discarded from the idle list after expiring. (2) WAITING - The connection is waiting for the number of client conns to drop below the maximum capacity. Calls may be in progress upon it from when it was active and got culled. The connection is on the rxrpc_waiting_client_conns list which is kept in to-be-granted order. Culled conns with waiters go to the back of the queue just like new conns. (3) ACTIVE - The connection has at least one call in progress upon it, it may freely grant available channels to new calls and calls may be waiting on it for channels to become available. The connection is on the rxrpc_active_client_conns list which is kept in activation order for culling purposes. (4) CULLED - The connection got summarily culled to try and free up capacity. Calls currently in progress on the connection are allowed to continue, but new calls will have to wait. There can be no waiters in this state - the conn would have to go to the WAITING state instead. (5) IDLE - The connection has no calls in progress upon it and must have been exposed to the world (ie. the EXPOSED flag must be set). When it expires, the EXPOSED flag is cleared and the connection transitions to the INACTIVE state. The connection is on the rxrpc_idle_client_conns list which is kept in order of how soon they'll expire. A connection in the ACTIVE or CULLED state must have at least one active call upon it; if in the WAITING state it may have active calls upon it; other states may not have active calls. As long as a connection remains active and doesn't get culled, it may continue to process calls - even if there are connections on the wait queue. This simplifies things a bit and reduces the amount of checking we need do. There are a couple flags of relevance to the cache: (1) EXPOSED - The connection ID got exposed to the world. If this flag is set, an extra ref is added to the connection preventing it from being reaped when it has no calls outstanding. This flag is cleared and the ref dropped when a conn is discarded from the idle list. (2) DONT_REUSE - The connection should be discarded as soon as possible and should not be reused. This commit also provides a number of new settings: (*) /proc/net/rxrpc/max_client_conns The maximum number of live client connections. Above this number, new connections get added to the wait list and must wait for an active conn to be culled. Culled connections can be reused, but they will go to the back of the wait list and have to wait. (*) /proc/net/rxrpc/reap_client_conns If the number of desired connections exceeds the maximum above, the active connection list will be culled until there are only this many left in it. (*) /proc/net/rxrpc/idle_conn_expiry The normal expiry time for a client connection, provided there are fewer than reap_client_conns of them around. (*) /proc/net/rxrpc/idle_conn_fast_expiry The expedited expiry time, used when there are more than reap_client_conns of them around. Note that I combined the Tx wait queue with the channel grant wait queue to save space as only one of these should be in use at once. Note also that, for the moment, the service connection cache still uses the old connection management code. Signed-off-by: David Howells <dhowells@redhat.com>
2016-08-24 00:30:52 -06:00
/*
* The connection cache state.
*/
enum rxrpc_conn_cache_state {
RXRPC_CONN_CLIENT_INACTIVE, /* Conn is not yet listed */
RXRPC_CONN_CLIENT_WAITING, /* Conn is on wait list, waiting for capacity */
RXRPC_CONN_CLIENT_ACTIVE, /* Conn is on active list, doing calls */
RXRPC_CONN_CLIENT_UPGRADE, /* Conn is on active list, probing for upgrade */
rxrpc: Improve management and caching of client connection objects Improve the management and caching of client rxrpc connection objects. From this point, client connections will be managed separately from service connections because AF_RXRPC controls the creation and re-use of client connections but doesn't have that luxury with service connections. Further, there will be limits on the numbers of client connections that may be live on a machine. No direct restriction will be placed on the number of client calls, excepting that each client connection can support a maximum of four concurrent calls. Note that, for a number of reasons, we don't want to simply discard a client connection as soon as the last call is apparently finished: (1) Security is negotiated per-connection and the context is then shared between all calls on that connection. The context can be negotiated again if the connection lapses, but that involves holding up calls whilst at least two packets are exchanged and various crypto bits are performed - so we'd ideally like to cache it for a little while at least. (2) If a packet goes astray, we will need to retransmit a final ACK or ABORT packet. To make this work, we need to keep around the connection details for a little while. (3) The locally held structures represent some amount of setup time, to be weighed against their occupation of memory when idle. To this end, the client connection cache is managed by a state machine on each connection. There are five states: (1) INACTIVE - The connection is not held in any list and may not have been exposed to the world. If it has been previously exposed, it was discarded from the idle list after expiring. (2) WAITING - The connection is waiting for the number of client conns to drop below the maximum capacity. Calls may be in progress upon it from when it was active and got culled. The connection is on the rxrpc_waiting_client_conns list which is kept in to-be-granted order. Culled conns with waiters go to the back of the queue just like new conns. (3) ACTIVE - The connection has at least one call in progress upon it, it may freely grant available channels to new calls and calls may be waiting on it for channels to become available. The connection is on the rxrpc_active_client_conns list which is kept in activation order for culling purposes. (4) CULLED - The connection got summarily culled to try and free up capacity. Calls currently in progress on the connection are allowed to continue, but new calls will have to wait. There can be no waiters in this state - the conn would have to go to the WAITING state instead. (5) IDLE - The connection has no calls in progress upon it and must have been exposed to the world (ie. the EXPOSED flag must be set). When it expires, the EXPOSED flag is cleared and the connection transitions to the INACTIVE state. The connection is on the rxrpc_idle_client_conns list which is kept in order of how soon they'll expire. A connection in the ACTIVE or CULLED state must have at least one active call upon it; if in the WAITING state it may have active calls upon it; other states may not have active calls. As long as a connection remains active and doesn't get culled, it may continue to process calls - even if there are connections on the wait queue. This simplifies things a bit and reduces the amount of checking we need do. There are a couple flags of relevance to the cache: (1) EXPOSED - The connection ID got exposed to the world. If this flag is set, an extra ref is added to the connection preventing it from being reaped when it has no calls outstanding. This flag is cleared and the ref dropped when a conn is discarded from the idle list. (2) DONT_REUSE - The connection should be discarded as soon as possible and should not be reused. This commit also provides a number of new settings: (*) /proc/net/rxrpc/max_client_conns The maximum number of live client connections. Above this number, new connections get added to the wait list and must wait for an active conn to be culled. Culled connections can be reused, but they will go to the back of the wait list and have to wait. (*) /proc/net/rxrpc/reap_client_conns If the number of desired connections exceeds the maximum above, the active connection list will be culled until there are only this many left in it. (*) /proc/net/rxrpc/idle_conn_expiry The normal expiry time for a client connection, provided there are fewer than reap_client_conns of them around. (*) /proc/net/rxrpc/idle_conn_fast_expiry The expedited expiry time, used when there are more than reap_client_conns of them around. Note that I combined the Tx wait queue with the channel grant wait queue to save space as only one of these should be in use at once. Note also that, for the moment, the service connection cache still uses the old connection management code. Signed-off-by: David Howells <dhowells@redhat.com>
2016-08-24 00:30:52 -06:00
RXRPC_CONN_CLIENT_CULLED, /* Conn is culled and delisted, doing calls */
RXRPC_CONN_CLIENT_IDLE, /* Conn is on idle list, doing mostly nothing */
RXRPC_CONN__NR_CACHE_STATES
rxrpc: Improve management and caching of client connection objects Improve the management and caching of client rxrpc connection objects. From this point, client connections will be managed separately from service connections because AF_RXRPC controls the creation and re-use of client connections but doesn't have that luxury with service connections. Further, there will be limits on the numbers of client connections that may be live on a machine. No direct restriction will be placed on the number of client calls, excepting that each client connection can support a maximum of four concurrent calls. Note that, for a number of reasons, we don't want to simply discard a client connection as soon as the last call is apparently finished: (1) Security is negotiated per-connection and the context is then shared between all calls on that connection. The context can be negotiated again if the connection lapses, but that involves holding up calls whilst at least two packets are exchanged and various crypto bits are performed - so we'd ideally like to cache it for a little while at least. (2) If a packet goes astray, we will need to retransmit a final ACK or ABORT packet. To make this work, we need to keep around the connection details for a little while. (3) The locally held structures represent some amount of setup time, to be weighed against their occupation of memory when idle. To this end, the client connection cache is managed by a state machine on each connection. There are five states: (1) INACTIVE - The connection is not held in any list and may not have been exposed to the world. If it has been previously exposed, it was discarded from the idle list after expiring. (2) WAITING - The connection is waiting for the number of client conns to drop below the maximum capacity. Calls may be in progress upon it from when it was active and got culled. The connection is on the rxrpc_waiting_client_conns list which is kept in to-be-granted order. Culled conns with waiters go to the back of the queue just like new conns. (3) ACTIVE - The connection has at least one call in progress upon it, it may freely grant available channels to new calls and calls may be waiting on it for channels to become available. The connection is on the rxrpc_active_client_conns list which is kept in activation order for culling purposes. (4) CULLED - The connection got summarily culled to try and free up capacity. Calls currently in progress on the connection are allowed to continue, but new calls will have to wait. There can be no waiters in this state - the conn would have to go to the WAITING state instead. (5) IDLE - The connection has no calls in progress upon it and must have been exposed to the world (ie. the EXPOSED flag must be set). When it expires, the EXPOSED flag is cleared and the connection transitions to the INACTIVE state. The connection is on the rxrpc_idle_client_conns list which is kept in order of how soon they'll expire. A connection in the ACTIVE or CULLED state must have at least one active call upon it; if in the WAITING state it may have active calls upon it; other states may not have active calls. As long as a connection remains active and doesn't get culled, it may continue to process calls - even if there are connections on the wait queue. This simplifies things a bit and reduces the amount of checking we need do. There are a couple flags of relevance to the cache: (1) EXPOSED - The connection ID got exposed to the world. If this flag is set, an extra ref is added to the connection preventing it from being reaped when it has no calls outstanding. This flag is cleared and the ref dropped when a conn is discarded from the idle list. (2) DONT_REUSE - The connection should be discarded as soon as possible and should not be reused. This commit also provides a number of new settings: (*) /proc/net/rxrpc/max_client_conns The maximum number of live client connections. Above this number, new connections get added to the wait list and must wait for an active conn to be culled. Culled connections can be reused, but they will go to the back of the wait list and have to wait. (*) /proc/net/rxrpc/reap_client_conns If the number of desired connections exceeds the maximum above, the active connection list will be culled until there are only this many left in it. (*) /proc/net/rxrpc/idle_conn_expiry The normal expiry time for a client connection, provided there are fewer than reap_client_conns of them around. (*) /proc/net/rxrpc/idle_conn_fast_expiry The expedited expiry time, used when there are more than reap_client_conns of them around. Note that I combined the Tx wait queue with the channel grant wait queue to save space as only one of these should be in use at once. Note also that, for the moment, the service connection cache still uses the old connection management code. Signed-off-by: David Howells <dhowells@redhat.com>
2016-08-24 00:30:52 -06:00
};
/*
* The connection protocol state.
*/
enum rxrpc_conn_proto_state {
RXRPC_CONN_UNUSED, /* Connection not yet attempted */
RXRPC_CONN_CLIENT, /* Client connection */
rxrpc: Preallocate peers, conns and calls for incoming service requests Make it possible for the data_ready handler called from the UDP transport socket to completely instantiate an rxrpc_call structure and make it immediately live by preallocating all the memory it might need. The idea is to cut out the background thread usage as much as possible. [Note that the preallocated structs are not actually used in this patch - that will be done in a future patch.] If insufficient resources are available in the preallocation buffers, it will be possible to discard the DATA packet in the data_ready handler or schedule a BUSY packet without the need to schedule an attempt at allocation in a background thread. To this end: (1) Preallocate rxrpc_peer, rxrpc_connection and rxrpc_call structs to a maximum number each of the listen backlog size. The backlog size is limited to a maxmimum of 32. Only this many of each can be in the preallocation buffer. (2) For userspace sockets, the preallocation is charged initially by listen() and will be recharged by accepting or rejecting pending new incoming calls. (3) For kernel services {,re,dis}charging of the preallocation buffers is handled manually. Two notifier callbacks have to be provided before kernel_listen() is invoked: (a) An indication that a new call has been instantiated. This can be used to trigger background recharging. (b) An indication that a call is being discarded. This is used when the socket is being released. A function, rxrpc_kernel_charge_accept() is called by the kernel service to preallocate a single call. It should be passed the user ID to be used for that call and a callback to associate the rxrpc call with the kernel service's side of the ID. (4) Discard the preallocation when the socket is closed. (5) Temporarily bump the refcount on the call allocated in rxrpc_incoming_call() so that rxrpc_release_call() can ditch the preallocation ref on service calls unconditionally. This will no longer be necessary once the preallocation is used. Note that this does not yet control the number of active service calls on a client - that will come in a later patch. A future development would be to provide a setsockopt() call that allows a userspace server to manually charge the preallocation buffer. This would allow user call IDs to be provided in advance and the awkward manual accept stage to be bypassed. Signed-off-by: David Howells <dhowells@redhat.com>
2016-09-08 04:10:12 -06:00
RXRPC_CONN_SERVICE_PREALLOC, /* Service connection preallocation */
RXRPC_CONN_SERVICE_UNSECURED, /* Service unsecured connection */
RXRPC_CONN_SERVICE_CHALLENGING, /* Service challenging for security */
RXRPC_CONN_SERVICE, /* Service secured connection */
RXRPC_CONN_REMOTELY_ABORTED, /* Conn aborted by peer */
RXRPC_CONN_LOCALLY_ABORTED, /* Conn aborted locally */
RXRPC_CONN__NR_STATES
};
/*
* RxRPC connection definition
* - matched by { local, peer, epoch, conn_id, direction }
* - each connection can only handle four simultaneous calls
*/
struct rxrpc_connection {
struct rxrpc_conn_proto proto;
struct rxrpc_conn_parameters params;
rxrpc: Improve management and caching of client connection objects Improve the management and caching of client rxrpc connection objects. From this point, client connections will be managed separately from service connections because AF_RXRPC controls the creation and re-use of client connections but doesn't have that luxury with service connections. Further, there will be limits on the numbers of client connections that may be live on a machine. No direct restriction will be placed on the number of client calls, excepting that each client connection can support a maximum of four concurrent calls. Note that, for a number of reasons, we don't want to simply discard a client connection as soon as the last call is apparently finished: (1) Security is negotiated per-connection and the context is then shared between all calls on that connection. The context can be negotiated again if the connection lapses, but that involves holding up calls whilst at least two packets are exchanged and various crypto bits are performed - so we'd ideally like to cache it for a little while at least. (2) If a packet goes astray, we will need to retransmit a final ACK or ABORT packet. To make this work, we need to keep around the connection details for a little while. (3) The locally held structures represent some amount of setup time, to be weighed against their occupation of memory when idle. To this end, the client connection cache is managed by a state machine on each connection. There are five states: (1) INACTIVE - The connection is not held in any list and may not have been exposed to the world. If it has been previously exposed, it was discarded from the idle list after expiring. (2) WAITING - The connection is waiting for the number of client conns to drop below the maximum capacity. Calls may be in progress upon it from when it was active and got culled. The connection is on the rxrpc_waiting_client_conns list which is kept in to-be-granted order. Culled conns with waiters go to the back of the queue just like new conns. (3) ACTIVE - The connection has at least one call in progress upon it, it may freely grant available channels to new calls and calls may be waiting on it for channels to become available. The connection is on the rxrpc_active_client_conns list which is kept in activation order for culling purposes. (4) CULLED - The connection got summarily culled to try and free up capacity. Calls currently in progress on the connection are allowed to continue, but new calls will have to wait. There can be no waiters in this state - the conn would have to go to the WAITING state instead. (5) IDLE - The connection has no calls in progress upon it and must have been exposed to the world (ie. the EXPOSED flag must be set). When it expires, the EXPOSED flag is cleared and the connection transitions to the INACTIVE state. The connection is on the rxrpc_idle_client_conns list which is kept in order of how soon they'll expire. A connection in the ACTIVE or CULLED state must have at least one active call upon it; if in the WAITING state it may have active calls upon it; other states may not have active calls. As long as a connection remains active and doesn't get culled, it may continue to process calls - even if there are connections on the wait queue. This simplifies things a bit and reduces the amount of checking we need do. There are a couple flags of relevance to the cache: (1) EXPOSED - The connection ID got exposed to the world. If this flag is set, an extra ref is added to the connection preventing it from being reaped when it has no calls outstanding. This flag is cleared and the ref dropped when a conn is discarded from the idle list. (2) DONT_REUSE - The connection should be discarded as soon as possible and should not be reused. This commit also provides a number of new settings: (*) /proc/net/rxrpc/max_client_conns The maximum number of live client connections. Above this number, new connections get added to the wait list and must wait for an active conn to be culled. Culled connections can be reused, but they will go to the back of the wait list and have to wait. (*) /proc/net/rxrpc/reap_client_conns If the number of desired connections exceeds the maximum above, the active connection list will be culled until there are only this many left in it. (*) /proc/net/rxrpc/idle_conn_expiry The normal expiry time for a client connection, provided there are fewer than reap_client_conns of them around. (*) /proc/net/rxrpc/idle_conn_fast_expiry The expedited expiry time, used when there are more than reap_client_conns of them around. Note that I combined the Tx wait queue with the channel grant wait queue to save space as only one of these should be in use at once. Note also that, for the moment, the service connection cache still uses the old connection management code. Signed-off-by: David Howells <dhowells@redhat.com>
2016-08-24 00:30:52 -06:00
atomic_t usage;
struct rcu_head rcu;
struct list_head cache_link;
rxrpc: Call channels should have separate call number spaces Each channel on a connection has a separate, independent number space from which to allocate callNumber values. It is entirely possible, for example, to have a connection with four active calls, each with call number 1. Note that the callNumber values for any particular channel don't have to start at 1, but they are supposed to increment monotonically for that channel from a client's perspective and may not be reused once the call number is transmitted (until the epoch cycles all the way back round). Currently, however, call numbers are allocated on a per-connection basis and, further, are held in an rb-tree. The rb-tree is redundant as the four channel pointers in the rxrpc_connection struct are entirely capable of pointing to all the calls currently in progress on a connection. To this end, make the following changes: (1) Handle call number allocation independently per channel. (2) Get rid of the conn->calls rb-tree. This is overkill as a connection may have a maximum of four calls in progress at any one time. Use the pointers in the channels[] array instead, indexed by the channel number from the packet. (3) For each channel, save the result of the last call that was in progress on that channel in conn->channels[] so that the final ACK or ABORT packet can be replayed if necessary. Any call earlier than that is just ignored. If we've seen the next call number in a packet, the last one is most definitely defunct. (4) When generating a RESPONSE packet for a connection, the call number counter for each channel must be included in it. (5) When parsing a RESPONSE packet for a connection, the call number counters contained therein should be used to set the minimum expected call numbers on each channel. To do in future commits: (1) Replay terminal packets based on the last call stored in conn->channels[]. (2) Connections should be retired before the callNumber space on any channel runs out. (3) A server is expected to disregard or reject any new incoming call that has a call number less than the current call number counter. The call number counter for that channel must be advanced to the new call number. Note that the server cannot just require that the next call that it sees on a channel be exactly the call number counter + 1 because then there's a scenario that could cause a problem: The client transmits a packet to initiate a connection, the network goes out, the server sends an ACK (which gets lost), the client sends an ABORT (which also gets lost); the network then reconnects, the client then reuses the call number for the next call (it doesn't know the server already saw the call number), but the server thinks it already has the first packet of this call (it doesn't know that the client doesn't know that it saw the call number the first time). Signed-off-by: David Howells <dhowells@redhat.com>
2016-06-27 07:39:44 -06:00
rxrpc: Improve management and caching of client connection objects Improve the management and caching of client rxrpc connection objects. From this point, client connections will be managed separately from service connections because AF_RXRPC controls the creation and re-use of client connections but doesn't have that luxury with service connections. Further, there will be limits on the numbers of client connections that may be live on a machine. No direct restriction will be placed on the number of client calls, excepting that each client connection can support a maximum of four concurrent calls. Note that, for a number of reasons, we don't want to simply discard a client connection as soon as the last call is apparently finished: (1) Security is negotiated per-connection and the context is then shared between all calls on that connection. The context can be negotiated again if the connection lapses, but that involves holding up calls whilst at least two packets are exchanged and various crypto bits are performed - so we'd ideally like to cache it for a little while at least. (2) If a packet goes astray, we will need to retransmit a final ACK or ABORT packet. To make this work, we need to keep around the connection details for a little while. (3) The locally held structures represent some amount of setup time, to be weighed against their occupation of memory when idle. To this end, the client connection cache is managed by a state machine on each connection. There are five states: (1) INACTIVE - The connection is not held in any list and may not have been exposed to the world. If it has been previously exposed, it was discarded from the idle list after expiring. (2) WAITING - The connection is waiting for the number of client conns to drop below the maximum capacity. Calls may be in progress upon it from when it was active and got culled. The connection is on the rxrpc_waiting_client_conns list which is kept in to-be-granted order. Culled conns with waiters go to the back of the queue just like new conns. (3) ACTIVE - The connection has at least one call in progress upon it, it may freely grant available channels to new calls and calls may be waiting on it for channels to become available. The connection is on the rxrpc_active_client_conns list which is kept in activation order for culling purposes. (4) CULLED - The connection got summarily culled to try and free up capacity. Calls currently in progress on the connection are allowed to continue, but new calls will have to wait. There can be no waiters in this state - the conn would have to go to the WAITING state instead. (5) IDLE - The connection has no calls in progress upon it and must have been exposed to the world (ie. the EXPOSED flag must be set). When it expires, the EXPOSED flag is cleared and the connection transitions to the INACTIVE state. The connection is on the rxrpc_idle_client_conns list which is kept in order of how soon they'll expire. A connection in the ACTIVE or CULLED state must have at least one active call upon it; if in the WAITING state it may have active calls upon it; other states may not have active calls. As long as a connection remains active and doesn't get culled, it may continue to process calls - even if there are connections on the wait queue. This simplifies things a bit and reduces the amount of checking we need do. There are a couple flags of relevance to the cache: (1) EXPOSED - The connection ID got exposed to the world. If this flag is set, an extra ref is added to the connection preventing it from being reaped when it has no calls outstanding. This flag is cleared and the ref dropped when a conn is discarded from the idle list. (2) DONT_REUSE - The connection should be discarded as soon as possible and should not be reused. This commit also provides a number of new settings: (*) /proc/net/rxrpc/max_client_conns The maximum number of live client connections. Above this number, new connections get added to the wait list and must wait for an active conn to be culled. Culled connections can be reused, but they will go to the back of the wait list and have to wait. (*) /proc/net/rxrpc/reap_client_conns If the number of desired connections exceeds the maximum above, the active connection list will be culled until there are only this many left in it. (*) /proc/net/rxrpc/idle_conn_expiry The normal expiry time for a client connection, provided there are fewer than reap_client_conns of them around. (*) /proc/net/rxrpc/idle_conn_fast_expiry The expedited expiry time, used when there are more than reap_client_conns of them around. Note that I combined the Tx wait queue with the channel grant wait queue to save space as only one of these should be in use at once. Note also that, for the moment, the service connection cache still uses the old connection management code. Signed-off-by: David Howells <dhowells@redhat.com>
2016-08-24 00:30:52 -06:00
spinlock_t channel_lock;
unsigned char active_chans; /* Mask of active channels */
#define RXRPC_ACTIVE_CHANS_MASK ((1 << RXRPC_MAXCALLS) - 1)
struct list_head waiting_calls; /* Calls waiting for channels */
rxrpc: Call channels should have separate call number spaces Each channel on a connection has a separate, independent number space from which to allocate callNumber values. It is entirely possible, for example, to have a connection with four active calls, each with call number 1. Note that the callNumber values for any particular channel don't have to start at 1, but they are supposed to increment monotonically for that channel from a client's perspective and may not be reused once the call number is transmitted (until the epoch cycles all the way back round). Currently, however, call numbers are allocated on a per-connection basis and, further, are held in an rb-tree. The rb-tree is redundant as the four channel pointers in the rxrpc_connection struct are entirely capable of pointing to all the calls currently in progress on a connection. To this end, make the following changes: (1) Handle call number allocation independently per channel. (2) Get rid of the conn->calls rb-tree. This is overkill as a connection may have a maximum of four calls in progress at any one time. Use the pointers in the channels[] array instead, indexed by the channel number from the packet. (3) For each channel, save the result of the last call that was in progress on that channel in conn->channels[] so that the final ACK or ABORT packet can be replayed if necessary. Any call earlier than that is just ignored. If we've seen the next call number in a packet, the last one is most definitely defunct. (4) When generating a RESPONSE packet for a connection, the call number counter for each channel must be included in it. (5) When parsing a RESPONSE packet for a connection, the call number counters contained therein should be used to set the minimum expected call numbers on each channel. To do in future commits: (1) Replay terminal packets based on the last call stored in conn->channels[]. (2) Connections should be retired before the callNumber space on any channel runs out. (3) A server is expected to disregard or reject any new incoming call that has a call number less than the current call number counter. The call number counter for that channel must be advanced to the new call number. Note that the server cannot just require that the next call that it sees on a channel be exactly the call number counter + 1 because then there's a scenario that could cause a problem: The client transmits a packet to initiate a connection, the network goes out, the server sends an ACK (which gets lost), the client sends an ABORT (which also gets lost); the network then reconnects, the client then reuses the call number for the next call (it doesn't know the server already saw the call number), but the server thinks it already has the first packet of this call (it doesn't know that the client doesn't know that it saw the call number the first time). Signed-off-by: David Howells <dhowells@redhat.com>
2016-06-27 07:39:44 -06:00
struct rxrpc_channel {
struct rxrpc_call __rcu *call; /* Active call */
u32 call_id; /* ID of current call */
u32 call_counter; /* Call ID counter */
u32 last_call; /* ID of last call */
u8 last_type; /* Type of last packet */
union {
u32 last_seq;
u32 last_abort;
};
rxrpc: Call channels should have separate call number spaces Each channel on a connection has a separate, independent number space from which to allocate callNumber values. It is entirely possible, for example, to have a connection with four active calls, each with call number 1. Note that the callNumber values for any particular channel don't have to start at 1, but they are supposed to increment monotonically for that channel from a client's perspective and may not be reused once the call number is transmitted (until the epoch cycles all the way back round). Currently, however, call numbers are allocated on a per-connection basis and, further, are held in an rb-tree. The rb-tree is redundant as the four channel pointers in the rxrpc_connection struct are entirely capable of pointing to all the calls currently in progress on a connection. To this end, make the following changes: (1) Handle call number allocation independently per channel. (2) Get rid of the conn->calls rb-tree. This is overkill as a connection may have a maximum of four calls in progress at any one time. Use the pointers in the channels[] array instead, indexed by the channel number from the packet. (3) For each channel, save the result of the last call that was in progress on that channel in conn->channels[] so that the final ACK or ABORT packet can be replayed if necessary. Any call earlier than that is just ignored. If we've seen the next call number in a packet, the last one is most definitely defunct. (4) When generating a RESPONSE packet for a connection, the call number counter for each channel must be included in it. (5) When parsing a RESPONSE packet for a connection, the call number counters contained therein should be used to set the minimum expected call numbers on each channel. To do in future commits: (1) Replay terminal packets based on the last call stored in conn->channels[]. (2) Connections should be retired before the callNumber space on any channel runs out. (3) A server is expected to disregard or reject any new incoming call that has a call number less than the current call number counter. The call number counter for that channel must be advanced to the new call number. Note that the server cannot just require that the next call that it sees on a channel be exactly the call number counter + 1 because then there's a scenario that could cause a problem: The client transmits a packet to initiate a connection, the network goes out, the server sends an ACK (which gets lost), the client sends an ABORT (which also gets lost); the network then reconnects, the client then reuses the call number for the next call (it doesn't know the server already saw the call number), but the server thinks it already has the first packet of this call (it doesn't know that the client doesn't know that it saw the call number the first time). Signed-off-by: David Howells <dhowells@redhat.com>
2016-06-27 07:39:44 -06:00
} channels[RXRPC_MAXCALLS];
struct work_struct processor; /* connection event processor */
union {
struct rb_node client_node; /* Node in local->client_conns */
struct rb_node service_node; /* Node in peer->service_conns */
};
struct list_head proc_link; /* link in procfs list */
struct list_head link; /* link in master connection list */
struct sk_buff_head rx_queue; /* received conn-level packets */
const struct rxrpc_security *security; /* applied security module */
struct key *server_key; /* security for this service */
struct crypto_skcipher *cipher; /* encryption handle */
struct rxrpc_crypt csum_iv; /* packet checksum base */
2016-04-04 07:00:37 -06:00
unsigned long flags;
unsigned long events;
unsigned long idle_timestamp; /* Time at which last became idle */
spinlock_t state_lock; /* state-change lock */
enum rxrpc_conn_cache_state cache_state;
enum rxrpc_conn_proto_state state; /* current state of connection */
u32 local_abort; /* local abort code */
u32 remote_abort; /* remote abort code */
int debug_id; /* debug ID for printks */
atomic_t serial; /* packet serial number counter */
unsigned int hi_serial; /* highest serial number received */
u32 security_nonce; /* response re-use preventer */
u16 service_id; /* Service ID, possibly upgraded */
u8 size_align; /* data size alignment (for security) */
u8 security_size; /* security header size */
u8 security_ix; /* security type */
u8 out_clientflag; /* RXRPC_CLIENT_INITIATED if we are client */
};
/*
* Flags in call->flags.
*/
enum rxrpc_call_flag {
RXRPC_CALL_RELEASED, /* call has been released - no more message to userspace */
RXRPC_CALL_HAS_USERID, /* has a user ID attached */
RXRPC_CALL_IS_SERVICE, /* Call is service call */
rxrpc: Improve management and caching of client connection objects Improve the management and caching of client rxrpc connection objects. From this point, client connections will be managed separately from service connections because AF_RXRPC controls the creation and re-use of client connections but doesn't have that luxury with service connections. Further, there will be limits on the numbers of client connections that may be live on a machine. No direct restriction will be placed on the number of client calls, excepting that each client connection can support a maximum of four concurrent calls. Note that, for a number of reasons, we don't want to simply discard a client connection as soon as the last call is apparently finished: (1) Security is negotiated per-connection and the context is then shared between all calls on that connection. The context can be negotiated again if the connection lapses, but that involves holding up calls whilst at least two packets are exchanged and various crypto bits are performed - so we'd ideally like to cache it for a little while at least. (2) If a packet goes astray, we will need to retransmit a final ACK or ABORT packet. To make this work, we need to keep around the connection details for a little while. (3) The locally held structures represent some amount of setup time, to be weighed against their occupation of memory when idle. To this end, the client connection cache is managed by a state machine on each connection. There are five states: (1) INACTIVE - The connection is not held in any list and may not have been exposed to the world. If it has been previously exposed, it was discarded from the idle list after expiring. (2) WAITING - The connection is waiting for the number of client conns to drop below the maximum capacity. Calls may be in progress upon it from when it was active and got culled. The connection is on the rxrpc_waiting_client_conns list which is kept in to-be-granted order. Culled conns with waiters go to the back of the queue just like new conns. (3) ACTIVE - The connection has at least one call in progress upon it, it may freely grant available channels to new calls and calls may be waiting on it for channels to become available. The connection is on the rxrpc_active_client_conns list which is kept in activation order for culling purposes. (4) CULLED - The connection got summarily culled to try and free up capacity. Calls currently in progress on the connection are allowed to continue, but new calls will have to wait. There can be no waiters in this state - the conn would have to go to the WAITING state instead. (5) IDLE - The connection has no calls in progress upon it and must have been exposed to the world (ie. the EXPOSED flag must be set). When it expires, the EXPOSED flag is cleared and the connection transitions to the INACTIVE state. The connection is on the rxrpc_idle_client_conns list which is kept in order of how soon they'll expire. A connection in the ACTIVE or CULLED state must have at least one active call upon it; if in the WAITING state it may have active calls upon it; other states may not have active calls. As long as a connection remains active and doesn't get culled, it may continue to process calls - even if there are connections on the wait queue. This simplifies things a bit and reduces the amount of checking we need do. There are a couple flags of relevance to the cache: (1) EXPOSED - The connection ID got exposed to the world. If this flag is set, an extra ref is added to the connection preventing it from being reaped when it has no calls outstanding. This flag is cleared and the ref dropped when a conn is discarded from the idle list. (2) DONT_REUSE - The connection should be discarded as soon as possible and should not be reused. This commit also provides a number of new settings: (*) /proc/net/rxrpc/max_client_conns The maximum number of live client connections. Above this number, new connections get added to the wait list and must wait for an active conn to be culled. Culled connections can be reused, but they will go to the back of the wait list and have to wait. (*) /proc/net/rxrpc/reap_client_conns If the number of desired connections exceeds the maximum above, the active connection list will be culled until there are only this many left in it. (*) /proc/net/rxrpc/idle_conn_expiry The normal expiry time for a client connection, provided there are fewer than reap_client_conns of them around. (*) /proc/net/rxrpc/idle_conn_fast_expiry The expedited expiry time, used when there are more than reap_client_conns of them around. Note that I combined the Tx wait queue with the channel grant wait queue to save space as only one of these should be in use at once. Note also that, for the moment, the service connection cache still uses the old connection management code. Signed-off-by: David Howells <dhowells@redhat.com>
2016-08-24 00:30:52 -06:00
RXRPC_CALL_EXPOSED, /* The call was exposed to the world */
rxrpc: Rewrite the data and ack handling code Rewrite the data and ack handling code such that: (1) Parsing of received ACK and ABORT packets and the distribution and the filing of DATA packets happens entirely within the data_ready context called from the UDP socket. This allows us to process and discard ACK and ABORT packets much more quickly (they're no longer stashed on a queue for a background thread to process). (2) We avoid calling skb_clone(), pskb_pull() and pskb_trim(). We instead keep track of the offset and length of the content of each packet in the sk_buff metadata. This means we don't do any allocation in the receive path. (3) Jumbo DATA packet parsing is now done in data_ready context. Rather than cloning the packet once for each subpacket and pulling/trimming it, we file the packet multiple times with an annotation for each indicating which subpacket is there. From that we can directly calculate the offset and length. (4) A call's receive queue can be accessed without taking locks (memory barriers do have to be used, though). (5) Incoming calls are set up from preallocated resources and immediately made live. They can than have packets queued upon them and ACKs generated. If insufficient resources exist, DATA packet #1 is given a BUSY reply and other DATA packets are discarded). (6) sk_buffs no longer take a ref on their parent call. To make this work, the following changes are made: (1) Each call's receive buffer is now a circular buffer of sk_buff pointers (rxtx_buffer) rather than a number of sk_buff_heads spread between the call and the socket. This permits each sk_buff to be in the buffer multiple times. The receive buffer is reused for the transmit buffer. (2) A circular buffer of annotations (rxtx_annotations) is kept parallel to the data buffer. Transmission phase annotations indicate whether a buffered packet has been ACK'd or not and whether it needs retransmission. Receive phase annotations indicate whether a slot holds a whole packet or a jumbo subpacket and, if the latter, which subpacket. They also note whether the packet has been decrypted in place. (3) DATA packet window tracking is much simplified. Each phase has just two numbers representing the window (rx_hard_ack/rx_top and tx_hard_ack/tx_top). The hard_ack number is the sequence number before base of the window, representing the last packet the other side says it has consumed. hard_ack starts from 0 and the first packet is sequence number 1. The top number is the sequence number of the highest-numbered packet residing in the buffer. Packets between hard_ack+1 and top are soft-ACK'd to indicate they've been received, but not yet consumed. Four macros, before(), before_eq(), after() and after_eq() are added to compare sequence numbers within the window. This allows for the top of the window to wrap when the hard-ack sequence number gets close to the limit. Two flags, RXRPC_CALL_RX_LAST and RXRPC_CALL_TX_LAST, are added also to indicate when rx_top and tx_top point at the packets with the LAST_PACKET bit set, indicating the end of the phase. (4) Calls are queued on the socket 'receive queue' rather than packets. This means that we don't need have to invent dummy packets to queue to indicate abnormal/terminal states and we don't have to keep metadata packets (such as ABORTs) around (5) The offset and length of a (sub)packet's content are now passed to the verify_packet security op. This is currently expected to decrypt the packet in place and validate it. However, there's now nowhere to store the revised offset and length of the actual data within the decrypted blob (there may be a header and padding to skip) because an sk_buff may represent multiple packets, so a locate_data security op is added to retrieve these details from the sk_buff content when needed. (6) recvmsg() now has to handle jumbo subpackets, where each subpacket is individually secured and needs to be individually decrypted. The code to do this is broken out into rxrpc_recvmsg_data() and shared with the kernel API. It now iterates over the call's receive buffer rather than walking the socket receive queue. Additional changes: (1) The timers are condensed to a single timer that is set for the soonest of three timeouts (delayed ACK generation, DATA retransmission and call lifespan). (2) Transmission of ACK and ABORT packets is effected immediately from process-context socket ops/kernel API calls that cause them instead of them being punted off to a background work item. The data_ready handler still has to defer to the background, though. (3) A shutdown op is added to the AF_RXRPC socket so that the AFS filesystem can shut down the socket and flush its own work items before closing the socket to deal with any in-progress service calls. Future additional changes that will need to be considered: (1) Make sure that a call doesn't hog the front of the queue by receiving data from the network as fast as userspace is consuming it to the exclusion of other calls. (2) Transmit delayed ACKs from within recvmsg() when we've consumed sufficiently more packets to avoid the background work item needing to run. Signed-off-by: David Howells <dhowells@redhat.com>
2016-09-08 04:10:12 -06:00
RXRPC_CALL_RX_LAST, /* Received the last packet (at rxtx_top) */
RXRPC_CALL_TX_LAST, /* Last packet in Tx buffer (at rxtx_top) */
RXRPC_CALL_TX_LASTQ, /* Last packet has been queued */
RXRPC_CALL_SEND_PING, /* A ping will need to be sent */
RXRPC_CALL_PINGING, /* Ping in process */
RXRPC_CALL_RETRANS_TIMEOUT, /* Retransmission due to timeout occurred */
};
/*
* Events that can be raised on a call.
*/
enum rxrpc_call_event {
RXRPC_CALL_EV_ACK, /* need to generate ACK */
RXRPC_CALL_EV_ABORT, /* need to generate abort */
rxrpc: Rewrite the data and ack handling code Rewrite the data and ack handling code such that: (1) Parsing of received ACK and ABORT packets and the distribution and the filing of DATA packets happens entirely within the data_ready context called from the UDP socket. This allows us to process and discard ACK and ABORT packets much more quickly (they're no longer stashed on a queue for a background thread to process). (2) We avoid calling skb_clone(), pskb_pull() and pskb_trim(). We instead keep track of the offset and length of the content of each packet in the sk_buff metadata. This means we don't do any allocation in the receive path. (3) Jumbo DATA packet parsing is now done in data_ready context. Rather than cloning the packet once for each subpacket and pulling/trimming it, we file the packet multiple times with an annotation for each indicating which subpacket is there. From that we can directly calculate the offset and length. (4) A call's receive queue can be accessed without taking locks (memory barriers do have to be used, though). (5) Incoming calls are set up from preallocated resources and immediately made live. They can than have packets queued upon them and ACKs generated. If insufficient resources exist, DATA packet #1 is given a BUSY reply and other DATA packets are discarded). (6) sk_buffs no longer take a ref on their parent call. To make this work, the following changes are made: (1) Each call's receive buffer is now a circular buffer of sk_buff pointers (rxtx_buffer) rather than a number of sk_buff_heads spread between the call and the socket. This permits each sk_buff to be in the buffer multiple times. The receive buffer is reused for the transmit buffer. (2) A circular buffer of annotations (rxtx_annotations) is kept parallel to the data buffer. Transmission phase annotations indicate whether a buffered packet has been ACK'd or not and whether it needs retransmission. Receive phase annotations indicate whether a slot holds a whole packet or a jumbo subpacket and, if the latter, which subpacket. They also note whether the packet has been decrypted in place. (3) DATA packet window tracking is much simplified. Each phase has just two numbers representing the window (rx_hard_ack/rx_top and tx_hard_ack/tx_top). The hard_ack number is the sequence number before base of the window, representing the last packet the other side says it has consumed. hard_ack starts from 0 and the first packet is sequence number 1. The top number is the sequence number of the highest-numbered packet residing in the buffer. Packets between hard_ack+1 and top are soft-ACK'd to indicate they've been received, but not yet consumed. Four macros, before(), before_eq(), after() and after_eq() are added to compare sequence numbers within the window. This allows for the top of the window to wrap when the hard-ack sequence number gets close to the limit. Two flags, RXRPC_CALL_RX_LAST and RXRPC_CALL_TX_LAST, are added also to indicate when rx_top and tx_top point at the packets with the LAST_PACKET bit set, indicating the end of the phase. (4) Calls are queued on the socket 'receive queue' rather than packets. This means that we don't need have to invent dummy packets to queue to indicate abnormal/terminal states and we don't have to keep metadata packets (such as ABORTs) around (5) The offset and length of a (sub)packet's content are now passed to the verify_packet security op. This is currently expected to decrypt the packet in place and validate it. However, there's now nowhere to store the revised offset and length of the actual data within the decrypted blob (there may be a header and padding to skip) because an sk_buff may represent multiple packets, so a locate_data security op is added to retrieve these details from the sk_buff content when needed. (6) recvmsg() now has to handle jumbo subpackets, where each subpacket is individually secured and needs to be individually decrypted. The code to do this is broken out into rxrpc_recvmsg_data() and shared with the kernel API. It now iterates over the call's receive buffer rather than walking the socket receive queue. Additional changes: (1) The timers are condensed to a single timer that is set for the soonest of three timeouts (delayed ACK generation, DATA retransmission and call lifespan). (2) Transmission of ACK and ABORT packets is effected immediately from process-context socket ops/kernel API calls that cause them instead of them being punted off to a background work item. The data_ready handler still has to defer to the background, though. (3) A shutdown op is added to the AF_RXRPC socket so that the AFS filesystem can shut down the socket and flush its own work items before closing the socket to deal with any in-progress service calls. Future additional changes that will need to be considered: (1) Make sure that a call doesn't hog the front of the queue by receiving data from the network as fast as userspace is consuming it to the exclusion of other calls. (2) Transmit delayed ACKs from within recvmsg() when we've consumed sufficiently more packets to avoid the background work item needing to run. Signed-off-by: David Howells <dhowells@redhat.com>
2016-09-08 04:10:12 -06:00
RXRPC_CALL_EV_TIMER, /* Timer expired */
RXRPC_CALL_EV_RESEND, /* Tx resend required */
RXRPC_CALL_EV_PING, /* Ping send required */
};
/*
* The states that a call can be in.
*/
enum rxrpc_call_state {
RXRPC_CALL_UNINITIALISED,
RXRPC_CALL_CLIENT_AWAIT_CONN, /* - client waiting for connection to become available */
RXRPC_CALL_CLIENT_SEND_REQUEST, /* - client sending request phase */
RXRPC_CALL_CLIENT_AWAIT_REPLY, /* - client awaiting reply */
RXRPC_CALL_CLIENT_RECV_REPLY, /* - client receiving reply phase */
rxrpc: Preallocate peers, conns and calls for incoming service requests Make it possible for the data_ready handler called from the UDP transport socket to completely instantiate an rxrpc_call structure and make it immediately live by preallocating all the memory it might need. The idea is to cut out the background thread usage as much as possible. [Note that the preallocated structs are not actually used in this patch - that will be done in a future patch.] If insufficient resources are available in the preallocation buffers, it will be possible to discard the DATA packet in the data_ready handler or schedule a BUSY packet without the need to schedule an attempt at allocation in a background thread. To this end: (1) Preallocate rxrpc_peer, rxrpc_connection and rxrpc_call structs to a maximum number each of the listen backlog size. The backlog size is limited to a maxmimum of 32. Only this many of each can be in the preallocation buffer. (2) For userspace sockets, the preallocation is charged initially by listen() and will be recharged by accepting or rejecting pending new incoming calls. (3) For kernel services {,re,dis}charging of the preallocation buffers is handled manually. Two notifier callbacks have to be provided before kernel_listen() is invoked: (a) An indication that a new call has been instantiated. This can be used to trigger background recharging. (b) An indication that a call is being discarded. This is used when the socket is being released. A function, rxrpc_kernel_charge_accept() is called by the kernel service to preallocate a single call. It should be passed the user ID to be used for that call and a callback to associate the rxrpc call with the kernel service's side of the ID. (4) Discard the preallocation when the socket is closed. (5) Temporarily bump the refcount on the call allocated in rxrpc_incoming_call() so that rxrpc_release_call() can ditch the preallocation ref on service calls unconditionally. This will no longer be necessary once the preallocation is used. Note that this does not yet control the number of active service calls on a client - that will come in a later patch. A future development would be to provide a setsockopt() call that allows a userspace server to manually charge the preallocation buffer. This would allow user call IDs to be provided in advance and the awkward manual accept stage to be bypassed. Signed-off-by: David Howells <dhowells@redhat.com>
2016-09-08 04:10:12 -06:00
RXRPC_CALL_SERVER_PREALLOC, /* - service preallocation */
RXRPC_CALL_SERVER_SECURING, /* - server securing request connection */
RXRPC_CALL_SERVER_ACCEPTING, /* - server accepting request */
RXRPC_CALL_SERVER_RECV_REQUEST, /* - server receiving request */
RXRPC_CALL_SERVER_ACK_REQUEST, /* - server pending ACK of request */
RXRPC_CALL_SERVER_SEND_REPLY, /* - server sending reply */
RXRPC_CALL_SERVER_AWAIT_ACK, /* - server awaiting final ACK */
RXRPC_CALL_COMPLETE, /* - call complete */
NR__RXRPC_CALL_STATES
};
/*
* Call Tx congestion management modes.
*/
enum rxrpc_congest_mode {
RXRPC_CALL_SLOW_START,
RXRPC_CALL_CONGEST_AVOIDANCE,
RXRPC_CALL_PACKET_LOSS,
RXRPC_CALL_FAST_RETRANSMIT,
NR__RXRPC_CONGEST_MODES
};
/*
* RxRPC call definition
* - matched by { connection, call_id }
*/
struct rxrpc_call {
struct rcu_head rcu;
struct rxrpc_connection *conn; /* connection carrying call */
struct rxrpc_peer *peer; /* Peer record for remote address */
rxrpc: Calls shouldn't hold socket refs rxrpc calls shouldn't hold refs on the sock struct. This was done so that the socket wouldn't go away whilst the call was in progress, such that the call could reach the socket's queues. However, we can mark the socket as requiring an RCU release and rely on the RCU read lock. To make this work, we do: (1) rxrpc_release_call() removes the call's call user ID. This is now only called from socket operations and not from the call processor: rxrpc_accept_call() / rxrpc_kernel_accept_call() rxrpc_reject_call() / rxrpc_kernel_reject_call() rxrpc_kernel_end_call() rxrpc_release_calls_on_socket() rxrpc_recvmsg() Though it is also called in the cleanup path of rxrpc_accept_incoming_call() before we assign a user ID. (2) Pass the socket pointer into rxrpc_release_call() rather than getting it from the call so that we can get rid of uninitialised calls. (3) Fix call processor queueing to pass a ref to the work queue and to release that ref at the end of the processor function (or to pass it back to the work queue if we have to requeue). (4) Skip out of the call processor function asap if the call is complete and don't requeue it if the call is complete. (5) Clean up the call immediately that the refcount reaches 0 rather than trying to defer it. Actual deallocation is deferred to RCU, however. (6) Don't hold socket refs for allocated calls. (7) Use the RCU read lock when queueing a message on a socket and treat the call's socket pointer according to RCU rules and check it for NULL. We also need to use the RCU read lock when viewing a call through procfs. (8) Transmit the final ACK/ABORT to a client call in rxrpc_release_call() if this hasn't been done yet so that we can then disconnect the call. Once the call is disconnected, it won't have any access to the connection struct and the UDP socket for the call work processor to be able to send the ACK. Terminal retransmission will be handled by the connection processor. (9) Release all calls immediately on the closing of a socket rather than trying to defer this. Incomplete calls will be aborted. The call refcount model is much simplified. Refs are held on the call by: (1) A socket's user ID tree. (2) A socket's incoming call secureq and acceptq. (3) A kernel service that has a call in progress. (4) A queued call work processor. We have to take care to put any call that we failed to queue. (5) sk_buffs on a socket's receive queue. A future patch will get rid of this. Whilst we're at it, we can do: (1) Get rid of the RXRPC_CALL_EV_RELEASE event. Release is now done entirely from the socket routines and never from the call's processor. (2) Get rid of the RXRPC_CALL_DEAD state. Calls now end in the RXRPC_CALL_COMPLETE state. (3) Get rid of the rxrpc_call::destroyer work item. Calls are now torn down when their refcount reaches 0 and then handed over to RCU for final cleanup. (4) Get rid of the rxrpc_call::deadspan timer. Calls are cleaned up immediately they're finished with and don't hang around. Post-completion retransmission is handled by the connection processor once the call is disconnected. (5) Get rid of the dead call expiry setting as there's no longer a timer to set. (6) rxrpc_destroy_all_calls() can just check that the call list is empty. Signed-off-by: David Howells <dhowells@redhat.com>
2016-09-07 02:19:31 -06:00
struct rxrpc_sock __rcu *socket; /* socket responsible */
rxrpc: Fix deadlock between call creation and sendmsg/recvmsg All the routines by which rxrpc is accessed from the outside are serialised by means of the socket lock (sendmsg, recvmsg, bind, rxrpc_kernel_begin_call(), ...) and this presents a problem: (1) If a number of calls on the same socket are in the process of connection to the same peer, a maximum of four concurrent live calls are permitted before further calls need to wait for a slot. (2) If a call is waiting for a slot, it is deep inside sendmsg() or rxrpc_kernel_begin_call() and the entry function is holding the socket lock. (3) sendmsg() and recvmsg() or the in-kernel equivalents are prevented from servicing the other calls as they need to take the socket lock to do so. (4) The socket is stuck until a call is aborted and makes its slot available to the waiter. Fix this by: (1) Provide each call with a mutex ('user_mutex') that arbitrates access by the users of rxrpc separately for each specific call. (2) Make rxrpc_sendmsg() and rxrpc_recvmsg() unlock the socket as soon as they've got a call and taken its mutex. Note that I'm returning EWOULDBLOCK from recvmsg() if MSG_DONTWAIT is set but someone else has the lock. Should I instead only return EWOULDBLOCK if there's nothing currently to be done on a socket, and sleep in this particular instance because there is something to be done, but we appear to be blocked by the interrupt handler doing its ping? (3) Make rxrpc_new_client_call() unlock the socket after allocating a new call, locking its user mutex and adding it to the socket's call tree. The call is returned locked so that sendmsg() can add data to it immediately. From the moment the call is in the socket tree, it is subject to access by sendmsg() and recvmsg() - even if it isn't connected yet. (4) Lock new service calls in the UDP data_ready handler (in rxrpc_new_incoming_call()) because they may already be in the socket's tree and the data_ready handler makes them live immediately if a user ID has already been preassigned. Note that the new call is locked before any notifications are sent that it is live, so doing mutex_trylock() *ought* to always succeed. Userspace is prevented from doing sendmsg() on calls that are in a too-early state in rxrpc_do_sendmsg(). (5) Make rxrpc_new_incoming_call() return the call with the user mutex held so that a ping can be scheduled immediately under it. Note that it might be worth moving the ping call into rxrpc_new_incoming_call() and then we can drop the mutex there. (6) Make rxrpc_accept_call() take the lock on the call it is accepting and release the socket after adding the call to the socket's tree. This is slightly tricky as we've dequeued the call by that point and have to requeue it. Note that requeuing emits a trace event. (7) Make rxrpc_kernel_send_data() and rxrpc_kernel_recv_data() take the new mutex immediately and don't bother with the socket mutex at all. This patch has the nice bonus that calls on the same socket are now to some extent parallelisable. Note that we might want to move rxrpc_service_prealloc() calls out from the socket lock and give it its own lock, so that we don't hang progress in other calls because we're waiting for the allocator. We probably also want to avoid calling rxrpc_notify_socket() from within the socket lock (rxrpc_accept_call()). Signed-off-by: David Howells <dhowells@redhat.com> Tested-by: Marc Dionne <marc.c.dionne@auristor.com> Signed-off-by: David S. Miller <davem@davemloft.net>
2017-02-27 08:43:06 -07:00
struct mutex user_mutex; /* User access mutex */
ktime_t ack_at; /* When deferred ACK needs to happen */
ktime_t resend_at; /* When next resend needs to happen */
ktime_t ping_at; /* When next to send a ping */
ktime_t expire_at; /* When the call times out */
rxrpc: Rewrite the data and ack handling code Rewrite the data and ack handling code such that: (1) Parsing of received ACK and ABORT packets and the distribution and the filing of DATA packets happens entirely within the data_ready context called from the UDP socket. This allows us to process and discard ACK and ABORT packets much more quickly (they're no longer stashed on a queue for a background thread to process). (2) We avoid calling skb_clone(), pskb_pull() and pskb_trim(). We instead keep track of the offset and length of the content of each packet in the sk_buff metadata. This means we don't do any allocation in the receive path. (3) Jumbo DATA packet parsing is now done in data_ready context. Rather than cloning the packet once for each subpacket and pulling/trimming it, we file the packet multiple times with an annotation for each indicating which subpacket is there. From that we can directly calculate the offset and length. (4) A call's receive queue can be accessed without taking locks (memory barriers do have to be used, though). (5) Incoming calls are set up from preallocated resources and immediately made live. They can than have packets queued upon them and ACKs generated. If insufficient resources exist, DATA packet #1 is given a BUSY reply and other DATA packets are discarded). (6) sk_buffs no longer take a ref on their parent call. To make this work, the following changes are made: (1) Each call's receive buffer is now a circular buffer of sk_buff pointers (rxtx_buffer) rather than a number of sk_buff_heads spread between the call and the socket. This permits each sk_buff to be in the buffer multiple times. The receive buffer is reused for the transmit buffer. (2) A circular buffer of annotations (rxtx_annotations) is kept parallel to the data buffer. Transmission phase annotations indicate whether a buffered packet has been ACK'd or not and whether it needs retransmission. Receive phase annotations indicate whether a slot holds a whole packet or a jumbo subpacket and, if the latter, which subpacket. They also note whether the packet has been decrypted in place. (3) DATA packet window tracking is much simplified. Each phase has just two numbers representing the window (rx_hard_ack/rx_top and tx_hard_ack/tx_top). The hard_ack number is the sequence number before base of the window, representing the last packet the other side says it has consumed. hard_ack starts from 0 and the first packet is sequence number 1. The top number is the sequence number of the highest-numbered packet residing in the buffer. Packets between hard_ack+1 and top are soft-ACK'd to indicate they've been received, but not yet consumed. Four macros, before(), before_eq(), after() and after_eq() are added to compare sequence numbers within the window. This allows for the top of the window to wrap when the hard-ack sequence number gets close to the limit. Two flags, RXRPC_CALL_RX_LAST and RXRPC_CALL_TX_LAST, are added also to indicate when rx_top and tx_top point at the packets with the LAST_PACKET bit set, indicating the end of the phase. (4) Calls are queued on the socket 'receive queue' rather than packets. This means that we don't need have to invent dummy packets to queue to indicate abnormal/terminal states and we don't have to keep metadata packets (such as ABORTs) around (5) The offset and length of a (sub)packet's content are now passed to the verify_packet security op. This is currently expected to decrypt the packet in place and validate it. However, there's now nowhere to store the revised offset and length of the actual data within the decrypted blob (there may be a header and padding to skip) because an sk_buff may represent multiple packets, so a locate_data security op is added to retrieve these details from the sk_buff content when needed. (6) recvmsg() now has to handle jumbo subpackets, where each subpacket is individually secured and needs to be individually decrypted. The code to do this is broken out into rxrpc_recvmsg_data() and shared with the kernel API. It now iterates over the call's receive buffer rather than walking the socket receive queue. Additional changes: (1) The timers are condensed to a single timer that is set for the soonest of three timeouts (delayed ACK generation, DATA retransmission and call lifespan). (2) Transmission of ACK and ABORT packets is effected immediately from process-context socket ops/kernel API calls that cause them instead of them being punted off to a background work item. The data_ready handler still has to defer to the background, though. (3) A shutdown op is added to the AF_RXRPC socket so that the AFS filesystem can shut down the socket and flush its own work items before closing the socket to deal with any in-progress service calls. Future additional changes that will need to be considered: (1) Make sure that a call doesn't hog the front of the queue by receiving data from the network as fast as userspace is consuming it to the exclusion of other calls. (2) Transmit delayed ACKs from within recvmsg() when we've consumed sufficiently more packets to avoid the background work item needing to run. Signed-off-by: David Howells <dhowells@redhat.com>
2016-09-08 04:10:12 -06:00
struct timer_list timer; /* Combined event timer */
struct work_struct processor; /* Event processor */
rxrpc: Don't expose skbs to in-kernel users [ver #2] Don't expose skbs to in-kernel users, such as the AFS filesystem, but instead provide a notification hook the indicates that a call needs attention and another that indicates that there's a new call to be collected. This makes the following possibilities more achievable: (1) Call refcounting can be made simpler if skbs don't hold refs to calls. (2) skbs referring to non-data events will be able to be freed much sooner rather than being queued for AFS to pick up as rxrpc_kernel_recv_data will be able to consult the call state. (3) We can shortcut the receive phase when a call is remotely aborted because we don't have to go through all the packets to get to the one cancelling the operation. (4) It makes it easier to do encryption/decryption directly between AFS's buffers and sk_buffs. (5) Encryption/decryption can more easily be done in the AFS's thread contexts - usually that of the userspace process that issued a syscall - rather than in one of rxrpc's background threads on a workqueue. (6) AFS will be able to wait synchronously on a call inside AF_RXRPC. To make this work, the following interface function has been added: int rxrpc_kernel_recv_data( struct socket *sock, struct rxrpc_call *call, void *buffer, size_t bufsize, size_t *_offset, bool want_more, u32 *_abort_code); This is the recvmsg equivalent. It allows the caller to find out about the state of a specific call and to transfer received data into a buffer piecemeal. afs_extract_data() and rxrpc_kernel_recv_data() now do all the extraction logic between them. They don't wait synchronously yet because the socket lock needs to be dealt with. Five interface functions have been removed: rxrpc_kernel_is_data_last() rxrpc_kernel_get_abort_code() rxrpc_kernel_get_error_number() rxrpc_kernel_free_skb() rxrpc_kernel_data_consumed() As a temporary hack, sk_buffs going to an in-kernel call are queued on the rxrpc_call struct (->knlrecv_queue) rather than being handed over to the in-kernel user. To process the queue internally, a temporary function, temp_deliver_data() has been added. This will be replaced with common code between the rxrpc_recvmsg() path and the kernel_rxrpc_recv_data() path in a future patch. Signed-off-by: David Howells <dhowells@redhat.com> Signed-off-by: David S. Miller <davem@davemloft.net>
2016-08-30 13:42:14 -06:00
rxrpc_notify_rx_t notify_rx; /* kernel service Rx notification function */
struct list_head link; /* link in master call list */
rxrpc: Improve management and caching of client connection objects Improve the management and caching of client rxrpc connection objects. From this point, client connections will be managed separately from service connections because AF_RXRPC controls the creation and re-use of client connections but doesn't have that luxury with service connections. Further, there will be limits on the numbers of client connections that may be live on a machine. No direct restriction will be placed on the number of client calls, excepting that each client connection can support a maximum of four concurrent calls. Note that, for a number of reasons, we don't want to simply discard a client connection as soon as the last call is apparently finished: (1) Security is negotiated per-connection and the context is then shared between all calls on that connection. The context can be negotiated again if the connection lapses, but that involves holding up calls whilst at least two packets are exchanged and various crypto bits are performed - so we'd ideally like to cache it for a little while at least. (2) If a packet goes astray, we will need to retransmit a final ACK or ABORT packet. To make this work, we need to keep around the connection details for a little while. (3) The locally held structures represent some amount of setup time, to be weighed against their occupation of memory when idle. To this end, the client connection cache is managed by a state machine on each connection. There are five states: (1) INACTIVE - The connection is not held in any list and may not have been exposed to the world. If it has been previously exposed, it was discarded from the idle list after expiring. (2) WAITING - The connection is waiting for the number of client conns to drop below the maximum capacity. Calls may be in progress upon it from when it was active and got culled. The connection is on the rxrpc_waiting_client_conns list which is kept in to-be-granted order. Culled conns with waiters go to the back of the queue just like new conns. (3) ACTIVE - The connection has at least one call in progress upon it, it may freely grant available channels to new calls and calls may be waiting on it for channels to become available. The connection is on the rxrpc_active_client_conns list which is kept in activation order for culling purposes. (4) CULLED - The connection got summarily culled to try and free up capacity. Calls currently in progress on the connection are allowed to continue, but new calls will have to wait. There can be no waiters in this state - the conn would have to go to the WAITING state instead. (5) IDLE - The connection has no calls in progress upon it and must have been exposed to the world (ie. the EXPOSED flag must be set). When it expires, the EXPOSED flag is cleared and the connection transitions to the INACTIVE state. The connection is on the rxrpc_idle_client_conns list which is kept in order of how soon they'll expire. A connection in the ACTIVE or CULLED state must have at least one active call upon it; if in the WAITING state it may have active calls upon it; other states may not have active calls. As long as a connection remains active and doesn't get culled, it may continue to process calls - even if there are connections on the wait queue. This simplifies things a bit and reduces the amount of checking we need do. There are a couple flags of relevance to the cache: (1) EXPOSED - The connection ID got exposed to the world. If this flag is set, an extra ref is added to the connection preventing it from being reaped when it has no calls outstanding. This flag is cleared and the ref dropped when a conn is discarded from the idle list. (2) DONT_REUSE - The connection should be discarded as soon as possible and should not be reused. This commit also provides a number of new settings: (*) /proc/net/rxrpc/max_client_conns The maximum number of live client connections. Above this number, new connections get added to the wait list and must wait for an active conn to be culled. Culled connections can be reused, but they will go to the back of the wait list and have to wait. (*) /proc/net/rxrpc/reap_client_conns If the number of desired connections exceeds the maximum above, the active connection list will be culled until there are only this many left in it. (*) /proc/net/rxrpc/idle_conn_expiry The normal expiry time for a client connection, provided there are fewer than reap_client_conns of them around. (*) /proc/net/rxrpc/idle_conn_fast_expiry The expedited expiry time, used when there are more than reap_client_conns of them around. Note that I combined the Tx wait queue with the channel grant wait queue to save space as only one of these should be in use at once. Note also that, for the moment, the service connection cache still uses the old connection management code. Signed-off-by: David Howells <dhowells@redhat.com>
2016-08-24 00:30:52 -06:00
struct list_head chan_wait_link; /* Link in conn->waiting_calls */
struct hlist_node error_link; /* link in error distribution list */
rxrpc: Rewrite the data and ack handling code Rewrite the data and ack handling code such that: (1) Parsing of received ACK and ABORT packets and the distribution and the filing of DATA packets happens entirely within the data_ready context called from the UDP socket. This allows us to process and discard ACK and ABORT packets much more quickly (they're no longer stashed on a queue for a background thread to process). (2) We avoid calling skb_clone(), pskb_pull() and pskb_trim(). We instead keep track of the offset and length of the content of each packet in the sk_buff metadata. This means we don't do any allocation in the receive path. (3) Jumbo DATA packet parsing is now done in data_ready context. Rather than cloning the packet once for each subpacket and pulling/trimming it, we file the packet multiple times with an annotation for each indicating which subpacket is there. From that we can directly calculate the offset and length. (4) A call's receive queue can be accessed without taking locks (memory barriers do have to be used, though). (5) Incoming calls are set up from preallocated resources and immediately made live. They can than have packets queued upon them and ACKs generated. If insufficient resources exist, DATA packet #1 is given a BUSY reply and other DATA packets are discarded). (6) sk_buffs no longer take a ref on their parent call. To make this work, the following changes are made: (1) Each call's receive buffer is now a circular buffer of sk_buff pointers (rxtx_buffer) rather than a number of sk_buff_heads spread between the call and the socket. This permits each sk_buff to be in the buffer multiple times. The receive buffer is reused for the transmit buffer. (2) A circular buffer of annotations (rxtx_annotations) is kept parallel to the data buffer. Transmission phase annotations indicate whether a buffered packet has been ACK'd or not and whether it needs retransmission. Receive phase annotations indicate whether a slot holds a whole packet or a jumbo subpacket and, if the latter, which subpacket. They also note whether the packet has been decrypted in place. (3) DATA packet window tracking is much simplified. Each phase has just two numbers representing the window (rx_hard_ack/rx_top and tx_hard_ack/tx_top). The hard_ack number is the sequence number before base of the window, representing the last packet the other side says it has consumed. hard_ack starts from 0 and the first packet is sequence number 1. The top number is the sequence number of the highest-numbered packet residing in the buffer. Packets between hard_ack+1 and top are soft-ACK'd to indicate they've been received, but not yet consumed. Four macros, before(), before_eq(), after() and after_eq() are added to compare sequence numbers within the window. This allows for the top of the window to wrap when the hard-ack sequence number gets close to the limit. Two flags, RXRPC_CALL_RX_LAST and RXRPC_CALL_TX_LAST, are added also to indicate when rx_top and tx_top point at the packets with the LAST_PACKET bit set, indicating the end of the phase. (4) Calls are queued on the socket 'receive queue' rather than packets. This means that we don't need have to invent dummy packets to queue to indicate abnormal/terminal states and we don't have to keep metadata packets (such as ABORTs) around (5) The offset and length of a (sub)packet's content are now passed to the verify_packet security op. This is currently expected to decrypt the packet in place and validate it. However, there's now nowhere to store the revised offset and length of the actual data within the decrypted blob (there may be a header and padding to skip) because an sk_buff may represent multiple packets, so a locate_data security op is added to retrieve these details from the sk_buff content when needed. (6) recvmsg() now has to handle jumbo subpackets, where each subpacket is individually secured and needs to be individually decrypted. The code to do this is broken out into rxrpc_recvmsg_data() and shared with the kernel API. It now iterates over the call's receive buffer rather than walking the socket receive queue. Additional changes: (1) The timers are condensed to a single timer that is set for the soonest of three timeouts (delayed ACK generation, DATA retransmission and call lifespan). (2) Transmission of ACK and ABORT packets is effected immediately from process-context socket ops/kernel API calls that cause them instead of them being punted off to a background work item. The data_ready handler still has to defer to the background, though. (3) A shutdown op is added to the AF_RXRPC socket so that the AFS filesystem can shut down the socket and flush its own work items before closing the socket to deal with any in-progress service calls. Future additional changes that will need to be considered: (1) Make sure that a call doesn't hog the front of the queue by receiving data from the network as fast as userspace is consuming it to the exclusion of other calls. (2) Transmit delayed ACKs from within recvmsg() when we've consumed sufficiently more packets to avoid the background work item needing to run. Signed-off-by: David Howells <dhowells@redhat.com>
2016-09-08 04:10:12 -06:00
struct list_head accept_link; /* Link in rx->acceptq */
struct list_head recvmsg_link; /* Link in rx->recvmsg_q */
struct list_head sock_link; /* Link in rx->sock_calls */
struct rb_node sock_node; /* Node in rx->calls */
struct sk_buff *tx_pending; /* Tx socket buffer being filled */
rxrpc: Improve management and caching of client connection objects Improve the management and caching of client rxrpc connection objects. From this point, client connections will be managed separately from service connections because AF_RXRPC controls the creation and re-use of client connections but doesn't have that luxury with service connections. Further, there will be limits on the numbers of client connections that may be live on a machine. No direct restriction will be placed on the number of client calls, excepting that each client connection can support a maximum of four concurrent calls. Note that, for a number of reasons, we don't want to simply discard a client connection as soon as the last call is apparently finished: (1) Security is negotiated per-connection and the context is then shared between all calls on that connection. The context can be negotiated again if the connection lapses, but that involves holding up calls whilst at least two packets are exchanged and various crypto bits are performed - so we'd ideally like to cache it for a little while at least. (2) If a packet goes astray, we will need to retransmit a final ACK or ABORT packet. To make this work, we need to keep around the connection details for a little while. (3) The locally held structures represent some amount of setup time, to be weighed against their occupation of memory when idle. To this end, the client connection cache is managed by a state machine on each connection. There are five states: (1) INACTIVE - The connection is not held in any list and may not have been exposed to the world. If it has been previously exposed, it was discarded from the idle list after expiring. (2) WAITING - The connection is waiting for the number of client conns to drop below the maximum capacity. Calls may be in progress upon it from when it was active and got culled. The connection is on the rxrpc_waiting_client_conns list which is kept in to-be-granted order. Culled conns with waiters go to the back of the queue just like new conns. (3) ACTIVE - The connection has at least one call in progress upon it, it may freely grant available channels to new calls and calls may be waiting on it for channels to become available. The connection is on the rxrpc_active_client_conns list which is kept in activation order for culling purposes. (4) CULLED - The connection got summarily culled to try and free up capacity. Calls currently in progress on the connection are allowed to continue, but new calls will have to wait. There can be no waiters in this state - the conn would have to go to the WAITING state instead. (5) IDLE - The connection has no calls in progress upon it and must have been exposed to the world (ie. the EXPOSED flag must be set). When it expires, the EXPOSED flag is cleared and the connection transitions to the INACTIVE state. The connection is on the rxrpc_idle_client_conns list which is kept in order of how soon they'll expire. A connection in the ACTIVE or CULLED state must have at least one active call upon it; if in the WAITING state it may have active calls upon it; other states may not have active calls. As long as a connection remains active and doesn't get culled, it may continue to process calls - even if there are connections on the wait queue. This simplifies things a bit and reduces the amount of checking we need do. There are a couple flags of relevance to the cache: (1) EXPOSED - The connection ID got exposed to the world. If this flag is set, an extra ref is added to the connection preventing it from being reaped when it has no calls outstanding. This flag is cleared and the ref dropped when a conn is discarded from the idle list. (2) DONT_REUSE - The connection should be discarded as soon as possible and should not be reused. This commit also provides a number of new settings: (*) /proc/net/rxrpc/max_client_conns The maximum number of live client connections. Above this number, new connections get added to the wait list and must wait for an active conn to be culled. Culled connections can be reused, but they will go to the back of the wait list and have to wait. (*) /proc/net/rxrpc/reap_client_conns If the number of desired connections exceeds the maximum above, the active connection list will be culled until there are only this many left in it. (*) /proc/net/rxrpc/idle_conn_expiry The normal expiry time for a client connection, provided there are fewer than reap_client_conns of them around. (*) /proc/net/rxrpc/idle_conn_fast_expiry The expedited expiry time, used when there are more than reap_client_conns of them around. Note that I combined the Tx wait queue with the channel grant wait queue to save space as only one of these should be in use at once. Note also that, for the moment, the service connection cache still uses the old connection management code. Signed-off-by: David Howells <dhowells@redhat.com>
2016-08-24 00:30:52 -06:00
wait_queue_head_t waitq; /* Wait queue for channel or Tx */
s64 tx_total_len; /* Total length left to be transmitted (or -1) */
__be32 crypto_buf[2]; /* Temporary packet crypto buffer */
unsigned long user_call_ID; /* user-defined call ID */
unsigned long flags;
unsigned long events;
spinlock_t lock;
rwlock_t state_lock; /* lock for state transition */
u32 abort_code; /* Local/remote abort code */
int error; /* Local error incurred */
enum rxrpc_call_state state; /* current state of call */
enum rxrpc_call_completion completion; /* Call completion condition */
atomic_t usage;
u16 service_id; /* service ID */
u8 security_ix; /* Security type */
u32 call_id; /* call ID on connection */
u32 cid; /* connection ID plus channel index */
int debug_id; /* debug ID for printks */
unsigned short rx_pkt_offset; /* Current recvmsg packet offset */
unsigned short rx_pkt_len; /* Current recvmsg packet len */
rxrpc: Rewrite the data and ack handling code Rewrite the data and ack handling code such that: (1) Parsing of received ACK and ABORT packets and the distribution and the filing of DATA packets happens entirely within the data_ready context called from the UDP socket. This allows us to process and discard ACK and ABORT packets much more quickly (they're no longer stashed on a queue for a background thread to process). (2) We avoid calling skb_clone(), pskb_pull() and pskb_trim(). We instead keep track of the offset and length of the content of each packet in the sk_buff metadata. This means we don't do any allocation in the receive path. (3) Jumbo DATA packet parsing is now done in data_ready context. Rather than cloning the packet once for each subpacket and pulling/trimming it, we file the packet multiple times with an annotation for each indicating which subpacket is there. From that we can directly calculate the offset and length. (4) A call's receive queue can be accessed without taking locks (memory barriers do have to be used, though). (5) Incoming calls are set up from preallocated resources and immediately made live. They can than have packets queued upon them and ACKs generated. If insufficient resources exist, DATA packet #1 is given a BUSY reply and other DATA packets are discarded). (6) sk_buffs no longer take a ref on their parent call. To make this work, the following changes are made: (1) Each call's receive buffer is now a circular buffer of sk_buff pointers (rxtx_buffer) rather than a number of sk_buff_heads spread between the call and the socket. This permits each sk_buff to be in the buffer multiple times. The receive buffer is reused for the transmit buffer. (2) A circular buffer of annotations (rxtx_annotations) is kept parallel to the data buffer. Transmission phase annotations indicate whether a buffered packet has been ACK'd or not and whether it needs retransmission. Receive phase annotations indicate whether a slot holds a whole packet or a jumbo subpacket and, if the latter, which subpacket. They also note whether the packet has been decrypted in place. (3) DATA packet window tracking is much simplified. Each phase has just two numbers representing the window (rx_hard_ack/rx_top and tx_hard_ack/tx_top). The hard_ack number is the sequence number before base of the window, representing the last packet the other side says it has consumed. hard_ack starts from 0 and the first packet is sequence number 1. The top number is the sequence number of the highest-numbered packet residing in the buffer. Packets between hard_ack+1 and top are soft-ACK'd to indicate they've been received, but not yet consumed. Four macros, before(), before_eq(), after() and after_eq() are added to compare sequence numbers within the window. This allows for the top of the window to wrap when the hard-ack sequence number gets close to the limit. Two flags, RXRPC_CALL_RX_LAST and RXRPC_CALL_TX_LAST, are added also to indicate when rx_top and tx_top point at the packets with the LAST_PACKET bit set, indicating the end of the phase. (4) Calls are queued on the socket 'receive queue' rather than packets. This means that we don't need have to invent dummy packets to queue to indicate abnormal/terminal states and we don't have to keep metadata packets (such as ABORTs) around (5) The offset and length of a (sub)packet's content are now passed to the verify_packet security op. This is currently expected to decrypt the packet in place and validate it. However, there's now nowhere to store the revised offset and length of the actual data within the decrypted blob (there may be a header and padding to skip) because an sk_buff may represent multiple packets, so a locate_data security op is added to retrieve these details from the sk_buff content when needed. (6) recvmsg() now has to handle jumbo subpackets, where each subpacket is individually secured and needs to be individually decrypted. The code to do this is broken out into rxrpc_recvmsg_data() and shared with the kernel API. It now iterates over the call's receive buffer rather than walking the socket receive queue. Additional changes: (1) The timers are condensed to a single timer that is set for the soonest of three timeouts (delayed ACK generation, DATA retransmission and call lifespan). (2) Transmission of ACK and ABORT packets is effected immediately from process-context socket ops/kernel API calls that cause them instead of them being punted off to a background work item. The data_ready handler still has to defer to the background, though. (3) A shutdown op is added to the AF_RXRPC socket so that the AFS filesystem can shut down the socket and flush its own work items before closing the socket to deal with any in-progress service calls. Future additional changes that will need to be considered: (1) Make sure that a call doesn't hog the front of the queue by receiving data from the network as fast as userspace is consuming it to the exclusion of other calls. (2) Transmit delayed ACKs from within recvmsg() when we've consumed sufficiently more packets to avoid the background work item needing to run. Signed-off-by: David Howells <dhowells@redhat.com>
2016-09-08 04:10:12 -06:00
/* Rx/Tx circular buffer, depending on phase.
*
* In the Rx phase, packets are annotated with 0 or the number of the
* segment of a jumbo packet each buffer refers to. There can be up to
* 47 segments in a maximum-size UDP packet.
*
* In the Tx phase, packets are annotated with which buffers have been
* acked.
*/
#define RXRPC_RXTX_BUFF_SIZE 64
#define RXRPC_RXTX_BUFF_MASK (RXRPC_RXTX_BUFF_SIZE - 1)
#define RXRPC_INIT_RX_WINDOW_SIZE 32
rxrpc: Rewrite the data and ack handling code Rewrite the data and ack handling code such that: (1) Parsing of received ACK and ABORT packets and the distribution and the filing of DATA packets happens entirely within the data_ready context called from the UDP socket. This allows us to process and discard ACK and ABORT packets much more quickly (they're no longer stashed on a queue for a background thread to process). (2) We avoid calling skb_clone(), pskb_pull() and pskb_trim(). We instead keep track of the offset and length of the content of each packet in the sk_buff metadata. This means we don't do any allocation in the receive path. (3) Jumbo DATA packet parsing is now done in data_ready context. Rather than cloning the packet once for each subpacket and pulling/trimming it, we file the packet multiple times with an annotation for each indicating which subpacket is there. From that we can directly calculate the offset and length. (4) A call's receive queue can be accessed without taking locks (memory barriers do have to be used, though). (5) Incoming calls are set up from preallocated resources and immediately made live. They can than have packets queued upon them and ACKs generated. If insufficient resources exist, DATA packet #1 is given a BUSY reply and other DATA packets are discarded). (6) sk_buffs no longer take a ref on their parent call. To make this work, the following changes are made: (1) Each call's receive buffer is now a circular buffer of sk_buff pointers (rxtx_buffer) rather than a number of sk_buff_heads spread between the call and the socket. This permits each sk_buff to be in the buffer multiple times. The receive buffer is reused for the transmit buffer. (2) A circular buffer of annotations (rxtx_annotations) is kept parallel to the data buffer. Transmission phase annotations indicate whether a buffered packet has been ACK'd or not and whether it needs retransmission. Receive phase annotations indicate whether a slot holds a whole packet or a jumbo subpacket and, if the latter, which subpacket. They also note whether the packet has been decrypted in place. (3) DATA packet window tracking is much simplified. Each phase has just two numbers representing the window (rx_hard_ack/rx_top and tx_hard_ack/tx_top). The hard_ack number is the sequence number before base of the window, representing the last packet the other side says it has consumed. hard_ack starts from 0 and the first packet is sequence number 1. The top number is the sequence number of the highest-numbered packet residing in the buffer. Packets between hard_ack+1 and top are soft-ACK'd to indicate they've been received, but not yet consumed. Four macros, before(), before_eq(), after() and after_eq() are added to compare sequence numbers within the window. This allows for the top of the window to wrap when the hard-ack sequence number gets close to the limit. Two flags, RXRPC_CALL_RX_LAST and RXRPC_CALL_TX_LAST, are added also to indicate when rx_top and tx_top point at the packets with the LAST_PACKET bit set, indicating the end of the phase. (4) Calls are queued on the socket 'receive queue' rather than packets. This means that we don't need have to invent dummy packets to queue to indicate abnormal/terminal states and we don't have to keep metadata packets (such as ABORTs) around (5) The offset and length of a (sub)packet's content are now passed to the verify_packet security op. This is currently expected to decrypt the packet in place and validate it. However, there's now nowhere to store the revised offset and length of the actual data within the decrypted blob (there may be a header and padding to skip) because an sk_buff may represent multiple packets, so a locate_data security op is added to retrieve these details from the sk_buff content when needed. (6) recvmsg() now has to handle jumbo subpackets, where each subpacket is individually secured and needs to be individually decrypted. The code to do this is broken out into rxrpc_recvmsg_data() and shared with the kernel API. It now iterates over the call's receive buffer rather than walking the socket receive queue. Additional changes: (1) The timers are condensed to a single timer that is set for the soonest of three timeouts (delayed ACK generation, DATA retransmission and call lifespan). (2) Transmission of ACK and ABORT packets is effected immediately from process-context socket ops/kernel API calls that cause them instead of them being punted off to a background work item. The data_ready handler still has to defer to the background, though. (3) A shutdown op is added to the AF_RXRPC socket so that the AFS filesystem can shut down the socket and flush its own work items before closing the socket to deal with any in-progress service calls. Future additional changes that will need to be considered: (1) Make sure that a call doesn't hog the front of the queue by receiving data from the network as fast as userspace is consuming it to the exclusion of other calls. (2) Transmit delayed ACKs from within recvmsg() when we've consumed sufficiently more packets to avoid the background work item needing to run. Signed-off-by: David Howells <dhowells@redhat.com>
2016-09-08 04:10:12 -06:00
struct sk_buff **rxtx_buffer;
u8 *rxtx_annotations;
#define RXRPC_TX_ANNO_ACK 0
#define RXRPC_TX_ANNO_UNACK 1
#define RXRPC_TX_ANNO_NAK 2
#define RXRPC_TX_ANNO_RETRANS 3
#define RXRPC_TX_ANNO_MASK 0x03
rxrpc: Pass the last Tx packet marker in the annotation buffer When the last packet of data to be transmitted on a call is queued, tx_top is set and then the RXRPC_CALL_TX_LAST flag is set. Unfortunately, this leaves a race in the ACK processing side of things because the flag affects the interpretation of tx_top and also allows us to start receiving reply data before we've finished transmitting. To fix this, make the following changes: (1) rxrpc_queue_packet() now sets a marker in the annotation buffer instead of setting the RXRPC_CALL_TX_LAST flag. (2) rxrpc_rotate_tx_window() detects the marker and sets the flag in the same context as the routines that use it. (3) rxrpc_end_tx_phase() is simplified to just shift the call state. The Tx window must have been rotated before calling to discard the last packet. (4) rxrpc_receiving_reply() is added to handle the arrival of the first DATA packet of a reply to a client call (which is an implicit ACK of the Tx phase). (5) The last part of rxrpc_input_ack() is reordered to perform Tx rotation, then soft-ACK application and then to end the phase if we've rotated the last packet. In the event of a terminal ACK, the soft-ACK application will be skipped as nAcks should be 0. (6) rxrpc_input_ackall() now has to rotate as well as ending the phase. In addition: (7) Alter the transmit tracepoint to log the rotation of the last packet. (8) Remove the no-longer relevant queue_reqack tracepoint note. The ACK-REQUESTED packet header flag is now set as needed when we actually transmit the packet and may vary by retransmission. Signed-off-by: David Howells <dhowells@redhat.com>
2016-09-23 05:39:22 -06:00
#define RXRPC_TX_ANNO_LAST 0x04
#define RXRPC_TX_ANNO_RESENT 0x08
rxrpc: Rewrite the data and ack handling code Rewrite the data and ack handling code such that: (1) Parsing of received ACK and ABORT packets and the distribution and the filing of DATA packets happens entirely within the data_ready context called from the UDP socket. This allows us to process and discard ACK and ABORT packets much more quickly (they're no longer stashed on a queue for a background thread to process). (2) We avoid calling skb_clone(), pskb_pull() and pskb_trim(). We instead keep track of the offset and length of the content of each packet in the sk_buff metadata. This means we don't do any allocation in the receive path. (3) Jumbo DATA packet parsing is now done in data_ready context. Rather than cloning the packet once for each subpacket and pulling/trimming it, we file the packet multiple times with an annotation for each indicating which subpacket is there. From that we can directly calculate the offset and length. (4) A call's receive queue can be accessed without taking locks (memory barriers do have to be used, though). (5) Incoming calls are set up from preallocated resources and immediately made live. They can than have packets queued upon them and ACKs generated. If insufficient resources exist, DATA packet #1 is given a BUSY reply and other DATA packets are discarded). (6) sk_buffs no longer take a ref on their parent call. To make this work, the following changes are made: (1) Each call's receive buffer is now a circular buffer of sk_buff pointers (rxtx_buffer) rather than a number of sk_buff_heads spread between the call and the socket. This permits each sk_buff to be in the buffer multiple times. The receive buffer is reused for the transmit buffer. (2) A circular buffer of annotations (rxtx_annotations) is kept parallel to the data buffer. Transmission phase annotations indicate whether a buffered packet has been ACK'd or not and whether it needs retransmission. Receive phase annotations indicate whether a slot holds a whole packet or a jumbo subpacket and, if the latter, which subpacket. They also note whether the packet has been decrypted in place. (3) DATA packet window tracking is much simplified. Each phase has just two numbers representing the window (rx_hard_ack/rx_top and tx_hard_ack/tx_top). The hard_ack number is the sequence number before base of the window, representing the last packet the other side says it has consumed. hard_ack starts from 0 and the first packet is sequence number 1. The top number is the sequence number of the highest-numbered packet residing in the buffer. Packets between hard_ack+1 and top are soft-ACK'd to indicate they've been received, but not yet consumed. Four macros, before(), before_eq(), after() and after_eq() are added to compare sequence numbers within the window. This allows for the top of the window to wrap when the hard-ack sequence number gets close to the limit. Two flags, RXRPC_CALL_RX_LAST and RXRPC_CALL_TX_LAST, are added also to indicate when rx_top and tx_top point at the packets with the LAST_PACKET bit set, indicating the end of the phase. (4) Calls are queued on the socket 'receive queue' rather than packets. This means that we don't need have to invent dummy packets to queue to indicate abnormal/terminal states and we don't have to keep metadata packets (such as ABORTs) around (5) The offset and length of a (sub)packet's content are now passed to the verify_packet security op. This is currently expected to decrypt the packet in place and validate it. However, there's now nowhere to store the revised offset and length of the actual data within the decrypted blob (there may be a header and padding to skip) because an sk_buff may represent multiple packets, so a locate_data security op is added to retrieve these details from the sk_buff content when needed. (6) recvmsg() now has to handle jumbo subpackets, where each subpacket is individually secured and needs to be individually decrypted. The code to do this is broken out into rxrpc_recvmsg_data() and shared with the kernel API. It now iterates over the call's receive buffer rather than walking the socket receive queue. Additional changes: (1) The timers are condensed to a single timer that is set for the soonest of three timeouts (delayed ACK generation, DATA retransmission and call lifespan). (2) Transmission of ACK and ABORT packets is effected immediately from process-context socket ops/kernel API calls that cause them instead of them being punted off to a background work item. The data_ready handler still has to defer to the background, though. (3) A shutdown op is added to the AF_RXRPC socket so that the AFS filesystem can shut down the socket and flush its own work items before closing the socket to deal with any in-progress service calls. Future additional changes that will need to be considered: (1) Make sure that a call doesn't hog the front of the queue by receiving data from the network as fast as userspace is consuming it to the exclusion of other calls. (2) Transmit delayed ACKs from within recvmsg() when we've consumed sufficiently more packets to avoid the background work item needing to run. Signed-off-by: David Howells <dhowells@redhat.com>
2016-09-08 04:10:12 -06:00
#define RXRPC_RX_ANNO_JUMBO 0x3f /* Jumbo subpacket number + 1 if not zero */
#define RXRPC_RX_ANNO_JLAST 0x40 /* Set if last element of a jumbo packet */
#define RXRPC_RX_ANNO_VERIFIED 0x80 /* Set if verified and decrypted */
rxrpc_seq_t tx_hard_ack; /* Dead slot in buffer; the first transmitted but
* not hard-ACK'd packet follows this.
*/
rxrpc_seq_t tx_top; /* Highest Tx slot allocated. */
/* TCP-style slow-start congestion control [RFC5681]. Since the SMSS
* is fixed, we keep these numbers in terms of segments (ie. DATA
* packets) rather than bytes.
*/
#define RXRPC_TX_SMSS RXRPC_JUMBO_DATALEN
u8 cong_cwnd; /* Congestion window size */
u8 cong_extra; /* Extra to send for congestion management */
u8 cong_ssthresh; /* Slow-start threshold */
enum rxrpc_congest_mode cong_mode:8; /* Congestion management mode */
u8 cong_dup_acks; /* Count of ACKs showing missing packets */
u8 cong_cumul_acks; /* Cumulative ACK count */
ktime_t cong_tstamp; /* Last time cwnd was changed */
rxrpc: Rewrite the data and ack handling code Rewrite the data and ack handling code such that: (1) Parsing of received ACK and ABORT packets and the distribution and the filing of DATA packets happens entirely within the data_ready context called from the UDP socket. This allows us to process and discard ACK and ABORT packets much more quickly (they're no longer stashed on a queue for a background thread to process). (2) We avoid calling skb_clone(), pskb_pull() and pskb_trim(). We instead keep track of the offset and length of the content of each packet in the sk_buff metadata. This means we don't do any allocation in the receive path. (3) Jumbo DATA packet parsing is now done in data_ready context. Rather than cloning the packet once for each subpacket and pulling/trimming it, we file the packet multiple times with an annotation for each indicating which subpacket is there. From that we can directly calculate the offset and length. (4) A call's receive queue can be accessed without taking locks (memory barriers do have to be used, though). (5) Incoming calls are set up from preallocated resources and immediately made live. They can than have packets queued upon them and ACKs generated. If insufficient resources exist, DATA packet #1 is given a BUSY reply and other DATA packets are discarded). (6) sk_buffs no longer take a ref on their parent call. To make this work, the following changes are made: (1) Each call's receive buffer is now a circular buffer of sk_buff pointers (rxtx_buffer) rather than a number of sk_buff_heads spread between the call and the socket. This permits each sk_buff to be in the buffer multiple times. The receive buffer is reused for the transmit buffer. (2) A circular buffer of annotations (rxtx_annotations) is kept parallel to the data buffer. Transmission phase annotations indicate whether a buffered packet has been ACK'd or not and whether it needs retransmission. Receive phase annotations indicate whether a slot holds a whole packet or a jumbo subpacket and, if the latter, which subpacket. They also note whether the packet has been decrypted in place. (3) DATA packet window tracking is much simplified. Each phase has just two numbers representing the window (rx_hard_ack/rx_top and tx_hard_ack/tx_top). The hard_ack number is the sequence number before base of the window, representing the last packet the other side says it has consumed. hard_ack starts from 0 and the first packet is sequence number 1. The top number is the sequence number of the highest-numbered packet residing in the buffer. Packets between hard_ack+1 and top are soft-ACK'd to indicate they've been received, but not yet consumed. Four macros, before(), before_eq(), after() and after_eq() are added to compare sequence numbers within the window. This allows for the top of the window to wrap when the hard-ack sequence number gets close to the limit. Two flags, RXRPC_CALL_RX_LAST and RXRPC_CALL_TX_LAST, are added also to indicate when rx_top and tx_top point at the packets with the LAST_PACKET bit set, indicating the end of the phase. (4) Calls are queued on the socket 'receive queue' rather than packets. This means that we don't need have to invent dummy packets to queue to indicate abnormal/terminal states and we don't have to keep metadata packets (such as ABORTs) around (5) The offset and length of a (sub)packet's content are now passed to the verify_packet security op. This is currently expected to decrypt the packet in place and validate it. However, there's now nowhere to store the revised offset and length of the actual data within the decrypted blob (there may be a header and padding to skip) because an sk_buff may represent multiple packets, so a locate_data security op is added to retrieve these details from the sk_buff content when needed. (6) recvmsg() now has to handle jumbo subpackets, where each subpacket is individually secured and needs to be individually decrypted. The code to do this is broken out into rxrpc_recvmsg_data() and shared with the kernel API. It now iterates over the call's receive buffer rather than walking the socket receive queue. Additional changes: (1) The timers are condensed to a single timer that is set for the soonest of three timeouts (delayed ACK generation, DATA retransmission and call lifespan). (2) Transmission of ACK and ABORT packets is effected immediately from process-context socket ops/kernel API calls that cause them instead of them being punted off to a background work item. The data_ready handler still has to defer to the background, though. (3) A shutdown op is added to the AF_RXRPC socket so that the AFS filesystem can shut down the socket and flush its own work items before closing the socket to deal with any in-progress service calls. Future additional changes that will need to be considered: (1) Make sure that a call doesn't hog the front of the queue by receiving data from the network as fast as userspace is consuming it to the exclusion of other calls. (2) Transmit delayed ACKs from within recvmsg() when we've consumed sufficiently more packets to avoid the background work item needing to run. Signed-off-by: David Howells <dhowells@redhat.com>
2016-09-08 04:10:12 -06:00
rxrpc_seq_t rx_hard_ack; /* Dead slot in buffer; the first received but not
* consumed packet follows this.
*/
rxrpc: Rewrite the data and ack handling code Rewrite the data and ack handling code such that: (1) Parsing of received ACK and ABORT packets and the distribution and the filing of DATA packets happens entirely within the data_ready context called from the UDP socket. This allows us to process and discard ACK and ABORT packets much more quickly (they're no longer stashed on a queue for a background thread to process). (2) We avoid calling skb_clone(), pskb_pull() and pskb_trim(). We instead keep track of the offset and length of the content of each packet in the sk_buff metadata. This means we don't do any allocation in the receive path. (3) Jumbo DATA packet parsing is now done in data_ready context. Rather than cloning the packet once for each subpacket and pulling/trimming it, we file the packet multiple times with an annotation for each indicating which subpacket is there. From that we can directly calculate the offset and length. (4) A call's receive queue can be accessed without taking locks (memory barriers do have to be used, though). (5) Incoming calls are set up from preallocated resources and immediately made live. They can than have packets queued upon them and ACKs generated. If insufficient resources exist, DATA packet #1 is given a BUSY reply and other DATA packets are discarded). (6) sk_buffs no longer take a ref on their parent call. To make this work, the following changes are made: (1) Each call's receive buffer is now a circular buffer of sk_buff pointers (rxtx_buffer) rather than a number of sk_buff_heads spread between the call and the socket. This permits each sk_buff to be in the buffer multiple times. The receive buffer is reused for the transmit buffer. (2) A circular buffer of annotations (rxtx_annotations) is kept parallel to the data buffer. Transmission phase annotations indicate whether a buffered packet has been ACK'd or not and whether it needs retransmission. Receive phase annotations indicate whether a slot holds a whole packet or a jumbo subpacket and, if the latter, which subpacket. They also note whether the packet has been decrypted in place. (3) DATA packet window tracking is much simplified. Each phase has just two numbers representing the window (rx_hard_ack/rx_top and tx_hard_ack/tx_top). The hard_ack number is the sequence number before base of the window, representing the last packet the other side says it has consumed. hard_ack starts from 0 and the first packet is sequence number 1. The top number is the sequence number of the highest-numbered packet residing in the buffer. Packets between hard_ack+1 and top are soft-ACK'd to indicate they've been received, but not yet consumed. Four macros, before(), before_eq(), after() and after_eq() are added to compare sequence numbers within the window. This allows for the top of the window to wrap when the hard-ack sequence number gets close to the limit. Two flags, RXRPC_CALL_RX_LAST and RXRPC_CALL_TX_LAST, are added also to indicate when rx_top and tx_top point at the packets with the LAST_PACKET bit set, indicating the end of the phase. (4) Calls are queued on the socket 'receive queue' rather than packets. This means that we don't need have to invent dummy packets to queue to indicate abnormal/terminal states and we don't have to keep metadata packets (such as ABORTs) around (5) The offset and length of a (sub)packet's content are now passed to the verify_packet security op. This is currently expected to decrypt the packet in place and validate it. However, there's now nowhere to store the revised offset and length of the actual data within the decrypted blob (there may be a header and padding to skip) because an sk_buff may represent multiple packets, so a locate_data security op is added to retrieve these details from the sk_buff content when needed. (6) recvmsg() now has to handle jumbo subpackets, where each subpacket is individually secured and needs to be individually decrypted. The code to do this is broken out into rxrpc_recvmsg_data() and shared with the kernel API. It now iterates over the call's receive buffer rather than walking the socket receive queue. Additional changes: (1) The timers are condensed to a single timer that is set for the soonest of three timeouts (delayed ACK generation, DATA retransmission and call lifespan). (2) Transmission of ACK and ABORT packets is effected immediately from process-context socket ops/kernel API calls that cause them instead of them being punted off to a background work item. The data_ready handler still has to defer to the background, though. (3) A shutdown op is added to the AF_RXRPC socket so that the AFS filesystem can shut down the socket and flush its own work items before closing the socket to deal with any in-progress service calls. Future additional changes that will need to be considered: (1) Make sure that a call doesn't hog the front of the queue by receiving data from the network as fast as userspace is consuming it to the exclusion of other calls. (2) Transmit delayed ACKs from within recvmsg() when we've consumed sufficiently more packets to avoid the background work item needing to run. Signed-off-by: David Howells <dhowells@redhat.com>
2016-09-08 04:10:12 -06:00
rxrpc_seq_t rx_top; /* Highest Rx slot allocated. */
rxrpc_seq_t rx_expect_next; /* Expected next packet sequence number */
u8 rx_winsize; /* Size of Rx window */
u8 tx_winsize; /* Maximum size of Tx window */
bool tx_phase; /* T if transmission phase, F if receive phase */
u8 nr_jumbo_bad; /* Number of jumbo dups/exceeds-windows */
/* receive-phase ACK management */
u8 ackr_reason; /* reason to ACK */
u16 ackr_skew; /* skew on packet being ACK'd */
rxrpc_serial_t ackr_serial; /* serial of packet being ACK'd */
rxrpc: Rewrite the data and ack handling code Rewrite the data and ack handling code such that: (1) Parsing of received ACK and ABORT packets and the distribution and the filing of DATA packets happens entirely within the data_ready context called from the UDP socket. This allows us to process and discard ACK and ABORT packets much more quickly (they're no longer stashed on a queue for a background thread to process). (2) We avoid calling skb_clone(), pskb_pull() and pskb_trim(). We instead keep track of the offset and length of the content of each packet in the sk_buff metadata. This means we don't do any allocation in the receive path. (3) Jumbo DATA packet parsing is now done in data_ready context. Rather than cloning the packet once for each subpacket and pulling/trimming it, we file the packet multiple times with an annotation for each indicating which subpacket is there. From that we can directly calculate the offset and length. (4) A call's receive queue can be accessed without taking locks (memory barriers do have to be used, though). (5) Incoming calls are set up from preallocated resources and immediately made live. They can than have packets queued upon them and ACKs generated. If insufficient resources exist, DATA packet #1 is given a BUSY reply and other DATA packets are discarded). (6) sk_buffs no longer take a ref on their parent call. To make this work, the following changes are made: (1) Each call's receive buffer is now a circular buffer of sk_buff pointers (rxtx_buffer) rather than a number of sk_buff_heads spread between the call and the socket. This permits each sk_buff to be in the buffer multiple times. The receive buffer is reused for the transmit buffer. (2) A circular buffer of annotations (rxtx_annotations) is kept parallel to the data buffer. Transmission phase annotations indicate whether a buffered packet has been ACK'd or not and whether it needs retransmission. Receive phase annotations indicate whether a slot holds a whole packet or a jumbo subpacket and, if the latter, which subpacket. They also note whether the packet has been decrypted in place. (3) DATA packet window tracking is much simplified. Each phase has just two numbers representing the window (rx_hard_ack/rx_top and tx_hard_ack/tx_top). The hard_ack number is the sequence number before base of the window, representing the last packet the other side says it has consumed. hard_ack starts from 0 and the first packet is sequence number 1. The top number is the sequence number of the highest-numbered packet residing in the buffer. Packets between hard_ack+1 and top are soft-ACK'd to indicate they've been received, but not yet consumed. Four macros, before(), before_eq(), after() and after_eq() are added to compare sequence numbers within the window. This allows for the top of the window to wrap when the hard-ack sequence number gets close to the limit. Two flags, RXRPC_CALL_RX_LAST and RXRPC_CALL_TX_LAST, are added also to indicate when rx_top and tx_top point at the packets with the LAST_PACKET bit set, indicating the end of the phase. (4) Calls are queued on the socket 'receive queue' rather than packets. This means that we don't need have to invent dummy packets to queue to indicate abnormal/terminal states and we don't have to keep metadata packets (such as ABORTs) around (5) The offset and length of a (sub)packet's content are now passed to the verify_packet security op. This is currently expected to decrypt the packet in place and validate it. However, there's now nowhere to store the revised offset and length of the actual data within the decrypted blob (there may be a header and padding to skip) because an sk_buff may represent multiple packets, so a locate_data security op is added to retrieve these details from the sk_buff content when needed. (6) recvmsg() now has to handle jumbo subpackets, where each subpacket is individually secured and needs to be individually decrypted. The code to do this is broken out into rxrpc_recvmsg_data() and shared with the kernel API. It now iterates over the call's receive buffer rather than walking the socket receive queue. Additional changes: (1) The timers are condensed to a single timer that is set for the soonest of three timeouts (delayed ACK generation, DATA retransmission and call lifespan). (2) Transmission of ACK and ABORT packets is effected immediately from process-context socket ops/kernel API calls that cause them instead of them being punted off to a background work item. The data_ready handler still has to defer to the background, though. (3) A shutdown op is added to the AF_RXRPC socket so that the AFS filesystem can shut down the socket and flush its own work items before closing the socket to deal with any in-progress service calls. Future additional changes that will need to be considered: (1) Make sure that a call doesn't hog the front of the queue by receiving data from the network as fast as userspace is consuming it to the exclusion of other calls. (2) Transmit delayed ACKs from within recvmsg() when we've consumed sufficiently more packets to avoid the background work item needing to run. Signed-off-by: David Howells <dhowells@redhat.com>
2016-09-08 04:10:12 -06:00
rxrpc_seq_t ackr_prev_seq; /* previous sequence number received */
rxrpc_seq_t ackr_consumed; /* Highest packet shown consumed */
rxrpc_seq_t ackr_seen; /* Highest packet shown seen */
/* ping management */
rxrpc_serial_t ping_serial; /* Last ping sent */
ktime_t ping_time; /* Time last ping sent */
rxrpc: Rewrite the data and ack handling code Rewrite the data and ack handling code such that: (1) Parsing of received ACK and ABORT packets and the distribution and the filing of DATA packets happens entirely within the data_ready context called from the UDP socket. This allows us to process and discard ACK and ABORT packets much more quickly (they're no longer stashed on a queue for a background thread to process). (2) We avoid calling skb_clone(), pskb_pull() and pskb_trim(). We instead keep track of the offset and length of the content of each packet in the sk_buff metadata. This means we don't do any allocation in the receive path. (3) Jumbo DATA packet parsing is now done in data_ready context. Rather than cloning the packet once for each subpacket and pulling/trimming it, we file the packet multiple times with an annotation for each indicating which subpacket is there. From that we can directly calculate the offset and length. (4) A call's receive queue can be accessed without taking locks (memory barriers do have to be used, though). (5) Incoming calls are set up from preallocated resources and immediately made live. They can than have packets queued upon them and ACKs generated. If insufficient resources exist, DATA packet #1 is given a BUSY reply and other DATA packets are discarded). (6) sk_buffs no longer take a ref on their parent call. To make this work, the following changes are made: (1) Each call's receive buffer is now a circular buffer of sk_buff pointers (rxtx_buffer) rather than a number of sk_buff_heads spread between the call and the socket. This permits each sk_buff to be in the buffer multiple times. The receive buffer is reused for the transmit buffer. (2) A circular buffer of annotations (rxtx_annotations) is kept parallel to the data buffer. Transmission phase annotations indicate whether a buffered packet has been ACK'd or not and whether it needs retransmission. Receive phase annotations indicate whether a slot holds a whole packet or a jumbo subpacket and, if the latter, which subpacket. They also note whether the packet has been decrypted in place. (3) DATA packet window tracking is much simplified. Each phase has just two numbers representing the window (rx_hard_ack/rx_top and tx_hard_ack/tx_top). The hard_ack number is the sequence number before base of the window, representing the last packet the other side says it has consumed. hard_ack starts from 0 and the first packet is sequence number 1. The top number is the sequence number of the highest-numbered packet residing in the buffer. Packets between hard_ack+1 and top are soft-ACK'd to indicate they've been received, but not yet consumed. Four macros, before(), before_eq(), after() and after_eq() are added to compare sequence numbers within the window. This allows for the top of the window to wrap when the hard-ack sequence number gets close to the limit. Two flags, RXRPC_CALL_RX_LAST and RXRPC_CALL_TX_LAST, are added also to indicate when rx_top and tx_top point at the packets with the LAST_PACKET bit set, indicating the end of the phase. (4) Calls are queued on the socket 'receive queue' rather than packets. This means that we don't need have to invent dummy packets to queue to indicate abnormal/terminal states and we don't have to keep metadata packets (such as ABORTs) around (5) The offset and length of a (sub)packet's content are now passed to the verify_packet security op. This is currently expected to decrypt the packet in place and validate it. However, there's now nowhere to store the revised offset and length of the actual data within the decrypted blob (there may be a header and padding to skip) because an sk_buff may represent multiple packets, so a locate_data security op is added to retrieve these details from the sk_buff content when needed. (6) recvmsg() now has to handle jumbo subpackets, where each subpacket is individually secured and needs to be individually decrypted. The code to do this is broken out into rxrpc_recvmsg_data() and shared with the kernel API. It now iterates over the call's receive buffer rather than walking the socket receive queue. Additional changes: (1) The timers are condensed to a single timer that is set for the soonest of three timeouts (delayed ACK generation, DATA retransmission and call lifespan). (2) Transmission of ACK and ABORT packets is effected immediately from process-context socket ops/kernel API calls that cause them instead of them being punted off to a background work item. The data_ready handler still has to defer to the background, though. (3) A shutdown op is added to the AF_RXRPC socket so that the AFS filesystem can shut down the socket and flush its own work items before closing the socket to deal with any in-progress service calls. Future additional changes that will need to be considered: (1) Make sure that a call doesn't hog the front of the queue by receiving data from the network as fast as userspace is consuming it to the exclusion of other calls. (2) Transmit delayed ACKs from within recvmsg() when we've consumed sufficiently more packets to avoid the background work item needing to run. Signed-off-by: David Howells <dhowells@redhat.com>
2016-09-08 04:10:12 -06:00
/* transmission-phase ACK management */
ktime_t acks_latest_ts; /* Timestamp of latest ACK received */
rxrpc: Rewrite the data and ack handling code Rewrite the data and ack handling code such that: (1) Parsing of received ACK and ABORT packets and the distribution and the filing of DATA packets happens entirely within the data_ready context called from the UDP socket. This allows us to process and discard ACK and ABORT packets much more quickly (they're no longer stashed on a queue for a background thread to process). (2) We avoid calling skb_clone(), pskb_pull() and pskb_trim(). We instead keep track of the offset and length of the content of each packet in the sk_buff metadata. This means we don't do any allocation in the receive path. (3) Jumbo DATA packet parsing is now done in data_ready context. Rather than cloning the packet once for each subpacket and pulling/trimming it, we file the packet multiple times with an annotation for each indicating which subpacket is there. From that we can directly calculate the offset and length. (4) A call's receive queue can be accessed without taking locks (memory barriers do have to be used, though). (5) Incoming calls are set up from preallocated resources and immediately made live. They can than have packets queued upon them and ACKs generated. If insufficient resources exist, DATA packet #1 is given a BUSY reply and other DATA packets are discarded). (6) sk_buffs no longer take a ref on their parent call. To make this work, the following changes are made: (1) Each call's receive buffer is now a circular buffer of sk_buff pointers (rxtx_buffer) rather than a number of sk_buff_heads spread between the call and the socket. This permits each sk_buff to be in the buffer multiple times. The receive buffer is reused for the transmit buffer. (2) A circular buffer of annotations (rxtx_annotations) is kept parallel to the data buffer. Transmission phase annotations indicate whether a buffered packet has been ACK'd or not and whether it needs retransmission. Receive phase annotations indicate whether a slot holds a whole packet or a jumbo subpacket and, if the latter, which subpacket. They also note whether the packet has been decrypted in place. (3) DATA packet window tracking is much simplified. Each phase has just two numbers representing the window (rx_hard_ack/rx_top and tx_hard_ack/tx_top). The hard_ack number is the sequence number before base of the window, representing the last packet the other side says it has consumed. hard_ack starts from 0 and the first packet is sequence number 1. The top number is the sequence number of the highest-numbered packet residing in the buffer. Packets between hard_ack+1 and top are soft-ACK'd to indicate they've been received, but not yet consumed. Four macros, before(), before_eq(), after() and after_eq() are added to compare sequence numbers within the window. This allows for the top of the window to wrap when the hard-ack sequence number gets close to the limit. Two flags, RXRPC_CALL_RX_LAST and RXRPC_CALL_TX_LAST, are added also to indicate when rx_top and tx_top point at the packets with the LAST_PACKET bit set, indicating the end of the phase. (4) Calls are queued on the socket 'receive queue' rather than packets. This means that we don't need have to invent dummy packets to queue to indicate abnormal/terminal states and we don't have to keep metadata packets (such as ABORTs) around (5) The offset and length of a (sub)packet's content are now passed to the verify_packet security op. This is currently expected to decrypt the packet in place and validate it. However, there's now nowhere to store the revised offset and length of the actual data within the decrypted blob (there may be a header and padding to skip) because an sk_buff may represent multiple packets, so a locate_data security op is added to retrieve these details from the sk_buff content when needed. (6) recvmsg() now has to handle jumbo subpackets, where each subpacket is individually secured and needs to be individually decrypted. The code to do this is broken out into rxrpc_recvmsg_data() and shared with the kernel API. It now iterates over the call's receive buffer rather than walking the socket receive queue. Additional changes: (1) The timers are condensed to a single timer that is set for the soonest of three timeouts (delayed ACK generation, DATA retransmission and call lifespan). (2) Transmission of ACK and ABORT packets is effected immediately from process-context socket ops/kernel API calls that cause them instead of them being punted off to a background work item. The data_ready handler still has to defer to the background, though. (3) A shutdown op is added to the AF_RXRPC socket so that the AFS filesystem can shut down the socket and flush its own work items before closing the socket to deal with any in-progress service calls. Future additional changes that will need to be considered: (1) Make sure that a call doesn't hog the front of the queue by receiving data from the network as fast as userspace is consuming it to the exclusion of other calls. (2) Transmit delayed ACKs from within recvmsg() when we've consumed sufficiently more packets to avoid the background work item needing to run. Signed-off-by: David Howells <dhowells@redhat.com>
2016-09-08 04:10:12 -06:00
rxrpc_serial_t acks_latest; /* serial number of latest ACK received */
rxrpc_seq_t acks_lowest_nak; /* Lowest NACK in the buffer (or ==tx_hard_ack) */
};
/*
* Summary of a new ACK and the changes it made to the Tx buffer packet states.
*/
struct rxrpc_ack_summary {
u8 ack_reason;
u8 nr_acks; /* Number of ACKs in packet */
u8 nr_nacks; /* Number of NACKs in packet */
u8 nr_new_acks; /* Number of new ACKs in packet */
u8 nr_new_nacks; /* Number of new NACKs in packet */
u8 nr_rot_new_acks; /* Number of rotated new ACKs */
bool new_low_nack; /* T if new low NACK found */
bool retrans_timeo; /* T if reTx due to timeout happened */
u8 flight_size; /* Number of unreceived transmissions */
/* Place to stash values for tracing */
enum rxrpc_congest_mode mode:8;
u8 cwnd;
u8 ssthresh;
u8 dup_acks;
u8 cumulative_acks;
};
#include <trace/events/rxrpc.h>
/*
[AF_RXRPC]: Add an interface to the AF_RXRPC module for the AFS filesystem to use Add an interface to the AF_RXRPC module so that the AFS filesystem module can more easily make use of the services available. AFS still opens a socket but then uses the action functions in lieu of sendmsg() and registers an intercept functions to grab messages before they're queued on the socket Rx queue. This permits AFS (or whatever) to: (1) Avoid the overhead of using the recvmsg() call. (2) Use different keys directly on individual client calls on one socket rather than having to open a whole slew of sockets, one for each key it might want to use. (3) Avoid calling request_key() at the point of issue of a call or opening of a socket. This is done instead by AFS at the point of open(), unlink() or other VFS operation and the key handed through. (4) Request the use of something other than GFP_KERNEL to allocate memory. Furthermore: (*) The socket buffer markings used by RxRPC are made available for AFS so that it can interpret the cooked RxRPC messages itself. (*) rxgen (un)marshalling abort codes are made available. The following documentation for the kernel interface is added to Documentation/networking/rxrpc.txt: ========================= AF_RXRPC KERNEL INTERFACE ========================= The AF_RXRPC module also provides an interface for use by in-kernel utilities such as the AFS filesystem. This permits such a utility to: (1) Use different keys directly on individual client calls on one socket rather than having to open a whole slew of sockets, one for each key it might want to use. (2) Avoid having RxRPC call request_key() at the point of issue of a call or opening of a socket. Instead the utility is responsible for requesting a key at the appropriate point. AFS, for instance, would do this during VFS operations such as open() or unlink(). The key is then handed through when the call is initiated. (3) Request the use of something other than GFP_KERNEL to allocate memory. (4) Avoid the overhead of using the recvmsg() call. RxRPC messages can be intercepted before they get put into the socket Rx queue and the socket buffers manipulated directly. To use the RxRPC facility, a kernel utility must still open an AF_RXRPC socket, bind an addess as appropriate and listen if it's to be a server socket, but then it passes this to the kernel interface functions. The kernel interface functions are as follows: (*) Begin a new client call. struct rxrpc_call * rxrpc_kernel_begin_call(struct socket *sock, struct sockaddr_rxrpc *srx, struct key *key, unsigned long user_call_ID, gfp_t gfp); This allocates the infrastructure to make a new RxRPC call and assigns call and connection numbers. The call will be made on the UDP port that the socket is bound to. The call will go to the destination address of a connected client socket unless an alternative is supplied (srx is non-NULL). If a key is supplied then this will be used to secure the call instead of the key bound to the socket with the RXRPC_SECURITY_KEY sockopt. Calls secured in this way will still share connections if at all possible. The user_call_ID is equivalent to that supplied to sendmsg() in the control data buffer. It is entirely feasible to use this to point to a kernel data structure. If this function is successful, an opaque reference to the RxRPC call is returned. The caller now holds a reference on this and it must be properly ended. (*) End a client call. void rxrpc_kernel_end_call(struct rxrpc_call *call); This is used to end a previously begun call. The user_call_ID is expunged from AF_RXRPC's knowledge and will not be seen again in association with the specified call. (*) Send data through a call. int rxrpc_kernel_send_data(struct rxrpc_call *call, struct msghdr *msg, size_t len); This is used to supply either the request part of a client call or the reply part of a server call. msg.msg_iovlen and msg.msg_iov specify the data buffers to be used. msg_iov may not be NULL and must point exclusively to in-kernel virtual addresses. msg.msg_flags may be given MSG_MORE if there will be subsequent data sends for this call. The msg must not specify a destination address, control data or any flags other than MSG_MORE. len is the total amount of data to transmit. (*) Abort a call. void rxrpc_kernel_abort_call(struct rxrpc_call *call, u32 abort_code); This is used to abort a call if it's still in an abortable state. The abort code specified will be placed in the ABORT message sent. (*) Intercept received RxRPC messages. typedef void (*rxrpc_interceptor_t)(struct sock *sk, unsigned long user_call_ID, struct sk_buff *skb); void rxrpc_kernel_intercept_rx_messages(struct socket *sock, rxrpc_interceptor_t interceptor); This installs an interceptor function on the specified AF_RXRPC socket. All messages that would otherwise wind up in the socket's Rx queue are then diverted to this function. Note that care must be taken to process the messages in the right order to maintain DATA message sequentiality. The interceptor function itself is provided with the address of the socket and handling the incoming message, the ID assigned by the kernel utility to the call and the socket buffer containing the message. The skb->mark field indicates the type of message: MARK MEANING =============================== ======================================= RXRPC_SKB_MARK_DATA Data message RXRPC_SKB_MARK_FINAL_ACK Final ACK received for an incoming call RXRPC_SKB_MARK_BUSY Client call rejected as server busy RXRPC_SKB_MARK_REMOTE_ABORT Call aborted by peer RXRPC_SKB_MARK_NET_ERROR Network error detected RXRPC_SKB_MARK_LOCAL_ERROR Local error encountered RXRPC_SKB_MARK_NEW_CALL New incoming call awaiting acceptance The remote abort message can be probed with rxrpc_kernel_get_abort_code(). The two error messages can be probed with rxrpc_kernel_get_error_number(). A new call can be accepted with rxrpc_kernel_accept_call(). Data messages can have their contents extracted with the usual bunch of socket buffer manipulation functions. A data message can be determined to be the last one in a sequence with rxrpc_kernel_is_data_last(). When a data message has been used up, rxrpc_kernel_data_delivered() should be called on it.. Non-data messages should be handled to rxrpc_kernel_free_skb() to dispose of. It is possible to get extra refs on all types of message for later freeing, but this may pin the state of a call until the message is finally freed. (*) Accept an incoming call. struct rxrpc_call * rxrpc_kernel_accept_call(struct socket *sock, unsigned long user_call_ID); This is used to accept an incoming call and to assign it a call ID. This function is similar to rxrpc_kernel_begin_call() and calls accepted must be ended in the same way. If this function is successful, an opaque reference to the RxRPC call is returned. The caller now holds a reference on this and it must be properly ended. (*) Reject an incoming call. int rxrpc_kernel_reject_call(struct socket *sock); This is used to reject the first incoming call on the socket's queue with a BUSY message. -ENODATA is returned if there were no incoming calls. Other errors may be returned if the call had been aborted (-ECONNABORTED) or had timed out (-ETIME). (*) Record the delivery of a data message and free it. void rxrpc_kernel_data_delivered(struct sk_buff *skb); This is used to record a data message as having been delivered and to update the ACK state for the call. The socket buffer will be freed. (*) Free a message. void rxrpc_kernel_free_skb(struct sk_buff *skb); This is used to free a non-DATA socket buffer intercepted from an AF_RXRPC socket. (*) Determine if a data message is the last one on a call. bool rxrpc_kernel_is_data_last(struct sk_buff *skb); This is used to determine if a socket buffer holds the last data message to be received for a call (true will be returned if it does, false if not). The data message will be part of the reply on a client call and the request on an incoming call. In the latter case there will be more messages, but in the former case there will not. (*) Get the abort code from an abort message. u32 rxrpc_kernel_get_abort_code(struct sk_buff *skb); This is used to extract the abort code from a remote abort message. (*) Get the error number from a local or network error message. int rxrpc_kernel_get_error_number(struct sk_buff *skb); This is used to extract the error number from a message indicating either a local error occurred or a network error occurred. Signed-off-by: David Howells <dhowells@redhat.com> Signed-off-by: David S. Miller <davem@davemloft.net>
2007-04-26 16:50:17 -06:00
* af_rxrpc.c
*/
extern atomic_t rxrpc_n_tx_skbs, rxrpc_n_rx_skbs;
[AF_RXRPC]: Add an interface to the AF_RXRPC module for the AFS filesystem to use Add an interface to the AF_RXRPC module so that the AFS filesystem module can more easily make use of the services available. AFS still opens a socket but then uses the action functions in lieu of sendmsg() and registers an intercept functions to grab messages before they're queued on the socket Rx queue. This permits AFS (or whatever) to: (1) Avoid the overhead of using the recvmsg() call. (2) Use different keys directly on individual client calls on one socket rather than having to open a whole slew of sockets, one for each key it might want to use. (3) Avoid calling request_key() at the point of issue of a call or opening of a socket. This is done instead by AFS at the point of open(), unlink() or other VFS operation and the key handed through. (4) Request the use of something other than GFP_KERNEL to allocate memory. Furthermore: (*) The socket buffer markings used by RxRPC are made available for AFS so that it can interpret the cooked RxRPC messages itself. (*) rxgen (un)marshalling abort codes are made available. The following documentation for the kernel interface is added to Documentation/networking/rxrpc.txt: ========================= AF_RXRPC KERNEL INTERFACE ========================= The AF_RXRPC module also provides an interface for use by in-kernel utilities such as the AFS filesystem. This permits such a utility to: (1) Use different keys directly on individual client calls on one socket rather than having to open a whole slew of sockets, one for each key it might want to use. (2) Avoid having RxRPC call request_key() at the point of issue of a call or opening of a socket. Instead the utility is responsible for requesting a key at the appropriate point. AFS, for instance, would do this during VFS operations such as open() or unlink(). The key is then handed through when the call is initiated. (3) Request the use of something other than GFP_KERNEL to allocate memory. (4) Avoid the overhead of using the recvmsg() call. RxRPC messages can be intercepted before they get put into the socket Rx queue and the socket buffers manipulated directly. To use the RxRPC facility, a kernel utility must still open an AF_RXRPC socket, bind an addess as appropriate and listen if it's to be a server socket, but then it passes this to the kernel interface functions. The kernel interface functions are as follows: (*) Begin a new client call. struct rxrpc_call * rxrpc_kernel_begin_call(struct socket *sock, struct sockaddr_rxrpc *srx, struct key *key, unsigned long user_call_ID, gfp_t gfp); This allocates the infrastructure to make a new RxRPC call and assigns call and connection numbers. The call will be made on the UDP port that the socket is bound to. The call will go to the destination address of a connected client socket unless an alternative is supplied (srx is non-NULL). If a key is supplied then this will be used to secure the call instead of the key bound to the socket with the RXRPC_SECURITY_KEY sockopt. Calls secured in this way will still share connections if at all possible. The user_call_ID is equivalent to that supplied to sendmsg() in the control data buffer. It is entirely feasible to use this to point to a kernel data structure. If this function is successful, an opaque reference to the RxRPC call is returned. The caller now holds a reference on this and it must be properly ended. (*) End a client call. void rxrpc_kernel_end_call(struct rxrpc_call *call); This is used to end a previously begun call. The user_call_ID is expunged from AF_RXRPC's knowledge and will not be seen again in association with the specified call. (*) Send data through a call. int rxrpc_kernel_send_data(struct rxrpc_call *call, struct msghdr *msg, size_t len); This is used to supply either the request part of a client call or the reply part of a server call. msg.msg_iovlen and msg.msg_iov specify the data buffers to be used. msg_iov may not be NULL and must point exclusively to in-kernel virtual addresses. msg.msg_flags may be given MSG_MORE if there will be subsequent data sends for this call. The msg must not specify a destination address, control data or any flags other than MSG_MORE. len is the total amount of data to transmit. (*) Abort a call. void rxrpc_kernel_abort_call(struct rxrpc_call *call, u32 abort_code); This is used to abort a call if it's still in an abortable state. The abort code specified will be placed in the ABORT message sent. (*) Intercept received RxRPC messages. typedef void (*rxrpc_interceptor_t)(struct sock *sk, unsigned long user_call_ID, struct sk_buff *skb); void rxrpc_kernel_intercept_rx_messages(struct socket *sock, rxrpc_interceptor_t interceptor); This installs an interceptor function on the specified AF_RXRPC socket. All messages that would otherwise wind up in the socket's Rx queue are then diverted to this function. Note that care must be taken to process the messages in the right order to maintain DATA message sequentiality. The interceptor function itself is provided with the address of the socket and handling the incoming message, the ID assigned by the kernel utility to the call and the socket buffer containing the message. The skb->mark field indicates the type of message: MARK MEANING =============================== ======================================= RXRPC_SKB_MARK_DATA Data message RXRPC_SKB_MARK_FINAL_ACK Final ACK received for an incoming call RXRPC_SKB_MARK_BUSY Client call rejected as server busy RXRPC_SKB_MARK_REMOTE_ABORT Call aborted by peer RXRPC_SKB_MARK_NET_ERROR Network error detected RXRPC_SKB_MARK_LOCAL_ERROR Local error encountered RXRPC_SKB_MARK_NEW_CALL New incoming call awaiting acceptance The remote abort message can be probed with rxrpc_kernel_get_abort_code(). The two error messages can be probed with rxrpc_kernel_get_error_number(). A new call can be accepted with rxrpc_kernel_accept_call(). Data messages can have their contents extracted with the usual bunch of socket buffer manipulation functions. A data message can be determined to be the last one in a sequence with rxrpc_kernel_is_data_last(). When a data message has been used up, rxrpc_kernel_data_delivered() should be called on it.. Non-data messages should be handled to rxrpc_kernel_free_skb() to dispose of. It is possible to get extra refs on all types of message for later freeing, but this may pin the state of a call until the message is finally freed. (*) Accept an incoming call. struct rxrpc_call * rxrpc_kernel_accept_call(struct socket *sock, unsigned long user_call_ID); This is used to accept an incoming call and to assign it a call ID. This function is similar to rxrpc_kernel_begin_call() and calls accepted must be ended in the same way. If this function is successful, an opaque reference to the RxRPC call is returned. The caller now holds a reference on this and it must be properly ended. (*) Reject an incoming call. int rxrpc_kernel_reject_call(struct socket *sock); This is used to reject the first incoming call on the socket's queue with a BUSY message. -ENODATA is returned if there were no incoming calls. Other errors may be returned if the call had been aborted (-ECONNABORTED) or had timed out (-ETIME). (*) Record the delivery of a data message and free it. void rxrpc_kernel_data_delivered(struct sk_buff *skb); This is used to record a data message as having been delivered and to update the ACK state for the call. The socket buffer will be freed. (*) Free a message. void rxrpc_kernel_free_skb(struct sk_buff *skb); This is used to free a non-DATA socket buffer intercepted from an AF_RXRPC socket. (*) Determine if a data message is the last one on a call. bool rxrpc_kernel_is_data_last(struct sk_buff *skb); This is used to determine if a socket buffer holds the last data message to be received for a call (true will be returned if it does, false if not). The data message will be part of the reply on a client call and the request on an incoming call. In the latter case there will be more messages, but in the former case there will not. (*) Get the abort code from an abort message. u32 rxrpc_kernel_get_abort_code(struct sk_buff *skb); This is used to extract the abort code from a remote abort message. (*) Get the error number from a local or network error message. int rxrpc_kernel_get_error_number(struct sk_buff *skb); This is used to extract the error number from a message indicating either a local error occurred or a network error occurred. Signed-off-by: David Howells <dhowells@redhat.com> Signed-off-by: David S. Miller <davem@davemloft.net>
2007-04-26 16:50:17 -06:00
extern atomic_t rxrpc_debug_id;
extern struct workqueue_struct *rxrpc_workqueue;
/*
* call_accept.c
*/
rxrpc: Preallocate peers, conns and calls for incoming service requests Make it possible for the data_ready handler called from the UDP transport socket to completely instantiate an rxrpc_call structure and make it immediately live by preallocating all the memory it might need. The idea is to cut out the background thread usage as much as possible. [Note that the preallocated structs are not actually used in this patch - that will be done in a future patch.] If insufficient resources are available in the preallocation buffers, it will be possible to discard the DATA packet in the data_ready handler or schedule a BUSY packet without the need to schedule an attempt at allocation in a background thread. To this end: (1) Preallocate rxrpc_peer, rxrpc_connection and rxrpc_call structs to a maximum number each of the listen backlog size. The backlog size is limited to a maxmimum of 32. Only this many of each can be in the preallocation buffer. (2) For userspace sockets, the preallocation is charged initially by listen() and will be recharged by accepting or rejecting pending new incoming calls. (3) For kernel services {,re,dis}charging of the preallocation buffers is handled manually. Two notifier callbacks have to be provided before kernel_listen() is invoked: (a) An indication that a new call has been instantiated. This can be used to trigger background recharging. (b) An indication that a call is being discarded. This is used when the socket is being released. A function, rxrpc_kernel_charge_accept() is called by the kernel service to preallocate a single call. It should be passed the user ID to be used for that call and a callback to associate the rxrpc call with the kernel service's side of the ID. (4) Discard the preallocation when the socket is closed. (5) Temporarily bump the refcount on the call allocated in rxrpc_incoming_call() so that rxrpc_release_call() can ditch the preallocation ref on service calls unconditionally. This will no longer be necessary once the preallocation is used. Note that this does not yet control the number of active service calls on a client - that will come in a later patch. A future development would be to provide a setsockopt() call that allows a userspace server to manually charge the preallocation buffer. This would allow user call IDs to be provided in advance and the awkward manual accept stage to be bypassed. Signed-off-by: David Howells <dhowells@redhat.com>
2016-09-08 04:10:12 -06:00
int rxrpc_service_prealloc(struct rxrpc_sock *, gfp_t);
void rxrpc_discard_prealloc(struct rxrpc_sock *);
rxrpc: Rewrite the data and ack handling code Rewrite the data and ack handling code such that: (1) Parsing of received ACK and ABORT packets and the distribution and the filing of DATA packets happens entirely within the data_ready context called from the UDP socket. This allows us to process and discard ACK and ABORT packets much more quickly (they're no longer stashed on a queue for a background thread to process). (2) We avoid calling skb_clone(), pskb_pull() and pskb_trim(). We instead keep track of the offset and length of the content of each packet in the sk_buff metadata. This means we don't do any allocation in the receive path. (3) Jumbo DATA packet parsing is now done in data_ready context. Rather than cloning the packet once for each subpacket and pulling/trimming it, we file the packet multiple times with an annotation for each indicating which subpacket is there. From that we can directly calculate the offset and length. (4) A call's receive queue can be accessed without taking locks (memory barriers do have to be used, though). (5) Incoming calls are set up from preallocated resources and immediately made live. They can than have packets queued upon them and ACKs generated. If insufficient resources exist, DATA packet #1 is given a BUSY reply and other DATA packets are discarded). (6) sk_buffs no longer take a ref on their parent call. To make this work, the following changes are made: (1) Each call's receive buffer is now a circular buffer of sk_buff pointers (rxtx_buffer) rather than a number of sk_buff_heads spread between the call and the socket. This permits each sk_buff to be in the buffer multiple times. The receive buffer is reused for the transmit buffer. (2) A circular buffer of annotations (rxtx_annotations) is kept parallel to the data buffer. Transmission phase annotations indicate whether a buffered packet has been ACK'd or not and whether it needs retransmission. Receive phase annotations indicate whether a slot holds a whole packet or a jumbo subpacket and, if the latter, which subpacket. They also note whether the packet has been decrypted in place. (3) DATA packet window tracking is much simplified. Each phase has just two numbers representing the window (rx_hard_ack/rx_top and tx_hard_ack/tx_top). The hard_ack number is the sequence number before base of the window, representing the last packet the other side says it has consumed. hard_ack starts from 0 and the first packet is sequence number 1. The top number is the sequence number of the highest-numbered packet residing in the buffer. Packets between hard_ack+1 and top are soft-ACK'd to indicate they've been received, but not yet consumed. Four macros, before(), before_eq(), after() and after_eq() are added to compare sequence numbers within the window. This allows for the top of the window to wrap when the hard-ack sequence number gets close to the limit. Two flags, RXRPC_CALL_RX_LAST and RXRPC_CALL_TX_LAST, are added also to indicate when rx_top and tx_top point at the packets with the LAST_PACKET bit set, indicating the end of the phase. (4) Calls are queued on the socket 'receive queue' rather than packets. This means that we don't need have to invent dummy packets to queue to indicate abnormal/terminal states and we don't have to keep metadata packets (such as ABORTs) around (5) The offset and length of a (sub)packet's content are now passed to the verify_packet security op. This is currently expected to decrypt the packet in place and validate it. However, there's now nowhere to store the revised offset and length of the actual data within the decrypted blob (there may be a header and padding to skip) because an sk_buff may represent multiple packets, so a locate_data security op is added to retrieve these details from the sk_buff content when needed. (6) recvmsg() now has to handle jumbo subpackets, where each subpacket is individually secured and needs to be individually decrypted. The code to do this is broken out into rxrpc_recvmsg_data() and shared with the kernel API. It now iterates over the call's receive buffer rather than walking the socket receive queue. Additional changes: (1) The timers are condensed to a single timer that is set for the soonest of three timeouts (delayed ACK generation, DATA retransmission and call lifespan). (2) Transmission of ACK and ABORT packets is effected immediately from process-context socket ops/kernel API calls that cause them instead of them being punted off to a background work item. The data_ready handler still has to defer to the background, though. (3) A shutdown op is added to the AF_RXRPC socket so that the AFS filesystem can shut down the socket and flush its own work items before closing the socket to deal with any in-progress service calls. Future additional changes that will need to be considered: (1) Make sure that a call doesn't hog the front of the queue by receiving data from the network as fast as userspace is consuming it to the exclusion of other calls. (2) Transmit delayed ACKs from within recvmsg() when we've consumed sufficiently more packets to avoid the background work item needing to run. Signed-off-by: David Howells <dhowells@redhat.com>
2016-09-08 04:10:12 -06:00
struct rxrpc_call *rxrpc_new_incoming_call(struct rxrpc_local *,
struct rxrpc_connection *,
struct sk_buff *);
rxrpc: Rework local endpoint management Rework the local RxRPC endpoint management. Local endpoint objects are maintained in a flat list as before. This should be okay as there shouldn't be more than one per open AF_RXRPC socket (there can be fewer as local endpoints can be shared if their local service ID is 0 and they share the same local transport parameters). Changes: (1) Local endpoints may now only be shared if they have local service ID 0 (ie. they're not being used for listening). This prevents a scenario where process A is listening of the Cache Manager port and process B contacts a fileserver - which may then attempt to send CM requests back to B. But if A and B are sharing a local endpoint, A will get the CM requests meant for B. (2) We use a mutex to handle lookups and don't provide RCU-only lookups since we only expect to access the list when opening a socket or destroying an endpoint. The local endpoint object is pointed to by the transport socket's sk_user_data for the life of the transport socket - allowing us to refer to it directly from the sk_data_ready and sk_error_report callbacks. (3) atomic_inc_not_zero() now exists and can be used to only share a local endpoint if the last reference hasn't yet gone. (4) We can remove rxrpc_local_lock - a spinlock that had to be taken with BH processing disabled given that we assume sk_user_data won't change under us. (5) The transport socket is shut down before we clear the sk_user_data pointer so that we can be sure that the transport socket's callbacks won't be invoked once the RCU destruction is scheduled. (6) Local endpoints have a work item that handles both destruction and event processing. The means that destruction doesn't then need to wait for event processing. The event queues can then be cleared after the transport socket is shut down. (7) Local endpoints are no longer available for resurrection beyond the life of the sockets that had them open. As soon as their last ref goes, they are scheduled for destruction and may not have their usage count moved from 0. Signed-off-by: David Howells <dhowells@redhat.com>
2016-04-04 07:00:35 -06:00
void rxrpc_accept_incoming_calls(struct rxrpc_local *);
rxrpc: Don't expose skbs to in-kernel users [ver #2] Don't expose skbs to in-kernel users, such as the AFS filesystem, but instead provide a notification hook the indicates that a call needs attention and another that indicates that there's a new call to be collected. This makes the following possibilities more achievable: (1) Call refcounting can be made simpler if skbs don't hold refs to calls. (2) skbs referring to non-data events will be able to be freed much sooner rather than being queued for AFS to pick up as rxrpc_kernel_recv_data will be able to consult the call state. (3) We can shortcut the receive phase when a call is remotely aborted because we don't have to go through all the packets to get to the one cancelling the operation. (4) It makes it easier to do encryption/decryption directly between AFS's buffers and sk_buffs. (5) Encryption/decryption can more easily be done in the AFS's thread contexts - usually that of the userspace process that issued a syscall - rather than in one of rxrpc's background threads on a workqueue. (6) AFS will be able to wait synchronously on a call inside AF_RXRPC. To make this work, the following interface function has been added: int rxrpc_kernel_recv_data( struct socket *sock, struct rxrpc_call *call, void *buffer, size_t bufsize, size_t *_offset, bool want_more, u32 *_abort_code); This is the recvmsg equivalent. It allows the caller to find out about the state of a specific call and to transfer received data into a buffer piecemeal. afs_extract_data() and rxrpc_kernel_recv_data() now do all the extraction logic between them. They don't wait synchronously yet because the socket lock needs to be dealt with. Five interface functions have been removed: rxrpc_kernel_is_data_last() rxrpc_kernel_get_abort_code() rxrpc_kernel_get_error_number() rxrpc_kernel_free_skb() rxrpc_kernel_data_consumed() As a temporary hack, sk_buffs going to an in-kernel call are queued on the rxrpc_call struct (->knlrecv_queue) rather than being handed over to the in-kernel user. To process the queue internally, a temporary function, temp_deliver_data() has been added. This will be replaced with common code between the rxrpc_recvmsg() path and the kernel_rxrpc_recv_data() path in a future patch. Signed-off-by: David Howells <dhowells@redhat.com> Signed-off-by: David S. Miller <davem@davemloft.net>
2016-08-30 13:42:14 -06:00
struct rxrpc_call *rxrpc_accept_call(struct rxrpc_sock *, unsigned long,
rxrpc_notify_rx_t);
int rxrpc_reject_call(struct rxrpc_sock *);
/*
* call_event.c
*/
void __rxrpc_set_timer(struct rxrpc_call *, enum rxrpc_timer_trace, ktime_t);
void rxrpc_set_timer(struct rxrpc_call *, enum rxrpc_timer_trace, ktime_t);
void rxrpc_propose_ACK(struct rxrpc_call *, u8, u16, u32, bool, bool,
enum rxrpc_propose_ack_trace);
void rxrpc_process_call(struct work_struct *);
/*
* call_object.c
*/
extern const char *const rxrpc_call_states[];
extern const char *const rxrpc_call_completions[];
extern unsigned int rxrpc_max_call_lifetime;
extern struct kmem_cache *rxrpc_call_jar;
struct rxrpc_call *rxrpc_find_call_by_user_ID(struct rxrpc_sock *, unsigned long);
rxrpc: Provide a different lockdep key for call->user_mutex for kernel calls [ Upstream commit 9faaff593404a9c4e5abc6839a641635d7b9d0cd ] Provide a different lockdep key for rxrpc_call::user_mutex when the call is made on a kernel socket, such as by the AFS filesystem. The problem is that lockdep registers a false positive between userspace calling the sendmsg syscall on a user socket where call->user_mutex is held whilst userspace memory is accessed whereas the AFS filesystem may perform operations with mmap_sem held by the caller. In such a case, the following warning is produced. ====================================================== WARNING: possible circular locking dependency detected 4.14.0-fscache+ #243 Tainted: G E ------------------------------------------------------ modpost/16701 is trying to acquire lock: (&vnode->io_lock){+.+.}, at: [<ffffffffa000fc40>] afs_begin_vnode_operation+0x33/0x77 [kafs] but task is already holding lock: (&mm->mmap_sem){++++}, at: [<ffffffff8104376a>] __do_page_fault+0x1ef/0x486 which lock already depends on the new lock. the existing dependency chain (in reverse order) is: -> #3 (&mm->mmap_sem){++++}: __might_fault+0x61/0x89 _copy_from_iter_full+0x40/0x1fa rxrpc_send_data+0x8dc/0xff3 rxrpc_do_sendmsg+0x62f/0x6a1 rxrpc_sendmsg+0x166/0x1b7 sock_sendmsg+0x2d/0x39 ___sys_sendmsg+0x1ad/0x22b __sys_sendmsg+0x41/0x62 do_syscall_64+0x89/0x1be return_from_SYSCALL_64+0x0/0x75 -> #2 (&call->user_mutex){+.+.}: __mutex_lock+0x86/0x7d2 rxrpc_new_client_call+0x378/0x80e rxrpc_kernel_begin_call+0xf3/0x154 afs_make_call+0x195/0x454 [kafs] afs_vl_get_capabilities+0x193/0x198 [kafs] afs_vl_lookup_vldb+0x5f/0x151 [kafs] afs_create_volume+0x2e/0x2f4 [kafs] afs_mount+0x56a/0x8d7 [kafs] mount_fs+0x6a/0x109 vfs_kern_mount+0x67/0x135 do_mount+0x90b/0xb57 SyS_mount+0x72/0x98 do_syscall_64+0x89/0x1be return_from_SYSCALL_64+0x0/0x75 -> #1 (k-sk_lock-AF_RXRPC){+.+.}: lock_sock_nested+0x74/0x8a rxrpc_kernel_begin_call+0x8a/0x154 afs_make_call+0x195/0x454 [kafs] afs_fs_get_capabilities+0x17a/0x17f [kafs] afs_probe_fileserver+0xf7/0x2f0 [kafs] afs_select_fileserver+0x83f/0x903 [kafs] afs_fetch_status+0x89/0x11d [kafs] afs_iget+0x16f/0x4f8 [kafs] afs_mount+0x6c6/0x8d7 [kafs] mount_fs+0x6a/0x109 vfs_kern_mount+0x67/0x135 do_mount+0x90b/0xb57 SyS_mount+0x72/0x98 do_syscall_64+0x89/0x1be return_from_SYSCALL_64+0x0/0x75 -> #0 (&vnode->io_lock){+.+.}: lock_acquire+0x174/0x19f __mutex_lock+0x86/0x7d2 afs_begin_vnode_operation+0x33/0x77 [kafs] afs_fetch_data+0x80/0x12a [kafs] afs_readpages+0x314/0x405 [kafs] __do_page_cache_readahead+0x203/0x2ba filemap_fault+0x179/0x54d __do_fault+0x17/0x60 __handle_mm_fault+0x6d7/0x95c handle_mm_fault+0x24e/0x2a3 __do_page_fault+0x301/0x486 do_page_fault+0x236/0x259 page_fault+0x22/0x30 __clear_user+0x3d/0x60 padzero+0x1c/0x2b load_elf_binary+0x785/0xdc7 search_binary_handler+0x81/0x1ff do_execveat_common.isra.14+0x600/0x888 do_execve+0x1f/0x21 SyS_execve+0x28/0x2f do_syscall_64+0x89/0x1be return_from_SYSCALL_64+0x0/0x75 other info that might help us debug this: Chain exists of: &vnode->io_lock --> &call->user_mutex --> &mm->mmap_sem Possible unsafe locking scenario: CPU0 CPU1 ---- ---- lock(&mm->mmap_sem); lock(&call->user_mutex); lock(&mm->mmap_sem); lock(&vnode->io_lock); *** DEADLOCK *** 1 lock held by modpost/16701: #0: (&mm->mmap_sem){++++}, at: [<ffffffff8104376a>] __do_page_fault+0x1ef/0x486 stack backtrace: CPU: 0 PID: 16701 Comm: modpost Tainted: G E 4.14.0-fscache+ #243 Hardware name: ASUS All Series/H97-PLUS, BIOS 2306 10/09/2014 Call Trace: dump_stack+0x67/0x8e print_circular_bug+0x341/0x34f check_prev_add+0x11f/0x5d4 ? add_lock_to_list.isra.12+0x8b/0x8b ? add_lock_to_list.isra.12+0x8b/0x8b ? __lock_acquire+0xf77/0x10b4 __lock_acquire+0xf77/0x10b4 lock_acquire+0x174/0x19f ? afs_begin_vnode_operation+0x33/0x77 [kafs] __mutex_lock+0x86/0x7d2 ? afs_begin_vnode_operation+0x33/0x77 [kafs] ? afs_begin_vnode_operation+0x33/0x77 [kafs] ? afs_begin_vnode_operation+0x33/0x77 [kafs] afs_begin_vnode_operation+0x33/0x77 [kafs] afs_fetch_data+0x80/0x12a [kafs] afs_readpages+0x314/0x405 [kafs] __do_page_cache_readahead+0x203/0x2ba ? filemap_fault+0x179/0x54d filemap_fault+0x179/0x54d __do_fault+0x17/0x60 __handle_mm_fault+0x6d7/0x95c handle_mm_fault+0x24e/0x2a3 __do_page_fault+0x301/0x486 do_page_fault+0x236/0x259 page_fault+0x22/0x30 RIP: 0010:__clear_user+0x3d/0x60 RSP: 0018:ffff880071e93da0 EFLAGS: 00010202 RAX: 0000000000000000 RBX: 000000000000011c RCX: 000000000000011c RDX: 0000000000000000 RSI: 0000000000000008 RDI: 000000000060f720 RBP: 000000000060f720 R08: 0000000000000001 R09: 0000000000000000 R10: 0000000000000001 R11: ffff8800b5459b68 R12: ffff8800ce150e00 R13: 000000000060f720 R14: 00000000006127a8 R15: 0000000000000000 padzero+0x1c/0x2b load_elf_binary+0x785/0xdc7 search_binary_handler+0x81/0x1ff do_execveat_common.isra.14+0x600/0x888 do_execve+0x1f/0x21 SyS_execve+0x28/0x2f do_syscall_64+0x89/0x1be entry_SYSCALL64_slow_path+0x25/0x25 RIP: 0033:0x7fdb6009ee07 RSP: 002b:00007fff566d9728 EFLAGS: 00000246 ORIG_RAX: 000000000000003b RAX: ffffffffffffffda RBX: 000055ba57280900 RCX: 00007fdb6009ee07 RDX: 000055ba5727f270 RSI: 000055ba5727cac0 RDI: 000055ba57280900 RBP: 000055ba57280900 R08: 00007fff566d9700 R09: 0000000000000000 R10: 000055ba5727cac0 R11: 0000000000000246 R12: 0000000000000000 R13: 000055ba5727cac0 R14: 000055ba5727f270 R15: 0000000000000000 Signed-off-by: David Howells <dhowells@redhat.com> Signed-off-by: Sasha Levin <alexander.levin@verizon.com> Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
2017-11-24 03:18:40 -07:00
struct rxrpc_call *rxrpc_alloc_call(struct rxrpc_sock *, gfp_t);
struct rxrpc_call *rxrpc_new_client_call(struct rxrpc_sock *,
struct rxrpc_conn_parameters *,
struct sockaddr_rxrpc *,
unsigned long, s64, gfp_t);
int rxrpc_retry_client_call(struct rxrpc_sock *,
struct rxrpc_call *,
struct rxrpc_conn_parameters *,
struct sockaddr_rxrpc *,
gfp_t);
rxrpc: Rewrite the data and ack handling code Rewrite the data and ack handling code such that: (1) Parsing of received ACK and ABORT packets and the distribution and the filing of DATA packets happens entirely within the data_ready context called from the UDP socket. This allows us to process and discard ACK and ABORT packets much more quickly (they're no longer stashed on a queue for a background thread to process). (2) We avoid calling skb_clone(), pskb_pull() and pskb_trim(). We instead keep track of the offset and length of the content of each packet in the sk_buff metadata. This means we don't do any allocation in the receive path. (3) Jumbo DATA packet parsing is now done in data_ready context. Rather than cloning the packet once for each subpacket and pulling/trimming it, we file the packet multiple times with an annotation for each indicating which subpacket is there. From that we can directly calculate the offset and length. (4) A call's receive queue can be accessed without taking locks (memory barriers do have to be used, though). (5) Incoming calls are set up from preallocated resources and immediately made live. They can than have packets queued upon them and ACKs generated. If insufficient resources exist, DATA packet #1 is given a BUSY reply and other DATA packets are discarded). (6) sk_buffs no longer take a ref on their parent call. To make this work, the following changes are made: (1) Each call's receive buffer is now a circular buffer of sk_buff pointers (rxtx_buffer) rather than a number of sk_buff_heads spread between the call and the socket. This permits each sk_buff to be in the buffer multiple times. The receive buffer is reused for the transmit buffer. (2) A circular buffer of annotations (rxtx_annotations) is kept parallel to the data buffer. Transmission phase annotations indicate whether a buffered packet has been ACK'd or not and whether it needs retransmission. Receive phase annotations indicate whether a slot holds a whole packet or a jumbo subpacket and, if the latter, which subpacket. They also note whether the packet has been decrypted in place. (3) DATA packet window tracking is much simplified. Each phase has just two numbers representing the window (rx_hard_ack/rx_top and tx_hard_ack/tx_top). The hard_ack number is the sequence number before base of the window, representing the last packet the other side says it has consumed. hard_ack starts from 0 and the first packet is sequence number 1. The top number is the sequence number of the highest-numbered packet residing in the buffer. Packets between hard_ack+1 and top are soft-ACK'd to indicate they've been received, but not yet consumed. Four macros, before(), before_eq(), after() and after_eq() are added to compare sequence numbers within the window. This allows for the top of the window to wrap when the hard-ack sequence number gets close to the limit. Two flags, RXRPC_CALL_RX_LAST and RXRPC_CALL_TX_LAST, are added also to indicate when rx_top and tx_top point at the packets with the LAST_PACKET bit set, indicating the end of the phase. (4) Calls are queued on the socket 'receive queue' rather than packets. This means that we don't need have to invent dummy packets to queue to indicate abnormal/terminal states and we don't have to keep metadata packets (such as ABORTs) around (5) The offset and length of a (sub)packet's content are now passed to the verify_packet security op. This is currently expected to decrypt the packet in place and validate it. However, there's now nowhere to store the revised offset and length of the actual data within the decrypted blob (there may be a header and padding to skip) because an sk_buff may represent multiple packets, so a locate_data security op is added to retrieve these details from the sk_buff content when needed. (6) recvmsg() now has to handle jumbo subpackets, where each subpacket is individually secured and needs to be individually decrypted. The code to do this is broken out into rxrpc_recvmsg_data() and shared with the kernel API. It now iterates over the call's receive buffer rather than walking the socket receive queue. Additional changes: (1) The timers are condensed to a single timer that is set for the soonest of three timeouts (delayed ACK generation, DATA retransmission and call lifespan). (2) Transmission of ACK and ABORT packets is effected immediately from process-context socket ops/kernel API calls that cause them instead of them being punted off to a background work item. The data_ready handler still has to defer to the background, though. (3) A shutdown op is added to the AF_RXRPC socket so that the AFS filesystem can shut down the socket and flush its own work items before closing the socket to deal with any in-progress service calls. Future additional changes that will need to be considered: (1) Make sure that a call doesn't hog the front of the queue by receiving data from the network as fast as userspace is consuming it to the exclusion of other calls. (2) Transmit delayed ACKs from within recvmsg() when we've consumed sufficiently more packets to avoid the background work item needing to run. Signed-off-by: David Howells <dhowells@redhat.com>
2016-09-08 04:10:12 -06:00
void rxrpc_incoming_call(struct rxrpc_sock *, struct rxrpc_call *,
struct sk_buff *);
rxrpc: Calls shouldn't hold socket refs rxrpc calls shouldn't hold refs on the sock struct. This was done so that the socket wouldn't go away whilst the call was in progress, such that the call could reach the socket's queues. However, we can mark the socket as requiring an RCU release and rely on the RCU read lock. To make this work, we do: (1) rxrpc_release_call() removes the call's call user ID. This is now only called from socket operations and not from the call processor: rxrpc_accept_call() / rxrpc_kernel_accept_call() rxrpc_reject_call() / rxrpc_kernel_reject_call() rxrpc_kernel_end_call() rxrpc_release_calls_on_socket() rxrpc_recvmsg() Though it is also called in the cleanup path of rxrpc_accept_incoming_call() before we assign a user ID. (2) Pass the socket pointer into rxrpc_release_call() rather than getting it from the call so that we can get rid of uninitialised calls. (3) Fix call processor queueing to pass a ref to the work queue and to release that ref at the end of the processor function (or to pass it back to the work queue if we have to requeue). (4) Skip out of the call processor function asap if the call is complete and don't requeue it if the call is complete. (5) Clean up the call immediately that the refcount reaches 0 rather than trying to defer it. Actual deallocation is deferred to RCU, however. (6) Don't hold socket refs for allocated calls. (7) Use the RCU read lock when queueing a message on a socket and treat the call's socket pointer according to RCU rules and check it for NULL. We also need to use the RCU read lock when viewing a call through procfs. (8) Transmit the final ACK/ABORT to a client call in rxrpc_release_call() if this hasn't been done yet so that we can then disconnect the call. Once the call is disconnected, it won't have any access to the connection struct and the UDP socket for the call work processor to be able to send the ACK. Terminal retransmission will be handled by the connection processor. (9) Release all calls immediately on the closing of a socket rather than trying to defer this. Incomplete calls will be aborted. The call refcount model is much simplified. Refs are held on the call by: (1) A socket's user ID tree. (2) A socket's incoming call secureq and acceptq. (3) A kernel service that has a call in progress. (4) A queued call work processor. We have to take care to put any call that we failed to queue. (5) sk_buffs on a socket's receive queue. A future patch will get rid of this. Whilst we're at it, we can do: (1) Get rid of the RXRPC_CALL_EV_RELEASE event. Release is now done entirely from the socket routines and never from the call's processor. (2) Get rid of the RXRPC_CALL_DEAD state. Calls now end in the RXRPC_CALL_COMPLETE state. (3) Get rid of the rxrpc_call::destroyer work item. Calls are now torn down when their refcount reaches 0 and then handed over to RCU for final cleanup. (4) Get rid of the rxrpc_call::deadspan timer. Calls are cleaned up immediately they're finished with and don't hang around. Post-completion retransmission is handled by the connection processor once the call is disconnected. (5) Get rid of the dead call expiry setting as there's no longer a timer to set. (6) rxrpc_destroy_all_calls() can just check that the call list is empty. Signed-off-by: David Howells <dhowells@redhat.com>
2016-09-07 02:19:31 -06:00
void rxrpc_release_call(struct rxrpc_sock *, struct rxrpc_call *);
int rxrpc_prepare_call_for_retry(struct rxrpc_sock *, struct rxrpc_call *);
void rxrpc_release_calls_on_socket(struct rxrpc_sock *);
rxrpc: Calls shouldn't hold socket refs rxrpc calls shouldn't hold refs on the sock struct. This was done so that the socket wouldn't go away whilst the call was in progress, such that the call could reach the socket's queues. However, we can mark the socket as requiring an RCU release and rely on the RCU read lock. To make this work, we do: (1) rxrpc_release_call() removes the call's call user ID. This is now only called from socket operations and not from the call processor: rxrpc_accept_call() / rxrpc_kernel_accept_call() rxrpc_reject_call() / rxrpc_kernel_reject_call() rxrpc_kernel_end_call() rxrpc_release_calls_on_socket() rxrpc_recvmsg() Though it is also called in the cleanup path of rxrpc_accept_incoming_call() before we assign a user ID. (2) Pass the socket pointer into rxrpc_release_call() rather than getting it from the call so that we can get rid of uninitialised calls. (3) Fix call processor queueing to pass a ref to the work queue and to release that ref at the end of the processor function (or to pass it back to the work queue if we have to requeue). (4) Skip out of the call processor function asap if the call is complete and don't requeue it if the call is complete. (5) Clean up the call immediately that the refcount reaches 0 rather than trying to defer it. Actual deallocation is deferred to RCU, however. (6) Don't hold socket refs for allocated calls. (7) Use the RCU read lock when queueing a message on a socket and treat the call's socket pointer according to RCU rules and check it for NULL. We also need to use the RCU read lock when viewing a call through procfs. (8) Transmit the final ACK/ABORT to a client call in rxrpc_release_call() if this hasn't been done yet so that we can then disconnect the call. Once the call is disconnected, it won't have any access to the connection struct and the UDP socket for the call work processor to be able to send the ACK. Terminal retransmission will be handled by the connection processor. (9) Release all calls immediately on the closing of a socket rather than trying to defer this. Incomplete calls will be aborted. The call refcount model is much simplified. Refs are held on the call by: (1) A socket's user ID tree. (2) A socket's incoming call secureq and acceptq. (3) A kernel service that has a call in progress. (4) A queued call work processor. We have to take care to put any call that we failed to queue. (5) sk_buffs on a socket's receive queue. A future patch will get rid of this. Whilst we're at it, we can do: (1) Get rid of the RXRPC_CALL_EV_RELEASE event. Release is now done entirely from the socket routines and never from the call's processor. (2) Get rid of the RXRPC_CALL_DEAD state. Calls now end in the RXRPC_CALL_COMPLETE state. (3) Get rid of the rxrpc_call::destroyer work item. Calls are now torn down when their refcount reaches 0 and then handed over to RCU for final cleanup. (4) Get rid of the rxrpc_call::deadspan timer. Calls are cleaned up immediately they're finished with and don't hang around. Post-completion retransmission is handled by the connection processor once the call is disconnected. (5) Get rid of the dead call expiry setting as there's no longer a timer to set. (6) rxrpc_destroy_all_calls() can just check that the call list is empty. Signed-off-by: David Howells <dhowells@redhat.com>
2016-09-07 02:19:31 -06:00
bool __rxrpc_queue_call(struct rxrpc_call *);
bool rxrpc_queue_call(struct rxrpc_call *);
void rxrpc_see_call(struct rxrpc_call *);
void rxrpc_get_call(struct rxrpc_call *, enum rxrpc_call_trace);
void rxrpc_put_call(struct rxrpc_call *, enum rxrpc_call_trace);
rxrpc: Preallocate peers, conns and calls for incoming service requests Make it possible for the data_ready handler called from the UDP transport socket to completely instantiate an rxrpc_call structure and make it immediately live by preallocating all the memory it might need. The idea is to cut out the background thread usage as much as possible. [Note that the preallocated structs are not actually used in this patch - that will be done in a future patch.] If insufficient resources are available in the preallocation buffers, it will be possible to discard the DATA packet in the data_ready handler or schedule a BUSY packet without the need to schedule an attempt at allocation in a background thread. To this end: (1) Preallocate rxrpc_peer, rxrpc_connection and rxrpc_call structs to a maximum number each of the listen backlog size. The backlog size is limited to a maxmimum of 32. Only this many of each can be in the preallocation buffer. (2) For userspace sockets, the preallocation is charged initially by listen() and will be recharged by accepting or rejecting pending new incoming calls. (3) For kernel services {,re,dis}charging of the preallocation buffers is handled manually. Two notifier callbacks have to be provided before kernel_listen() is invoked: (a) An indication that a new call has been instantiated. This can be used to trigger background recharging. (b) An indication that a call is being discarded. This is used when the socket is being released. A function, rxrpc_kernel_charge_accept() is called by the kernel service to preallocate a single call. It should be passed the user ID to be used for that call and a callback to associate the rxrpc call with the kernel service's side of the ID. (4) Discard the preallocation when the socket is closed. (5) Temporarily bump the refcount on the call allocated in rxrpc_incoming_call() so that rxrpc_release_call() can ditch the preallocation ref on service calls unconditionally. This will no longer be necessary once the preallocation is used. Note that this does not yet control the number of active service calls on a client - that will come in a later patch. A future development would be to provide a setsockopt() call that allows a userspace server to manually charge the preallocation buffer. This would allow user call IDs to be provided in advance and the awkward manual accept stage to be bypassed. Signed-off-by: David Howells <dhowells@redhat.com>
2016-09-08 04:10:12 -06:00
void rxrpc_cleanup_call(struct rxrpc_call *);
void rxrpc_destroy_all_calls(struct rxrpc_net *);
static inline bool rxrpc_is_service_call(const struct rxrpc_call *call)
{
return test_bit(RXRPC_CALL_IS_SERVICE, &call->flags);
}
static inline bool rxrpc_is_client_call(const struct rxrpc_call *call)
{
return !rxrpc_is_service_call(call);
}
/*
* Transition a call to the complete state.
*/
static inline bool __rxrpc_set_call_completion(struct rxrpc_call *call,
enum rxrpc_call_completion compl,
u32 abort_code,
int error)
{
if (call->state < RXRPC_CALL_COMPLETE) {
call->abort_code = abort_code;
call->error = error;
call->completion = compl,
call->state = RXRPC_CALL_COMPLETE;
wake_up(&call->waitq);
return true;
}
return false;
}
static inline bool rxrpc_set_call_completion(struct rxrpc_call *call,
enum rxrpc_call_completion compl,
u32 abort_code,
int error)
{
bool ret;
write_lock_bh(&call->state_lock);
ret = __rxrpc_set_call_completion(call, compl, abort_code, error);
write_unlock_bh(&call->state_lock);
return ret;
}
/*
* Record that a call successfully completed.
*/
static inline bool __rxrpc_call_completed(struct rxrpc_call *call)
{
return __rxrpc_set_call_completion(call, RXRPC_CALL_SUCCEEDED, 0, 0);
}
static inline bool rxrpc_call_completed(struct rxrpc_call *call)
{
bool ret;
write_lock_bh(&call->state_lock);
ret = __rxrpc_call_completed(call);
write_unlock_bh(&call->state_lock);
return ret;
}
/*
* Record that a call is locally aborted.
*/
static inline bool __rxrpc_abort_call(const char *why, struct rxrpc_call *call,
rxrpc_seq_t seq,
u32 abort_code, int error)
{
trace_rxrpc_abort(why, call->cid, call->call_id, seq,
abort_code, error);
rxrpc: Rewrite the data and ack handling code Rewrite the data and ack handling code such that: (1) Parsing of received ACK and ABORT packets and the distribution and the filing of DATA packets happens entirely within the data_ready context called from the UDP socket. This allows us to process and discard ACK and ABORT packets much more quickly (they're no longer stashed on a queue for a background thread to process). (2) We avoid calling skb_clone(), pskb_pull() and pskb_trim(). We instead keep track of the offset and length of the content of each packet in the sk_buff metadata. This means we don't do any allocation in the receive path. (3) Jumbo DATA packet parsing is now done in data_ready context. Rather than cloning the packet once for each subpacket and pulling/trimming it, we file the packet multiple times with an annotation for each indicating which subpacket is there. From that we can directly calculate the offset and length. (4) A call's receive queue can be accessed without taking locks (memory barriers do have to be used, though). (5) Incoming calls are set up from preallocated resources and immediately made live. They can than have packets queued upon them and ACKs generated. If insufficient resources exist, DATA packet #1 is given a BUSY reply and other DATA packets are discarded). (6) sk_buffs no longer take a ref on their parent call. To make this work, the following changes are made: (1) Each call's receive buffer is now a circular buffer of sk_buff pointers (rxtx_buffer) rather than a number of sk_buff_heads spread between the call and the socket. This permits each sk_buff to be in the buffer multiple times. The receive buffer is reused for the transmit buffer. (2) A circular buffer of annotations (rxtx_annotations) is kept parallel to the data buffer. Transmission phase annotations indicate whether a buffered packet has been ACK'd or not and whether it needs retransmission. Receive phase annotations indicate whether a slot holds a whole packet or a jumbo subpacket and, if the latter, which subpacket. They also note whether the packet has been decrypted in place. (3) DATA packet window tracking is much simplified. Each phase has just two numbers representing the window (rx_hard_ack/rx_top and tx_hard_ack/tx_top). The hard_ack number is the sequence number before base of the window, representing the last packet the other side says it has consumed. hard_ack starts from 0 and the first packet is sequence number 1. The top number is the sequence number of the highest-numbered packet residing in the buffer. Packets between hard_ack+1 and top are soft-ACK'd to indicate they've been received, but not yet consumed. Four macros, before(), before_eq(), after() and after_eq() are added to compare sequence numbers within the window. This allows for the top of the window to wrap when the hard-ack sequence number gets close to the limit. Two flags, RXRPC_CALL_RX_LAST and RXRPC_CALL_TX_LAST, are added also to indicate when rx_top and tx_top point at the packets with the LAST_PACKET bit set, indicating the end of the phase. (4) Calls are queued on the socket 'receive queue' rather than packets. This means that we don't need have to invent dummy packets to queue to indicate abnormal/terminal states and we don't have to keep metadata packets (such as ABORTs) around (5) The offset and length of a (sub)packet's content are now passed to the verify_packet security op. This is currently expected to decrypt the packet in place and validate it. However, there's now nowhere to store the revised offset and length of the actual data within the decrypted blob (there may be a header and padding to skip) because an sk_buff may represent multiple packets, so a locate_data security op is added to retrieve these details from the sk_buff content when needed. (6) recvmsg() now has to handle jumbo subpackets, where each subpacket is individually secured and needs to be individually decrypted. The code to do this is broken out into rxrpc_recvmsg_data() and shared with the kernel API. It now iterates over the call's receive buffer rather than walking the socket receive queue. Additional changes: (1) The timers are condensed to a single timer that is set for the soonest of three timeouts (delayed ACK generation, DATA retransmission and call lifespan). (2) Transmission of ACK and ABORT packets is effected immediately from process-context socket ops/kernel API calls that cause them instead of them being punted off to a background work item. The data_ready handler still has to defer to the background, though. (3) A shutdown op is added to the AF_RXRPC socket so that the AFS filesystem can shut down the socket and flush its own work items before closing the socket to deal with any in-progress service calls. Future additional changes that will need to be considered: (1) Make sure that a call doesn't hog the front of the queue by receiving data from the network as fast as userspace is consuming it to the exclusion of other calls. (2) Transmit delayed ACKs from within recvmsg() when we've consumed sufficiently more packets to avoid the background work item needing to run. Signed-off-by: David Howells <dhowells@redhat.com>
2016-09-08 04:10:12 -06:00
return __rxrpc_set_call_completion(call, RXRPC_CALL_LOCALLY_ABORTED,
abort_code, error);
}
static inline bool rxrpc_abort_call(const char *why, struct rxrpc_call *call,
rxrpc_seq_t seq, u32 abort_code, int error)
{
bool ret;
write_lock_bh(&call->state_lock);
ret = __rxrpc_abort_call(why, call, seq, abort_code, error);
write_unlock_bh(&call->state_lock);
return ret;
}
/*
* Abort a call due to a protocol error.
*/
static inline bool __rxrpc_abort_eproto(struct rxrpc_call *call,
struct sk_buff *skb,
const char *eproto_why,
const char *why,
u32 abort_code)
{
struct rxrpc_skb_priv *sp = rxrpc_skb(skb);
trace_rxrpc_rx_eproto(call, sp->hdr.serial, eproto_why);
return rxrpc_abort_call(why, call, sp->hdr.seq, abort_code, -EPROTO);
}
#define rxrpc_abort_eproto(call, skb, eproto_why, abort_why, abort_code) \
__rxrpc_abort_eproto((call), (skb), tracepoint_string(eproto_why), \
(abort_why), (abort_code))
2016-04-04 07:00:37 -06:00
/*
* conn_client.c
*/
rxrpc: Improve management and caching of client connection objects Improve the management and caching of client rxrpc connection objects. From this point, client connections will be managed separately from service connections because AF_RXRPC controls the creation and re-use of client connections but doesn't have that luxury with service connections. Further, there will be limits on the numbers of client connections that may be live on a machine. No direct restriction will be placed on the number of client calls, excepting that each client connection can support a maximum of four concurrent calls. Note that, for a number of reasons, we don't want to simply discard a client connection as soon as the last call is apparently finished: (1) Security is negotiated per-connection and the context is then shared between all calls on that connection. The context can be negotiated again if the connection lapses, but that involves holding up calls whilst at least two packets are exchanged and various crypto bits are performed - so we'd ideally like to cache it for a little while at least. (2) If a packet goes astray, we will need to retransmit a final ACK or ABORT packet. To make this work, we need to keep around the connection details for a little while. (3) The locally held structures represent some amount of setup time, to be weighed against their occupation of memory when idle. To this end, the client connection cache is managed by a state machine on each connection. There are five states: (1) INACTIVE - The connection is not held in any list and may not have been exposed to the world. If it has been previously exposed, it was discarded from the idle list after expiring. (2) WAITING - The connection is waiting for the number of client conns to drop below the maximum capacity. Calls may be in progress upon it from when it was active and got culled. The connection is on the rxrpc_waiting_client_conns list which is kept in to-be-granted order. Culled conns with waiters go to the back of the queue just like new conns. (3) ACTIVE - The connection has at least one call in progress upon it, it may freely grant available channels to new calls and calls may be waiting on it for channels to become available. The connection is on the rxrpc_active_client_conns list which is kept in activation order for culling purposes. (4) CULLED - The connection got summarily culled to try and free up capacity. Calls currently in progress on the connection are allowed to continue, but new calls will have to wait. There can be no waiters in this state - the conn would have to go to the WAITING state instead. (5) IDLE - The connection has no calls in progress upon it and must have been exposed to the world (ie. the EXPOSED flag must be set). When it expires, the EXPOSED flag is cleared and the connection transitions to the INACTIVE state. The connection is on the rxrpc_idle_client_conns list which is kept in order of how soon they'll expire. A connection in the ACTIVE or CULLED state must have at least one active call upon it; if in the WAITING state it may have active calls upon it; other states may not have active calls. As long as a connection remains active and doesn't get culled, it may continue to process calls - even if there are connections on the wait queue. This simplifies things a bit and reduces the amount of checking we need do. There are a couple flags of relevance to the cache: (1) EXPOSED - The connection ID got exposed to the world. If this flag is set, an extra ref is added to the connection preventing it from being reaped when it has no calls outstanding. This flag is cleared and the ref dropped when a conn is discarded from the idle list. (2) DONT_REUSE - The connection should be discarded as soon as possible and should not be reused. This commit also provides a number of new settings: (*) /proc/net/rxrpc/max_client_conns The maximum number of live client connections. Above this number, new connections get added to the wait list and must wait for an active conn to be culled. Culled connections can be reused, but they will go to the back of the wait list and have to wait. (*) /proc/net/rxrpc/reap_client_conns If the number of desired connections exceeds the maximum above, the active connection list will be culled until there are only this many left in it. (*) /proc/net/rxrpc/idle_conn_expiry The normal expiry time for a client connection, provided there are fewer than reap_client_conns of them around. (*) /proc/net/rxrpc/idle_conn_fast_expiry The expedited expiry time, used when there are more than reap_client_conns of them around. Note that I combined the Tx wait queue with the channel grant wait queue to save space as only one of these should be in use at once. Note also that, for the moment, the service connection cache still uses the old connection management code. Signed-off-by: David Howells <dhowells@redhat.com>
2016-08-24 00:30:52 -06:00
extern unsigned int rxrpc_max_client_connections;
extern unsigned int rxrpc_reap_client_connections;
extern unsigned int rxrpc_conn_idle_client_expiry;
extern unsigned int rxrpc_conn_idle_client_fast_expiry;
2016-04-04 07:00:37 -06:00
extern struct idr rxrpc_client_conn_ids;
void rxrpc_destroy_client_conn_ids(void);
int rxrpc_connect_call(struct rxrpc_call *, struct rxrpc_conn_parameters *,
struct sockaddr_rxrpc *, gfp_t);
rxrpc: Improve management and caching of client connection objects Improve the management and caching of client rxrpc connection objects. From this point, client connections will be managed separately from service connections because AF_RXRPC controls the creation and re-use of client connections but doesn't have that luxury with service connections. Further, there will be limits on the numbers of client connections that may be live on a machine. No direct restriction will be placed on the number of client calls, excepting that each client connection can support a maximum of four concurrent calls. Note that, for a number of reasons, we don't want to simply discard a client connection as soon as the last call is apparently finished: (1) Security is negotiated per-connection and the context is then shared between all calls on that connection. The context can be negotiated again if the connection lapses, but that involves holding up calls whilst at least two packets are exchanged and various crypto bits are performed - so we'd ideally like to cache it for a little while at least. (2) If a packet goes astray, we will need to retransmit a final ACK or ABORT packet. To make this work, we need to keep around the connection details for a little while. (3) The locally held structures represent some amount of setup time, to be weighed against their occupation of memory when idle. To this end, the client connection cache is managed by a state machine on each connection. There are five states: (1) INACTIVE - The connection is not held in any list and may not have been exposed to the world. If it has been previously exposed, it was discarded from the idle list after expiring. (2) WAITING - The connection is waiting for the number of client conns to drop below the maximum capacity. Calls may be in progress upon it from when it was active and got culled. The connection is on the rxrpc_waiting_client_conns list which is kept in to-be-granted order. Culled conns with waiters go to the back of the queue just like new conns. (3) ACTIVE - The connection has at least one call in progress upon it, it may freely grant available channels to new calls and calls may be waiting on it for channels to become available. The connection is on the rxrpc_active_client_conns list which is kept in activation order for culling purposes. (4) CULLED - The connection got summarily culled to try and free up capacity. Calls currently in progress on the connection are allowed to continue, but new calls will have to wait. There can be no waiters in this state - the conn would have to go to the WAITING state instead. (5) IDLE - The connection has no calls in progress upon it and must have been exposed to the world (ie. the EXPOSED flag must be set). When it expires, the EXPOSED flag is cleared and the connection transitions to the INACTIVE state. The connection is on the rxrpc_idle_client_conns list which is kept in order of how soon they'll expire. A connection in the ACTIVE or CULLED state must have at least one active call upon it; if in the WAITING state it may have active calls upon it; other states may not have active calls. As long as a connection remains active and doesn't get culled, it may continue to process calls - even if there are connections on the wait queue. This simplifies things a bit and reduces the amount of checking we need do. There are a couple flags of relevance to the cache: (1) EXPOSED - The connection ID got exposed to the world. If this flag is set, an extra ref is added to the connection preventing it from being reaped when it has no calls outstanding. This flag is cleared and the ref dropped when a conn is discarded from the idle list. (2) DONT_REUSE - The connection should be discarded as soon as possible and should not be reused. This commit also provides a number of new settings: (*) /proc/net/rxrpc/max_client_conns The maximum number of live client connections. Above this number, new connections get added to the wait list and must wait for an active conn to be culled. Culled connections can be reused, but they will go to the back of the wait list and have to wait. (*) /proc/net/rxrpc/reap_client_conns If the number of desired connections exceeds the maximum above, the active connection list will be culled until there are only this many left in it. (*) /proc/net/rxrpc/idle_conn_expiry The normal expiry time for a client connection, provided there are fewer than reap_client_conns of them around. (*) /proc/net/rxrpc/idle_conn_fast_expiry The expedited expiry time, used when there are more than reap_client_conns of them around. Note that I combined the Tx wait queue with the channel grant wait queue to save space as only one of these should be in use at once. Note also that, for the moment, the service connection cache still uses the old connection management code. Signed-off-by: David Howells <dhowells@redhat.com>
2016-08-24 00:30:52 -06:00
void rxrpc_expose_client_call(struct rxrpc_call *);
void rxrpc_disconnect_client_call(struct rxrpc_call *);
void rxrpc_put_client_conn(struct rxrpc_connection *);
void rxrpc_discard_expired_client_conns(struct work_struct *);
void rxrpc_destroy_all_client_connections(struct rxrpc_net *);
2016-04-04 07:00:37 -06:00
/*
* conn_event.c
*/
void rxrpc_process_connection(struct work_struct *);
/*
* conn_object.c
*/
extern unsigned int rxrpc_connection_expiry;
extern unsigned int rxrpc_closed_conn_expiry;
struct rxrpc_connection *rxrpc_alloc_connection(gfp_t);
struct rxrpc_connection *rxrpc_find_connection_rcu(struct rxrpc_local *,
struct sk_buff *);
rxrpc: Improve management and caching of client connection objects Improve the management and caching of client rxrpc connection objects. From this point, client connections will be managed separately from service connections because AF_RXRPC controls the creation and re-use of client connections but doesn't have that luxury with service connections. Further, there will be limits on the numbers of client connections that may be live on a machine. No direct restriction will be placed on the number of client calls, excepting that each client connection can support a maximum of four concurrent calls. Note that, for a number of reasons, we don't want to simply discard a client connection as soon as the last call is apparently finished: (1) Security is negotiated per-connection and the context is then shared between all calls on that connection. The context can be negotiated again if the connection lapses, but that involves holding up calls whilst at least two packets are exchanged and various crypto bits are performed - so we'd ideally like to cache it for a little while at least. (2) If a packet goes astray, we will need to retransmit a final ACK or ABORT packet. To make this work, we need to keep around the connection details for a little while. (3) The locally held structures represent some amount of setup time, to be weighed against their occupation of memory when idle. To this end, the client connection cache is managed by a state machine on each connection. There are five states: (1) INACTIVE - The connection is not held in any list and may not have been exposed to the world. If it has been previously exposed, it was discarded from the idle list after expiring. (2) WAITING - The connection is waiting for the number of client conns to drop below the maximum capacity. Calls may be in progress upon it from when it was active and got culled. The connection is on the rxrpc_waiting_client_conns list which is kept in to-be-granted order. Culled conns with waiters go to the back of the queue just like new conns. (3) ACTIVE - The connection has at least one call in progress upon it, it may freely grant available channels to new calls and calls may be waiting on it for channels to become available. The connection is on the rxrpc_active_client_conns list which is kept in activation order for culling purposes. (4) CULLED - The connection got summarily culled to try and free up capacity. Calls currently in progress on the connection are allowed to continue, but new calls will have to wait. There can be no waiters in this state - the conn would have to go to the WAITING state instead. (5) IDLE - The connection has no calls in progress upon it and must have been exposed to the world (ie. the EXPOSED flag must be set). When it expires, the EXPOSED flag is cleared and the connection transitions to the INACTIVE state. The connection is on the rxrpc_idle_client_conns list which is kept in order of how soon they'll expire. A connection in the ACTIVE or CULLED state must have at least one active call upon it; if in the WAITING state it may have active calls upon it; other states may not have active calls. As long as a connection remains active and doesn't get culled, it may continue to process calls - even if there are connections on the wait queue. This simplifies things a bit and reduces the amount of checking we need do. There are a couple flags of relevance to the cache: (1) EXPOSED - The connection ID got exposed to the world. If this flag is set, an extra ref is added to the connection preventing it from being reaped when it has no calls outstanding. This flag is cleared and the ref dropped when a conn is discarded from the idle list. (2) DONT_REUSE - The connection should be discarded as soon as possible and should not be reused. This commit also provides a number of new settings: (*) /proc/net/rxrpc/max_client_conns The maximum number of live client connections. Above this number, new connections get added to the wait list and must wait for an active conn to be culled. Culled connections can be reused, but they will go to the back of the wait list and have to wait. (*) /proc/net/rxrpc/reap_client_conns If the number of desired connections exceeds the maximum above, the active connection list will be culled until there are only this many left in it. (*) /proc/net/rxrpc/idle_conn_expiry The normal expiry time for a client connection, provided there are fewer than reap_client_conns of them around. (*) /proc/net/rxrpc/idle_conn_fast_expiry The expedited expiry time, used when there are more than reap_client_conns of them around. Note that I combined the Tx wait queue with the channel grant wait queue to save space as only one of these should be in use at once. Note also that, for the moment, the service connection cache still uses the old connection management code. Signed-off-by: David Howells <dhowells@redhat.com>
2016-08-24 00:30:52 -06:00
void __rxrpc_disconnect_call(struct rxrpc_connection *, struct rxrpc_call *);
void rxrpc_disconnect_call(struct rxrpc_call *);
rxrpc: Improve management and caching of client connection objects Improve the management and caching of client rxrpc connection objects. From this point, client connections will be managed separately from service connections because AF_RXRPC controls the creation and re-use of client connections but doesn't have that luxury with service connections. Further, there will be limits on the numbers of client connections that may be live on a machine. No direct restriction will be placed on the number of client calls, excepting that each client connection can support a maximum of four concurrent calls. Note that, for a number of reasons, we don't want to simply discard a client connection as soon as the last call is apparently finished: (1) Security is negotiated per-connection and the context is then shared between all calls on that connection. The context can be negotiated again if the connection lapses, but that involves holding up calls whilst at least two packets are exchanged and various crypto bits are performed - so we'd ideally like to cache it for a little while at least. (2) If a packet goes astray, we will need to retransmit a final ACK or ABORT packet. To make this work, we need to keep around the connection details for a little while. (3) The locally held structures represent some amount of setup time, to be weighed against their occupation of memory when idle. To this end, the client connection cache is managed by a state machine on each connection. There are five states: (1) INACTIVE - The connection is not held in any list and may not have been exposed to the world. If it has been previously exposed, it was discarded from the idle list after expiring. (2) WAITING - The connection is waiting for the number of client conns to drop below the maximum capacity. Calls may be in progress upon it from when it was active and got culled. The connection is on the rxrpc_waiting_client_conns list which is kept in to-be-granted order. Culled conns with waiters go to the back of the queue just like new conns. (3) ACTIVE - The connection has at least one call in progress upon it, it may freely grant available channels to new calls and calls may be waiting on it for channels to become available. The connection is on the rxrpc_active_client_conns list which is kept in activation order for culling purposes. (4) CULLED - The connection got summarily culled to try and free up capacity. Calls currently in progress on the connection are allowed to continue, but new calls will have to wait. There can be no waiters in this state - the conn would have to go to the WAITING state instead. (5) IDLE - The connection has no calls in progress upon it and must have been exposed to the world (ie. the EXPOSED flag must be set). When it expires, the EXPOSED flag is cleared and the connection transitions to the INACTIVE state. The connection is on the rxrpc_idle_client_conns list which is kept in order of how soon they'll expire. A connection in the ACTIVE or CULLED state must have at least one active call upon it; if in the WAITING state it may have active calls upon it; other states may not have active calls. As long as a connection remains active and doesn't get culled, it may continue to process calls - even if there are connections on the wait queue. This simplifies things a bit and reduces the amount of checking we need do. There are a couple flags of relevance to the cache: (1) EXPOSED - The connection ID got exposed to the world. If this flag is set, an extra ref is added to the connection preventing it from being reaped when it has no calls outstanding. This flag is cleared and the ref dropped when a conn is discarded from the idle list. (2) DONT_REUSE - The connection should be discarded as soon as possible and should not be reused. This commit also provides a number of new settings: (*) /proc/net/rxrpc/max_client_conns The maximum number of live client connections. Above this number, new connections get added to the wait list and must wait for an active conn to be culled. Culled connections can be reused, but they will go to the back of the wait list and have to wait. (*) /proc/net/rxrpc/reap_client_conns If the number of desired connections exceeds the maximum above, the active connection list will be culled until there are only this many left in it. (*) /proc/net/rxrpc/idle_conn_expiry The normal expiry time for a client connection, provided there are fewer than reap_client_conns of them around. (*) /proc/net/rxrpc/idle_conn_fast_expiry The expedited expiry time, used when there are more than reap_client_conns of them around. Note that I combined the Tx wait queue with the channel grant wait queue to save space as only one of these should be in use at once. Note also that, for the moment, the service connection cache still uses the old connection management code. Signed-off-by: David Howells <dhowells@redhat.com>
2016-08-24 00:30:52 -06:00
void rxrpc_kill_connection(struct rxrpc_connection *);
bool rxrpc_queue_conn(struct rxrpc_connection *);
void rxrpc_see_connection(struct rxrpc_connection *);
void rxrpc_get_connection(struct rxrpc_connection *);
struct rxrpc_connection *rxrpc_get_connection_maybe(struct rxrpc_connection *);
void rxrpc_put_service_conn(struct rxrpc_connection *);
void rxrpc_service_connection_reaper(struct work_struct *);
void rxrpc_destroy_all_connections(struct rxrpc_net *);
static inline bool rxrpc_conn_is_client(const struct rxrpc_connection *conn)
{
return conn->out_clientflag;
}
static inline bool rxrpc_conn_is_service(const struct rxrpc_connection *conn)
{
return !rxrpc_conn_is_client(conn);
}
static inline void rxrpc_put_connection(struct rxrpc_connection *conn)
{
rxrpc: Improve management and caching of client connection objects Improve the management and caching of client rxrpc connection objects. From this point, client connections will be managed separately from service connections because AF_RXRPC controls the creation and re-use of client connections but doesn't have that luxury with service connections. Further, there will be limits on the numbers of client connections that may be live on a machine. No direct restriction will be placed on the number of client calls, excepting that each client connection can support a maximum of four concurrent calls. Note that, for a number of reasons, we don't want to simply discard a client connection as soon as the last call is apparently finished: (1) Security is negotiated per-connection and the context is then shared between all calls on that connection. The context can be negotiated again if the connection lapses, but that involves holding up calls whilst at least two packets are exchanged and various crypto bits are performed - so we'd ideally like to cache it for a little while at least. (2) If a packet goes astray, we will need to retransmit a final ACK or ABORT packet. To make this work, we need to keep around the connection details for a little while. (3) The locally held structures represent some amount of setup time, to be weighed against their occupation of memory when idle. To this end, the client connection cache is managed by a state machine on each connection. There are five states: (1) INACTIVE - The connection is not held in any list and may not have been exposed to the world. If it has been previously exposed, it was discarded from the idle list after expiring. (2) WAITING - The connection is waiting for the number of client conns to drop below the maximum capacity. Calls may be in progress upon it from when it was active and got culled. The connection is on the rxrpc_waiting_client_conns list which is kept in to-be-granted order. Culled conns with waiters go to the back of the queue just like new conns. (3) ACTIVE - The connection has at least one call in progress upon it, it may freely grant available channels to new calls and calls may be waiting on it for channels to become available. The connection is on the rxrpc_active_client_conns list which is kept in activation order for culling purposes. (4) CULLED - The connection got summarily culled to try and free up capacity. Calls currently in progress on the connection are allowed to continue, but new calls will have to wait. There can be no waiters in this state - the conn would have to go to the WAITING state instead. (5) IDLE - The connection has no calls in progress upon it and must have been exposed to the world (ie. the EXPOSED flag must be set). When it expires, the EXPOSED flag is cleared and the connection transitions to the INACTIVE state. The connection is on the rxrpc_idle_client_conns list which is kept in order of how soon they'll expire. A connection in the ACTIVE or CULLED state must have at least one active call upon it; if in the WAITING state it may have active calls upon it; other states may not have active calls. As long as a connection remains active and doesn't get culled, it may continue to process calls - even if there are connections on the wait queue. This simplifies things a bit and reduces the amount of checking we need do. There are a couple flags of relevance to the cache: (1) EXPOSED - The connection ID got exposed to the world. If this flag is set, an extra ref is added to the connection preventing it from being reaped when it has no calls outstanding. This flag is cleared and the ref dropped when a conn is discarded from the idle list. (2) DONT_REUSE - The connection should be discarded as soon as possible and should not be reused. This commit also provides a number of new settings: (*) /proc/net/rxrpc/max_client_conns The maximum number of live client connections. Above this number, new connections get added to the wait list and must wait for an active conn to be culled. Culled connections can be reused, but they will go to the back of the wait list and have to wait. (*) /proc/net/rxrpc/reap_client_conns If the number of desired connections exceeds the maximum above, the active connection list will be culled until there are only this many left in it. (*) /proc/net/rxrpc/idle_conn_expiry The normal expiry time for a client connection, provided there are fewer than reap_client_conns of them around. (*) /proc/net/rxrpc/idle_conn_fast_expiry The expedited expiry time, used when there are more than reap_client_conns of them around. Note that I combined the Tx wait queue with the channel grant wait queue to save space as only one of these should be in use at once. Note also that, for the moment, the service connection cache still uses the old connection management code. Signed-off-by: David Howells <dhowells@redhat.com>
2016-08-24 00:30:52 -06:00
if (!conn)
return;
if (rxrpc_conn_is_client(conn))
rxrpc_put_client_conn(conn);
else
rxrpc_put_service_conn(conn);
}
/*
* conn_service.c
*/
struct rxrpc_connection *rxrpc_find_service_conn_rcu(struct rxrpc_peer *,
struct sk_buff *);
struct rxrpc_connection *rxrpc_prealloc_service_connection(struct rxrpc_net *, gfp_t);
rxrpc: Implement service upgrade Implement AuriStor's service upgrade facility. There are three problems that this is meant to deal with: (1) Various of the standard AFS RPC calls have IPv4 addresses in their requests and/or replies - but there's no room for including IPv6 addresses. (2) Definition of IPv6-specific RPC operations in the standard operation sets has not yet been achieved. (3) One could envision the creation a new service on the same port that as the original service. The new service could implement improved operations - and the client could try this first, falling back to the original service if it's not there. Unfortunately, certain servers ignore packets addressed to a service they don't implement and don't respond in any way - not even with an ABORT. This means that the client must then wait for the call timeout to occur. What service upgrade does is to see if the connection is marked as being 'upgradeable' and if so, change the service ID in the server and thus the request and reply formats. Note that the upgrade isn't mandatory - a server that supports only the original call set will ignore the upgrade request. In the protocol, the procedure is then as follows: (1) To request an upgrade, the first DATA packet in a new connection must have the userStatus set to 1 (this is normally 0). The userStatus value is normally ignored by the server. (2) If the server doesn't support upgrading, the reply packets will contain the same service ID as for the first request packet. (3) If the server does support upgrading, all future reply packets on that connection will contain the new service ID and the new service ID will be applied to *all* further calls on that connection as well. (4) The RPC op used to probe the upgrade must take the same request data as the shadow call in the upgrade set (but may return a different reply). GetCapability RPC ops were added to all standard sets for just this purpose. Ops where the request formats differ cannot be used for probing. (5) The client must wait for completion of the probe before sending any further RPC ops to the same destination. It should then use the service ID that recvmsg() reported back in all future calls. (6) The shadow service must have call definitions for all the operation IDs defined by the original service. To support service upgrading, a server should: (1) Call bind() twice on its AF_RXRPC socket before calling listen(). Each bind() should supply a different service ID, but the transport addresses must be the same. This allows the server to receive requests with either service ID. (2) Enable automatic upgrading by calling setsockopt(), specifying RXRPC_UPGRADEABLE_SERVICE and passing in a two-member array of unsigned shorts as the argument: unsigned short optval[2]; This specifies a pair of service IDs. They must be different and must match the service IDs bound to the socket. Member 0 is the service ID to upgrade from and member 1 is the service ID to upgrade to. Signed-off-by: David Howells <dhowells@redhat.com>
2017-06-05 07:30:49 -06:00
void rxrpc_new_incoming_connection(struct rxrpc_sock *,
struct rxrpc_connection *, struct sk_buff *);
rxrpc: Maintain an extra ref on a conn for the cache list Overhaul the usage count accounting for the rxrpc_connection struct to make it easier to implement RCU access from the data_ready handler. The problem is that currently we're using a lock to prevent the garbage collector from trying to clean up a connection that we're contemplating unidling. We could just stick incoming packets on the connection we find, but we've then got a problem that we may race when dispatching a work item to process it as we need to give that a ref to prevent the rxrpc_connection struct from disappearing in the meantime. Further, incoming packets may get discarded if attached to an rxrpc_connection struct that is going away. Whilst this is not a total disaster - the client will presumably resend - it would delay processing of the call. This would affect the AFS client filesystem's service manager operation. To this end: (1) We now maintain an extra count on the connection usage count whilst it is on the connection list. This mean it is not in use when its refcount is 1. (2) When trying to reuse an old connection, we only increment the refcount if it is greater than 0. If it is 0, we replace it in the tree with a new candidate connection. (3) Two connection flags are added to indicate whether or not a connection is in the local's client connection tree (used by sendmsg) or the peer's service connection tree (used by data_ready). This makes sure that we don't try and remove a connection if it got replaced. The flags are tested under lock with the removal operation to prevent the reaper from killing the rxrpc_connection struct whilst someone else is trying to effect a replacement. This could probably be alleviated by using memory barriers between the flag set/test and the rb_tree ops. The rb_tree op would still need to be under the lock, however. (4) When trying to reap an old connection, we try to flip the usage count from 1 to 0. If it's not 1 at that point, then it must've come back to life temporarily and we ignore it. Signed-off-by: David Howells <dhowells@redhat.com>
2016-06-30 03:45:22 -06:00
void rxrpc_unpublish_service_conn(struct rxrpc_connection *);
/*
* input.c
*/
void rxrpc_data_ready(struct sock *);
/*
* insecure.c
*/
extern const struct rxrpc_security rxrpc_no_security;
/*
* key.c
*/
extern struct key_type key_type_rxrpc;
extern struct key_type key_type_rxrpc_s;
int rxrpc_request_key(struct rxrpc_sock *, char __user *, int);
int rxrpc_server_keyring(struct rxrpc_sock *, char __user *, int);
int rxrpc_get_server_data_key(struct rxrpc_connection *, const void *, time64_t,
u32);
/*
* local_event.c
*/
rxrpc: Rework local endpoint management Rework the local RxRPC endpoint management. Local endpoint objects are maintained in a flat list as before. This should be okay as there shouldn't be more than one per open AF_RXRPC socket (there can be fewer as local endpoints can be shared if their local service ID is 0 and they share the same local transport parameters). Changes: (1) Local endpoints may now only be shared if they have local service ID 0 (ie. they're not being used for listening). This prevents a scenario where process A is listening of the Cache Manager port and process B contacts a fileserver - which may then attempt to send CM requests back to B. But if A and B are sharing a local endpoint, A will get the CM requests meant for B. (2) We use a mutex to handle lookups and don't provide RCU-only lookups since we only expect to access the list when opening a socket or destroying an endpoint. The local endpoint object is pointed to by the transport socket's sk_user_data for the life of the transport socket - allowing us to refer to it directly from the sk_data_ready and sk_error_report callbacks. (3) atomic_inc_not_zero() now exists and can be used to only share a local endpoint if the last reference hasn't yet gone. (4) We can remove rxrpc_local_lock - a spinlock that had to be taken with BH processing disabled given that we assume sk_user_data won't change under us. (5) The transport socket is shut down before we clear the sk_user_data pointer so that we can be sure that the transport socket's callbacks won't be invoked once the RCU destruction is scheduled. (6) Local endpoints have a work item that handles both destruction and event processing. The means that destruction doesn't then need to wait for event processing. The event queues can then be cleared after the transport socket is shut down. (7) Local endpoints are no longer available for resurrection beyond the life of the sockets that had them open. As soon as their last ref goes, they are scheduled for destruction and may not have their usage count moved from 0. Signed-off-by: David Howells <dhowells@redhat.com>
2016-04-04 07:00:35 -06:00
extern void rxrpc_process_local_events(struct rxrpc_local *);
/*
* local_object.c
*/
struct rxrpc_local *rxrpc_lookup_local(struct net *, const struct sockaddr_rxrpc *);
rxrpc: Rework local endpoint management Rework the local RxRPC endpoint management. Local endpoint objects are maintained in a flat list as before. This should be okay as there shouldn't be more than one per open AF_RXRPC socket (there can be fewer as local endpoints can be shared if their local service ID is 0 and they share the same local transport parameters). Changes: (1) Local endpoints may now only be shared if they have local service ID 0 (ie. they're not being used for listening). This prevents a scenario where process A is listening of the Cache Manager port and process B contacts a fileserver - which may then attempt to send CM requests back to B. But if A and B are sharing a local endpoint, A will get the CM requests meant for B. (2) We use a mutex to handle lookups and don't provide RCU-only lookups since we only expect to access the list when opening a socket or destroying an endpoint. The local endpoint object is pointed to by the transport socket's sk_user_data for the life of the transport socket - allowing us to refer to it directly from the sk_data_ready and sk_error_report callbacks. (3) atomic_inc_not_zero() now exists and can be used to only share a local endpoint if the last reference hasn't yet gone. (4) We can remove rxrpc_local_lock - a spinlock that had to be taken with BH processing disabled given that we assume sk_user_data won't change under us. (5) The transport socket is shut down before we clear the sk_user_data pointer so that we can be sure that the transport socket's callbacks won't be invoked once the RCU destruction is scheduled. (6) Local endpoints have a work item that handles both destruction and event processing. The means that destruction doesn't then need to wait for event processing. The event queues can then be cleared after the transport socket is shut down. (7) Local endpoints are no longer available for resurrection beyond the life of the sockets that had them open. As soon as their last ref goes, they are scheduled for destruction and may not have their usage count moved from 0. Signed-off-by: David Howells <dhowells@redhat.com>
2016-04-04 07:00:35 -06:00
void __rxrpc_put_local(struct rxrpc_local *);
void rxrpc_destroy_all_locals(struct rxrpc_net *);
rxrpc: Rework local endpoint management Rework the local RxRPC endpoint management. Local endpoint objects are maintained in a flat list as before. This should be okay as there shouldn't be more than one per open AF_RXRPC socket (there can be fewer as local endpoints can be shared if their local service ID is 0 and they share the same local transport parameters). Changes: (1) Local endpoints may now only be shared if they have local service ID 0 (ie. they're not being used for listening). This prevents a scenario where process A is listening of the Cache Manager port and process B contacts a fileserver - which may then attempt to send CM requests back to B. But if A and B are sharing a local endpoint, A will get the CM requests meant for B. (2) We use a mutex to handle lookups and don't provide RCU-only lookups since we only expect to access the list when opening a socket or destroying an endpoint. The local endpoint object is pointed to by the transport socket's sk_user_data for the life of the transport socket - allowing us to refer to it directly from the sk_data_ready and sk_error_report callbacks. (3) atomic_inc_not_zero() now exists and can be used to only share a local endpoint if the last reference hasn't yet gone. (4) We can remove rxrpc_local_lock - a spinlock that had to be taken with BH processing disabled given that we assume sk_user_data won't change under us. (5) The transport socket is shut down before we clear the sk_user_data pointer so that we can be sure that the transport socket's callbacks won't be invoked once the RCU destruction is scheduled. (6) Local endpoints have a work item that handles both destruction and event processing. The means that destruction doesn't then need to wait for event processing. The event queues can then be cleared after the transport socket is shut down. (7) Local endpoints are no longer available for resurrection beyond the life of the sockets that had them open. As soon as their last ref goes, they are scheduled for destruction and may not have their usage count moved from 0. Signed-off-by: David Howells <dhowells@redhat.com>
2016-04-04 07:00:35 -06:00
static inline void rxrpc_get_local(struct rxrpc_local *local)
{
atomic_inc(&local->usage);
}
static inline
struct rxrpc_local *rxrpc_get_local_maybe(struct rxrpc_local *local)
{
return atomic_inc_not_zero(&local->usage) ? local : NULL;
}
static inline void rxrpc_put_local(struct rxrpc_local *local)
{
if (local && atomic_dec_and_test(&local->usage))
rxrpc: Rework local endpoint management Rework the local RxRPC endpoint management. Local endpoint objects are maintained in a flat list as before. This should be okay as there shouldn't be more than one per open AF_RXRPC socket (there can be fewer as local endpoints can be shared if their local service ID is 0 and they share the same local transport parameters). Changes: (1) Local endpoints may now only be shared if they have local service ID 0 (ie. they're not being used for listening). This prevents a scenario where process A is listening of the Cache Manager port and process B contacts a fileserver - which may then attempt to send CM requests back to B. But if A and B are sharing a local endpoint, A will get the CM requests meant for B. (2) We use a mutex to handle lookups and don't provide RCU-only lookups since we only expect to access the list when opening a socket or destroying an endpoint. The local endpoint object is pointed to by the transport socket's sk_user_data for the life of the transport socket - allowing us to refer to it directly from the sk_data_ready and sk_error_report callbacks. (3) atomic_inc_not_zero() now exists and can be used to only share a local endpoint if the last reference hasn't yet gone. (4) We can remove rxrpc_local_lock - a spinlock that had to be taken with BH processing disabled given that we assume sk_user_data won't change under us. (5) The transport socket is shut down before we clear the sk_user_data pointer so that we can be sure that the transport socket's callbacks won't be invoked once the RCU destruction is scheduled. (6) Local endpoints have a work item that handles both destruction and event processing. The means that destruction doesn't then need to wait for event processing. The event queues can then be cleared after the transport socket is shut down. (7) Local endpoints are no longer available for resurrection beyond the life of the sockets that had them open. As soon as their last ref goes, they are scheduled for destruction and may not have their usage count moved from 0. Signed-off-by: David Howells <dhowells@redhat.com>
2016-04-04 07:00:35 -06:00
__rxrpc_put_local(local);
}
static inline void rxrpc_queue_local(struct rxrpc_local *local)
{
rxrpc_queue_work(&local->processor);
}
/*
* misc.c
*/
extern unsigned int rxrpc_max_backlog __read_mostly;
extern unsigned int rxrpc_requested_ack_delay;
extern unsigned int rxrpc_soft_ack_delay;
extern unsigned int rxrpc_idle_ack_delay;
extern unsigned int rxrpc_rx_window_size;
extern unsigned int rxrpc_rx_mtu;
extern unsigned int rxrpc_rx_jumbo_max;
extern unsigned int rxrpc_resend_timeout;
extern const s8 rxrpc_ack_priority[];
/*
* net_ns.c
*/
extern unsigned int rxrpc_net_id;
extern struct pernet_operations rxrpc_net_ops;
static inline struct rxrpc_net *rxrpc_net(struct net *net)
{
return net_generic(net, rxrpc_net_id);
}
/*
* output.c
*/
int rxrpc_send_ack_packet(struct rxrpc_call *, bool);
int rxrpc_send_abort_packet(struct rxrpc_call *);
int rxrpc_send_data_packet(struct rxrpc_call *, struct sk_buff *, bool);
rxrpc: Rewrite the data and ack handling code Rewrite the data and ack handling code such that: (1) Parsing of received ACK and ABORT packets and the distribution and the filing of DATA packets happens entirely within the data_ready context called from the UDP socket. This allows us to process and discard ACK and ABORT packets much more quickly (they're no longer stashed on a queue for a background thread to process). (2) We avoid calling skb_clone(), pskb_pull() and pskb_trim(). We instead keep track of the offset and length of the content of each packet in the sk_buff metadata. This means we don't do any allocation in the receive path. (3) Jumbo DATA packet parsing is now done in data_ready context. Rather than cloning the packet once for each subpacket and pulling/trimming it, we file the packet multiple times with an annotation for each indicating which subpacket is there. From that we can directly calculate the offset and length. (4) A call's receive queue can be accessed without taking locks (memory barriers do have to be used, though). (5) Incoming calls are set up from preallocated resources and immediately made live. They can than have packets queued upon them and ACKs generated. If insufficient resources exist, DATA packet #1 is given a BUSY reply and other DATA packets are discarded). (6) sk_buffs no longer take a ref on their parent call. To make this work, the following changes are made: (1) Each call's receive buffer is now a circular buffer of sk_buff pointers (rxtx_buffer) rather than a number of sk_buff_heads spread between the call and the socket. This permits each sk_buff to be in the buffer multiple times. The receive buffer is reused for the transmit buffer. (2) A circular buffer of annotations (rxtx_annotations) is kept parallel to the data buffer. Transmission phase annotations indicate whether a buffered packet has been ACK'd or not and whether it needs retransmission. Receive phase annotations indicate whether a slot holds a whole packet or a jumbo subpacket and, if the latter, which subpacket. They also note whether the packet has been decrypted in place. (3) DATA packet window tracking is much simplified. Each phase has just two numbers representing the window (rx_hard_ack/rx_top and tx_hard_ack/tx_top). The hard_ack number is the sequence number before base of the window, representing the last packet the other side says it has consumed. hard_ack starts from 0 and the first packet is sequence number 1. The top number is the sequence number of the highest-numbered packet residing in the buffer. Packets between hard_ack+1 and top are soft-ACK'd to indicate they've been received, but not yet consumed. Four macros, before(), before_eq(), after() and after_eq() are added to compare sequence numbers within the window. This allows for the top of the window to wrap when the hard-ack sequence number gets close to the limit. Two flags, RXRPC_CALL_RX_LAST and RXRPC_CALL_TX_LAST, are added also to indicate when rx_top and tx_top point at the packets with the LAST_PACKET bit set, indicating the end of the phase. (4) Calls are queued on the socket 'receive queue' rather than packets. This means that we don't need have to invent dummy packets to queue to indicate abnormal/terminal states and we don't have to keep metadata packets (such as ABORTs) around (5) The offset and length of a (sub)packet's content are now passed to the verify_packet security op. This is currently expected to decrypt the packet in place and validate it. However, there's now nowhere to store the revised offset and length of the actual data within the decrypted blob (there may be a header and padding to skip) because an sk_buff may represent multiple packets, so a locate_data security op is added to retrieve these details from the sk_buff content when needed. (6) recvmsg() now has to handle jumbo subpackets, where each subpacket is individually secured and needs to be individually decrypted. The code to do this is broken out into rxrpc_recvmsg_data() and shared with the kernel API. It now iterates over the call's receive buffer rather than walking the socket receive queue. Additional changes: (1) The timers are condensed to a single timer that is set for the soonest of three timeouts (delayed ACK generation, DATA retransmission and call lifespan). (2) Transmission of ACK and ABORT packets is effected immediately from process-context socket ops/kernel API calls that cause them instead of them being punted off to a background work item. The data_ready handler still has to defer to the background, though. (3) A shutdown op is added to the AF_RXRPC socket so that the AFS filesystem can shut down the socket and flush its own work items before closing the socket to deal with any in-progress service calls. Future additional changes that will need to be considered: (1) Make sure that a call doesn't hog the front of the queue by receiving data from the network as fast as userspace is consuming it to the exclusion of other calls. (2) Transmit delayed ACKs from within recvmsg() when we've consumed sufficiently more packets to avoid the background work item needing to run. Signed-off-by: David Howells <dhowells@redhat.com>
2016-09-08 04:10:12 -06:00
void rxrpc_reject_packets(struct rxrpc_local *);
/*
* peer_event.c
*/
void rxrpc_error_report(struct sock *);
void rxrpc_peer_error_distributor(struct work_struct *);
void rxrpc_peer_add_rtt(struct rxrpc_call *, enum rxrpc_rtt_rx_trace,
rxrpc_serial_t, rxrpc_serial_t, ktime_t, ktime_t);
/*
* peer_object.c
*/
struct rxrpc_peer *rxrpc_lookup_peer_rcu(struct rxrpc_local *,
const struct sockaddr_rxrpc *);
struct rxrpc_peer *rxrpc_lookup_peer(struct rxrpc_local *,
struct sockaddr_rxrpc *, gfp_t);
struct rxrpc_peer *rxrpc_alloc_peer(struct rxrpc_local *, gfp_t);
rxrpc: Rewrite the data and ack handling code Rewrite the data and ack handling code such that: (1) Parsing of received ACK and ABORT packets and the distribution and the filing of DATA packets happens entirely within the data_ready context called from the UDP socket. This allows us to process and discard ACK and ABORT packets much more quickly (they're no longer stashed on a queue for a background thread to process). (2) We avoid calling skb_clone(), pskb_pull() and pskb_trim(). We instead keep track of the offset and length of the content of each packet in the sk_buff metadata. This means we don't do any allocation in the receive path. (3) Jumbo DATA packet parsing is now done in data_ready context. Rather than cloning the packet once for each subpacket and pulling/trimming it, we file the packet multiple times with an annotation for each indicating which subpacket is there. From that we can directly calculate the offset and length. (4) A call's receive queue can be accessed without taking locks (memory barriers do have to be used, though). (5) Incoming calls are set up from preallocated resources and immediately made live. They can than have packets queued upon them and ACKs generated. If insufficient resources exist, DATA packet #1 is given a BUSY reply and other DATA packets are discarded). (6) sk_buffs no longer take a ref on their parent call. To make this work, the following changes are made: (1) Each call's receive buffer is now a circular buffer of sk_buff pointers (rxtx_buffer) rather than a number of sk_buff_heads spread between the call and the socket. This permits each sk_buff to be in the buffer multiple times. The receive buffer is reused for the transmit buffer. (2) A circular buffer of annotations (rxtx_annotations) is kept parallel to the data buffer. Transmission phase annotations indicate whether a buffered packet has been ACK'd or not and whether it needs retransmission. Receive phase annotations indicate whether a slot holds a whole packet or a jumbo subpacket and, if the latter, which subpacket. They also note whether the packet has been decrypted in place. (3) DATA packet window tracking is much simplified. Each phase has just two numbers representing the window (rx_hard_ack/rx_top and tx_hard_ack/tx_top). The hard_ack number is the sequence number before base of the window, representing the last packet the other side says it has consumed. hard_ack starts from 0 and the first packet is sequence number 1. The top number is the sequence number of the highest-numbered packet residing in the buffer. Packets between hard_ack+1 and top are soft-ACK'd to indicate they've been received, but not yet consumed. Four macros, before(), before_eq(), after() and after_eq() are added to compare sequence numbers within the window. This allows for the top of the window to wrap when the hard-ack sequence number gets close to the limit. Two flags, RXRPC_CALL_RX_LAST and RXRPC_CALL_TX_LAST, are added also to indicate when rx_top and tx_top point at the packets with the LAST_PACKET bit set, indicating the end of the phase. (4) Calls are queued on the socket 'receive queue' rather than packets. This means that we don't need have to invent dummy packets to queue to indicate abnormal/terminal states and we don't have to keep metadata packets (such as ABORTs) around (5) The offset and length of a (sub)packet's content are now passed to the verify_packet security op. This is currently expected to decrypt the packet in place and validate it. However, there's now nowhere to store the revised offset and length of the actual data within the decrypted blob (there may be a header and padding to skip) because an sk_buff may represent multiple packets, so a locate_data security op is added to retrieve these details from the sk_buff content when needed. (6) recvmsg() now has to handle jumbo subpackets, where each subpacket is individually secured and needs to be individually decrypted. The code to do this is broken out into rxrpc_recvmsg_data() and shared with the kernel API. It now iterates over the call's receive buffer rather than walking the socket receive queue. Additional changes: (1) The timers are condensed to a single timer that is set for the soonest of three timeouts (delayed ACK generation, DATA retransmission and call lifespan). (2) Transmission of ACK and ABORT packets is effected immediately from process-context socket ops/kernel API calls that cause them instead of them being punted off to a background work item. The data_ready handler still has to defer to the background, though. (3) A shutdown op is added to the AF_RXRPC socket so that the AFS filesystem can shut down the socket and flush its own work items before closing the socket to deal with any in-progress service calls. Future additional changes that will need to be considered: (1) Make sure that a call doesn't hog the front of the queue by receiving data from the network as fast as userspace is consuming it to the exclusion of other calls. (2) Transmit delayed ACKs from within recvmsg() when we've consumed sufficiently more packets to avoid the background work item needing to run. Signed-off-by: David Howells <dhowells@redhat.com>
2016-09-08 04:10:12 -06:00
struct rxrpc_peer *rxrpc_lookup_incoming_peer(struct rxrpc_local *,
struct rxrpc_peer *);
static inline struct rxrpc_peer *rxrpc_get_peer(struct rxrpc_peer *peer)
{
atomic_inc(&peer->usage);
return peer;
}
static inline
struct rxrpc_peer *rxrpc_get_peer_maybe(struct rxrpc_peer *peer)
{
return atomic_inc_not_zero(&peer->usage) ? peer : NULL;
}
extern void __rxrpc_put_peer(struct rxrpc_peer *peer);
static inline void rxrpc_put_peer(struct rxrpc_peer *peer)
{
if (peer && atomic_dec_and_test(&peer->usage))
__rxrpc_put_peer(peer);
}
/*
* proc.c
*/
extern const struct file_operations rxrpc_call_seq_fops;
extern const struct file_operations rxrpc_connection_seq_fops;
/*
* recvmsg.c
*/
rxrpc: Rewrite the data and ack handling code Rewrite the data and ack handling code such that: (1) Parsing of received ACK and ABORT packets and the distribution and the filing of DATA packets happens entirely within the data_ready context called from the UDP socket. This allows us to process and discard ACK and ABORT packets much more quickly (they're no longer stashed on a queue for a background thread to process). (2) We avoid calling skb_clone(), pskb_pull() and pskb_trim(). We instead keep track of the offset and length of the content of each packet in the sk_buff metadata. This means we don't do any allocation in the receive path. (3) Jumbo DATA packet parsing is now done in data_ready context. Rather than cloning the packet once for each subpacket and pulling/trimming it, we file the packet multiple times with an annotation for each indicating which subpacket is there. From that we can directly calculate the offset and length. (4) A call's receive queue can be accessed without taking locks (memory barriers do have to be used, though). (5) Incoming calls are set up from preallocated resources and immediately made live. They can than have packets queued upon them and ACKs generated. If insufficient resources exist, DATA packet #1 is given a BUSY reply and other DATA packets are discarded). (6) sk_buffs no longer take a ref on their parent call. To make this work, the following changes are made: (1) Each call's receive buffer is now a circular buffer of sk_buff pointers (rxtx_buffer) rather than a number of sk_buff_heads spread between the call and the socket. This permits each sk_buff to be in the buffer multiple times. The receive buffer is reused for the transmit buffer. (2) A circular buffer of annotations (rxtx_annotations) is kept parallel to the data buffer. Transmission phase annotations indicate whether a buffered packet has been ACK'd or not and whether it needs retransmission. Receive phase annotations indicate whether a slot holds a whole packet or a jumbo subpacket and, if the latter, which subpacket. They also note whether the packet has been decrypted in place. (3) DATA packet window tracking is much simplified. Each phase has just two numbers representing the window (rx_hard_ack/rx_top and tx_hard_ack/tx_top). The hard_ack number is the sequence number before base of the window, representing the last packet the other side says it has consumed. hard_ack starts from 0 and the first packet is sequence number 1. The top number is the sequence number of the highest-numbered packet residing in the buffer. Packets between hard_ack+1 and top are soft-ACK'd to indicate they've been received, but not yet consumed. Four macros, before(), before_eq(), after() and after_eq() are added to compare sequence numbers within the window. This allows for the top of the window to wrap when the hard-ack sequence number gets close to the limit. Two flags, RXRPC_CALL_RX_LAST and RXRPC_CALL_TX_LAST, are added also to indicate when rx_top and tx_top point at the packets with the LAST_PACKET bit set, indicating the end of the phase. (4) Calls are queued on the socket 'receive queue' rather than packets. This means that we don't need have to invent dummy packets to queue to indicate abnormal/terminal states and we don't have to keep metadata packets (such as ABORTs) around (5) The offset and length of a (sub)packet's content are now passed to the verify_packet security op. This is currently expected to decrypt the packet in place and validate it. However, there's now nowhere to store the revised offset and length of the actual data within the decrypted blob (there may be a header and padding to skip) because an sk_buff may represent multiple packets, so a locate_data security op is added to retrieve these details from the sk_buff content when needed. (6) recvmsg() now has to handle jumbo subpackets, where each subpacket is individually secured and needs to be individually decrypted. The code to do this is broken out into rxrpc_recvmsg_data() and shared with the kernel API. It now iterates over the call's receive buffer rather than walking the socket receive queue. Additional changes: (1) The timers are condensed to a single timer that is set for the soonest of three timeouts (delayed ACK generation, DATA retransmission and call lifespan). (2) Transmission of ACK and ABORT packets is effected immediately from process-context socket ops/kernel API calls that cause them instead of them being punted off to a background work item. The data_ready handler still has to defer to the background, though. (3) A shutdown op is added to the AF_RXRPC socket so that the AFS filesystem can shut down the socket and flush its own work items before closing the socket to deal with any in-progress service calls. Future additional changes that will need to be considered: (1) Make sure that a call doesn't hog the front of the queue by receiving data from the network as fast as userspace is consuming it to the exclusion of other calls. (2) Transmit delayed ACKs from within recvmsg() when we've consumed sufficiently more packets to avoid the background work item needing to run. Signed-off-by: David Howells <dhowells@redhat.com>
2016-09-08 04:10:12 -06:00
void rxrpc_notify_socket(struct rxrpc_call *);
int rxrpc_recvmsg(struct socket *, struct msghdr *, size_t, int);
/*
* rxkad.c
*/
#ifdef CONFIG_RXKAD
extern const struct rxrpc_security rxkad;
#endif
/*
* security.c
*/
int __init rxrpc_init_security(void);
void rxrpc_exit_security(void);
int rxrpc_init_client_conn_security(struct rxrpc_connection *);
int rxrpc_init_server_conn_security(struct rxrpc_connection *);
/*
* sendmsg.c
*/
int rxrpc_do_sendmsg(struct rxrpc_sock *, struct msghdr *, size_t);
/*
* skbuff.c
*/
rxrpc: Don't expose skbs to in-kernel users [ver #2] Don't expose skbs to in-kernel users, such as the AFS filesystem, but instead provide a notification hook the indicates that a call needs attention and another that indicates that there's a new call to be collected. This makes the following possibilities more achievable: (1) Call refcounting can be made simpler if skbs don't hold refs to calls. (2) skbs referring to non-data events will be able to be freed much sooner rather than being queued for AFS to pick up as rxrpc_kernel_recv_data will be able to consult the call state. (3) We can shortcut the receive phase when a call is remotely aborted because we don't have to go through all the packets to get to the one cancelling the operation. (4) It makes it easier to do encryption/decryption directly between AFS's buffers and sk_buffs. (5) Encryption/decryption can more easily be done in the AFS's thread contexts - usually that of the userspace process that issued a syscall - rather than in one of rxrpc's background threads on a workqueue. (6) AFS will be able to wait synchronously on a call inside AF_RXRPC. To make this work, the following interface function has been added: int rxrpc_kernel_recv_data( struct socket *sock, struct rxrpc_call *call, void *buffer, size_t bufsize, size_t *_offset, bool want_more, u32 *_abort_code); This is the recvmsg equivalent. It allows the caller to find out about the state of a specific call and to transfer received data into a buffer piecemeal. afs_extract_data() and rxrpc_kernel_recv_data() now do all the extraction logic between them. They don't wait synchronously yet because the socket lock needs to be dealt with. Five interface functions have been removed: rxrpc_kernel_is_data_last() rxrpc_kernel_get_abort_code() rxrpc_kernel_get_error_number() rxrpc_kernel_free_skb() rxrpc_kernel_data_consumed() As a temporary hack, sk_buffs going to an in-kernel call are queued on the rxrpc_call struct (->knlrecv_queue) rather than being handed over to the in-kernel user. To process the queue internally, a temporary function, temp_deliver_data() has been added. This will be replaced with common code between the rxrpc_recvmsg() path and the kernel_rxrpc_recv_data() path in a future patch. Signed-off-by: David Howells <dhowells@redhat.com> Signed-off-by: David S. Miller <davem@davemloft.net>
2016-08-30 13:42:14 -06:00
void rxrpc_kernel_data_consumed(struct rxrpc_call *, struct sk_buff *);
void rxrpc_packet_destructor(struct sk_buff *);
void rxrpc_new_skb(struct sk_buff *, enum rxrpc_skb_trace);
void rxrpc_see_skb(struct sk_buff *, enum rxrpc_skb_trace);
void rxrpc_get_skb(struct sk_buff *, enum rxrpc_skb_trace);
void rxrpc_free_skb(struct sk_buff *, enum rxrpc_skb_trace);
void rxrpc_lose_skb(struct sk_buff *, enum rxrpc_skb_trace);
void rxrpc_purge_queue(struct sk_buff_head *);
/*
* sysctl.c
*/
#ifdef CONFIG_SYSCTL
extern int __init rxrpc_sysctl_init(void);
extern void rxrpc_sysctl_exit(void);
#else
static inline int __init rxrpc_sysctl_init(void) { return 0; }
static inline void rxrpc_sysctl_exit(void) {}
#endif
/*
* utils.c
*/
int rxrpc_extract_addr_from_skb(struct rxrpc_local *, struct sockaddr_rxrpc *,
struct sk_buff *);
rxrpc: Rewrite the data and ack handling code Rewrite the data and ack handling code such that: (1) Parsing of received ACK and ABORT packets and the distribution and the filing of DATA packets happens entirely within the data_ready context called from the UDP socket. This allows us to process and discard ACK and ABORT packets much more quickly (they're no longer stashed on a queue for a background thread to process). (2) We avoid calling skb_clone(), pskb_pull() and pskb_trim(). We instead keep track of the offset and length of the content of each packet in the sk_buff metadata. This means we don't do any allocation in the receive path. (3) Jumbo DATA packet parsing is now done in data_ready context. Rather than cloning the packet once for each subpacket and pulling/trimming it, we file the packet multiple times with an annotation for each indicating which subpacket is there. From that we can directly calculate the offset and length. (4) A call's receive queue can be accessed without taking locks (memory barriers do have to be used, though). (5) Incoming calls are set up from preallocated resources and immediately made live. They can than have packets queued upon them and ACKs generated. If insufficient resources exist, DATA packet #1 is given a BUSY reply and other DATA packets are discarded). (6) sk_buffs no longer take a ref on their parent call. To make this work, the following changes are made: (1) Each call's receive buffer is now a circular buffer of sk_buff pointers (rxtx_buffer) rather than a number of sk_buff_heads spread between the call and the socket. This permits each sk_buff to be in the buffer multiple times. The receive buffer is reused for the transmit buffer. (2) A circular buffer of annotations (rxtx_annotations) is kept parallel to the data buffer. Transmission phase annotations indicate whether a buffered packet has been ACK'd or not and whether it needs retransmission. Receive phase annotations indicate whether a slot holds a whole packet or a jumbo subpacket and, if the latter, which subpacket. They also note whether the packet has been decrypted in place. (3) DATA packet window tracking is much simplified. Each phase has just two numbers representing the window (rx_hard_ack/rx_top and tx_hard_ack/tx_top). The hard_ack number is the sequence number before base of the window, representing the last packet the other side says it has consumed. hard_ack starts from 0 and the first packet is sequence number 1. The top number is the sequence number of the highest-numbered packet residing in the buffer. Packets between hard_ack+1 and top are soft-ACK'd to indicate they've been received, but not yet consumed. Four macros, before(), before_eq(), after() and after_eq() are added to compare sequence numbers within the window. This allows for the top of the window to wrap when the hard-ack sequence number gets close to the limit. Two flags, RXRPC_CALL_RX_LAST and RXRPC_CALL_TX_LAST, are added also to indicate when rx_top and tx_top point at the packets with the LAST_PACKET bit set, indicating the end of the phase. (4) Calls are queued on the socket 'receive queue' rather than packets. This means that we don't need have to invent dummy packets to queue to indicate abnormal/terminal states and we don't have to keep metadata packets (such as ABORTs) around (5) The offset and length of a (sub)packet's content are now passed to the verify_packet security op. This is currently expected to decrypt the packet in place and validate it. However, there's now nowhere to store the revised offset and length of the actual data within the decrypted blob (there may be a header and padding to skip) because an sk_buff may represent multiple packets, so a locate_data security op is added to retrieve these details from the sk_buff content when needed. (6) recvmsg() now has to handle jumbo subpackets, where each subpacket is individually secured and needs to be individually decrypted. The code to do this is broken out into rxrpc_recvmsg_data() and shared with the kernel API. It now iterates over the call's receive buffer rather than walking the socket receive queue. Additional changes: (1) The timers are condensed to a single timer that is set for the soonest of three timeouts (delayed ACK generation, DATA retransmission and call lifespan). (2) Transmission of ACK and ABORT packets is effected immediately from process-context socket ops/kernel API calls that cause them instead of them being punted off to a background work item. The data_ready handler still has to defer to the background, though. (3) A shutdown op is added to the AF_RXRPC socket so that the AFS filesystem can shut down the socket and flush its own work items before closing the socket to deal with any in-progress service calls. Future additional changes that will need to be considered: (1) Make sure that a call doesn't hog the front of the queue by receiving data from the network as fast as userspace is consuming it to the exclusion of other calls. (2) Transmit delayed ACKs from within recvmsg() when we've consumed sufficiently more packets to avoid the background work item needing to run. Signed-off-by: David Howells <dhowells@redhat.com>
2016-09-08 04:10:12 -06:00
static inline bool before(u32 seq1, u32 seq2)
{
return (s32)(seq1 - seq2) < 0;
}
static inline bool before_eq(u32 seq1, u32 seq2)
{
return (s32)(seq1 - seq2) <= 0;
}
static inline bool after(u32 seq1, u32 seq2)
{
return (s32)(seq1 - seq2) > 0;
}
static inline bool after_eq(u32 seq1, u32 seq2)
{
return (s32)(seq1 - seq2) >= 0;
}
/*
* debug tracing
*/
extern unsigned int rxrpc_debug;
#define dbgprintk(FMT,...) \
printk("[%-6.6s] "FMT"\n", current->comm ,##__VA_ARGS__)
#define kenter(FMT,...) dbgprintk("==> %s("FMT")",__func__ ,##__VA_ARGS__)
#define kleave(FMT,...) dbgprintk("<== %s()"FMT"",__func__ ,##__VA_ARGS__)
#define kdebug(FMT,...) dbgprintk(" "FMT ,##__VA_ARGS__)
#define kproto(FMT,...) dbgprintk("### "FMT ,##__VA_ARGS__)
#define knet(FMT,...) dbgprintk("@@@ "FMT ,##__VA_ARGS__)
#if defined(__KDEBUG)
#define _enter(FMT,...) kenter(FMT,##__VA_ARGS__)
#define _leave(FMT,...) kleave(FMT,##__VA_ARGS__)
#define _debug(FMT,...) kdebug(FMT,##__VA_ARGS__)
#define _proto(FMT,...) kproto(FMT,##__VA_ARGS__)
#define _net(FMT,...) knet(FMT,##__VA_ARGS__)
#elif defined(CONFIG_AF_RXRPC_DEBUG)
#define RXRPC_DEBUG_KENTER 0x01
#define RXRPC_DEBUG_KLEAVE 0x02
#define RXRPC_DEBUG_KDEBUG 0x04
#define RXRPC_DEBUG_KPROTO 0x08
#define RXRPC_DEBUG_KNET 0x10
#define _enter(FMT,...) \
do { \
if (unlikely(rxrpc_debug & RXRPC_DEBUG_KENTER)) \
kenter(FMT,##__VA_ARGS__); \
} while (0)
#define _leave(FMT,...) \
do { \
if (unlikely(rxrpc_debug & RXRPC_DEBUG_KLEAVE)) \
kleave(FMT,##__VA_ARGS__); \
} while (0)
#define _debug(FMT,...) \
do { \
if (unlikely(rxrpc_debug & RXRPC_DEBUG_KDEBUG)) \
kdebug(FMT,##__VA_ARGS__); \
} while (0)
#define _proto(FMT,...) \
do { \
if (unlikely(rxrpc_debug & RXRPC_DEBUG_KPROTO)) \
kproto(FMT,##__VA_ARGS__); \
} while (0)
#define _net(FMT,...) \
do { \
if (unlikely(rxrpc_debug & RXRPC_DEBUG_KNET)) \
knet(FMT,##__VA_ARGS__); \
} while (0)
#else
#define _enter(FMT,...) no_printk("==> %s("FMT")",__func__ ,##__VA_ARGS__)
#define _leave(FMT,...) no_printk("<== %s()"FMT"",__func__ ,##__VA_ARGS__)
#define _debug(FMT,...) no_printk(" "FMT ,##__VA_ARGS__)
#define _proto(FMT,...) no_printk("### "FMT ,##__VA_ARGS__)
#define _net(FMT,...) no_printk("@@@ "FMT ,##__VA_ARGS__)
#endif
/*
* debug assertion checking
*/
#if 1 // defined(__KDEBUGALL)
#define ASSERT(X) \
do { \
if (unlikely(!(X))) { \
pr_err("Assertion failed\n"); \
BUG(); \
} \
} while (0)
#define ASSERTCMP(X, OP, Y) \
do { \
__typeof__(X) _x = (X); \
__typeof__(Y) _y = (__typeof__(X))(Y); \
if (unlikely(!(_x OP _y))) { \
pr_err("Assertion failed - %lu(0x%lx) %s %lu(0x%lx) is false\n", \
(unsigned long)_x, (unsigned long)_x, #OP, \
(unsigned long)_y, (unsigned long)_y); \
BUG(); \
} \
} while (0)
#define ASSERTIF(C, X) \
do { \
if (unlikely((C) && !(X))) { \
pr_err("Assertion failed\n"); \
BUG(); \
} \
} while (0)
#define ASSERTIFCMP(C, X, OP, Y) \
do { \
__typeof__(X) _x = (X); \
__typeof__(Y) _y = (__typeof__(X))(Y); \
if (unlikely((C) && !(_x OP _y))) { \
pr_err("Assertion failed - %lu(0x%lx) %s %lu(0x%lx) is false\n", \
(unsigned long)_x, (unsigned long)_x, #OP, \
(unsigned long)_y, (unsigned long)_y); \
BUG(); \
} \
} while (0)
#else
#define ASSERT(X) \
do { \
} while (0)
#define ASSERTCMP(X, OP, Y) \
do { \
} while (0)
#define ASSERTIF(C, X) \
do { \
} while (0)
#define ASSERTIFCMP(C, X, OP, Y) \
do { \
} while (0)
#endif /* __KDEBUGALL */