From edc1b01cd3b20a5fff049e98f82a2b0d24a34c89 Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Mon, 5 Oct 2015 10:53:49 -0400 Subject: SUNRPC: Move TCP receive data path into a workqueue context Stream protocols such as TCP can often build up a backlog of data to be read due to ordering. Combine this with the fact that some workloads such as NFS read()-intensive workloads need to receive a lot of data per RPC call, and it turns out that receiving the data from inside a softirq context can cause starvation. The following patch moves the TCP data receive into a workqueue context. We still end up calling tcp_read_sock(), but we do so from a process context, meaning that softirqs are enabled for most of the time. With this patch, I see a doubling of read bandwidth when running a multi-threaded iozone workload between a virtual client and server setup. Signed-off-by: Trond Myklebust --- include/linux/sunrpc/xprtsock.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/linux/sunrpc') diff --git a/include/linux/sunrpc/xprtsock.h b/include/linux/sunrpc/xprtsock.h index 357e44c1a46b..0ece4ba06f06 100644 --- a/include/linux/sunrpc/xprtsock.h +++ b/include/linux/sunrpc/xprtsock.h @@ -44,6 +44,8 @@ struct sock_xprt { */ unsigned long sock_state; struct delayed_work connect_worker; + struct work_struct recv_worker; + struct mutex recv_mutex; struct sockaddr_storage srcaddr; unsigned short srcport; -- cgit v1.2.3 From 778620364ef525e83597a6edee4d0a69db67fd3d Mon Sep 17 00:00:00 2001 From: Neil Brown Date: Fri, 16 Oct 2015 08:59:08 +1100 Subject: sunrpc/cache: make cache flushing more reliable. The caches used to store sunrpc authentication information can be flushed by writing a timestamp to a file in /proc. This timestamp has a one-second resolution and any entry in cache that was last_refreshed *before* that time is treated as expired. This is problematic as it is not possible to reliably flush the cache without interrupting NFS service. If the current time is written to the "flush" file, any entry that was added since the current second started will still be treated as valid. If one second beyond than the current time is written to the file then no entries can be valid until the second ticks over. This will mean that no NFS request will be handled for up to 1 second. To resolve this issue we make two changes: 1/ treat an entry as expired if the timestamp when it was last_refreshed is before *or the same as* the expiry time. This means that current code which writes out the current time will now flush the cache reliably. 2/ when a new entry in added to the cache - set the last_refresh timestamp to 1 second *beyond* the current flush time, when that not in the past. This ensures that newly added entries will always be valid. Now that we have a very reliable way to flush the cache, and also since we are using "since-boot" timestamps which are monotonic, change cache_purge() to set the smallest future flush_time which will work, and leave it there: don't revert to '1'. Also disable the setting of the 'flush_time' far into the future. That has never been useful and is now awkward as it would cause last_refresh times to be strange. Finally: if a request is made to set the 'flush_time' to the current second, assume the intent is to flush the cache and advance it, if necessary, to 1 second beyond the current 'flush_time' so that all active entries will be deemed to be expired. As part of this we need to add a 'cache_detail' arg to cache_init() and cache_fresh_locked() so they can find the current ->flush_time. Signed-off-by: NeilBrown Reported-by: Olaf Kirch Signed-off-by: J. Bruce Fields --- include/linux/sunrpc/cache.h | 16 +++++++++++----- 1 file changed, 11 insertions(+), 5 deletions(-) (limited to 'include/linux/sunrpc') diff --git a/include/linux/sunrpc/cache.h b/include/linux/sunrpc/cache.h index 03d3b4c92d9f..ed03c9f7f908 100644 --- a/include/linux/sunrpc/cache.h +++ b/include/linux/sunrpc/cache.h @@ -48,8 +48,10 @@ struct cache_head { struct hlist_node cache_list; time_t expiry_time; /* After time time, don't use the data */ - time_t last_refresh; /* If CACHE_PENDING, this is when upcall - * was sent, else this is when update was received + time_t last_refresh; /* If CACHE_PENDING, this is when upcall was + * sent, else this is when update was + * received, though it is alway set to + * be *after* ->flush_time. */ struct kref ref; unsigned long flags; @@ -105,8 +107,12 @@ struct cache_detail { /* fields below this comment are for internal use * and should not be touched by cache owners */ - time_t flush_time; /* flush all cache items with last_refresh - * earlier than this */ + time_t flush_time; /* flush all cache items with + * last_refresh at or earlier + * than this. last_refresh + * is never set at or earlier + * than this. + */ struct list_head others; time_t nextcheck; int entries; @@ -203,7 +209,7 @@ static inline void cache_put(struct cache_head *h, struct cache_detail *cd) static inline int cache_is_expired(struct cache_detail *detail, struct cache_head *h) { return (h->expiry_time < seconds_since_boot()) || - (detail->flush_time > h->last_refresh); + (detail->flush_time >= h->last_refresh); } extern int cache_check(struct cache_detail *detail, -- cgit v1.2.3 From 412a15c0fe537c59c794d4e8134580b9cb984a0c Mon Sep 17 00:00:00 2001 From: Sagi Grimberg Date: Tue, 13 Oct 2015 19:11:36 +0300 Subject: svcrdma: Port to new memory registration API Instead of maintaining a fastreg page list, keep an sg table and convert an array of pages to a sg list. Then call ib_map_mr_sg and construct ib_reg_wr. Signed-off-by: Sagi Grimberg Acked-by: Christoph Hellwig Tested-by: Steve Wise Tested-by: Selvin Xavier Signed-off-by: Doug Ledford --- include/linux/sunrpc/svc_rdma.h | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) (limited to 'include/linux/sunrpc') diff --git a/include/linux/sunrpc/svc_rdma.h b/include/linux/sunrpc/svc_rdma.h index 7ccc961f33e9..1e4438ea2380 100644 --- a/include/linux/sunrpc/svc_rdma.h +++ b/include/linux/sunrpc/svc_rdma.h @@ -105,11 +105,9 @@ struct svc_rdma_chunk_sge { }; struct svc_rdma_fastreg_mr { struct ib_mr *mr; - void *kva; - struct ib_fast_reg_page_list *page_list; - int page_list_len; + struct scatterlist *sg; + int sg_nents; unsigned long access_flags; - unsigned long map_len; enum dma_data_direction direction; struct list_head frmr_list; }; -- cgit v1.2.3 From 42e5c3e2725ba0c0affc1fc8a6aa1d5cf31ecb75 Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Sat, 24 Oct 2015 17:27:35 -0400 Subject: SUNRPC: Abstract backchannel operations xprt_{setup,destroy}_backchannel() won't be adequate for RPC/RMDA bi-direction. In particular, receive buffers have to be pre- registered and posted in order to receive incoming backchannel requests. Add a virtual function call to allow the insertion of appropriate backchannel setup and destruction methods for each transport. In addition, freeing a backchannel request is a little different for RPC/RDMA. Introduce an rpc_xprt_op to handle the difference. Signed-off-by: Chuck Lever Reviewed-by: Sagi Grimberg Tested-By: Devesh Sharma Signed-off-by: Anna Schumaker --- include/linux/sunrpc/bc_xprt.h | 5 +++++ include/linux/sunrpc/xprt.h | 5 +++++ 2 files changed, 10 insertions(+) (limited to 'include/linux/sunrpc') diff --git a/include/linux/sunrpc/bc_xprt.h b/include/linux/sunrpc/bc_xprt.h index 8df43c9f11dc..4397a4824c81 100644 --- a/include/linux/sunrpc/bc_xprt.h +++ b/include/linux/sunrpc/bc_xprt.h @@ -38,6 +38,11 @@ void xprt_free_bc_request(struct rpc_rqst *req); int xprt_setup_backchannel(struct rpc_xprt *, unsigned int min_reqs); void xprt_destroy_backchannel(struct rpc_xprt *, unsigned int max_reqs); +/* Socket backchannel transport methods */ +int xprt_setup_bc(struct rpc_xprt *xprt, unsigned int min_reqs); +void xprt_destroy_bc(struct rpc_xprt *xprt, unsigned int max_reqs); +void xprt_free_bc_rqst(struct rpc_rqst *req); + /* * Determine if a shared backchannel is in use */ diff --git a/include/linux/sunrpc/xprt.h b/include/linux/sunrpc/xprt.h index 0fb9acbb4780..3f79c4a4ce74 100644 --- a/include/linux/sunrpc/xprt.h +++ b/include/linux/sunrpc/xprt.h @@ -136,6 +136,11 @@ struct rpc_xprt_ops { int (*enable_swap)(struct rpc_xprt *xprt); void (*disable_swap)(struct rpc_xprt *xprt); void (*inject_disconnect)(struct rpc_xprt *xprt); + int (*bc_setup)(struct rpc_xprt *xprt, + unsigned int min_reqs); + void (*bc_free_rqst)(struct rpc_rqst *rqst); + void (*bc_destroy)(struct rpc_xprt *xprt, + unsigned int max_reqs); }; /* -- cgit v1.2.3 From 9468431962616c2449d47c482208a5967e011bf9 Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Sat, 24 Oct 2015 17:28:16 -0400 Subject: svcrdma: Add backward direction service for RPC/RDMA transport On NFSv4.1 mount points, the Linux NFS client uses this transport endpoint to receive backward direction calls and route replies back to the NFSv4.1 server. Signed-off-by: Chuck Lever Acked-by: "J. Bruce Fields" Reviewed-by: Sagi Grimberg Tested-By: Devesh Sharma Signed-off-by: Anna Schumaker --- include/linux/sunrpc/svc_rdma.h | 6 +++++- include/linux/sunrpc/xprt.h | 1 + 2 files changed, 6 insertions(+), 1 deletion(-) (limited to 'include/linux/sunrpc') diff --git a/include/linux/sunrpc/svc_rdma.h b/include/linux/sunrpc/svc_rdma.h index 7ccc961f33e9..fb4013edcf57 100644 --- a/include/linux/sunrpc/svc_rdma.h +++ b/include/linux/sunrpc/svc_rdma.h @@ -228,9 +228,13 @@ extern void svc_rdma_put_frmr(struct svcxprt_rdma *, struct svc_rdma_fastreg_mr *); extern void svc_sq_reap(struct svcxprt_rdma *); extern void svc_rq_reap(struct svcxprt_rdma *); -extern struct svc_xprt_class svc_rdma_class; extern void svc_rdma_prep_reply_hdr(struct svc_rqst *); +extern struct svc_xprt_class svc_rdma_class; +#ifdef CONFIG_SUNRPC_BACKCHANNEL +extern struct svc_xprt_class svc_rdma_bc_class; +#endif + /* svc_rdma.c */ extern int svc_rdma_init(void); extern void svc_rdma_cleanup(void); diff --git a/include/linux/sunrpc/xprt.h b/include/linux/sunrpc/xprt.h index 3f79c4a4ce74..82c083946ef0 100644 --- a/include/linux/sunrpc/xprt.h +++ b/include/linux/sunrpc/xprt.h @@ -158,6 +158,7 @@ enum xprt_transports { XPRT_TRANSPORT_TCP = IPPROTO_TCP, XPRT_TRANSPORT_BC_TCP = IPPROTO_TCP | XPRT_TRANSPORT_BC, XPRT_TRANSPORT_RDMA = 256, + XPRT_TRANSPORT_BC_RDMA = XPRT_TRANSPORT_RDMA | XPRT_TRANSPORT_BC, XPRT_TRANSPORT_LOCAL = 257, }; -- cgit v1.2.3 From 76566773a1f1c2295ed901b6f1241cfe10d99029 Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Sat, 24 Oct 2015 17:28:32 -0400 Subject: NFS: Enable client side NFSv4.1 backchannel to use other transports Forechannel transports get their own "bc_up" method to create an endpoint for the backchannel service. Signed-off-by: Chuck Lever [Anna Schumaker: Add forward declaration of struct net to xprt.h] Signed-off-by: Anna Schumaker --- include/linux/sunrpc/xprt.h | 3 +++ 1 file changed, 3 insertions(+) (limited to 'include/linux/sunrpc') diff --git a/include/linux/sunrpc/xprt.h b/include/linux/sunrpc/xprt.h index 82c083946ef0..69ef5b3ab038 100644 --- a/include/linux/sunrpc/xprt.h +++ b/include/linux/sunrpc/xprt.h @@ -54,6 +54,8 @@ enum rpc_display_format_t { struct rpc_task; struct rpc_xprt; struct seq_file; +struct svc_serv; +struct net; /* * This describes a complete RPC request @@ -138,6 +140,7 @@ struct rpc_xprt_ops { void (*inject_disconnect)(struct rpc_xprt *xprt); int (*bc_setup)(struct rpc_xprt *xprt, unsigned int min_reqs); + int (*bc_up)(struct svc_serv *serv, struct net *net); void (*bc_free_rqst)(struct rpc_rqst *rqst); void (*bc_destroy)(struct rpc_xprt *xprt, unsigned int max_reqs); -- cgit v1.2.3