From f6763c29ab86c3ee27760a06e07bbeab47635b61 Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Tue, 1 Mar 2016 13:05:54 -0500 Subject: svcrdma: Do not send Write chunk XDR pad with inline content The NFS server's XDR encoders adds an XDR pad for content in the xdr_buf page list at the beginning of the xdr_buf's tail buffer. On RDMA transports, Write chunks are sent separately and without an XDR pad. If a Write chunk is being sent, strip off the pad in the tail buffer so that inline content following the Write chunk remains XDR-aligned when it is sent to the client. BugLink: https://bugzilla.linux-nfs.org/show_bug.cgi?id=294 Signed-off-by: Chuck Lever Signed-off-by: J. Bruce Fields --- include/linux/sunrpc/svc_rdma.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/sunrpc/svc_rdma.h b/include/linux/sunrpc/svc_rdma.h index 5322fea6fe4c..40b678584041 100644 --- a/include/linux/sunrpc/svc_rdma.h +++ b/include/linux/sunrpc/svc_rdma.h @@ -224,7 +224,7 @@ extern int rdma_read_chunk_frmr(struct svcxprt_rdma *, struct svc_rqst *, /* svc_rdma_sendto.c */ extern int svc_rdma_map_xdr(struct svcxprt_rdma *, struct xdr_buf *, - struct svc_rdma_req_map *); + struct svc_rdma_req_map *, bool); extern int svc_rdma_sendto(struct svc_rqst *); extern struct rpcrdma_read_chunk * svc_rdma_get_read_chunk(struct rpcrdma_msg *); -- cgit v1.2.3 From 4500632f60fa0d85e4101c374898cdf9b7b0cfac Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Tue, 1 Mar 2016 13:06:02 -0500 Subject: nfsd: Lower NFSv4.1 callback message size limit The maximum size of a backchannel message on RPC-over-RDMA depends on the connection's inline threshold. Today that threshold is typically 1024 bytes, making the maximum message size 996 bytes. The Linux server's CREATE_SESSION operation checks that the size of callback Calls can be as large as 1044 bytes, to accommodate RPCSEC_GSS. Thus CREATE_SESSION fails if a client advertises the true message size maximum of 996 bytes. But the server's backchannel currently does not support RPCSEC_GSS. The actual maximum size it needs is much smaller. It is safe to reduce the limit to enable NFSv4.1 on RDMA backchannel operation. Signed-off-by: Chuck Lever Signed-off-by: J. Bruce Fields --- include/linux/sunrpc/auth.h | 7 +++++++ 1 file changed, 7 insertions(+) (limited to 'include/linux') diff --git a/include/linux/sunrpc/auth.h b/include/linux/sunrpc/auth.h index 1ecf13e148b8..6a241a277249 100644 --- a/include/linux/sunrpc/auth.h +++ b/include/linux/sunrpc/auth.h @@ -20,11 +20,18 @@ #include #include +/* + * Maximum size of AUTH_NONE authentication information, in XDR words. + */ +#define NUL_CALLSLACK (4) +#define NUL_REPLYSLACK (2) + /* * Size of the nodename buffer. RFC1831 specifies a hard limit of 255 bytes, * but Linux hostnames are actually limited to __NEW_UTS_LEN bytes. */ #define UNX_MAXNODENAME __NEW_UTS_LEN +#define UNX_CALLSLACK (21 + XDR_QUADLEN(UNX_MAXNODENAME)) struct rpcsec_gss_info; -- cgit v1.2.3 From bf36387ad394ad4fc93ad85fdd4a95dfa583556a Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Tue, 1 Mar 2016 13:06:20 -0500 Subject: svcrdma: svc_rdma_post_recv() should close connection on error Clean up: Most svc_rdma_post_recv() call sites close the transport connection when a receive cannot be posted. Wrap that in a common helper. Signed-off-by: Chuck Lever Reviewed-by: Devesh Sharma Tested-by: Devesh Sharma Signed-off-by: J. Bruce Fields --- include/linux/sunrpc/svc_rdma.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/sunrpc/svc_rdma.h b/include/linux/sunrpc/svc_rdma.h index 40b678584041..aef47dd2bd1a 100644 --- a/include/linux/sunrpc/svc_rdma.h +++ b/include/linux/sunrpc/svc_rdma.h @@ -234,6 +234,7 @@ extern int svc_rdma_send(struct svcxprt_rdma *, struct ib_send_wr *); extern void svc_rdma_send_error(struct svcxprt_rdma *, struct rpcrdma_msg *, enum rpcrdma_errcode); extern int svc_rdma_post_recv(struct svcxprt_rdma *, gfp_t); +extern int svc_rdma_repost_recv(struct svcxprt_rdma *, gfp_t); extern int svc_rdma_create_listen(struct svc_serv *, int, struct sockaddr *); extern struct svc_rdma_op_ctxt *svc_rdma_get_context(struct svcxprt_rdma *); extern void svc_rdma_put_context(struct svc_rdma_op_ctxt *, int); -- cgit v1.2.3 From c6db03ea577846a72dc80638f4a70b392c21962f Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Tue, 1 Mar 2016 13:06:29 -0500 Subject: rpcrdma: Add RPCRDMA_HDRLEN_ERR Error headers are shorter than either RDMA_MSG or RDMA_NOMSG. Since HDRLEN_MIN is already used in several other places that would be annoying to change, add RPCRDMA_HDRLEN_ERR for the one or two spots where the shorter length is needed. Signed-off-by: Chuck Lever Reviewed-by: Devesh Sharma Signed-off-by: J. Bruce Fields --- include/linux/sunrpc/rpc_rdma.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/sunrpc/rpc_rdma.h b/include/linux/sunrpc/rpc_rdma.h index f33c5a4d6fe4..8c6d23cb0cae 100644 --- a/include/linux/sunrpc/rpc_rdma.h +++ b/include/linux/sunrpc/rpc_rdma.h @@ -102,6 +102,7 @@ struct rpcrdma_msg { * Smallest RPC/RDMA header: rm_xid through rm_type, then rm_nochunks */ #define RPCRDMA_HDRLEN_MIN (sizeof(__be32) * 7) +#define RPCRDMA_HDRLEN_ERR (sizeof(__be32) * 5) enum rpcrdma_errcode { ERR_VERS = 1, -- cgit v1.2.3 From a6081b82c533d78041acb76738716aa7dafb339a Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Tue, 1 Mar 2016 13:06:38 -0500 Subject: svcrdma: Make RDMA_ERROR messages work Fix several issues with svc_rdma_send_error(): - Post a receive buffer to replace the one that was consumed by the incoming request - Posting a send should use DMA_TO_DEVICE, not DMA_FROM_DEVICE - No need to put_page _and_ free pages in svc_rdma_put_context - Make sure the sge is set up completely in case the error path goes through svc_rdma_unmap_dma() - Replace the use of ENOSYS, which has a reserved meaning Related fixes in svc_rdma_recvfrom(): - Don't leak the ctxt associated with the incoming request - Don't close the connection after sending an error reply - Let svc_rdma_send_error() figure out the right header error code As a last clean up, move svc_rdma_send_error() to svc_rdma_sendto.c with other similar functions. There is some common logic in these functions that could someday be combined to reduce code duplication. Signed-off-by: Chuck Lever Reviewed-by: Devesh Sharma Tested-by: Devesh Sharma Signed-off-by: J. Bruce Fields --- include/linux/sunrpc/svc_rdma.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/sunrpc/svc_rdma.h b/include/linux/sunrpc/svc_rdma.h index aef47dd2bd1a..42e852230a03 100644 --- a/include/linux/sunrpc/svc_rdma.h +++ b/include/linux/sunrpc/svc_rdma.h @@ -228,11 +228,11 @@ extern int svc_rdma_map_xdr(struct svcxprt_rdma *, struct xdr_buf *, extern int svc_rdma_sendto(struct svc_rqst *); extern struct rpcrdma_read_chunk * svc_rdma_get_read_chunk(struct rpcrdma_msg *); +extern void svc_rdma_send_error(struct svcxprt_rdma *, struct rpcrdma_msg *, + int); /* svc_rdma_transport.c */ extern int svc_rdma_send(struct svcxprt_rdma *, struct ib_send_wr *); -extern void svc_rdma_send_error(struct svcxprt_rdma *, struct rpcrdma_msg *, - enum rpcrdma_errcode); extern int svc_rdma_post_recv(struct svcxprt_rdma *, gfp_t); extern int svc_rdma_repost_recv(struct svcxprt_rdma *, gfp_t); extern int svc_rdma_create_listen(struct svc_serv *, int, struct sockaddr *); -- cgit v1.2.3 From f3ea53fb3bc3908b6e9ef39e53a75b55df7f78f8 Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Tue, 1 Mar 2016 13:06:47 -0500 Subject: svcrdma: Use correct XID in error replies When constructing an error reply, svc_rdma_xdr_encode_error() needs to view the client's request message so it can get the failing request's XID. svc_rdma_xdr_decode_req() is supposed to return a pointer to the client's request header. But if it fails to decode the client's message (and thus an error reply is needed) it does not return the pointer. The server then sends a bogus XID in the error reply. Instead, unconditionally generate the pointer to the client's header in svc_rdma_recvfrom(), and pass that pointer to both functions. Signed-off-by: Chuck Lever Reviewed-by: Devesh Sharma Tested-by: Devesh Sharma Signed-off-by: J. Bruce Fields --- include/linux/sunrpc/svc_rdma.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/sunrpc/svc_rdma.h b/include/linux/sunrpc/svc_rdma.h index 42e852230a03..c2b0d95602d8 100644 --- a/include/linux/sunrpc/svc_rdma.h +++ b/include/linux/sunrpc/svc_rdma.h @@ -199,7 +199,7 @@ extern int svc_rdma_handle_bc_reply(struct rpc_xprt *xprt, struct xdr_buf *rcvbuf); /* svc_rdma_marshal.c */ -extern int svc_rdma_xdr_decode_req(struct rpcrdma_msg **, struct svc_rqst *); +extern int svc_rdma_xdr_decode_req(struct rpcrdma_msg *, struct svc_rqst *); extern int svc_rdma_xdr_encode_error(struct svcxprt_rdma *, struct rpcrdma_msg *, enum rpcrdma_errcode, __be32 *); -- cgit v1.2.3 From 8bd5ba86d9ba7169e137fc4f32c553080c056a02 Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Tue, 1 Mar 2016 13:07:13 -0500 Subject: svcrdma: Use new CQ API for RPC-over-RDMA server receive CQs Calling ib_poll_cq() to sort through WCs during a completion is a common pattern amongst RDMA consumers. Since commit 14d3a3b2498e ("IB: add a proper completion queue abstraction"), WC sorting can be handled by the IB core. By converting to this new API, svcrdma is made a better neighbor to other RDMA consumers, as it allows the core to schedule the delivery of completions more fairly amongst all active consumers. Because each ib_cqe carries a pointer to a completion method, the core can now post operations on a consumer's QP, and handle the completions itself. svcrdma receive completions no longer use the dto_tasklet. Each polled Receive WC is now handled individually in soft IRQ context. The server transport's rdma_stat_rq_poll and rdma_stat_rq_prod metrics are no longer updated. Signed-off-by: Chuck Lever Signed-off-by: J. Bruce Fields --- include/linux/sunrpc/svc_rdma.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/sunrpc/svc_rdma.h b/include/linux/sunrpc/svc_rdma.h index c2b0d95602d8..cf79ab86d3d4 100644 --- a/include/linux/sunrpc/svc_rdma.h +++ b/include/linux/sunrpc/svc_rdma.h @@ -75,6 +75,7 @@ struct svc_rdma_op_ctxt { struct svc_rdma_fastreg_mr *frmr; int hdr_count; struct xdr_buf arg; + struct ib_cqe cqe; struct list_head dto_q; enum ib_wr_opcode wr_op; enum ib_wc_status wc_status; @@ -174,7 +175,6 @@ struct svcxprt_rdma { struct work_struct sc_work; }; /* sc_flags */ -#define RDMAXPRT_RQ_PENDING 1 #define RDMAXPRT_SQ_PENDING 2 #define RDMAXPRT_CONN_PENDING 3 -- cgit v1.2.3 From be99bb11400ce02552c35a6d3bf054de393ce30e Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Tue, 1 Mar 2016 13:07:22 -0500 Subject: svcrdma: Use new CQ API for RPC-over-RDMA server send CQs Calling ib_poll_cq() to sort through WCs during a completion is a common pattern amongst RDMA consumers. Since commit 14d3a3b2498e ("IB: add a proper completion queue abstraction"), WC sorting can be handled by the IB core. By converting to this new API, svcrdma is made a better neighbor to other RDMA consumers, as it allows the core to schedule the delivery of completions more fairly amongst all active consumers. This new API also aims each completion at a function that is specific to the WR's opcode. Thus the ctxt->wr_op field and the switch in process_context is replaced by a set of methods that handle each completion type. Because each ib_cqe carries a pointer to a completion method, the core can now post operations on a consumer's QP, and handle the completions itself. The server's rdma_stat_sq_poll and rdma_stat_sq_prod metrics are no longer updated. As a clean up, the cq_event_handler, the dto_tasklet, and all associated locking is removed, as they are no longer referenced or used. Signed-off-by: Chuck Lever Tested-by: Steve Wise Signed-off-by: J. Bruce Fields --- include/linux/sunrpc/svc_rdma.h | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/sunrpc/svc_rdma.h b/include/linux/sunrpc/svc_rdma.h index cf79ab86d3d4..3081339968c3 100644 --- a/include/linux/sunrpc/svc_rdma.h +++ b/include/linux/sunrpc/svc_rdma.h @@ -76,8 +76,9 @@ struct svc_rdma_op_ctxt { int hdr_count; struct xdr_buf arg; struct ib_cqe cqe; + struct ib_cqe reg_cqe; + struct ib_cqe inv_cqe; struct list_head dto_q; - enum ib_wr_opcode wr_op; enum ib_wc_status wc_status; u32 byte_len; u32 position; @@ -175,7 +176,6 @@ struct svcxprt_rdma { struct work_struct sc_work; }; /* sc_flags */ -#define RDMAXPRT_SQ_PENDING 2 #define RDMAXPRT_CONN_PENDING 3 #define RPCRDMA_LISTEN_BACKLOG 10 @@ -232,6 +232,11 @@ extern void svc_rdma_send_error(struct svcxprt_rdma *, struct rpcrdma_msg *, int); /* svc_rdma_transport.c */ +extern void svc_rdma_wc_send(struct ib_cq *, struct ib_wc *); +extern void svc_rdma_wc_write(struct ib_cq *, struct ib_wc *); +extern void svc_rdma_wc_reg(struct ib_cq *, struct ib_wc *); +extern void svc_rdma_wc_read(struct ib_cq *, struct ib_wc *); +extern void svc_rdma_wc_inv(struct ib_cq *, struct ib_wc *); extern int svc_rdma_send(struct svcxprt_rdma *, struct ib_send_wr *); extern int svc_rdma_post_recv(struct svcxprt_rdma *, gfp_t); extern int svc_rdma_repost_recv(struct svcxprt_rdma *, gfp_t); -- cgit v1.2.3