From 18c64378ad85ef00e70f196793ee8901a8aa2fa1 Mon Sep 17 00:00:00 2001 From: Jeff Layton Date: Fri, 11 Apr 2025 10:22:14 -0400 Subject: sunrpc: add info about xprt queue times to svc_xprt_dequeue tracepoint I've been looking at a problem where we see increased RPC timeouts in clients when the nfs_layout_flexfiles dataserver_timeo value is tuned very low (6s). This is necessary to ensure quick failover to a different mirror if a server goes down, but it causes a lot more major RPC timeouts. Ultimately, the problem is server-side however. It's sometimes doesn't respond to connection attempts. My theory is that the interrupt handler runs when a connection comes in, the xprt ends up being enqueued, but it takes a significant amount of time for the nfsd thread to pick it up. Currently, the svc_xprt_dequeue tracepoint displays "wakeup-us". This is the time between the wake_up() call, and the thread dequeueing the xprt. If no thread was woken, or the thread ended up picking up a different xprt than intended, then this value won't tell us how long the xprt was waiting. Add a new xpt_qtime field to struct svc_xprt and set it in svc_xprt_enqueue(). When the dequeue tracepoint fires, also store the time that the xprt sat on the queue in total. Display it as "qtime-us". Signed-off-by: Jeff Layton Signed-off-by: Chuck Lever --- include/linux/sunrpc/svc_xprt.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/sunrpc/svc_xprt.h b/include/linux/sunrpc/svc_xprt.h index 72be60952579..369a89aea186 100644 --- a/include/linux/sunrpc/svc_xprt.h +++ b/include/linux/sunrpc/svc_xprt.h @@ -53,6 +53,7 @@ struct svc_xprt { struct svc_xprt_class *xpt_class; const struct svc_xprt_ops *xpt_ops; struct kref xpt_ref; + ktime_t xpt_qtime; struct list_head xpt_list; struct lwq_node xpt_ready; unsigned long xpt_flags; -- cgit v1.2.3 From eff042ddf4b9587fa7394d502b93e06ec6015177 Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Mon, 28 Apr 2025 15:36:50 -0400 Subject: sunrpc: Add a helper to derive maxpages from sv_max_mesg This page count is to be used to allocate various arrays of pages and bio_vecs, replacing the fixed RPCSVC_MAXPAGES value. The documenting comment is somewhat stale -- of course NFSv4 COMPOUND procedures may have multiple payloads. Reviewed-by: Jeff Layton Reviewed-by: NeilBrown Signed-off-by: Chuck Lever --- include/linux/sunrpc/svc.h | 17 +++++++++++++++++ 1 file changed, 17 insertions(+) (limited to 'include/linux') diff --git a/include/linux/sunrpc/svc.h b/include/linux/sunrpc/svc.h index 74658cca0f38..749f8e53e8a6 100644 --- a/include/linux/sunrpc/svc.h +++ b/include/linux/sunrpc/svc.h @@ -159,6 +159,23 @@ extern u32 svc_max_payload(const struct svc_rqst *rqstp); #define RPCSVC_MAXPAGES ((RPCSVC_MAXPAYLOAD+PAGE_SIZE-1)/PAGE_SIZE \ + 2 + 1) +/** + * svc_serv_maxpages - maximum count of pages needed for one RPC message + * @serv: RPC service context + * + * Returns a count of pages or vectors that can hold the maximum + * size RPC message for @serv. + * + * Each request/reply pair can have at most one "payload", plus two + * pages, one for the request, and one for the reply. + * nfsd_splice_actor() might need an extra page when a READ payload + * is not page-aligned. + */ +static inline unsigned long svc_serv_maxpages(const struct svc_serv *serv) +{ + return DIV_ROUND_UP(serv->sv_max_mesg, PAGE_SIZE) + 2 + 1; +} + /* * The context of a single thread, including the request currently being * processed. -- cgit v1.2.3 From ed603bcf4feac05df937bf78bc7feb75f988a971 Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Mon, 28 Apr 2025 15:36:52 -0400 Subject: sunrpc: Replace the rq_pages array with dynamically-allocated memory As a step towards making NFSD's maximum rsize and wsize variable at run-time, replace the fixed-size rq_vec[] array in struct svc_rqst with a chunk of dynamically-allocated memory. On a system with 8-byte pointers and 4KB pages, pahole reports that the rq_pages[] array is 2080 bytes. This patch replaces that with a single 8-byte pointer field. Reviewed-by: Jeff Layton Reviewed-by: NeilBrown Signed-off-by: Chuck Lever --- include/linux/sunrpc/svc.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/sunrpc/svc.h b/include/linux/sunrpc/svc.h index 749f8e53e8a6..57be69539888 100644 --- a/include/linux/sunrpc/svc.h +++ b/include/linux/sunrpc/svc.h @@ -205,7 +205,8 @@ struct svc_rqst { struct xdr_stream rq_res_stream; struct page *rq_scratch_page; struct xdr_buf rq_res; - struct page *rq_pages[RPCSVC_MAXPAGES + 1]; + unsigned long rq_maxpages; /* num of entries in rq_pages */ + struct page * *rq_pages; struct page * *rq_respages; /* points into rq_pages */ struct page * *rq_next_page; /* next reply page to use */ struct page * *rq_page_end; /* one past the last page */ -- cgit v1.2.3 From 59cf7346542babdaae99e72365174eab4fa276ac Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Mon, 28 Apr 2025 15:36:54 -0400 Subject: sunrpc: Replace the rq_bvec array with dynamically-allocated memory As a step towards making NFSD's maximum rsize and wsize variable at run-time, replace the fixed-size rq_bvec[] array in struct svc_rqst with a chunk of dynamically-allocated memory. The rq_bvec[] array contains enough bio_vecs to handle each page in a maximum size RPC message. On a system with 8-byte pointers and 4KB pages, pahole reports that the rq_bvec[] array is 4144 bytes. This patch replaces that array with a single 8-byte pointer field. Reviewed-by: Jeff Layton Reviewed-by: NeilBrown Signed-off-by: Chuck Lever --- include/linux/sunrpc/svc.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/sunrpc/svc.h b/include/linux/sunrpc/svc.h index 57be69539888..538bea716c6b 100644 --- a/include/linux/sunrpc/svc.h +++ b/include/linux/sunrpc/svc.h @@ -213,7 +213,7 @@ struct svc_rqst { struct folio_batch rq_fbatch; struct kvec rq_vec[RPCSVC_MAXPAGES]; /* generally useful.. */ - struct bio_vec rq_bvec[RPCSVC_MAXPAGES]; + struct bio_vec *rq_bvec; __be32 rq_xid; /* transmission id */ u32 rq_prog; /* program number */ -- cgit v1.2.3 From f2e597353d50f05a600dddbf84a362839cf1c224 Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Fri, 9 May 2025 13:39:23 -0400 Subject: NFSD: De-duplicate the svc_fill_write_vector() call sites All three call sites do the same thing. I'm struggling with this a bit, however. struct xdr_buf is an XDR layer object and unmarshaling a WRITE payload is clearly a task intended to be done by the proc and xdr functions, not by VFS. This feels vaguely like a layering violation. Signed-off-by: Chuck Lever --- include/linux/sunrpc/svc.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/sunrpc/svc.h b/include/linux/sunrpc/svc.h index 538bea716c6b..dec636345ee2 100644 --- a/include/linux/sunrpc/svc.h +++ b/include/linux/sunrpc/svc.h @@ -471,7 +471,7 @@ int svc_encode_result_payload(struct svc_rqst *rqstp, unsigned int offset, unsigned int length); unsigned int svc_fill_write_vector(struct svc_rqst *rqstp, - struct xdr_buf *payload); + const struct xdr_buf *payload); char *svc_fill_symlink_pathname(struct svc_rqst *rqstp, struct kvec *first, void *p, size_t total); -- cgit v1.2.3 From b406c6b78198f123e08061c45d56803459785975 Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Thu, 8 May 2025 11:56:40 -0400 Subject: SUNRPC: Remove svc_fill_write_vector() Clean up: This API is no longer used. Signed-off-by: Chuck Lever --- include/linux/sunrpc/svc.h | 2 -- 1 file changed, 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/sunrpc/svc.h b/include/linux/sunrpc/svc.h index dec636345ee2..510ee1927977 100644 --- a/include/linux/sunrpc/svc.h +++ b/include/linux/sunrpc/svc.h @@ -470,8 +470,6 @@ const char * svc_proc_name(const struct svc_rqst *rqstp); int svc_encode_result_payload(struct svc_rqst *rqstp, unsigned int offset, unsigned int length); -unsigned int svc_fill_write_vector(struct svc_rqst *rqstp, - const struct xdr_buf *payload); char *svc_fill_symlink_pathname(struct svc_rqst *rqstp, struct kvec *first, void *p, size_t total); -- cgit v1.2.3 From 1259560b988c959abf9b84da0b9e56c2863b6837 Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Tue, 6 May 2025 20:15:08 -0400 Subject: SUNRPC: Remove svc_rqst :: rq_vec Clean up: This array is no longer used. On a system with 8-byte pointers and 4KB pages, pahole reports that the rq_vec[] array accounts for 4144 bytes. Reviewed-by: Jeff Layton Signed-off-by: Chuck Lever --- include/linux/sunrpc/svc.h | 1 - 1 file changed, 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/sunrpc/svc.h b/include/linux/sunrpc/svc.h index 510ee1927977..6540af4b9bdb 100644 --- a/include/linux/sunrpc/svc.h +++ b/include/linux/sunrpc/svc.h @@ -212,7 +212,6 @@ struct svc_rqst { struct page * *rq_page_end; /* one past the last page */ struct folio_batch rq_fbatch; - struct kvec rq_vec[RPCSVC_MAXPAGES]; /* generally useful.. */ struct bio_vec *rq_bvec; __be32 rq_xid; /* transmission id */ -- cgit v1.2.3 From f4126823c1fd677d9811a0252830c6c73ddb7e99 Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Mon, 28 Apr 2025 15:36:55 -0400 Subject: sunrpc: Adjust size of socket's receive page array dynamically As a step towards making NFSD's maximum rsize and wsize variable at run-time, make sk_pages a flexible array. Reviewed-by: Jeff Layton Reviewed-by: NeilBrown Signed-off-by: Chuck Lever --- include/linux/sunrpc/svcsock.h | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/sunrpc/svcsock.h b/include/linux/sunrpc/svcsock.h index bf45d9e8492a..963bbe251e52 100644 --- a/include/linux/sunrpc/svcsock.h +++ b/include/linux/sunrpc/svcsock.h @@ -40,7 +40,9 @@ struct svc_sock { struct completion sk_handshake_done; - struct page * sk_pages[RPCSVC_MAXPAGES]; /* received data */ + /* received data */ + unsigned long sk_maxpages; + struct page * sk_pages[] __counted_by(sk_maxpages); }; static inline u32 svc_sock_reclen(struct svc_sock *svsk) -- cgit v1.2.3 From 81381d1a90dea186ce4585d9bd3cdc40b50e9c48 Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Mon, 28 Apr 2025 15:36:56 -0400 Subject: svcrdma: Adjust the number of entries in svc_rdma_recv_ctxt::rc_pages Allow allocation of more entries in the rc_pages[] array when the maximum size of an RPC message is increased. Reviewed-by: Jeff Layton Reviewed-by: NeilBrown Signed-off-by: Chuck Lever --- include/linux/sunrpc/svc_rdma.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/sunrpc/svc_rdma.h b/include/linux/sunrpc/svc_rdma.h index 619fc0bd837a..1016f2feddc4 100644 --- a/include/linux/sunrpc/svc_rdma.h +++ b/include/linux/sunrpc/svc_rdma.h @@ -202,7 +202,8 @@ struct svc_rdma_recv_ctxt { struct svc_rdma_pcl rc_reply_pcl; unsigned int rc_page_count; - struct page *rc_pages[RPCSVC_MAXPAGES]; + unsigned long rc_maxpages; + struct page *rc_pages[] __counted_by(rc_maxpages); }; /* -- cgit v1.2.3 From 56ab43f50d708e155f013ef60cc1b4fe39bed42e Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Mon, 28 Apr 2025 15:36:57 -0400 Subject: svcrdma: Adjust the number of entries in svc_rdma_send_ctxt::sc_pages Allow allocation of more entries in the sc_pages[] array when the maximum size of an RPC message is increased. Reviewed-by: Jeff Layton Reviewed-by: NeilBrown Signed-off-by: Chuck Lever --- include/linux/sunrpc/svc_rdma.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/sunrpc/svc_rdma.h b/include/linux/sunrpc/svc_rdma.h index 1016f2feddc4..22704c2e5b9b 100644 --- a/include/linux/sunrpc/svc_rdma.h +++ b/include/linux/sunrpc/svc_rdma.h @@ -245,7 +245,8 @@ struct svc_rdma_send_ctxt { void *sc_xprt_buf; int sc_page_count; int sc_cur_sge_no; - struct page *sc_pages[RPCSVC_MAXPAGES]; + unsigned long sc_maxpages; + struct page **sc_pages; struct ib_sge sc_sges[]; }; -- cgit v1.2.3 From 0af165bb903cdc005066494cc931076dafd76894 Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Mon, 28 Apr 2025 15:36:58 -0400 Subject: sunrpc: Remove the RPCSVC_MAXPAGES macro It is no longer used. Reviewed-by: Jeff Layton Reviewed-by: NeilBrown Signed-off-by: Chuck Lever --- include/linux/sunrpc/svc.h | 7 ------- 1 file changed, 7 deletions(-) (limited to 'include/linux') diff --git a/include/linux/sunrpc/svc.h b/include/linux/sunrpc/svc.h index 6540af4b9bdb..d57df042e24a 100644 --- a/include/linux/sunrpc/svc.h +++ b/include/linux/sunrpc/svc.h @@ -150,14 +150,7 @@ extern u32 svc_max_payload(const struct svc_rqst *rqstp); * list. xdr_buf.tail points to the end of the first page. * This assumes that the non-page part of an rpc reply will fit * in a page - NFSd ensures this. lockd also has no trouble. - * - * Each request/reply pair can have at most one "payload", plus two pages, - * one for the request, and one for the reply. - * We using ->sendfile to return read data, we might need one extra page - * if the request is not page-aligned. So add another '1'. */ -#define RPCSVC_MAXPAGES ((RPCSVC_MAXPAYLOAD+PAGE_SIZE-1)/PAGE_SIZE \ - + 2 + 1) /** * svc_serv_maxpages - maximum count of pages needed for one RPC message -- cgit v1.2.3 From 1e7dbad6d1fe4b35ccd2aa8fea9496d837af4430 Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Mon, 28 Apr 2025 15:37:02 -0400 Subject: SUNRPC: Bump the maximum payload size for the server Increase the maximum server-side RPC payload to 4MB. The default remains at 1MB. An API to adjust the operational maximum was added in 2006 by commit 596bbe53eb3a ("[PATCH] knfsd: Allow max size of NFSd payload to be configured"). To adjust the operational maximum using this API, shut down the NFS server. Then echo a new value into: /proc/fs/nfsd/max_block_size And restart the NFS server. Reviewed-by: Jeff Layton Reviewed-by: NeilBrown Signed-off-by: Chuck Lever --- include/linux/sunrpc/svc.h | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) (limited to 'include/linux') diff --git a/include/linux/sunrpc/svc.h b/include/linux/sunrpc/svc.h index d57df042e24a..48666b83fe68 100644 --- a/include/linux/sunrpc/svc.h +++ b/include/linux/sunrpc/svc.h @@ -119,14 +119,14 @@ void svc_destroy(struct svc_serv **svcp); * Linux limit; someone who cares more about NFS/UDP performance * can test a larger number. * - * For TCP transports we have more freedom. A size of 1MB is - * chosen to match the client limit. Other OSes are known to - * have larger limits, but those numbers are probably beyond - * the point of diminishing returns. + * For non-UDP transports we have more freedom. A size of 4MB is + * chosen to accommodate clients that support larger I/O sizes. */ -#define RPCSVC_MAXPAYLOAD (1*1024*1024u) -#define RPCSVC_MAXPAYLOAD_TCP RPCSVC_MAXPAYLOAD -#define RPCSVC_MAXPAYLOAD_UDP (32*1024u) +enum { + RPCSVC_MAXPAYLOAD = 4 * 1024 * 1024, + RPCSVC_MAXPAYLOAD_TCP = RPCSVC_MAXPAYLOAD, + RPCSVC_MAXPAYLOAD_UDP = 32 * 1024, +}; extern u32 svc_max_payload(const struct svc_rqst *rqstp); -- cgit v1.2.3