From 2f0f88f42f2eab0421ed37d7494de9124fdf0d34 Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Thu, 1 Jul 2021 10:03:10 -0400 Subject: SUNRPC: Add svc_rqst_replace_page() API Replacing a page in rq_pages[] requires a get_page(), which is a bus-locked operation, and a put_page(), which can be even more costly. To reduce the cost of replacing a page in rq_pages[], batch the put_page() operations by collecting "freed" pages in a pagevec, and then release those pages when the pagevec is full. This pagevec is also emptied when each RPC completes. Signed-off-by: Chuck Lever --- include/linux/sunrpc/svc.h | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'include/linux') diff --git a/include/linux/sunrpc/svc.h b/include/linux/sunrpc/svc.h index e91d51ea028b..ab9afbf0a0d8 100644 --- a/include/linux/sunrpc/svc.h +++ b/include/linux/sunrpc/svc.h @@ -19,6 +19,7 @@ #include #include #include +#include /* statistics for svc_pool structures */ struct svc_pool_stats { @@ -256,6 +257,7 @@ struct svc_rqst { struct page * *rq_next_page; /* next reply page to use */ struct page * *rq_page_end; /* one past the last page */ + struct pagevec rq_pvec; struct kvec rq_vec[RPCSVC_MAXPAGES]; /* generally useful.. */ struct bio_vec rq_bvec[RPCSVC_MAXPAGES]; @@ -502,6 +504,8 @@ struct svc_rqst *svc_rqst_alloc(struct svc_serv *serv, struct svc_pool *pool, int node); struct svc_rqst *svc_prepare_thread(struct svc_serv *serv, struct svc_pool *pool, int node); +void svc_rqst_replace_page(struct svc_rqst *rqstp, + struct page *page); void svc_rqst_free(struct svc_rqst *); void svc_exit_thread(struct svc_rqst *); unsigned int svc_pool_map_get(void); -- cgit v1.2.3 From 6c8c84f525100a1cade5698320b4abe43062e159 Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Wed, 7 Jul 2021 14:57:28 -0400 Subject: svcrdma: Fewer calls to wake_up() in Send completion handler Because wake_up() takes an IRQ-safe lock, it can be expensive, especially to call inside of a single-threaded completion handler. What's more, the Send wait queue almost never has waiters, so most of the time, this is an expensive no-op. As always, the goal is to reduce the average overhead of each completion, because a transport's completion handlers are single- threaded on one CPU core. This change reduces CPU utilization of the Send completion thread by 2-3% on my server. Signed-off-by: Chuck Lever Reviewed-By: Tom Talpey --- include/linux/sunrpc/svc_rdma.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/sunrpc/svc_rdma.h b/include/linux/sunrpc/svc_rdma.h index 3184465de3a0..57c60ffe76dd 100644 --- a/include/linux/sunrpc/svc_rdma.h +++ b/include/linux/sunrpc/svc_rdma.h @@ -207,6 +207,7 @@ extern void svc_rdma_send_error_msg(struct svcxprt_rdma *rdma, struct svc_rdma_send_ctxt *sctxt, struct svc_rdma_recv_ctxt *rctxt, int status); +extern void svc_rdma_wake_send_waiters(struct svcxprt_rdma *rdma, int avail); extern int svc_rdma_sendto(struct svc_rqst *); extern int svc_rdma_result_payload(struct svc_rqst *rqstp, unsigned int offset, unsigned int length); -- cgit v1.2.3 From b6c2bfea096ba22583f1071c10ce0745804b9b95 Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Tue, 9 Feb 2021 10:32:20 -0500 Subject: svcrdma: Relieve contention on sc_send_lock. /proc/lock_stat indicates the the sc_send_lock is heavily contended when the server is under load from a single client. To address this, convert the send_ctxt free list to an llist. Returning an item to the send_ctxt cache is now waitless, which reduces the instruction path length in the single-threaded Send handler (svc_rdma_wc_send). The goal is to enable the ib_comp_wq worker to handle a higher RPC/RDMA Send completion rate given the same CPU resources. This change reduces CPU utilization of Send completion by 2-3% on my server. Signed-off-by: Chuck Lever Reviewed-By: Tom Talpey --- include/linux/sunrpc/svc_rdma.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/sunrpc/svc_rdma.h b/include/linux/sunrpc/svc_rdma.h index 57c60ffe76dd..5f8d5af6556c 100644 --- a/include/linux/sunrpc/svc_rdma.h +++ b/include/linux/sunrpc/svc_rdma.h @@ -90,7 +90,7 @@ struct svcxprt_rdma { struct ib_pd *sc_pd; spinlock_t sc_send_lock; - struct list_head sc_send_ctxts; + struct llist_head sc_send_ctxts; spinlock_t sc_rw_ctxt_lock; struct list_head sc_rw_ctxts; @@ -150,7 +150,7 @@ struct svc_rdma_recv_ctxt { }; struct svc_rdma_send_ctxt { - struct list_head sc_list; + struct llist_node sc_node; struct rpc_rdma_cid sc_cid; struct ib_send_wr sc_send_wr; -- cgit v1.2.3 From 07a92d009f0b1557d3d58905ce18821a483be2e1 Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Mon, 8 Feb 2021 15:33:16 -0500 Subject: svcrdma: Convert rdma->sc_rw_ctxts to llist Relieve contention on sc_rw_ctxt_lock by converting rdma->sc_rw_ctxts to an llist. The goal is to reduce the average overhead of Send completions, because a transport's completion handlers are single-threaded on one CPU core. This change reduces CPU utilization of each Send completion by 2-3% on my server. Signed-off-by: Chuck Lever Reviewed-By: Tom Talpey --- include/linux/sunrpc/svc_rdma.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/sunrpc/svc_rdma.h b/include/linux/sunrpc/svc_rdma.h index 5f8d5af6556c..24aa159d29a7 100644 --- a/include/linux/sunrpc/svc_rdma.h +++ b/include/linux/sunrpc/svc_rdma.h @@ -92,7 +92,7 @@ struct svcxprt_rdma { spinlock_t sc_send_lock; struct llist_head sc_send_ctxts; spinlock_t sc_rw_ctxt_lock; - struct list_head sc_rw_ctxts; + struct llist_head sc_rw_ctxts; u32 sc_pending_recvs; u32 sc_recv_batch; -- cgit v1.2.3 From 5c11720767f70d34357d00a15ba5a0ad052c40fe Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Thu, 5 Aug 2021 15:11:24 -0400 Subject: SUNRPC: Fix a NULL pointer deref in trace_svc_stats_latency() Some paths through svc_process() leave rqst->rq_procinfo set to NULL, which triggers a crash if tracing happens to be enabled. Fixes: 89ff87494c6e ("SUNRPC: Display RPC procedure names instead of proc numbers") Signed-off-by: Chuck Lever --- include/linux/sunrpc/svc.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/sunrpc/svc.h b/include/linux/sunrpc/svc.h index ab9afbf0a0d8..f0f846fa396e 100644 --- a/include/linux/sunrpc/svc.h +++ b/include/linux/sunrpc/svc.h @@ -527,6 +527,7 @@ void svc_wake_up(struct svc_serv *); void svc_reserve(struct svc_rqst *rqstp, int space); struct svc_pool * svc_pool_for_cpu(struct svc_serv *serv, int cpu); char * svc_print_addr(struct svc_rqst *, char *, size_t); +const char * svc_proc_name(const struct svc_rqst *rqstp); int svc_encode_result_payload(struct svc_rqst *rqstp, unsigned int offset, unsigned int length); -- cgit v1.2.3 From a2071573d6346819cc4e5787b4206f2184985160 Mon Sep 17 00:00:00 2001 From: Jia He Date: Tue, 3 Aug 2021 12:59:36 +0200 Subject: sysctl: introduce new proc handler proc_dobool This is to let bool variable could be correctly displayed in big/little endian sysctl procfs. sizeof(bool) is arch dependent, proc_dobool should work in all arches. Suggested-by: Pan Xinhui Signed-off-by: Jia He [thuth: rebased the patch to the current kernel version] Signed-off-by: Thomas Huth Signed-off-by: Chuck Lever --- include/linux/sysctl.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/linux') diff --git a/include/linux/sysctl.h b/include/linux/sysctl.h index d99ca99837de..1fa2b69c6fc3 100644 --- a/include/linux/sysctl.h +++ b/include/linux/sysctl.h @@ -48,6 +48,8 @@ typedef int proc_handler(struct ctl_table *ctl, int write, void *buffer, size_t *lenp, loff_t *ppos); int proc_dostring(struct ctl_table *, int, void *, size_t *, loff_t *); +int proc_dobool(struct ctl_table *table, int write, void *buffer, + size_t *lenp, loff_t *ppos); int proc_dointvec(struct ctl_table *, int, void *, size_t *, loff_t *); int proc_douintvec(struct ctl_table *, int, void *, size_t *, loff_t *); int proc_dointvec_minmax(struct ctl_table *, int, void *, size_t *, loff_t *); -- cgit v1.2.3 From b4ab2fea7c797b0b8b92332c7e315703c12d37d8 Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Fri, 30 Jul 2021 16:07:36 -0400 Subject: SUNRPC: Add RPC_AUTH_TLS protocol numbers Shared by client and server. See: https://www.iana.org/assignments/rpc-authentication-numbers/rpc-authentication-numbers.xhtml Signed-off-by: Chuck Lever --- include/linux/sunrpc/msg_prot.h | 1 + include/linux/sunrpc/xdr.h | 1 + 2 files changed, 2 insertions(+) (limited to 'include/linux') diff --git a/include/linux/sunrpc/msg_prot.h b/include/linux/sunrpc/msg_prot.h index 938c2bf29db8..02117ed0fa2e 100644 --- a/include/linux/sunrpc/msg_prot.h +++ b/include/linux/sunrpc/msg_prot.h @@ -20,6 +20,7 @@ enum rpc_auth_flavors { RPC_AUTH_DES = 3, RPC_AUTH_KRB = 4, RPC_AUTH_GSS = 6, + RPC_AUTH_TLS = 7, RPC_AUTH_MAXFLAVOR = 8, /* pseudoflavors: */ RPC_AUTH_GSS_KRB5 = 390003, diff --git a/include/linux/sunrpc/xdr.h b/include/linux/sunrpc/xdr.h index a965cbc136ad..b519609af1d0 100644 --- a/include/linux/sunrpc/xdr.h +++ b/include/linux/sunrpc/xdr.h @@ -95,6 +95,7 @@ xdr_buf_init(struct xdr_buf *buf, void *start, size_t len) #define rpc_auth_unix cpu_to_be32(RPC_AUTH_UNIX) #define rpc_auth_short cpu_to_be32(RPC_AUTH_SHORT) #define rpc_auth_gss cpu_to_be32(RPC_AUTH_GSS) +#define rpc_auth_tls cpu_to_be32(RPC_AUTH_TLS) #define rpc_call cpu_to_be32(RPC_CALL) #define rpc_reply cpu_to_be32(RPC_REPLY) -- cgit v1.2.3 From a4ae308143961bf688e1c8a62f6604e62b491120 Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Thu, 5 Aug 2021 10:25:49 -0400 Subject: SUNRPC: Move client-side disconnect injection Disconnect injection stress-tests the ability for both client and server implementations to behave resiliently in the face of network instability. Convert the existing client-side disconnect injection infrastructure to use the kernel's generic error injection facility. The generic facility has a richer set of injection criteria. Signed-off-by: Chuck Lever --- include/linux/sunrpc/xprt.h | 18 ------------------ 1 file changed, 18 deletions(-) (limited to 'include/linux') diff --git a/include/linux/sunrpc/xprt.h b/include/linux/sunrpc/xprt.h index c8c39f22d3b1..b15c1f07162d 100644 --- a/include/linux/sunrpc/xprt.h +++ b/include/linux/sunrpc/xprt.h @@ -288,7 +288,6 @@ struct rpc_xprt { const char *address_strings[RPC_DISPLAY_MAX]; #if IS_ENABLED(CONFIG_SUNRPC_DEBUG) struct dentry *debugfs; /* debugfs directory */ - atomic_t inject_disconnect; #endif struct rcu_head rcu; const struct xprt_class *xprt_class; @@ -502,21 +501,4 @@ static inline int xprt_test_and_set_binding(struct rpc_xprt *xprt) return test_and_set_bit(XPRT_BINDING, &xprt->state); } -#if IS_ENABLED(CONFIG_SUNRPC_DEBUG) -extern unsigned int rpc_inject_disconnect; -static inline void xprt_inject_disconnect(struct rpc_xprt *xprt) -{ - if (!rpc_inject_disconnect) - return; - if (atomic_dec_return(&xprt->inject_disconnect)) - return; - atomic_set(&xprt->inject_disconnect, rpc_inject_disconnect); - xprt->ops->inject_disconnect(xprt); -} -#else -static inline void xprt_inject_disconnect(struct rpc_xprt *xprt) -{ -} -#endif - #endif /* _LINUX_SUNRPC_XPRT_H */ -- cgit v1.2.3 From 2dc6f19e4f438d4c14987cb17aee38aaf7304e7f Mon Sep 17 00:00:00 2001 From: "J. Bruce Fields" Date: Mon, 23 Aug 2021 12:01:18 -0400 Subject: nlm: minor nlm_lookup_file argument change It'll come in handy to get the whole nlm_lock. Signed-off-by: J. Bruce Fields Signed-off-by: Chuck Lever --- include/linux/lockd/lockd.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/lockd/lockd.h b/include/linux/lockd/lockd.h index 666f5f310a04..81b71ad2040a 100644 --- a/include/linux/lockd/lockd.h +++ b/include/linux/lockd/lockd.h @@ -286,7 +286,7 @@ void nlmsvc_locks_init_private(struct file_lock *, struct nlm_host *, pid_t); * File handling for the server personality */ __be32 nlm_lookup_file(struct svc_rqst *, struct nlm_file **, - struct nfs_fh *); + struct nlm_lock *); void nlm_release_file(struct nlm_file *); void nlmsvc_release_lockowner(struct nlm_lock *); void nlmsvc_mark_resources(struct net *); -- cgit v1.2.3 From 7f024fcd5c97dc70bb9121c80407cf3cf9be7159 Mon Sep 17 00:00:00 2001 From: "J. Bruce Fields" Date: Mon, 23 Aug 2021 16:44:00 -0400 Subject: Keep read and write fds with each nlm_file We shouldn't really be using a read-only file descriptor to take a write lock. Most filesystems will put up with it. But NFS, for example, won't. Signed-off-by: J. Bruce Fields Signed-off-by: Chuck Lever --- include/linux/lockd/bind.h | 3 ++- include/linux/lockd/lockd.h | 9 +++++++-- 2 files changed, 9 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/lockd/bind.h b/include/linux/lockd/bind.h index 0520c0cd73f4..3bc9f7410e21 100644 --- a/include/linux/lockd/bind.h +++ b/include/linux/lockd/bind.h @@ -27,7 +27,8 @@ struct rpc_task; struct nlmsvc_binding { __be32 (*fopen)(struct svc_rqst *, struct nfs_fh *, - struct file **); + struct file **, + int mode); void (*fclose)(struct file *); }; diff --git a/include/linux/lockd/lockd.h b/include/linux/lockd/lockd.h index 81b71ad2040a..c4ae6506b8b3 100644 --- a/include/linux/lockd/lockd.h +++ b/include/linux/lockd/lockd.h @@ -10,6 +10,8 @@ #ifndef LINUX_LOCKD_LOCKD_H #define LINUX_LOCKD_LOCKD_H +/* XXX: a lot of this should really be under fs/lockd. */ + #include #include #include @@ -154,7 +156,8 @@ struct nlm_rqst { struct nlm_file { struct hlist_node f_list; /* linked list */ struct nfs_fh f_handle; /* NFS file handle */ - struct file * f_file; /* VFS file pointer */ + struct file * f_file[2]; /* VFS file pointers, + indexed by O_ flags */ struct nlm_share * f_shares; /* DOS shares */ struct list_head f_blocks; /* blocked locks */ unsigned int f_locks; /* guesstimate # of locks */ @@ -267,6 +270,7 @@ typedef int (*nlm_host_match_fn_t)(void *cur, struct nlm_host *ref); /* * Server-side lock handling */ +int lock_to_openmode(struct file_lock *); __be32 nlmsvc_lock(struct svc_rqst *, struct nlm_file *, struct nlm_host *, struct nlm_lock *, int, struct nlm_cookie *, int); @@ -301,7 +305,8 @@ int nlmsvc_unlock_all_by_ip(struct sockaddr *server_addr); static inline struct inode *nlmsvc_file_inode(struct nlm_file *file) { - return locks_inode(file->f_file); + return locks_inode(file->f_file[O_RDONLY] ? + file->f_file[O_RDONLY] : file->f_file[O_WRONLY]); } static inline int __nlm_privileged_request4(const struct sockaddr *sap) -- cgit v1.2.3 From f657f8eef3ff870552c9fd2839e0061046f44618 Mon Sep 17 00:00:00 2001 From: "J. Bruce Fields" Date: Fri, 20 Aug 2021 17:02:04 -0400 Subject: nfs: don't atempt blocking locks on nfs reexports NFS implements blocking locks by blocking inside its lock method. In the reexport case, this blocks the nfs server thread, which could lead to deadlocks since an nfs server thread might be required to unlock the conflicting lock. It also causes a crash, since the nfs server thread assumes it can free the lock when its lm_notify lock callback is called. Ideal would be to make the nfs lock method return without blocking in this case, but for now it works just not to attempt blocking locks. The difference is just that the original client will have to poll (as it does in the v4.0 case) instead of getting a callback when the lock's available. Signed-off-by: J. Bruce Fields Acked-by: Anna Schumaker Signed-off-by: Chuck Lever --- include/linux/exportfs.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/linux') diff --git a/include/linux/exportfs.h b/include/linux/exportfs.h index fe848901fcc3..3260fe714846 100644 --- a/include/linux/exportfs.h +++ b/include/linux/exportfs.h @@ -221,6 +221,8 @@ struct export_operations { #define EXPORT_OP_NOATOMIC_ATTR (0x10) /* Filesystem cannot supply atomic attribute updates */ +#define EXPORT_OP_SYNC_LOCKS (0x20) /* Filesystem can't do + asychronous blocking locks */ unsigned long flags; }; -- cgit v1.2.3 From bb0a55bb7148a49e549ee992200860e7a040d3a5 Mon Sep 17 00:00:00 2001 From: "J. Bruce Fields" Date: Fri, 20 Aug 2021 17:02:06 -0400 Subject: nfs: don't allow reexport reclaims In the reexport case, nfsd is currently passing along locks with the reclaim bit set. The client sends a new lock request, which is granted if there's currently no conflict--even if it's possible a conflicting lock could have been briefly held in the interim. We don't currently have any way to safely grant reclaim, so for now let's just deny them all. I'm doing this by passing the reclaim bit to nfs and letting it fail the call, with the idea that eventually the client might be able to do something more forgiving here. Signed-off-by: J. Bruce Fields Acked-by: Anna Schumaker Signed-off-by: Chuck Lever --- include/linux/errno.h | 1 + include/linux/fs.h | 1 + 2 files changed, 2 insertions(+) (limited to 'include/linux') diff --git a/include/linux/errno.h b/include/linux/errno.h index d73f597a2484..8b0c754bab02 100644 --- a/include/linux/errno.h +++ b/include/linux/errno.h @@ -31,5 +31,6 @@ #define EJUKEBOX 528 /* Request initiated, but will not complete before timeout */ #define EIOCBQUEUED 529 /* iocb queued, will get completion event */ #define ERECALLCONFLICT 530 /* conflict with recalled state */ +#define ENOGRACE 531 /* NFS file lock reclaim refused */ #endif diff --git a/include/linux/fs.h b/include/linux/fs.h index 640574294216..1f5c3dbce1da 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -997,6 +997,7 @@ static inline struct file *get_file(struct file *f) #define FL_UNLOCK_PENDING 512 /* Lease is being broken */ #define FL_OFDLCK 1024 /* lock is "owned" by struct file */ #define FL_LAYOUT 2048 /* outstanding pNFS layout */ +#define FL_RECLAIM 4096 /* reclaiming from a reboot server */ #define FL_CLOSE_POSIX (FL_POSIX | FL_CLOSE) -- cgit v1.2.3