From 55b6dd54c3bcb6edf7ad630a4510759f4b0cf1cd Mon Sep 17 00:00:00 2001 From: Jeff Layton Date: Tue, 13 Jan 2026 13:37:39 -0500 Subject: nfsd/sunrpc: add svc_rqst->rq_private pointer and remove rq_lease_breaker rq_lease_breaker has always been a NFSv4 specific layering violation in svc_rqst. The reason it's there though is that we need a place that is thread-local, and accessible from the svc_rqst pointer. Add a new rq_private pointer to struct svc_rqst. This is intended for use by the threads that are handling the service. sunrpc code doesn't touch it. In nfsd, define a new struct nfsd_thread_local_info. nfsd declares one of these on the stack and puts a pointer to it in rq_private. Add a new ntli_lease_breaker field to the new struct and convert all of the places that access rq_lease_breaker to use the new field instead. Signed-off-by: Jeff Layton Reviewed-by: Benjamin Coddington Signed-off-by: Chuck Lever --- include/linux/sunrpc/svc.h | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/sunrpc/svc.h b/include/linux/sunrpc/svc.h index 4dc14c7a711b..ab8237ba9596 100644 --- a/include/linux/sunrpc/svc.h +++ b/include/linux/sunrpc/svc.h @@ -175,6 +175,9 @@ static inline unsigned long svc_serv_maxpages(const struct svc_serv *serv) /* * The context of a single thread, including the request currently being * processed. + * + * RPC programs are free to use rq_private to stash thread-local information. + * The sunrpc layer will not access it. */ struct svc_rqst { struct list_head rq_all; /* all threads list */ @@ -251,7 +254,7 @@ struct svc_rqst { unsigned long bc_to_initval; unsigned int bc_to_retries; unsigned int rq_status_counter; /* RPC processing counter */ - void **rq_lease_breaker; /* The v4 client breaking a lease */ + void *rq_private; /* For use by the service thread */ }; /* bits for rq_flags */ -- cgit v1.2.3 From 322ecd01bf8ad7e0da21e174679aff1759e68b2c Mon Sep 17 00:00:00 2001 From: Jeff Layton Date: Tue, 13 Jan 2026 13:37:40 -0500 Subject: nfsd/sunrpc: move rq_cachetype into struct nfsd_thread_local_info The svc_rqst->rq_cachetype field is only accessed by nfsd. Move it into the nfsd_thread_local_info instead. Signed-off-by: Jeff Layton Reviewed-by: Benjamin Coddington Signed-off-by: Chuck Lever --- include/linux/sunrpc/svc.h | 1 - 1 file changed, 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/sunrpc/svc.h b/include/linux/sunrpc/svc.h index ab8237ba9596..62152e4f3bcc 100644 --- a/include/linux/sunrpc/svc.h +++ b/include/linux/sunrpc/svc.h @@ -218,7 +218,6 @@ struct svc_rqst { u32 rq_vers; /* program version */ u32 rq_proc; /* procedure number */ u32 rq_prot; /* IP protocol */ - int rq_cachetype; /* catering to nfsd */ unsigned long rq_flags; /* flags field */ ktime_t rq_qtime; /* enqueue time */ -- cgit v1.2.3 From 153b9e025308417d167332c93e1bcc11174178de Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Wed, 28 Jan 2026 10:19:23 -0500 Subject: lockd: Relocate and rename nlm_drop_reply The nlm_drop_reply status code is internal to the kernel's lockd implementation and must never appear on the wire. Its previous location in xdr.h grouped it with legitimate NLM protocol status codes, obscuring this critical distinction. Relocate the definition to lockd.h with a comment block for internal status codes, and rename to nlm__int__drop_reply to make its internal-only nature explicit. This prepares for adding additional internal status codes in subsequent patches. Signed-off-by: Chuck Lever --- include/linux/lockd/lockd.h | 6 ++++++ include/linux/lockd/xdr.h | 2 -- 2 files changed, 6 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/lockd/lockd.h b/include/linux/lockd/lockd.h index 330e38776bb2..fdefec39553f 100644 --- a/include/linux/lockd/lockd.h +++ b/include/linux/lockd/lockd.h @@ -38,6 +38,12 @@ */ #define LOCKD_DFLT_TIMEO 10 +/* + * Internal-use status codes, not to be placed on the wire. + * Version handlers translate these to appropriate wire values. + */ +#define nlm__int__drop_reply cpu_to_be32(30000) + /* * Lockd host handle (used both by the client and server personality). */ diff --git a/include/linux/lockd/xdr.h b/include/linux/lockd/xdr.h index 17d53165d9f2..292e4e38d17d 100644 --- a/include/linux/lockd/xdr.h +++ b/include/linux/lockd/xdr.h @@ -33,8 +33,6 @@ struct svc_rqst; #define nlm_lck_blocked cpu_to_be32(NLM_LCK_BLOCKED) #define nlm_lck_denied_grace_period cpu_to_be32(NLM_LCK_DENIED_GRACE_PERIOD) -#define nlm_drop_reply cpu_to_be32(30000) - /* Lock info passed via NLM */ struct nlm_lock { char * caller; -- cgit v1.2.3 From 9e0d0c61940796893e0c2200cdc7be0684218238 Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Wed, 28 Jan 2026 10:19:24 -0500 Subject: lockd: Introduce nlm__int__deadlock The use of CONFIG_LOCKD_V4 in combination with a later cast_status() in the NLMv3 code is difficult to reason about. Instead, replace the use of nlm_deadlock with an implementation-defined status value that version-specific code translates appropriately. The new approach establishes a translation boundary: generic lockd code returns nlm__int__deadlock when posix_lock_file() yields -EDEADLK. Version-specific handlers (svc4proc.c for NLMv4, svcproc.c for NLMv3) translate this internal status to the appropriate wire protocol value. NLMv4 maps to nlm4_deadlock; NLMv3 maps to nlm_lck_denied (since NLMv3 lacks a deadlock-specific status code). Later this modification will also remove the need to include NLMv4 headers in NLMv3 and generic code. Reviewed-by: Jeff Layton Signed-off-by: Chuck Lever --- include/linux/lockd/lockd.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/lockd/lockd.h b/include/linux/lockd/lockd.h index fdefec39553f..793691912137 100644 --- a/include/linux/lockd/lockd.h +++ b/include/linux/lockd/lockd.h @@ -43,6 +43,7 @@ * Version handlers translate these to appropriate wire values. */ #define nlm__int__drop_reply cpu_to_be32(30000) +#define nlm__int__deadlock cpu_to_be32(30001) /* * Lockd host handle (used both by the client and server personality). -- cgit v1.2.3 From 7db001e03d7a668ca6c3789fee42a24236ca90f6 Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Wed, 28 Jan 2026 10:19:25 -0500 Subject: lockd: Have nlm_fopen() return errno values The nlm_fopen() function is part of the API between nfsd and lockd. Currently its return value is an on-the-wire NLM status code. But that forces NFSD to include NLM wire protocol definitions despite having no other dependency on the NLM wire protocol. In addition, a CONFIG_LOCKD_V4 Kconfig symbol appears in the middle of NFSD source code. Refactor: Let's not use on-the-wire values as part of a high-level API between two Linux kernel modules. That's what we have errno for, right? And, instead of simply moving the CONFIG_LOCKD_V4 check, we can get rid of it entirely and let the decision of what actual NLM status code goes on the wire to be left up to NLM version-specific code. Reviewed-by: Jeff Layton Signed-off-by: Chuck Lever --- include/linux/lockd/bind.h | 8 +++----- include/linux/lockd/lockd.h | 2 ++ 2 files changed, 5 insertions(+), 5 deletions(-) (limited to 'include/linux') diff --git a/include/linux/lockd/bind.h b/include/linux/lockd/bind.h index c53c81242e72..2f5dd9e943ee 100644 --- a/include/linux/lockd/bind.h +++ b/include/linux/lockd/bind.h @@ -26,11 +26,9 @@ struct rpc_clnt; * This is the set of functions for lockd->nfsd communication */ struct nlmsvc_binding { - __be32 (*fopen)(struct svc_rqst *, - struct nfs_fh *, - struct file **, - int mode); - void (*fclose)(struct file *); + int (*fopen)(struct svc_rqst *rqstp, struct nfs_fh *f, + struct file **filp, int flags); + void (*fclose)(struct file *filp); }; extern const struct nlmsvc_binding *nlmsvc_ops; diff --git a/include/linux/lockd/lockd.h b/include/linux/lockd/lockd.h index 793691912137..195e6ce28f6e 100644 --- a/include/linux/lockd/lockd.h +++ b/include/linux/lockd/lockd.h @@ -44,6 +44,8 @@ */ #define nlm__int__drop_reply cpu_to_be32(30000) #define nlm__int__deadlock cpu_to_be32(30001) +#define nlm__int__stale_fh cpu_to_be32(30002) +#define nlm__int__failed cpu_to_be32(30003) /* * Lockd host handle (used both by the client and server personality). -- cgit v1.2.3 From efb5b15e3b78f5644dd2d4ddec8880e0c9aa5b5f Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Wed, 28 Jan 2026 10:19:26 -0500 Subject: lockd: Relocate nlmsvc_unlock API declarations The nlmsvc_unlock_all_by_sb() and nlmsvc_unlock_all_by_ip() functions are part of lockd's external API, consumed by other kernel subsystems. Their declarations currently reside in linux/lockd/lockd.h alongside internal implementation details, which blurs the boundary between lockd's public interface and its private internals. Moving these declarations to linux/lockd/bind.h groups them with other external API functions and makes the separation explicit. This clarifies which functions are intended for external use and reduces the risk of internal implementation details leaking into the public API surface. Build-tested with allyesconfig; no functional changes. Reviewed-by: Jeff Layton Signed-off-by: Chuck Lever --- include/linux/lockd/bind.h | 7 +++++++ include/linux/lockd/lockd.h | 6 ------ 2 files changed, 7 insertions(+), 6 deletions(-) (limited to 'include/linux') diff --git a/include/linux/lockd/bind.h b/include/linux/lockd/bind.h index 2f5dd9e943ee..82eca0a13ccc 100644 --- a/include/linux/lockd/bind.h +++ b/include/linux/lockd/bind.h @@ -21,6 +21,7 @@ struct svc_rqst; struct rpc_task; struct rpc_clnt; +struct super_block; /* * This is the set of functions for lockd->nfsd communication @@ -80,4 +81,10 @@ extern int nlmclnt_proc(struct nlm_host *host, int cmd, struct file_lock *fl, vo extern int lockd_up(struct net *net, const struct cred *cred); extern void lockd_down(struct net *net); +/* + * Cluster failover support + */ +int nlmsvc_unlock_all_by_sb(struct super_block *sb); +int nlmsvc_unlock_all_by_ip(struct sockaddr *server_addr); + #endif /* LINUX_LOCKD_BIND_H */ diff --git a/include/linux/lockd/lockd.h b/include/linux/lockd/lockd.h index 195e6ce28f6e..0d883f48ec21 100644 --- a/include/linux/lockd/lockd.h +++ b/include/linux/lockd/lockd.h @@ -311,12 +311,6 @@ void nlmsvc_mark_resources(struct net *); void nlmsvc_free_host_resources(struct nlm_host *); void nlmsvc_invalidate_all(void); -/* - * Cluster failover support - */ -int nlmsvc_unlock_all_by_sb(struct super_block *sb); -int nlmsvc_unlock_all_by_ip(struct sockaddr *server_addr); - static inline struct file *nlmsvc_file_file(const struct nlm_file *file) { return file->f_file[O_RDONLY] ? -- cgit v1.2.3 From 840621fd2ff23ada8b9262d90477e75232566e6b Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Wed, 28 Jan 2026 10:19:27 -0500 Subject: NFS: Use nlmclnt_shutdown_rpc_clnt() to safely shut down NLM A race condition exists in shutdown_store() when writing to the sysfs "shutdown" file concurrently with nlm_shutdown_hosts_net(). Without synchronization, the following sequence can occur: 1. shutdown_store() reads server->nlm_host (non-NULL) 2. nlm_shutdown_hosts_net() acquires nlm_host_mutex, calls rpc_shutdown_client(), sets h_rpcclnt to NULL, and potentially frees the host via nlm_gc_hosts() 3. shutdown_store() dereferences the now-stale or freed host Introduce nlmclnt_shutdown_rpc_clnt(), which acquires nlm_host_mutex before accessing h_rpcclnt. This synchronizes with nlm_shutdown_hosts_net() and ensures the rpc_clnt pointer remains valid during the shutdown operation. This change also improves API layering: NFS client code no longer needs to include the internal lockd header to access nlm_host fields. The new helper resides in bind.h alongside other public lockd interfaces. Reported-by: Jeff Layton Reviewed-by: Jeff Layton Signed-off-by: Chuck Lever --- include/linux/lockd/bind.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/lockd/bind.h b/include/linux/lockd/bind.h index 82eca0a13ccc..39c124dcb19c 100644 --- a/include/linux/lockd/bind.h +++ b/include/linux/lockd/bind.h @@ -57,6 +57,7 @@ struct nlmclnt_initdata { extern struct nlm_host *nlmclnt_init(const struct nlmclnt_initdata *nlm_init); extern void nlmclnt_done(struct nlm_host *host); extern struct rpc_clnt *nlmclnt_rpc_clnt(struct nlm_host *host); +extern void nlmclnt_shutdown_rpc_clnt(struct nlm_host *host); /* * NLM client operations provide a means to modify RPC processing of NLM -- cgit v1.2.3 From f4d5f8caadd858f11b21e8a9e5c85290fc21a568 Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Wed, 28 Jan 2026 10:19:28 -0500 Subject: lockd: Move xdr4.h from include/linux/lockd/ to fs/lockd/ The xdr4.h header declares NLMv4-specific XDR encoder/decoder functions and error codes that are used exclusively within the lockd subsystem. Moving it from include/linux/lockd/ to fs/lockd/ clarifies the intended scope of these declarations and prevents external code from depending on lockd-internal interfaces. This change reduces the public API surface of the lockd module and makes it easier to refactor NLMv4 internals without risk of breaking out-of-tree consumers. The header's contents are implementation details of the NLMv4 wire protocol handling, not a contract with other kernel subsystems. Reviewed-by: Jeff Layton Signed-off-by: Chuck Lever --- include/linux/lockd/bind.h | 3 --- include/linux/lockd/lockd.h | 7 ++++--- include/linux/lockd/xdr4.h | 43 ------------------------------------------- 3 files changed, 4 insertions(+), 49 deletions(-) delete mode 100644 include/linux/lockd/xdr4.h (limited to 'include/linux') diff --git a/include/linux/lockd/bind.h b/include/linux/lockd/bind.h index 39c124dcb19c..077da0696f12 100644 --- a/include/linux/lockd/bind.h +++ b/include/linux/lockd/bind.h @@ -13,9 +13,6 @@ #include /* need xdr-encoded error codes too, so... */ #include -#ifdef CONFIG_LOCKD_V4 -#include -#endif /* Dummy declarations */ struct svc_rqst; diff --git a/include/linux/lockd/lockd.h b/include/linux/lockd/lockd.h index 0d883f48ec21..46f244141645 100644 --- a/include/linux/lockd/lockd.h +++ b/include/linux/lockd/lockd.h @@ -22,9 +22,6 @@ #include #include #include -#ifdef CONFIG_LOCKD_V4 -#include -#endif #include #include @@ -235,6 +232,10 @@ int nlmclnt_reclaim(struct nlm_host *, struct file_lock *, struct nlm_rqst *); void nlmclnt_next_cookie(struct nlm_cookie *); +#ifdef CONFIG_LOCKD_V4 +extern const struct rpc_version nlm_version4; +#endif + /* * Host cache */ diff --git a/include/linux/lockd/xdr4.h b/include/linux/lockd/xdr4.h deleted file mode 100644 index 72831e35dca3..000000000000 --- a/include/linux/lockd/xdr4.h +++ /dev/null @@ -1,43 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -/* - * linux/include/linux/lockd/xdr4.h - * - * XDR types for the NLM protocol - * - * Copyright (C) 1996 Olaf Kirch - */ - -#ifndef LOCKD_XDR4_H -#define LOCKD_XDR4_H - -#include -#include -#include -#include - -/* error codes new to NLMv4 */ -#define nlm4_deadlock cpu_to_be32(NLM_DEADLCK) -#define nlm4_rofs cpu_to_be32(NLM_ROFS) -#define nlm4_stale_fh cpu_to_be32(NLM_STALE_FH) -#define nlm4_fbig cpu_to_be32(NLM_FBIG) -#define nlm4_failed cpu_to_be32(NLM_FAILED) - -void nlm4svc_set_file_lock_range(struct file_lock *fl, u64 off, u64 len); -bool nlm4svc_decode_void(struct svc_rqst *rqstp, struct xdr_stream *xdr); -bool nlm4svc_decode_testargs(struct svc_rqst *rqstp, struct xdr_stream *xdr); -bool nlm4svc_decode_lockargs(struct svc_rqst *rqstp, struct xdr_stream *xdr); -bool nlm4svc_decode_cancargs(struct svc_rqst *rqstp, struct xdr_stream *xdr); -bool nlm4svc_decode_unlockargs(struct svc_rqst *rqstp, struct xdr_stream *xdr); -bool nlm4svc_decode_res(struct svc_rqst *rqstp, struct xdr_stream *xdr); -bool nlm4svc_decode_reboot(struct svc_rqst *rqstp, struct xdr_stream *xdr); -bool nlm4svc_decode_shareargs(struct svc_rqst *rqstp, struct xdr_stream *xdr); -bool nlm4svc_decode_notify(struct svc_rqst *rqstp, struct xdr_stream *xdr); - -bool nlm4svc_encode_testres(struct svc_rqst *rqstp, struct xdr_stream *xdr); -bool nlm4svc_encode_res(struct svc_rqst *rqstp, struct xdr_stream *xdr); -bool nlm4svc_encode_void(struct svc_rqst *rqstp, struct xdr_stream *xdr); -bool nlm4svc_encode_shareres(struct svc_rqst *rqstp, struct xdr_stream *xdr); - -extern const struct rpc_version nlm_version4; - -#endif /* LOCKD_XDR4_H */ -- cgit v1.2.3 From 4db2f8a016dc9f9b357bfbf5c507c2582bb36730 Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Wed, 28 Jan 2026 10:19:29 -0500 Subject: lockd: Move share.h from include/linux/lockd/ to fs/lockd/ The share.h header defines struct nlm_share and declares the DOS share management functions used by the NLM server to implement NLM_SHARE and NLM_UNSHARE operations. These interfaces are used exclusively within the lockd subsystem. A git grep search confirms no external code references them. Relocating this header from include/linux/lockd/ to fs/lockd/ narrows the public API surface of the lockd module. Out-of-tree code cannot depend on these internal interfaces after this change. Future refactoring of the share management implementation thus requires no consideration of external consumers. Reviewed-by: Jeff Layton Signed-off-by: Chuck Lever --- include/linux/lockd/lockd.h | 2 ++ include/linux/lockd/share.h | 32 -------------------------------- 2 files changed, 2 insertions(+), 32 deletions(-) delete mode 100644 include/linux/lockd/share.h (limited to 'include/linux') diff --git a/include/linux/lockd/lockd.h b/include/linux/lockd/lockd.h index 46f244141645..eebcecd12fae 100644 --- a/include/linux/lockd/lockd.h +++ b/include/linux/lockd/lockd.h @@ -155,6 +155,8 @@ struct nlm_rqst { void * a_callback_data; /* sent to nlmclnt_operations callbacks */ }; +struct nlm_share; + /* * This struct describes a file held open by lockd on behalf of * an NFS client. diff --git a/include/linux/lockd/share.h b/include/linux/lockd/share.h deleted file mode 100644 index 1f18a9faf645..000000000000 --- a/include/linux/lockd/share.h +++ /dev/null @@ -1,32 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -/* - * linux/include/linux/lockd/share.h - * - * DOS share management for lockd. - * - * Copyright (C) 1996, Olaf Kirch - */ - -#ifndef LINUX_LOCKD_SHARE_H -#define LINUX_LOCKD_SHARE_H - -/* - * DOS share for a specific file - */ -struct nlm_share { - struct nlm_share * s_next; /* linked list */ - struct nlm_host * s_host; /* client host */ - struct nlm_file * s_file; /* shared file */ - struct xdr_netobj s_owner; /* owner handle */ - u32 s_access; /* access mode */ - u32 s_mode; /* deny mode */ -}; - -__be32 nlmsvc_share_file(struct nlm_host *, struct nlm_file *, - struct nlm_args *); -__be32 nlmsvc_unshare_file(struct nlm_host *, struct nlm_file *, - struct nlm_args *); -void nlmsvc_traverse_shares(struct nlm_host *, struct nlm_file *, - nlm_host_match_fn_t); - -#endif /* LINUX_LOCKD_SHARE_H */ -- cgit v1.2.3 From 2c562c6e6715619ce34bb37d8a0a5e40fdcc7a44 Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Wed, 28 Jan 2026 10:19:30 -0500 Subject: lockd: Relocate include/linux/lockd/lockd.h Headers placed in include/linux/ form part of the kernel's internal API and signal to subsystem maintainers that other parts of the kernel may depend on them. By moving lockd.h into fs/lockd/, lockd becomes a more self-contained module whose internal interfaces are clearly distinguished from its public contract with the rest of the kernel. This relocation addresses a long-standing XXX comment in the header itself that acknowledged the file's misplacement. Future changes to lockd internals can now proceed with confidence that external consumers are not inadvertently coupled to implementation details. Reviewed-by: Jeff Layton Signed-off-by: Chuck Lever --- include/linux/lockd/lockd.h | 401 -------------------------------------------- 1 file changed, 401 deletions(-) delete mode 100644 include/linux/lockd/lockd.h (limited to 'include/linux') diff --git a/include/linux/lockd/lockd.h b/include/linux/lockd/lockd.h deleted file mode 100644 index eebcecd12fae..000000000000 --- a/include/linux/lockd/lockd.h +++ /dev/null @@ -1,401 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -/* - * linux/include/linux/lockd/lockd.h - * - * General-purpose lockd include file. - * - * Copyright (C) 1996 Olaf Kirch - */ - -#ifndef LINUX_LOCKD_LOCKD_H -#define LINUX_LOCKD_LOCKD_H - -/* XXX: a lot of this should really be under fs/lockd. */ - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -/* - * Version string - */ -#define LOCKD_VERSION "0.5" - -/* - * Default timeout for RPC calls (seconds) - */ -#define LOCKD_DFLT_TIMEO 10 - -/* - * Internal-use status codes, not to be placed on the wire. - * Version handlers translate these to appropriate wire values. - */ -#define nlm__int__drop_reply cpu_to_be32(30000) -#define nlm__int__deadlock cpu_to_be32(30001) -#define nlm__int__stale_fh cpu_to_be32(30002) -#define nlm__int__failed cpu_to_be32(30003) - -/* - * Lockd host handle (used both by the client and server personality). - */ -struct nlm_host { - struct hlist_node h_hash; /* doubly linked list */ - struct sockaddr_storage h_addr; /* peer address */ - size_t h_addrlen; - struct sockaddr_storage h_srcaddr; /* our address (optional) */ - size_t h_srcaddrlen; - struct rpc_clnt *h_rpcclnt; /* RPC client to talk to peer */ - char *h_name; /* remote hostname */ - u32 h_version; /* interface version */ - unsigned short h_proto; /* transport proto */ - unsigned short h_reclaiming : 1, - h_server : 1, /* server side, not client side */ - h_noresvport : 1, - h_inuse : 1; - wait_queue_head_t h_gracewait; /* wait while reclaiming */ - struct rw_semaphore h_rwsem; /* Reboot recovery lock */ - u32 h_state; /* pseudo-state counter */ - u32 h_nsmstate; /* true remote NSM state */ - u32 h_pidcount; /* Pseudopids */ - refcount_t h_count; /* reference count */ - struct mutex h_mutex; /* mutex for pmap binding */ - unsigned long h_nextrebind; /* next portmap call */ - unsigned long h_expires; /* eligible for GC */ - struct list_head h_lockowners; /* Lockowners for the client */ - spinlock_t h_lock; - struct list_head h_granted; /* Locks in GRANTED state */ - struct list_head h_reclaim; /* Locks in RECLAIM state */ - struct nsm_handle *h_nsmhandle; /* NSM status handle */ - char *h_addrbuf; /* address eyecatcher */ - struct net *net; /* host net */ - const struct cred *h_cred; - char nodename[UNX_MAXNODENAME + 1]; - const struct nlmclnt_operations *h_nlmclnt_ops; /* Callback ops for NLM users */ -}; - -/* - * The largest string sm_addrbuf should hold is a full-size IPv6 address - * (no "::" anywhere) with a scope ID. The buffer size is computed to - * hold eight groups of colon-separated four-hex-digit numbers, a - * percent sign, a scope id (at most 32 bits, in decimal), and NUL. - */ -#define NSM_ADDRBUF ((8 * 4 + 7) + (1 + 10) + 1) - -struct nsm_handle { - struct list_head sm_link; - refcount_t sm_count; - char *sm_mon_name; - char *sm_name; - struct sockaddr_storage sm_addr; - size_t sm_addrlen; - unsigned int sm_monitored : 1, - sm_sticky : 1; /* don't unmonitor */ - struct nsm_private sm_priv; - char sm_addrbuf[NSM_ADDRBUF]; -}; - -/* - * Rigorous type checking on sockaddr type conversions - */ -static inline struct sockaddr *nlm_addr(const struct nlm_host *host) -{ - return (struct sockaddr *)&host->h_addr; -} - -static inline struct sockaddr *nlm_srcaddr(const struct nlm_host *host) -{ - return (struct sockaddr *)&host->h_srcaddr; -} - -/* - * Map an fl_owner_t into a unique 32-bit "pid" - */ -struct nlm_lockowner { - struct list_head list; - refcount_t count; - - struct nlm_host *host; - fl_owner_t owner; - uint32_t pid; -}; - -/* - * This is the representation of a blocked client lock. - */ -struct nlm_wait { - struct list_head b_list; /* linked list */ - wait_queue_head_t b_wait; /* where to wait on */ - struct nlm_host *b_host; - struct file_lock *b_lock; /* local file lock */ - __be32 b_status; /* grant callback status */ -}; - -/* - * Memory chunk for NLM client RPC request. - */ -#define NLMCLNT_OHSIZE ((__NEW_UTS_LEN) + 10u) -struct nlm_rqst { - refcount_t a_count; - unsigned int a_flags; /* initial RPC task flags */ - struct nlm_host * a_host; /* host handle */ - struct nlm_args a_args; /* arguments */ - struct nlm_res a_res; /* result */ - struct nlm_block * a_block; - unsigned int a_retries; /* Retry count */ - u8 a_owner[NLMCLNT_OHSIZE]; - void * a_callback_data; /* sent to nlmclnt_operations callbacks */ -}; - -struct nlm_share; - -/* - * This struct describes a file held open by lockd on behalf of - * an NFS client. - */ -struct nlm_file { - struct hlist_node f_list; /* linked list */ - struct nfs_fh f_handle; /* NFS file handle */ - struct file * f_file[2]; /* VFS file pointers, - indexed by O_ flags */ - struct nlm_share * f_shares; /* DOS shares */ - struct list_head f_blocks; /* blocked locks */ - unsigned int f_locks; /* guesstimate # of locks */ - unsigned int f_count; /* reference count */ - struct mutex f_mutex; /* avoid concurrent access */ -}; - -/* - * This is a server block (i.e. a lock requested by some client which - * couldn't be granted because of a conflicting lock). - */ -#define NLM_NEVER (~(unsigned long) 0) -/* timeout on non-blocking call: */ -#define NLM_TIMEOUT (7 * HZ) - -struct nlm_block { - struct kref b_count; /* Reference count */ - struct list_head b_list; /* linked list of all blocks */ - struct list_head b_flist; /* linked list (per file) */ - struct nlm_rqst * b_call; /* RPC args & callback info */ - struct svc_serv * b_daemon; /* NLM service */ - struct nlm_host * b_host; /* host handle for RPC clnt */ - unsigned long b_when; /* next re-xmit */ - unsigned int b_id; /* block id */ - unsigned char b_granted; /* VFS granted lock */ - struct nlm_file * b_file; /* file in question */ - struct cache_req * b_cache_req; /* deferred request handling */ - struct cache_deferred_req * b_deferred_req; - unsigned int b_flags; /* block flags */ -#define B_QUEUED 1 /* lock queued */ -#define B_GOT_CALLBACK 2 /* got lock or conflicting lock */ -#define B_TIMED_OUT 4 /* filesystem too slow to respond */ -}; - -/* - * Global variables - */ -extern const struct rpc_program nlm_program; -extern const struct svc_procedure nlmsvc_procedures[24]; -#ifdef CONFIG_LOCKD_V4 -extern const struct svc_procedure nlmsvc_procedures4[24]; -#endif -extern int nlmsvc_grace_period; -extern unsigned long nlm_timeout; -extern bool nsm_use_hostnames; -extern u32 nsm_local_state; - -extern struct timer_list nlmsvc_retry; - -/* - * Lockd client functions - */ -struct nlm_rqst * nlm_alloc_call(struct nlm_host *host); -int nlm_async_call(struct nlm_rqst *, u32, const struct rpc_call_ops *); -int nlm_async_reply(struct nlm_rqst *, u32, const struct rpc_call_ops *); -void nlmclnt_release_call(struct nlm_rqst *); -void nlmclnt_prepare_block(struct nlm_wait *block, struct nlm_host *host, - struct file_lock *fl); -void nlmclnt_queue_block(struct nlm_wait *block); -__be32 nlmclnt_dequeue_block(struct nlm_wait *block); -int nlmclnt_wait(struct nlm_wait *block, struct nlm_rqst *req, long timeout); -__be32 nlmclnt_grant(const struct sockaddr *addr, - const struct nlm_lock *lock); -void nlmclnt_recovery(struct nlm_host *); -int nlmclnt_reclaim(struct nlm_host *, struct file_lock *, - struct nlm_rqst *); -void nlmclnt_next_cookie(struct nlm_cookie *); - -#ifdef CONFIG_LOCKD_V4 -extern const struct rpc_version nlm_version4; -#endif - -/* - * Host cache - */ -struct nlm_host *nlmclnt_lookup_host(const struct sockaddr *sap, - const size_t salen, - const unsigned short protocol, - const u32 version, - const char *hostname, - int noresvport, - struct net *net, - const struct cred *cred); -void nlmclnt_release_host(struct nlm_host *); -struct nlm_host *nlmsvc_lookup_host(const struct svc_rqst *rqstp, - const char *hostname, - const size_t hostname_len); -void nlmsvc_release_host(struct nlm_host *); -struct rpc_clnt * nlm_bind_host(struct nlm_host *); -void nlm_rebind_host(struct nlm_host *); -struct nlm_host * nlm_get_host(struct nlm_host *); -void nlm_shutdown_hosts(void); -void nlm_shutdown_hosts_net(struct net *net); -void nlm_host_rebooted(const struct net *net, - const struct nlm_reboot *); - -/* - * Host monitoring - */ -int nsm_monitor(const struct nlm_host *host); -void nsm_unmonitor(const struct nlm_host *host); - -struct nsm_handle *nsm_get_handle(const struct net *net, - const struct sockaddr *sap, - const size_t salen, - const char *hostname, - const size_t hostname_len); -struct nsm_handle *nsm_reboot_lookup(const struct net *net, - const struct nlm_reboot *info); -void nsm_release(struct nsm_handle *nsm); - -/* - * This is used in garbage collection and resource reclaim - * A return value != 0 means destroy the lock/block/share - */ -typedef int (*nlm_host_match_fn_t)(void *cur, struct nlm_host *ref); - -/* - * Server-side lock handling - */ -int lock_to_openmode(struct file_lock *); -__be32 nlmsvc_lock(struct svc_rqst *, struct nlm_file *, - struct nlm_host *, struct nlm_lock *, int, - struct nlm_cookie *, int); -__be32 nlmsvc_unlock(struct net *net, struct nlm_file *, struct nlm_lock *); -__be32 nlmsvc_testlock(struct svc_rqst *rqstp, struct nlm_file *file, - struct nlm_host *host, struct nlm_lock *lock, - struct nlm_lock *conflock); -__be32 nlmsvc_cancel_blocked(struct net *net, struct nlm_file *, struct nlm_lock *); -void nlmsvc_retry_blocked(struct svc_rqst *rqstp); -void nlmsvc_traverse_blocks(struct nlm_host *, struct nlm_file *, - nlm_host_match_fn_t match); -void nlmsvc_grant_reply(struct nlm_cookie *, __be32); -void nlmsvc_release_call(struct nlm_rqst *); -void nlmsvc_locks_init_private(struct file_lock *, struct nlm_host *, pid_t); - -/* - * File handling for the server personality - */ -__be32 nlm_lookup_file(struct svc_rqst *, struct nlm_file **, - struct nlm_lock *); -void nlm_release_file(struct nlm_file *); -void nlmsvc_put_lockowner(struct nlm_lockowner *); -void nlmsvc_release_lockowner(struct nlm_lock *); -void nlmsvc_mark_resources(struct net *); -void nlmsvc_free_host_resources(struct nlm_host *); -void nlmsvc_invalidate_all(void); - -static inline struct file *nlmsvc_file_file(const struct nlm_file *file) -{ - return file->f_file[O_RDONLY] ? - file->f_file[O_RDONLY] : file->f_file[O_WRONLY]; -} - -static inline struct inode *nlmsvc_file_inode(struct nlm_file *file) -{ - return file_inode(nlmsvc_file_file(file)); -} - -static inline bool -nlmsvc_file_cannot_lock(const struct nlm_file *file) -{ - return exportfs_cannot_lock(nlmsvc_file_file(file)->f_path.dentry->d_sb->s_export_op); -} - -static inline int __nlm_privileged_request4(const struct sockaddr *sap) -{ - const struct sockaddr_in *sin = (struct sockaddr_in *)sap; - - if (ntohs(sin->sin_port) > 1023) - return 0; - - return ipv4_is_loopback(sin->sin_addr.s_addr); -} - -#if IS_ENABLED(CONFIG_IPV6) -static inline int __nlm_privileged_request6(const struct sockaddr *sap) -{ - const struct sockaddr_in6 *sin6 = (struct sockaddr_in6 *)sap; - - if (ntohs(sin6->sin6_port) > 1023) - return 0; - - if (ipv6_addr_type(&sin6->sin6_addr) & IPV6_ADDR_MAPPED) - return ipv4_is_loopback(sin6->sin6_addr.s6_addr32[3]); - - return ipv6_addr_type(&sin6->sin6_addr) & IPV6_ADDR_LOOPBACK; -} -#else /* IS_ENABLED(CONFIG_IPV6) */ -static inline int __nlm_privileged_request6(const struct sockaddr *sap) -{ - return 0; -} -#endif /* IS_ENABLED(CONFIG_IPV6) */ - -/* - * Ensure incoming requests are from local privileged callers. - * - * Return TRUE if sender is local and is connecting via a privileged port; - * otherwise return FALSE. - */ -static inline int nlm_privileged_requester(const struct svc_rqst *rqstp) -{ - const struct sockaddr *sap = svc_addr(rqstp); - - switch (sap->sa_family) { - case AF_INET: - return __nlm_privileged_request4(sap); - case AF_INET6: - return __nlm_privileged_request6(sap); - default: - return 0; - } -} - -/* - * Compare two NLM locks. - * When the second lock is of type F_UNLCK, this acts like a wildcard. - */ -static inline int nlm_compare_locks(const struct file_lock *fl1, - const struct file_lock *fl2) -{ - return file_inode(fl1->c.flc_file) == file_inode(fl2->c.flc_file) - && fl1->c.flc_pid == fl2->c.flc_pid - && fl1->c.flc_owner == fl2->c.flc_owner - && fl1->fl_start == fl2->fl_start - && fl1->fl_end == fl2->fl_end - &&(fl1->c.flc_type == fl2->c.flc_type || fl2->c.flc_type == F_UNLCK); -} - -extern const struct lock_manager_operations nlmsvc_lock_operations; - -#endif /* LINUX_LOCKD_LOCKD_H */ -- cgit v1.2.3 From 236f3171ac690f632e13d391f47c68c3a8519bd2 Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Wed, 28 Jan 2026 10:19:31 -0500 Subject: lockd: Remove lockd/debug.h The lockd include structure has unnecessary indirection. The header include/linux/lockd/debug.h is consumed only by fs/lockd/lockd.h, creating an extra compilation dependency and making the code harder to navigate. Fold the debug.h definitions directly into lockd.h and remove the now-redundant header. This reduces the include tree depth and makes the debug-related definitions easier to find when working on lockd internals. Build-tested with lockd built as module and built-in. Reviewed-by: Jeff Layton Signed-off-by: Chuck Lever --- include/linux/lockd/debug.h | 40 ---------------------------------------- 1 file changed, 40 deletions(-) delete mode 100644 include/linux/lockd/debug.h (limited to 'include/linux') diff --git a/include/linux/lockd/debug.h b/include/linux/lockd/debug.h deleted file mode 100644 index eede2ab5246f..000000000000 --- a/include/linux/lockd/debug.h +++ /dev/null @@ -1,40 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -/* - * linux/include/linux/lockd/debug.h - * - * Debugging stuff. - * - * Copyright (C) 1996 Olaf Kirch - */ - -#ifndef LINUX_LOCKD_DEBUG_H -#define LINUX_LOCKD_DEBUG_H - -#include - -/* - * Enable lockd debugging. - * Requires RPC_DEBUG. - */ -#undef ifdebug -#if IS_ENABLED(CONFIG_SUNRPC_DEBUG) -# define ifdebug(flag) if (unlikely(nlm_debug & NLMDBG_##flag)) -#else -# define ifdebug(flag) if (0) -#endif - -/* - * Debug flags - */ -#define NLMDBG_SVC 0x0001 -#define NLMDBG_CLIENT 0x0002 -#define NLMDBG_CLNTLOCK 0x0004 -#define NLMDBG_SVCLOCK 0x0008 -#define NLMDBG_MONITOR 0x0010 -#define NLMDBG_CLNTSUBS 0x0020 -#define NLMDBG_SVCSUBS 0x0040 -#define NLMDBG_HOSTCACHE 0x0080 -#define NLMDBG_XDR 0x0100 -#define NLMDBG_ALL 0x7fff - -#endif /* LINUX_LOCKD_DEBUG_H */ -- cgit v1.2.3 From 615384a24b1e6b0f091ebc1dfbf7ec8b4c27fa81 Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Wed, 28 Jan 2026 10:19:32 -0500 Subject: lockd: Move xdr.h from include/linux/lockd/ to fs/lockd/ The lockd subsystem unnecessarily exposes internal NLM XDR type definitions through the global include path. These definitions are not used by any code outside fs/lockd/, making them inappropriate for include/linux/lockd/. Moving xdr.h to fs/lockd/ narrows the API surface and clarifies that these types are internal implementation details. The comment in linux/lockd/bind.h stating xdr.h was needed for "xdr-encoded error codes" is stale: no lockd API consumers use those codes. Forward declarations for struct nfs_fh and struct file_lock are added to bind.h because their definitions were previously pulled in transitively through xdr.h. Additionally, nfs3proc.c and proc.c need explicit includes of filelock.h for FL_CLOSE and for accessing struct file_lock members, respectively. Built and tested with lockd client/server operations. No functional change. Reviewed-by: Jeff Layton Signed-off-by: Chuck Lever --- include/linux/lockd/bind.h | 5 +- include/linux/lockd/xdr.h | 113 --------------------------------------------- 2 files changed, 2 insertions(+), 116 deletions(-) delete mode 100644 include/linux/lockd/xdr.h (limited to 'include/linux') diff --git a/include/linux/lockd/bind.h b/include/linux/lockd/bind.h index 077da0696f12..ba9258c96bfd 100644 --- a/include/linux/lockd/bind.h +++ b/include/linux/lockd/bind.h @@ -11,10 +11,9 @@ #define LINUX_LOCKD_BIND_H #include -/* need xdr-encoded error codes too, so... */ -#include -/* Dummy declarations */ +struct file_lock; +struct nfs_fh; struct svc_rqst; struct rpc_task; struct rpc_clnt; diff --git a/include/linux/lockd/xdr.h b/include/linux/lockd/xdr.h deleted file mode 100644 index 292e4e38d17d..000000000000 --- a/include/linux/lockd/xdr.h +++ /dev/null @@ -1,113 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -/* - * linux/include/linux/lockd/xdr.h - * - * XDR types for the NLM protocol - * - * Copyright (C) 1996 Olaf Kirch - */ - -#ifndef LOCKD_XDR_H -#define LOCKD_XDR_H - -#include -#include -#include -#include - -#define SM_MAXSTRLEN 1024 -#define SM_PRIV_SIZE 16 - -struct nsm_private { - unsigned char data[SM_PRIV_SIZE]; -}; - -struct svc_rqst; - -#define NLM_MAXCOOKIELEN 32 -#define NLM_MAXSTRLEN 1024 - -#define nlm_granted cpu_to_be32(NLM_LCK_GRANTED) -#define nlm_lck_denied cpu_to_be32(NLM_LCK_DENIED) -#define nlm_lck_denied_nolocks cpu_to_be32(NLM_LCK_DENIED_NOLOCKS) -#define nlm_lck_blocked cpu_to_be32(NLM_LCK_BLOCKED) -#define nlm_lck_denied_grace_period cpu_to_be32(NLM_LCK_DENIED_GRACE_PERIOD) - -/* Lock info passed via NLM */ -struct nlm_lock { - char * caller; - unsigned int len; /* length of "caller" */ - struct nfs_fh fh; - struct xdr_netobj oh; - u32 svid; - u64 lock_start; - u64 lock_len; - struct file_lock fl; -}; - -/* - * NLM cookies. Technically they can be 1K, but Linux only uses 8 bytes. - * FreeBSD uses 16, Apple Mac OS X 10.3 uses 20. Therefore we set it to - * 32 bytes. - */ - -struct nlm_cookie -{ - unsigned char data[NLM_MAXCOOKIELEN]; - unsigned int len; -}; - -/* - * Generic lockd arguments for all but sm_notify - */ -struct nlm_args { - struct nlm_cookie cookie; - struct nlm_lock lock; - u32 block; - u32 reclaim; - u32 state; - u32 monitor; - u32 fsm_access; - u32 fsm_mode; -}; - -/* - * Generic lockd result - */ -struct nlm_res { - struct nlm_cookie cookie; - __be32 status; - struct nlm_lock lock; -}; - -/* - * statd callback when client has rebooted - */ -struct nlm_reboot { - char *mon; - unsigned int len; - u32 state; - struct nsm_private priv; -}; - -/* - * Contents of statd callback when monitored host rebooted - */ -#define NLMSVC_XDRSIZE sizeof(struct nlm_args) - -bool nlmsvc_decode_void(struct svc_rqst *rqstp, struct xdr_stream *xdr); -bool nlmsvc_decode_testargs(struct svc_rqst *rqstp, struct xdr_stream *xdr); -bool nlmsvc_decode_lockargs(struct svc_rqst *rqstp, struct xdr_stream *xdr); -bool nlmsvc_decode_cancargs(struct svc_rqst *rqstp, struct xdr_stream *xdr); -bool nlmsvc_decode_unlockargs(struct svc_rqst *rqstp, struct xdr_stream *xdr); -bool nlmsvc_decode_res(struct svc_rqst *rqstp, struct xdr_stream *xdr); -bool nlmsvc_decode_reboot(struct svc_rqst *rqstp, struct xdr_stream *xdr); -bool nlmsvc_decode_shareargs(struct svc_rqst *rqstp, struct xdr_stream *xdr); -bool nlmsvc_decode_notify(struct svc_rqst *rqstp, struct xdr_stream *xdr); - -bool nlmsvc_encode_testres(struct svc_rqst *rqstp, struct xdr_stream *xdr); -bool nlmsvc_encode_res(struct svc_rqst *rqstp, struct xdr_stream *xdr); -bool nlmsvc_encode_void(struct svc_rqst *rqstp, struct xdr_stream *xdr); -bool nlmsvc_encode_shareres(struct svc_rqst *rqstp, struct xdr_stream *xdr); - -#endif /* LOCKD_XDR_H */ -- cgit v1.2.3 From 5829352e568d24dd04ae112128a4f44748d073bc Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Wed, 28 Jan 2026 10:19:33 -0500 Subject: lockd: Make linux/lockd/nlm.h an internal header The NLM protocol constants and status codes in nlm.h are needed only by lockd's internal implementation. NFS client code and NFSD interact with lockd through the stable API in bind.h and have no direct use for protocol-level definitions. Exposing these definitions globally via bind.h creates unnecessary coupling between lockd internals and its consumers. Moving nlm.h from include/linux/lockd/ to fs/lockd/ clarifies the API boundary: bind.h provides the lockd service interface, while nlm.h remains available only to code within fs/lockd/ that implements the protocol. Reviewed-by: Jeff Layton Signed-off-by: Chuck Lever --- include/linux/lockd/bind.h | 2 -- include/linux/lockd/nlm.h | 58 ---------------------------------------------- 2 files changed, 60 deletions(-) delete mode 100644 include/linux/lockd/nlm.h (limited to 'include/linux') diff --git a/include/linux/lockd/bind.h b/include/linux/lockd/bind.h index ba9258c96bfd..b614e0deea72 100644 --- a/include/linux/lockd/bind.h +++ b/include/linux/lockd/bind.h @@ -10,8 +10,6 @@ #ifndef LINUX_LOCKD_BIND_H #define LINUX_LOCKD_BIND_H -#include - struct file_lock; struct nfs_fh; struct svc_rqst; diff --git a/include/linux/lockd/nlm.h b/include/linux/lockd/nlm.h deleted file mode 100644 index 6e343ef760dc..000000000000 --- a/include/linux/lockd/nlm.h +++ /dev/null @@ -1,58 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -/* - * linux/include/linux/lockd/nlm.h - * - * Declarations for the Network Lock Manager protocol. - * - * Copyright (C) 1996, Olaf Kirch - */ - -#ifndef LINUX_LOCKD_NLM_H -#define LINUX_LOCKD_NLM_H - - -/* Maximum file offset in file_lock.fl_end */ -# define NLM_OFFSET_MAX ((s32) 0x7fffffff) -# define NLM4_OFFSET_MAX ((s64) ((~(u64)0) >> 1)) - -/* Return states for NLM */ -enum { - NLM_LCK_GRANTED = 0, - NLM_LCK_DENIED = 1, - NLM_LCK_DENIED_NOLOCKS = 2, - NLM_LCK_BLOCKED = 3, - NLM_LCK_DENIED_GRACE_PERIOD = 4, -#ifdef CONFIG_LOCKD_V4 - NLM_DEADLCK = 5, - NLM_ROFS = 6, - NLM_STALE_FH = 7, - NLM_FBIG = 8, - NLM_FAILED = 9, -#endif -}; - -#define NLM_PROGRAM 100021 - -#define NLMPROC_NULL 0 -#define NLMPROC_TEST 1 -#define NLMPROC_LOCK 2 -#define NLMPROC_CANCEL 3 -#define NLMPROC_UNLOCK 4 -#define NLMPROC_GRANTED 5 -#define NLMPROC_TEST_MSG 6 -#define NLMPROC_LOCK_MSG 7 -#define NLMPROC_CANCEL_MSG 8 -#define NLMPROC_UNLOCK_MSG 9 -#define NLMPROC_GRANTED_MSG 10 -#define NLMPROC_TEST_RES 11 -#define NLMPROC_LOCK_RES 12 -#define NLMPROC_CANCEL_RES 13 -#define NLMPROC_UNLOCK_RES 14 -#define NLMPROC_GRANTED_RES 15 -#define NLMPROC_NSM_NOTIFY 16 /* statd callback */ -#define NLMPROC_SHARE 20 -#define NLMPROC_UNSHARE 21 -#define NLMPROC_NM_LOCK 22 -#define NLMPROC_FREE_ALL 23 - -#endif /* LINUX_LOCKD_NLM_H */ -- cgit v1.2.3 From adcc59114ccd402259c089b0fea24da5e4974563 Mon Sep 17 00:00:00 2001 From: Andy Shevchenko Date: Wed, 4 Feb 2026 21:21:50 +0100 Subject: sunrpc: Kill RPC_IFDEBUG() RPC_IFDEBUG() is used in only two places. In one the user of the definition is guarded by ifdeffery, in the second one it's implied due to dprintk() usage. Kill the macro and move the ifdeffery to the regular condition with the variable defined inside, while in the second case add the same conditional and move the respective code there. Reviewed-by: Jeff Layton Signed-off-by: Andy Shevchenko Signed-off-by: Chuck Lever --- include/linux/sunrpc/debug.h | 2 -- 1 file changed, 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/sunrpc/debug.h b/include/linux/sunrpc/debug.h index eb4bd62df319..93d1a11ffbfb 100644 --- a/include/linux/sunrpc/debug.h +++ b/include/linux/sunrpc/debug.h @@ -49,12 +49,10 @@ do { \ } \ } while (0) -# define RPC_IFDEBUG(x) x #else # define ifdebug(fac) if (0) # define dfprintk(fac, fmt, ...) do {} while (0) # define dfprintk_rcu(fac, fmt, ...) do {} while (0) -# define RPC_IFDEBUG(x) #endif /* -- cgit v1.2.3 From 6f57293abb8d087de830dd3f02e66d94b3e59973 Mon Sep 17 00:00:00 2001 From: Andy Shevchenko Date: Wed, 4 Feb 2026 21:21:51 +0100 Subject: sunrpc: Fix compilation error (`make W=1`) when dprintk() is no-op Clang compiler is not happy about set but unused variables: .../flexfilelayout/flexfilelayoutdev.c:56:9: error: variable 'ret' set but not used [-Werror,-Wunused-but-set-variable] .../flexfilelayout/flexfilelayout.c:1505:6: error: variable 'err' set but not used [-Werror,-Wunused-but-set-variable] .../nfs4proc.c:9244:12: error: variable 'ptr' set but not used [-Werror,-Wunused-but-set-variable] Fix these by forwarding parameters of dprintk() to no_printk(). The positive side-effect is a format-string checker enabled even for the cases when dprintk() is no-op. Fixes: d67ae825a59d ("pnfs/flexfiles: Add the FlexFile Layout Driver") Fixes: fc931582c260 ("nfs41: create_session operation") Acked-by: Geert Uytterhoeven Reviewed-by: Jeff Layton Signed-off-by: Andy Shevchenko Signed-off-by: Chuck Lever --- include/linux/sunrpc/debug.h | 8 ++++++-- include/linux/sunrpc/sched.h | 3 --- 2 files changed, 6 insertions(+), 5 deletions(-) (limited to 'include/linux') diff --git a/include/linux/sunrpc/debug.h b/include/linux/sunrpc/debug.h index 93d1a11ffbfb..ab61bed2f7af 100644 --- a/include/linux/sunrpc/debug.h +++ b/include/linux/sunrpc/debug.h @@ -38,6 +38,8 @@ extern unsigned int nlm_debug; do { \ ifdebug(fac) \ __sunrpc_printk(fmt, ##__VA_ARGS__); \ + else \ + no_printk(fmt, ##__VA_ARGS__); \ } while (0) # define dfprintk_rcu(fac, fmt, ...) \ @@ -46,13 +48,15 @@ do { \ rcu_read_lock(); \ __sunrpc_printk(fmt, ##__VA_ARGS__); \ rcu_read_unlock(); \ + } else { \ + no_printk(fmt, ##__VA_ARGS__); \ } \ } while (0) #else # define ifdebug(fac) if (0) -# define dfprintk(fac, fmt, ...) do {} while (0) -# define dfprintk_rcu(fac, fmt, ...) do {} while (0) +# define dfprintk(fac, fmt, ...) no_printk(fmt, ##__VA_ARGS__) +# define dfprintk_rcu(fac, fmt, ...) no_printk(fmt, ##__VA_ARGS__) #endif /* diff --git a/include/linux/sunrpc/sched.h b/include/linux/sunrpc/sched.h index ccba79ebf893..0dbdf3722537 100644 --- a/include/linux/sunrpc/sched.h +++ b/include/linux/sunrpc/sched.h @@ -95,10 +95,7 @@ struct rpc_task { int tk_rpc_status; /* Result of last RPC operation */ unsigned short tk_flags; /* misc flags */ unsigned short tk_timeouts; /* maj timeouts */ - -#if IS_ENABLED(CONFIG_SUNRPC_DEBUG) || IS_ENABLED(CONFIG_TRACEPOINTS) unsigned short tk_pid; /* debugging aid */ -#endif unsigned char tk_priority : 2,/* Task priority */ tk_garb_retry : 2, tk_cred_retry : 2; -- cgit v1.2.3 From f52792f484ba2316853736856dde19b7e7458861 Mon Sep 17 00:00:00 2001 From: Dai Ngo Date: Fri, 13 Feb 2026 10:36:30 -0800 Subject: NFSD: Enforce timeout on layout recall and integrate lease manager fencing When a layout conflict triggers a recall, enforcing a timeout is necessary to prevent excessive nfsd threads from being blocked in __break_lease ensuring the server continues servicing incoming requests efficiently. This patch introduces a new function to lease_manager_operations: lm_breaker_timedout: Invoked when a lease recall times out and is about to be disposed of. This function enables the lease manager to inform the caller whether the file_lease should remain on the flc_list or be disposed of. For the NFSD lease manager, this function now handles layout recall timeouts. If the layout type supports fencing and the client has not been fenced, a fence operation is triggered to prevent the client from accessing the block device. While the fencing operation is in progress, the conflicting file_lease remains on the flc_list until fencing is complete. This guarantees that no other clients can access the file, and the client with exclusive access is properly blocked before disposal. Signed-off-by: Dai Ngo Signed-off-by: Chuck Lever --- include/linux/filelock.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/filelock.h b/include/linux/filelock.h index d2c9740e26a8..5f0a2fb31450 100644 --- a/include/linux/filelock.h +++ b/include/linux/filelock.h @@ -50,6 +50,7 @@ struct lease_manager_operations { void (*lm_setup)(struct file_lease *, void **); bool (*lm_breaker_owns_lease)(struct file_lease *); int (*lm_open_conflict)(struct file *, int); + bool (*lm_breaker_timedout)(struct file_lease *fl); }; struct lock_manager { -- cgit v1.2.3 From 5bc37b759ec0cdde2c652a2637d704f2d6306617 Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Tue, 17 Feb 2026 17:06:53 -0500 Subject: Documentation: Add the RPC language description of NLM version 4 In order to generate source code to encode and decode NLMv4 protocol elements, include a copy of the RPC language description of NLMv4 for xdrgen to process. The language description is an amalgam of RFC 1813 and the Open Group's XNFS specification: https://pubs.opengroup.org/onlinepubs/9629799/chap10.htm The C code committed here was generated from the new nlm4.x file using tools/net/sunrpc/xdrgen/xdrgen. The goals of replacing hand-written XDR functions with ones that are tool-generated are to improve memory safety and make XDR encoding and decoding less brittle to maintain. The xdrgen utility derives both the type definitions and the encode/decode functions directly from protocol specifications, using names and symbols familiar to anyone who knows those specs. Unlike hand-written code that can inadvertently diverge from the specification, xdrgen guarantees that the generated code matches the specification exactly. We would eventually like xdrgen to generate Rust code as well, making the conversion of the kernel's NFS stacks to use Rust just a little easier for us. Reviewed-by: Jeff Layton Signed-off-by: Chuck Lever --- include/linux/sunrpc/xdrgen/nlm4.h | 233 +++++++++++++++++++++++++++++++++++++ 1 file changed, 233 insertions(+) create mode 100644 include/linux/sunrpc/xdrgen/nlm4.h (limited to 'include/linux') diff --git a/include/linux/sunrpc/xdrgen/nlm4.h b/include/linux/sunrpc/xdrgen/nlm4.h new file mode 100644 index 000000000000..e95e8f105624 --- /dev/null +++ b/include/linux/sunrpc/xdrgen/nlm4.h @@ -0,0 +1,233 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* Generated by xdrgen. Manual edits will be lost. */ +/* XDR specification file: ../../Documentation/sunrpc/xdr/nlm4.x */ +/* XDR specification modification time: Thu Dec 25 13:10:19 2025 */ + +#ifndef _LINUX_XDRGEN_NLM4_DEF_H +#define _LINUX_XDRGEN_NLM4_DEF_H + +#include +#include + +enum { LM_MAXSTRLEN = 1024 }; + +enum { LM_MAXNAMELEN = 1025 }; + +enum { MAXNETOBJ_SZ = 1024 }; + +typedef opaque netobj; + +enum fsh4_mode { + fsm_DN = 0, + fsm_DR = 1, + fsm_DW = 2, + fsm_DRW = 3, +}; + +typedef enum fsh4_mode fsh4_mode; + +enum fsh4_access { + fsa_NONE = 0, + fsa_R = 1, + fsa_W = 2, + fsa_RW = 3, +}; + +typedef enum fsh4_access fsh4_access; + +enum { SM_MAXSTRLEN = 1024 }; + +typedef u64 uint64; + +typedef s64 int64; + +typedef u32 uint32; + +typedef s32 int32; + +enum nlm4_stats { + NLM4_GRANTED = 0, + NLM4_DENIED = 1, + NLM4_DENIED_NOLOCKS = 2, + NLM4_BLOCKED = 3, + NLM4_DENIED_GRACE_PERIOD = 4, + NLM4_DEADLCK = 5, + NLM4_ROFS = 6, + NLM4_STALE_FH = 7, + NLM4_FBIG = 8, + NLM4_FAILED = 9, +}; + +typedef __be32 nlm4_stats; + +struct nlm4_holder { + bool exclusive; + int32 svid; + netobj oh; + uint64 l_offset; + uint64 l_len; +}; + +struct nlm4_testrply { + nlm4_stats stat; + union { + struct nlm4_holder holder; + } u; +}; + +struct nlm4_stat { + nlm4_stats stat; +}; + +struct nlm4_res { + netobj cookie; + struct nlm4_stat stat; +}; + +struct nlm4_testres { + netobj cookie; + struct nlm4_testrply stat; +}; + +struct nlm4_lock { + string caller_name; + netobj fh; + netobj oh; + int32 svid; + uint64 l_offset; + uint64 l_len; +}; + +struct nlm4_lockargs { + netobj cookie; + bool block; + bool exclusive; + struct nlm4_lock alock; + bool reclaim; + int32 state; +}; + +struct nlm4_cancargs { + netobj cookie; + bool block; + bool exclusive; + struct nlm4_lock alock; +}; + +struct nlm4_testargs { + netobj cookie; + bool exclusive; + struct nlm4_lock alock; +}; + +struct nlm4_unlockargs { + netobj cookie; + struct nlm4_lock alock; +}; + +struct nlm4_share { + string caller_name; + netobj fh; + netobj oh; + fsh4_mode mode; + fsh4_access access; +}; + +struct nlm4_shareargs { + netobj cookie; + struct nlm4_share share; + bool reclaim; +}; + +struct nlm4_shareres { + netobj cookie; + nlm4_stats stat; + int32 sequence; +}; + +struct nlm4_notify { + string name; + int32 state; +}; + +enum { SM_PRIV_SIZE = 16 }; + +struct nlm4_notifyargs { + struct nlm4_notify notify; + u8 private[SM_PRIV_SIZE]; +}; + +enum { + NLMPROC4_NULL = 0, + NLMPROC4_TEST = 1, + NLMPROC4_LOCK = 2, + NLMPROC4_CANCEL = 3, + NLMPROC4_UNLOCK = 4, + NLMPROC4_GRANTED = 5, + NLMPROC4_TEST_MSG = 6, + NLMPROC4_LOCK_MSG = 7, + NLMPROC4_CANCEL_MSG = 8, + NLMPROC4_UNLOCK_MSG = 9, + NLMPROC4_GRANTED_MSG = 10, + NLMPROC4_TEST_RES = 11, + NLMPROC4_LOCK_RES = 12, + NLMPROC4_CANCEL_RES = 13, + NLMPROC4_UNLOCK_RES = 14, + NLMPROC4_GRANTED_RES = 15, + NLMPROC4_SM_NOTIFY = 16, + NLMPROC4_SHARE = 20, + NLMPROC4_UNSHARE = 21, + NLMPROC4_NM_LOCK = 22, + NLMPROC4_FREE_ALL = 23, +}; + +#ifndef NLM4_PROG +#define NLM4_PROG (100021) +#endif + +#define NLM4_netobj_sz (XDR_unsigned_int + XDR_QUADLEN(MAXNETOBJ_SZ)) +#define NLM4_fsh4_mode_sz (XDR_int) +#define NLM4_fsh4_access_sz (XDR_int) +#define NLM4_uint64_sz \ + (XDR_unsigned_hyper) +#define NLM4_int64_sz \ + (XDR_hyper) +#define NLM4_uint32_sz \ + (XDR_unsigned_long) +#define NLM4_int32_sz \ + (XDR_long) +#define NLM4_nlm4_stats_sz (XDR_int) +#define NLM4_nlm4_holder_sz \ + (XDR_bool + NLM4_int32_sz + NLM4_netobj_sz + NLM4_uint64_sz + NLM4_uint64_sz) +#define NLM4_nlm4_testrply_sz \ + (NLM4_nlm4_stats_sz + NLM4_nlm4_holder_sz) +#define NLM4_nlm4_stat_sz \ + (NLM4_nlm4_stats_sz) +#define NLM4_nlm4_res_sz \ + (NLM4_netobj_sz + NLM4_nlm4_stat_sz) +#define NLM4_nlm4_testres_sz \ + (NLM4_netobj_sz + NLM4_nlm4_testrply_sz) +#define NLM4_nlm4_lock_sz \ + (XDR_unsigned_int + XDR_QUADLEN(LM_MAXSTRLEN) + NLM4_netobj_sz + NLM4_netobj_sz + NLM4_int32_sz + NLM4_uint64_sz + NLM4_uint64_sz) +#define NLM4_nlm4_lockargs_sz \ + (NLM4_netobj_sz + XDR_bool + XDR_bool + NLM4_nlm4_lock_sz + XDR_bool + NLM4_int32_sz) +#define NLM4_nlm4_cancargs_sz \ + (NLM4_netobj_sz + XDR_bool + XDR_bool + NLM4_nlm4_lock_sz) +#define NLM4_nlm4_testargs_sz \ + (NLM4_netobj_sz + XDR_bool + NLM4_nlm4_lock_sz) +#define NLM4_nlm4_unlockargs_sz \ + (NLM4_netobj_sz + NLM4_nlm4_lock_sz) +#define NLM4_nlm4_share_sz \ + (XDR_unsigned_int + XDR_QUADLEN(LM_MAXSTRLEN) + NLM4_netobj_sz + NLM4_netobj_sz + NLM4_fsh4_mode_sz + NLM4_fsh4_access_sz) +#define NLM4_nlm4_shareargs_sz \ + (NLM4_netobj_sz + NLM4_nlm4_share_sz + XDR_bool) +#define NLM4_nlm4_shareres_sz \ + (NLM4_netobj_sz + NLM4_nlm4_stats_sz + NLM4_int32_sz) +#define NLM4_nlm4_notify_sz \ + (XDR_unsigned_int + XDR_QUADLEN(LM_MAXNAMELEN) + NLM4_int32_sz) +#define NLM4_nlm4_notifyargs_sz \ + (NLM4_nlm4_notify_sz + XDR_QUADLEN(SM_PRIV_SIZE)) +#define NLM4_MAX_ARGS_SZ \ + (NLM4_nlm4_lockargs_sz) + +#endif /* _LINUX_XDRGEN_NLM4_DEF_H */ -- cgit v1.2.3 From 17c1d66579ff27a7a8f2f407d1425272ff6fdd8c Mon Sep 17 00:00:00 2001 From: Jeff Layton Date: Mon, 23 Feb 2026 12:09:59 -0500 Subject: sunrpc: convert queue_lock from global spinlock to per-cache-detail lock The global queue_lock serializes all upcall queue operations across every cache_detail instance. Convert it to a per-cache-detail spinlock so that different caches (e.g. auth.unix.ip vs nfsd.fh) no longer contend with each other on queue operations. Signed-off-by: Jeff Layton Signed-off-by: Chuck Lever --- include/linux/sunrpc/cache.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/sunrpc/cache.h b/include/linux/sunrpc/cache.h index e783132e481f..3d32dd1f7b05 100644 --- a/include/linux/sunrpc/cache.h +++ b/include/linux/sunrpc/cache.h @@ -113,6 +113,7 @@ struct cache_detail { /* fields for communication over channel */ struct list_head queue; + spinlock_t queue_lock; atomic_t writers; /* how many time is /channel open */ time64_t last_close; /* if no writers, when did last close */ -- cgit v1.2.3 From 552d0e17ea042fc4f959c4543cbbd0e54de7a8e9 Mon Sep 17 00:00:00 2001 From: Jeff Layton Date: Mon, 23 Feb 2026 12:10:00 -0500 Subject: sunrpc: convert queue_wait from global to per-cache-detail waitqueue The queue_wait waitqueue is currently a file-scoped global, so a wake_up for one cache_detail wakes pollers on all caches. Convert it to a per-cache-detail field so that only pollers on the relevant cache are woken. Signed-off-by: Jeff Layton Signed-off-by: Chuck Lever --- include/linux/sunrpc/cache.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/linux') diff --git a/include/linux/sunrpc/cache.h b/include/linux/sunrpc/cache.h index 3d32dd1f7b05..031379efba24 100644 --- a/include/linux/sunrpc/cache.h +++ b/include/linux/sunrpc/cache.h @@ -16,6 +16,7 @@ #include #include #include +#include /* * Each cache requires: @@ -114,6 +115,7 @@ struct cache_detail { /* fields for communication over channel */ struct list_head queue; spinlock_t queue_lock; + wait_queue_head_t queue_wait; atomic_t writers; /* how many time is /channel open */ time64_t last_close; /* if no writers, when did last close */ -- cgit v1.2.3 From facc4e3c80420e3466003ce09b576e005b56a015 Mon Sep 17 00:00:00 2001 From: Jeff Layton Date: Mon, 23 Feb 2026 12:10:01 -0500 Subject: sunrpc: split cache_detail queue into request and reader lists Replace the single interleaved queue (which mixed cache_request and cache_reader entries distinguished by a ->reader flag) with two dedicated lists: cd->requests for upcall requests and cd->readers for open file handles. Readers now track their position via a monotonically increasing sequence number (next_seqno) rather than by their position in the shared list. Each cache_request is assigned a seqno when enqueued, and a new cache_next_request() helper finds the next request at or after a given seqno. This eliminates the cache_queue wrapper struct entirely, simplifies the reader-skipping loops in cache_read/cache_poll/cache_ioctl/ cache_release, and makes the data flow easier to reason about. Signed-off-by: Jeff Layton Signed-off-by: Chuck Lever --- include/linux/sunrpc/cache.h | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/sunrpc/cache.h b/include/linux/sunrpc/cache.h index 031379efba24..b1e595c2615b 100644 --- a/include/linux/sunrpc/cache.h +++ b/include/linux/sunrpc/cache.h @@ -113,9 +113,11 @@ struct cache_detail { int entries; /* fields for communication over channel */ - struct list_head queue; + struct list_head requests; + struct list_head readers; spinlock_t queue_lock; wait_queue_head_t queue_wait; + u64 next_seqno; atomic_t writers; /* how many time is /channel open */ time64_t last_close; /* if no writers, when did last close */ -- cgit v1.2.3 From ee66b9e3e1c69efc986f3932555f07121c3460a7 Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Thu, 26 Feb 2026 09:47:35 -0500 Subject: SUNRPC: Allocate a separate Reply page array struct svc_rqst uses a single dynamically-allocated page array (rq_pages) for both the incoming RPC Call message and the outgoing RPC Reply message. rq_respages is a sliding pointer into rq_pages that each transport receive path must compute based on how many pages the Call consumed. This boundary tracking is a source of confusion and bugs, and prevents an RPC transaction from having both a large Call and a large Reply simultaneously. Allocate rq_respages as its own page array, eliminating the boundary arithmetic. This decouples Call and Reply buffer lifetimes, following the precedent set by rq_bvec (a separate dynamically- allocated array for I/O vectors). Each svc_rqst now pins twice as many pages as before. For a server running 16 threads with a 1MB maximum payload, the additional cost is roughly 16MB of pinned memory. The new dynamic svc thread count facility keeps this overhead minimal on an idle server. A subsequent patch in this series limits per-request repopulation to only the pages released during the previous RPC, avoiding a full-array scan on each call to svc_alloc_arg(). Note: We've considered several alternatives to maintaining a full second array. Each alternative reintroduces either boundary logic complexity or I/O-path allocation pressure. rq_next_page is initialized in svc_alloc_arg() and svc_process() during Reply construction, and in svc_rdma_recvfrom() as a precaution on error paths. Transport receive paths no longer compute it from the Call size. Reviewed-by: Jeff Layton Signed-off-by: Chuck Lever --- include/linux/sunrpc/svc.h | 47 +++++++++++++++++++++++----------------------- 1 file changed, 23 insertions(+), 24 deletions(-) (limited to 'include/linux') diff --git a/include/linux/sunrpc/svc.h b/include/linux/sunrpc/svc.h index 62152e4f3bcc..3b1a98ab5cba 100644 --- a/include/linux/sunrpc/svc.h +++ b/include/linux/sunrpc/svc.h @@ -134,25 +134,24 @@ enum { extern u32 svc_max_payload(const struct svc_rqst *rqstp); /* - * RPC Requests and replies are stored in one or more pages. - * We maintain an array of pages for each server thread. - * Requests are copied into these pages as they arrive. Remaining - * pages are available to write the reply into. + * RPC Call and Reply messages each have their own page array. + * rq_pages holds the incoming Call message; rq_respages holds + * the outgoing Reply message. Both arrays are sized to + * svc_serv_maxpages() entries and are allocated dynamically. * - * Pages are sent using ->sendmsg with MSG_SPLICE_PAGES so each server thread - * needs to allocate more to replace those used in sending. To help keep track - * of these pages we have a receive list where all pages initialy live, and a - * send list where pages are moved to when there are to be part of a reply. + * Pages are sent using ->sendmsg with MSG_SPLICE_PAGES so each + * server thread needs to allocate more to replace those used in + * sending. * - * We use xdr_buf for holding responses as it fits well with NFS - * read responses (that have a header, and some data pages, and possibly - * a tail) and means we can share some client side routines. + * xdr_buf holds responses; the structure fits NFS read responses + * (header, data pages, optional tail) and enables sharing of + * client-side routines. * - * The xdr_buf.head kvec always points to the first page in the rq_*pages - * list. The xdr_buf.pages pointer points to the second page on that - * list. xdr_buf.tail points to the end of the first page. - * This assumes that the non-page part of an rpc reply will fit - * in a page - NFSd ensures this. lockd also has no trouble. + * The xdr_buf.head kvec always points to the first page in the + * rq_*pages list. The xdr_buf.pages pointer points to the second + * page on that list. xdr_buf.tail points to the end of the first + * page. This assumes that the non-page part of an rpc reply will + * fit in a page - NFSd ensures this. lockd also has no trouble. */ /** @@ -162,10 +161,10 @@ extern u32 svc_max_payload(const struct svc_rqst *rqstp); * Returns a count of pages or vectors that can hold the maximum * size RPC message for @serv. * - * Each request/reply pair can have at most one "payload", plus two - * pages, one for the request, and one for the reply. - * nfsd_splice_actor() might need an extra page when a READ payload - * is not page-aligned. + * Each page array can hold at most one payload plus two + * overhead pages (one for the RPC header, one for tail data). + * nfsd_splice_actor() might need an extra page when a READ + * payload is not page-aligned. */ static inline unsigned long svc_serv_maxpages(const struct svc_serv *serv) { @@ -204,11 +203,11 @@ struct svc_rqst { struct xdr_stream rq_res_stream; struct folio *rq_scratch_folio; struct xdr_buf rq_res; - unsigned long rq_maxpages; /* num of entries in rq_pages */ - struct page * *rq_pages; - struct page * *rq_respages; /* points into rq_pages */ + unsigned long rq_maxpages; /* entries per page array */ + struct page * *rq_pages; /* Call buffer pages */ + struct page * *rq_respages; /* Reply buffer pages */ struct page * *rq_next_page; /* next reply page to use */ - struct page * *rq_page_end; /* one past the last page */ + struct page * *rq_page_end; /* one past the last reply page */ struct folio_batch rq_fbatch; struct bio_vec *rq_bvec; -- cgit v1.2.3 From 7ed7504287a627834f2a35ef04e5dfd26d1c8986 Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Thu, 26 Feb 2026 09:47:38 -0500 Subject: SUNRPC: Track consumed rq_pages entries The rq_pages array holds pages allocated for incoming RPC requests. Two transport receive paths NULL entries in rq_pages to prevent svc_rqst_release_pages() from freeing pages that the transport has taken ownership of: - svc_tcp_save_pages() moves partial request data pages to svsk->sk_pages during multi-fragment TCP reassembly. - svc_rdma_clear_rqst_pages() moves request data pages to head->rc_pages because they are targets of active RDMA Read WRs. A new rq_pages_nfree field in struct svc_rqst records how many entries were NULLed. svc_alloc_arg() uses it to refill only those entries rather than scanning the full rq_pages array. In steady state, the transport NULLs a handful of entries per RPC, so the allocator visits only those entries instead of the full ~259 slots (for 1MB messages). Reviewed-by: Jeff Layton Signed-off-by: Chuck Lever --- include/linux/sunrpc/svc.h | 10 ++++++++++ 1 file changed, 10 insertions(+) (limited to 'include/linux') diff --git a/include/linux/sunrpc/svc.h b/include/linux/sunrpc/svc.h index 3b1a98ab5cba..c3399cf64524 100644 --- a/include/linux/sunrpc/svc.h +++ b/include/linux/sunrpc/svc.h @@ -143,6 +143,15 @@ extern u32 svc_max_payload(const struct svc_rqst *rqstp); * server thread needs to allocate more to replace those used in * sending. * + * rq_pages request page contract: + * + * Transport receive paths that move request data pages out of + * rq_pages -- TCP multi-fragment reassembly (svc_tcp_save_pages) + * and RDMA Read I/O (svc_rdma_clear_rqst_pages) -- NULL those + * entries to prevent svc_rqst_release_pages() from freeing pages + * still in transport use, and set rq_pages_nfree to the count. + * svc_alloc_arg() refills only that many rq_pages entries. + * * xdr_buf holds responses; the structure fits NFS read responses * (header, data pages, optional tail) and enables sharing of * client-side routines. @@ -204,6 +213,7 @@ struct svc_rqst { struct folio *rq_scratch_folio; struct xdr_buf rq_res; unsigned long rq_maxpages; /* entries per page array */ + unsigned long rq_pages_nfree; /* rq_pages entries NULLed by transport */ struct page * *rq_pages; /* Call buffer pages */ struct page * *rq_respages; /* Reply buffer pages */ struct page * *rq_next_page; /* next reply page to use */ -- cgit v1.2.3 From d7f3efd9ff474867b04e1ea784690f02450a245b Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Thu, 26 Feb 2026 09:47:39 -0500 Subject: SUNRPC: Optimize rq_respages allocation in svc_alloc_arg svc_alloc_arg() invokes alloc_pages_bulk() with the full rq_maxpages count (~259 for 1MB messages) for the rq_respages array, causing a full-array scan despite most slots holding valid pages. svc_rqst_release_pages() NULLs only the range [rq_respages, rq_next_page) after each RPC, so only that range contains NULL entries. Limit the rq_respages fill in svc_alloc_arg() to that range instead of scanning the full array. svc_init_buffer() initializes rq_next_page to span the entire rq_respages array, so the first svc_alloc_arg() call fills all slots. Reviewed-by: Jeff Layton Signed-off-by: Chuck Lever --- include/linux/sunrpc/svc.h | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'include/linux') diff --git a/include/linux/sunrpc/svc.h b/include/linux/sunrpc/svc.h index c3399cf64524..669c944eaf7f 100644 --- a/include/linux/sunrpc/svc.h +++ b/include/linux/sunrpc/svc.h @@ -152,6 +152,10 @@ extern u32 svc_max_payload(const struct svc_rqst *rqstp); * still in transport use, and set rq_pages_nfree to the count. * svc_alloc_arg() refills only that many rq_pages entries. * + * For rq_respages, svc_rqst_release_pages() NULLs entries in + * [rq_respages, rq_next_page) after each RPC. svc_alloc_arg() + * refills only that range. + * * xdr_buf holds responses; the structure fits NFS read responses * (header, data pages, optional tail) and enables sharing of * client-side routines. -- cgit v1.2.3 From ccc89b9d1ed233349cfe8d87b842e7351b74d8de Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Fri, 27 Feb 2026 09:03:28 -0500 Subject: svcrdma: Add fair queuing for Send Queue access When the Send Queue fills, multiple threads may wait for SQ slots. The previous implementation had no ordering guarantee, allowing starvation when one thread repeatedly acquires slots while others wait indefinitely. Introduce a ticket-based fair queuing system. Each waiter takes a ticket number and is served in FIFO order. This ensures forward progress for all waiters when SQ capacity is constrained. The implementation has two phases: 1. Fast path: attempt to reserve SQ slots without waiting 2. Slow path: take a ticket, wait for turn, then wait for slots The ticket system adds two atomic counters to the transport: - sc_sq_ticket_head: next ticket to issue - sc_sq_ticket_tail: ticket currently being served A dedicated wait queue (sc_sq_ticket_wait) handles ticket ordering, separate from sc_send_wait which handles SQ capacity. This separation ensures that send completions (the high-frequency wake source) wake only the current ticket holder rather than all queued waiters. Ticket handoff wakes only the ticket wait queue, and each ticket holder that exits via connection close propagates the wake to the next waiter in line. When a waiter successfully reserves slots, it advances the tail counter and wakes the next waiter. This creates an orderly handoff that prevents starvation while maintaining good throughput on the fast path when contention is low. Signed-off-by: Chuck Lever --- include/linux/sunrpc/svc_rdma.h | 10 ++++++++++ 1 file changed, 10 insertions(+) (limited to 'include/linux') diff --git a/include/linux/sunrpc/svc_rdma.h b/include/linux/sunrpc/svc_rdma.h index 57f4fd94166a..658b8498177e 100644 --- a/include/linux/sunrpc/svc_rdma.h +++ b/include/linux/sunrpc/svc_rdma.h @@ -84,6 +84,9 @@ struct svcxprt_rdma { atomic_t sc_sq_avail; /* SQEs ready to be consumed */ unsigned int sc_sq_depth; /* Depth of SQ */ + atomic_t sc_sq_ticket_head; /* Next ticket to issue */ + atomic_t sc_sq_ticket_tail; /* Ticket currently serving */ + wait_queue_head_t sc_sq_ticket_wait; /* Ticket ordering waitlist */ __be32 sc_fc_credits; /* Forward credits */ u32 sc_max_requests; /* Max requests */ u32 sc_max_bc_requests;/* Backward credits */ @@ -306,6 +309,13 @@ extern void svc_rdma_send_error_msg(struct svcxprt_rdma *rdma, struct svc_rdma_recv_ctxt *rctxt, int status); extern void svc_rdma_wake_send_waiters(struct svcxprt_rdma *rdma, int avail); +extern int svc_rdma_sq_wait(struct svcxprt_rdma *rdma, + const struct rpc_rdma_cid *cid, int sqecount); +extern int svc_rdma_post_send_err(struct svcxprt_rdma *rdma, + const struct rpc_rdma_cid *cid, + const struct ib_send_wr *bad_wr, + const struct ib_send_wr *first_wr, + int sqecount, int ret); extern int svc_rdma_sendto(struct svc_rqst *); extern int svc_rdma_result_payload(struct svc_rqst *rqstp, unsigned int offset, unsigned int length); -- cgit v1.2.3 From d16f060f3ee297424c0aba047b1d49208adb9318 Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Fri, 27 Feb 2026 09:03:31 -0500 Subject: svcrdma: Add Write chunk WRs to the RPC's Send WR chain Previously, Write chunk RDMA Writes were posted via a separate ib_post_send() call with their own completion handler. Each Write chunk incurred a doorbell and generated a completion event. Link Write chunk WRs onto the RPC Reply's Send WR chain so that a single ib_post_send() call posts both the RDMA Writes and the Send WR. A single completion event signals that all operations have finished. This reduces both doorbell rate and completion rate, as well as eliminating the latency of a round-trip between the Write chunk completion and the subsequent Send WR posting. The lifecycle of Write chunk resources changes: previously, the svc_rdma_write_done() completion handler released Write chunk resources when RDMA Writes completed. With WR chaining, resources remain live until the Send completion. A new sc_write_info_list tracks Write chunk metadata attached to each Send context, and svc_rdma_write_chunk_release() frees these resources when the Send context is released. The svc_rdma_write_done() handler now handles only error cases. On success it returns immediately since the Send completion handles resource release. On failure (WR flush), it closes the connection to signal to the client that the RPC Reply is incomplete. Signed-off-by: Chuck Lever --- include/linux/sunrpc/svc_rdma.h | 13 ++++++++++--- 1 file changed, 10 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/sunrpc/svc_rdma.h b/include/linux/sunrpc/svc_rdma.h index 658b8498177e..df6e08aaad57 100644 --- a/include/linux/sunrpc/svc_rdma.h +++ b/include/linux/sunrpc/svc_rdma.h @@ -216,6 +216,7 @@ struct svc_rdma_recv_ctxt { */ struct svc_rdma_write_info { struct svcxprt_rdma *wi_rdma; + struct list_head wi_list; const struct svc_rdma_chunk *wi_chunk; @@ -244,7 +245,10 @@ struct svc_rdma_send_ctxt { struct ib_cqe sc_cqe; struct xdr_buf sc_hdrbuf; struct xdr_stream sc_stream; + + struct list_head sc_write_info_list; struct svc_rdma_write_info sc_reply_info; + void *sc_xprt_buf; int sc_page_count; int sc_cur_sge_no; @@ -277,11 +281,14 @@ extern void svc_rdma_cc_init(struct svcxprt_rdma *rdma, extern void svc_rdma_cc_release(struct svcxprt_rdma *rdma, struct svc_rdma_chunk_ctxt *cc, enum dma_data_direction dir); +extern void svc_rdma_write_chunk_release(struct svcxprt_rdma *rdma, + struct svc_rdma_send_ctxt *ctxt); extern void svc_rdma_reply_chunk_release(struct svcxprt_rdma *rdma, struct svc_rdma_send_ctxt *ctxt); -extern int svc_rdma_send_write_list(struct svcxprt_rdma *rdma, - const struct svc_rdma_recv_ctxt *rctxt, - const struct xdr_buf *xdr); +extern int svc_rdma_prepare_write_list(struct svcxprt_rdma *rdma, + const struct svc_rdma_recv_ctxt *rctxt, + struct svc_rdma_send_ctxt *sctxt, + const struct xdr_buf *xdr); extern int svc_rdma_prepare_reply_chunk(struct svcxprt_rdma *rdma, const struct svc_rdma_pcl *write_pcl, const struct svc_rdma_pcl *reply_pcl, -- cgit v1.2.3 From 3603bf99062c6d563df4fba3848f829d5401d959 Mon Sep 17 00:00:00 2001 From: Randy Dunlap Date: Sat, 28 Feb 2026 14:09:22 -0800 Subject: SUNRPC: xdr.h: fix all kernel-doc warnings Correct a function parameter name (s/page/folio/) and add function return value sections for multiple functions to eliminate kernel-doc warnings: Warning: include/linux/sunrpc/xdr.h:298 function parameter 'folio' not described in 'xdr_set_scratch_folio' Warning: include/linux/sunrpc/xdr.h:337 No description found for return value of 'xdr_stream_remaining' Warning: include/linux/sunrpc/xdr.h:357 No description found for return value of 'xdr_align_size' Warning: include/linux/sunrpc/xdr.h:374 No description found for return value of 'xdr_pad_size' Warning: include/linux/sunrpc/xdr.h:387 No description found for return value of 'xdr_stream_encode_item_present' Signed-off-by: Randy Dunlap Signed-off-by: Chuck Lever --- include/linux/sunrpc/xdr.h | 48 +++++++++++++++++++++++----------------------- 1 file changed, 24 insertions(+), 24 deletions(-) (limited to 'include/linux') diff --git a/include/linux/sunrpc/xdr.h b/include/linux/sunrpc/xdr.h index 152597750f55..b639a6fafcbc 100644 --- a/include/linux/sunrpc/xdr.h +++ b/include/linux/sunrpc/xdr.h @@ -290,7 +290,7 @@ xdr_set_scratch_buffer(struct xdr_stream *xdr, void *buf, size_t buflen) /** * xdr_set_scratch_folio - Attach a scratch buffer for decoding data * @xdr: pointer to xdr_stream struct - * @page: an anonymous folio + * @folio: an anonymous folio * * See xdr_set_scratch_buffer(). */ @@ -330,7 +330,7 @@ static inline void xdr_commit_encode(struct xdr_stream *xdr) * xdr_stream_remaining - Return the number of bytes remaining in the stream * @xdr: pointer to struct xdr_stream * - * Return value: + * Returns: * Number of bytes remaining in @xdr before xdr->end */ static inline size_t @@ -350,7 +350,7 @@ ssize_t xdr_stream_encode_opaque_auth(struct xdr_stream *xdr, u32 flavor, * xdr_align_size - Calculate padded size of an object * @n: Size of an object being XDR encoded (in bytes) * - * Return value: + * Returns: * Size (in bytes) of the object including xdr padding */ static inline size_t @@ -368,7 +368,7 @@ xdr_align_size(size_t n) * This implementation avoids the need for conditional * branches or modulo division. * - * Return value: + * Returns: * Size (in bytes) of the needed XDR pad */ static inline size_t xdr_pad_size(size_t n) @@ -380,7 +380,7 @@ static inline size_t xdr_pad_size(size_t n) * xdr_stream_encode_item_present - Encode a "present" list item * @xdr: pointer to xdr_stream * - * Return values: + * Returns: * On success, returns length in bytes of XDR buffer consumed * %-EMSGSIZE on XDR buffer overflow */ @@ -399,7 +399,7 @@ static inline ssize_t xdr_stream_encode_item_present(struct xdr_stream *xdr) * xdr_stream_encode_item_absent - Encode a "not present" list item * @xdr: pointer to xdr_stream * - * Return values: + * Returns: * On success, returns length in bytes of XDR buffer consumed * %-EMSGSIZE on XDR buffer overflow */ @@ -419,7 +419,7 @@ static inline int xdr_stream_encode_item_absent(struct xdr_stream *xdr) * @p: address in a buffer into which to encode * @n: boolean value to encode * - * Return value: + * Returns: * Address of item following the encoded boolean */ static inline __be32 *xdr_encode_bool(__be32 *p, u32 n) @@ -433,7 +433,7 @@ static inline __be32 *xdr_encode_bool(__be32 *p, u32 n) * @xdr: pointer to xdr_stream * @n: boolean value to encode * - * Return values: + * Returns: * On success, returns length in bytes of XDR buffer consumed * %-EMSGSIZE on XDR buffer overflow */ @@ -453,7 +453,7 @@ static inline int xdr_stream_encode_bool(struct xdr_stream *xdr, __u32 n) * @xdr: pointer to xdr_stream * @n: integer to encode * - * Return values: + * Returns: * On success, returns length in bytes of XDR buffer consumed * %-EMSGSIZE on XDR buffer overflow */ @@ -474,7 +474,7 @@ xdr_stream_encode_u32(struct xdr_stream *xdr, __u32 n) * @xdr: pointer to xdr_stream * @n: integer to encode * - * Return values: + * Returns: * On success, returns length in bytes of XDR buffer consumed * %-EMSGSIZE on XDR buffer overflow */ @@ -495,7 +495,7 @@ xdr_stream_encode_be32(struct xdr_stream *xdr, __be32 n) * @xdr: pointer to xdr_stream * @n: 64-bit integer to encode * - * Return values: + * Returns: * On success, returns length in bytes of XDR buffer consumed * %-EMSGSIZE on XDR buffer overflow */ @@ -517,7 +517,7 @@ xdr_stream_encode_u64(struct xdr_stream *xdr, __u64 n) * @ptr: pointer to void pointer * @len: size of object * - * Return values: + * Returns: * On success, returns length in bytes of XDR buffer consumed * %-EMSGSIZE on XDR buffer overflow */ @@ -542,7 +542,7 @@ xdr_stream_encode_opaque_inline(struct xdr_stream *xdr, void **ptr, size_t len) * @ptr: pointer to opaque data object * @len: size of object pointed to by @ptr * - * Return values: + * Returns: * On success, returns length in bytes of XDR buffer consumed * %-EMSGSIZE on XDR buffer overflow */ @@ -563,7 +563,7 @@ xdr_stream_encode_opaque_fixed(struct xdr_stream *xdr, const void *ptr, size_t l * @ptr: pointer to opaque data object * @len: size of object pointed to by @ptr * - * Return values: + * Returns: * On success, returns length in bytes of XDR buffer consumed * %-EMSGSIZE on XDR buffer overflow */ @@ -585,7 +585,7 @@ xdr_stream_encode_opaque(struct xdr_stream *xdr, const void *ptr, size_t len) * @array: array of integers * @array_size: number of elements in @array * - * Return values: + * Returns: * On success, returns length in bytes of XDR buffer consumed * %-EMSGSIZE on XDR buffer overflow */ @@ -608,7 +608,7 @@ xdr_stream_encode_uint32_array(struct xdr_stream *xdr, * xdr_item_is_absent - symbolically handle XDR discriminators * @p: pointer to undecoded discriminator * - * Return values: + * Returns: * %true if the following XDR item is absent * %false if the following XDR item is present */ @@ -621,7 +621,7 @@ static inline bool xdr_item_is_absent(const __be32 *p) * xdr_item_is_present - symbolically handle XDR discriminators * @p: pointer to undecoded discriminator * - * Return values: + * Returns: * %true if the following XDR item is present * %false if the following XDR item is absent */ @@ -635,7 +635,7 @@ static inline bool xdr_item_is_present(const __be32 *p) * @xdr: pointer to xdr_stream * @ptr: pointer to a u32 in which to store the result * - * Return values: + * Returns: * %0 on success * %-EBADMSG on XDR buffer overflow */ @@ -656,7 +656,7 @@ xdr_stream_decode_bool(struct xdr_stream *xdr, __u32 *ptr) * @xdr: pointer to xdr_stream * @ptr: location to store integer * - * Return values: + * Returns: * %0 on success * %-EBADMSG on XDR buffer overflow */ @@ -677,7 +677,7 @@ xdr_stream_decode_u32(struct xdr_stream *xdr, __u32 *ptr) * @xdr: pointer to xdr_stream * @ptr: location to store integer * - * Return values: + * Returns: * %0 on success * %-EBADMSG on XDR buffer overflow */ @@ -698,7 +698,7 @@ xdr_stream_decode_be32(struct xdr_stream *xdr, __be32 *ptr) * @xdr: pointer to xdr_stream * @ptr: location to store 64-bit integer * - * Return values: + * Returns: * %0 on success * %-EBADMSG on XDR buffer overflow */ @@ -720,7 +720,7 @@ xdr_stream_decode_u64(struct xdr_stream *xdr, __u64 *ptr) * @ptr: location to store data * @len: size of buffer pointed to by @ptr * - * Return values: + * Returns: * %0 on success * %-EBADMSG on XDR buffer overflow */ @@ -746,7 +746,7 @@ xdr_stream_decode_opaque_fixed(struct xdr_stream *xdr, void *ptr, size_t len) * on @xdr. It is therefore expected that the object it points to should * be processed immediately. * - * Return values: + * Returns: * On success, returns size of object stored in *@ptr * %-EBADMSG on XDR buffer overflow * %-EMSGSIZE if the size of the object would exceed @maxlen @@ -777,7 +777,7 @@ xdr_stream_decode_opaque_inline(struct xdr_stream *xdr, void **ptr, size_t maxle * @array: location to store the integer array or NULL * @array_size: number of elements to store * - * Return values: + * Returns: * On success, returns number of elements stored in @array * %-EBADMSG on XDR buffer overflow * %-EMSGSIZE if the size of the array exceeds @array_size -- cgit v1.2.3 From 4e2866b2baaddfff6069a2f18fc134c1d5a08f2b Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Wed, 11 Mar 2026 12:18:54 -0400 Subject: SUNRPC: Add svc_rqst_page_release() helper svc_rqst_replace_page() releases displaced pages through a per-rqst folio batch, but exposes the add-or-flush sequence directly. svc_tcp_restore_pages() releases displaced pages individually with put_page(). Introduce svc_rqst_page_release() to encapsulate the batched release mechanism. Convert svc_rqst_replace_page() and svc_tcp_restore_pages() to use it. The latter now benefits from the same batched release that svc_rqst_replace_page() already uses. Reviewed-by: Christoph Hellwig Signed-off-by: Chuck Lever --- include/linux/sunrpc/svc.h | 15 +++++++++++++++ 1 file changed, 15 insertions(+) (limited to 'include/linux') diff --git a/include/linux/sunrpc/svc.h b/include/linux/sunrpc/svc.h index 669c944eaf7f..1ebd9c7efa70 100644 --- a/include/linux/sunrpc/svc.h +++ b/include/linux/sunrpc/svc.h @@ -498,6 +498,21 @@ int svc_generic_rpcbind_set(struct net *net, #define RPC_MAX_ADDRBUFLEN (63U) +/** + * svc_rqst_page_release - release a page associated with an RPC transaction + * @rqstp: RPC transaction context + * @page: page to release + * + * Released pages are batched and freed together, reducing + * allocator pressure under heavy RPC workloads. + */ +static inline void svc_rqst_page_release(struct svc_rqst *rqstp, + struct page *page) +{ + if (!folio_batch_add(&rqstp->rq_fbatch, page_folio(page))) + __folio_batch_release(&rqstp->rq_fbatch); +} + /* * When we want to reduce the size of the reserved space in the response * buffer, we need to take into account the size of any checksum data that -- cgit v1.2.3