From 1fa4e69a54a250fa17d2afd9c5b54a59329033c1 Mon Sep 17 00:00:00 2001 From: Mateusz Guzik Date: Wed, 3 Dec 2025 10:48:36 +0100 Subject: filelock: use a consume fence in locks_inode_context() Matches the idiom of storing a pointer with a release fence and safely getting the content with a consume fence after. Eliminates an actual fence on some archs. Reviewed-by: Jeff Layton Signed-off-by: Mateusz Guzik Link: https://patch.msgid.link/20251203094837.290654-1-mjguzik@gmail.com Signed-off-by: Christian Brauner --- include/linux/filelock.h | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/filelock.h b/include/linux/filelock.h index 54b824c05299..dc15f5427680 100644 --- a/include/linux/filelock.h +++ b/include/linux/filelock.h @@ -241,7 +241,10 @@ bool locks_owner_has_blockers(struct file_lock_context *flctx, static inline struct file_lock_context * locks_inode_context(const struct inode *inode) { - return smp_load_acquire(&inode->i_flctx); + /* + * Paired with the fence in locks_get_lock_context(). + */ + return READ_ONCE(inode->i_flctx); } #else /* !CONFIG_FILE_LOCKING */ -- cgit v1.2.3 From 887e97745ec336c2f49b6c0af3c4cc00a5df3211 Mon Sep 17 00:00:00 2001 From: Mateusz Guzik Date: Wed, 3 Dec 2025 10:48:37 +0100 Subject: fs: track the inode having file locks with a flag in ->i_opflags Opening and closing an inode dirties the ->i_readcount field. Depending on the alignment of the inode, it may happen to false-share with other fields loaded both for both operations to various extent. This notably concerns the ->i_flctx field. Since most inodes don't have the field populated, this bit can be managed with a flag in ->i_opflags instead which bypasses the problem. Here are results I obtained while opening a file read-only in a loop with 24 cores doing the work on Sapphire Rapids. Utilizing the flag as opposed to reading ->i_flctx field was toggled at runtime as the benchmark was running, to make sure both results come from the same alignment. before: 3233740 after: 3373346 (+4%) before: 3284313 after: 3518711 (+7%) before: 3505545 after: 4092806 (+16%) Or to put it differently, this varies wildly depending on how (un)lucky you get. The primary bottleneck before and after is the avoidable lockref trip in do_dentry_open(). Reviewed-by: Jeff Layton Signed-off-by: Mateusz Guzik Link: https://patch.msgid.link/20251203094837.290654-2-mjguzik@gmail.com Signed-off-by: Christian Brauner --- include/linux/filelock.h | 15 +++++++++++---- include/linux/fs.h | 1 + 2 files changed, 12 insertions(+), 4 deletions(-) (limited to 'include/linux') diff --git a/include/linux/filelock.h b/include/linux/filelock.h index dc15f5427680..4a8912b9653e 100644 --- a/include/linux/filelock.h +++ b/include/linux/filelock.h @@ -242,8 +242,12 @@ static inline struct file_lock_context * locks_inode_context(const struct inode *inode) { /* - * Paired with the fence in locks_get_lock_context(). + * Paired with smp_store_release in locks_get_lock_context(). + * + * Ensures ->i_flctx will be visible if we spotted the flag. */ + if (likely(!(smp_load_acquire(&inode->i_opflags) & IOP_FLCTX))) + return NULL; return READ_ONCE(inode->i_flctx); } @@ -471,7 +475,7 @@ static inline int break_lease(struct inode *inode, unsigned int mode) * could end up racing with tasks trying to set a new lease on this * file. */ - flctx = READ_ONCE(inode->i_flctx); + flctx = locks_inode_context(inode); if (!flctx) return 0; smp_mb(); @@ -490,7 +494,7 @@ static inline int break_deleg(struct inode *inode, unsigned int flags) * could end up racing with tasks trying to set a new lease on this * file. */ - flctx = READ_ONCE(inode->i_flctx); + flctx = locks_inode_context(inode); if (!flctx) return 0; smp_mb(); @@ -535,8 +539,11 @@ static inline int break_deleg_wait(struct delegated_inode *di) static inline int break_layout(struct inode *inode, bool wait) { + struct file_lock_context *flctx; + smp_mb(); - if (inode->i_flctx && !list_empty_careful(&inode->i_flctx->flc_lease)) { + flctx = locks_inode_context(inode); + if (flctx && !list_empty_careful(&flctx->flc_lease)) { unsigned int flags = LEASE_BREAK_LAYOUT; if (!wait) diff --git a/include/linux/fs.h b/include/linux/fs.h index 04ceeca12a0d..094b0adcb035 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -631,6 +631,7 @@ is_uncached_acl(struct posix_acl *acl) #define IOP_MGTIME 0x0020 #define IOP_CACHED_LINK 0x0040 #define IOP_FASTPERM_MAY_EXEC 0x0080 +#define IOP_FLCTX 0x0100 /* * Inode state bits. Protected by inode->i_lock -- cgit v1.2.3 From c0aac5975bafc86f6817b14e9f71dcb5064a9183 Mon Sep 17 00:00:00 2001 From: Mateusz Guzik Date: Wed, 3 Dec 2025 10:28:50 +0100 Subject: ns: pad refcount Note no effort is made to make sure structs embedding the namespace are themselves aligned, so this is not guaranteed to eliminate cacheline bouncing due to refcount management. Signed-off-by: Mateusz Guzik Link: https://patch.msgid.link/20251203092851.287617-2-mjguzik@gmail.com Signed-off-by: Christian Brauner --- include/linux/ns/ns_common_types.h | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/ns/ns_common_types.h b/include/linux/ns/ns_common_types.h index b332b019b29c..0014fbc1c626 100644 --- a/include/linux/ns/ns_common_types.h +++ b/include/linux/ns/ns_common_types.h @@ -108,11 +108,13 @@ extern const struct proc_ns_operations utsns_operations; * @ns_tree: namespace tree nodes and active reference count */ struct ns_common { + struct { + refcount_t __ns_ref; /* do not use directly */ + } ____cacheline_aligned_in_smp; u32 ns_type; struct dentry *stashed; const struct proc_ns_operations *ops; unsigned int inum; - refcount_t __ns_ref; /* do not use directly */ union { struct ns_tree; struct rcu_head ns_rcu; -- cgit v1.2.3 From 0f166bf1d6d82701cc1d94445cc2a9107d1790df Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Thomas=20Wei=C3=9Fschuh?= Date: Tue, 23 Dec 2025 08:00:39 +0100 Subject: select: store end_time as timespec64 in restart block MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Storing the end time seconds as 'unsigned long' can lead to truncation on 32-bit architectures if assigned from the 64-bit timespec64::tv_sec. As the select() core uses timespec64 consistently, also use that in the restart block. This also allows the simplification of the accessors. Signed-off-by: Thomas Weißschuh Link: https://patch.msgid.link/20251223-restart-block-expiration-v2-1-8e33e5df7359@linutronix.de Reviewed-by: Jan Kara Signed-off-by: Christian Brauner --- include/linux/restart_block.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/restart_block.h b/include/linux/restart_block.h index 67d2bf579942..9b262109726d 100644 --- a/include/linux/restart_block.h +++ b/include/linux/restart_block.h @@ -6,6 +6,7 @@ #define __LINUX_RESTART_BLOCK_H #include +#include #include struct __kernel_timespec; @@ -50,8 +51,7 @@ struct restart_block { struct pollfd __user *ufds; int nfds; int has_timeout; - unsigned long tv_sec; - unsigned long tv_nsec; + struct timespec64 end_time; } poll; }; }; -- cgit v1.2.3 From 6784f274722559c0cdaaa418bc8b7b1d61c314f9 Mon Sep 17 00:00:00 2001 From: Breno Leitao Date: Wed, 7 Jan 2026 06:06:36 -0800 Subject: device_cgroup: remove branch hint after code refactor commit 4ef4ac360101 ("device_cgroup: avoid access to ->i_rdev in the common case in devcgroup_inode_permission()") reordered the checks in devcgroup_inode_permission() to check the inode mode before checking i_rdev, for better cache behavior. However, the likely() annotation on the i_rdev check was not updated to reflect the new code flow. Originally, when i_rdev was checked first, likely(!inode->i_rdev) made sense because most inodes were(?) regular files/directories, thus i_rdev == 0. After the reorder, by the time we reach the i_rdev check, we have already confirmed the inode IS a block or character device. Block and character special files are precisely defined by having a device number (i_rdev), so !inode->i_rdev is now the rare edge case, not the common case. Branch profiling confirmed this is 100% mispredicted: correct incorrect % Function File Line ------- --------- - -------- ---- ---- 0 2631904 100 devcgroup_inode_permission device_cgroup.h 24 Remove likely() to avoid giving the wrong hint to the CPU. Fixes: 4ef4ac360101 ("device_cgroup: avoid access to ->i_rdev in the common case in devcgroup_inode_permission()") Signed-off-by: Breno Leitao Link: https://patch.msgid.link/20260107-likely_device-v1-1-0c55f83a7e47@debian.org Reviewed-by: Mateusz Guzik Signed-off-by: Christian Brauner --- include/linux/device_cgroup.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/device_cgroup.h b/include/linux/device_cgroup.h index 0864773a57e8..822085bc2d20 100644 --- a/include/linux/device_cgroup.h +++ b/include/linux/device_cgroup.h @@ -21,7 +21,7 @@ static inline int devcgroup_inode_permission(struct inode *inode, int mask) if (likely(!S_ISBLK(inode->i_mode) && !S_ISCHR(inode->i_mode))) return 0; - if (likely(!inode->i_rdev)) + if (!inode->i_rdev) return 0; if (S_ISBLK(inode->i_mode)) -- cgit v1.2.3 From 5e7fa6bfa9b5ced6868fc652d5c40fe0eac154d9 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Andr=C3=A9=20Almeida?= Date: Mon, 12 Jan 2026 22:51:24 -0300 Subject: exportfs: Fix kernel-doc output for get_name() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Without a space between %NAME_MAX and the plus sign, kernel-doc will output ``NAME_MAX``+1, which scapes the last backtick and make Sphinx format a much larger string as monospaced text. Signed-off-by: André Almeida Link: https://patch.msgid.link/20260112-tonyk-fs_uuid-v1-1-acc1889de772@igalia.com Reviewed-by: Chuck Lever Signed-off-by: Christian Brauner --- include/linux/exportfs.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/exportfs.h b/include/linux/exportfs.h index f0cf2714ec52..599ea86363e1 100644 --- a/include/linux/exportfs.h +++ b/include/linux/exportfs.h @@ -234,7 +234,7 @@ struct handle_to_path_ctx { * get_name: * @get_name should find a name for the given @child in the given @parent * directory. The name should be stored in the @name (with the - * understanding that it is already pointing to a %NAME_MAX+1 sized + * understanding that it is already pointing to a %NAME_MAX + 1 sized * buffer. get_name() should return %0 on success, a negative error code * or error. @get_name will be called without @parent->i_rwsem held. * -- cgit v1.2.3 From fc76b5968a435894062ad4160c2e81c32cc4972e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Andr=C3=A9=20Almeida?= Date: Mon, 12 Jan 2026 22:51:25 -0300 Subject: exportfs: Mark struct export_operations functions at kernel-doc MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Adding a `@` before the function names make then recognizable as kernel-docs, so they get correctly rendered in the documentation. Even if they are already marked with `@` in the short one-line summary, the kernel-docs will correctly favor the more detailed definition here. Signed-off-by: André Almeida Link: https://patch.msgid.link/20260112-tonyk-fs_uuid-v1-2-acc1889de772@igalia.com Reviewed-by: Chuck Lever Signed-off-by: Christian Brauner --- include/linux/exportfs.h | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) (limited to 'include/linux') diff --git a/include/linux/exportfs.h b/include/linux/exportfs.h index 599ea86363e1..bed370b9f906 100644 --- a/include/linux/exportfs.h +++ b/include/linux/exportfs.h @@ -203,7 +203,7 @@ struct handle_to_path_ctx { * See Documentation/filesystems/nfs/exporting.rst for details on how to use * this interface correctly. * - * encode_fh: + * @encode_fh: * @encode_fh should store in the file handle fragment @fh (using at most * @max_len bytes) information that can be used by @decode_fh to recover the * file referred to by the &struct dentry @de. If @flag has CONNECTABLE bit @@ -215,7 +215,7 @@ struct handle_to_path_ctx { * greater than @max_len*4 bytes). On error @max_len contains the minimum * size(in 4 byte unit) needed to encode the file handle. * - * fh_to_dentry: + * @fh_to_dentry: * @fh_to_dentry is given a &struct super_block (@sb) and a file handle * fragment (@fh, @fh_len). It should return a &struct dentry which refers * to the same file that the file handle fragment refers to. If it cannot, @@ -227,29 +227,29 @@ struct handle_to_path_ctx { * created with d_alloc_root. The caller can then find any other extant * dentries by following the d_alias links. * - * fh_to_parent: + * @fh_to_parent: * Same as @fh_to_dentry, except that it returns a pointer to the parent * dentry if it was encoded into the filehandle fragment by @encode_fh. * - * get_name: + * @get_name: * @get_name should find a name for the given @child in the given @parent * directory. The name should be stored in the @name (with the * understanding that it is already pointing to a %NAME_MAX + 1 sized * buffer. get_name() should return %0 on success, a negative error code * or error. @get_name will be called without @parent->i_rwsem held. * - * get_parent: + * @get_parent: * @get_parent should find the parent directory for the given @child which * is also a directory. In the event that it cannot be found, or storage * space cannot be allocated, a %ERR_PTR should be returned. * - * permission: + * @permission: * Allow filesystems to specify a custom permission function. * - * open: + * @open: * Allow filesystems to specify a custom open function. * - * commit_metadata: + * @commit_metadata: * @commit_metadata should commit metadata changes to stable storage. * * Locking rules: -- cgit v1.2.3 From 7a6f811e2c06d656996776771f0498df129a0cc2 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Andr=C3=A9=20Almeida?= Date: Mon, 12 Jan 2026 22:51:26 -0300 Subject: exportfs: Complete kernel-doc for struct export_operations MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Write down the missing members definitions for struct export_operations, using as a reference the commit messages that created the members. Signed-off-by: André Almeida Link: https://patch.msgid.link/20260112-tonyk-fs_uuid-v1-3-acc1889de772@igalia.com Reviewed-by: Chuck Lever Signed-off-by: Christian Brauner --- include/linux/exportfs.h | 15 ++++++++++++++- 1 file changed, 14 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/exportfs.h b/include/linux/exportfs.h index bed370b9f906..262e24d83313 100644 --- a/include/linux/exportfs.h +++ b/include/linux/exportfs.h @@ -201,7 +201,7 @@ struct handle_to_path_ctx { * @commit_metadata: commit metadata changes to stable storage * * See Documentation/filesystems/nfs/exporting.rst for details on how to use - * this interface correctly. + * this interface correctly and the definition of the flags. * * @encode_fh: * @encode_fh should store in the file handle fragment @fh (using at most @@ -252,6 +252,19 @@ struct handle_to_path_ctx { * @commit_metadata: * @commit_metadata should commit metadata changes to stable storage. * + * @get_uuid: + * Get a filesystem unique signature exposed to clients. + * + * @map_blocks: + * Map and, if necessary, allocate blocks for a layout. + * + * @commit_blocks: + * Commit blocks in a layout once the client is done with them. + * + * @flags: + * Allows the filesystem to communicate to nfsd that it may want to do things + * differently when dealing with it. + * * Locking rules: * get_parent is called with child->d_inode->i_rwsem down * get_name is not (which is possibly inconsistent) -- cgit v1.2.3 From 6cbfdf89470ef3c2110f376a507d135e7a7a7378 Mon Sep 17 00:00:00 2001 From: Miklos Szeredi Date: Thu, 15 Jan 2026 13:23:40 +0100 Subject: posix_acl: make posix_acl_to_xattr() alloc the buffer Without exception all caller do that. So move the allocation into the helper. This reduces boilerplate and removes unnecessary error checking. Signed-off-by: Miklos Szeredi Link: https://patch.msgid.link/20260115122341.556026-1-mszeredi@redhat.com Signed-off-by: Christian Brauner --- include/linux/posix_acl_xattr.h | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/posix_acl_xattr.h b/include/linux/posix_acl_xattr.h index e86f3b731da2..9e1892525eac 100644 --- a/include/linux/posix_acl_xattr.h +++ b/include/linux/posix_acl_xattr.h @@ -44,8 +44,9 @@ posix_acl_from_xattr(struct user_namespace *user_ns, const void *value, } #endif -int posix_acl_to_xattr(struct user_namespace *user_ns, - const struct posix_acl *acl, void *buffer, size_t size); +extern void *posix_acl_to_xattr(struct user_namespace *user_ns, const struct posix_acl *acl, + size_t *sizep, gfp_t gfp); + static inline const char *posix_acl_xattr_name(int type) { switch (type) { -- cgit v1.2.3