From 64a989dbd144e0622371396461b11335459692d2 Mon Sep 17 00:00:00 2001 From: Benjamin Coddington Date: Thu, 27 Nov 2025 11:02:05 -0500 Subject: VFS/knfsd: Teach dentry_create() to use atomic_open() While knfsd offers combined exclusive create and open results to clients, on some filesystems those results may not be atomic. This behavior can be observed. For example, an open O_CREAT with mode 0 will succeed in creating the file but unexpectedly return -EACCES from vfs_open(). Additionally reducing the number of remote RPC calls required for O_CREAT on network filesystem provides a performance benefit in the open path. Teach knfsd's helper dentry_create() to use atomic_open() for filesystems that support it. The previously const @path is passed up to atomic_open() and may be modified depending on whether an existing entry was found or if the atomic_open() returned an error and consumed the passed-in dentry. Signed-off-by: Benjamin Coddington Link: https://patch.msgid.link/8e449bfb64ab055abb9fd82641a171531415a88c.1764259052.git.bcodding@hammerspace.com Reviewed-by: Jeff Layton Reviewed-by: Chuck Lever Signed-off-by: Christian Brauner --- include/linux/fs.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux/fs.h') diff --git a/include/linux/fs.h b/include/linux/fs.h index 04ceeca12a0d..1cb3385ee852 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -2457,7 +2457,7 @@ struct file *dentry_open(const struct path *path, int flags, const struct cred *creds); struct file *dentry_open_nonotify(const struct path *path, int flags, const struct cred *cred); -struct file *dentry_create(const struct path *path, int flags, umode_t mode, +struct file *dentry_create(struct path *path, int flags, umode_t mode, const struct cred *cred); const struct path *backing_file_user_path(const struct file *f); -- cgit v1.2.3 From 887e97745ec336c2f49b6c0af3c4cc00a5df3211 Mon Sep 17 00:00:00 2001 From: Mateusz Guzik Date: Wed, 3 Dec 2025 10:48:37 +0100 Subject: fs: track the inode having file locks with a flag in ->i_opflags Opening and closing an inode dirties the ->i_readcount field. Depending on the alignment of the inode, it may happen to false-share with other fields loaded both for both operations to various extent. This notably concerns the ->i_flctx field. Since most inodes don't have the field populated, this bit can be managed with a flag in ->i_opflags instead which bypasses the problem. Here are results I obtained while opening a file read-only in a loop with 24 cores doing the work on Sapphire Rapids. Utilizing the flag as opposed to reading ->i_flctx field was toggled at runtime as the benchmark was running, to make sure both results come from the same alignment. before: 3233740 after: 3373346 (+4%) before: 3284313 after: 3518711 (+7%) before: 3505545 after: 4092806 (+16%) Or to put it differently, this varies wildly depending on how (un)lucky you get. The primary bottleneck before and after is the avoidable lockref trip in do_dentry_open(). Reviewed-by: Jeff Layton Signed-off-by: Mateusz Guzik Link: https://patch.msgid.link/20251203094837.290654-2-mjguzik@gmail.com Signed-off-by: Christian Brauner --- include/linux/fs.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux/fs.h') diff --git a/include/linux/fs.h b/include/linux/fs.h index 04ceeca12a0d..094b0adcb035 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -631,6 +631,7 @@ is_uncached_acl(struct posix_acl *acl) #define IOP_MGTIME 0x0020 #define IOP_CACHED_LINK 0x0040 #define IOP_FASTPERM_MAY_EXEC 0x0080 +#define IOP_FLCTX 0x0100 /* * Inode state bits. Protected by inode->i_lock -- cgit v1.2.3 From 51a146e0595c638c58097a1660ff6b6e7d3b72f3 Mon Sep 17 00:00:00 2001 From: Eric Sandeen Date: Fri, 12 Dec 2025 11:44:03 -0600 Subject: fs: Remove internal old mount API code Now that the last in-tree filesystem has been converted to the new mount API, remove all legacy mount API code designed to handle un-converted filesystems, and remove associated documentation as well. (The code to handle the legacy mount(2) syscall from userspace is still in place, of course.) Tested with an allmodconfig build on x86_64, and a sanity check of an old mount(2) syscall mount. Signed-off-by: Eric Sandeen Link: https://patch.msgid.link/20251212174403.2882183-1-sandeen@redhat.com Signed-off-by: Christian Brauner --- include/linux/fs.h | 2 -- 1 file changed, 2 deletions(-) (limited to 'include/linux/fs.h') diff --git a/include/linux/fs.h b/include/linux/fs.h index 04ceeca12a0d..9949d253e5aa 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -2274,8 +2274,6 @@ struct file_system_type { #define FS_RENAME_DOES_D_MOVE 32768 /* FS will handle d_move() during rename() internally. */ int (*init_fs_context)(struct fs_context *); const struct fs_parameter_spec *parameters; - struct dentry *(*mount) (struct file_system_type *, int, - const char *, void *); void (*kill_sb) (struct super_block *); struct module *owner; struct file_system_type * next; -- cgit v1.2.3 From f7386f545e49e5e6229a14d92b39340d155b0b3f Mon Sep 17 00:00:00 2001 From: Joel Granados Date: Mon, 3 Nov 2025 22:29:08 +0100 Subject: sysctl: Remove unused ctl_table forward declarations Remove superfluous forward declarations of ctl_table from header files where they are no longer needed. These declarations were left behind after sysctl code refactoring and cleanup. Reviewed-by: Jan Kara Acked-by: Muchun Song Reviewed-by: Petr Mladek Acked-by: Paolo Abeni Signed-off-by: Joel Granados --- include/linux/fs.h | 1 - 1 file changed, 1 deletion(-) (limited to 'include/linux/fs.h') diff --git a/include/linux/fs.h b/include/linux/fs.h index 04ceeca12a0d..77f6302fdced 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -3487,7 +3487,6 @@ ssize_t simple_attr_write(struct file *file, const char __user *buf, ssize_t simple_attr_write_signed(struct file *file, const char __user *buf, size_t len, loff_t *ppos); -struct ctl_table; int __init list_bdev_fs_names(char *buf, size_t size); #define __FMODE_EXEC ((__force int) FMODE_EXEC) -- cgit v1.2.3 From 51e49111c00bee76ca403adf7cd617b71a9a0da4 Mon Sep 17 00:00:00 2001 From: Jeff Layton Date: Thu, 8 Jan 2026 12:13:19 -0500 Subject: fs: remove simple_nosetlease() Setting ->setlease() to a NULL pointer now has the same effect as setting it to simple_nosetlease(). Remove all of the setlease file_operations that are set to simple_nosetlease, and the function itself. Signed-off-by: Jeff Layton Link: https://patch.msgid.link/20260108-setlease-6-20-v1-24-ea4dec9b67fa@kernel.org Acked-by: Al Viro Acked-by: Christoph Hellwig Reviewed-by: Jan Kara Signed-off-by: Christian Brauner --- include/linux/fs.h | 1 - 1 file changed, 1 deletion(-) (limited to 'include/linux/fs.h') diff --git a/include/linux/fs.h b/include/linux/fs.h index f5c9cf28c4dc..e46e8aad9339 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -3217,7 +3217,6 @@ extern int always_delete_dentry(const struct dentry *); extern struct inode *alloc_anon_inode(struct super_block *); struct inode *anon_inode_make_secure_inode(struct super_block *sb, const char *name, const struct inode *context_inode); -extern int simple_nosetlease(struct file *, int, struct file_lease **, void **); extern struct dentry *simple_lookup(struct inode *, struct dentry *, unsigned int flags); extern ssize_t generic_read_dir(struct file *, char __user *, size_t, loff_t *); -- cgit v1.2.3 From c644bce62b9c6b441143a03c910f986109c47001 Mon Sep 17 00:00:00 2001 From: Amir Goldstein Date: Thu, 8 Jan 2026 08:45:22 +0100 Subject: readdir: require opt-in for d_type flags Commit c31f91c6af96 ("fuse: don't allow signals to interrupt getdents copying") introduced the use of high bits in d_type as flags. However, overlayfs was not adapted to handle this change. In ovl_cache_entry_new(), the code checks if d_type == DT_CHR to determine if an entry might be a whiteout. When fuse is used as the lower layer and sets high bits in d_type, this comparison fails, causing whiteout files to not be recognized properly and resulting in incorrect overlayfs behavior. Fix this by requiring callers of iterate_dir() to opt-in for getting flag bits in d_type outside of S_DT_MASK. Fixes: c31f91c6af96 ("fuse: don't allow signals to interrupt getdents copying") Link: https://lore.kernel.org/all/20260107034551.439-1-luochunsheng@ustc.edu/ Link: https://github.com/containerd/stargz-snapshotter/issues/2214 Reported-by: Chunsheng Luo Reviewed-by: Chunsheng Luo Tested-by: Chunsheng Luo Signed-off-by: Amir Goldstein Link: https://patch.msgid.link/20260108074522.3400998-1-amir73il@gmail.com Signed-off-by: Christian Brauner --- include/linux/fs.h | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) (limited to 'include/linux/fs.h') diff --git a/include/linux/fs.h b/include/linux/fs.h index f5c9cf28c4dc..a01621fa636a 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -1855,6 +1855,8 @@ struct dir_context { * INT_MAX unlimited */ int count; + /* @actor supports these flags in d_type high bits */ + unsigned int dt_flags_mask; }; /* If OR-ed with d_type, pending signals are not checked */ @@ -3524,7 +3526,9 @@ static inline bool dir_emit(struct dir_context *ctx, const char *name, int namelen, u64 ino, unsigned type) { - return ctx->actor(ctx, name, namelen, ctx->pos, ino, type); + unsigned int dt_mask = S_DT_MASK | ctx->dt_flags_mask; + + return ctx->actor(ctx, name, namelen, ctx->pos, ino, type & dt_mask); } static inline bool dir_emit_dot(struct file *file, struct dir_context *ctx) { -- cgit v1.2.3 From 20b781834ea0037b63c657e15b5aa4cfb4dd9b8b Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Thu, 8 Jan 2026 15:19:01 +0100 Subject: fs: remove inode_update_time The only external user is gone now, open code it in the two VFS callers. Signed-off-by: Christoph Hellwig Link: https://patch.msgid.link/20260108141934.2052404-2-hch@lst.de Reviewed-by: Jan Kara Reviewed-by: Chaitanya Kulkarni Reviewed-by: Jeff Layton Signed-off-by: Christian Brauner --- include/linux/fs.h | 1 - 1 file changed, 1 deletion(-) (limited to 'include/linux/fs.h') diff --git a/include/linux/fs.h b/include/linux/fs.h index 04ceeca12a0d..294e4c0b5aa8 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -2246,7 +2246,6 @@ enum file_time_flags { extern bool atime_needs_update(const struct path *, struct inode *); extern void touch_atime(const struct path *); -int inode_update_time(struct inode *inode, int flags); static inline void file_accessed(struct file *file) { -- cgit v1.2.3 From dc9629faef0a3d3cd35aff22806376700275a8b6 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Thu, 8 Jan 2026 15:19:02 +0100 Subject: fs: allow error returns from generic_update_time Now that no caller looks at the updated flags, switch generic_update_time to the same calling convention as the ->update_time method and return 0 or a negative errno. This prepares for adding non-blocking timestamp updates that could return -EAGAIN. Signed-off-by: Christoph Hellwig Link: https://patch.msgid.link/20260108141934.2052404-3-hch@lst.de Reviewed-by: Jan Kara Reviewed-by: Chaitanya Kulkarni Reviewed-by: Jeff Layton Signed-off-by: Christian Brauner --- include/linux/fs.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux/fs.h') diff --git a/include/linux/fs.h b/include/linux/fs.h index 294e4c0b5aa8..0983df0d0705 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -2399,7 +2399,7 @@ extern void ihold(struct inode * inode); extern void iput(struct inode *); void iput_not_last(struct inode *); int inode_update_timestamps(struct inode *inode, int flags); -int generic_update_time(struct inode *, int); +int generic_update_time(struct inode *inode, int flags); /* /sys/fs */ extern struct kobject *fs_kobj; -- cgit v1.2.3 From 761475268fa8e322fe6b80bcf557dc65517df71e Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Thu, 8 Jan 2026 15:19:05 +0100 Subject: fs: refactor ->update_time handling Pass the type of update (atime vs c/mtime plus version) as an enum instead of a set of flags that caused all kinds of confusion. Because inode_update_timestamps now can't return a modified version of those flags, return the I_DIRTY_* flags needed to persist the update, which is what the main caller in generic_update_time wants anyway, and which is suitable for the other callers that only want to know if an update happened. The whole update_time path keeps the flags argument, which will be used to support non-blocking updates soon even if it is unused, and (the slightly renamed) inode_update_time also gains the possibility to return a negative errno to support this. Signed-off-by: Christoph Hellwig Link: https://patch.msgid.link/20260108141934.2052404-6-hch@lst.de Reviewed-by: Jan Kara Signed-off-by: Christian Brauner --- include/linux/fs.h | 28 ++++++++++++++++++---------- 1 file changed, 18 insertions(+), 10 deletions(-) (limited to 'include/linux/fs.h') diff --git a/include/linux/fs.h b/include/linux/fs.h index 0983df0d0705..77985b4ed6ff 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -1717,6 +1717,13 @@ static inline struct timespec64 inode_set_ctime(struct inode *inode, struct timespec64 simple_inode_init_ts(struct inode *inode); +static inline int inode_time_dirty_flag(struct inode *inode) +{ + if (inode->i_sb->s_flags & SB_LAZYTIME) + return I_DIRTY_TIME; + return I_DIRTY_SYNC; +} + /* * Snapshotting support. */ @@ -1983,6 +1990,11 @@ int wrap_directory_iterator(struct file *, struct dir_context *, static int shared_##x(struct file *file , struct dir_context *ctx) \ { return wrap_directory_iterator(file, ctx, x); } +enum fs_update_time { + FS_UPD_ATIME, + FS_UPD_CMTIME, +}; + struct inode_operations { struct dentry * (*lookup) (struct inode *,struct dentry *, unsigned int); const char * (*get_link) (struct dentry *, struct inode *, struct delayed_call *); @@ -2010,7 +2022,8 @@ struct inode_operations { ssize_t (*listxattr) (struct dentry *, char *, size_t); int (*fiemap)(struct inode *, struct fiemap_extent_info *, u64 start, u64 len); - int (*update_time)(struct inode *, int); + int (*update_time)(struct inode *inode, enum fs_update_time type, + unsigned int flags); int (*atomic_open)(struct inode *, struct dentry *, struct file *, unsigned open_flag, umode_t create_mode); @@ -2237,13 +2250,6 @@ static inline void inode_dec_link_count(struct inode *inode) mark_inode_dirty(inode); } -enum file_time_flags { - S_ATIME = 1, - S_MTIME = 2, - S_CTIME = 4, - S_VERSION = 8, -}; - extern bool atime_needs_update(const struct path *, struct inode *); extern void touch_atime(const struct path *); @@ -2398,8 +2404,10 @@ static inline void super_set_sysfs_name_generic(struct super_block *sb, const ch extern void ihold(struct inode * inode); extern void iput(struct inode *); void iput_not_last(struct inode *); -int inode_update_timestamps(struct inode *inode, int flags); -int generic_update_time(struct inode *inode, int flags); +int inode_update_time(struct inode *inode, enum fs_update_time type, + unsigned int flags); +int generic_update_time(struct inode *inode, enum fs_update_time type, + unsigned int flags); /* /sys/fs */ extern struct kobject *fs_kobj; -- cgit v1.2.3 From 5cf06ea56ee67209d4e9a0b381641fb062ecd2c3 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Thu, 8 Jan 2026 15:19:07 +0100 Subject: fs: add a ->sync_lazytime method Allow the file system to explicitly implement lazytime syncing instead of pigging back on generic inode dirtying. This allows to simplify the XFS implementation and prepares for non-blocking lazytime timestamp updates. Signed-off-by: Christoph Hellwig Link: https://patch.msgid.link/20260108141934.2052404-8-hch@lst.de Reviewed-by: Chaitanya Kulkarni Reviewed-by: Jeff Layton Reviewed-by: Jan Kara Signed-off-by: Christian Brauner --- include/linux/fs.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux/fs.h') diff --git a/include/linux/fs.h b/include/linux/fs.h index 77985b4ed6ff..9cce8b9a29ac 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -2024,6 +2024,7 @@ struct inode_operations { u64 len); int (*update_time)(struct inode *inode, enum fs_update_time type, unsigned int flags); + void (*sync_lazytime)(struct inode *inode); int (*atomic_open)(struct inode *, struct dentry *, struct file *, unsigned open_flag, umode_t create_mode); -- cgit v1.2.3 From 24df85ffb9712cd6060588f6e08defcda5986efe Mon Sep 17 00:00:00 2001 From: Al Viro Date: Fri, 31 Oct 2025 13:16:03 -0400 Subject: allow to use CLASS() for struct filename * Not all users match that model, but most of them do. By the end of the series we'll be left with very few irregular ones... Added: CLASS(filename, name)(user_path) => getname(user_path) CLASS(filename_kernel, name)(string) => getname_kernel(string) CLASS(filename_flags, name)(user_path, flags) => getname_flags(user_path, flags) CLASS(filename_uflags, name)(user_path, flags) => getname_uflags(user_path, flags) CLASS(filename_maybe_null, name)(user_path, flags) => getname_maybe_null(user_path, flags) all with putname() as destructor. "flags" in filename_flags is in LOOKUP_... space, only LOOKUP_EMPTY matters. "flags" in filename_uflags and filename_maybe_null is in AT_...... space, and only AT_EMPTY_PATH matters. filename_flags conventions might be worth reconsidering later (it might or might not be better off with boolean instead) Signed-off-by: Al Viro --- include/linux/fs.h | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'include/linux/fs.h') diff --git a/include/linux/fs.h b/include/linux/fs.h index f5c9cf28c4dc..d49b969ab432 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -2517,6 +2517,12 @@ static inline struct filename *refname(struct filename *name) return name; } +DEFINE_CLASS(filename, struct filename *, putname(_T), getname(p), const char __user *p) +EXTEND_CLASS(filename, _kernel, getname_kernel(p), const char *p) +EXTEND_CLASS(filename, _flags, getname_flags(p, f), const char __user *p, unsigned int f) +EXTEND_CLASS(filename, _uflags, getname_uflags(p, f), const char __user *p, unsigned int f) +EXTEND_CLASS(filename, _maybe_null, getname_maybe_null(p, f), const char __user *p, unsigned int f) + extern int finish_open(struct file *file, struct dentry *dentry, int (*open)(struct inode *, struct file *)); extern int finish_no_open(struct file *file, struct dentry *dentry); -- cgit v1.2.3 From 41670a5900a8866b8cab52ab5936b5e9ef06fe91 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Sat, 1 Nov 2025 01:54:52 -0400 Subject: get rid of audit_reusename() Originally we tried to avoid multiple insertions into audit names array during retry loop by a cute hack - memorize the userland pointer and if there already is a match, just grab an extra reference to it. Cute as it had been, it had problems - two identical pointers had audit aux entries merged, two identical strings did not. Having different behaviour for syscalls that differ only by addresses of otherwise identical string arguments is obviously wrong - if nothing else, compiler can decide to merge identical string literals. Besides, this hack does nothing for non-audited processes - they get a fresh copy for retry. It's not time-critical, but having behaviour subtly differ that way is bogus. These days we have very few places that import filename more than once (9 functions total) and it's easy to massage them so we get rid of all re-imports. With that done, we don't need audit_reusename() anymore. There's no need to memorize userland pointer either. Acked-by: Paul Moore Signed-off-by: Al Viro --- include/linux/fs.h | 1 - 1 file changed, 1 deletion(-) (limited to 'include/linux/fs.h') diff --git a/include/linux/fs.h b/include/linux/fs.h index d49b969ab432..abe9c95c4874 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -2411,7 +2411,6 @@ extern struct kobject *fs_kobj; struct audit_names; struct filename { const char *name; /* pointer to actual string */ - const __user char *uptr; /* original userland pointer */ atomic_t refcnt; struct audit_names *aname; const char iname[]; -- cgit v1.2.3 From c3a3577cdb351e74d6ff6bc328c3bee18ce69298 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Wed, 19 Nov 2025 19:19:24 -0500 Subject: struct filename: use names_cachep only for getname() and friends Instances of struct filename come from names_cachep (via __getname()). That is done by getname_flags() and getname_kernel() and these two are the main callers of __getname(). However, there are other callers that simply want to allocate PATH_MAX bytes for uses that have nothing to do with struct filename. We want saner allocation rules for long pathnames, so that struct filename would *always* come from names_cachep, with the out-of-line pathname getting kmalloc'ed. For that we need to be able to change the size of objects allocated by getname_flags()/getname_kernel(). That requires the rest of __getname() users to stop using names_cachep; we could explicitly switch all of those to kmalloc(), but that would cause quite a bit of noise. So the plan is to switch getname_...() to new helpers and turn __getname() into a wrapper for kmalloc(). Remaining __getname() users could be converted to explicit kmalloc() at leisure, hopefully along with figuring out what size do they really want - PATH_MAX is an overkill for some of them, used out of laziness ("we have a convenient helper that does 4K allocations and that's large enough, let's use it"). As a side benefit, names_cachep is no longer used outside of fs/namei.c, so we can move it there and be done with that. Signed-off-by: Al Viro --- include/linux/fs.h | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) (limited to 'include/linux/fs.h') diff --git a/include/linux/fs.h b/include/linux/fs.h index abe9c95c4874..997d515bab32 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -2539,10 +2539,8 @@ static inline int finish_open_simple(struct file *file, int error) extern void __init vfs_caches_init_early(void); extern void __init vfs_caches_init(void); -extern struct kmem_cache *names_cachep; - -#define __getname() kmem_cache_alloc(names_cachep, GFP_KERNEL) -#define __putname(name) kmem_cache_free(names_cachep, (void *)(name)) +#define __getname() kmalloc(PATH_MAX, GFP_KERNEL) +#define __putname(name) kfree(name) void emergency_thaw_all(void); extern int sync_filesystem(struct super_block *); -- cgit v1.2.3 From 8c888b31903cc2acfbf054c23d702caf68857810 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Wed, 19 Nov 2025 19:45:04 -0500 Subject: struct filename: saner handling of long names Always allocate struct filename from names_cachep, long name or short; short names would be embedded into struct filename. Longer ones do not cannibalize the original struct filename - put them into kmalloc'ed buffers (PATH_MAX-sized for import from userland, strlen() + 1 - for ones originating kernel-side, where we know the length beforehand). Cutoff length for short names is chosen so that struct filename would be 192 bytes long - that's both a multiple of 64 and large enough to cover the majority of real-world uses. Simplifies logics in getname()/putname() and friends. [fixed an embarrassing braino in EMBEDDED_NAME_MAX, first reported by Dan Carpenter] Signed-off-by: Al Viro --- include/linux/fs.h | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) (limited to 'include/linux/fs.h') diff --git a/include/linux/fs.h b/include/linux/fs.h index 997d515bab32..f0f1e8034539 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -2409,13 +2409,19 @@ extern struct kobject *fs_kobj; /* fs/open.c */ struct audit_names; -struct filename { + +struct __filename_head { const char *name; /* pointer to actual string */ atomic_t refcnt; struct audit_names *aname; - const char iname[]; +}; +#define EMBEDDED_NAME_MAX (192 - sizeof(struct __filename_head)) +struct filename { + struct __filename_head; + const char iname[EMBEDDED_NAME_MAX]; }; static_assert(offsetof(struct filename, iname) % sizeof(long) == 0); +static_assert(sizeof(struct filename) % 64 == 0); static inline struct mnt_idmap *file_mnt_idmap(const struct file *file) { -- cgit v1.2.3 From 9fa3ec84587c5eca7580eafc27eee332bc3a5a0e Mon Sep 17 00:00:00 2001 From: Al Viro Date: Sat, 1 Nov 2025 20:29:06 -0400 Subject: allow incomplete imports of filenames There are two filename-related problems in io_uring and its interplay with audit. Filenames are imported when request is submitted and used when it is processed. Unfortunately, the latter may very well happen in a different thread. In that case the reference to filename is put into the wrong audit_context - that of submitting thread, not the processing one. Audit logics is called by the latter, and it really wants to be able to find the names in audit_context current (== processing) thread. Another related problem is the headache with refcounts - normally all references to given struct filename are visible only to one thread (the one that uses that struct filename). io_uring violates that - an extra reference is stashed in audit_context of submitter. It gets dropped when submitter returns to userland, which can happen simultaneously with processing thread deciding to drop the reference it got. We paper over that by making refcount atomic, but that means pointless headache for everyone. Solution: the notion of partially imported filenames. Namely, already copied from userland, but *not* exposed to audit yet. io_uring can create that in submitter thread, and complete the import (obtaining the usual reference to struct filename) in processing thread. Object: struct delayed_filename. Primitives for working with it: delayed_getname(&delayed_filename, user_string) - copies the name from userland, returning 0 and stashing the address of (still incomplete) struct filename in delayed_filename on success and returning -E... on error. delayed_getname_uflags(&delayed_filename, user_string, atflags) - similar, in the same relation to delayed_getname() as getname_uflags() is to getname() complete_getname(&delayed_filename) - completes the import of filename stashed in delayed_filename and returns struct filename to caller, emptying delayed_filename. CLASS(filename_complete_delayed, name)(&delayed_filename) - variant of CLASS(filename) with complete_getname() for constructor. dismiss_delayed_filename(&delayed_filename) - destructor; drops whatever might be stashed in delayed_filename, emptying it. putname_to_delayed(&delayed_filename, name) - if name is shared, stashes its copy into delayed_filename and drops the reference to name, otherwise stashes the name itself in there. Signed-off-by: Al Viro --- include/linux/fs.h | 12 ++++++++++++ 1 file changed, 12 insertions(+) (limited to 'include/linux/fs.h') diff --git a/include/linux/fs.h b/include/linux/fs.h index f0f1e8034539..f1612a7dffd0 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -2516,6 +2516,17 @@ static inline struct filename *getname_maybe_null(const char __user *name, int f extern void putname(struct filename *name); DEFINE_FREE(putname, struct filename *, if (!IS_ERR_OR_NULL(_T)) putname(_T)) +struct delayed_filename { + struct filename *__incomplete_filename; // don't touch +}; +#define INIT_DELAYED_FILENAME(ptr) \ + ((void)(*(ptr) = (struct delayed_filename){})) +int delayed_getname(struct delayed_filename *, const char __user *); +int delayed_getname_uflags(struct delayed_filename *v, const char __user *, int); +void dismiss_delayed_filename(struct delayed_filename *); +int putname_to_delayed(struct delayed_filename *, struct filename *); +struct filename *complete_getname(struct delayed_filename *); + static inline struct filename *refname(struct filename *name) { atomic_inc(&name->refcnt); @@ -2527,6 +2538,7 @@ EXTEND_CLASS(filename, _kernel, getname_kernel(p), const char *p) EXTEND_CLASS(filename, _flags, getname_flags(p, f), const char __user *p, unsigned int f) EXTEND_CLASS(filename, _uflags, getname_uflags(p, f), const char __user *p, unsigned int f) EXTEND_CLASS(filename, _maybe_null, getname_maybe_null(p, f), const char __user *p, unsigned int f) +EXTEND_CLASS(filename, _complete_delayed, complete_getname(p), struct delayed_filename *p) extern int finish_open(struct file *file, struct dentry *dentry, int (*open)(struct inode *, struct file *)); -- cgit v1.2.3 From 741c97fecb6a4160014a76759e9b8c0880fc44f1 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Sun, 2 Nov 2025 01:01:47 -0400 Subject: struct filename ->refcnt doesn't need to be atomic ... or visible outside of audit, really. Note that references held in delayed_filename always have refcount 1, and from the moment of complete_getname() or equivalent point in getname...() there won't be any references to struct filename instance left in places visible to other threads. Acked-by: Paul Moore Signed-off-by: Al Viro --- include/linux/fs.h | 8 +------- 1 file changed, 1 insertion(+), 7 deletions(-) (limited to 'include/linux/fs.h') diff --git a/include/linux/fs.h b/include/linux/fs.h index f1612a7dffd0..6a26ee347517 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -2412,7 +2412,7 @@ struct audit_names; struct __filename_head { const char *name; /* pointer to actual string */ - atomic_t refcnt; + int refcnt; struct audit_names *aname; }; #define EMBEDDED_NAME_MAX (192 - sizeof(struct __filename_head)) @@ -2527,12 +2527,6 @@ void dismiss_delayed_filename(struct delayed_filename *); int putname_to_delayed(struct delayed_filename *, struct filename *); struct filename *complete_getname(struct delayed_filename *); -static inline struct filename *refname(struct filename *name) -{ - atomic_inc(&name->refcnt); - return name; -} - DEFINE_CLASS(filename, struct filename *, putname(_T), getname(p), const char __user *p) EXTEND_CLASS(filename, _kernel, getname_kernel(p), const char *p) EXTEND_CLASS(filename, _flags, getname_flags(p, f), const char __user *p, unsigned int f) -- cgit v1.2.3 From 173e937552432db9406f04eb7905541b774ac7cd Mon Sep 17 00:00:00 2001 From: Filipe Manana Date: Tue, 13 Jan 2026 12:39:50 +0000 Subject: fs: export may_delete() as may_delete_dentry() For many years btrfs as been using a copy of may_delete() in fs/btrfs/ioctl.c:btrfs_may_delete(). Everytime may_delete() is updated we need to update the btrfs copy, and this is a maintenance burden. Currently there are minor differences between both because the btrfs side lacks updates done in may_delete(). Export may_delete() so that btrfs can use it and with the less generic name may_delete_dentry(). While at it change the calls in vfs_rmdir() to pass a boolean literal instead of 1 and 0 as the last argument since the argument has a bool type. Signed-off-by: Filipe Manana Link: https://patch.msgid.link/e09128fd53f01b19d0a58f0e7d24739f79f47f6d.1768307858.git.fdmanana@suse.com Reviewed-by: David Sterba Signed-off-by: Christian Brauner --- include/linux/fs.h | 3 +++ 1 file changed, 3 insertions(+) (limited to 'include/linux/fs.h') diff --git a/include/linux/fs.h b/include/linux/fs.h index 04ceeca12a0d..06632783a76c 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -2657,6 +2657,9 @@ static inline int path_permission(const struct path *path, int mask) int __check_sticky(struct mnt_idmap *idmap, struct inode *dir, struct inode *inode); +int may_delete_dentry(struct mnt_idmap *idmap, struct inode *dir, + struct dentry *victim, bool isdir); + static inline bool execute_ok(struct inode *inode) { return (inode->i_mode & S_IXUGO) || S_ISDIR(inode->i_mode); -- cgit v1.2.3 From 26aab3a485d500cb89ef7340797982bd066f63a5 Mon Sep 17 00:00:00 2001 From: Filipe Manana Date: Tue, 13 Jan 2026 12:39:51 +0000 Subject: fs: export may_create() as may_create_dentry() For many years btrfs as been using a copy of may_create() in fs/btrfs/ioctl.c:btrfs_may_create(). Everytime may_create() is updated we need to update the btrfs copy, and this is a maintenance burden. Currently there are minor differences between both because the btrfs side lacks updates done in may_create(). Export may_create() so that btrfs can use it and with the less generic name may_create_dentry(). Signed-off-by: Filipe Manana Link: https://patch.msgid.link/ce5174bca079f4cdcbb8dd145f0924feb1f227cd.1768307858.git.fdmanana@suse.com Reviewed-by: David Sterba Signed-off-by: Christian Brauner --- include/linux/fs.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/linux/fs.h') diff --git a/include/linux/fs.h b/include/linux/fs.h index 06632783a76c..2d28eff6eb6a 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -2659,6 +2659,8 @@ int __check_sticky(struct mnt_idmap *idmap, struct inode *dir, int may_delete_dentry(struct mnt_idmap *idmap, struct inode *dir, struct dentry *victim, bool isdir); +int may_create_dentry(struct mnt_idmap *idmap, + struct inode *dir, struct dentry *child); static inline bool execute_ok(struct inode *inode) { -- cgit v1.2.3 From 55fb177d3a0346106974749374ae2191ba250825 Mon Sep 17 00:00:00 2001 From: Amir Goldstein Date: Wed, 28 Jan 2026 14:24:05 +0100 Subject: fs: add helpers name_is_dot{,dot,_dotdot} Rename the helper is_dot_dotdot() into the name_ namespace and add complementary helpers to check for dot and dotdot names individually. Signed-off-by: Amir Goldstein Link: https://patch.msgid.link/20260128132406.23768-3-amir73il@gmail.com Reviewed-by: Eric Biggers Signed-off-by: Christian Brauner --- include/linux/fs.h | 14 ++++++++++++-- 1 file changed, 12 insertions(+), 2 deletions(-) (limited to 'include/linux/fs.h') diff --git a/include/linux/fs.h b/include/linux/fs.h index 094b0adcb035..95bb9a15e109 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -2844,12 +2844,22 @@ u64 vfsmount_to_propagation_flags(struct vfsmount *mnt); extern char *file_path(struct file *, char *, int); +static inline bool name_is_dot(const char *name, size_t len) +{ + return unlikely(len == 1 && name[0] == '.'); +} + +static inline bool name_is_dotdot(const char *name, size_t len) +{ + return unlikely(len == 2 && name[0] == '.' && name[1] == '.'); +} + /** - * is_dot_dotdot - returns true only if @name is "." or ".." + * name_is_dot_dotdot - returns true only if @name is "." or ".." * @name: file name to check * @len: length of file name, in bytes */ -static inline bool is_dot_dotdot(const char *name, size_t len) +static inline bool name_is_dot_dotdot(const char *name, size_t len) { return len && unlikely(name[0] == '.') && (len == 1 || (len == 2 && name[1] == '.')); -- cgit v1.2.3