From 73fa7547c70b32cc69685f79be31135797734eb6 Mon Sep 17 00:00:00 2001 From: Rich Felker Date: Mon, 31 Aug 2020 11:32:08 -0400 Subject: vfs: add RWF_NOAPPEND flag for pwritev2 The pwrite function, originally defined by POSIX (thus the "p"), is defined to ignore O_APPEND and write at the offset passed as its argument. However, historically Linux honored O_APPEND if set and ignored the offset. This cannot be changed due to stability policy, but is documented in the man page as a bug. Now that there's a pwritev2 syscall providing a superset of the pwrite functionality that has a flags argument, the conforming behavior can be offered to userspace via a new flag. Since pwritev2 checks flag validity (in kiocb_set_rw_flags) and reports unknown ones with EOPNOTSUPP, callers will not get wrong behavior on old kernels that don't support the new flag; the error is reported and the caller can decide how to handle it. Signed-off-by: Rich Felker Link: https://lore.kernel.org/r/20200831153207.GO3265@brightrain.aerifal.cx Reviewed-by: Jann Horn Signed-off-by: Christian Brauner --- include/linux/fs.h | 8 ++++++++ 1 file changed, 8 insertions(+) (limited to 'include/linux/fs.h') diff --git a/include/linux/fs.h b/include/linux/fs.h index ed5966a70495..4f7cfda29143 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -3335,6 +3335,8 @@ static inline int kiocb_set_rw_flags(struct kiocb *ki, rwf_t flags) return 0; if (unlikely(flags & ~RWF_SUPPORTED)) return -EOPNOTSUPP; + if (unlikely((flags & RWF_APPEND) && (flags & RWF_NOAPPEND))) + return -EINVAL; if (flags & RWF_NOWAIT) { if (!(ki->ki_filp->f_mode & FMODE_NOWAIT)) @@ -3345,6 +3347,12 @@ static inline int kiocb_set_rw_flags(struct kiocb *ki, rwf_t flags) if (flags & RWF_SYNC) kiocb_flags |= IOCB_DSYNC; + if ((flags & RWF_NOAPPEND) && (ki->ki_flags & IOCB_APPEND)) { + if (IS_APPEND(file_inode(ki->ki_filp))) + return -EPERM; + ki->ki_flags &= ~IOCB_APPEND; + } + ki->ki_flags |= kiocb_flags; return 0; } -- cgit v1.2.3 From 42c3732fa8073717dd7d924472f1c0bc5b452fdc Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Sat, 30 Dec 2023 19:46:00 -0500 Subject: fs: Create a generic is_dot_dotdot() utility De-duplicate the same functionality in several places by hoisting the is_dot_dotdot() utility function into linux/fs.h. Suggested-by: Amir Goldstein Reviewed-by: Jeff Layton Reviewed-by: Amir Goldstein Acked-by: Christian Brauner Signed-off-by: Chuck Lever --- include/linux/fs.h | 11 +++++++++++ 1 file changed, 11 insertions(+) (limited to 'include/linux/fs.h') diff --git a/include/linux/fs.h b/include/linux/fs.h index 98b7a7a8c42e..baa64344a308 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -2846,6 +2846,17 @@ extern bool path_is_under(const struct path *, const struct path *); extern char *file_path(struct file *, char *, int); +/** + * is_dot_dotdot - returns true only if @name is "." or ".." + * @name: file name to check + * @len: length of file name, in bytes + */ +static inline bool is_dot_dotdot(const char *name, size_t len) +{ + return len && unlikely(name[0] == '.') && + (len == 1 || (len == 2 && name[1] == '.')); +} + #include /* needed for stackable file system support */ -- cgit v1.2.3 From d8f899d13d72d285db43dbb9df1acaed22d8c4e7 Mon Sep 17 00:00:00 2001 From: Baokun Li Date: Wed, 24 Jan 2024 22:28:55 +0800 Subject: fs: make the i_size_read/write helpers be smp_load_acquire/store_release() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit In [Link] Linus mentions that acquire/release makes it clear which _particular_ memory accesses are the ordered ones, and it's unlikely to make any performance difference, so it's much better to pair up the release->acquire ordering than have a "wmb->rmb" ordering. ========================================================= update pagecache folio_mark_uptodate(folio) smp_wmb() set_bit PG_uptodate === ↑↑↑ STLR ↑↑↑ === smp_store_release(&inode->i_size, i_size) folio_test_uptodate(folio) test_bit PG_uptodate smp_rmb() === ↓↓↓ LDAR ↓↓↓ === smp_load_acquire(&inode->i_size) copy_page_to_iter() ========================================================= Calling smp_store_release() in i_size_write() ensures that the data in the page and the PG_uptodate bit are updated before the isize is updated, and calling smp_load_acquire() in i_size_read ensures that it will not read a newer isize than the data in the page. Therefore, this avoids buffered read-write inconsistencies caused by Load-Load reordering. Link: https://lore.kernel.org/r/CAHk-=wifOnmeJq+sn+2s-P46zw0SFEbw9BSCGgp2c5fYPtRPGw@mail.gmail.com/ Suggested-by: Linus Torvalds Signed-off-by: Baokun Li Link: https://lore.kernel.org/r/20240124142857.4146716-2-libaokun1@huawei.com Signed-off-by: Christian Brauner --- include/linux/fs.h | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) (limited to 'include/linux/fs.h') diff --git a/include/linux/fs.h b/include/linux/fs.h index 4f6669147b9e..ebce4763b4bb 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -907,7 +907,8 @@ static inline loff_t i_size_read(const struct inode *inode) preempt_enable(); return i_size; #else - return inode->i_size; + /* Pairs with smp_store_release() in i_size_write() */ + return smp_load_acquire(&inode->i_size); #endif } @@ -929,7 +930,12 @@ static inline void i_size_write(struct inode *inode, loff_t i_size) inode->i_size = i_size; preempt_enable(); #else - inode->i_size = i_size; + /* + * Pairs with smp_load_acquire() in i_size_read() to ensure + * changes related to inode size (such as page contents) are + * visible before we see the changed inode size. + */ + smp_store_release(&inode->i_size, i_size); #endif } -- cgit v1.2.3 From d3b1a9a778e1a014c5331d1e8d4863fd999eb0b5 Mon Sep 17 00:00:00 2001 From: JonasZhou Date: Fri, 2 Feb 2024 16:33:04 +0800 Subject: fs/address_space: move i_mmap_rwsem to mitigate a false sharing with i_mmap. In the struct address_space, there is a 32-byte gap between i_mmap and i_mmap_rwsem. Due to the alignment of struct address_space variables to 8 bytes, in certain situations, i_mmap and i_mmap_rwsem may end up in the same CACHE line. While running Unixbench/execl, we observe high false sharing issues when accessing i_mmap against i_mmap_rwsem. We move i_mmap_rwsem after i_private_list, ensuring a 64-byte gap between i_mmap and i_mmap_rwsem. For Intel Silver machines (2 sockets) using kernel v6.8 rc-2, the score of Unixbench/execl improves by ~3.94%, and the score of Unixbench/shell improves by ~3.26%. Baseline: ------------------------------------------------------------- 162 546 748 11374 21 0xffff92e266af90c0 ------------------------------------------------------------- 46.89% 44.65% 0.00% 0.00% 0x0 1 1 0xffffffff86d5fb96 460 258 271 1069 32 [k] __handle_mm_fault [kernel.vmlinux] memory.c:2940 0 1 4.21% 4.41% 0.00% 0.00% 0x4 1 1 0xffffffff86d0ed54 473 311 288 95 28 [k] filemap_read [kernel.vmlinux] atomic.h:23 0 1 0.00% 0.00% 0.04% 4.76% 0x8 1 1 0xffffffff86d4bcf1 0 0 0 5 4 [k] vma_interval_tree_remove [kernel.vmlinux] rbtree_augmented.h:204 0 1 6.41% 6.02% 0.00% 0.00% 0x8 1 1 0xffffffff86d4ba85 411 271 339 210 32 [k] vma_interval_tree_insert [kernel.vmlinux] interval_tree.c:23 0 1 0.00% 0.00% 0.47% 95.24% 0x10 1 1 0xffffffff86d4bd34 0 0 0 74 32 [k] vma_interval_tree_remove [kernel.vmlinux] rbtree_augmented.h:339 0 1 0.37% 0.13% 0.00% 0.00% 0x10 1 1 0xffffffff86d4bb4f 328 212 380 7 5 [k] vma_interval_tree_remove [kernel.vmlinux] rbtree_augmented.h:338 0 1 5.13% 5.08% 0.00% 0.00% 0x10 1 1 0xffffffff86d4bb4b 416 255 357 197 32 [k] vma_interval_tree_remove [kernel.vmlinux] rbtree_augmented.h:338 0 1 1.10% 0.53% 0.00% 0.00% 0x28 1 1 0xffffffff86e06eb8 395 228 351 24 14 [k] do_dentry_open [kernel.vmlinux] open.c:966 0 1 1.10% 2.14% 57.07% 0.00% 0x38 1 1 0xffffffff878c9225 1364 792 462 7003 32 [k] down_write [kernel.vmlinux] atomic64_64.h:109 0 1 0.00% 0.00% 0.01% 0.00% 0x38 1 1 0xffffffff878c8e75 0 0 252 3 2 [k] rwsem_down_write_slowpath [kernel.vmlinux] atomic64_64.h:109 0 1 0.00% 0.13% 0.00% 0.00% 0x38 1 1 0xffffffff878c8e23 0 596 63 2 2 [k] rwsem_down_write_slowpath [kernel.vmlinux] atomic64_64.h:15 0 1 2.38% 2.94% 6.53% 0.00% 0x38 1 1 0xffffffff878c8ccb 1150 818 570 1197 32 [k] rwsem_down_write_slowpath [kernel.vmlinux] atomic64_64.h:109 0 1 30.59% 32.22% 0.00% 0.00% 0x38 1 1 0xffffffff878c8cb4 423 251 380 648 32 [k] rwsem_down_write_slowpath [kernel.vmlinux] atomic64_64.h:15 0 1 1.83% 1.74% 35.88% 0.00% 0x38 1 1 0xffffffff86b4f833 1217 1112 565 4586 32 [k] up_write [kernel.vmlinux] atomic64_64.h:91 0 1 with this change: ------------------------------------------------------------- 360 12 300 57 35 0xffff982cdae76400 ------------------------------------------------------------- 50.00% 59.67% 0.00% 0.00% 0x0 1 1 0xffffffff8215fb86 352 200 191 558 32 [k] __handle_mm_fault [kernel.vmlinux] memory.c:2940 0 1 8.33% 5.00% 0.00% 0.00% 0x4 1 1 0xffffffff8210ed44 370 284 263 42 24 [k] filemap_read [kernel.vmlinux] atomic.h:23 0 1 0.00% 0.00% 5.26% 2.86% 0x8 1 1 0xffffffff8214bce1 0 0 0 4 4 [k] vma_interval_tree_remove [kernel.vmlinux] rbtree_augmented.h:204 0 1 33.33% 14.33% 0.00% 0.00% 0x8 1 1 0xffffffff8214ba75 344 186 219 140 32 [k] vma_interval_tree_insert [kernel.vmlinux] interval_tree.c:23 0 1 0.00% 0.00% 94.74% 97.14% 0x10 1 1 0xffffffff8214bd24 0 0 0 88 29 [k] vma_interval_tree_remove [kernel.vmlinux] rbtree_augmented.h:339 0 1 8.33% 20.00% 0.00% 0.00% 0x10 1 1 0xffffffff8214bb3b 296 209 226 167 31 [k] vma_interval_tree_remove [kernel.vmlinux] rbtree_augmented.h:338 0 1 0.00% 0.67% 0.00% 0.00% 0x28 1 1 0xffffffff82206f45 0 140 334 4 3 [k] do_dentry_open [kernel.vmlinux] open.c:966 0 1 0.00% 0.33% 0.00% 0.00% 0x38 1 1 0xffffffff8250a6c4 0 286 126 5 5 [k] errseq_sample [kernel.vmlinux] errseq.c:125 0 Signed-off-by: JonasZhou Link: https://lore.kernel.org/r/20240202083304.10995-1-JonasZhou-oc@zhaoxin.com Signed-off-by: Christian Brauner --- include/linux/fs.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux/fs.h') diff --git a/include/linux/fs.h b/include/linux/fs.h index ebce4763b4bb..9efd6220b7c6 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -482,10 +482,10 @@ struct address_space { pgoff_t writeback_index; const struct address_space_operations *a_ops; unsigned long flags; - struct rw_semaphore i_mmap_rwsem; errseq_t wb_err; spinlock_t i_private_lock; struct list_head i_private_list; + struct rw_semaphore i_mmap_rwsem; void * i_private_data; } __attribute__((aligned(sizeof(long)))) __randomize_layout; /* -- cgit v1.2.3 From c69ff4071935f946f1cddc59e1d36a03442ed015 Mon Sep 17 00:00:00 2001 From: Jeff Layton Date: Wed, 31 Jan 2024 18:02:28 -0500 Subject: filelock: split leases out of struct file_lock Add a new struct file_lease and move the lease-specific fields from struct file_lock to it. Convert the appropriate API calls to take struct file_lease instead, and convert the callers to use them. There is zero overlap between the lock manager operations for file locks and the ones for file leases, so split the lease-related operations off into a new lease_manager_operations struct. Signed-off-by: Jeff Layton Link: https://lore.kernel.org/r/20240131-flsplit-v3-47-c6129007ee8d@kernel.org Reviewed-by: NeilBrown Signed-off-by: Christian Brauner --- include/linux/fs.h | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'include/linux/fs.h') diff --git a/include/linux/fs.h b/include/linux/fs.h index ed5966a70495..162877197bf1 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -1064,6 +1064,7 @@ struct file *get_file_active(struct file **f); typedef void *fl_owner_t; struct file_lock; +struct file_lease; /* The following constant reflects the upper bound of the file/locking space */ #ifndef OFFSET_MAX @@ -2005,7 +2006,7 @@ struct file_operations { ssize_t (*splice_write)(struct pipe_inode_info *, struct file *, loff_t *, size_t, unsigned int); ssize_t (*splice_read)(struct file *, loff_t *, struct pipe_inode_info *, size_t, unsigned int); void (*splice_eof)(struct file *file); - int (*setlease)(struct file *, int, struct file_lock **, void **); + int (*setlease)(struct file *, int, struct file_lease **, void **); long (*fallocate)(struct file *file, int mode, loff_t offset, loff_t len); void (*show_fdinfo)(struct seq_file *m, struct file *f); @@ -3238,7 +3239,7 @@ extern int simple_write_begin(struct file *file, struct address_space *mapping, extern const struct address_space_operations ram_aops; extern int always_delete_dentry(const struct dentry *); extern struct inode *alloc_anon_inode(struct super_block *); -extern int simple_nosetlease(struct file *, int, struct file_lock **, void **); +extern int simple_nosetlease(struct file *, int, struct file_lease **, void **); extern const struct dentry_operations simple_dentry_operations; extern struct dentry *simple_lookup(struct inode *, struct dentry *, unsigned int flags); -- cgit v1.2.3 From fe3944fb245ab99570552a3bf970b00058a9ca6d Mon Sep 17 00:00:00 2001 From: Bart Van Assche Date: Fri, 2 Feb 2024 12:39:23 -0800 Subject: fs: Move enum rw_hint into a new header file Move enum rw_hint into a new header file to prepare for using this data type in the block layer. Add the attribute __packed to reduce the space occupied by instances of this data type from four bytes to one byte. Change the data type of i_write_hint from u8 into enum rw_hint. Reviewed-by: Christoph Hellwig Acked-by: Chao Yu # for the F2FS part Cc: Alexander Viro Cc: Christian Brauner Cc: Jan Kara Cc: Christoph Hellwig Signed-off-by: Bart Van Assche Link: https://lore.kernel.org/r/20240202203926.2478590-5-bvanassche@acm.org Signed-off-by: Christian Brauner --- include/linux/fs.h | 16 ++-------------- 1 file changed, 2 insertions(+), 14 deletions(-) (limited to 'include/linux/fs.h') diff --git a/include/linux/fs.h b/include/linux/fs.h index ed5966a70495..bdabda5dc364 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -43,6 +43,7 @@ #include #include #include +#include #include #include @@ -309,19 +310,6 @@ struct address_space; struct writeback_control; struct readahead_control; -/* - * Write life time hint values. - * Stored in struct inode as u8. - */ -enum rw_hint { - WRITE_LIFE_NOT_SET = 0, - WRITE_LIFE_NONE = RWH_WRITE_LIFE_NONE, - WRITE_LIFE_SHORT = RWH_WRITE_LIFE_SHORT, - WRITE_LIFE_MEDIUM = RWH_WRITE_LIFE_MEDIUM, - WRITE_LIFE_LONG = RWH_WRITE_LIFE_LONG, - WRITE_LIFE_EXTREME = RWH_WRITE_LIFE_EXTREME, -}; - /* Match RWF_* bits to IOCB bits */ #define IOCB_HIPRI (__force int) RWF_HIPRI #define IOCB_DSYNC (__force int) RWF_DSYNC @@ -677,7 +665,7 @@ struct inode { spinlock_t i_lock; /* i_blocks, i_bytes, maybe i_size */ unsigned short i_bytes; u8 i_blkbits; - u8 i_write_hint; + enum rw_hint i_write_hint; blkcnt_t i_blocks; #ifdef __NEED_I_SIZE_ORDERED -- cgit v1.2.3 From 3058fca1ed7955c904584a6d86108d664a927177 Mon Sep 17 00:00:00 2001 From: Amir Goldstein Date: Fri, 2 Feb 2024 13:01:31 +0200 Subject: fs: make file_dentry() a simple accessor file_dentry() is a relic from the days that overlayfs was using files with a "fake" path, meaning, f_path on overlayfs and f_inode on underlying fs. In those days, file_dentry() was needed to get the underlying fs dentry that matches f_inode. Files with "fake" path should not exist nowadays, so make file_dentry() a simple accessor and use an assertion to make sure that file_dentry() was not papering over filesystem bugs. Signed-off-by: Amir Goldstein Link: https://lore.kernel.org/r/20240202110132.1584111-2-amir73il@gmail.com Tested-by: Stefan Berger Signed-off-by: Christian Brauner --- include/linux/fs.h | 13 ++++++++++++- 1 file changed, 12 insertions(+), 1 deletion(-) (limited to 'include/linux/fs.h') diff --git a/include/linux/fs.h b/include/linux/fs.h index 9efd6220b7c6..2e07cbbf92e3 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -1084,9 +1084,20 @@ static inline struct inode *file_inode(const struct file *f) return f->f_inode; } +/* + * file_dentry() is a relic from the days that overlayfs was using files with a + * "fake" path, meaning, f_path on overlayfs and f_inode on underlying fs. + * In those days, file_dentry() was needed to get the underlying fs dentry that + * matches f_inode. + * Files with "fake" path should not exist nowadays, so use an assertion to make + * sure that file_dentry() was not papering over filesystem bugs. + */ static inline struct dentry *file_dentry(const struct file *file) { - return d_real(file->f_path.dentry, file_inode(file)); + struct dentry *dentry = file->f_path.dentry; + + WARN_ON_ONCE(d_inode(dentry) != file_inode(file)); + return dentry; } struct fasync_struct { -- cgit v1.2.3 From 853b8d7597eea4ccaaefbcf0942cd42fc86d542a Mon Sep 17 00:00:00 2001 From: Amir Goldstein Date: Fri, 2 Feb 2024 12:22:58 +0200 Subject: remap_range: merge do_clone_file_range() into vfs_clone_file_range() commit dfad37051ade ("remap_range: move permission hooks out of do_clone_file_range()") moved the permission hooks from do_clone_file_range() out to its caller vfs_clone_file_range(), but left all the fast sanity checks in do_clone_file_range(). This makes the expensive security hooks be called in situations that they would not have been called before (e.g. fs does not support clone). The only reason for the do_clone_file_range() helper was that overlayfs did not use to be able to call vfs_clone_file_range() from copy up context with sb_writers lock held. However, since commit c63e56a4a652 ("ovl: do not open/llseek lower file with upper sb_writers held"), overlayfs just uses an open coded version of vfs_clone_file_range(). Merge_clone_file_range() into vfs_clone_file_range(), restoring the original order of checks as it was before the regressing commit and adapt the overlayfs code to call vfs_clone_file_range() before the permission hooks that were added by commit ca7ab482401c ("ovl: add permission hooks outside of do_splice_direct()"). Note that in the merge of do_clone_file_range(), the file_start_write() context was reduced to cover ->remap_file_range() without holding it over the permission hooks, which was the reason for doing the regressing commit in the first place. Reported-and-tested-by: kernel test robot Closes: https://lore.kernel.org/oe-lkp/202401312229.eddeb9a6-oliver.sang@intel.com Fixes: dfad37051ade ("remap_range: move permission hooks out of do_clone_file_range()") Signed-off-by: Amir Goldstein Link: https://lore.kernel.org/r/20240202102258.1582671-1-amir73il@gmail.com Reviewed-by: Jan Kara Signed-off-by: Christian Brauner --- include/linux/fs.h | 3 --- 1 file changed, 3 deletions(-) (limited to 'include/linux/fs.h') diff --git a/include/linux/fs.h b/include/linux/fs.h index ed5966a70495..023f37c60709 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -2101,9 +2101,6 @@ int __generic_remap_file_range_prep(struct file *file_in, loff_t pos_in, int generic_remap_file_range_prep(struct file *file_in, loff_t pos_in, struct file *file_out, loff_t pos_out, loff_t *count, unsigned int remap_flags); -extern loff_t do_clone_file_range(struct file *file_in, loff_t pos_in, - struct file *file_out, loff_t pos_out, - loff_t len, unsigned int remap_flags); extern loff_t vfs_clone_file_range(struct file *file_in, loff_t pos_in, struct file *file_out, loff_t pos_out, loff_t len, unsigned int remap_flags); -- cgit v1.2.3 From ccb49011bb2ebfd66164dbf68c5bff48917bb5ef Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Tue, 6 Feb 2024 15:08:19 +0100 Subject: quota: Properly annotate i_dquot arrays with __rcu Dquots pointed to from i_dquot arrays in inodes are protected by dquot_srcu. Annotate them as such and change .get_dquots callback to return properly annotated pointer to make sparse happy. Fixes: b9ba6f94b238 ("quota: remove dqptr_sem") Signed-off-by: Jan Kara --- include/linux/fs.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux/fs.h') diff --git a/include/linux/fs.h b/include/linux/fs.h index ed5966a70495..d0b849e4f6cd 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -2159,7 +2159,7 @@ struct super_operations { #ifdef CONFIG_QUOTA ssize_t (*quota_read)(struct super_block *, int, char *, size_t, loff_t); ssize_t (*quota_write)(struct super_block *, int, const char *, size_t, loff_t); - struct dquot **(*get_dquots)(struct inode *); + struct dquot __rcu **(*get_dquots)(struct inode *); #endif long (*nr_cached_objects)(struct super_block *, struct shrink_control *); -- cgit v1.2.3 From a4af51ce229b1e1eab003966dbfebf9d80093a77 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Tue, 6 Feb 2024 21:56:15 -0500 Subject: fs: super_set_uuid() Some weird old filesytems have UUID-like things that we wish to expose as UUIDs, but are smaller; add a length field so that the new FS_IOC_(GET|SET)UUID ioctls can handle them in generic code. And add a helper super_set_uuid(), for setting nonstandard length uuids. Helper is now required for the new FS_IOC_GETUUID ioctl; if super_set_uuid() hasn't been called, the ioctl won't be supported. Reviewed-by: Dave Chinner Signed-off-by: Kent Overstreet Link: https://lore.kernel.org/r/20240207025624.1019754-2-kent.overstreet@linux.dev Signed-off-by: Christian Brauner --- include/linux/fs.h | 9 +++++++++ 1 file changed, 9 insertions(+) (limited to 'include/linux/fs.h') diff --git a/include/linux/fs.h b/include/linux/fs.h index ed5966a70495..acdc56987cb1 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -1257,6 +1257,7 @@ struct super_block { char s_id[32]; /* Informational name */ uuid_t s_uuid; /* UUID */ + u8 s_uuid_len; /* Default 16, possibly smaller for weird filesystems */ unsigned int s_max_links; @@ -2532,6 +2533,14 @@ extern __printf(2, 3) int super_setup_bdi_name(struct super_block *sb, char *fmt, ...); extern int super_setup_bdi(struct super_block *sb); +static inline void super_set_uuid(struct super_block *sb, const u8 *uuid, unsigned len) +{ + if (WARN_ON(len > sizeof(sb->s_uuid))) + len = sizeof(sb->s_uuid); + sb->s_uuid_len = len; + memcpy(&sb->s_uuid, uuid, len); +} + extern int current_umask(void); extern void ihold(struct inode * inode); -- cgit v1.2.3 From ae8c511757304e0c393661b5ed2ad7073e2a351d Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Tue, 6 Feb 2024 21:56:19 -0500 Subject: fs: add FS_IOC_GETFSSYSFSPATH Add a new ioctl for getting the sysfs name of a filesystem - the path under /sys/fs. This is going to let us standardize exporting data from sysfs across filesystems, e.g. time stats. The returned path will always be of the form "$FSTYP/$SYSFS_IDENTIFIER", where the sysfs identifier may be a UUID (for bcachefs) or a device name (xfs). Cc: Christian Brauner Cc: Jan Kara Cc: Dave Chinner Cc: Darrick J. Wong Cc: Theodore Ts'o Cc: Josef Bacik Signed-off-by: Kent Overstreet Link: https://lore.kernel.org/r/20240207025624.1019754-6-kent.overstreet@linux.dev Signed-off-by: Christian Brauner --- include/linux/fs.h | 43 +++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 43 insertions(+) (limited to 'include/linux/fs.h') diff --git a/include/linux/fs.h b/include/linux/fs.h index acdc56987cb1..c6d9e1b7032c 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -1255,10 +1255,23 @@ struct super_block { struct fsnotify_mark_connector __rcu *s_fsnotify_marks; #endif + /* + * q: why are s_id and s_sysfs_name not the same? both are human + * readable strings that identify the filesystem + * a: s_id is allowed to change at runtime; it's used in log messages, + * and we want to when a device starts out as single device (s_id is dev + * name) but then a device is hot added and we have to switch to + * identifying it by UUID + * but s_sysfs_name is a handle for programmatic access, and can't + * change at runtime + */ char s_id[32]; /* Informational name */ uuid_t s_uuid; /* UUID */ u8 s_uuid_len; /* Default 16, possibly smaller for weird filesystems */ + /* if set, fs shows up under sysfs at /sys/fs/$FSTYP/s_sysfs_name */ + char s_sysfs_name[UUID_STRING_LEN + 1]; + unsigned int s_max_links; /* @@ -2541,6 +2554,36 @@ static inline void super_set_uuid(struct super_block *sb, const u8 *uuid, unsign memcpy(&sb->s_uuid, uuid, len); } +/* set sb sysfs name based on sb->s_bdev */ +static inline void super_set_sysfs_name_bdev(struct super_block *sb) +{ + snprintf(sb->s_sysfs_name, sizeof(sb->s_sysfs_name), "%pg", sb->s_bdev); +} + +/* set sb sysfs name based on sb->s_uuid */ +static inline void super_set_sysfs_name_uuid(struct super_block *sb) +{ + WARN_ON(sb->s_uuid_len != sizeof(sb->s_uuid)); + snprintf(sb->s_sysfs_name, sizeof(sb->s_sysfs_name), "%pU", sb->s_uuid.b); +} + +/* set sb sysfs name based on sb->s_id */ +static inline void super_set_sysfs_name_id(struct super_block *sb) +{ + strscpy(sb->s_sysfs_name, sb->s_id, sizeof(sb->s_sysfs_name)); +} + +/* try to use something standard before you use this */ +__printf(2, 3) +static inline void super_set_sysfs_name_generic(struct super_block *sb, const char *fmt, ...) +{ + va_list args; + + va_start(args, fmt); + vsnprintf(sb->s_sysfs_name, sizeof(sb->s_sysfs_name), fmt, args); + va_end(args); +} + extern int current_umask(void); extern void ihold(struct inode * inode); -- cgit v1.2.3 From ecba88a3b32d733d41e27973e25b2bc580f64281 Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Sat, 17 Feb 2024 15:23:54 -0500 Subject: libfs: Add simple_offset_empty() For simple filesystems that use directory offset mapping, rely strictly on the directory offset map to tell when a directory has no children. After this patch is applied, the emptiness test holds only the RCU read lock when the directory being tested has no children. In addition, this adds another layer of confirmation that simple_offset_add/remove() are working as expected. Reviewed-by: Jan Kara Signed-off-by: Chuck Lever Link: https://lore.kernel.org/r/170820143463.6328.7872919188371286951.stgit@91.116.238.104.host.secureserver.net Signed-off-by: Christian Brauner --- include/linux/fs.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux/fs.h') diff --git a/include/linux/fs.h b/include/linux/fs.h index ed5966a70495..03d141809a2c 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -3267,6 +3267,7 @@ struct offset_ctx { void simple_offset_init(struct offset_ctx *octx); int simple_offset_add(struct offset_ctx *octx, struct dentry *dentry); void simple_offset_remove(struct offset_ctx *octx, struct dentry *dentry); +int simple_offset_empty(struct dentry *dentry); int simple_offset_rename_exchange(struct inode *old_dir, struct dentry *old_dentry, struct inode *new_dir, -- cgit v1.2.3 From 0e4a862174f2a8d1653a8a9cf0815020e1d3af24 Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Sat, 17 Feb 2024 15:24:16 -0500 Subject: libfs: Convert simple directory offsets to use a Maple Tree Test robot reports: > kernel test robot noticed a -19.0% regression of aim9.disk_src.ops_per_sec on: > > commit: a2e459555c5f9da3e619b7e47a63f98574dc75f1 ("shmem: stable directory offsets") > https://git.kernel.org/cgit/linux/kernel/git/torvalds/linux.git master Feng Tang further clarifies that: > ... the new simple_offset_add() > called by shmem_mknod() brings extra cost related with slab, > specifically the 'radix_tree_node', which cause the regression. Willy's analysis is that, over time, the test workload causes xa_alloc_cyclic() to fragment the underlying SLAB cache. This patch replaces the offset_ctx's xarray with a Maple Tree in the hope that Maple Tree's dense node mode will handle this scenario more scalably. In addition, we can widen the simple directory offset maximum to signed long (as loff_t is also signed). Suggested-by: Matthew Wilcox Reported-by: kernel test robot Closes: https://lore.kernel.org/oe-lkp/202309081306.3ecb3734-oliver.sang@intel.com Signed-off-by: Chuck Lever Link: https://lore.kernel.org/r/170820145616.6328.12620992971699079156.stgit@91.116.238.104.host.secureserver.net Reviewed-by: Jan Kara Signed-off-by: Christian Brauner --- include/linux/fs.h | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'include/linux/fs.h') diff --git a/include/linux/fs.h b/include/linux/fs.h index 03d141809a2c..55144c12ee0f 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -43,6 +43,7 @@ #include #include #include +#include #include #include @@ -3260,8 +3261,8 @@ extern ssize_t simple_write_to_buffer(void *to, size_t available, loff_t *ppos, const void __user *from, size_t count); struct offset_ctx { - struct xarray xa; - u32 next_offset; + struct maple_tree mt; + unsigned long next_offset; }; void simple_offset_init(struct offset_ctx *octx); -- cgit v1.2.3 From b820de741ae48ccf50dd95e297889c286ff4f760 Mon Sep 17 00:00:00 2001 From: Bart Van Assche Date: Thu, 15 Feb 2024 12:47:38 -0800 Subject: fs/aio: Restrict kiocb_set_cancel_fn() to I/O submitted via libaio If kiocb_set_cancel_fn() is called for I/O submitted via io_uring, the following kernel warning appears: WARNING: CPU: 3 PID: 368 at fs/aio.c:598 kiocb_set_cancel_fn+0x9c/0xa8 Call trace: kiocb_set_cancel_fn+0x9c/0xa8 ffs_epfile_read_iter+0x144/0x1d0 io_read+0x19c/0x498 io_issue_sqe+0x118/0x27c io_submit_sqes+0x25c/0x5fc __arm64_sys_io_uring_enter+0x104/0xab0 invoke_syscall+0x58/0x11c el0_svc_common+0xb4/0xf4 do_el0_svc+0x2c/0xb0 el0_svc+0x2c/0xa4 el0t_64_sync_handler+0x68/0xb4 el0t_64_sync+0x1a4/0x1a8 Fix this by setting the IOCB_AIO_RW flag for read and write I/O that is submitted by libaio. Suggested-by: Jens Axboe Cc: Christoph Hellwig Cc: Avi Kivity Cc: Sandeep Dhavale Cc: Jens Axboe Cc: Greg Kroah-Hartman Cc: Kent Overstreet Cc: stable@vger.kernel.org Signed-off-by: Bart Van Assche Link: https://lore.kernel.org/r/20240215204739.2677806-2-bvanassche@acm.org Signed-off-by: Christian Brauner --- include/linux/fs.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/linux/fs.h') diff --git a/include/linux/fs.h b/include/linux/fs.h index ed5966a70495..c2dcc98cb4c8 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -352,6 +352,8 @@ enum rw_hint { * unrelated IO (like cache flushing, new IO generation, etc). */ #define IOCB_DIO_CALLER_COMP (1 << 22) +/* kiocb is a read or write operation submitted by fs/aio.c. */ +#define IOCB_AIO_RW (1 << 23) /* for use in trace events */ #define TRACE_IOCB_STRINGS \ -- cgit v1.2.3 From f3a608827d1f8de0dd12813e8d9c6803fe64e119 Mon Sep 17 00:00:00 2001 From: Christian Brauner Date: Thu, 8 Feb 2024 18:47:35 +0100 Subject: bdev: open block device as files Add two new helpers to allow opening block devices as files. This is not the final infrastructure. This still opens the block device before opening a struct a file. Until we have removed all references to struct bdev_handle we can't switch the order: * Introduce blk_to_file_flags() to translate from block specific to flags usable to pen a new file. * Introduce bdev_file_open_by_{dev,path}(). * Introduce temporary sb_bdev_handle() helper to retrieve a struct bdev_handle from a block device file and update places that directly reference struct bdev_handle to rely on it. * Don't count block device openes against the number of open files. A bdev_file_open_by_{dev,path}() file is never installed into any file descriptor table. One idea that came to mind was to use kernel_tmpfile_open() which would require us to pass a path and it would then call do_dentry_open() going through the regular fops->open::blkdev_open() path. But then we're back to the problem of routing block specific flags such as BLK_OPEN_RESTRICT_WRITES through the open path and would have to waste FMODE_* flags every time we add a new one. With this we can avoid using a flag bit and we have more leeway in how we open block devices from bdev_open_by_{dev,path}(). Link: https://lore.kernel.org/r/20240123-vfs-bdev-file-v2-1-adbd023e19cc@kernel.org Signed-off-by: Christian Brauner --- include/linux/fs.h | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) (limited to 'include/linux/fs.h') diff --git a/include/linux/fs.h b/include/linux/fs.h index ed5966a70495..e9291e27cc47 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -1228,8 +1228,8 @@ struct super_block { #endif struct hlist_bl_head s_roots; /* alternate root dentries for NFS */ struct list_head s_mounts; /* list of mounts; _not_ for fs use */ - struct block_device *s_bdev; - struct bdev_handle *s_bdev_handle; + struct block_device *s_bdev; /* can go away once we use an accessor for @s_bdev_file */ + struct file *s_bdev_file; struct backing_dev_info *s_bdi; struct mtd_info *s_mtd; struct hlist_node s_instances; @@ -1327,6 +1327,12 @@ struct super_block { struct list_head s_inodes_wb; /* writeback inodes */ } __randomize_layout; +/* Temporary helper that will go away. */ +static inline struct bdev_handle *sb_bdev_handle(struct super_block *sb) +{ + return sb->s_bdev_file->private_data; +} + static inline struct user_namespace *i_user_ns(const struct inode *inode) { return inode->i_sb->s_user_ns; -- cgit v1.2.3 From a56aefca8d386181415a1fb7cfec2f72b0404797 Mon Sep 17 00:00:00 2001 From: Christian Brauner Date: Tue, 23 Jan 2024 14:26:46 +0100 Subject: bdev: make struct bdev_handle private to the block layer Link: https://lore.kernel.org/r/20240123-vfs-bdev-file-v2-29-adbd023e19cc@kernel.org Reviewed-by: Christoph Hellwig Signed-off-by: Christian Brauner --- include/linux/fs.h | 6 ------ 1 file changed, 6 deletions(-) (limited to 'include/linux/fs.h') diff --git a/include/linux/fs.h b/include/linux/fs.h index e9291e27cc47..6e0714d35d9b 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -1327,12 +1327,6 @@ struct super_block { struct list_head s_inodes_wb; /* writeback inodes */ } __randomize_layout; -/* Temporary helper that will go away. */ -static inline struct bdev_handle *sb_bdev_handle(struct super_block *sb) -{ - return sb->s_bdev_file->private_data; -} - static inline struct user_namespace *i_user_ns(const struct inode *inode) { return inode->i_sb->s_user_ns; -- cgit v1.2.3 From 2824083db76cb9d4b7910607b367e93b02912865 Mon Sep 17 00:00:00 2001 From: Gabriel Krisman Bertazi Date: Wed, 21 Feb 2024 12:14:03 -0500 Subject: ovl: Always reject mounting over case-insensitive directories overlayfs relies on the filesystem setting DCACHE_OP_HASH or DCACHE_OP_COMPARE to reject mounting over case-insensitive directories. Since commit bb9cd9106b22 ("fscrypt: Have filesystems handle their d_ops"), we set ->d_op through a hook in ->d_lookup, which means the root dentry won't have them, causing the mount to accidentally succeed. In v6.7-rc7, the following sequence will succeed to mount, but any dentry other than the root dentry will be a "weird" dentry to ovl and fail with EREMOTE. mkfs.ext4 -O casefold lower.img mount -O loop lower.img lower mount -t overlay -o lowerdir=lower,upperdir=upper,workdir=work ovl /mnt Mounting on a subdirectory fails, as expected, because DCACHE_OP_HASH and DCACHE_OP_COMPARE are properly set by ->lookup. Fix by explicitly rejecting superblocks that allow case-insensitive dentries. Yes, this will be solved when we move d_op configuration back to ->s_d_op. Yet, we better have an explicit fix to avoid messing up again. While there, re-sort the entries to have more descriptive error messages first. Fixes: bb9cd9106b22 ("fscrypt: Have filesystems handle their d_ops") Acked-by: Amir Goldstein Reviewed-by: Eric Biggers Link: https://lore.kernel.org/r/20240221171412.10710-2-krisman@suse.de Signed-off-by: Gabriel Krisman Bertazi --- include/linux/fs.h | 9 +++++++++ 1 file changed, 9 insertions(+) (limited to 'include/linux/fs.h') diff --git a/include/linux/fs.h b/include/linux/fs.h index e6ba0cc6f2ee..a0eb8b5759a6 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -3282,6 +3282,15 @@ extern int generic_check_addressable(unsigned, u64); extern void generic_set_encrypted_ci_d_ops(struct dentry *dentry); +static inline bool sb_has_encoding(const struct super_block *sb) +{ +#if IS_ENABLED(CONFIG_UNICODE) + return !!sb->s_encoding; +#else + return false; +#endif +} + int may_setattr(struct mnt_idmap *idmap, struct inode *inode, unsigned int ia_valid); int setattr_prepare(struct mnt_idmap *, struct dentry *, struct iattr *); -- cgit v1.2.3 From 70dfe3f0d239c2e8abc6a7bea24411031f85b652 Mon Sep 17 00:00:00 2001 From: Gabriel Krisman Bertazi Date: Wed, 21 Feb 2024 12:14:08 -0500 Subject: libfs: Add helper to choose dentry operations at mount-time In preparation to drop the similar helper that sets d_op at lookup time, add a version to set the right d_op filesystem-wide, through sb->s_d_op. The operations structures are shared across filesystems supporting fscrypt and/or casefolding, therefore we can keep it in common libfs code. Reviewed-by: Eric Biggers Link: https://lore.kernel.org/r/20240221171412.10710-7-krisman@suse.de Signed-off-by: Gabriel Krisman Bertazi --- include/linux/fs.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux/fs.h') diff --git a/include/linux/fs.h b/include/linux/fs.h index a0eb8b5759a6..383c5145465f 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -3281,6 +3281,7 @@ extern int generic_file_fsync(struct file *, loff_t, loff_t, int); extern int generic_check_addressable(unsigned, u64); extern void generic_set_encrypted_ci_d_ops(struct dentry *dentry); +extern void generic_set_sb_d_ops(struct super_block *sb); static inline bool sb_has_encoding(const struct super_block *sb) { -- cgit v1.2.3 From 101c3fad29d7a0a90ff063b1aad586a0211911ec Mon Sep 17 00:00:00 2001 From: Gabriel Krisman Bertazi Date: Wed, 21 Feb 2024 12:14:12 -0500 Subject: libfs: Drop generic_set_encrypted_ci_d_ops No filesystems depend on it anymore, and it is generally a bad idea. Since all dentries should have the same set of dentry operations in case-insensitive capable filesystems, it should be propagated through ->s_d_op. Reviewed-by: Eric Biggers Link: https://lore.kernel.org/r/20240221171412.10710-11-krisman@suse.de Signed-off-by: Gabriel Krisman Bertazi --- include/linux/fs.h | 1 - 1 file changed, 1 deletion(-) (limited to 'include/linux/fs.h') diff --git a/include/linux/fs.h b/include/linux/fs.h index 383c5145465f..ff1338109b54 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -3280,7 +3280,6 @@ extern int generic_file_fsync(struct file *, loff_t, loff_t, int); extern int generic_check_addressable(unsigned, u64); -extern void generic_set_encrypted_ci_d_ops(struct dentry *dentry); extern void generic_set_sb_d_ops(struct super_block *sb); static inline bool sb_has_encoding(const struct super_block *sb) -- cgit v1.2.3 From 66a67c860cce3643248f7e80ee095b946829a342 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Wed, 28 Feb 2024 18:28:48 -0500 Subject: fs: file_remove_privs_flags() Rename and export __file_remove_privs(); for a buffered write path that doesn't take the inode lock we need to be able to check if the operation needs to do work first. Signed-off-by: Kent Overstreet Cc: Alexander Viro Cc: Christian Brauner --- include/linux/fs.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux/fs.h') diff --git a/include/linux/fs.h b/include/linux/fs.h index 1fbc72c5f112..14ea66b62823 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -3004,6 +3004,7 @@ extern struct inode *new_inode_pseudo(struct super_block *sb); extern struct inode *new_inode(struct super_block *sb); extern void free_inode_nonrcu(struct inode *inode); extern int setattr_should_drop_suidgid(struct mnt_idmap *, struct inode *); +extern int file_remove_privs_flags(struct file *file, unsigned int flags); extern int file_remove_privs(struct file *); int setattr_should_drop_sgid(struct mnt_idmap *idmap, const struct inode *inode); -- cgit v1.2.3