From 8b17e540969a0983abe96ffc352397890ab67bf1 Mon Sep 17 00:00:00 2001 From: Mateusz Guzik Date: Sun, 9 Feb 2025 19:55:20 +0100 Subject: vfs: add initial support for CONFIG_DEBUG_VFS Small collection of macros taken from mmdebug.h Signed-off-by: Mateusz Guzik Link: https://lore.kernel.org/r/20250209185523.745956-2-mjguzik@gmail.com Reviewed-by: Jan Kara Signed-off-by: Christian Brauner --- include/linux/fs.h | 1 + include/linux/vfsdebug.h | 45 +++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 46 insertions(+) create mode 100644 include/linux/vfsdebug.h (limited to 'include/linux') diff --git a/include/linux/fs.h b/include/linux/fs.h index be3ad155ec9f..d5e3fb14ad8c 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -2,6 +2,7 @@ #ifndef _LINUX_FS_H #define _LINUX_FS_H +#include #include #include #include diff --git a/include/linux/vfsdebug.h b/include/linux/vfsdebug.h new file mode 100644 index 000000000000..9cf22d3eb9dd --- /dev/null +++ b/include/linux/vfsdebug.h @@ -0,0 +1,45 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef LINUX_VFS_DEBUG_H +#define LINUX_VFS_DEBUG_H 1 + +#include + +struct inode; + +#ifdef CONFIG_DEBUG_VFS +void dump_inode(struct inode *inode, const char *reason); + +#define VFS_BUG_ON(cond) BUG_ON(cond) +#define VFS_WARN_ON(cond) (void)WARN_ON(cond) +#define VFS_WARN_ON_ONCE(cond) (void)WARN_ON_ONCE(cond) +#define VFS_WARN_ONCE(cond, format...) (void)WARN_ONCE(cond, format) +#define VFS_WARN(cond, format...) 
(void)WARN(cond, format) + +#define VFS_BUG_ON_INODE(cond, inode) ({ \ + if (unlikely(!!(cond))) { \ + dump_inode(inode, "VFS_BUG_ON_INODE(" #cond")");\ + BUG_ON(1); \ + } \ +}) + +#define VFS_WARN_ON_INODE(cond, inode) ({ \ + int __ret_warn = !!(cond); \ + \ + if (unlikely(__ret_warn)) { \ + dump_inode(inode, "VFS_WARN_ON_INODE(" #cond")");\ + WARN_ON(1); \ + } \ + unlikely(__ret_warn); \ +}) +#else +#define VFS_BUG_ON(cond) BUILD_BUG_ON_INVALID(cond) +#define VFS_WARN_ON(cond) BUILD_BUG_ON_INVALID(cond) +#define VFS_WARN_ON_ONCE(cond) BUILD_BUG_ON_INVALID(cond) +#define VFS_WARN_ONCE(cond, format...) BUILD_BUG_ON_INVALID(cond) +#define VFS_WARN(cond, format...) BUILD_BUG_ON_INVALID(cond) + +#define VFS_BUG_ON_INODE(cond, inode) VFS_BUG_ON(cond) +#define VFS_WARN_ON_INODE(cond, inode) BUILD_BUG_ON_INVALID(cond) +#endif /* CONFIG_DEBUG_VFS */ + +#endif -- cgit v1.2.3 From 3eb7e95104141609d4aed15affadedb81c408f03 Mon Sep 17 00:00:00 2001 From: Mateusz Guzik Date: Sun, 9 Feb 2025 19:55:22 +0100 Subject: vfs: use the new debug macros in inode_set_cached_link() Signed-off-by: Mateusz Guzik Link: https://lore.kernel.org/r/20250209185523.745956-4-mjguzik@gmail.com Reviewed-by: Jan Kara Signed-off-by: Christian Brauner --- include/linux/fs.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/linux') diff --git a/include/linux/fs.h b/include/linux/fs.h index d5e3fb14ad8c..e71d58c7f59c 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -792,6 +792,8 @@ struct inode { static inline void inode_set_cached_link(struct inode *inode, char *link, int linklen) { + VFS_WARN_ON_INODE(strlen(link) != linklen, inode); + VFS_WARN_ON_INODE(inode->i_opflags & IOP_CACHED_LINK, inode); inode->i_link = link; inode->i_linklen = linklen; inode->i_opflags |= IOP_CACHED_LINK; -- cgit v1.2.3 From d6ff4c8f65220b20c550f12eb8921827c459600e Mon Sep 17 00:00:00 2001 From: Mateusz Guzik Date: Sun, 19 Jan 2025 11:32:05 +0100 Subject: fs: avoid mmap sem relocks when coredumping with 
many missing pages Dumping processes with large allocated and mostly not-faulted areas is very slow. Borrowing a test case from Tavian Barnes: int main(void) { char *mem = mmap(NULL, 1ULL << 40, PROT_READ | PROT_WRITE, MAP_ANONYMOUS | MAP_NORESERVE | MAP_PRIVATE, -1, 0); printf("%p %m\n", mem); if (mem != MAP_FAILED) { mem[0] = 1; } abort(); } That's 1TB of almost completely not-populated area. On my test box it takes 13-14 seconds to dump. The profile shows: - 99.89% 0.00% a.out entry_SYSCALL_64_after_hwframe do_syscall_64 syscall_exit_to_user_mode arch_do_signal_or_restart - get_signal - 99.89% do_coredump - 99.88% elf_core_dump - dump_user_range - 98.12% get_dump_page - 64.19% __get_user_pages - 40.92% gup_vma_lookup - find_vma - mt_find 4.21% __rcu_read_lock 1.33% __rcu_read_unlock - 3.14% check_vma_flags 0.68% vma_is_secretmem 0.61% __cond_resched 0.60% vma_pgtable_walk_end 0.59% vma_pgtable_walk_begin 0.58% no_page_table - 15.13% down_read_killable 0.69% __cond_resched 13.84% up_read 0.58% __cond_resched Almost 29% of the time is spent relocking the mmap semaphore between calls to get_dump_page() which find nothing. Whacking that results in times of 10 seconds (down from 13-14). While here make the thing killable. The real problem is the page-sized iteration and the real fix would patch it up instead. It is left as an exercise for the mm-familiar reader. 
Signed-off-by: Mateusz Guzik Link: https://lore.kernel.org/r/20250119103205.2172432-1-mjguzik@gmail.com Signed-off-by: Christian Brauner --- include/linux/mm.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/mm.h b/include/linux/mm.h index 7b1068ddcbb7..78f9e12cc861 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -2549,7 +2549,7 @@ int __account_locked_vm(struct mm_struct *mm, unsigned long pages, bool inc, struct task_struct *task, bool bypass_rlim); struct kvec; -struct page *get_dump_page(unsigned long addr); +struct page *get_dump_page(unsigned long addr, int *locked); bool folio_mark_dirty(struct folio *folio); bool folio_mark_dirty_lock(struct folio *folio); -- cgit v1.2.3 From 29d80d506b18384f64a54b01fae78184e2d327f3 Mon Sep 17 00:00:00 2001 From: Yuichiro Tsuji Date: Tue, 21 Jan 2025 16:08:22 +0900 Subject: open: Fix return type of several functions from long to int Fix the return type of several functions from long to int to match their actual behavior. These functions only return int values. This change improves type consistency across the filesystem code and aligns the function signatures with their existing implementation and usage.
Reviewed-by: Jan Kara Signed-off-by: Yuichiro Tsuji Link: https://lore.kernel.org/r/20250121070844.4413-2-yuichtsu@amazon.com Signed-off-by: Christian Brauner --- include/linux/fs.h | 6 +++--- include/linux/syscalls.h | 4 ++-- 2 files changed, 5 insertions(+), 5 deletions(-) (limited to 'include/linux') diff --git a/include/linux/fs.h b/include/linux/fs.h index e71d58c7f59c..d1f1673956e2 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -2785,13 +2785,13 @@ static inline bool is_idmapped_mnt(const struct vfsmount *mnt) return mnt_idmap(mnt) != &nop_mnt_idmap; } -extern long vfs_truncate(const struct path *, loff_t); +int vfs_truncate(const struct path *, loff_t); int do_truncate(struct mnt_idmap *, struct dentry *, loff_t start, unsigned int time_attrs, struct file *filp); extern int vfs_fallocate(struct file *file, int mode, loff_t offset, loff_t len); -extern long do_sys_open(int dfd, const char __user *filename, int flags, - umode_t mode); +int do_sys_open(int dfd, const char __user *filename, int flags, + umode_t mode); extern struct file *file_open_name(struct filename *, int, umode_t); extern struct file *filp_open(const char *, int, umode_t); extern struct file *file_open_root(const struct path *, diff --git a/include/linux/syscalls.h b/include/linux/syscalls.h index c6333204d451..bae4490c1dda 100644 --- a/include/linux/syscalls.h +++ b/include/linux/syscalls.h @@ -1266,14 +1266,14 @@ static inline long ksys_lchown(const char __user *filename, uid_t user, AT_SYMLINK_NOFOLLOW); } -extern long do_sys_ftruncate(unsigned int fd, loff_t length, int small); +int do_sys_ftruncate(unsigned int fd, loff_t length, int small); static inline long ksys_ftruncate(unsigned int fd, loff_t length) { return do_sys_ftruncate(fd, length, 1); } -extern long do_sys_truncate(const char __user *pathname, loff_t length); +int do_sys_truncate(const char __user *pathname, loff_t length); static inline long ksys_truncate(const char __user *pathname, loff_t length) { -- cgit 
v1.2.3 From f326565c44419380d18290edee5f78921418f7a5 Mon Sep 17 00:00:00 2001 From: Yuichiro Tsuji Date: Tue, 21 Jan 2025 16:08:23 +0900 Subject: ioctl: Fix return type of several functions from long to int Fix the return type of several functions from long to int to match their actual behavior. These functions only return int values. This change improves type consistency across the filesystem code and aligns the function signatures with their existing implementation and usage. Reviewed-by: Jan Kara Signed-off-by: Yuichiro Tsuji Link: https://lore.kernel.org/r/20250121070844.4413-3-yuichtsu@amazon.com Signed-off-by: Christian Brauner --- include/linux/fs.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/fs.h b/include/linux/fs.h index d1f1673956e2..c1763b022d06 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -2030,7 +2030,7 @@ int vfs_fchown(struct file *file, uid_t user, gid_t group); int vfs_fchmod(struct file *file, umode_t mode); int vfs_utimes(const struct path *path, struct timespec64 *times); -extern long vfs_ioctl(struct file *file, unsigned int cmd, unsigned long arg); +int vfs_ioctl(struct file *file, unsigned int cmd, unsigned long arg); #ifdef CONFIG_COMPAT extern long compat_ptr_ioctl(struct file *file, unsigned int cmd, -- cgit v1.2.3 From 1bb772565f327291b0a463b125c7646dc45ae8b4 Mon Sep 17 00:00:00 2001 From: Mateusz Guzik Date: Thu, 6 Feb 2025 01:01:05 +0100 Subject: vfs: inline getname() It is merely a trivial wrapper around getname_flags which adds a zeroed argument, no point paying for an extra call.
Signed-off-by: Mateusz Guzik Link: https://lore.kernel.org/r/20250206000105.432528-1-mjguzik@gmail.com Reviewed-by: Jan Kara Signed-off-by: Christian Brauner --- include/linux/fs.h | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/fs.h b/include/linux/fs.h index c1763b022d06..3e07e4a44de6 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -2842,7 +2842,10 @@ extern int filp_close(struct file *, fl_owner_t id); extern struct filename *getname_flags(const char __user *, int); extern struct filename *getname_uflags(const char __user *, int); -extern struct filename *getname(const char __user *); +static inline struct filename *getname(const char __user *name) +{ + return getname_flags(name, 0); +} extern struct filename *getname_kernel(const char *); extern struct filename *__getname_maybe_null(const char __user *); static inline struct filename *getname_maybe_null(const char __user *name, int flags) -- cgit v1.2.3 From 1479be62582dbd2078390ef609f8f5ef351c15e8 Mon Sep 17 00:00:00 2001 From: Mateusz Guzik Date: Wed, 12 Feb 2025 19:04:59 +0100 Subject: vfs: inline new_inode_pseudo() and de-staticize alloc_inode() The former is a no-op wrapper with the same argument. I left it in place to not lose the information who needs it -- one day "pseudo" inodes may start differing from what alloc_inode() returns. In the meantime no point taking a detour. 
Signed-off-by: Mateusz Guzik Link: https://lore.kernel.org/r/20250212180459.1022983-1-mjguzik@gmail.com Signed-off-by: Christian Brauner --- include/linux/fs.h | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/fs.h b/include/linux/fs.h index 3e07e4a44de6..70c985f3d787 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -3285,7 +3285,11 @@ static inline void __iget(struct inode *inode) extern void iget_failed(struct inode *); extern void clear_inode(struct inode *); extern void __destroy_inode(struct inode *); -extern struct inode *new_inode_pseudo(struct super_block *sb); +struct inode *alloc_inode(struct super_block *sb); +static inline struct inode *new_inode_pseudo(struct super_block *sb) +{ + return alloc_inode(sb); +} extern struct inode *new_inode(struct super_block *sb); extern void free_inode_nonrcu(struct inode *inode); extern int setattr_should_drop_suidgid(struct mnt_idmap *, struct inode *); -- cgit v1.2.3 From 12851bd921d429c60578b90916fc220b60757c34 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Fri, 21 Feb 2025 20:39:29 +0000 Subject: fs: Turn page_offset() into a wrapper around folio_pos() This is far less efficient for the lagging filesystems which still use page_offset(), but it removes an access to page->index. It also fixes a bug -- if any filesystem passed a tail page to page_offset(), it would return garbage which might result in the filesystem choosing to not writeback a dirty page. There probably aren't any examples of this, but I can't be certain. 
Signed-off-by: "Matthew Wilcox (Oracle)" Link: https://lore.kernel.org/r/20250221203932.3588740-1-willy@infradead.org Signed-off-by: Christian Brauner --- include/linux/pagemap.h | 20 +++++++++++--------- 1 file changed, 11 insertions(+), 9 deletions(-) (limited to 'include/linux') diff --git a/include/linux/pagemap.h b/include/linux/pagemap.h index 47bfc6b1b632..f348e7005306 100644 --- a/include/linux/pagemap.h +++ b/include/linux/pagemap.h @@ -1044,21 +1044,23 @@ static inline pgoff_t page_pgoff(const struct folio *folio, return folio->index + folio_page_idx(folio, page); } -/* - * Return byte-offset into filesystem object for page. +/** + * folio_pos - Returns the byte position of this folio in its file. + * @folio: The folio. */ -static inline loff_t page_offset(struct page *page) +static inline loff_t folio_pos(const struct folio *folio) { - return ((loff_t)page->index) << PAGE_SHIFT; + return ((loff_t)folio->index) * PAGE_SIZE; } -/** - * folio_pos - Returns the byte position of this folio in its file. - * @folio: The folio. +/* + * Return byte-offset into filesystem object for page. */ -static inline loff_t folio_pos(struct folio *folio) +static inline loff_t page_offset(struct page *page) { - return page_offset(&folio->page); + struct folio *folio = page_folio(page); + + return folio_pos(folio) + folio_page_idx(folio, page) * PAGE_SIZE; } /* -- cgit v1.2.3 From e249056c91a2f14ee40de2bf24cf72d8e68101f5 Mon Sep 17 00:00:00 2001 From: Pan Deng Date: Fri, 28 Feb 2025 10:00:59 +0800 Subject: fs: place f_ref to 3rd cache line in struct file to resolve false sharing When running syscall pread in a high core count system, f_ref contends with the reading of f_mode, f_op, f_mapping, f_inode, f_flags in the same cache line. This change places f_ref to the 3rd cache line where fields are not updated as frequently as the 1st cache line, and the contention is greatly reduced according to tests. In addition, the size of file object is kept in 3 cache lines.
This change has been tested with rocksdb benchmark readwhilewriting case on a 1-socket, 64-physical-core, 128-logical-core baremetal machine, with build config CONFIG_RANDSTRUCT_NONE=y Command: ./db_bench --benchmarks="readwhilewriting" --threads $cnt --duration 60 The throughput(ops/s) is improved by up to ~21%. ===== thread baseline compare 16 100% +1.3% 32 100% +2.2% 64 100% +7.2% 128 100% +20.9% It was also tested with UnixBench: syscall, fsbuffer, fstime, fsdisk cases that have been used for file struct layout tuning, no regression was observed. Signed-off-by: Pan Deng Link: https://lore.kernel.org/r/20250228020059.3023375-1-pan.deng@intel.com Tested-by: Lipeng Zhu Reviewed-by: Tianyou Li Reviewed-by: Tim Chen Signed-off-by: Christian Brauner --- include/linux/fs.h | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'include/linux') diff --git a/include/linux/fs.h b/include/linux/fs.h index 70c985f3d787..9ab789cd1531 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -1058,7 +1058,6 @@ static inline int ra_has_index(struct file_ra_state *ra, pgoff_t index) /** * struct file - Represents a file - * @f_ref: reference count * @f_lock: Protects f_ep, f_flags. Must not be taken from IRQ context.
* @f_mode: FMODE_* flags often used in hotpaths * @f_op: file operations @@ -1068,12 +1067,12 @@ static inline int ra_has_index(struct file_ra_state *ra, pgoff_t index) * @f_flags: file flags * @f_iocb_flags: iocb flags * @f_cred: stashed credentials of creator/opener + * @f_owner: file owner * @f_path: path of the file * @f_pos_lock: lock protecting file position * @f_pipe: specific to pipes * @f_pos: file position * @f_security: LSM security context of this file - * @f_owner: file owner * @f_wb_err: writeback error * @f_sb_err: per sb writeback errors * @f_ep: link of all epoll hooks for this file @@ -1081,9 +1080,9 @@ static inline int ra_has_index(struct file_ra_state *ra, pgoff_t index) * @f_llist: work queue entrypoint * @f_ra: file's readahead state * @f_freeptr: Pointer used by SLAB_TYPESAFE_BY_RCU file cache (don't touch.) + * @f_ref: reference count */ struct file { - file_ref_t f_ref; spinlock_t f_lock; fmode_t f_mode; const struct file_operations *f_op; @@ -1093,6 +1092,7 @@ struct file { unsigned int f_flags; unsigned int f_iocb_flags; const struct cred *f_cred; + struct fown_struct *f_owner; /* --- cacheline 1 boundary (64 bytes) --- */ struct path f_path; union { @@ -1106,7 +1106,6 @@ struct file { void *f_security; #endif /* --- cacheline 2 boundary (128 bytes) --- */ - struct fown_struct *f_owner; errseq_t f_wb_err; errseq_t f_sb_err; #ifdef CONFIG_EPOLL @@ -1118,6 +1117,7 @@ struct file { struct file_ra_state f_ra; freeptr_t f_freeptr; }; + file_ref_t f_ref; /* --- cacheline 3 boundary (192 bytes) --- */ } __randomize_layout __attribute__((aligned(4))); /* lest something weird decides that 2 is OK */ -- cgit v1.2.3 From 611851010c74046c0bc2b0461b72a6fae81c16d0 Mon Sep 17 00:00:00 2001 From: Mateusz Guzik Date: Thu, 13 Mar 2025 15:27:44 +0100 Subject: fs: dedup handling of struct filename init and refcounts bumps No functional changes. 
Signed-off-by: Mateusz Guzik Link: https://lore.kernel.org/r/20250313142744.1323281-1-mjguzik@gmail.com Signed-off-by: Christian Brauner --- include/linux/fs.h | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'include/linux') diff --git a/include/linux/fs.h b/include/linux/fs.h index 9ab789cd1531..ae1fce54eb60 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -2859,6 +2859,12 @@ static inline struct filename *getname_maybe_null(const char __user *name, int f } extern void putname(struct filename *name); +static inline struct filename *refname(struct filename *name) +{ + atomic_inc(&name->refcnt); + return name; +} + extern int finish_open(struct file *file, struct dentry *dentry, int (*open)(struct inode *, struct file *)); extern int finish_no_open(struct file *file, struct dentry *dentry); -- cgit v1.2.3