summaryrefslogtreecommitdiff
path: root/include/linux
diff options
context:
space:
mode:
authorLinus Torvalds <torvalds@linux-foundation.org>2025-05-26 09:02:39 -0700
committerLinus Torvalds <torvalds@linux-foundation.org>2025-05-26 09:02:39 -0700
commit181d8e399f50c0683b12d40432bb6e1ca5c58d37 (patch)
treed1bb2c62d88c645f4ba3569d0eac14f76108d980 /include/linux
parenta1ae8ce78bb2600490d49a8f1ee88767b0e64381 (diff)
parent76145cb37ff0636fdf2a15320b2c2421915df32b (diff)
Merge tag 'vfs-6.16-rc1.misc' of git://git.kernel.org/pub/scm/linux/kernel/git/vfs/vfs
Pull misc vfs updates from Christian Brauner: "This contains the usual selections of misc updates for this cycle. Features: - Use folios for symlinks in the page cache FUSE already uses folios for its symlinks. Mirror that conversion in the generic code and the NFS code. That lets us get rid of a few folio->page->folio conversions in this path, and some of the few remaining users of read_cache_page() / read_mapping_page() - Try and make a few filesystem operations killable on the VFS inode->i_mutex level - Add sysctl vfs_cache_pressure_denom for bulk file operations Some workloads need to preserve more dentries than we currently allow through out sysctl interface A HDFS servers with 12 HDDs per server, on a HDFS datanode startup involves scanning all files and caching their metadata (including dentries and inodes) in memory. Each HDD contains approximately 2 million files, resulting in a total of ~20 million cached dentries after initialization To minimize dentry reclamation, they set vfs_cache_pressure to 1. Despite this configuration, memory pressure conditions can still trigger reclamation of up to 50% of cached dentries, reducing the cache from 20 million to approximately 10 million entries. During the subsequent cache rebuild period, any HDFS datanode restart operation incurs substantial latency penalties until full cache recovery completes To maintain service stability, more dentries need to be preserved during memory reclamation. The current minimum reclaim ratio (1/100 of total dentries) remains too aggressive for such workload. This patch introduces vfs_cache_pressure_denom for more granular cache pressure control The configuration [vfs_cache_pressure=1, vfs_cache_pressure_denom=10000] effectively maintains the full 20 million dentry cache under memory pressure, preventing datanode restart performance degradation - Avoid some jumps in inode_permission() using likely()/unlikely() - Avid a memory access which is most likely a cache miss when descending into devcgroup_inode_permission() - Add fastpath predicts for stat() and fdput() - Anonymous inodes currently don't come with a proper mode causing issues in the kernel when we want to add useful VFS debug assert. Fix that by giving them a proper mode and masking it off when we report it to userspace which relies on them not having any mode - Anonymous inodes currently allow to change inode attributes because the VFS falls back to simple_setattr() if i_op->setattr isn't implemented. This means the ownership and mode for every single user of anon_inode_inode can be changed. Block that as it's either useless or actively harmful. If specific ownership is needed the respective subsystem should allocate anonymous inodes from their own private superblock - Raise SB_I_NODEV and SB_I_NOEXEC on the anonymous inode superblock - Add proper tests for anonymous inode behavior - Make it easy to detect proper anonymous inodes and to ensure that we can detect them in codepaths such as readahead() Cleanups: - Port pidfs to the new anon_inode_{g,s}etattr() helpers - Try to remove the uselib() system call - Add unlikely branch hint return path for poll - Add unlikely branch hint on return path for core_sys_select - Don't allow signals to interrupt getdents copying for fuse - Provide a size hint to dir_context for during readdir() - Use writeback_iter directly in mpage_writepages - Update compression and mtime descriptions in initramfs documentation - Update main netfs API document - Remove useless plus one in super_cache_scan() - Remove unnecessary NULL-check guards during setns() - Add separate separate {get,put}_cgroup_ns no-op cases Fixes: - Fix typo in root= kernel parameter description - Use KERN_INFO for infof()|info_plog()|infofc() - Correct comments of fs_validate_description() - Mark an unlikely if condition with unlikely() in vfs_parse_monolithic_sep() - Delete macro fsparam_u32hex() - Remove unused and problematic validate_constant_table() - Fix potential unsigned integer underflow in fs_name() - Make file-nr output the total allocated file handles" * tag 'vfs-6.16-rc1.misc' of git://git.kernel.org/pub/scm/linux/kernel/git/vfs/vfs: (43 commits) fs: Pass a folio to page_put_link() nfs: Use a folio in nfs_get_link() fs: Convert __page_get_link() to use a folio fs/read_write: make default_llseek() killable fs/open: make do_truncate() killable fs/open: make chmod_common() and chown_common() killable include/linux/fs.h: add inode_lock_killable() readdir: supply dir_context.count as readdir buffer size hint vfs: Add sysctl vfs_cache_pressure_denom for bulk file operations fuse: don't allow signals to interrupt getdents copying Documentation: fix typo in root= kernel parameter description include/cgroup: separate {get,put}_cgroup_ns no-op case kernel/nsproxy: remove unnecessary guards fs: use writeback_iter directly in mpage_writepages fs: remove useless plus one in super_cache_scan() fs: add S_ANON_INODE fs: remove uselib() system call device_cgroup: avoid access to ->i_rdev in the common case in devcgroup_inode_permission() fs/fs_parse: Remove unused and problematic validate_constant_table() fs: touch up predicts in inode_permission() ...
Diffstat (limited to 'include/linux')
-rw-r--r--include/linux/binfmts.h1
-rw-r--r--include/linux/cgroup.h26
-rw-r--r--include/linux/device_cgroup.h7
-rw-r--r--include/linux/file.h2
-rw-r--r--include/linux/fs.h22
-rw-r--r--include/linux/fs_parser.h7
6 files changed, 41 insertions, 24 deletions
diff --git a/include/linux/binfmts.h b/include/linux/binfmts.h
index 1625c8529e70..65abd5ab8836 100644
--- a/include/linux/binfmts.h
+++ b/include/linux/binfmts.h
@@ -90,7 +90,6 @@ struct linux_binfmt {
struct list_head lh;
struct module *module;
int (*load_binary)(struct linux_binprm *);
- int (*load_shlib)(struct file *);
#ifdef CONFIG_COREDUMP
int (*core_dump)(struct coredump_params *cprm);
unsigned long min_coredump; /* minimal dump size */
diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h
index e7da3c3b098b..166d6de50dbf 100644
--- a/include/linux/cgroup.h
+++ b/include/linux/cgroup.h
@@ -785,6 +785,17 @@ struct cgroup_namespace *copy_cgroup_ns(unsigned long flags,
int cgroup_path_ns(struct cgroup *cgrp, char *buf, size_t buflen,
struct cgroup_namespace *ns);
+static inline void get_cgroup_ns(struct cgroup_namespace *ns)
+{
+ refcount_inc(&ns->ns.count);
+}
+
+static inline void put_cgroup_ns(struct cgroup_namespace *ns)
+{
+ if (refcount_dec_and_test(&ns->ns.count))
+ free_cgroup_ns(ns);
+}
+
#else /* !CONFIG_CGROUPS */
static inline void free_cgroup_ns(struct cgroup_namespace *ns) { }
@@ -795,19 +806,10 @@ copy_cgroup_ns(unsigned long flags, struct user_namespace *user_ns,
return old_ns;
}
-#endif /* !CONFIG_CGROUPS */
+static inline void get_cgroup_ns(struct cgroup_namespace *ns) { }
+static inline void put_cgroup_ns(struct cgroup_namespace *ns) { }
-static inline void get_cgroup_ns(struct cgroup_namespace *ns)
-{
- if (ns)
- refcount_inc(&ns->ns.count);
-}
-
-static inline void put_cgroup_ns(struct cgroup_namespace *ns)
-{
- if (ns && refcount_dec_and_test(&ns->ns.count))
- free_cgroup_ns(ns);
-}
+#endif /* !CONFIG_CGROUPS */
#ifdef CONFIG_CGROUPS
diff --git a/include/linux/device_cgroup.h b/include/linux/device_cgroup.h
index d02f32b7514e..0864773a57e8 100644
--- a/include/linux/device_cgroup.h
+++ b/include/linux/device_cgroup.h
@@ -18,15 +18,16 @@ static inline int devcgroup_inode_permission(struct inode *inode, int mask)
{
short type, access = 0;
+ if (likely(!S_ISBLK(inode->i_mode) && !S_ISCHR(inode->i_mode)))
+ return 0;
+
if (likely(!inode->i_rdev))
return 0;
if (S_ISBLK(inode->i_mode))
type = DEVCG_DEV_BLOCK;
- else if (S_ISCHR(inode->i_mode))
+ else /* S_ISCHR by the test above */
type = DEVCG_DEV_CHAR;
- else
- return 0;
if (mask & MAY_WRITE)
access |= DEVCG_ACC_WRITE;
diff --git a/include/linux/file.h b/include/linux/file.h
index 302f11355b10..af1768d934a0 100644
--- a/include/linux/file.h
+++ b/include/linux/file.h
@@ -59,7 +59,7 @@ static inline struct fd CLONED_FD(struct file *f)
static inline void fdput(struct fd fd)
{
- if (fd.word & FDPUT_FPUT)
+ if (unlikely(fd.word & FDPUT_FPUT))
fput(fd_file(fd));
}
diff --git a/include/linux/fs.h b/include/linux/fs.h
index 9596df560c0f..9120fd0759cd 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -866,6 +866,11 @@ static inline void inode_lock(struct inode *inode)
down_write(&inode->i_rwsem);
}
+static inline __must_check int inode_lock_killable(struct inode *inode)
+{
+ return down_write_killable(&inode->i_rwsem);
+}
+
static inline void inode_unlock(struct inode *inode)
{
up_write(&inode->i_rwsem);
@@ -876,6 +881,11 @@ static inline void inode_lock_shared(struct inode *inode)
down_read(&inode->i_rwsem);
}
+static inline __must_check int inode_lock_shared_killable(struct inode *inode)
+{
+ return down_read_killable(&inode->i_rwsem);
+}
+
static inline void inode_unlock_shared(struct inode *inode)
{
up_read(&inode->i_rwsem);
@@ -2070,8 +2080,18 @@ typedef bool (*filldir_t)(struct dir_context *, const char *, int, loff_t, u64,
struct dir_context {
filldir_t actor;
loff_t pos;
+ /*
+ * Filesystems MUST NOT MODIFY count, but may use as a hint:
+ * 0 unknown
+ * > 0 space in buffer (assume at least one entry)
+ * INT_MAX unlimited
+ */
+ int count;
};
+/* If OR-ed with d_type, pending signals are not checked */
+#define FILLDIR_FLAG_NOINTR 0x1000
+
/*
* These flags let !MMU mmap() govern direct device mapping vs immediate
* copying more easily for MAP_PRIVATE, especially for ROM filesystems.
@@ -2343,6 +2363,7 @@ struct super_operations {
#define S_CASEFOLD (1 << 15) /* Casefolded file */
#define S_VERITY (1 << 16) /* Verity file (using fs/verity/) */
#define S_KERNEL_FILE (1 << 17) /* File is in use by the kernel (eg. fs/cachefiles) */
+#define S_ANON_INODE (1 << 19) /* Inode is an anonymous inode */
/*
* Note that nosuid etc flags are inode-specific: setting some file-system
@@ -2399,6 +2420,7 @@ static inline bool sb_rdonly(const struct super_block *sb) { return sb->s_flags
#define IS_WHITEOUT(inode) (S_ISCHR(inode->i_mode) && \
(inode)->i_rdev == WHITEOUT_DEV)
+#define IS_ANON_FILE(inode) ((inode)->i_flags & S_ANON_INODE)
static inline bool HAS_UNMAPPED_ID(struct mnt_idmap *idmap,
struct inode *inode)
diff --git a/include/linux/fs_parser.h b/include/linux/fs_parser.h
index 53e566efd5fd..5a0e897cae80 100644
--- a/include/linux/fs_parser.h
+++ b/include/linux/fs_parser.h
@@ -87,14 +87,9 @@ extern int lookup_constant(const struct constant_table tbl[], const char *name,
extern const struct constant_table bool_names[];
#ifdef CONFIG_VALIDATE_FS_PARSER
-extern bool validate_constant_table(const struct constant_table *tbl, size_t tbl_size,
- int low, int high, int special);
extern bool fs_validate_description(const char *name,
const struct fs_parameter_spec *desc);
#else
-static inline bool validate_constant_table(const struct constant_table *tbl, size_t tbl_size,
- int low, int high, int special)
-{ return true; }
static inline bool fs_validate_description(const char *name,
const struct fs_parameter_spec *desc)
{ return true; }
@@ -125,8 +120,6 @@ static inline bool fs_validate_description(const char *name,
#define fsparam_u32(NAME, OPT) __fsparam(fs_param_is_u32, NAME, OPT, 0, NULL)
#define fsparam_u32oct(NAME, OPT) \
__fsparam(fs_param_is_u32, NAME, OPT, 0, (void *)8)
-#define fsparam_u32hex(NAME, OPT) \
- __fsparam(fs_param_is_u32_hex, NAME, OPT, 0, (void *)16)
#define fsparam_s32(NAME, OPT) __fsparam(fs_param_is_s32, NAME, OPT, 0, NULL)
#define fsparam_u64(NAME, OPT) __fsparam(fs_param_is_u64, NAME, OPT, 0, NULL)
#define fsparam_enum(NAME, OPT, array) __fsparam(fs_param_is_enum, NAME, OPT, 0, array)