diff options
| author | Linus Torvalds <torvalds@linux-foundation.org> | 2026-02-16 12:37:13 -0800 |
|---|---|---|
| committer | Linus Torvalds <torvalds@linux-foundation.org> | 2026-02-16 12:37:13 -0800 |
| commit | 543b9b63394ee67ecf5298fe42cbe65b21a16eac (patch) | |
| tree | ddae1ef25280398551cf0077799e0a51dc3363b4 | |
| parent | 57d76ceccee4b497eb835831206b50e72915a501 (diff) | |
| parent | 3673dd3c7dc1f37baf0448164d323d7c7a44d1da (diff) | |
Merge tag 'kernel-7.0-rc1.misc' of git://git.kernel.org/pub/scm/linux/kernel/git/vfs/vfs
Pull pidfs updates from Christian Brauner:
- pid: introduce task_ppid_vnr() helper
- pidfs: convert rb-tree to rhashtable
Mateusz reported performance penalties during task creation because
pidfs uses pidmap_lock to add elements into the rbtree. Switch to an
rhashtable to have separate fine-grained locking and to decouple from
pidmap_lock, moving all heavy manipulations outside of it.
Also move inode allocation outside of pidmap_lock. With this, there's
nothing happening for pidfs under pidmap_lock.
- pid: reorder fields in pid_namespace to reduce false sharing
- Revert "pid: make __task_pid_nr_ns(ns => NULL) safe for zombie
callers"
- ipc: Add SPDX license id to mqueue.c
* tag 'kernel-7.0-rc1.misc' of git://git.kernel.org/pub/scm/linux/kernel/git/vfs/vfs:
pid: introduce task_ppid_vnr() helper
pidfs: implement ino allocation without the pidmap lock
Revert "pid: make __task_pid_nr_ns(ns => NULL) safe for zombie callers"
pid: reorder fields in pid_namespace to reduce false sharing
pidfs: convert rb-tree to rhashtable
ipc: Add SPDX license id to mqueue.c
| -rw-r--r-- | fs/pidfs.c | 174 | ||||
| -rw-r--r-- | include/linux/pid.h | 9 | ||||
| -rw-r--r-- | include/linux/pid_namespace.h | 14 | ||||
| -rw-r--r-- | include/linux/pidfs.h | 3 | ||||
| -rw-r--r-- | ipc/mqueue.c | 3 | ||||
| -rw-r--r-- | kernel/pid.c | 15 |
6 files changed, 121 insertions, 97 deletions
diff --git a/fs/pidfs.c b/fs/pidfs.c index 1e20e36e0ed5..3ffa5e4707de 100644 --- a/fs/pidfs.c +++ b/fs/pidfs.c @@ -21,7 +21,9 @@ #include <linux/utsname.h> #include <net/net_namespace.h> #include <linux/coredump.h> +#include <linux/rhashtable.h> #include <linux/xattr.h> +#include <linux/cookie.h> #include "internal.h" #include "mount.h" @@ -55,9 +57,48 @@ struct pidfs_attr { __u32 coredump_signal; }; -static struct rb_root pidfs_ino_tree = RB_ROOT; +static struct rhashtable pidfs_ino_ht; + +static const struct rhashtable_params pidfs_ino_ht_params = { + .key_offset = offsetof(struct pid, ino), + .key_len = sizeof(u64), + .head_offset = offsetof(struct pid, pidfs_hash), + .automatic_shrinking = true, +}; + +/* + * inode number handling + * + * On 64 bit nothing special happens. The 64bit number assigned + * to struct pid is the inode number. + * + * On 32 bit the 64 bit number assigned to struct pid is split + * into two 32 bit numbers. The lower 32 bits are used as the + * inode number and the upper 32 bits are used as the inode + * generation number. + * + * On 32 bit pidfs_ino() will return the lower 32 bit. When + * pidfs_ino() returns zero a wrap around happened. When a + * wraparound happens the 64 bit number will be incremented by 1 + * so inode numbering starts at 1 again. + * + * On 64 bit comparing two pidfds is as simple as comparing + * inode numbers. + * + * When a wraparound happens on 32 bit multiple pidfds with the + * same inode number are likely to exist (This isn't a problem + * since before pidfs pidfds used the anonymous inode meaning + * all pidfds had the same inode number.). Userspace can + * reconstruct the 64 bit identifier by retrieving both the + * inode number and the inode generation number to compare or + * use file handles. 
+ */ #if BITS_PER_LONG == 32 + +DEFINE_SPINLOCK(pidfs_ino_lock); +static u64 pidfs_ino_nr = 1; + static inline unsigned long pidfs_ino(u64 ino) { return lower_32_bits(ino); @@ -69,6 +110,18 @@ static inline u32 pidfs_gen(u64 ino) return upper_32_bits(ino); } +static inline u64 pidfs_alloc_ino(void) +{ + u64 ino; + + spin_lock(&pidfs_ino_lock); + if (pidfs_ino(pidfs_ino_nr) == 0) + pidfs_ino_nr++; + ino = pidfs_ino_nr++; + spin_unlock(&pidfs_ino_lock); + return ino; +} + #else /* On 64 bit simply return ino. */ @@ -82,69 +135,47 @@ static inline u32 pidfs_gen(u64 ino) { return 0; } -#endif -static int pidfs_ino_cmp(struct rb_node *a, const struct rb_node *b) -{ - struct pid *pid_a = rb_entry(a, struct pid, pidfs_node); - struct pid *pid_b = rb_entry(b, struct pid, pidfs_node); - u64 pid_ino_a = pid_a->ino; - u64 pid_ino_b = pid_b->ino; - - if (pid_ino_a < pid_ino_b) - return -1; - if (pid_ino_a > pid_ino_b) - return 1; - return 0; -} +DEFINE_COOKIE(pidfs_ino_cookie); -void pidfs_add_pid(struct pid *pid) +static u64 pidfs_alloc_ino(void) { - static u64 pidfs_ino_nr = 2; + u64 ino; - /* - * On 64 bit nothing special happens. The 64bit number assigned - * to struct pid is the inode number. - * - * On 32 bit the 64 bit number assigned to struct pid is split - * into two 32 bit numbers. The lower 32 bits are used as the - * inode number and the upper 32 bits are used as the inode - * generation number. - * - * On 32 bit pidfs_ino() will return the lower 32 bit. When - * pidfs_ino() returns zero a wrap around happened. When a - * wraparound happens the 64 bit number will be incremented by 2 - * so inode numbering starts at 2 again. - * - * On 64 bit comparing two pidfds is as simple as comparing - * inode numbers. - * - * When a wraparound happens on 32 bit multiple pidfds with the - * same inode number are likely to exist (This isn't a problem - * since before pidfs pidfds used the anonymous inode meaning - * all pidfds had the same inode number.). 
Userspace can - * reconstruct the 64 bit identifier by retrieving both the - * inode number and the inode generation number to compare or - * use file handles. - */ - if (pidfs_ino(pidfs_ino_nr) == 0) - pidfs_ino_nr += 2; + preempt_disable(); + ino = gen_cookie_next(&pidfs_ino_cookie); + preempt_enable(); - pid->ino = pidfs_ino_nr; + VFS_WARN_ON_ONCE(ino < 1); + return ino; +} + +#endif + +void pidfs_prepare_pid(struct pid *pid) +{ pid->stashed = NULL; pid->attr = NULL; - pidfs_ino_nr++; + pid->ino = 0; +} - write_seqcount_begin(&pidmap_lock_seq); - rb_find_add_rcu(&pid->pidfs_node, &pidfs_ino_tree, pidfs_ino_cmp); - write_seqcount_end(&pidmap_lock_seq); +int pidfs_add_pid(struct pid *pid) +{ + int ret; + + pid->ino = pidfs_alloc_ino(); + ret = rhashtable_insert_fast(&pidfs_ino_ht, &pid->pidfs_hash, + pidfs_ino_ht_params); + if (unlikely(ret)) + pid->ino = 0; + return ret; } void pidfs_remove_pid(struct pid *pid) { - write_seqcount_begin(&pidmap_lock_seq); - rb_erase(&pid->pidfs_node, &pidfs_ino_tree); - write_seqcount_end(&pidmap_lock_seq); + if (likely(pid->ino)) + rhashtable_remove_fast(&pidfs_ino_ht, &pid->pidfs_hash, + pidfs_ino_ht_params); } void pidfs_free_pid(struct pid *pid) @@ -415,7 +446,7 @@ static long pidfd_info(struct file *file, unsigned int cmd, unsigned long arg) * the fields are set correctly, or return ESRCH to avoid providing * incomplete information. 
*/ - kinfo.ppid = task_ppid_nr_ns(task, NULL); + kinfo.ppid = task_ppid_vnr(task); kinfo.tgid = task_tgid_vnr(task); kinfo.pid = task_pid_vnr(task); kinfo.mask |= PIDFD_INFO_PID; @@ -791,42 +822,24 @@ static int pidfs_encode_fh(struct inode *inode, u32 *fh, int *max_len, return FILEID_KERNFS; } -static int pidfs_ino_find(const void *key, const struct rb_node *node) -{ - const u64 pid_ino = *(u64 *)key; - const struct pid *pid = rb_entry(node, struct pid, pidfs_node); - - if (pid_ino < pid->ino) - return -1; - if (pid_ino > pid->ino) - return 1; - return 0; -} - /* Find a struct pid based on the inode number. */ static struct pid *pidfs_ino_get_pid(u64 ino) { struct pid *pid; - struct rb_node *node; - unsigned int seq; + struct pidfs_attr *attr; guard(rcu)(); - do { - seq = read_seqcount_begin(&pidmap_lock_seq); - node = rb_find_rcu(&ino, &pidfs_ino_tree, pidfs_ino_find); - if (node) - break; - } while (read_seqcount_retry(&pidmap_lock_seq, seq)); - - if (!node) + pid = rhashtable_lookup(&pidfs_ino_ht, &ino, pidfs_ino_ht_params); + if (!pid) + return NULL; + attr = READ_ONCE(pid->attr); + if (IS_ERR_OR_NULL(attr)) + return NULL; + if (test_bit(PIDFS_ATTR_BIT_EXIT, &attr->attr_mask)) return NULL; - - pid = rb_entry(node, struct pid, pidfs_node); - /* Within our pid namespace hierarchy? 
*/ if (pid_vnr(pid) == 0) return NULL; - return get_pid(pid); } @@ -1104,6 +1117,9 @@ struct file *pidfs_alloc_file(struct pid *pid, unsigned int flags) void __init pidfs_init(void) { + if (rhashtable_init(&pidfs_ino_ht, &pidfs_ino_ht_params)) + panic("Failed to initialize pidfs hashtable"); + pidfs_attr_cachep = kmem_cache_create("pidfs_attr_cache", sizeof(struct pidfs_attr), 0, (SLAB_HWCACHE_ALIGN | SLAB_RECLAIM_ACCOUNT | SLAB_ACCOUNT | SLAB_PANIC), NULL); diff --git a/include/linux/pid.h b/include/linux/pid.h index 003a1027d219..ddaef0bbc8ba 100644 --- a/include/linux/pid.h +++ b/include/linux/pid.h @@ -6,6 +6,7 @@ #include <linux/rculist.h> #include <linux/rcupdate.h> #include <linux/refcount.h> +#include <linux/rhashtable-types.h> #include <linux/sched.h> #include <linux/wait.h> @@ -60,7 +61,7 @@ struct pid { spinlock_t lock; struct { u64 ino; - struct rb_node pidfs_node; + struct rhash_head pidfs_hash; struct dentry *stashed; struct pidfs_attr *attr; }; @@ -73,7 +74,6 @@ struct pid { struct upid numbers[]; }; -extern seqcount_spinlock_t pidmap_lock_seq; extern struct pid init_struct_pid; struct file; @@ -310,6 +310,11 @@ static inline pid_t task_ppid_nr_ns(const struct task_struct *tsk, struct pid_na return pid; } +static inline pid_t task_ppid_vnr(const struct task_struct *tsk) +{ + return task_ppid_nr_ns(tsk, NULL); +} + static inline pid_t task_ppid_nr(const struct task_struct *tsk) { return task_ppid_nr_ns(tsk, &init_pid_ns); diff --git a/include/linux/pid_namespace.h b/include/linux/pid_namespace.h index 0e7ae12c96d2..b20baaa7e62b 100644 --- a/include/linux/pid_namespace.h +++ b/include/linux/pid_namespace.h @@ -27,6 +27,13 @@ struct pid_namespace { struct idr idr; struct rcu_head rcu; unsigned int pid_allocated; +#ifdef CONFIG_SYSCTL +#if defined(CONFIG_MEMFD_CREATE) + int memfd_noexec_scope; +#endif + struct ctl_table_set set; + struct ctl_table_header *sysctls; +#endif struct task_struct *child_reaper; struct kmem_cache *pid_cachep; unsigned int 
level; @@ -40,13 +47,6 @@ struct pid_namespace { int reboot; /* group exit code if this pidns was rebooted */ struct ns_common ns; struct work_struct work; -#ifdef CONFIG_SYSCTL - struct ctl_table_set set; - struct ctl_table_header *sysctls; -#if defined(CONFIG_MEMFD_CREATE) - int memfd_noexec_scope; -#endif -#endif } __randomize_layout; extern struct pid_namespace init_pid_ns; diff --git a/include/linux/pidfs.h b/include/linux/pidfs.h index 3e08c33da2df..416bdff4d6ce 100644 --- a/include/linux/pidfs.h +++ b/include/linux/pidfs.h @@ -6,7 +6,8 @@ struct coredump_params; struct file *pidfs_alloc_file(struct pid *pid, unsigned int flags); void __init pidfs_init(void); -void pidfs_add_pid(struct pid *pid); +void pidfs_prepare_pid(struct pid *pid); +int pidfs_add_pid(struct pid *pid); void pidfs_remove_pid(struct pid *pid); void pidfs_exit(struct task_struct *tsk); #ifdef CONFIG_COREDUMP diff --git a/ipc/mqueue.c b/ipc/mqueue.c index 53a58f9ba01f..bb7c9e5d2b90 100644 --- a/ipc/mqueue.c +++ b/ipc/mqueue.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0 /* * POSIX message queues filesystem for Linux. * @@ -9,8 +10,6 @@ * Manfred Spraul (manfred@colorfullife.com) * * Audit: George Wilson (ltcgcw@us.ibm.com) - * - * This file is released under the GPL. 
*/ #include <linux/capability.h> diff --git a/kernel/pid.c b/kernel/pid.c index f45ae56db7da..3b96571d0fe6 100644 --- a/kernel/pid.c +++ b/kernel/pid.c @@ -43,7 +43,6 @@ #include <linux/sched/task.h> #include <linux/idr.h> #include <linux/pidfs.h> -#include <linux/seqlock.h> #include <net/sock.h> #include <uapi/linux/pidfd.h> @@ -85,7 +84,6 @@ struct pid_namespace init_pid_ns = { EXPORT_SYMBOL_GPL(init_pid_ns); static __cacheline_aligned_in_smp DEFINE_SPINLOCK(pidmap_lock); -seqcount_spinlock_t pidmap_lock_seq = SEQCNT_SPINLOCK_ZERO(pidmap_lock_seq, &pidmap_lock); void put_pid(struct pid *pid) { @@ -141,9 +139,9 @@ void free_pid(struct pid *pid) idr_remove(&ns->idr, upid->nr); } - pidfs_remove_pid(pid); spin_unlock(&pidmap_lock); + pidfs_remove_pid(pid); call_rcu(&pid->rcu, delayed_put_pid); } @@ -200,6 +198,7 @@ struct pid *alloc_pid(struct pid_namespace *ns, pid_t *arg_set_tid, INIT_HLIST_HEAD(&pid->tasks[type]); init_waitqueue_head(&pid->wait_pidfd); INIT_HLIST_HEAD(&pid->inodes); + pidfs_prepare_pid(pid); /* * 2. perm check checkpoint_restore_ns_capable() @@ -316,7 +315,6 @@ struct pid *alloc_pid(struct pid_namespace *ns, pid_t *arg_set_tid, retval = -ENOMEM; if (unlikely(!(ns->pid_allocated & PIDNS_ADDING))) goto out_free; - pidfs_add_pid(pid); for (upid = pid->numbers + ns->level; upid >= pid->numbers; --upid) { /* Make the PID visible to find_pid_ns. 
*/ idr_replace(&upid->ns->idr, pid, upid->nr); @@ -326,6 +324,12 @@ struct pid *alloc_pid(struct pid_namespace *ns, pid_t *arg_set_tid, idr_preload_end(); ns_ref_active_get(ns); + retval = pidfs_add_pid(pid); + if (unlikely(retval)) { + free_pid(pid); + pid = ERR_PTR(-ENOMEM); + } + return pid; out_free: @@ -554,8 +558,7 @@ pid_t __task_pid_nr_ns(struct task_struct *task, enum pid_type type, rcu_read_lock(); if (!ns) ns = task_active_pid_ns(current); - if (ns) - nr = pid_nr_ns(rcu_dereference(*task_pid_ptr(task, type)), ns); + nr = pid_nr_ns(rcu_dereference(*task_pid_ptr(task, type)), ns); rcu_read_unlock(); return nr; |
