From 802182490445f6bcf5de0e0518fb967c2afb6da1 Mon Sep 17 00:00:00 2001 From: Christian Brauner Date: Tue, 20 Jan 2026 15:52:35 +0100 Subject: pidfs: convert rb-tree to rhashtable Mateusz reported performance penalties [1] during task creation because pidfs uses pidmap_lock to add elements into the rbtree. Switch to an rhashtable to have separate fine-grained locking and to decouple from pidmap_lock moving all heavy manipulations outside of it. Convert the pidfs inode-to-pid mapping from an rb-tree with seqcount protection to an rhashtable. This removes the global pidmap_lock contention from pidfs_ino_get_pid() lookups and allows the hashtable insert to happen outside the pidmap_lock. pidfs_add_pid() is split. pidfs_prepare_pid() allocates inode number and initializes pid fields and is called inside pidmap_lock. pidfs_add_pid() inserts pid into rhashtable and is called outside pidmap_lock. Insertion into the rhashtable can fail and memory allocation may happen so we need to drop the spinlock. To guard against accidently opening an already reaped task pidfs_ino_get_pid() uses additional checks beyond pid_vnr(). If pid->attr is PIDFS_PID_DEAD or NULL the pid either never had a pidfd or it already went through pidfs_exit() aka the process as already reaped. If pid->attr is valid check PIDFS_ATTR_BIT_EXIT to figure out whether the task has exited. This slightly changes visibility semantics: pidfd creation is denied after pidfs_exit() runs, which is just before the pid number is removed from the via free_pid(). That should not be an issue though. Link: https://lore.kernel.org/20251206131955.780557-1-mjguzik@gmail.com [1] Link: https://patch.msgid.link/20260120-work-pidfs-rhashtable-v2-1-d593c4d0f576@kernel.org Reviewed-by: Jan Kara Signed-off-by: Christian Brauner --- kernel/pid.c | 13 +++++++++---- 1 file changed, 9 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/pid.c b/kernel/pid.c index f45ae56db7da..06356e40ac00 100644 --- a/kernel/pid.c +++ b/kernel/pid.c @@ -43,7 +43,6 @@ #include #include #include -#include #include #include @@ -85,7 +84,6 @@ struct pid_namespace init_pid_ns = { EXPORT_SYMBOL_GPL(init_pid_ns); static __cacheline_aligned_in_smp DEFINE_SPINLOCK(pidmap_lock); -seqcount_spinlock_t pidmap_lock_seq = SEQCNT_SPINLOCK_ZERO(pidmap_lock_seq, &pidmap_lock); void put_pid(struct pid *pid) { @@ -141,9 +139,9 @@ void free_pid(struct pid *pid) idr_remove(&ns->idr, upid->nr); } - pidfs_remove_pid(pid); spin_unlock(&pidmap_lock); + pidfs_remove_pid(pid); call_rcu(&pid->rcu, delayed_put_pid); } @@ -316,7 +314,8 @@ struct pid *alloc_pid(struct pid_namespace *ns, pid_t *arg_set_tid, retval = -ENOMEM; if (unlikely(!(ns->pid_allocated & PIDNS_ADDING))) goto out_free; - pidfs_add_pid(pid); + pidfs_prepare_pid(pid); + for (upid = pid->numbers + ns->level; upid >= pid->numbers; --upid) { /* Make the PID visible to find_pid_ns. */ idr_replace(&upid->ns->idr, pid, upid->nr); @@ -326,6 +325,12 @@ struct pid *alloc_pid(struct pid_namespace *ns, pid_t *arg_set_tid, idr_preload_end(); ns_ref_active_get(ns); + retval = pidfs_add_pid(pid); + if (unlikely(retval)) { + free_pid(pid); + pid = ERR_PTR(-ENOMEM); + } + return pid; out_free: -- cgit v1.2.3