summary | refs | log | tree | commit | diff
diff options
context:
space:
mode:
author: Mateusz Guzik <mjguzik@gmail.com>, 2026-01-20 19:45:39 +0100
committer: Christian Brauner <brauner@kernel.org>, 2026-02-10 11:39:30 +0100
commit: 87caaeef79950377b616f3ba2265a82742cb9583 (patch)
tree: 3fbb4b2422a16c5eb727ad5db23bb1df39d64842
parent: 03aef0602f22f30aab0e42e7f3169b0a5920c461 (diff)
pidfs: implement ino allocation without the pidmap lock
This paves the way for scalable PID allocation later.

The 32 bit variant merely takes a spinlock for simplicity; the 64 bit variant uses a scalable scheme.

Signed-off-by: Mateusz Guzik <mjguzik@gmail.com>
Link: https://patch.msgid.link/20260120184539.1480930-1-mjguzik@gmail.com
Co-developed-by: Christian Brauner <brauner@kernel.org>
Signed-off-by: Christian Brauner <brauner@kernel.org>
-rw-r--r-- fs/pidfs.c | 113
-rw-r--r-- kernel/pid.c | 3
2 files changed, 73 insertions, 43 deletions
diff --git a/fs/pidfs.c b/fs/pidfs.c
index ee0e36dd29d2..b984d0e95734 100644
--- a/fs/pidfs.c
+++ b/fs/pidfs.c
@@ -23,6 +23,7 @@
#include <linux/coredump.h>
#include <linux/rhashtable.h>
#include <linux/xattr.h>
+#include <linux/cookie.h>
#include "internal.h"
#include "mount.h"
@@ -65,7 +66,39 @@ static const struct rhashtable_params pidfs_ino_ht_params = {
.automatic_shrinking = true,
};
+/*
+ * inode number handling
+ *
+ * On 64 bit nothing special happens. The 64bit number assigned
+ * to struct pid is the inode number.
+ *
+ * On 32 bit the 64 bit number assigned to struct pid is split
+ * into two 32 bit numbers. The lower 32 bits are used as the
+ * inode number and the upper 32 bits are used as the inode
+ * generation number.
+ *
+ * On 32 bit pidfs_ino() will return the lower 32 bit. When
+ * pidfs_ino() returns zero a wrap around happened. When a
+ * wraparound happens the 64 bit number will be incremented by 1
+ * so inode numbering starts at 1 again.
+ *
+ * On 64 bit comparing two pidfds is as simple as comparing
+ * inode numbers.
+ *
+ * When a wraparound happens on 32 bit multiple pidfds with the
+ * same inode number are likely to exist (This isn't a problem
+ * since before pidfs pidfds used the anonymous inode meaning
+ * all pidfds had the same inode number.). Userspace can
+ * reconstruct the 64 bit identifier by retrieving both the
+ * inode number and the inode generation number to compare or
+ * use file handles.
+ */
+
#if BITS_PER_LONG == 32
+
+DEFINE_SPINLOCK(pidfs_ino_lock);
+static u64 pidfs_ino_nr = 1;
+
static inline unsigned long pidfs_ino(u64 ino)
{
return lower_32_bits(ino);
@@ -77,6 +110,18 @@ static inline u32 pidfs_gen(u64 ino)
return upper_32_bits(ino);
}
+static inline u64 pidfs_alloc_ino(void)
+{
+ u64 ino;
+
+ spin_lock(&pidfs_ino_lock);
+ if (pidfs_ino(pidfs_ino_nr) == 0)
+ pidfs_ino_nr++;
+ ino = pidfs_ino_nr++;
+ spin_unlock(&pidfs_ino_lock);
+ return ino;
+}
+
#else
/* On 64 bit simply return ino. */
@@ -90,61 +135,47 @@ static inline u32 pidfs_gen(u64 ino)
{
return 0;
}
-#endif
-/*
- * Allocate inode number and initialize pidfs fields.
- * Called with pidmap_lock held.
- */
-void pidfs_prepare_pid(struct pid *pid)
+DEFINE_COOKIE(pidfs_ino_cookie);
+
+static u64 pidfs_alloc_ino(void)
{
- static u64 pidfs_ino_nr = 2;
+ u64 ino;
- /*
- * On 64 bit nothing special happens. The 64bit number assigned
- * to struct pid is the inode number.
- *
- * On 32 bit the 64 bit number assigned to struct pid is split
- * into two 32 bit numbers. The lower 32 bits are used as the
- * inode number and the upper 32 bits are used as the inode
- * generation number.
- *
- * On 32 bit pidfs_ino() will return the lower 32 bit. When
- * pidfs_ino() returns zero a wrap around happened. When a
- * wraparound happens the 64 bit number will be incremented by 2
- * so inode numbering starts at 2 again.
- *
- * On 64 bit comparing two pidfds is as simple as comparing
- * inode numbers.
- *
- * When a wraparound happens on 32 bit multiple pidfds with the
- * same inode number are likely to exist (This isn't a problem
- * since before pidfs pidfds used the anonymous inode meaning
- * all pidfds had the same inode number.). Userspace can
- * reconstruct the 64 bit identifier by retrieving both the
- * inode number and the inode generation number to compare or
- * use file handles.
- */
- if (pidfs_ino(pidfs_ino_nr) == 0)
- pidfs_ino_nr += 2;
+ preempt_disable();
+ ino = gen_cookie_next(&pidfs_ino_cookie);
+ preempt_enable();
+
+ VFS_WARN_ON_ONCE(ino < 1);
+ return ino;
+}
+
+#endif
- pid->ino = pidfs_ino_nr;
- pid->pidfs_hash.next = NULL;
+void pidfs_prepare_pid(struct pid *pid)
+{
pid->stashed = NULL;
pid->attr = NULL;
- pidfs_ino_nr++;
+ pid->ino = 0;
}
int pidfs_add_pid(struct pid *pid)
{
- return rhashtable_insert_fast(&pidfs_ino_ht, &pid->pidfs_hash,
- pidfs_ino_ht_params);
+ int ret;
+
+ pid->ino = pidfs_alloc_ino();
+ ret = rhashtable_insert_fast(&pidfs_ino_ht, &pid->pidfs_hash,
+ pidfs_ino_ht_params);
+ if (unlikely(ret))
+ pid->ino = 0;
+ return ret;
}
void pidfs_remove_pid(struct pid *pid)
{
- rhashtable_remove_fast(&pidfs_ino_ht, &pid->pidfs_hash,
- pidfs_ino_ht_params);
+ if (likely(pid->ino))
+ rhashtable_remove_fast(&pidfs_ino_ht, &pid->pidfs_hash,
+ pidfs_ino_ht_params);
}
void pidfs_free_pid(struct pid *pid)
diff --git a/kernel/pid.c b/kernel/pid.c
index 06356e40ac00..72c9372b84b8 100644
--- a/kernel/pid.c
+++ b/kernel/pid.c
@@ -198,6 +198,7 @@ struct pid *alloc_pid(struct pid_namespace *ns, pid_t *arg_set_tid,
INIT_HLIST_HEAD(&pid->tasks[type]);
init_waitqueue_head(&pid->wait_pidfd);
INIT_HLIST_HEAD(&pid->inodes);
+ pidfs_prepare_pid(pid);
/*
* 2. perm check checkpoint_restore_ns_capable()
@@ -314,8 +315,6 @@ struct pid *alloc_pid(struct pid_namespace *ns, pid_t *arg_set_tid,
retval = -ENOMEM;
if (unlikely(!(ns->pid_allocated & PIDNS_ADDING)))
goto out_free;
- pidfs_prepare_pid(pid);
-
for (upid = pid->numbers + ns->level; upid >= pid->numbers; --upid) {
/* Make the PID visible to find_pid_ns. */
idr_replace(&upid->ns->idr, pid, upid->nr);