From 75215c972581d3934e76a57690cf838d7ceab399 Mon Sep 17 00:00:00 2001 From: Christian Brauner Date: Wed, 18 Jun 2025 22:53:38 +0200 Subject: pidfs: move to anonymous struct Move the pidfs entries to an anonymous struct. Link: https://lore.kernel.org/20250618-work-pidfs-persistent-v2-4-98f3456fd552@kernel.org Reviewed-by: Alexander Mikhalitsyn Signed-off-by: Christian Brauner --- include/linux/pid.h | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) (limited to 'include') diff --git a/include/linux/pid.h b/include/linux/pid.h index 453ae6d8a68d..00646a692dd4 100644 --- a/include/linux/pid.h +++ b/include/linux/pid.h @@ -52,14 +52,15 @@ struct upid { struct pid_namespace *ns; }; -struct pid -{ +struct pid { refcount_t count; unsigned int level; spinlock_t lock; - struct dentry *stashed; - u64 ino; - struct rb_node pidfs_node; + struct { + u64 ino; + struct rb_node pidfs_node; + struct dentry *stashed; + }; /* lists of tasks that use this pid */ struct hlist_head tasks[PIDTYPE_MAX]; struct hlist_head inodes; -- cgit v1.2.3 From 8ec7c826d97b390879df2a03dfb035c70af86779 Mon Sep 17 00:00:00 2001 From: Christian Brauner Date: Wed, 18 Jun 2025 22:53:39 +0200 Subject: pidfs: persist information Persist exit and coredump information independent of whether anyone currently holds a pidfd for the struct pid. The current scheme allocated pidfs dentries on-demand repeatedly. This scheme is reaching it's limits as it makes it impossible to pin information that needs to be available after the task has exited or coredumped and that should not be lost simply because the pidfd got closed temporarily. The next opener should still see the stashed information. This is also a prerequisite for supporting extended attributes on pidfds to allow attaching meta information to them. If someone opens a pidfd for a struct pid a pidfs dentry is allocated and stashed in pid->stashed. Once the last pidfd for the struct pid is closed the pidfs dentry is released and removed from pid->stashed. So if 10 callers create a pidfs dentry for the same struct pid sequentially, i.e., each closing the pidfd before the other creates a new one then a new pidfs dentry is allocated every time. Because multiple tasks acquiring and releasing a pidfd for the same struct pid can race with each another a task may still find a valid pidfs entry from the previous task in pid->stashed and reuse it. Or it might find a dead dentry in there and fail to reuse it and so stashes a new pidfs dentry. Multiple tasks may race to stash a new pidfs dentry but only one will succeed, the other ones will put their dentry. The current scheme aims to ensure that a pidfs dentry for a struct pid can only be created if the task is still alive or if a pidfs dentry already existed before the task was reaped and so exit information has been was stashed in the pidfs inode. That's great except that it's buggy. If a pidfs dentry is stashed in pid->stashed after pidfs_exit() but before __unhash_process() is called we will return a pidfd for a reaped task without exit information being available. The pidfds_pid_valid() check does not guard against this race as it doens't sync at all with pidfs_exit(). The pid_has_task() check might be successful simply because we're before __unhash_process() but after pidfs_exit(). Introduce a new scheme where the lifetime of information associated with a pidfs entry (coredump and exit information) isn't bound to the lifetime of the pidfs inode but the struct pid itself. The first time a pidfs dentry is allocated for a struct pid a struct pidfs_attr will be allocated which will be used to store exit and coredump information. If all pidfs for the pidfs dentry are closed the dentry and inode can be cleaned up but the struct pidfs_attr will stick until the struct pid itself is freed. This will ensure minimal memory usage while persisting relevant information. The new scheme has various advantages. First, it allows to close the race where we end up handing out a pidfd for a reaped task for which no exit information is available. Second, it minimizes memory usage. Third, it allows to remove complex lifetime tracking via dentries when registering a struct pid with pidfs. There's no need to get or put a reference. Instead, the lifetime of exit and coredump information associated with a struct pid is bound to the lifetime of struct pid itself. Link: https://lore.kernel.org/20250618-work-pidfs-persistent-v2-5-98f3456fd552@kernel.org Reviewed-by: Alexander Mikhalitsyn Signed-off-by: Christian Brauner --- include/linux/pid.h | 3 +++ include/linux/pidfs.h | 1 + 2 files changed, 4 insertions(+) (limited to 'include') diff --git a/include/linux/pid.h b/include/linux/pid.h index 00646a692dd4..003a1027d219 100644 --- a/include/linux/pid.h +++ b/include/linux/pid.h @@ -47,6 +47,8 @@ #define RESERVED_PIDS 300 +struct pidfs_attr; + struct upid { int nr; struct pid_namespace *ns; @@ -60,6 +62,7 @@ struct pid { u64 ino; struct rb_node pidfs_node; struct dentry *stashed; + struct pidfs_attr *attr; }; /* lists of tasks that use this pid */ struct hlist_head tasks[PIDTYPE_MAX]; diff --git a/include/linux/pidfs.h b/include/linux/pidfs.h index 77e7db194914..8f6ed59bb3fb 100644 --- a/include/linux/pidfs.h +++ b/include/linux/pidfs.h @@ -16,5 +16,6 @@ extern const struct dentry_operations pidfs_dentry_operations; int pidfs_register_pid(struct pid *pid); void pidfs_get_pid(struct pid *pid); void pidfs_put_pid(struct pid *pid); +void pidfs_free_pid(struct pid *pid); #endif /* _LINUX_PID_FS_H */ -- cgit v1.2.3 From 804d6794497e6f3992d156e07d01e22b037ce09e Mon Sep 17 00:00:00 2001 From: Christian Brauner Date: Wed, 18 Jun 2025 22:53:42 +0200 Subject: pidfs: remove pidfs_{get,put}_pid() Now that we stash persistent information in struct pid there's no need to play volatile games with pinning struct pid via dentries in pidfs. Link: https://lore.kernel.org/20250618-work-pidfs-persistent-v2-8-98f3456fd552@kernel.org Reviewed-by: Alexander Mikhalitsyn Signed-off-by: Christian Brauner --- include/linux/pidfs.h | 2 -- 1 file changed, 2 deletions(-) (limited to 'include') diff --git a/include/linux/pidfs.h b/include/linux/pidfs.h index 8f6ed59bb3fb..3e08c33da2df 100644 --- a/include/linux/pidfs.h +++ b/include/linux/pidfs.h @@ -14,8 +14,6 @@ void pidfs_coredump(const struct coredump_params *cprm); #endif extern const struct dentry_operations pidfs_dentry_operations; int pidfs_register_pid(struct pid *pid); -void pidfs_get_pid(struct pid *pid); -void pidfs_put_pid(struct pid *pid); void pidfs_free_pid(struct pid *pid); #endif /* _LINUX_PID_FS_H */ -- cgit v1.2.3 From a4c746f06853f91d3759ae8aca514d135b6aa56d Mon Sep 17 00:00:00 2001 From: Christian Brauner Date: Tue, 24 Jun 2025 15:48:42 +0200 Subject: uapi/fcntl: mark range as reserved Mark the range from -10000 to -40000 as a range reserved for special in-kernel values. Move the PIDFD_SELF_*/PIDFD_THREAD_* sentinels over so all the special values are in one place. Link: https://lore.kernel.org/20250624-work-pidfs-fhandle-v2-6-d02a04858fe3@kernel.org Reviewed-by: Jan Kara Reviewed-by: Amir Goldstein Signed-off-by: Christian Brauner --- include/uapi/linux/fcntl.h | 16 ++++++++++++++++ include/uapi/linux/pidfd.h | 15 --------------- 2 files changed, 16 insertions(+), 15 deletions(-) (limited to 'include') diff --git a/include/uapi/linux/fcntl.h b/include/uapi/linux/fcntl.h index a15ac2fa4b20..ed02d8ae0667 100644 --- a/include/uapi/linux/fcntl.h +++ b/include/uapi/linux/fcntl.h @@ -90,10 +90,26 @@ #define DN_ATTRIB 0x00000020 /* File changed attibutes */ #define DN_MULTISHOT 0x80000000 /* Don't remove notifier */ +/* Reserved kernel ranges [-100], [-10000, -40000]. */ #define AT_FDCWD -100 /* Special value for dirfd used to indicate openat should use the current working directory. */ +/* + * The concept of process and threads in userland and the kernel is a confusing + * one - within the kernel every thread is a 'task' with its own individual PID, + * however from userland's point of view threads are grouped by a single PID, + * which is that of the 'thread group leader', typically the first thread + * spawned. + * + * To cut the Gideon knot, for internal kernel usage, we refer to + * PIDFD_SELF_THREAD to refer to the current thread (or task from a kernel + * perspective), and PIDFD_SELF_THREAD_GROUP to refer to the current thread + * group leader... + */ +#define PIDFD_SELF_THREAD -10000 /* Current thread. */ +#define PIDFD_SELF_THREAD_GROUP -20000 /* Current thread group leader. */ + /* Generic flags for the *at(2) family of syscalls. */ diff --git a/include/uapi/linux/pidfd.h b/include/uapi/linux/pidfd.h index c27a4e238e4b..957db425d459 100644 --- a/include/uapi/linux/pidfd.h +++ b/include/uapi/linux/pidfd.h @@ -42,21 +42,6 @@ #define PIDFD_COREDUMP_USER (1U << 2) /* coredump was done as the user. */ #define PIDFD_COREDUMP_ROOT (1U << 3) /* coredump was done as root. */ -/* - * The concept of process and threads in userland and the kernel is a confusing - * one - within the kernel every thread is a 'task' with its own individual PID, - * however from userland's point of view threads are grouped by a single PID, - * which is that of the 'thread group leader', typically the first thread - * spawned. - * - * To cut the Gideon knot, for internal kernel usage, we refer to - * PIDFD_SELF_THREAD to refer to the current thread (or task from a kernel - * perspective), and PIDFD_SELF_THREAD_GROUP to refer to the current thread - * group leader... - */ -#define PIDFD_SELF_THREAD -10000 /* Current thread. */ -#define PIDFD_SELF_THREAD_GROUP -20000 /* Current thread group leader. */ - /* * ...and for userland we make life simpler - PIDFD_SELF refers to the current * thread, PIDFD_SELF_PROCESS refers to the process thread group leader. -- cgit v1.2.3 From 67fcec2919e4ed31ab845eb456ad7d6f1e85505c Mon Sep 17 00:00:00 2001 From: Christian Brauner Date: Tue, 24 Jun 2025 15:48:49 +0200 Subject: fcntl/pidfd: redefine PIDFD_SELF_THREAD_GROUP Don't jump somewhere into the middle of the reserved range. We're still able to change that value it won't be that widely used yet. If not, we can revert. Signed-off-by: Christian Brauner --- include/uapi/linux/fcntl.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include') diff --git a/include/uapi/linux/fcntl.h b/include/uapi/linux/fcntl.h index ed02d8ae0667..ba4a698d2f33 100644 --- a/include/uapi/linux/fcntl.h +++ b/include/uapi/linux/fcntl.h @@ -108,7 +108,7 @@ * group leader... */ #define PIDFD_SELF_THREAD -10000 /* Current thread. */ -#define PIDFD_SELF_THREAD_GROUP -20000 /* Current thread group leader. */ +#define PIDFD_SELF_THREAD_GROUP -10001 /* Current thread group leader. */ /* Generic flags for the *at(2) family of syscalls. */ -- cgit v1.2.3 From cd5d2006327b6d8488612cb8c03ad7304417c8f2 Mon Sep 17 00:00:00 2001 From: Christian Brauner Date: Tue, 24 Jun 2025 10:29:10 +0200 Subject: uapi/fcntl: add FD_INVALID Add a marker for an invalid file descriptor. Link: https://lore.kernel.org/20250624-work-pidfs-fhandle-v2-7-d02a04858fe3@kernel.org Reviewed-by: Jan Kara Reviewed-by: Amir Goldstein Signed-off-by: Christian Brauner --- include/uapi/linux/fcntl.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include') diff --git a/include/uapi/linux/fcntl.h b/include/uapi/linux/fcntl.h index ba4a698d2f33..a5bebe7c4400 100644 --- a/include/uapi/linux/fcntl.h +++ b/include/uapi/linux/fcntl.h @@ -110,6 +110,7 @@ #define PIDFD_SELF_THREAD -10000 /* Current thread. */ #define PIDFD_SELF_THREAD_GROUP -10001 /* Current thread group leader. */ +#define FD_INVALID -10009 /* Invalid file descriptor: -10000 - EBADF = -10009 */ /* Generic flags for the *at(2) family of syscalls. */ -- cgit v1.2.3 From 3941e37f62fe2c3c8b8675c12183185f20450539 Mon Sep 17 00:00:00 2001 From: Christian Brauner Date: Tue, 24 Jun 2025 16:57:51 +0200 Subject: uapi/fcntl: add FD_PIDFS_ROOT Add a special file descriptor indicating the root of the pidfs filesystem. Signed-off-by: Christian Brauner --- include/uapi/linux/fcntl.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include') diff --git a/include/uapi/linux/fcntl.h b/include/uapi/linux/fcntl.h index a5bebe7c4400..f291ab4f94eb 100644 --- a/include/uapi/linux/fcntl.h +++ b/include/uapi/linux/fcntl.h @@ -110,6 +110,7 @@ #define PIDFD_SELF_THREAD -10000 /* Current thread. */ #define PIDFD_SELF_THREAD_GROUP -10001 /* Current thread group leader. */ +#define FD_PIDFS_ROOT -10002 /* Root of the pidfs filesystem */ #define FD_INVALID -10009 /* Invalid file descriptor: -10000 - EBADF = -10009 */ /* Generic flags for the *at(2) family of syscalls. */ -- cgit v1.2.3 From 2b9996417e4ec231c91818f9ea8107ae62ef75ad Mon Sep 17 00:00:00 2001 From: Alexander Mikhalitsyn Date: Fri, 4 Jul 2025 00:23:08 +0200 Subject: af_unix/scm: fix whitespace errors Fix whitespace/formatting errors. Cc: linux-kernel@vger.kernel.org Cc: netdev@vger.kernel.org Cc: David S. Miller Cc: Eric Dumazet Cc: Jakub Kicinski Cc: Paolo Abeni Cc: Simon Horman Cc: Leon Romanovsky Cc: Arnd Bergmann Cc: Christian Brauner Cc: Kuniyuki Iwashima Cc: Lennart Poettering Cc: Luca Boccassi Cc: David Rheinsberg Signed-off-by: Alexander Mikhalitsyn Link: https://lore.kernel.org/20250703222314.309967-5-aleksandr.mikhalitsyn@canonical.com Reviewed-by: Kuniyuki Iwashima Signed-off-by: Christian Brauner --- include/net/scm.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/include/net/scm.h b/include/net/scm.h index 84c4707e78a5..c52519669349 100644 --- a/include/net/scm.h +++ b/include/net/scm.h @@ -69,7 +69,7 @@ static __inline__ void unix_get_peersec_dgram(struct socket *sock, struct scm_co static __inline__ void scm_set_cred(struct scm_cookie *scm, struct pid *pid, kuid_t uid, kgid_t gid) { - scm->pid = get_pid(pid); + scm->pid = get_pid(pid); scm->creds.pid = pid_vnr(pid); scm->creds.uid = uid; scm->creds.gid = gid; @@ -78,7 +78,7 @@ static __inline__ void scm_set_cred(struct scm_cookie *scm, static __inline__ void scm_destroy_cred(struct scm_cookie *scm) { put_pid(scm->pid); - scm->pid = NULL; + scm->pid = NULL; } static __inline__ void scm_destroy(struct scm_cookie *scm) -- cgit v1.2.3 From a683a5b2ba23598ad343e5ec10a4ef4077497fc9 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Wed, 2 Jul 2025 06:34:37 +0100 Subject: fold fs_struct->{lock,seq} into a seqlock The combination of spinlock_t lock and seqcount_spinlock_t seq in struct fs_struct is an open-coded seqlock_t (see linux/seqlock_types.h). Combine and switch to equivalent seqlock_t primitives. AFAICS, that does end up with the same sequence of underlying operations in all cases. While we are at it, get_fs_pwd() is open-coded verbatim in get_path_from_fd(); rather than applying conversion to it, replace with the call of get_fs_pwd() there. Not worth splitting the commit for that, IMO... A bit of historical background - conversion of seqlock_t to use of seqcount_spinlock_t happened several months after the same had been done to struct fs_struct; switching fs_struct to seqlock_t could've been done immediately after that, but it looks like nobody had gotten around to that until now. Signed-off-by: Al Viro Link: https://lore.kernel.org/20250702053437.GC1880847@ZenIV Acked-by: Ahmed S. Darwish Acked-by: Peter Zijlstra (Intel) Reviewed-by: Christian Brauner Signed-off-by: Christian Brauner --- include/linux/fs_struct.h | 11 +++++------ 1 file changed, 5 insertions(+), 6 deletions(-) (limited to 'include') diff --git a/include/linux/fs_struct.h b/include/linux/fs_struct.h index 783b48dedb72..baf200ab5c77 100644 --- a/include/linux/fs_struct.h +++ b/include/linux/fs_struct.h @@ -8,8 +8,7 @@ struct fs_struct { int users; - spinlock_t lock; - seqcount_spinlock_t seq; + seqlock_t seq; int umask; int in_exec; struct path root, pwd; @@ -26,18 +25,18 @@ extern int unshare_fs_struct(void); static inline void get_fs_root(struct fs_struct *fs, struct path *root) { - spin_lock(&fs->lock); + read_seqlock_excl(&fs->seq); *root = fs->root; path_get(root); - spin_unlock(&fs->lock); + read_sequnlock_excl(&fs->seq); } static inline void get_fs_pwd(struct fs_struct *fs, struct path *pwd) { - spin_lock(&fs->lock); + read_seqlock_excl(&fs->seq); *pwd = fs->pwd; path_get(pwd); - spin_unlock(&fs->lock); + read_sequnlock_excl(&fs->seq); } extern bool current_chrooted(void); -- cgit v1.2.3