From 1b090949c9989a35c74aa2cd7fee6670b79019cd Mon Sep 17 00:00:00 2001 From: Christian Brauner Date: Thu, 3 Apr 2025 16:09:02 +0200 Subject: pidfd: remove unneeded NULL check from pidfd_prepare() None of the caller actually pass a NULL pid in there. Link: https://lore.kernel.org/r/20250403-work-pidfd-fixes-v1-2-a123b6ed6716@kernel.org Reviewed-by: Oleg Nesterov Signed-off-by: Christian Brauner --- kernel/fork.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/fork.c b/kernel/fork.c index c4b26cd8998b..182ec2e9087d 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -2110,7 +2110,7 @@ int pidfd_prepare(struct pid *pid, unsigned int flags, struct file **ret) { bool thread = flags & PIDFD_THREAD; - if (!pid || !pid_has_task(pid, thread ? PIDTYPE_PID : PIDTYPE_TGID)) + if (!pid_has_task(pid, thread ? PIDTYPE_PID : PIDTYPE_TGID)) return -EINVAL; return __pidfd_prepare(pid, flags, ret); -- cgit v1.2.3 From 8cf4b738f6d84fdd8d7ff1e8d0e2298ded3e4153 Mon Sep 17 00:00:00 2001 From: Christian Brauner Date: Thu, 3 Apr 2025 16:09:03 +0200 Subject: pidfd: improve uapi when task isn't found We currently report EINVAL whenever a struct pid has no tasked attached anymore thereby conflating two concepts: (1) The task has already been reaped. (2) The caller requested a pidfd for a thread-group leader but the pid actually references a struct pid that isn't used as a thread-group leader. This is causing issues for non-threaded workloads as in [1]. This patch tries to allow userspace to distinguish between (1) and (2). This is racy of course but that shouldn't matter. Link: https://github.com/systemd/systemd/pull/36982 [1] Link: https://lore.kernel.org/r/20250403-work-pidfd-fixes-v1-3-a123b6ed6716@kernel.org Reviewed-by: Oleg Nesterov Signed-off-by: Christian Brauner --- kernel/fork.c | 23 ++++++++++++++++++++--- 1 file changed, 20 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/fork.c b/kernel/fork.c index 182ec2e9087d..4a2080b968c8 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -2108,10 +2108,27 @@ static int __pidfd_prepare(struct pid *pid, unsigned int flags, struct file **re */ int pidfd_prepare(struct pid *pid, unsigned int flags, struct file **ret) { - bool thread = flags & PIDFD_THREAD; + int err = 0; - if (!pid_has_task(pid, thread ? PIDTYPE_PID : PIDTYPE_TGID)) - return -EINVAL; + if (!(flags & PIDFD_THREAD)) { + /* + * If this is struct pid isn't used as a thread-group + * leader pid but the caller requested to create a + * thread-group leader pidfd then report ENOENT to the + * caller as a hint. + */ + if (!pid_has_task(pid, PIDTYPE_TGID)) + err = -ENOENT; + } + + /* + * If this wasn't a thread-group leader struct pid or the task + * got reaped in the meantime report -ESRCH to userspace. + */ + if (!pid_has_task(pid, PIDTYPE_PID)) + err = -ESRCH; + if (err) + return err; return __pidfd_prepare(pid, flags, ret); } -- cgit v1.2.3 From 35c9701ea717dc548f1fab5bfa286be98c1bade8 Mon Sep 17 00:00:00 2001 From: Christian Brauner Date: Fri, 11 Apr 2025 15:22:44 +0200 Subject: exit: move wake_up_all() pidfd waiters into __unhash_process() Move the pidfd notification out of __change_pid() and into __unhash_process(). The only valid call to __change_pid() with a NULL argument and PIDTYPE_PID is from __unhash_process(). This is a lot more obvious than calling it from __change_pid(). Link: https://lore.kernel.org/20250411-work-pidfs-enoent-v2-1-60b2d3bb545f@kernel.org Reviewed-by: Oleg Nesterov Signed-off-by: Christian Brauner --- kernel/exit.c | 5 +++++ kernel/pid.c | 5 ----- 2 files changed, 5 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/kernel/exit.c b/kernel/exit.c index 1b51dc099f1e..abcd93ce4e18 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -133,8 +133,13 @@ struct release_task_post { static void __unhash_process(struct release_task_post *post, struct task_struct *p, bool group_dead) { + struct pid *pid = task_pid(p); + nr_threads--; + detach_pid(post->pids, p, PIDTYPE_PID); + wake_up_all(&pid->wait_pidfd); + if (group_dead) { detach_pid(post->pids, p, PIDTYPE_TGID); detach_pid(post->pids, p, PIDTYPE_PGID); diff --git a/kernel/pid.c b/kernel/pid.c index 4ac2ce46817f..26f1e136f017 100644 --- a/kernel/pid.c +++ b/kernel/pid.c @@ -359,11 +359,6 @@ static void __change_pid(struct pid **pids, struct task_struct *task, hlist_del_rcu(&task->pid_links[type]); *pid_ptr = new; - if (type == PIDTYPE_PID) { - WARN_ON_ONCE(pid_has_task(pid, PIDTYPE_PID)); - wake_up_all(&pid->wait_pidfd); - } - for (tmp = PIDTYPE_MAX; --tmp >= 0; ) if (pid_has_task(pid, tmp)) return; -- cgit v1.2.3 From 17f1b08acf50c0bfb02e21623e53e7e575612b67 Mon Sep 17 00:00:00 2001 From: Christian Brauner Date: Fri, 11 Apr 2025 15:22:45 +0200 Subject: pidfs: ensure consistent ENOENT/ESRCH reporting In a prior patch series we tried to cleanly differentiate between: (1) The task has already been reaped. (2) The caller requested a pidfd for a thread-group leader but the pid actually references a struct pid that isn't used as a thread-group leader. as this was causing issues for non-threaded workloads. But there's cases where the current simple logic is wrong. Specifically, if the pid was a leader pid and the check races with __unhash_process(). Stabilize this by using the pidfd waitqueue lock. Link: https://lore.kernel.org/20250411-work-pidfs-enoent-v2-2-60b2d3bb545f@kernel.org Reviewed-by: Oleg Nesterov Signed-off-by: Christian Brauner --- kernel/fork.c | 34 ++++++++++++++++------------------ 1 file changed, 16 insertions(+), 18 deletions(-) (limited to 'kernel') diff --git a/kernel/fork.c b/kernel/fork.c index 4a2080b968c8..f7403e1fb0d4 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -2108,28 +2108,26 @@ static int __pidfd_prepare(struct pid *pid, unsigned int flags, struct file **re */ int pidfd_prepare(struct pid *pid, unsigned int flags, struct file **ret) { - int err = 0; - - if (!(flags & PIDFD_THREAD)) { + /* + * While holding the pidfd waitqueue lock removing the task + * linkage for the thread-group leader pid (PIDTYPE_TGID) isn't + * possible. Thus, if there's still task linkage for PIDTYPE_PID + * not having thread-group leader linkage for the pid means it + * wasn't a thread-group leader in the first place. + */ + scoped_guard(spinlock_irq, &pid->wait_pidfd.lock) { + /* Task has already been reaped. */ + if (!pid_has_task(pid, PIDTYPE_PID)) + return -ESRCH; /* - * If this is struct pid isn't used as a thread-group - * leader pid but the caller requested to create a - * thread-group leader pidfd then report ENOENT to the - * caller as a hint. + * If this struct pid isn't used as a thread-group + * leader but the caller requested to create a + * thread-group leader pidfd then report ENOENT. */ - if (!pid_has_task(pid, PIDTYPE_TGID)) - err = -ENOENT; + if (!(flags & PIDFD_THREAD) && !pid_has_task(pid, PIDTYPE_TGID)) + return -ENOENT; } - /* - * If this wasn't a thread-group leader struct pid or the task - * got reaped in the meantime report -ESRCH to userspace. - */ - if (!pid_has_task(pid, PIDTYPE_PID)) - err = -ESRCH; - if (err) - return err; - return __pidfd_prepare(pid, flags, ret); } -- cgit v1.2.3 From 0a36bad01731e71568bdd365764d38b6bd576ab0 Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Fri, 11 Apr 2025 14:18:57 +0200 Subject: release_task: kill the no longer needed get/put_pid(thread_pid) After the commit 7903f907a2260 ("pid: perform free_pid() calls outside of tasklist_lock") __unhash_process() -> detach_pid() no longer calls free_pid(), proc_flush_pid() can just use p->thread_pid without the now pointless get_pid() + put_pid(). Signed-off-by: Oleg Nesterov Link: https://lore.kernel.org/20250411121857.GA10550@redhat.com Reviewed-by: Mateusz Guzik Signed-off-by: Christian Brauner --- kernel/exit.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/exit.c b/kernel/exit.c index abcd93ce4e18..c33ecde016de 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -258,7 +258,8 @@ repeat: pidfs_exit(p); cgroup_release(p); - thread_pid = get_pid(p->thread_pid); + /* Retrieve @thread_pid before __unhash_process() may set it to NULL. */ + thread_pid = task_pid(p); write_lock_irq(&tasklist_lock); ptrace_release_task(p); @@ -287,8 +288,8 @@ repeat: } write_unlock_irq(&tasklist_lock); + /* @thread_pid can't go away until free_pids() below */ proc_flush_pid(thread_pid); - put_pid(thread_pid); add_device_randomness(&p->se.sum_exec_runtime, sizeof(p->se.sum_exec_runtime)); free_pids(post.pids); -- cgit v1.2.3 From a71f402acd71a942e59c16270ad61dee06de6e24 Mon Sep 17 00:00:00 2001 From: Christian Brauner Date: Fri, 25 Apr 2025 10:11:32 +0200 Subject: pidfs: get rid of __pidfd_prepare() Fold it into pidfd_prepare() and rename PIDFD_CLONE to PIDFD_STALE to indicate that the passed pid might not have task linkage and no explicit check for that should be performed. Link: https://lore.kernel.org/20250425-work-pidfs-net-v2-3-450a19461e75@kernel.org Reviewed-by: Oleg Nesterov Reviewed-by: David Rheinsberg Signed-off-by: Christian Brauner --- kernel/fork.c | 83 +++++++++++++++++++++-------------------------------------- 1 file changed, 29 insertions(+), 54 deletions(-) (limited to 'kernel') diff --git a/kernel/fork.c b/kernel/fork.c index f7403e1fb0d4..1d95f4dae327 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -2035,55 +2035,11 @@ static inline void rcu_copy_process(struct task_struct *p) #endif /* #ifdef CONFIG_TASKS_TRACE_RCU */ } -/** - * __pidfd_prepare - allocate a new pidfd_file and reserve a pidfd - * @pid: the struct pid for which to create a pidfd - * @flags: flags of the new @pidfd - * @ret: Where to return the file for the pidfd. - * - * Allocate a new file that stashes @pid and reserve a new pidfd number in the - * caller's file descriptor table. The pidfd is reserved but not installed yet. - * - * The helper doesn't perform checks on @pid which makes it useful for pidfds - * created via CLONE_PIDFD where @pid has no task attached when the pidfd and - * pidfd file are prepared. - * - * If this function returns successfully the caller is responsible to either - * call fd_install() passing the returned pidfd and pidfd file as arguments in - * order to install the pidfd into its file descriptor table or they must use - * put_unused_fd() and fput() on the returned pidfd and pidfd file - * respectively. - * - * This function is useful when a pidfd must already be reserved but there - * might still be points of failure afterwards and the caller wants to ensure - * that no pidfd is leaked into its file descriptor table. - * - * Return: On success, a reserved pidfd is returned from the function and a new - * pidfd file is returned in the last argument to the function. On - * error, a negative error code is returned from the function and the - * last argument remains unchanged. - */ -static int __pidfd_prepare(struct pid *pid, unsigned int flags, struct file **ret) -{ - struct file *pidfd_file; - - CLASS(get_unused_fd, pidfd)(O_CLOEXEC); - if (pidfd < 0) - return pidfd; - - pidfd_file = pidfs_alloc_file(pid, flags | O_RDWR); - if (IS_ERR(pidfd_file)) - return PTR_ERR(pidfd_file); - - *ret = pidfd_file; - return take_fd(pidfd); -} - /** * pidfd_prepare - allocate a new pidfd_file and reserve a pidfd * @pid: the struct pid for which to create a pidfd * @flags: flags of the new @pidfd - * @ret: Where to return the pidfd. + * @ret_file: return the new pidfs file * * Allocate a new file that stashes @pid and reserve a new pidfd number in the * caller's file descriptor table. The pidfd is reserved but not installed yet. @@ -2106,16 +2062,26 @@ static int __pidfd_prepare(struct pid *pid, unsigned int flags, struct file **re * error, a negative error code is returned from the function and the * last argument remains unchanged. */ -int pidfd_prepare(struct pid *pid, unsigned int flags, struct file **ret) +int pidfd_prepare(struct pid *pid, unsigned int flags, struct file **ret_file) { + struct file *pidfs_file; + /* - * While holding the pidfd waitqueue lock removing the task - * linkage for the thread-group leader pid (PIDTYPE_TGID) isn't - * possible. Thus, if there's still task linkage for PIDTYPE_PID - * not having thread-group leader linkage for the pid means it - * wasn't a thread-group leader in the first place. + * PIDFD_STALE is only allowed to be passed if the caller knows + * that @pid is already registered in pidfs and thus + * PIDFD_INFO_EXIT information is guaranteed to be available. */ - scoped_guard(spinlock_irq, &pid->wait_pidfd.lock) { + if (!(flags & PIDFD_STALE)) { + /* + * While holding the pidfd waitqueue lock removing the + * task linkage for the thread-group leader pid + * (PIDTYPE_TGID) isn't possible. Thus, if there's still + * task linkage for PIDTYPE_PID not having thread-group + * leader linkage for the pid means it wasn't a + * thread-group leader in the first place. + */ + guard(spinlock_irq)(&pid->wait_pidfd.lock); + /* Task has already been reaped. */ if (!pid_has_task(pid, PIDTYPE_PID)) return -ESRCH; @@ -2128,7 +2094,16 @@ int pidfd_prepare(struct pid *pid, unsigned int flags, struct file **ret) return -ENOENT; } - return __pidfd_prepare(pid, flags, ret); + CLASS(get_unused_fd, pidfd)(O_CLOEXEC); + if (pidfd < 0) + return pidfd; + + pidfs_file = pidfs_alloc_file(pid, flags | O_RDWR); + if (IS_ERR(pidfs_file)) + return PTR_ERR(pidfs_file); + + *ret_file = pidfs_file; + return take_fd(pidfd); } static void __delayed_free_task(struct rcu_head *rhp) @@ -2477,7 +2452,7 @@ __latent_entropy struct task_struct *copy_process( * Note that no task has been attached to @pid yet indicate * that via CLONE_PIDFD. */ - retval = __pidfd_prepare(pid, flags | PIDFD_CLONE, &pidfile); + retval = pidfd_prepare(pid, flags | PIDFD_STALE, &pidfile); if (retval < 0) goto bad_fork_free_pid; pidfd = retval; -- cgit v1.2.3 From db56723ceaec87aa5cf871e623f464934b266228 Mon Sep 17 00:00:00 2001 From: Christian Brauner Date: Tue, 6 May 2025 13:55:54 +0200 Subject: pidfs: detect refcount bugs Now that we have pidfs_{get,register}_pid() that needs to be paired with pidfs_put_pid() it's possible that someone pairs them with put_pid(). Thus freeing struct pid while it's still used by pidfs. Notice when that happens. I'll also add a scheme to detect invalid uses of pidfs_get_pid() and pidfs_put_pid() later. Link: https://lore.kernel.org/20250506-uferbereich-guttun-7c8b1a0a431f@brauner Signed-off-by: Christian Brauner --- kernel/pid.c | 1 + 1 file changed, 1 insertion(+) (limited to 'kernel') diff --git a/kernel/pid.c b/kernel/pid.c index 26f1e136f017..8317bcbc7cf7 100644 --- a/kernel/pid.c +++ b/kernel/pid.c @@ -100,6 +100,7 @@ void put_pid(struct pid *pid) ns = pid->numbers[pid->level].ns; if (refcount_dec_and_test(&pid->count)) { + WARN_ON_ONCE(pid->stashed); kmem_cache_free(ns->pid_cachep, pid); put_pid_ns(ns); } -- cgit v1.2.3