summaryrefslogtreecommitdiff
path: root/kernel
diff options
context:
space:
mode:
authorLinus Torvalds <torvalds@linux-foundation.org>2025-03-24 10:16:37 -0700
committerLinus Torvalds <torvalds@linux-foundation.org>2025-03-24 10:16:37 -0700
commitdf00ded23a6b4df888237333b1f86067d24113b2 (patch)
tree36591a7b9cd112c6525e8fa20fdf49c8011113f0 /kernel
parent71ee2fde57c707ac8f221321f3e951288f00f04b (diff)
parentd40dc30c7b7c80db2100b73ac26d39c362643a39 (diff)
Merge tag 'vfs-6.15-rc1.pidfs' of git://git.kernel.org/pub/scm/linux/kernel/git/vfs/vfs
Pull vfs pidfs updates from Christian Brauner: - Allow retrieving exit information after a process has been reaped through pidfds via the new PIDFD_INTO_EXIT extension for the PIDFD_GET_INFO ioctl. Various tools need access to information about a process/task even after it has already been reaped. Pidfd polling allows waiting on either task exit or for a task to have been reaped. The contract for PIDFD_INFO_EXIT is simply that EPOLLHUP must be observed before exit information can be retrieved, i.e., exit information is only provided once the task has been reaped and then can be retrieved as long as the pidfd is open. - Add PIDFD_SELF_{THREAD,THREAD_GROUP} sentinels allowing userspace to forgo allocating a file descriptor for their own process. This is useful in scenarios where users want to act on their own process through pidfds and is akin to AT_FDCWD. - Improve premature thread-group leader and subthread exec behavior when polling on pidfds: (1) During a multi-threaded exec by a subthread, i.e., non-thread-group leader thread, all other threads in the thread-group including the thread-group leader are killed and the struct pid of the thread-group leader will be taken over by the subthread that called exec. IOW, two tasks change their TIDs. (2) A premature thread-group leader exit means that the thread-group leader exited before all of the other subthreads in the thread-group have exited. Both cases lead to inconsistencies for pidfd polling with PIDFD_THREAD. Any caller that holds a PIDFD_THREAD pidfd to the current thread-group leader may or may not see an exit notification on the file descriptor depending on when poll is performed. If the poll is performed before the exec of the subthread has concluded an exit notification is generated for the old thread-group leader. If the poll is performed after the exec of the subthread has concluded no exit notification is generated for the old thread-group leader. The correct behavior is to simply not generate an exit notification on the struct pid of a subhthread exec because the struct pid is taken over by the subthread and thus remains alive. But this is difficult to handle because a thread-group may exit premature as mentioned in (2). In that case an exit notification is reliably generated but the subthreads may continue to run for an indeterminate amount of time and thus also may exec at some point. After this pull no exit notifications will be generated for a PIDFD_THREAD pidfd for a thread-group leader until all subthreads have been reaped. If a subthread should exec before no exit notification will be generated until that task exits or it creates subthreads and repeates the cycle. This means an exit notification indicates the ability for the father to reap the child. * tag 'vfs-6.15-rc1.pidfs' of git://git.kernel.org/pub/scm/linux/kernel/git/vfs/vfs: (25 commits) selftests/pidfd: third test for multi-threaded exec polling selftests/pidfd: second test for multi-threaded exec polling selftests/pidfd: first test for multi-threaded exec polling pidfs: improve multi-threaded exec and premature thread-group leader exit polling pidfs: ensure that PIDFS_INFO_EXIT is available selftests/pidfd: add seventh PIDFD_INFO_EXIT selftest selftests/pidfd: add sixth PIDFD_INFO_EXIT selftest selftests/pidfd: add fifth PIDFD_INFO_EXIT selftest selftests/pidfd: add fourth PIDFD_INFO_EXIT selftest selftests/pidfd: add third PIDFD_INFO_EXIT selftest selftests/pidfd: add second PIDFD_INFO_EXIT selftest selftests/pidfd: add first PIDFD_INFO_EXIT selftest selftests/pidfd: expand common pidfd header pidfs/selftests: ensure correct headers for ioctl handling selftests/pidfd: fix header inclusion pidfs: allow to retrieve exit information pidfs: record exit code and cgroupid at exit pidfs: use private inode slab cache pidfs: move setting flags into pidfs_alloc_file() pidfd: rely on automatic cleanup in __pidfd_prepare() ...
Diffstat (limited to 'kernel')
-rw-r--r--kernel/exit.c8
-rw-r--r--kernel/fork.c22
-rw-r--r--kernel/pid.c24
-rw-r--r--kernel/signal.c108
4 files changed, 97 insertions, 65 deletions
diff --git a/kernel/exit.c b/kernel/exit.c
index 3485e5fc499e..683766316a3d 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -69,6 +69,7 @@
#include <linux/sysfs.h>
#include <linux/user_events.h>
#include <linux/uaccess.h>
+#include <linux/pidfs.h>
#include <uapi/linux/wait.h>
@@ -249,6 +250,7 @@ repeat:
dec_rlimit_ucounts(task_ucounts(p), UCOUNT_RLIMIT_NPROC, 1);
rcu_read_unlock();
+ pidfs_exit(p);
cgroup_release(p);
write_lock_irq(&tasklist_lock);
@@ -741,10 +743,10 @@ static void exit_notify(struct task_struct *tsk, int group_dead)
tsk->exit_state = EXIT_ZOMBIE;
/*
- * sub-thread or delay_group_leader(), wake up the
- * PIDFD_THREAD waiters.
+ * Ignore thread-group leaders that exited before all
+ * subthreads did.
*/
- if (!thread_group_empty(tsk))
+ if (!delay_group_leader(tsk))
do_notify_pidfd(tsk);
if (unlikely(tsk->ptrace)) {
diff --git a/kernel/fork.c b/kernel/fork.c
index 735405a9c5f3..f11ac96b7587 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -2032,25 +2032,18 @@ static inline void rcu_copy_process(struct task_struct *p)
*/
static int __pidfd_prepare(struct pid *pid, unsigned int flags, struct file **ret)
{
- int pidfd;
struct file *pidfd_file;
- pidfd = get_unused_fd_flags(O_CLOEXEC);
+ CLASS(get_unused_fd, pidfd)(O_CLOEXEC);
if (pidfd < 0)
return pidfd;
pidfd_file = pidfs_alloc_file(pid, flags | O_RDWR);
- if (IS_ERR(pidfd_file)) {
- put_unused_fd(pidfd);
+ if (IS_ERR(pidfd_file))
return PTR_ERR(pidfd_file);
- }
- /*
- * anon_inode_getfile() ignores everything outside of the
- * O_ACCMODE | O_NONBLOCK mask, set PIDFD_THREAD manually.
- */
- pidfd_file->f_flags |= (flags & PIDFD_THREAD);
+
*ret = pidfd_file;
- return pidfd;
+ return take_fd(pidfd);
}
/**
@@ -2432,8 +2425,11 @@ __latent_entropy struct task_struct *copy_process(
if (clone_flags & CLONE_PIDFD) {
int flags = (clone_flags & CLONE_THREAD) ? PIDFD_THREAD : 0;
- /* Note that no task has been attached to @pid yet. */
- retval = __pidfd_prepare(pid, flags, &pidfile);
+ /*
+ * Note that no task has been attached to @pid yet indicate
+ * that via CLONE_PIDFD.
+ */
+ retval = __pidfd_prepare(pid, flags | PIDFD_CLONE, &pidfile);
if (retval < 0)
goto bad_fork_free_pid;
pidfd = retval;
diff --git a/kernel/pid.c b/kernel/pid.c
index 924084713be8..22f5d2b2e290 100644
--- a/kernel/pid.c
+++ b/kernel/pid.c
@@ -564,15 +564,29 @@ struct pid *pidfd_get_pid(unsigned int fd, unsigned int *flags)
*/
struct task_struct *pidfd_get_task(int pidfd, unsigned int *flags)
{
- unsigned int f_flags;
+ unsigned int f_flags = 0;
struct pid *pid;
struct task_struct *task;
+ enum pid_type type;
- pid = pidfd_get_pid(pidfd, &f_flags);
- if (IS_ERR(pid))
- return ERR_CAST(pid);
+ switch (pidfd) {
+ case PIDFD_SELF_THREAD:
+ type = PIDTYPE_PID;
+ pid = get_task_pid(current, type);
+ break;
+ case PIDFD_SELF_THREAD_GROUP:
+ type = PIDTYPE_TGID;
+ pid = get_task_pid(current, type);
+ break;
+ default:
+ pid = pidfd_get_pid(pidfd, &f_flags);
+ if (IS_ERR(pid))
+ return ERR_CAST(pid);
+ type = PIDTYPE_TGID;
+ break;
+ }
- task = get_pid_task(pid, PIDTYPE_TGID);
+ task = get_pid_task(pid, type);
put_pid(pid);
if (!task)
return ERR_PTR(-ESRCH);
diff --git a/kernel/signal.c b/kernel/signal.c
index 875e97f6205a..027ad9e97417 100644
--- a/kernel/signal.c
+++ b/kernel/signal.c
@@ -2180,8 +2180,7 @@ bool do_notify_parent(struct task_struct *tsk, int sig)
WARN_ON_ONCE(!tsk->ptrace &&
(tsk->group_leader != tsk || !thread_group_empty(tsk)));
/*
- * tsk is a group leader and has no threads, wake up the
- * non-PIDFD_THREAD waiters.
+ * Notify for thread-group leaders without subthreads.
*/
if (thread_group_empty(tsk))
do_notify_pidfd(tsk);
@@ -4009,6 +4008,47 @@ static struct pid *pidfd_to_pid(const struct file *file)
(PIDFD_SIGNAL_THREAD | PIDFD_SIGNAL_THREAD_GROUP | \
PIDFD_SIGNAL_PROCESS_GROUP)
+static int do_pidfd_send_signal(struct pid *pid, int sig, enum pid_type type,
+ siginfo_t __user *info, unsigned int flags)
+{
+ kernel_siginfo_t kinfo;
+
+ switch (flags) {
+ case PIDFD_SIGNAL_THREAD:
+ type = PIDTYPE_PID;
+ break;
+ case PIDFD_SIGNAL_THREAD_GROUP:
+ type = PIDTYPE_TGID;
+ break;
+ case PIDFD_SIGNAL_PROCESS_GROUP:
+ type = PIDTYPE_PGID;
+ break;
+ }
+
+ if (info) {
+ int ret;
+
+ ret = copy_siginfo_from_user_any(&kinfo, info);
+ if (unlikely(ret))
+ return ret;
+
+ if (unlikely(sig != kinfo.si_signo))
+ return -EINVAL;
+
+ /* Only allow sending arbitrary signals to yourself. */
+ if ((task_pid(current) != pid || type > PIDTYPE_TGID) &&
+ (kinfo.si_code >= 0 || kinfo.si_code == SI_TKILL))
+ return -EPERM;
+ } else {
+ prepare_kill_siginfo(sig, &kinfo, type);
+ }
+
+ if (type == PIDTYPE_PGID)
+ return kill_pgrp_info(sig, &kinfo, pid);
+
+ return kill_pid_info_type(sig, &kinfo, pid, type);
+}
+
/**
* sys_pidfd_send_signal - Signal a process through a pidfd
* @pidfd: file descriptor of the process
@@ -4026,9 +4066,7 @@ static struct pid *pidfd_to_pid(const struct file *file)
SYSCALL_DEFINE4(pidfd_send_signal, int, pidfd, int, sig,
siginfo_t __user *, info, unsigned int, flags)
{
- int ret;
struct pid *pid;
- kernel_siginfo_t kinfo;
enum pid_type type;
/* Enforce flags be set to 0 until we add an extension. */
@@ -4039,57 +4077,39 @@ SYSCALL_DEFINE4(pidfd_send_signal, int, pidfd, int, sig,
if (hweight32(flags & PIDFD_SEND_SIGNAL_FLAGS) > 1)
return -EINVAL;
- CLASS(fd, f)(pidfd);
- if (fd_empty(f))
- return -EBADF;
+ switch (pidfd) {
+ case PIDFD_SELF_THREAD:
+ pid = get_task_pid(current, PIDTYPE_PID);
+ type = PIDTYPE_PID;
+ break;
+ case PIDFD_SELF_THREAD_GROUP:
+ pid = get_task_pid(current, PIDTYPE_TGID);
+ type = PIDTYPE_TGID;
+ break;
+ default: {
+ CLASS(fd, f)(pidfd);
+ if (fd_empty(f))
+ return -EBADF;
- /* Is this a pidfd? */
- pid = pidfd_to_pid(fd_file(f));
- if (IS_ERR(pid))
- return PTR_ERR(pid);
+ /* Is this a pidfd? */
+ pid = pidfd_to_pid(fd_file(f));
+ if (IS_ERR(pid))
+ return PTR_ERR(pid);
- if (!access_pidfd_pidns(pid))
- return -EINVAL;
+ if (!access_pidfd_pidns(pid))
+ return -EINVAL;
- switch (flags) {
- case 0:
/* Infer scope from the type of pidfd. */
if (fd_file(f)->f_flags & PIDFD_THREAD)
type = PIDTYPE_PID;
else
type = PIDTYPE_TGID;
- break;
- case PIDFD_SIGNAL_THREAD:
- type = PIDTYPE_PID;
- break;
- case PIDFD_SIGNAL_THREAD_GROUP:
- type = PIDTYPE_TGID;
- break;
- case PIDFD_SIGNAL_PROCESS_GROUP:
- type = PIDTYPE_PGID;
- break;
- }
- if (info) {
- ret = copy_siginfo_from_user_any(&kinfo, info);
- if (unlikely(ret))
- return ret;
-
- if (unlikely(sig != kinfo.si_signo))
- return -EINVAL;
-
- /* Only allow sending arbitrary signals to yourself. */
- if ((task_pid(current) != pid || type > PIDTYPE_TGID) &&
- (kinfo.si_code >= 0 || kinfo.si_code == SI_TKILL))
- return -EPERM;
- } else {
- prepare_kill_siginfo(sig, &kinfo, type);
+ return do_pidfd_send_signal(pid, sig, type, info, flags);
+ }
}
- if (type == PIDTYPE_PGID)
- return kill_pgrp_info(sig, &kinfo, pid);
- else
- return kill_pid_info_type(sig, &kinfo, pid, type);
+ return do_pidfd_send_signal(pid, sig, type, info, flags);
}
static int