summaryrefslogtreecommitdiff
path: root/kernel
diff options
context:
space:
mode:
author Linus Torvalds <torvalds@linux-foundation.org> 2026-04-13 13:27:11 -0700
committer Linus Torvalds <torvalds@linux-foundation.org> 2026-04-13 13:27:11 -0700
commit 07c3ef58223e2c75ea209d8c416b976ec30d9413 (patch)
tree 3838ec64af74ba2d876bef4f7c3c3b2709bdec95 /kernel
parent dc0dfa73381bc8b2ebd298face5dbe7e240cd80c (diff)
parent d29eb5f0ce674cfe71b93f8ff67dc0f66e6a9371 (diff)
Merge tag 'vfs-7.1-rc1.pidfs' of git://git.kernel.org/pub/scm/linux/kernel/git/vfs/vfs
Pull clone and pidfs updates from Christian Brauner: "Add three new clone3() flags for pidfd-based process lifecycle management. CLONE_AUTOREAP: CLONE_AUTOREAP makes a child process auto-reap on exit without ever becoming a zombie. This is a per-process property in contrast to the existing auto-reap mechanism via SA_NOCLDWAIT or SIG_IGN for SIGCHLD which applies to all children of a given parent. Currently the only way to automatically reap children is to set SA_NOCLDWAIT or SIG_IGN on SIGCHLD. This is a parent-scoped property affecting all children which makes it unsuitable for libraries or applications that need selective auto-reaping of specific children while still being able to wait() on others. CLONE_AUTOREAP stores an autoreap flag in the child's signal_struct. When the child exits do_notify_parent() checks this flag and causes exit_notify() to transition the task directly to EXIT_DEAD. Since the flag lives on the child it survives reparenting: if the original parent exits and the child is reparented to a subreaper or init the child still auto-reaps when it eventually exits. This is cleaner than forcing the subreaper to get SIGCHLD and then reaping it. If the parent doesn't care the subreaper won't care. If there's a subreaper that would care it would be easy enough to add a prctl() that either just turns back on SIGCHLD and turns off auto-reaping or a prctl() that just notifies the subreaper whenever a child is reparented to it. CLONE_AUTOREAP can be combined with CLONE_PIDFD to allow the parent to monitor the child's exit via poll() and retrieve exit status via PIDFD_GET_INFO. Without CLONE_PIDFD it provides a fire-and-forget pattern. No exit signal is delivered so exit_signal must be zero. CLONE_THREAD and CLONE_PARENT are rejected: CLONE_THREAD because autoreap is a process-level property, and CLONE_PARENT because an autoreap child reparented via CLONE_PARENT could become an invisible zombie under a parent that never calls wait(). 
The flag is not inherited by the autoreap process's own children. Each child that should be autoreaped must be explicitly created with CLONE_AUTOREAP. CLONE_NNP: CLONE_NNP sets no_new_privs on the child at clone time. Unlike prctl(PR_SET_NO_NEW_PRIVS) which a process sets on itself, CLONE_NNP allows the parent to impose no_new_privs on the child at creation without affecting the parent's own privileges. CLONE_THREAD is rejected because threads share credentials. CLONE_NNP is useful on its own for any spawn-and-sandbox pattern but was specifically introduced to enable unprivileged usage of CLONE_PIDFD_AUTOKILL. CLONE_PIDFD_AUTOKILL: This flag ties a child's lifetime to the pidfd returned from clone3(). When the last reference to the struct file created by clone3() is closed the kernel sends SIGKILL to the child. A pidfd obtained via pidfd_open() for the same process does not keep the child alive and does not trigger autokill - only the specific struct file from clone3() has this property. This is useful for container runtimes, service managers, and sandboxed subprocess execution - any scenario where the child must die if the parent crashes or abandons the pidfd or just wants a throwaway helper process. CLONE_PIDFD_AUTOKILL requires both CLONE_PIDFD and CLONE_AUTOREAP. It requires CLONE_PIDFD because the whole point is tying the child's lifetime to the pidfd. It requires CLONE_AUTOREAP because a killed child with no one to reap it would become a zombie - the primary use case is the parent crashing or abandoning the pidfd so no one is around to call waitpid(). CLONE_THREAD is rejected because autokill targets a process not a thread. If CLONE_NNP is specified together with CLONE_PIDFD_AUTOKILL an unprivileged user may spawn a process that is autokilled. The child cannot escalate privileges via setuid/setgid exec after being spawned. 
If CLONE_PIDFD_AUTOKILL is specified without CLONE_NNP the caller must have CAP_SYS_ADMIN in its user namespace." * tag 'vfs-7.1-rc1.pidfs' of git://git.kernel.org/pub/scm/linux/kernel/git/vfs/vfs: selftests: check pidfd_info->coredump_code correctness pidfds: add coredump_code field to pidfd_info kselftest/coredump: reintroduce null pointer dereference selftests/pidfd: add CLONE_PIDFD_AUTOKILL tests selftests/pidfd: add CLONE_NNP tests selftests/pidfd: add CLONE_AUTOREAP tests pidfd: add CLONE_PIDFD_AUTOKILL clone: add CLONE_NNP clone: add CLONE_AUTOREAP
Diffstat (limited to 'kernel')
-rw-r--r-- kernel/fork.c 52
-rw-r--r-- kernel/ptrace.c 3
-rw-r--r-- kernel/signal.c 4
3 files changed, 55 insertions, 4 deletions
diff --git a/kernel/fork.c b/kernel/fork.c
index a4ec2d1e25ee..55a6906d3014 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -2029,6 +2029,41 @@ __latent_entropy struct task_struct *copy_process(
return ERR_PTR(-EINVAL);
}
+ if (clone_flags & CLONE_AUTOREAP) {
+ if (clone_flags & CLONE_THREAD)
+ return ERR_PTR(-EINVAL);
+ if (clone_flags & CLONE_PARENT)
+ return ERR_PTR(-EINVAL);
+ if (args->exit_signal)
+ return ERR_PTR(-EINVAL);
+ }
+
+ if ((clone_flags & CLONE_PARENT) && current->signal->autoreap)
+ return ERR_PTR(-EINVAL);
+
+ if (clone_flags & CLONE_NNP) {
+ if (clone_flags & CLONE_THREAD)
+ return ERR_PTR(-EINVAL);
+ }
+
+ if (clone_flags & CLONE_PIDFD_AUTOKILL) {
+ if (!(clone_flags & CLONE_PIDFD))
+ return ERR_PTR(-EINVAL);
+ if (!(clone_flags & CLONE_AUTOREAP))
+ return ERR_PTR(-EINVAL);
+ if (clone_flags & CLONE_THREAD)
+ return ERR_PTR(-EINVAL);
+ /*
+ * Without CLONE_NNP the child could escalate privileges
+ * after being spawned, so require CAP_SYS_ADMIN.
+ * With CLONE_NNP the child can't gain new privileges,
+ * so allow unprivileged usage.
+ */
+ if (!(clone_flags & CLONE_NNP) &&
+ !ns_capable(current_user_ns(), CAP_SYS_ADMIN))
+ return ERR_PTR(-EPERM);
+ }
+
/*
* Force any signals received before this point to be delivered
* before the fork happens. Collect up signals sent to multiple
@@ -2251,13 +2286,18 @@ __latent_entropy struct task_struct *copy_process(
* if the fd table isn't shared).
*/
if (clone_flags & CLONE_PIDFD) {
- int flags = (clone_flags & CLONE_THREAD) ? PIDFD_THREAD : 0;
+ unsigned flags = PIDFD_STALE;
+
+ if (clone_flags & CLONE_THREAD)
+ flags |= PIDFD_THREAD;
+ if (clone_flags & CLONE_PIDFD_AUTOKILL)
+ flags |= PIDFD_AUTOKILL;
/*
* Note that no task has been attached to @pid yet indicate
* that via CLONE_PIDFD.
*/
- retval = pidfd_prepare(pid, flags | PIDFD_STALE, &pidfile);
+ retval = pidfd_prepare(pid, flags, &pidfile);
if (retval < 0)
goto bad_fork_free_pid;
pidfd = retval;
@@ -2413,6 +2453,9 @@ __latent_entropy struct task_struct *copy_process(
*/
copy_seccomp(p);
+ if (clone_flags & CLONE_NNP)
+ task_set_no_new_privs(p);
+
init_task_pid_links(p);
if (likely(p->pid)) {
ptrace_init_task(p, (clone_flags & CLONE_PTRACE) || trace);
@@ -2436,6 +2479,8 @@ __latent_entropy struct task_struct *copy_process(
*/
p->signal->has_child_subreaper = p->real_parent->signal->has_child_subreaper ||
p->real_parent->signal->is_child_subreaper;
+ if (clone_flags & CLONE_AUTOREAP)
+ p->signal->autoreap = 1;
list_add_tail(&p->sibling, &p->real_parent->children);
list_add_tail_rcu(&p->tasks, &init_task.tasks);
attach_pid(p, PIDTYPE_TGID);
@@ -2897,7 +2942,8 @@ static bool clone3_args_valid(struct kernel_clone_args *kargs)
{
/* Verify that no unknown flags are passed along. */
if (kargs->flags &
- ~(CLONE_LEGACY_FLAGS | CLONE_CLEAR_SIGHAND | CLONE_INTO_CGROUP))
+ ~(CLONE_LEGACY_FLAGS | CLONE_CLEAR_SIGHAND | CLONE_INTO_CGROUP |
+ CLONE_AUTOREAP | CLONE_NNP | CLONE_PIDFD_AUTOKILL))
return false;
/*
diff --git a/kernel/ptrace.c b/kernel/ptrace.c
index 392ec2f75f01..68c17daef8d4 100644
--- a/kernel/ptrace.c
+++ b/kernel/ptrace.c
@@ -549,7 +549,8 @@ static bool __ptrace_detach(struct task_struct *tracer, struct task_struct *p)
if (!dead && thread_group_empty(p)) {
if (!same_thread_group(p->real_parent, tracer))
dead = do_notify_parent(p, p->exit_signal);
- else if (ignoring_children(tracer->sighand)) {
+ else if (ignoring_children(tracer->sighand) ||
+ p->signal->autoreap) {
__wake_up_parent(p, tracer);
dead = true;
}
diff --git a/kernel/signal.c b/kernel/signal.c
index d65d0fe24bfb..e61f39fa8c8a 100644
--- a/kernel/signal.c
+++ b/kernel/signal.c
@@ -2251,6 +2251,10 @@ bool do_notify_parent(struct task_struct *tsk, int sig)
if (psig->action[SIGCHLD-1].sa.sa_handler == SIG_IGN)
sig = 0;
}
+ if (!tsk->ptrace && tsk->signal->autoreap) {
+ autoreap = true;
+ sig = 0;
+ }
/*
* Send with __send_signal as si_pid and si_uid are in the
* parent's namespaces.