From 0355dcae2d157d95f234a56b540ab110350fd022 Mon Sep 17 00:00:00 2001 From: Christian Brauner Date: Sun, 9 Nov 2025 22:11:22 +0100 Subject: ns: don't skip active reference count initialization Don't skip active reference count initialization for initial namespaces. Doing this will break network namespace active reference counting. Link: https://patch.msgid.link/20251109-namespace-6-19-fixes-v1-1-ae8a4ad5a3b3@kernel.org Fixes: 3a18f809184b ("ns: add active reference count") Signed-off-by: Christian Brauner --- kernel/nscommon.c | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/kernel/nscommon.c b/kernel/nscommon.c index 6fe1c747fa46..d67ae7ad7759 100644 --- a/kernel/nscommon.c +++ b/kernel/nscommon.c @@ -54,7 +54,7 @@ static void ns_debug(struct ns_common *ns, const struct proc_ns_operations *ops) int __ns_common_init(struct ns_common *ns, u32 ns_type, const struct proc_ns_operations *ops, int inum) { - int ret; + int ret = 0; refcount_set(&ns->__ns_ref, 1); ns->stashed = NULL; @@ -74,11 +74,10 @@ int __ns_common_init(struct ns_common *ns, u32 ns_type, const struct proc_ns_ope ns_debug(ns, ops); #endif - if (inum) { + if (inum) ns->inum = inum; - return 0; - } - ret = proc_alloc_inum(&ns->inum); + else + ret = proc_alloc_inum(&ns->inum); if (ret) return ret; /* -- cgit v1.2.3 From 7cd3d204412b0584df38fd7be20002137f34721a Mon Sep 17 00:00:00 2001 From: Christian Brauner Date: Sun, 9 Nov 2025 22:11:23 +0100 Subject: ns: don't increment or decrement initial namespaces There's no need to bump the active reference counts of initial namespaces as they're always active and can simply remain at 1. Link: https://patch.msgid.link/20251109-namespace-6-19-fixes-v1-2-ae8a4ad5a3b3@kernel.org Signed-off-by: Christian Brauner --- kernel/nscommon.c | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'kernel') diff --git a/kernel/nscommon.c b/kernel/nscommon.c index d67ae7ad7759..70cb66232e4c 100644 --- a/kernel/nscommon.c +++ b/kernel/nscommon.c @@ -177,6 +177,7 @@ void __ns_ref_active_put_owner(struct ns_common *ns) ns = ns_owner(ns); if (!ns) return; + VFS_WARN_ON_ONCE(is_ns_init_id(ns)); if (!atomic_dec_and_test(&ns->__ns_ref_active)) return; } @@ -276,6 +277,10 @@ void __ns_ref_active_put_owner(struct ns_common *ns) */ void __ns_ref_active_resurrect(struct ns_common *ns) { + /* Initial namespaces are always active. */ + if (is_ns_init_id(ns)) + return; + /* If we didn't resurrect the namespace we're done. */ if (atomic_fetch_add(1, &ns->__ns_ref_active)) return; @@ -289,6 +294,7 @@ void __ns_ref_active_resurrect(struct ns_common *ns) if (!ns) return; + VFS_WARN_ON_ONCE(is_ns_init_id(ns)); if (atomic_fetch_add(1, &ns->__ns_ref_active)) return; } -- cgit v1.2.3 From 2ec2aff3c8e2523f3bde90e78031bae811335f3c Mon Sep 17 00:00:00 2001 From: Christian Brauner Date: Sun, 9 Nov 2025 22:11:24 +0100 Subject: ns: make sure reference are dropped outside of rcu lock The mount namespace may in fact sleep when putting the last passive reference so we need to drop the namespace reference outside of the rcu read lock. Do this by delaying the put until the next iteration where we've already moved on to the next namespace and legitimized it. Once we drop the rcu read lock to call put_user() we will also drop the reference to the previous namespace in the tree. Link: https://patch.msgid.link/20251109-namespace-6-19-fixes-v1-3-ae8a4ad5a3b3@kernel.org Fixes: 76b6f5dfb3fd ("nstree: add listns()") Signed-off-by: Christian Brauner --- kernel/nstree.c | 32 +++++++++++++++++++++++--------- 1 file changed, 23 insertions(+), 9 deletions(-) (limited to 'kernel') diff --git a/kernel/nstree.c b/kernel/nstree.c index 4a8838683b6b..55b72d4f8de4 100644 --- a/kernel/nstree.c +++ b/kernel/nstree.c @@ -505,13 +505,13 @@ static inline bool __must_check may_list_ns(const struct klistns *kls, return false; } -static void __ns_put(struct ns_common *ns) +static inline void ns_put(struct ns_common *ns) { - if (ns->ops) + if (ns && ns->ops) ns->ops->put(ns); } -DEFINE_FREE(ns_put, struct ns_common *, if (!IS_ERR_OR_NULL(_T)) __ns_put(_T)) +DEFINE_FREE(ns_put, struct ns_common *, if (!IS_ERR_OR_NULL(_T)) ns_put(_T)) static inline struct ns_common *__must_check legitimize_ns(const struct klistns *kls, struct ns_common *candidate) @@ -535,7 +535,7 @@ static ssize_t do_listns_userns(struct klistns *kls) { u64 __user *ns_ids = kls->uns_ids; size_t nr_ns_ids = kls->nr_ns_ids; - struct ns_common *ns = NULL, *first_ns = NULL; + struct ns_common *ns = NULL, *first_ns = NULL, *prev = NULL; const struct list_head *head; ssize_t ret; @@ -568,9 +568,10 @@ static ssize_t do_listns_userns(struct klistns *kls) if (!first_ns) first_ns = list_entry_rcu(head->next, typeof(*ns), ns_owner_entry); + for (ns = first_ns; &ns->ns_owner_entry != head && nr_ns_ids; ns = list_entry_rcu(ns->ns_owner_entry.next, typeof(*ns), ns_owner_entry)) { - struct ns_common *valid __free(ns_put); + struct ns_common *valid; valid = legitimize_ns(kls, ns); if (!valid) @@ -578,8 +579,14 @@ static ssize_t do_listns_userns(struct klistns *kls) rcu_read_unlock(); - if (put_user(valid->ns_id, ns_ids + ret)) + ns_put(prev); + prev = valid; + + if (put_user(valid->ns_id, ns_ids + ret)) { + ns_put(prev); return -EINVAL; + } + nr_ns_ids--; ret++; @@ -587,6 +594,7 @@ static ssize_t do_listns_userns(struct klistns *kls) } rcu_read_unlock(); + ns_put(prev); return ret; } @@ -668,7 +676,7 @@ static ssize_t do_listns(struct klistns *kls) { u64 __user *ns_ids = kls->uns_ids; size_t nr_ns_ids = kls->nr_ns_ids; - struct ns_common *ns, *first_ns = NULL; + struct ns_common *ns, *first_ns = NULL, *prev = NULL; struct ns_tree *ns_tree = NULL; const struct list_head *head; u32 ns_type; @@ -705,7 +713,7 @@ static ssize_t do_listns(struct klistns *kls) for (ns = first_ns; !ns_common_is_head(ns, head, ns_tree) && nr_ns_ids; ns = next_ns_common(ns, ns_tree)) { - struct ns_common *valid __free(ns_put); + struct ns_common *valid; valid = legitimize_ns(kls, ns); if (!valid) @@ -713,8 +721,13 @@ static ssize_t do_listns(struct klistns *kls) rcu_read_unlock(); - if (put_user(valid->ns_id, ns_ids + ret)) + ns_put(prev); + prev = valid; + + if (put_user(valid->ns_id, ns_ids + ret)) { + ns_put(prev); return -EINVAL; + } nr_ns_ids--; ret++; @@ -723,6 +736,7 @@ static ssize_t do_listns(struct klistns *kls) } rcu_read_unlock(); + ns_put(prev); return ret; } -- cgit v1.2.3 From a51dce7c3261d26e574b35da71dac3903bb35ae2 Mon Sep 17 00:00:00 2001 From: Christian Brauner Date: Sun, 9 Nov 2025 22:11:25 +0100 Subject: ns: return EFAULT on put_user() error Don't return EINVAL, return EFAULT just like we do in other system calls. Link: https://patch.msgid.link/20251109-namespace-6-19-fixes-v1-4-ae8a4ad5a3b3@kernel.org Signed-off-by: Christian Brauner --- kernel/nstree.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/nstree.c b/kernel/nstree.c index 55b72d4f8de4..f27f772a6762 100644 --- a/kernel/nstree.c +++ b/kernel/nstree.c @@ -584,7 +584,7 @@ static ssize_t do_listns_userns(struct klistns *kls) if (put_user(valid->ns_id, ns_ids + ret)) { ns_put(prev); - return -EINVAL; + return -EFAULT; } nr_ns_ids--; @@ -726,7 +726,7 @@ static ssize_t do_listns(struct klistns *kls) if (put_user(valid->ns_id, ns_ids + ret)) { ns_put(prev); - return -EINVAL; + return -EFAULT; } nr_ns_ids--; -- cgit v1.2.3 From f8d5a8970d2f49411824fb1fdd34bbb3eea22756 Mon Sep 17 00:00:00 2001 From: Christian Brauner Date: Sun, 9 Nov 2025 22:11:26 +0100 Subject: ns: handle setns(pidfd, ...) cleanly The setns() system call supports: (1) namespace file descriptors (nsfd) (2) process file descriptors (pidfd) When using nsfds the namespaces will remain active because they are pinned by the vfs. However, when pidfds are used things are more complicated. When the target task exits and passes through exit_nsproxy_namespaces() or is reaped and thus also passes through exit_cred_namespaces() after the setns()'ing task has called prepare_nsset() but before the active reference count of the set of namespaces it wants to setns() to might have been dropped already: P1 P2 pid_p1 = clone(CLONE_NEWUSER | CLONE_NEWNET | CLONE_NEWNS) pidfd = pidfd_open(pid_p1) setns(pidfd, CLONE_NEWUSER | CLONE_NEWNET | CLONE_NEWNS) prepare_nsset() exit(0) // ns->__ns_active_ref == 1 // parent_ns->__ns_active_ref == 1 -> exit_nsproxy_namespaces() -> exit_cred_namespaces() // ns_active_ref_put() will also put // the reference on the owner of the // namespace. If the only reason the // owning namespace was alive was // because it was a parent of @ns // it's active reference count now goes // to zero... -------------------------------- // | // ns->__ns_active_ref == 0 | // parent_ns->__ns_active_ref == 0 | | commit_nsset() -----------------> // If setns() // now manages to install the namespaces // it will call ns_active_ref_get() // on them thus bumping the active reference // count from zero again but without also // taking the required reference on the owner. // Thus we get: // // ns->__ns_active_ref == 1 // parent_ns->__ns_active_ref == 0 When later someone does ns_active_ref_put() on @ns it will underflow parent_ns->__ns_active_ref leading to a splat from our asserts thinking there are still active references when in fact the counter just underflowed. So resurrect the ownership chain if necessary as well. If the caller succeeded to grab passive references to the set of namespaces the setns() should simply succeed even if the target task exists or gets reaped in the meantime and thus has dropped all active references to its namespaces. The race is rare and can only be triggered when using pidfs to setns() to namespaces. Also note that active reference on initial namespaces are nops. Since we now always handle parent references directly we can drop ns_ref_active_get_owner() when adding a namespace to a namespace tree. This is now all handled uniformly in the places where the new namespaces actually become active. Link: https://patch.msgid.link/20251109-namespace-6-19-fixes-v1-5-ae8a4ad5a3b3@kernel.org Fixes: 3c9820d5c64a ("ns: add active reference count") Reported-by: syzbot+1957b26299cf3ff7890c@syzkaller.appspotmail.com Signed-off-by: Christian Brauner --- kernel/nscommon.c | 21 ++++++++++++--------- kernel/nstree.c | 8 -------- 2 files changed, 12 insertions(+), 17 deletions(-) (limited to 'kernel') diff --git a/kernel/nscommon.c b/kernel/nscommon.c index 70cb66232e4c..bfd2d6805776 100644 --- a/kernel/nscommon.c +++ b/kernel/nscommon.c @@ -114,13 +114,6 @@ struct ns_common *__must_check ns_owner(struct ns_common *ns) return to_ns_common(owner); } -void __ns_ref_active_get_owner(struct ns_common *ns) -{ - ns = ns_owner(ns); - if (ns) - WARN_ON_ONCE(atomic_add_negative(1, &ns->__ns_ref_active)); -} - /* * The active reference count works by having each namespace that gets * created take a single active reference on its owning user namespace. @@ -171,8 +164,18 @@ void __ns_ref_active_get_owner(struct ns_common *ns) * The iteration stops once we reach a namespace that still has active * references. */ -void __ns_ref_active_put_owner(struct ns_common *ns) +void __ns_ref_active_put(struct ns_common *ns) { + /* Initial namespaces are always active. */ + if (is_ns_init_id(ns)) + return; + + if (!atomic_dec_and_test(&ns->__ns_ref_active)) + return; + + VFS_WARN_ON_ONCE(is_ns_init_id(ns)); + VFS_WARN_ON_ONCE(!__ns_ref_read(ns)); + for (;;) { ns = ns_owner(ns); if (!ns) @@ -275,7 +278,7 @@ void __ns_ref_active_put_owner(struct ns_common *ns) * it also needs to take another reference on its owning user namespace * and so on. */ -void __ns_ref_active_resurrect(struct ns_common *ns) +void __ns_ref_active_get(struct ns_common *ns) { /* Initial namespaces are always active. */ if (is_ns_init_id(ns)) diff --git a/kernel/nstree.c b/kernel/nstree.c index f27f772a6762..97404fb90749 100644 --- a/kernel/nstree.c +++ b/kernel/nstree.c @@ -173,14 +173,6 @@ void __ns_tree_add_raw(struct ns_common *ns, struct ns_tree *ns_tree) write_sequnlock(&ns_tree_lock); VFS_WARN_ON_ONCE(node); - - /* - * Take an active reference on the owner namespace. This ensures - * that the owner remains visible while any of its child namespaces - * are active. For init namespaces this is a no-op as ns_owner() - * returns NULL for namespaces owned by init_user_ns. - */ - __ns_ref_active_get_owner(ns); } void __ns_tree_remove(struct ns_common *ns, struct ns_tree *ns_tree) -- cgit v1.2.3 From 57b39aabb99ea69b9046df2915404a931d9d6695 Mon Sep 17 00:00:00 2001 From: Christian Brauner Date: Sun, 9 Nov 2025 22:11:27 +0100 Subject: ns: add asserts for active refcount underflow Add a few more assert to detect active reference count underflows. Link: https://patch.msgid.link/20251109-namespace-6-19-fixes-v1-6-ae8a4ad5a3b3@kernel.org Signed-off-by: Christian Brauner --- kernel/nscommon.c | 18 ++++++++++++++---- 1 file changed, 14 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/nscommon.c b/kernel/nscommon.c index bfd2d6805776..c910b979e433 100644 --- a/kernel/nscommon.c +++ b/kernel/nscommon.c @@ -170,8 +170,10 @@ void __ns_ref_active_put(struct ns_common *ns) if (is_ns_init_id(ns)) return; - if (!atomic_dec_and_test(&ns->__ns_ref_active)) + if (!atomic_dec_and_test(&ns->__ns_ref_active)) { + VFS_WARN_ON_ONCE(__ns_ref_active_read(ns) < 0); return; + } VFS_WARN_ON_ONCE(is_ns_init_id(ns)); VFS_WARN_ON_ONCE(!__ns_ref_read(ns)); @@ -181,8 +183,10 @@ void __ns_ref_active_put(struct ns_common *ns) if (!ns) return; VFS_WARN_ON_ONCE(is_ns_init_id(ns)); - if (!atomic_dec_and_test(&ns->__ns_ref_active)) + if (!atomic_dec_and_test(&ns->__ns_ref_active)) { + VFS_WARN_ON_ONCE(__ns_ref_active_read(ns) < 0); return; + } } } @@ -280,12 +284,16 @@ void __ns_ref_active_put(struct ns_common *ns) */ void __ns_ref_active_get(struct ns_common *ns) { + int prev; + /* Initial namespaces are always active. */ if (is_ns_init_id(ns)) return; /* If we didn't resurrect the namespace we're done. */ - if (atomic_fetch_add(1, &ns->__ns_ref_active)) + prev = atomic_fetch_add(1, &ns->__ns_ref_active); + VFS_WARN_ON_ONCE(prev < 0); + if (likely(prev)) return; /* @@ -298,7 +306,9 @@ void __ns_ref_active_get(struct ns_common *ns) return; VFS_WARN_ON_ONCE(is_ns_init_id(ns)); - if (atomic_fetch_add(1, &ns->__ns_ref_active)) + prev = atomic_fetch_add(1, &ns->__ns_ref_active); + VFS_WARN_ON_ONCE(prev < 0); + if (likely(prev)) return; } } -- cgit v1.2.3