summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorChristian Brauner <brauner@kernel.org>2026-03-11 23:01:17 +0100
committerChristian Brauner <brauner@kernel.org>2026-03-12 13:33:55 +0100
commit4e9f7592b6f5fe4929b2d755785788acba123db5 (patch)
tree233eab3a77f5350fd79f93b625f30b861203850b
parent0209e31659d6908c6d0788c8a495b43d0a1f6f6c (diff)
parent5b8ffd63fbd94fe71f1baf50a55e31be54a97ca9 (diff)
Merge patch series "namespace: allow creating empty mount namespaces"
Christian Brauner <brauner@kernel.org> says: Currently, creating a new mount namespace always copies the entire mount tree from the caller's namespace. For containers and sandboxes that intend to build their mount table from scratch this is wasteful: they inherit a potentially large mount tree only to immediately tear it down. This series adds support for creating a mount namespace that contains only a clone of the root mount, with none of the child mounts. Two new flags are introduced: - CLONE_EMPTY_MNTNS (1ULL << 37) for clone3(), using the 64-bit flag space. - UNSHARE_EMPTY_MNTNS (0x00100000) for unshare(), reusing the CLONE_PARENT_SETTID bit which has no meaning for unshare. Both flags imply CLONE_NEWNS. The resulting namespace contains a single nullfs root mount with an immutable empty directory. The intended workflow is to then mount a real filesystem (e.g., tmpfs) over the root and build the mount table from there. * patches from https://patch.msgid.link/20260306-work-empty-mntns-consolidated-v1-0-6eb30529bbb0@kernel.org: selftests/filesystems: add clone3 tests for empty mount namespaces selftests/filesystems: add tests for empty mount namespaces namespace: allow creating empty mount namespaces Link: https://patch.msgid.link/20260306-work-empty-mntns-consolidated-v1-0-6eb30529bbb0@kernel.org Signed-off-by: Christian Brauner <brauner@kernel.org>
-rw-r--r--fs/namespace.c85
-rw-r--r--include/uapi/linux/sched.h7
-rw-r--r--kernel/fork.c17
-rw-r--r--kernel/nsproxy.c21
-rw-r--r--tools/testing/selftests/filesystems/empty_mntns/.gitignore4
-rw-r--r--tools/testing/selftests/filesystems/empty_mntns/Makefile12
-rw-r--r--tools/testing/selftests/filesystems/empty_mntns/clone3_empty_mntns_test.c938
-rw-r--r--tools/testing/selftests/filesystems/empty_mntns/empty_mntns.h50
-rw-r--r--tools/testing/selftests/filesystems/empty_mntns/empty_mntns_test.c725
-rw-r--r--tools/testing/selftests/filesystems/empty_mntns/overmount_chroot_test.c225
-rw-r--r--tools/testing/selftests/filesystems/utils.c4
-rw-r--r--tools/testing/selftests/filesystems/utils.h2
12 files changed, 2052 insertions, 38 deletions
diff --git a/fs/namespace.c b/fs/namespace.c
index 702e93243505..555f0a10de9a 100644
--- a/fs/namespace.c
+++ b/fs/namespace.c
@@ -4233,8 +4233,8 @@ struct mnt_namespace *copy_mnt_ns(u64 flags, struct mnt_namespace *ns,
struct user_namespace *user_ns, struct fs_struct *new_fs)
{
struct mnt_namespace *new_ns;
- struct vfsmount *rootmnt __free(mntput) = NULL;
- struct vfsmount *pwdmnt __free(mntput) = NULL;
+ struct path old_root __free(path_put) = {};
+ struct path old_pwd __free(path_put) = {};
struct mount *p, *q;
struct mount *old;
struct mount *new;
@@ -4254,11 +4254,18 @@ struct mnt_namespace *copy_mnt_ns(u64 flags, struct mnt_namespace *ns,
return new_ns;
guard(namespace_excl)();
- /* First pass: copy the tree topology */
- copy_flags = CL_COPY_UNBINDABLE | CL_EXPIRE;
+
+ if (flags & CLONE_EMPTY_MNTNS)
+ copy_flags = 0;
+ else
+ copy_flags = CL_COPY_UNBINDABLE | CL_EXPIRE;
if (user_ns != ns->user_ns)
copy_flags |= CL_SLAVE;
- new = copy_tree(old, old->mnt.mnt_root, copy_flags);
+
+ if (flags & CLONE_EMPTY_MNTNS)
+ new = clone_mnt(old, old->mnt.mnt_root, copy_flags);
+ else
+ new = copy_tree(old, old->mnt.mnt_root, copy_flags);
if (IS_ERR(new)) {
emptied_ns = new_ns;
return ERR_CAST(new);
@@ -4269,33 +4276,53 @@ struct mnt_namespace *copy_mnt_ns(u64 flags, struct mnt_namespace *ns,
}
new_ns->root = new;
- /*
- * Second pass: switch the tsk->fs->* elements and mark new vfsmounts
- * as belonging to new namespace. We have already acquired a private
- * fs_struct, so tsk->fs->lock is not needed.
- */
- p = old;
- q = new;
- while (p) {
- mnt_add_to_ns(new_ns, q);
- new_ns->nr_mounts++;
+ if (flags & CLONE_EMPTY_MNTNS) {
+ /*
+ * Empty mount namespace: only the root mount exists.
+ * Reset root and pwd to the cloned mount's root dentry.
+ */
if (new_fs) {
- if (&p->mnt == new_fs->root.mnt) {
- new_fs->root.mnt = mntget(&q->mnt);
- rootmnt = &p->mnt;
- }
- if (&p->mnt == new_fs->pwd.mnt) {
- new_fs->pwd.mnt = mntget(&q->mnt);
- pwdmnt = &p->mnt;
+ old_root = new_fs->root;
+ old_pwd = new_fs->pwd;
+
+ new_fs->root.mnt = mntget(&new->mnt);
+ new_fs->root.dentry = dget(new->mnt.mnt_root);
+
+ new_fs->pwd.mnt = mntget(&new->mnt);
+ new_fs->pwd.dentry = dget(new->mnt.mnt_root);
+ }
+ mnt_add_to_ns(new_ns, new);
+ new_ns->nr_mounts++;
+ } else {
+ /*
+ * Full copy: walk old and new trees in parallel, switching
+ * the tsk->fs->* elements and marking new vfsmounts as
+ * belonging to new namespace. We have already acquired a
+ * private fs_struct, so tsk->fs->lock is not needed.
+ */
+ p = old;
+ q = new;
+ while (p) {
+ mnt_add_to_ns(new_ns, q);
+ new_ns->nr_mounts++;
+ if (new_fs) {
+ if (&p->mnt == new_fs->root.mnt) {
+ old_root.mnt = new_fs->root.mnt;
+ new_fs->root.mnt = mntget(&q->mnt);
+ }
+ if (&p->mnt == new_fs->pwd.mnt) {
+ old_pwd.mnt = new_fs->pwd.mnt;
+ new_fs->pwd.mnt = mntget(&q->mnt);
+ }
}
+ p = next_mnt(p, old);
+ q = next_mnt(q, new);
+ if (!q)
+ break;
+ // an mntns binding we'd skipped?
+ while (p->mnt.mnt_root != q->mnt.mnt_root)
+ p = next_mnt(skip_mnt_tree(p), old);
}
- p = next_mnt(p, old);
- q = next_mnt(q, new);
- if (!q)
- break;
- // an mntns binding we'd skipped?
- while (p->mnt.mnt_root != q->mnt.mnt_root)
- p = next_mnt(skip_mnt_tree(p), old);
}
ns_tree_add_raw(new_ns);
return new_ns;
diff --git a/include/uapi/linux/sched.h b/include/uapi/linux/sched.h
index 359a14cc76a4..4e76fce9f777 100644
--- a/include/uapi/linux/sched.h
+++ b/include/uapi/linux/sched.h
@@ -36,6 +36,7 @@
/* Flags for the clone3() syscall. */
#define CLONE_CLEAR_SIGHAND 0x100000000ULL /* Clear any signal handler and reset to SIG_DFL. */
#define CLONE_INTO_CGROUP 0x200000000ULL /* Clone into a specific cgroup given the right permissions. */
+#define CLONE_EMPTY_MNTNS (1ULL << 37) /* Create an empty mount namespace. */
/*
* cloning flags intersect with CSIGNAL so can be used with unshare and clone3
@@ -43,6 +44,12 @@
*/
#define CLONE_NEWTIME 0x00000080 /* New time namespace */
+/*
+ * unshare flags share the bit space with clone flags but only apply to the
+ * unshare syscall:
+ */
+#define UNSHARE_EMPTY_MNTNS 0x00100000 /* Unshare an empty mount namespace. */
+
#ifndef __ASSEMBLY__
/**
* struct clone_args - arguments for the clone3 syscall
diff --git a/kernel/fork.c b/kernel/fork.c
index 65113a304518..dea6b3454447 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -2620,6 +2620,16 @@ pid_t kernel_clone(struct kernel_clone_args *args)
pid_t nr;
/*
+ * Creating an empty mount namespace implies creating a new mount
+ * namespace. Set this before copy_process() so that the
+ * CLONE_NEWNS|CLONE_FS mutual exclusion check works correctly.
+ */
+ if (clone_flags & CLONE_EMPTY_MNTNS) {
+ clone_flags |= CLONE_NEWNS;
+ args->flags = clone_flags;
+ }
+
+ /*
* For legacy clone() calls, CLONE_PIDFD uses the parent_tid argument
* to return the pidfd. Hence, CLONE_PIDFD and CLONE_PARENT_SETTID are
* mutually exclusive. With clone3() CLONE_PIDFD has grown a separate
@@ -2897,7 +2907,8 @@ static bool clone3_args_valid(struct kernel_clone_args *kargs)
{
/* Verify that no unknown flags are passed along. */
if (kargs->flags &
- ~(CLONE_LEGACY_FLAGS | CLONE_CLEAR_SIGHAND | CLONE_INTO_CGROUP))
+ ~(CLONE_LEGACY_FLAGS | CLONE_CLEAR_SIGHAND |
+ CLONE_INTO_CGROUP | CLONE_EMPTY_MNTNS))
return false;
/*
@@ -3050,7 +3061,7 @@ static int check_unshare_flags(unsigned long unshare_flags)
CLONE_VM|CLONE_FILES|CLONE_SYSVSEM|
CLONE_NEWUTS|CLONE_NEWIPC|CLONE_NEWNET|
CLONE_NEWUSER|CLONE_NEWPID|CLONE_NEWCGROUP|
- CLONE_NEWTIME))
+ CLONE_NEWTIME | UNSHARE_EMPTY_MNTNS))
return -EINVAL;
/*
* Not implemented, but pretend it works if there is nothing
@@ -3149,6 +3160,8 @@ int ksys_unshare(unsigned long unshare_flags)
/*
* If unsharing namespace, must also unshare filesystem information.
*/
+ if (unshare_flags & UNSHARE_EMPTY_MNTNS)
+ unshare_flags |= CLONE_NEWNS;
if (unshare_flags & CLONE_NEWNS)
unshare_flags |= CLONE_FS;
diff --git a/kernel/nsproxy.c b/kernel/nsproxy.c
index 259c4b4f1eeb..1bdc5be2dd20 100644
--- a/kernel/nsproxy.c
+++ b/kernel/nsproxy.c
@@ -95,7 +95,8 @@ static struct nsproxy *create_new_namespaces(u64 flags,
if (!new_nsp)
return ERR_PTR(-ENOMEM);
- new_nsp->mnt_ns = copy_mnt_ns(flags, tsk->nsproxy->mnt_ns, user_ns, new_fs);
+ new_nsp->mnt_ns = copy_mnt_ns(flags, tsk->nsproxy->mnt_ns,
+ user_ns, new_fs);
if (IS_ERR(new_nsp->mnt_ns)) {
err = PTR_ERR(new_nsp->mnt_ns);
goto out_ns;
@@ -212,18 +213,28 @@ int unshare_nsproxy_namespaces(unsigned long unshare_flags,
struct nsproxy **new_nsp, struct cred *new_cred, struct fs_struct *new_fs)
{
struct user_namespace *user_ns;
+ u64 flags = unshare_flags;
int err = 0;
- if (!(unshare_flags & (CLONE_NEWNS | CLONE_NEWUTS | CLONE_NEWIPC |
- CLONE_NEWNET | CLONE_NEWPID | CLONE_NEWCGROUP |
- CLONE_NEWTIME)))
+ if (!(flags & (CLONE_NEWNS | CLONE_NEWUTS | CLONE_NEWIPC |
+ CLONE_NEWNET | CLONE_NEWPID | CLONE_NEWCGROUP |
+ CLONE_NEWTIME)))
return 0;
user_ns = new_cred ? new_cred->user_ns : current_user_ns();
if (!ns_capable(user_ns, CAP_SYS_ADMIN))
return -EPERM;
- *new_nsp = create_new_namespaces(unshare_flags, current, user_ns,
+ /*
+ * Convert the 32-bit UNSHARE_EMPTY_MNTNS (which aliases
+ * CLONE_PARENT_SETTID) to the unique 64-bit CLONE_EMPTY_MNTNS.
+ */
+ if (flags & UNSHARE_EMPTY_MNTNS) {
+ flags &= ~(u64)UNSHARE_EMPTY_MNTNS;
+ flags |= CLONE_EMPTY_MNTNS;
+ }
+
+ *new_nsp = create_new_namespaces(flags, current, user_ns,
new_fs ? new_fs : current->fs);
if (IS_ERR(*new_nsp)) {
err = PTR_ERR(*new_nsp);
diff --git a/tools/testing/selftests/filesystems/empty_mntns/.gitignore b/tools/testing/selftests/filesystems/empty_mntns/.gitignore
new file mode 100644
index 000000000000..99f89d329db2
--- /dev/null
+++ b/tools/testing/selftests/filesystems/empty_mntns/.gitignore
@@ -0,0 +1,4 @@
+# SPDX-License-Identifier: GPL-2.0-only
+clone3_empty_mntns_test
+empty_mntns_test
+overmount_chroot_test
diff --git a/tools/testing/selftests/filesystems/empty_mntns/Makefile b/tools/testing/selftests/filesystems/empty_mntns/Makefile
new file mode 100644
index 000000000000..22e3fb915e81
--- /dev/null
+++ b/tools/testing/selftests/filesystems/empty_mntns/Makefile
@@ -0,0 +1,12 @@
+# SPDX-License-Identifier: GPL-2.0-or-later
+
+CFLAGS += -Wall -O2 -g $(KHDR_INCLUDES) $(TOOLS_INCLUDES)
+LDLIBS += -lcap
+
+TEST_GEN_PROGS := empty_mntns_test overmount_chroot_test clone3_empty_mntns_test
+
+include ../../lib.mk
+
+$(OUTPUT)/empty_mntns_test: ../utils.c
+$(OUTPUT)/overmount_chroot_test: ../utils.c
+$(OUTPUT)/clone3_empty_mntns_test: ../utils.c
diff --git a/tools/testing/selftests/filesystems/empty_mntns/clone3_empty_mntns_test.c b/tools/testing/selftests/filesystems/empty_mntns/clone3_empty_mntns_test.c
new file mode 100644
index 000000000000..130cc1a1b407
--- /dev/null
+++ b/tools/testing/selftests/filesystems/empty_mntns/clone3_empty_mntns_test.c
@@ -0,0 +1,938 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
+ * Tests for empty mount namespace creation via clone3() CLONE_EMPTY_MNTNS
+ *
+ * These tests exercise the clone3() code path for creating empty mount
+ * namespaces, which is distinct from the unshare() path tested in
+ * empty_mntns_test.c. With clone3(), CLONE_EMPTY_MNTNS (1ULL << 37)
+ * is a 64-bit flag that implies CLONE_NEWNS. The implication happens in
+ * kernel_clone() before copy_process(), unlike unshare() where it goes
+ * through UNSHARE_EMPTY_MNTNS -> CLONE_EMPTY_MNTNS conversion in
+ * unshare_nsproxy_namespaces().
+ *
+ * Copyright (c) 2024 Christian Brauner <brauner@kernel.org>
+ */
+
+#define _GNU_SOURCE
+#include <fcntl.h>
+#include <linux/mount.h>
+#include <linux/stat.h>
+#include <stdio.h>
+#include <string.h>
+#include <sys/mount.h>
+#include <sys/stat.h>
+#include <sys/types.h>
+#include <unistd.h>
+
+#include "../utils.h"
+#include "../wrappers.h"
+#include "clone3/clone3_selftests.h"
+#include "empty_mntns.h"
+#include "kselftest_harness.h"
+
+static pid_t clone3_empty_mntns(uint64_t extra_flags)
+{
+	struct __clone_args cl_args = {
+		.exit_signal = SIGCHLD,
+		.flags = extra_flags | CLONE_EMPTY_MNTNS, /* always ask for an empty mntns */
+	};
+
+	return sys_clone3(&cl_args, sizeof(cl_args));
+}
+
+static bool clone3_empty_mntns_supported(void)
+{
+ pid_t pid;
+ int status;
+
+ pid = fork();
+ if (pid < 0)
+ return false;
+
+ if (pid == 0) {
+ if (enter_userns())
+ _exit(1);
+
+ pid = clone3_empty_mntns(0);
+ if (pid < 0)
+ _exit(1);
+
+ if (pid == 0)
+ _exit(0);
+
+ _exit(wait_for_pid(pid) != 0);
+ }
+
+ if (waitpid(pid, &status, 0) != pid)
+ return false;
+
+ if (!WIFEXITED(status))
+ return false;
+
+ return WEXITSTATUS(status) == 0;
+}
+
+FIXTURE(clone3_empty_mntns) {};
+
+FIXTURE_SETUP(clone3_empty_mntns)
+{
+ if (!clone3_empty_mntns_supported())
+ SKIP(return, "CLONE_EMPTY_MNTNS via clone3 not supported");
+}
+
+FIXTURE_TEARDOWN(clone3_empty_mntns) {}
+
+/*
+ * Basic clone3() with CLONE_EMPTY_MNTNS: child gets empty mount namespace
+ * with exactly 1 mount and root == cwd.
+ */
+TEST_F(clone3_empty_mntns, basic)
+{
+ pid_t pid, inner;
+
+ pid = fork();
+ ASSERT_GE(pid, 0);
+
+ if (pid == 0) {
+ if (enter_userns())
+ _exit(1);
+
+ inner = clone3_empty_mntns(0);
+ if (inner < 0)
+ _exit(2);
+
+ if (inner == 0) {
+ uint64_t root_id, cwd_id;
+
+ if (count_mounts() != 1)
+ _exit(3);
+
+ root_id = get_unique_mnt_id("/");
+ cwd_id = get_unique_mnt_id(".");
+ if (root_id == 0 || cwd_id == 0)
+ _exit(4);
+
+ if (root_id != cwd_id)
+ _exit(5);
+
+ _exit(0);
+ }
+
+ _exit(wait_for_pid(inner));
+ }
+
+ ASSERT_EQ(wait_for_pid(pid), 0);
+}
+
+/*
+ * CLONE_EMPTY_MNTNS implies CLONE_NEWNS. Verify that it works without
+ * explicitly setting CLONE_NEWNS (the flag is implied in kernel_clone()).
+ */
+TEST_F(clone3_empty_mntns, implies_newns)
+{
+ pid_t pid, inner;
+
+ pid = fork();
+ ASSERT_GE(pid, 0);
+
+ if (pid == 0) {
+ ssize_t parent_mounts;
+
+ if (enter_userns())
+ _exit(1);
+
+ /* Verify we have mounts in our current namespace. */
+ parent_mounts = count_mounts();
+ if (parent_mounts < 1)
+ _exit(2);
+
+ /* Only CLONE_EMPTY_MNTNS, no explicit CLONE_NEWNS. */
+ inner = clone3_empty_mntns(0);
+ if (inner < 0)
+ _exit(3);
+
+ if (inner == 0) {
+ if (count_mounts() != 1)
+ _exit(4);
+
+ _exit(0);
+ }
+
+ /* Parent still has its mounts. */
+ if (count_mounts() != parent_mounts)
+ _exit(5);
+
+ _exit(wait_for_pid(inner));
+ }
+
+ ASSERT_EQ(wait_for_pid(pid), 0);
+}
+
+/*
+ * Helper macro: generate a test that clones with CLONE_EMPTY_MNTNS |
+ * @extra_flags and verifies the child has exactly one mount.
+ */
+#define TEST_CLONE3_FLAGS(test_name, extra_flags) \
+TEST_F(clone3_empty_mntns, test_name) \
+{ \
+ pid_t pid, inner; \
+ \
+ pid = fork(); \
+ ASSERT_GE(pid, 0); \
+ \
+ if (pid == 0) { \
+ if (enter_userns()) \
+ _exit(1); \
+ \
+ inner = clone3_empty_mntns(extra_flags); \
+ if (inner < 0) \
+ _exit(2); \
+ \
+ if (inner == 0) { \
+ if (count_mounts() != 1) \
+ _exit(3); \
+ _exit(0); \
+ } \
+ \
+ _exit(wait_for_pid(inner)); \
+ } \
+ \
+ ASSERT_EQ(wait_for_pid(pid), 0); \
+}
+
+/* Redundant CLONE_NEWNS | CLONE_EMPTY_MNTNS should succeed. */
+TEST_CLONE3_FLAGS(with_explicit_newns, CLONE_NEWNS)
+
+/* CLONE_EMPTY_MNTNS combined with CLONE_NEWUSER. */
+TEST_CLONE3_FLAGS(with_newuser, CLONE_NEWUSER)
+
+/* CLONE_EMPTY_MNTNS combined with other namespace flags. */
+TEST_CLONE3_FLAGS(with_other_ns_flags, CLONE_NEWUTS | CLONE_NEWIPC)
+
+/*
+ * CLONE_EMPTY_MNTNS combined with CLONE_NEWPID.
+ */
+TEST_F(clone3_empty_mntns, with_newpid)
+{
+ pid_t pid, inner;
+
+ pid = fork();
+ ASSERT_GE(pid, 0);
+
+ if (pid == 0) {
+ if (enter_userns())
+ _exit(1);
+
+ inner = clone3_empty_mntns(CLONE_NEWPID);
+ if (inner < 0)
+ _exit(2);
+
+ if (inner == 0) {
+ if (count_mounts() != 1)
+ _exit(3);
+
+ /* In a new PID namespace, getpid() returns 1. */
+ if (getpid() != 1)
+ _exit(4);
+
+ _exit(0);
+ }
+
+ _exit(wait_for_pid(inner));
+ }
+
+ ASSERT_EQ(wait_for_pid(pid), 0);
+}
+
+/*
+ * CLONE_EMPTY_MNTNS | CLONE_FS must fail because the implied CLONE_NEWNS
+ * and CLONE_FS are mutually exclusive (checked in copy_process()).
+ */
+TEST_F(clone3_empty_mntns, with_clone_fs_fails)
+{
+ pid_t pid;
+
+ pid = fork();
+ ASSERT_GE(pid, 0);
+
+ if (pid == 0) {
+ struct __clone_args args = {
+ .flags = CLONE_EMPTY_MNTNS | CLONE_FS,
+ .exit_signal = SIGCHLD,
+ };
+ pid_t ret;
+
+ if (enter_userns())
+ _exit(1);
+
+ ret = sys_clone3(&args, sizeof(args));
+ if (ret >= 0) {
+ if (ret == 0)
+ _exit(0);
+ wait_for_pid(ret);
+ _exit(2);
+ }
+
+ if (errno != EINVAL)
+ _exit(3);
+
+ _exit(0);
+ }
+
+ ASSERT_EQ(wait_for_pid(pid), 0);
+}
+
+/*
+ * CLONE_EMPTY_MNTNS combined with CLONE_PIDFD returns a valid pidfd.
+ */
+TEST_F(clone3_empty_mntns, with_pidfd)
+{
+ pid_t pid;
+
+ pid = fork();
+ ASSERT_GE(pid, 0);
+
+ if (pid == 0) {
+ struct __clone_args args = {
+ .flags = CLONE_EMPTY_MNTNS | CLONE_PIDFD,
+ .exit_signal = SIGCHLD,
+ };
+ int pidfd = -1;
+ pid_t inner;
+
+ if (enter_userns())
+ _exit(1);
+
+ args.pidfd = (uintptr_t)&pidfd;
+
+ inner = sys_clone3(&args, sizeof(args));
+ if (inner < 0)
+ _exit(2);
+
+ if (inner == 0) {
+ if (count_mounts() != 1)
+ _exit(3);
+
+ _exit(0);
+ }
+
+ /* Verify we got a valid pidfd. */
+ if (pidfd < 0)
+ _exit(4);
+
+ close(pidfd);
+ _exit(wait_for_pid(inner));
+ }
+
+ ASSERT_EQ(wait_for_pid(pid), 0);
+}
+
+/*
+ * clone3 without CAP_SYS_ADMIN must fail with EPERM.
+ */
+TEST_F(clone3_empty_mntns, eperm_without_caps)
+{
+ pid_t pid;
+
+ pid = fork();
+ ASSERT_GE(pid, 0);
+
+ if (pid == 0) {
+ pid_t ret;
+
+ /* Skip if already root. */
+ if (getuid() == 0)
+ _exit(0);
+
+ ret = clone3_empty_mntns(0);
+ if (ret >= 0) {
+ if (ret == 0)
+ _exit(0);
+ wait_for_pid(ret);
+ _exit(1);
+ }
+
+ if (errno != EPERM)
+ _exit(2);
+
+ _exit(0);
+ }
+
+ ASSERT_EQ(wait_for_pid(pid), 0);
+}
+
+/*
+ * Parent's mount namespace is unaffected after clone3 with CLONE_EMPTY_MNTNS.
+ */
+TEST_F(clone3_empty_mntns, parent_unchanged)
+{
+ pid_t pid;
+
+ pid = fork();
+ ASSERT_GE(pid, 0);
+
+ if (pid == 0) {
+ ssize_t nr_before, nr_after;
+ pid_t inner;
+
+ if (enter_userns())
+ _exit(1);
+
+ nr_before = count_mounts();
+ if (nr_before < 1)
+ _exit(2);
+
+ inner = clone3_empty_mntns(0);
+ if (inner < 0)
+ _exit(3);
+
+ if (inner == 0)
+ _exit(0);
+
+ if (wait_for_pid(inner) != 0)
+ _exit(4);
+
+ nr_after = count_mounts();
+ if (nr_after != nr_before)
+ _exit(5);
+
+ _exit(0);
+ }
+
+ ASSERT_EQ(wait_for_pid(pid), 0);
+}
+
+/*
+ * Parent with many mounts: child still gets exactly 1 mount.
+ */
+TEST_F(clone3_empty_mntns, many_parent_mounts)
+{
+ pid_t pid;
+
+ pid = fork();
+ ASSERT_GE(pid, 0);
+
+ if (pid == 0) {
+ char tmpdir[] = "/tmp/clone3_mntns_test.XXXXXX";
+ pid_t inner;
+ int i;
+
+ if (enter_userns())
+ _exit(1);
+
+ if (unshare(CLONE_NEWNS))
+ _exit(2);
+
+ if (mount(NULL, "/", NULL, MS_REC | MS_PRIVATE, NULL))
+ _exit(3);
+
+ if (!mkdtemp(tmpdir))
+ _exit(4);
+
+ if (mount("tmpfs", tmpdir, "tmpfs", 0, "size=1M"))
+ _exit(5);
+
+ for (i = 0; i < 5; i++) {
+ char subdir[256];
+
+ snprintf(subdir, sizeof(subdir), "%s/sub%d", tmpdir, i);
+ if (mkdir(subdir, 0755) && errno != EEXIST)
+ _exit(6);
+ if (mount(subdir, subdir, NULL, MS_BIND, NULL))
+ _exit(7);
+ }
+
+ if (count_mounts() < 5)
+ _exit(8);
+
+ inner = clone3_empty_mntns(0);
+ if (inner < 0)
+ _exit(9);
+
+ if (inner == 0) {
+ if (count_mounts() != 1)
+ _exit(10);
+
+ _exit(0);
+ }
+
+ _exit(wait_for_pid(inner));
+ }
+
+ ASSERT_EQ(wait_for_pid(pid), 0);
+}
+
+/*
+ * Verify the child's root mount is nullfs with expected statmount properties.
+ */
+TEST_F(clone3_empty_mntns, mount_properties)
+{
+ pid_t pid;
+
+ pid = fork();
+ ASSERT_GE(pid, 0);
+
+ if (pid == 0) {
+ pid_t inner;
+
+ if (enter_userns())
+ _exit(1);
+
+ inner = clone3_empty_mntns(0);
+ if (inner < 0)
+ _exit(2);
+
+ if (inner == 0) {
+ struct statmount *sm;
+ uint64_t root_id;
+
+ root_id = get_unique_mnt_id("/");
+ if (!root_id)
+ _exit(3);
+
+ sm = statmount_alloc(root_id, 0,
+ STATMOUNT_MNT_BASIC |
+ STATMOUNT_MNT_POINT |
+ STATMOUNT_FS_TYPE);
+ if (!sm)
+ _exit(4);
+
+ /* Root mount point is "/". */
+ if (!(sm->mask & STATMOUNT_MNT_POINT))
+ _exit(5);
+ if (strcmp(sm->str + sm->mnt_point, "/") != 0)
+ _exit(6);
+
+ /* Filesystem type is nullfs. */
+ if (!(sm->mask & STATMOUNT_FS_TYPE))
+ _exit(7);
+ if (strcmp(sm->str + sm->fs_type, "nullfs") != 0)
+ _exit(8);
+
+ /* Root mount is its own parent. */
+ if (!(sm->mask & STATMOUNT_MNT_BASIC))
+ _exit(9);
+ if (sm->mnt_parent_id != sm->mnt_id)
+ _exit(10);
+
+ free(sm);
+ _exit(0);
+ }
+
+ _exit(wait_for_pid(inner));
+ }
+
+ ASSERT_EQ(wait_for_pid(pid), 0);
+}
+
+/*
+ * Listmount returns only the root mount in the child's empty namespace.
+ */
+TEST_F(clone3_empty_mntns, listmount_single_entry)
+{
+ pid_t pid;
+
+ pid = fork();
+ ASSERT_GE(pid, 0);
+
+ if (pid == 0) {
+ pid_t inner;
+
+ if (enter_userns())
+ _exit(1);
+
+ inner = clone3_empty_mntns(0);
+ if (inner < 0)
+ _exit(2);
+
+ if (inner == 0) {
+ uint64_t list[16];
+ ssize_t nr_mounts;
+ uint64_t root_id;
+
+ nr_mounts = listmount(LSMT_ROOT, 0, 0, list, 16, 0);
+ if (nr_mounts != 1)
+ _exit(3);
+
+ root_id = get_unique_mnt_id("/");
+ if (!root_id)
+ _exit(4);
+
+ if (list[0] != root_id)
+ _exit(5);
+
+ _exit(0);
+ }
+
+ _exit(wait_for_pid(inner));
+ }
+
+ ASSERT_EQ(wait_for_pid(pid), 0);
+}
+
+/*
+ * Child can mount tmpfs over nullfs root (the primary container use case).
+ *
+ * Uses the new mount API (fsopen/fsmount/move_mount) because resolving
+ * "/" returns the process root directly without following overmounts.
+ * The mount fd from fsmount lets us fchdir + chroot into the new tmpfs.
+ */
+TEST_F(clone3_empty_mntns, child_overmount_tmpfs)
+{
+ pid_t pid;
+
+ pid = fork();
+ ASSERT_GE(pid, 0);
+
+ if (pid == 0) {
+ pid_t inner;
+
+ if (enter_userns())
+ _exit(1);
+
+ inner = clone3_empty_mntns(0);
+ if (inner < 0)
+ _exit(2);
+
+ if (inner == 0) {
+ struct statmount *sm;
+ uint64_t root_id;
+ int fd, fsfd, mntfd;
+
+ if (count_mounts() != 1)
+ _exit(3);
+
+ /* Verify root is nullfs. */
+ root_id = get_unique_mnt_id("/");
+ if (!root_id)
+ _exit(4);
+
+ sm = statmount_alloc(root_id, 0, STATMOUNT_FS_TYPE);
+ if (!sm)
+ _exit(5);
+ if (!(sm->mask & STATMOUNT_FS_TYPE))
+ _exit(6);
+ if (strcmp(sm->str + sm->fs_type, "nullfs") != 0)
+ _exit(7);
+ free(sm);
+
+ /* Create tmpfs via the new mount API. */
+ fsfd = sys_fsopen("tmpfs", 0);
+ if (fsfd < 0)
+ _exit(8);
+
+ if (sys_fsconfig(fsfd, FSCONFIG_SET_STRING,
+ "size", "1M", 0)) {
+ close(fsfd);
+ _exit(9);
+ }
+
+ if (sys_fsconfig(fsfd, FSCONFIG_CMD_CREATE,
+ NULL, NULL, 0)) {
+ close(fsfd);
+ _exit(10);
+ }
+
+ mntfd = sys_fsmount(fsfd, 0, 0);
+ close(fsfd);
+ if (mntfd < 0)
+ _exit(11);
+
+ /* Attach tmpfs to "/". */
+ if (sys_move_mount(mntfd, "", AT_FDCWD, "/",
+ MOVE_MOUNT_F_EMPTY_PATH)) {
+ close(mntfd);
+ _exit(12);
+ }
+
+ if (count_mounts() != 2) {
+ close(mntfd);
+ _exit(13);
+ }
+
+ /* Enter the tmpfs. */
+ if (fchdir(mntfd)) {
+ close(mntfd);
+ _exit(14);
+ }
+
+ if (chroot(".")) {
+ close(mntfd);
+ _exit(15);
+ }
+
+ close(mntfd);
+
+ /* Verify "/" is now tmpfs. */
+ root_id = get_unique_mnt_id("/");
+ if (!root_id)
+ _exit(16);
+
+ sm = statmount_alloc(root_id, 0, STATMOUNT_FS_TYPE);
+ if (!sm)
+ _exit(17);
+ if (!(sm->mask & STATMOUNT_FS_TYPE))
+ _exit(18);
+ if (strcmp(sm->str + sm->fs_type, "tmpfs") != 0)
+ _exit(19);
+ free(sm);
+
+ /* Verify tmpfs is writable. */
+ fd = open("/testfile", O_CREAT | O_RDWR, 0644);
+ if (fd < 0)
+ _exit(20);
+
+ if (write(fd, "test", 4) != 4) {
+ close(fd);
+ _exit(21);
+ }
+ close(fd);
+
+ if (access("/testfile", F_OK))
+ _exit(22);
+
+ _exit(0);
+ }
+
+ _exit(wait_for_pid(inner));
+ }
+
+ ASSERT_EQ(wait_for_pid(pid), 0);
+}
+
+/*
+ * Multiple clone3 calls with CLONE_EMPTY_MNTNS produce children with
+ * distinct mount namespace root mount IDs.
+ */
+TEST_F(clone3_empty_mntns, repeated)
+{
+ pid_t pid;
+
+ pid = fork();
+ ASSERT_GE(pid, 0);
+
+ if (pid == 0) {
+ int pipe1[2], pipe2[2];
+ uint64_t id1 = 0, id2 = 0;
+ pid_t inner1, inner2;
+
+ if (enter_userns())
+ _exit(1);
+
+ if (pipe(pipe1) || pipe(pipe2))
+ _exit(2);
+
+ inner1 = clone3_empty_mntns(0);
+ if (inner1 < 0)
+ _exit(3);
+
+ if (inner1 == 0) {
+ uint64_t root_id;
+
+ close(pipe1[0]);
+ root_id = get_unique_mnt_id("/");
+ if (write(pipe1[1], &root_id, sizeof(root_id)) != sizeof(root_id))
+ _exit(1);
+ close(pipe1[1]);
+ _exit(0);
+ }
+
+ inner2 = clone3_empty_mntns(0);
+ if (inner2 < 0)
+ _exit(4);
+
+ if (inner2 == 0) {
+ uint64_t root_id;
+
+ close(pipe2[0]);
+ root_id = get_unique_mnt_id("/");
+ if (write(pipe2[1], &root_id, sizeof(root_id)) != sizeof(root_id))
+ _exit(1);
+ close(pipe2[1]);
+ _exit(0);
+ }
+
+ close(pipe1[1]);
+ close(pipe2[1]);
+
+ if (read(pipe1[0], &id1, sizeof(id1)) != sizeof(id1))
+ _exit(5);
+ if (read(pipe2[0], &id2, sizeof(id2)) != sizeof(id2))
+ _exit(6);
+
+ close(pipe1[0]);
+ close(pipe2[0]);
+
+ if (wait_for_pid(inner1) || wait_for_pid(inner2))
+ _exit(7);
+
+ /* Each child must have a distinct root mount ID. */
+ if (id1 == 0 || id2 == 0)
+ _exit(8);
+ if (id1 == id2)
+ _exit(9);
+
+ _exit(0);
+ }
+
+ ASSERT_EQ(wait_for_pid(pid), 0);
+}
+
+/*
+ * Verify setns() into a (still running) child's empty mount namespace works.
+ */
+TEST_F(clone3_empty_mntns, setns_into_child_mntns)
+{
+	pid_t pid;
+
+	pid = fork();
+	ASSERT_GE(pid, 0);
+
+	if (pid == 0) {
+		int ready_fd[2], done_fd[2];
+		pid_t inner;
+		char c;
+
+		if (enter_userns())
+			_exit(1);
+
+		if (pipe(ready_fd) || pipe(done_fd))
+			_exit(2);
+
+		inner = clone3_empty_mntns(0);
+		if (inner < 0)
+			_exit(3);
+
+		if (inner == 0) {
+			close(ready_fd[0]);
+			close(done_fd[1]);
+			/* Signal parent we're ready. */
+			if (write(ready_fd[1], "r", 1) != 1)
+				_exit(1);
+
+			/*
+			 * Stay alive until the parent closes done_fd[1] or exits (EOF).
+			 */
+			if (read(done_fd[0], &c, 1) < 0)
+				_exit(1);
+			_exit(0);
+		}
+
+		close(ready_fd[1]);
+		close(done_fd[0]);
+
+		/* Wait for child to be ready. */
+		if (read(ready_fd[0], &c, 1) != 1)
+			_exit(4);
+
+		/* Open child's mount namespace. */
+		{
+			char path[64];
+			int mntns_fd;
+
+			snprintf(path, sizeof(path), "/proc/%d/ns/mnt", inner);
+			mntns_fd = open(path, O_RDONLY);
+			if (mntns_fd < 0)
+				_exit(5);
+
+			if (setns(mntns_fd, CLONE_NEWNS))
+				_exit(6);
+
+			close(mntns_fd);
+		}
+
+		/* Now we should be in the child's empty mntns. */
+		if (count_mounts() != 1)
+			_exit(7);
+
+		/* Release the child, then reap it. */
+		close(done_fd[1]);
+		_exit(wait_for_pid(inner));
+	}
+
+	ASSERT_EQ(wait_for_pid(pid), 0);
+}
+
+/*
+ * Tests below do not require CLONE_EMPTY_MNTNS support.
+ */
+
+/*
+ * Unknown flags are rejected; bit 63 is used as it is unlikely to be assigned.
+ */
+TEST(unknown_flags_rejected)
+{
+	pid_t pid;
+
+	pid = fork();
+	ASSERT_GE(pid, 0);
+
+	if (pid == 0) {
+		struct __clone_args args = {
+			.flags = 1ULL << 63,
+			.exit_signal = SIGCHLD,
+		};
+		pid_t ret;
+
+		ret = sys_clone3(&args, sizeof(args));
+		if (ret >= 0) {
+			if (ret == 0)
+				_exit(0);
+			wait_for_pid(ret);
+			_exit(1);
+		}
+
+		if (errno != EINVAL)
+			_exit(2);
+
+		_exit(0);
+	}
+
+	ASSERT_EQ(wait_for_pid(pid), 0);
+}
+
+/*
+ * Regular clone3 with CLONE_NEWNS (without CLONE_EMPTY_MNTNS) still
+ * copies the full mount tree.
+ */
+TEST(clone3_newns_full_copy)
+{
+ pid_t pid;
+
+ pid = fork();
+ ASSERT_GE(pid, 0);
+
+ if (pid == 0) {
+ struct __clone_args args = {
+ .flags = CLONE_NEWNS,
+ .exit_signal = SIGCHLD,
+ };
+ ssize_t parent_mounts;
+ pid_t inner;
+
+ if (enter_userns())
+ _exit(1);
+
+ parent_mounts = count_mounts();
+ if (parent_mounts < 1)
+ _exit(2);
+
+ inner = sys_clone3(&args, sizeof(args));
+ if (inner < 0)
+ _exit(3);
+
+ if (inner == 0) {
+ /* Full copy should have at least as many mounts. */
+ if (count_mounts() < parent_mounts)
+ _exit(1);
+
+ _exit(0);
+ }
+
+ _exit(wait_for_pid(inner));
+ }
+
+ ASSERT_EQ(wait_for_pid(pid), 0);
+}
+
+TEST_HARNESS_MAIN
diff --git a/tools/testing/selftests/filesystems/empty_mntns/empty_mntns.h b/tools/testing/selftests/filesystems/empty_mntns/empty_mntns.h
new file mode 100644
index 000000000000..dfd24c88eec8
--- /dev/null
+++ b/tools/testing/selftests/filesystems/empty_mntns/empty_mntns.h
@@ -0,0 +1,50 @@
+/* SPDX-License-Identifier: GPL-2.0-or-later */
+#ifndef EMPTY_MNTNS_H
+#define EMPTY_MNTNS_H
+
+#include <errno.h>
+#include <stdlib.h>
+
+#include "../statmount/statmount.h"
+
+#ifndef UNSHARE_EMPTY_MNTNS
+#define UNSHARE_EMPTY_MNTNS 0x00100000
+#endif
+
+#ifndef CLONE_EMPTY_MNTNS
+#define CLONE_EMPTY_MNTNS (1ULL << 37)
+#endif
+
+static inline ssize_t count_mounts(void)
+{
+	uint64_t mnt_ids[4096];	/* scratch space: only listmount()'s return value is used */
+
+	return listmount(LSMT_ROOT, 0, 0, mnt_ids, sizeof(mnt_ids) / sizeof(mnt_ids[0]), 0);
+}
+
+/* Return a malloc()ed statmount reply, growing the buffer on EOVERFLOW. */
+static inline struct statmount *statmount_alloc(uint64_t mnt_id,
+						uint64_t mnt_ns_id, uint64_t mask)
+{
+	size_t bufsize = 1 << 15;
+
+	for (;;) {
+		struct statmount *buf = malloc(bufsize);
+		int err;
+
+		if (!buf)
+			return NULL;
+		if (statmount(mnt_id, mnt_ns_id, 0, mask, buf, bufsize, 0) == 0)
+			return buf;
+
+		/* Save errno before free(), which may overwrite it. */
+		err = errno;
+		free(buf);
+		if (err != EOVERFLOW)
+			return NULL;
+
+		bufsize <<= 1;
+	}
+}
+
+#endif /* EMPTY_MNTNS_H */
diff --git a/tools/testing/selftests/filesystems/empty_mntns/empty_mntns_test.c b/tools/testing/selftests/filesystems/empty_mntns/empty_mntns_test.c
new file mode 100644
index 000000000000..733aad83dbbf
--- /dev/null
+++ b/tools/testing/selftests/filesystems/empty_mntns/empty_mntns_test.c
@@ -0,0 +1,725 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
+ * Tests for empty mount namespace creation via UNSHARE_EMPTY_MNTNS
+ *
+ * Copyright (c) 2024 Christian Brauner <brauner@kernel.org>
+ */
+
+#define _GNU_SOURCE
+#include <fcntl.h>
+#include <linux/mount.h>
+#include <linux/stat.h>
+#include <sched.h>
+#include <stdio.h>
+#include <string.h>
+#include <sys/mount.h>
+#include <sys/stat.h>
+#include <sys/types.h>
+#include <sys/wait.h>
+#include <unistd.h>
+
+#include "../utils.h"
+#include "../wrappers.h"
+#include "empty_mntns.h"
+#include "kselftest_harness.h"
+
+/*
+ * Probe for UNSHARE_EMPTY_MNTNS in a throwaway child.
+ *
+ * The child enters a user namespace and calls unshare(UNSHARE_EMPTY_MNTNS).
+ * Only a failure with EINVAL counts as "unsupported"; any other outcome of
+ * unshare() is reported as supported. Returns false as well when fork(),
+ * enter_userns() or reaping the child fails.
+ */
+static bool unshare_empty_mntns_supported(void)
+{
+	int wstatus;
+	pid_t child = fork();
+
+	if (child < 0)
+		return false;
+
+	if (child == 0) {
+		int unsupported = enter_userns() != 0;
+
+		if (!unsupported && unshare(UNSHARE_EMPTY_MNTNS) != 0)
+			unsupported = (errno == EINVAL);
+
+		_exit(unsupported ? 1 : 0);
+	}
+
+	if (waitpid(child, &wstatus, 0) != child || !WIFEXITED(wstatus))
+		return false;
+
+	return !WEXITSTATUS(wstatus);
+}
+
+
+/* Fixture for the UNSHARE_EMPTY_MNTNS tests; it carries no per-test state. */
+FIXTURE(empty_mntns) {};
+
+/* Skip the test when the UNSHARE_EMPTY_MNTNS support probe fails. */
+FIXTURE_SETUP(empty_mntns)
+{
+ if (!unshare_empty_mntns_supported())
+ SKIP(return, "UNSHARE_EMPTY_MNTNS not supported");
+}
+
+/* Nothing to undo: setup acquires no resources. */
+FIXTURE_TEARDOWN(empty_mntns) {}
+
+/*
+ * Verify that unshare succeeds, leaves exactly one mount, and that "/" and
+ * "." resolve to the same mount.
+ */
+TEST_F(empty_mntns, basic)
+{
+	uint64_t root_mnt, cwd_mnt;
+	pid_t child = fork();
+
+	ASSERT_GE(child, 0);
+	if (child > 0) {
+		ASSERT_EQ(wait_for_pid(child), 0);
+		return;
+	}
+
+	/* Child: each failing step exits with its own non-zero status. */
+	if (enter_userns() != 0)
+		_exit(1);
+
+	if (unshare(UNSHARE_EMPTY_MNTNS) != 0)
+		_exit(2);
+
+	if (count_mounts() != 1)
+		_exit(3);
+
+	root_mnt = get_unique_mnt_id("/");
+	cwd_mnt = get_unique_mnt_id(".");
+	if (!root_mnt || !cwd_mnt)
+		_exit(4);
+
+	if (cwd_mnt != root_mnt)
+		_exit(5);
+
+	_exit(0);
+}
+
+/*
+ * UNSHARE_EMPTY_MNTNS combined with CLONE_NEWUSER.
+ *
+ * The user namespace must be created first so /proc is still accessible
+ * for writing uid_map/gid_map. The empty mount namespace is created
+ * afterwards.
+ *
+ * The child exits with a distinct non-zero status for each failing step;
+ * the parent only checks that the child reported 0.
+ */
+TEST_F(empty_mntns, with_clone_newuser)
+{
+	pid_t pid;
+
+	pid = fork();
+	ASSERT_GE(pid, 0);
+
+	if (pid == 0) {
+		/* Read the IDs before unshare() changes how they are seen. */
+		uid_t uid = getuid();
+		gid_t gid = getgid();
+		char map[100];
+
+		if (unshare(CLONE_NEWUSER))
+			_exit(1);
+
+		/*
+		 * uid_t and gid_t are unsigned: format them with %u so IDs
+		 * above INT_MAX are not written to the map as negative
+		 * numbers.
+		 */
+		snprintf(map, sizeof(map), "0 %u 1", (unsigned int)uid);
+		if (write_file("/proc/self/uid_map", map))
+			_exit(2);
+
+		/* setgroups is denied before gid_map is written. */
+		if (write_file("/proc/self/setgroups", "deny"))
+			_exit(3);
+
+		snprintf(map, sizeof(map), "0 %u 1", (unsigned int)gid);
+		if (write_file("/proc/self/gid_map", map))
+			_exit(4);
+
+		if (unshare(UNSHARE_EMPTY_MNTNS))
+			_exit(5);
+
+		if (count_mounts() != 1)
+			_exit(6);
+
+		_exit(0);
+	}
+
+	ASSERT_EQ(wait_for_pid(pid), 0);
+}
+
+/*
+ * UNSHARE_EMPTY_MNTNS may be passed together with CLONE_NEWUTS and
+ * CLONE_NEWIPC in a single unshare() call; the result still has one mount.
+ */
+TEST_F(empty_mntns, with_other_ns_flags)
+{
+	const int flags = UNSHARE_EMPTY_MNTNS | CLONE_NEWUTS | CLONE_NEWIPC;
+	pid_t child = fork();
+
+	ASSERT_GE(child, 0);
+	if (child > 0) {
+		ASSERT_EQ(wait_for_pid(child), 0);
+		return;
+	}
+
+	/* Child: each failing step exits with its own non-zero status. */
+	if (enter_userns() != 0)
+		_exit(1);
+
+	if (unshare(flags) != 0)
+		_exit(2);
+
+	if (count_mounts() != 1)
+		_exit(3);
+
+	_exit(0);
+}
+
+/*
+ * EPERM without proper capabilities.
+ *
+ * The child calls unshare(UNSHARE_EMPTY_MNTNS) without entering a user
+ * namespace first and expects the call to fail with EPERM. The check is
+ * meaningless for a root caller, so in that case the test is reported as
+ * skipped rather than as a pass that verified nothing.
+ */
+TEST_F(empty_mntns, eperm_without_caps)
+{
+	pid_t pid;
+
+	if (getuid() == 0)
+		SKIP(return, "test requires an unprivileged caller");
+
+	pid = fork();
+	ASSERT_GE(pid, 0);
+
+	if (pid == 0) {
+		/* Success would mean the permission check is missing. */
+		if (unshare(UNSHARE_EMPTY_MNTNS) == 0)
+			_exit(1);
+
+		if (errno != EPERM)
+			_exit(2);
+
+		_exit(0);
+	}
+
+	ASSERT_EQ(wait_for_pid(pid), 0);
+}
+
+/*
+ * Many source mounts still result in exactly 1 mount.
+ *
+ * The scratch directory is created and removed by the parent: once the
+ * child has switched to the empty mount namespace it can no longer reach
+ * /tmp, so it cannot clean up after itself. The child exits with a distinct
+ * non-zero status for each failing step.
+ */
+TEST_F(empty_mntns, many_source_mounts)
+{
+	char tmpdir[] = "/tmp/empty_mntns_test.XXXXXX";
+	pid_t pid;
+	int ret;
+
+	ASSERT_TRUE(mkdtemp(tmpdir) != NULL);
+
+	pid = fork();
+	if (pid < 0)
+		rmdir(tmpdir);
+	ASSERT_GE(pid, 0);
+
+	if (pid == 0) {
+		int i;
+
+		if (enter_userns())
+			_exit(1);
+
+		if (unshare(CLONE_NEWNS))
+			_exit(2);
+
+		/* Keep the mounts created below out of the parent namespace. */
+		if (mount(NULL, "/", NULL, MS_REC | MS_PRIVATE, NULL))
+			_exit(3);
+
+		if (mount("tmpfs", tmpdir, "tmpfs", 0, "size=1M"))
+			_exit(4);
+
+		for (i = 0; i < 5; i++) {
+			char subdir[256];
+
+			snprintf(subdir, sizeof(subdir), "%s/sub%d", tmpdir, i);
+			if (mkdir(subdir, 0755) && errno != EEXIST)
+				_exit(5);
+			if (mount(subdir, subdir, NULL, MS_BIND, NULL))
+				_exit(6);
+		}
+
+		if (count_mounts() < 5)
+			_exit(7);
+
+		if (unshare(UNSHARE_EMPTY_MNTNS))
+			_exit(8);
+
+		if (count_mounts() != 1)
+			_exit(9);
+
+		_exit(0);
+	}
+
+	ret = wait_for_pid(pid);
+	/* Best effort: remove the scratch directory whatever the outcome. */
+	rmdir(tmpdir);
+	ASSERT_EQ(ret, 0);
+}
+
+/*
+ * CWD on a different mount gets reset to root.
+ *
+ * The child changes into a tmpfs mounted on a scratch directory, unshares
+ * into an empty mount namespace and then checks that "." and "/" resolve to
+ * the same mount, whose mount point is "/".
+ *
+ * The scratch directory is created and removed by the parent because the
+ * child cannot reach /tmp any more after the final unshare(). The child
+ * exits with a distinct non-zero status for each failing step.
+ */
+TEST_F(empty_mntns, cwd_reset)
+{
+	char tmpdir[] = "/tmp/empty_mntns_cwd.XXXXXX";
+	pid_t pid;
+	int ret;
+
+	ASSERT_TRUE(mkdtemp(tmpdir) != NULL);
+
+	pid = fork();
+	if (pid < 0)
+		rmdir(tmpdir);
+	ASSERT_GE(pid, 0);
+
+	if (pid == 0) {
+		uint64_t root_id, cwd_id;
+		struct statmount *sm;
+
+		if (enter_userns())
+			_exit(1);
+
+		if (unshare(CLONE_NEWNS))
+			_exit(2);
+
+		/* Keep the mount created below out of the parent namespace. */
+		if (mount(NULL, "/", NULL, MS_REC | MS_PRIVATE, NULL))
+			_exit(3);
+
+		if (mount("tmpfs", tmpdir, "tmpfs", 0, "size=1M"))
+			_exit(4);
+
+		if (chdir(tmpdir))
+			_exit(5);
+
+		if (unshare(UNSHARE_EMPTY_MNTNS))
+			_exit(6);
+
+		root_id = get_unique_mnt_id("/");
+		cwd_id = get_unique_mnt_id(".");
+		if (root_id == 0 || cwd_id == 0)
+			_exit(7);
+
+		if (root_id != cwd_id)
+			_exit(8);
+
+		sm = statmount_alloc(root_id, 0, STATMOUNT_MNT_ROOT | STATMOUNT_MNT_POINT);
+		if (!sm)
+			_exit(9);
+
+		/* mnt_point is only meaningful if the kernel filled it in. */
+		if (!(sm->mask & STATMOUNT_MNT_POINT))
+			_exit(10);
+
+		if (strcmp(sm->str + sm->mnt_point, "/") != 0)
+			_exit(11);
+
+		free(sm);
+		_exit(0);
+	}
+
+	ret = wait_for_pid(pid);
+	/* Best effort: remove the scratch directory whatever the outcome. */
+	rmdir(tmpdir);
+	ASSERT_EQ(ret, 0);
+}
+
+/*
+ * Check what statmount() reports for the root mount of an empty mount
+ * namespace: mount point "/" and a mount ID matching get_unique_mnt_id("/").
+ */
+TEST_F(empty_mntns, mount_properties)
+{
+	const uint64_t wanted = STATMOUNT_MNT_BASIC | STATMOUNT_MNT_ROOT |
+				STATMOUNT_MNT_POINT | STATMOUNT_FS_TYPE;
+	struct statmount *info;
+	uint64_t root_mnt;
+	pid_t child = fork();
+
+	ASSERT_GE(child, 0);
+	if (child > 0) {
+		ASSERT_EQ(wait_for_pid(child), 0);
+		return;
+	}
+
+	/* Child: each failing step exits with its own non-zero status. */
+	if (enter_userns() != 0)
+		_exit(1);
+
+	if (unshare(UNSHARE_EMPTY_MNTNS) != 0)
+		_exit(2);
+
+	root_mnt = get_unique_mnt_id("/");
+	if (root_mnt == 0)
+		_exit(3);
+
+	info = statmount_alloc(root_mnt, 0, wanted);
+	if (info == NULL)
+		_exit(4);
+
+	if ((info->mask & STATMOUNT_MNT_POINT) == 0)
+		_exit(5);
+
+	if (strcmp(info->str + info->mnt_point, "/"))
+		_exit(6);
+
+	if ((info->mask & STATMOUNT_MNT_BASIC) == 0)
+		_exit(7);
+
+	if (info->mnt_id != root_mnt)
+		_exit(8);
+
+	free(info);
+	_exit(0);
+}
+
+/*
+ * Consecutive UNSHARE_EMPTY_MNTNS calls produce new namespaces.
+ *
+ * The root mount ID is sampled after each unshare() and the two samples
+ * must differ. Both samples are checked for 0 first, matching how the other
+ * tests treat get_unique_mnt_id(); otherwise a single failed lookup would
+ * make the IDs "differ" and the test pass without having compared anything.
+ * The child exits with a distinct non-zero status for each failing step.
+ */
+TEST_F(empty_mntns, repeated_unshare)
+{
+	pid_t pid;
+
+	pid = fork();
+	ASSERT_GE(pid, 0);
+
+	if (pid == 0) {
+		uint64_t first_root_id, second_root_id;
+
+		if (enter_userns())
+			_exit(1);
+
+		if (unshare(UNSHARE_EMPTY_MNTNS))
+			_exit(2);
+
+		if (count_mounts() != 1)
+			_exit(3);
+
+		first_root_id = get_unique_mnt_id("/");
+		if (!first_root_id)
+			_exit(4);
+
+		if (unshare(UNSHARE_EMPTY_MNTNS))
+			_exit(5);
+
+		if (count_mounts() != 1)
+			_exit(6);
+
+		second_root_id = get_unique_mnt_id("/");
+		if (!second_root_id)
+			_exit(7);
+
+		if (first_root_id == second_root_id)
+			_exit(8);
+
+		_exit(0);
+	}
+
+	ASSERT_EQ(wait_for_pid(pid), 0);
+}
+
+/*
+ * For the root mount of an empty mount namespace, statmount() reports a
+ * parent ID equal to the mount's own ID.
+ */
+TEST_F(empty_mntns, root_is_own_parent)
+{
+	struct statmount info;
+	uint64_t root_mnt;
+	pid_t child = fork();
+
+	ASSERT_GE(child, 0);
+	if (child > 0) {
+		ASSERT_EQ(wait_for_pid(child), 0);
+		return;
+	}
+
+	/* Child: each failing step exits with its own non-zero status. */
+	if (enter_userns() != 0)
+		_exit(1);
+
+	if (unshare(UNSHARE_EMPTY_MNTNS) != 0)
+		_exit(2);
+
+	root_mnt = get_unique_mnt_id("/");
+	if (root_mnt == 0)
+		_exit(3);
+
+	/* Only the basic fields are requested, so the bare struct suffices. */
+	if (statmount(root_mnt, 0, 0, STATMOUNT_MNT_BASIC, &info, sizeof(info), 0) < 0)
+		_exit(4);
+
+	if ((info.mask & STATMOUNT_MNT_BASIC) == 0)
+		_exit(5);
+
+	if (info.mnt_id != info.mnt_parent_id)
+		_exit(6);
+
+	_exit(0);
+}
+
+/*
+ * listmount() on an empty mount namespace yields a single ID, and that ID
+ * is the one get_unique_mnt_id("/") returns.
+ */
+TEST_F(empty_mntns, listmount_single_entry)
+{
+	uint64_t ids[16];
+	uint64_t root_mnt;
+	ssize_t nr;
+	pid_t child = fork();
+
+	ASSERT_GE(child, 0);
+	if (child > 0) {
+		ASSERT_EQ(wait_for_pid(child), 0);
+		return;
+	}
+
+	/* Child: each failing step exits with its own non-zero status. */
+	if (enter_userns() != 0)
+		_exit(1);
+
+	if (unshare(UNSHARE_EMPTY_MNTNS) != 0)
+		_exit(2);
+
+	nr = listmount(LSMT_ROOT, 0, 0, ids, sizeof(ids) / sizeof(ids[0]), 0);
+	if (nr != 1)
+		_exit(3);
+
+	root_mnt = get_unique_mnt_id("/");
+	if (root_mnt == 0)
+		_exit(4);
+
+	if (ids[0] != root_mnt)
+		_exit(5);
+
+	_exit(0);
+}
+
+/*
+ * Mount tmpfs over nullfs root to build a writable filesystem from scratch.
+ * This exercises the intended usage pattern: create an empty mount namespace
+ * (which has a nullfs root), then mount a real filesystem over it.
+ *
+ * Because resolving "/" returns the process root directly (via nd_jump_root)
+ * without following overmounts, we use the new mount API (fsopen/fsmount)
+ * to obtain a mount fd, then fchdir + chroot to enter the new filesystem.
+ *
+ * The child exits with a distinct non-zero status for each failing step;
+ * the parent only checks that the child reported 0.
+ */
+TEST_F(empty_mntns, overmount_tmpfs)
+{
+ pid_t pid;
+
+ pid = fork();
+ ASSERT_GE(pid, 0);
+
+ if (pid == 0) {
+ struct statmount *sm;
+ uint64_t root_id, cwd_id;
+ int fd, fsfd, mntfd;
+
+ if (enter_userns())
+ _exit(1);
+
+ if (unshare(UNSHARE_EMPTY_MNTNS))
+ _exit(2);
+
+ if (count_mounts() != 1)
+ _exit(3);
+
+ root_id = get_unique_mnt_id("/");
+ if (!root_id)
+ _exit(4);
+
+ /* Verify root is nullfs */
+ sm = statmount_alloc(root_id, 0, STATMOUNT_FS_TYPE);
+ if (!sm)
+ _exit(5);
+
+ if (!(sm->mask & STATMOUNT_FS_TYPE))
+ _exit(6);
+
+ if (strcmp(sm->str + sm->fs_type, "nullfs") != 0)
+ _exit(7);
+
+ free(sm);
+
+ /* The working directory must sit on the same (root) mount */
+ cwd_id = get_unique_mnt_id(".");
+ if (!cwd_id || root_id != cwd_id)
+ _exit(8);
+
+ /*
+ * nullfs root is immutable. open(O_CREAT) returns ENOENT
+ * because empty_dir_lookup() returns -ENOENT before the
+ * IS_IMMUTABLE permission check in may_o_create() is reached.
+ */
+ fd = open("/test", O_CREAT | O_RDWR, 0644);
+ if (fd >= 0) {
+ close(fd);
+ _exit(9);
+ }
+ if (errno != ENOENT)
+ _exit(10);
+
+ /*
+ * Use the new mount API to create tmpfs and get a mount fd.
+ * We need the fd because after attaching the tmpfs on top of
+ * "/", path resolution of "/" still returns the process root
+ * (nullfs) without following the overmount. The mount fd
+ * lets us fchdir + chroot into the tmpfs.
+ */
+ fsfd = sys_fsopen("tmpfs", 0);
+ if (fsfd < 0)
+ _exit(11);
+
+ if (sys_fsconfig(fsfd, FSCONFIG_SET_STRING, "size", "1M", 0)) {
+ close(fsfd);
+ _exit(12);
+ }
+
+ if (sys_fsconfig(fsfd, FSCONFIG_CMD_CREATE, NULL, NULL, 0)) {
+ close(fsfd);
+ _exit(13);
+ }
+
+ /* The filesystem context fd is not used again once fsmount() returned */
+ mntfd = sys_fsmount(fsfd, 0, 0);
+ close(fsfd);
+ if (mntfd < 0)
+ _exit(14);
+
+ /* Attach the detached tmpfs mount on top of "/" */
+ if (sys_move_mount(mntfd, "", AT_FDCWD, "/",
+ MOVE_MOUNT_F_EMPTY_PATH)) {
+ close(mntfd);
+ _exit(15);
+ }
+
+ /* Expect the original root plus the tmpfs just attached */
+ if (count_mounts() != 2) {
+ close(mntfd);
+ _exit(16);
+ }
+
+ /* Enter the tmpfs via the mount fd */
+ if (fchdir(mntfd)) {
+ close(mntfd);
+ _exit(17);
+ }
+
+ if (chroot(".")) {
+ close(mntfd);
+ _exit(18);
+ }
+
+ close(mntfd);
+
+ /* Verify "/" now resolves to tmpfs */
+ root_id = get_unique_mnt_id("/");
+ if (!root_id)
+ _exit(19);
+
+ sm = statmount_alloc(root_id, 0, STATMOUNT_FS_TYPE);
+ if (!sm)
+ _exit(20);
+
+ if (!(sm->mask & STATMOUNT_FS_TYPE))
+ _exit(21);
+
+ if (strcmp(sm->str + sm->fs_type, "tmpfs") != 0)
+ _exit(22);
+
+ free(sm);
+
+ /* Verify tmpfs is writable */
+ fd = open("/testfile", O_CREAT | O_RDWR, 0644);
+ if (fd < 0)
+ _exit(23);
+
+ if (write(fd, "test", 4) != 4) {
+ close(fd);
+ _exit(24);
+ }
+
+ close(fd);
+
+ if (access("/testfile", F_OK))
+ _exit(25);
+
+ _exit(0);
+ }
+
+ ASSERT_EQ(wait_for_pid(pid), 0);
+}
+
+/*
+ * Tests below do not require UNSHARE_EMPTY_MNTNS support.
+ */
+
+/* unshare() called with the flag value 0x80000000 must fail with EINVAL */
+TEST(invalid_flags)
+{
+	int rc;
+	pid_t child = fork();
+
+	ASSERT_GE(child, 0);
+	if (child > 0) {
+		ASSERT_EQ(wait_for_pid(child), 0);
+		return;
+	}
+
+	/* Child: each failing step exits with its own non-zero status. */
+	if (enter_userns() != 0)
+		_exit(1);
+
+	rc = unshare(0x80000000);
+	if (!rc)
+		_exit(2);
+
+	_exit(errno == EINVAL ? 0 : 3);
+}
+
+/*
+ * Regular CLONE_NEWNS still copies the full mount tree.
+ *
+ * The child builds a few extra mounts, unshares with plain CLONE_NEWNS and
+ * checks that the mount count did not shrink.
+ *
+ * The scratch directory is created and removed by the parent so that no
+ * stray directory is left behind in /tmp after the run. The child exits
+ * with a distinct non-zero status for each failing step.
+ */
+TEST(clone_newns_full_copy)
+{
+	char tmpdir[] = "/tmp/empty_mntns_regr.XXXXXX";
+	pid_t pid;
+	int ret;
+
+	ASSERT_TRUE(mkdtemp(tmpdir) != NULL);
+
+	pid = fork();
+	if (pid < 0)
+		rmdir(tmpdir);
+	ASSERT_GE(pid, 0);
+
+	if (pid == 0) {
+		ssize_t nr_mounts_before, nr_mounts_after;
+		int i;
+
+		if (enter_userns())
+			_exit(1);
+
+		if (unshare(CLONE_NEWNS))
+			_exit(2);
+
+		/* Keep the mounts created below out of the parent namespace. */
+		if (mount(NULL, "/", NULL, MS_REC | MS_PRIVATE, NULL))
+			_exit(3);
+
+		if (mount("tmpfs", tmpdir, "tmpfs", 0, "size=1M"))
+			_exit(4);
+
+		for (i = 0; i < 3; i++) {
+			char subdir[256];
+
+			snprintf(subdir, sizeof(subdir), "%s/sub%d", tmpdir, i);
+			if (mkdir(subdir, 0755) && errno != EEXIST)
+				_exit(5);
+			if (mount(subdir, subdir, NULL, MS_BIND, NULL))
+				_exit(6);
+		}
+
+		nr_mounts_before = count_mounts();
+		if (nr_mounts_before < 3)
+			_exit(7);
+
+		if (unshare(CLONE_NEWNS))
+			_exit(8);
+
+		nr_mounts_after = count_mounts();
+		if (nr_mounts_after < nr_mounts_before)
+			_exit(9);
+
+		_exit(0);
+	}
+
+	ret = wait_for_pid(pid);
+	/* Best effort: remove the scratch directory whatever the outcome. */
+	rmdir(tmpdir);
+	ASSERT_EQ(ret, 0);
+}
+
+/*
+ * A plain CLONE_NEWUTS unshare still works: the hostname set afterwards is
+ * the hostname read back.
+ */
+TEST(other_ns_unaffected)
+{
+	static const char wanted[] = "test-empty-mntns";
+	char current[256];
+	pid_t child = fork();
+
+	ASSERT_GE(child, 0);
+	if (child > 0) {
+		ASSERT_EQ(wait_for_pid(child), 0);
+		return;
+	}
+
+	/* Child: each failing step exits with its own non-zero status. */
+	if (enter_userns() != 0)
+		_exit(1);
+
+	if (unshare(CLONE_NEWUTS) != 0)
+		_exit(2);
+
+	/* Length excludes the terminating NUL. */
+	if (sethostname(wanted, sizeof(wanted) - 1) != 0)
+		_exit(3);
+
+	if (gethostname(current, sizeof(current)) != 0)
+		_exit(4);
+
+	if (strcmp(current, wanted))
+		_exit(5);
+
+	_exit(0);
+}
+
+TEST_HARNESS_MAIN
diff --git a/tools/testing/selftests/filesystems/empty_mntns/overmount_chroot_test.c b/tools/testing/selftests/filesystems/empty_mntns/overmount_chroot_test.c
new file mode 100644
index 000000000000..0b623d0c6bb9
--- /dev/null
+++ b/tools/testing/selftests/filesystems/empty_mntns/overmount_chroot_test.c
@@ -0,0 +1,225 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
+ * Test: rootfs overmounted multiple times with chroot into topmost
+ *
+ * This test creates a scenario where:
+ * 1. A new mount namespace is created with a tmpfs root (via pivot_root)
+ * 2. A mountpoint is created and overmounted multiple times
+ * 3. The caller chroots into the topmost mount layer
+ *
+ * The test verifies that:
+ * - Multiple overmounts create separate mount layers
+ * - Each layer's files are isolated
+ * - chroot correctly sets the process's root to the topmost layer
+ * - After chroot, only the topmost layer's files are visible
+ *
+ * Copyright (c) 2024 Christian Brauner <brauner@kernel.org>
+ */
+
+#define _GNU_SOURCE
+#include <fcntl.h>
+#include <linux/mount.h>
+#include <linux/stat.h>
+#include <sched.h>
+#include <stdio.h>
+#include <string.h>
+#include <sys/mount.h>
+#include <sys/stat.h>
+#include <sys/syscall.h>
+#include <sys/types.h>
+#include <sys/wait.h>
+#include <unistd.h>
+
+#include "../utils.h"
+#include "empty_mntns.h"
+#include "kselftest_harness.h"
+
+#define NR_OVERMOUNTS 5
+
+/*
+ * Setup a proper root filesystem using pivot_root.
+ * This ensures we own the root directory in our user namespace.
+ *
+ * A tmpfs is mounted on a fresh mkdtemp() directory under /tmp, made the new
+ * root with pivot_root, and the old root is detached and its mount point
+ * removed.
+ *
+ * Returns 0 on success, -1 as soon as any step fails (no rollback of the
+ * steps already done).
+ *
+ * NOTE(review): the mkdtemp() directory itself is never removed by this
+ * function - looks like it stays behind in /tmp after the test; confirm.
+ */
+static int setup_root(void)
+{
+ char tmpdir[] = "/tmp/overmount_test.XXXXXX";
+ char oldroot[256];
+
+ if (!mkdtemp(tmpdir))
+ return -1;
+
+ /* Mount tmpfs at the temporary directory */
+ if (mount("tmpfs", tmpdir, "tmpfs", 0, "size=10M"))
+ return -1;
+
+ /* Create directory for old root */
+ snprintf(oldroot, sizeof(oldroot), "%s/oldroot", tmpdir);
+ if (mkdir(oldroot, 0755))
+ return -1;
+
+ /* pivot_root to use the tmpfs as new root */
+ if (syscall(SYS_pivot_root, tmpdir, oldroot))
+ return -1;
+
+ /* Move the working directory into the new root */
+ if (chdir("/"))
+ return -1;
+
+ /* Unmount old root */
+ if (umount2("/oldroot", MNT_DETACH))
+ return -1;
+
+ /* Remove oldroot directory */
+ if (rmdir("/oldroot"))
+ return -1;
+
+ return 0;
+}
+
+/*
+ * Test scenario:
+ * 1. Enter a user namespace to gain CAP_SYS_ADMIN
+ * 2. Create a new mount namespace
+ * 3. Make the mount tree private
+ * 4. Setup a tmpfs root via pivot_root, then mount a base tmpfs on /newroot
+ *    and create a marker file in it
+ * 5. Overmount /newroot multiple times, creating a marker file in each layer
+ * 6. Chroot into /newroot (the topmost overmount)
+ * 7. Verify we're in the topmost layer (only topmost marker visible)
+ */
+/*
+ * Stack NR_OVERMOUNTS tmpfs mounts on top of a base tmpfs at /newroot, each
+ * with its own marker file, chroot into /newroot and verify that the new
+ * root is the topmost layer.
+ *
+ * The child exits with a distinct non-zero status for each failing step;
+ * the parent only checks that the child reported 0.
+ */
+TEST(overmount_chroot)
+{
+	pid_t pid;
+
+	pid = fork();
+	ASSERT_GE(pid, 0);
+
+	if (pid == 0) {
+		ssize_t nr_mounts;
+		/* Slot 0 is the base mount, slots 1..NR_OVERMOUNTS the layers. */
+		uint64_t mnt_ids[NR_OVERMOUNTS + 1];
+		uint64_t root_id_before, root_id_after;
+		struct statmount *sm;
+		char marker[64];
+		int fd, i;
+
+		/* Step 1: Enter user namespace for privileges */
+		if (enter_userns())
+			_exit(1);
+
+		/* Step 2: Create a new mount namespace */
+		if (unshare(CLONE_NEWNS))
+			_exit(2);
+
+		/* Step 3: Make the mount tree private */
+		if (mount(NULL, "/", NULL, MS_REC | MS_PRIVATE, NULL))
+			_exit(3);
+
+		/* Step 4: Setup a proper tmpfs root via pivot_root */
+		if (setup_root())
+			_exit(4);
+
+		/* Create the base mount point for overmounting */
+		if (mkdir("/newroot", 0755))
+			_exit(5);
+
+		/* Mount base tmpfs on /newroot */
+		if (mount("tmpfs", "/newroot", "tmpfs", 0, "size=1M"))
+			_exit(6);
+
+		/* Record base mount ID */
+		mnt_ids[0] = get_unique_mnt_id("/newroot");
+		if (!mnt_ids[0])
+			_exit(7);
+
+		/* Create marker in base layer */
+		fd = open("/newroot/layer_0", O_CREAT | O_RDWR, 0644);
+		if (fd < 0)
+			_exit(8);
+		if (write(fd, "layer_0", 7) != 7) {
+			close(fd);
+			_exit(9);
+		}
+		close(fd);
+
+		/* Step 5: Overmount /newroot multiple times with tmpfs */
+		for (i = 0; i < NR_OVERMOUNTS; i++) {
+			if (mount("tmpfs", "/newroot", "tmpfs", 0, "size=1M"))
+				_exit(10);
+
+			/* Record mount ID for this layer */
+			mnt_ids[i + 1] = get_unique_mnt_id("/newroot");
+			if (!mnt_ids[i + 1])
+				_exit(11);
+
+			/* Create a marker file in each layer */
+			snprintf(marker, sizeof(marker), "/newroot/layer_%d", i + 1);
+			fd = open(marker, O_CREAT | O_RDWR, 0644);
+			if (fd < 0)
+				_exit(12);
+
+			if (write(fd, marker, strlen(marker)) != (ssize_t)strlen(marker)) {
+				close(fd);
+				_exit(13);
+			}
+			close(fd);
+		}
+
+		/* Verify mount count increased */
+		nr_mounts = count_mounts();
+		if (nr_mounts < NR_OVERMOUNTS + 2)
+			_exit(14);
+
+		/* Record root mount ID before chroot */
+		root_id_before = get_unique_mnt_id("/newroot");
+
+		/* Verify this is the topmost layer's mount */
+		if (root_id_before != mnt_ids[NR_OVERMOUNTS])
+			_exit(15);
+
+		/* Step 6: Chroot into /newroot (the topmost overmount) */
+		if (chroot("/newroot"))
+			_exit(16);
+
+		/* Change to root directory within the chroot */
+		if (chdir("/"))
+			_exit(17);
+
+		/* Step 7: Verify we're in the topmost layer */
+		root_id_after = get_unique_mnt_id("/");
+
+		/* The mount ID should be the same as the topmost layer */
+		if (root_id_after != mnt_ids[NR_OVERMOUNTS])
+			_exit(18);
+
+		/* Verify the topmost layer's marker file exists */
+		snprintf(marker, sizeof(marker), "/layer_%d", NR_OVERMOUNTS);
+		if (access(marker, F_OK))
+			_exit(19);
+
+		/* Verify we cannot see markers from lower layers (they're hidden) */
+		for (i = 0; i < NR_OVERMOUNTS; i++) {
+			snprintf(marker, sizeof(marker), "/layer_%d", i);
+			if (access(marker, F_OK) == 0)
+				_exit(20);
+		}
+
+		/* Verify the root mount is tmpfs */
+		sm = statmount_alloc(root_id_after, 0,
+				     STATMOUNT_MNT_BASIC | STATMOUNT_MNT_ROOT |
+				     STATMOUNT_MNT_POINT | STATMOUNT_FS_TYPE);
+		if (!sm)
+			_exit(21);
+
+		/*
+		 * A reply without the filesystem type is a failure, not a
+		 * reason to skip the check: otherwise the "root is tmpfs"
+		 * verification could pass without having looked at anything.
+		 */
+		if (!(sm->mask & STATMOUNT_FS_TYPE)) {
+			free(sm);
+			_exit(22);
+		}
+
+		if (strcmp(sm->str + sm->fs_type, "tmpfs") != 0) {
+			free(sm);
+			_exit(23);
+		}
+
+		free(sm);
+		_exit(0);
+	}
+
+	ASSERT_EQ(wait_for_pid(pid), 0);
+}
+
+TEST_HARNESS_MAIN
diff --git a/tools/testing/selftests/filesystems/utils.c b/tools/testing/selftests/filesystems/utils.c
index d6f26f849053..d73d7d8171db 100644
--- a/tools/testing/selftests/filesystems/utils.c
+++ b/tools/testing/selftests/filesystems/utils.c
@@ -158,7 +158,7 @@ static int get_userns_fd_cb(void *data)
_exit(0);
}
-static int wait_for_pid(pid_t pid)
+int wait_for_pid(pid_t pid)
{
int status, ret;
@@ -450,7 +450,7 @@ out_close:
return fret;
}
-static int write_file(const char *path, const char *val)
+int write_file(const char *path, const char *val)
{
int fd = open(path, O_WRONLY);
size_t len = strlen(val);
diff --git a/tools/testing/selftests/filesystems/utils.h b/tools/testing/selftests/filesystems/utils.h
index 0bccfed666a9..d03085cef5cb 100644
--- a/tools/testing/selftests/filesystems/utils.h
+++ b/tools/testing/selftests/filesystems/utils.h
@@ -44,6 +44,8 @@ static inline bool switch_userns(int fd, uid_t uid, gid_t gid, bool drop_caps)
return true;
}
+extern int wait_for_pid(pid_t pid);
+extern int write_file(const char *path, const char *val);
extern uint64_t get_unique_mnt_id(const char *path);
#endif /* __IDMAP_UTILS_H */