diff options
| author | Christian Brauner <brauner@kernel.org> | 2026-02-14 16:22:13 +0100 |
|---|---|---|
| committer | Christian Brauner <brauner@kernel.org> | 2026-02-18 14:04:42 +0100 |
| commit | a41dbf5e004edbe1260883c43a8bd134d9cb0c1c (patch) | |
| tree | 9a97082290f47bb92ced5e8325ee737a232171c0 | |
| parent | ac83896172798cf82ebc643cf555aa4cdd3a07da (diff) | |
mount: hold namespace_sem across copy in create_new_namespace()
Fix an oversight when creating a new mount namespace. If someone had the
bright idea to make the real rootfs a shared or dependent mount and it
is later copied the copy will become a peer of the old real rootfs mount
or a dependent mount of it. The namespace semaphore is dropped and we
use mount lock exact to lock the new real root mount. If that fails or
the subsequent do_loopback() fails we rely on the copy of the real root
mount to be cleaned up by path_put(). The problem is that this doesn't
deal with mount propagation and will leave the mounts linked in the
propagation lists.
When creating a new mount namespace create_new_namespace() first
acquires namespace_sem to clone the nullfs root, drops it, then
reacquires it via LOCK_MOUNT_EXACT which takes inode_lock first to
respect the inode_lock -> namespace_sem lock ordering. This
drop-and-reacquire pattern is fragile and was the source of the
propagation cleanup bug fixed in the preceding commit.
Extend lock_mount_exact() with a copy_mount mode that clones the mount
under the locks atomically. When copy_mount is true, path_overmounted()
is skipped since we're copying the mount, not mounting on top of it -
the nullfs root always has rootfs mounted on top so the check would
always fail. If clone_mnt() fails after get_mountpoint() has pinned the
mountpoint, __unlock_mount() is used to properly unpin the mountpoint
and release both locks.
This allows create_new_namespace() to use LOCK_MOUNT_EXACT_COPY which
takes inode_lock and namespace_sem once and holds them throughout the
clone and subsequent mount operations, eliminating the
drop-and-reacquire pattern entirely.
Reported-by: syzbot+a89f9434fb5a001ccd58@syzkaller.appspotmail.com
Fixes: 9b8a0ba68246 ("mount: add OPEN_TREE_NAMESPACE") # mainline only
Link: https://lore.kernel.org/699047f6.050a0220.2757fb.0024.GAE@google.com
Signed-off-by: Christian Brauner <brauner@kernel.org>
| -rw-r--r-- | fs/namespace.c | 111 |
1 files changed, 57 insertions, 54 deletions
diff --git a/fs/namespace.c b/fs/namespace.c index 90700df65f0d..022e59afcb5e 100644 --- a/fs/namespace.c +++ b/fs/namespace.c @@ -2791,7 +2791,8 @@ static inline void unlock_mount(struct pinned_mountpoint *m) } static void lock_mount_exact(const struct path *path, - struct pinned_mountpoint *mp); + struct pinned_mountpoint *mp, bool copy_mount, + unsigned int copy_flags); #define LOCK_MOUNT_MAYBE_BENEATH(mp, path, beneath) \ struct pinned_mountpoint mp __cleanup(unlock_mount) = {}; \ @@ -2799,7 +2800,10 @@ static void lock_mount_exact(const struct path *path, #define LOCK_MOUNT(mp, path) LOCK_MOUNT_MAYBE_BENEATH(mp, (path), false) #define LOCK_MOUNT_EXACT(mp, path) \ struct pinned_mountpoint mp __cleanup(unlock_mount) = {}; \ - lock_mount_exact((path), &mp) + lock_mount_exact((path), &mp, false, 0) +#define LOCK_MOUNT_EXACT_COPY(mp, path, copy_flags) \ + struct pinned_mountpoint mp __cleanup(unlock_mount) = {}; \ + lock_mount_exact((path), &mp, true, (copy_flags)) static int graft_tree(struct mount *mnt, const struct pinned_mountpoint *mp) { @@ -3073,16 +3077,13 @@ static struct file *open_detached_copy(struct path *path, unsigned int flags) return file; } -DEFINE_FREE(put_empty_mnt_ns, struct mnt_namespace *, - if (!IS_ERR_OR_NULL(_T)) free_mnt_ns(_T)) - static struct mnt_namespace *create_new_namespace(struct path *path, unsigned int flags) { - struct mnt_namespace *new_ns __free(put_empty_mnt_ns) = NULL; - struct path to_path __free(path_put) = {}; struct mnt_namespace *ns = current->nsproxy->mnt_ns; struct user_namespace *user_ns = current_user_ns(); - struct mount *new_ns_root; + struct mnt_namespace *new_ns; + struct mount *new_ns_root, *old_ns_root; + struct path to_path; struct mount *mnt; unsigned int copy_flags = 0; bool locked = false; @@ -3094,71 +3095,63 @@ static struct mnt_namespace *create_new_namespace(struct path *path, unsigned in if (IS_ERR(new_ns)) return ERR_CAST(new_ns); - scoped_guard(namespace_excl) { - new_ns_root = clone_mnt(ns->root, ns->root->mnt.mnt_root, copy_flags); - if (IS_ERR(new_ns_root)) - return ERR_CAST(new_ns_root); + old_ns_root = ns->root; + to_path.mnt = &old_ns_root->mnt; + to_path.dentry = old_ns_root->mnt.mnt_root; - /* - * If the real rootfs had a locked mount on top of it somewhere - * in the stack, lock the new mount tree as well so it can't be - * exposed. - */ - mnt = ns->root; - while (mnt->overmount) { - mnt = mnt->overmount; - if (mnt->mnt.mnt_flags & MNT_LOCKED) - locked = true; - } + VFS_WARN_ON_ONCE(old_ns_root->mnt.mnt_sb->s_type != &nullfs_fs_type); + + LOCK_MOUNT_EXACT_COPY(mp, &to_path, copy_flags); + if (IS_ERR(mp.parent)) { + free_mnt_ns(new_ns); + return ERR_CAST(mp.parent); } + new_ns_root = mp.parent; /* - * We dropped the namespace semaphore so we can actually lock - * the copy for mounting. The copied mount isn't attached to any - * mount namespace and it is thus excluded from any propagation. - * So realistically we're isolated and the mount can't be - * overmounted. + * If the real rootfs had a locked mount on top of it somewhere + * in the stack, lock the new mount tree as well so it can't be + * exposed. */ - - /* Borrow the reference from clone_mnt(). */ - to_path.mnt = &new_ns_root->mnt; - to_path.dentry = dget(new_ns_root->mnt.mnt_root); - - /* Now lock for actual mounting. */ - LOCK_MOUNT_EXACT(mp, &to_path); - if (unlikely(IS_ERR(mp.parent))) - return ERR_CAST(mp.parent); + mnt = old_ns_root; + while (mnt->overmount) { + mnt = mnt->overmount; + if (mnt->mnt.mnt_flags & MNT_LOCKED) + locked = true; + } /* - * We don't emulate unshare()ing a mount namespace. We stick to the - * restrictions of creating detached bind-mounts. It has a lot - * saner and simpler semantics. + * We don't emulate unshare()ing a mount namespace. We stick + * to the restrictions of creating detached bind-mounts. It + * has a lot saner and simpler semantics. */ mnt = __do_loopback(path, flags, copy_flags); - if (IS_ERR(mnt)) - return ERR_CAST(mnt); - scoped_guard(mount_writer) { + if (IS_ERR(mnt)) { + emptied_ns = new_ns; + umount_tree(new_ns_root, 0); + return ERR_CAST(mnt); + } + if (locked) mnt->mnt.mnt_flags |= MNT_LOCKED; /* - * Now mount the detached tree on top of the copy of the - * real rootfs we created. + * now mount the detached tree on top of the copy + * of the real rootfs we created. */ attach_mnt(mnt, new_ns_root, mp.mp); if (user_ns != ns->user_ns) lock_mnt_tree(new_ns_root); } - /* Add all mounts to the new namespace. */ - for (struct mount *p = new_ns_root; p; p = next_mnt(p, new_ns_root)) { - mnt_add_to_ns(new_ns, p); + for (mnt = new_ns_root; mnt; mnt = next_mnt(mnt, new_ns_root)) { + mnt_add_to_ns(new_ns, mnt); new_ns->nr_mounts++; } - new_ns->root = real_mount(no_free_ptr(to_path.mnt)); + new_ns->root = new_ns_root; ns_tree_add_raw(new_ns); - return no_free_ptr(new_ns); + return new_ns; } static struct file *open_new_namespace(struct path *path, unsigned int flags) @@ -3840,16 +3833,20 @@ static int do_new_mount(const struct path *path, const char *fstype, } static void lock_mount_exact(const struct path *path, - struct pinned_mountpoint *mp) + struct pinned_mountpoint *mp, bool copy_mount, + unsigned int copy_flags) { struct dentry *dentry = path->dentry; int err; + /* Assert that inode_lock() locked the correct inode. */ + VFS_WARN_ON_ONCE(copy_mount && !path_mounted(path)); + inode_lock(dentry->d_inode); namespace_lock(); if (unlikely(cant_mount(dentry))) err = -ENOENT; - else if (path_overmounted(path)) + else if (!copy_mount && path_overmounted(path)) err = -EBUSY; else err = get_mountpoint(dentry, mp); @@ -3857,9 +3854,15 @@ static void lock_mount_exact(const struct path *path, namespace_unlock(); inode_unlock(dentry->d_inode); mp->parent = ERR_PTR(err); - } else { - mp->parent = real_mount(path->mnt); + return; } + + if (copy_mount) + mp->parent = clone_mnt(real_mount(path->mnt), dentry, copy_flags); + else + mp->parent = real_mount(path->mnt); + if (unlikely(IS_ERR(mp->parent))) + __unlock_mount(mp); } int finish_automount(struct vfsmount *__m, const struct path *path) |
