diff options
Diffstat (limited to 'fs')
-rw-r--r-- | fs/fhandle.c | 115 | ||||
-rw-r--r-- | fs/libfs.c | 1 | ||||
-rw-r--r-- | fs/namespace.c | 10 | ||||
-rw-r--r-- | fs/pidfs.c | 298 |
4 files changed, 321 insertions, 103 deletions
diff --git a/fs/fhandle.c b/fs/fhandle.c index ec9145047dfc..3e092ae6d142 100644 --- a/fs/fhandle.c +++ b/fs/fhandle.c @@ -187,17 +187,6 @@ static int get_path_from_fd(int fd, struct path *root) return 0; } -enum handle_to_path_flags { - HANDLE_CHECK_PERMS = (1 << 0), - HANDLE_CHECK_SUBTREE = (1 << 1), -}; - -struct handle_to_path_ctx { - struct path root; - enum handle_to_path_flags flags; - unsigned int fh_flags; -}; - static int vfs_dentry_acceptable(void *context, struct dentry *dentry) { struct handle_to_path_ctx *ctx = context; @@ -261,50 +250,55 @@ static int do_handle_to_path(struct file_handle *handle, struct path *path, { int handle_dwords; struct vfsmount *mnt = ctx->root.mnt; + struct dentry *dentry; /* change the handle size to multiple of sizeof(u32) */ handle_dwords = handle->handle_bytes >> 2; - path->dentry = exportfs_decode_fh_raw(mnt, - (struct fid *)handle->f_handle, - handle_dwords, handle->handle_type, - ctx->fh_flags, - vfs_dentry_acceptable, ctx); - if (IS_ERR_OR_NULL(path->dentry)) { - if (path->dentry == ERR_PTR(-ENOMEM)) + dentry = exportfs_decode_fh_raw(mnt, (struct fid *)handle->f_handle, + handle_dwords, handle->handle_type, + ctx->fh_flags, vfs_dentry_acceptable, + ctx); + if (IS_ERR_OR_NULL(dentry)) { + if (dentry == ERR_PTR(-ENOMEM)) return -ENOMEM; return -ESTALE; } + path->dentry = dentry; path->mnt = mntget(mnt); return 0; } -/* - * Allow relaxed permissions of file handles if the caller has the - * ability to mount the filesystem or create a bind-mount of the - * provided @mountdirfd. - * - * In both cases the caller may be able to get an unobstructed way to - * the encoded file handle. If the caller is only able to create a - * bind-mount we need to verify that there are no locked mounts on top - * of it that could prevent us from getting to the encoded file. - * - * In principle, locked mounts can prevent the caller from mounting the - * filesystem but that only applies to procfs and sysfs neither of which - * support decoding file handles. - */ -static inline bool may_decode_fh(struct handle_to_path_ctx *ctx, - unsigned int o_flags) +static inline int may_decode_fh(struct handle_to_path_ctx *ctx, + unsigned int o_flags) { struct path *root = &ctx->root; + if (capable(CAP_DAC_READ_SEARCH)) + return 0; + /* - * Restrict to O_DIRECTORY to provide a deterministic API that avoids a - * confusing api in the face of disconnected non-dir dentries. + * Allow relaxed permissions of file handles if the caller has + * the ability to mount the filesystem or create a bind-mount of + * the provided @mountdirfd. + * + * In both cases the caller may be able to get an unobstructed + * way to the encoded file handle. If the caller is only able to + * create a bind-mount we need to verify that there are no + * locked mounts on top of it that could prevent us from getting + * to the encoded file. + * + * In principle, locked mounts can prevent the caller from + * mounting the filesystem but that only applies to procfs and + * sysfs neither of which support decoding file handles. + * + * Restrict to O_DIRECTORY to provide a deterministic API that + * avoids a confusing api in the face of disconnected non-dir + * dentries. * * There's only one dentry for each directory inode (VFS rule)... */ if (!(o_flags & O_DIRECTORY)) - return false; + return -EPERM; if (ns_capable(root->mnt->mnt_sb->s_user_ns, CAP_SYS_ADMIN)) ctx->flags = HANDLE_CHECK_PERMS; @@ -314,14 +308,14 @@ static inline bool may_decode_fh(struct handle_to_path_ctx *ctx, !has_locked_children(real_mount(root->mnt), root->dentry)) ctx->flags = HANDLE_CHECK_PERMS | HANDLE_CHECK_SUBTREE; else - return false; + return -EPERM; /* Are we able to override DAC permissions? */ if (!ns_capable(current_user_ns(), CAP_DAC_READ_SEARCH)) - return false; + return -EPERM; ctx->fh_flags = EXPORT_FH_DIR_ONLY; - return true; + return 0; } static int handle_to_path(int mountdirfd, struct file_handle __user *ufh, @@ -331,15 +325,19 @@ static int handle_to_path(int mountdirfd, struct file_handle __user *ufh, struct file_handle f_handle; struct file_handle *handle = NULL; struct handle_to_path_ctx ctx = {}; + const struct export_operations *eops; retval = get_path_from_fd(mountdirfd, &ctx.root); if (retval) goto out_err; - if (!capable(CAP_DAC_READ_SEARCH) && !may_decode_fh(&ctx, o_flags)) { - retval = -EPERM; + eops = ctx.root.mnt->mnt_sb->s_export_op; + if (eops && eops->permission) + retval = eops->permission(&ctx, o_flags); + else + retval = may_decode_fh(&ctx, o_flags); + if (retval) goto out_path; - } if (copy_from_user(&f_handle, ufh, sizeof(struct file_handle))) { retval = -EFAULT; @@ -398,29 +396,28 @@ static long do_handle_open(int mountdirfd, struct file_handle __user *ufh, int open_flag) { long retval = 0; - struct path path; + struct path path __free(path_put) = {}; struct file *file; - int fd; + const struct export_operations *eops; retval = handle_to_path(mountdirfd, ufh, &path, open_flag); if (retval) return retval; - fd = get_unused_fd_flags(open_flag); - if (fd < 0) { - path_put(&path); + CLASS(get_unused_fd, fd)(O_CLOEXEC); + if (fd < 0) return fd; - } - file = file_open_root(&path, "", open_flag, 0); - if (IS_ERR(file)) { - put_unused_fd(fd); - retval = PTR_ERR(file); - } else { - retval = fd; - fd_install(fd, file); - } - path_put(&path); - return retval; + + eops = path.mnt->mnt_sb->s_export_op; + if (eops->open) + file = eops->open(&path, open_flag); + else + file = file_open_root(&path, "", open_flag, 0); + if (IS_ERR(file)) + return PTR_ERR(file); + + fd_install(fd, file); + return take_fd(fd); } /** diff --git a/fs/libfs.c b/fs/libfs.c index 748ac5923154..2890a9c4a414 100644 --- a/fs/libfs.c +++ b/fs/libfs.c @@ -673,6 +673,7 @@ static int pseudo_fs_fill_super(struct super_block *s, struct fs_context *fc) s->s_blocksize_bits = PAGE_SHIFT; s->s_magic = ctx->magic; s->s_op = ctx->ops ?: &simple_super_operations; + s->s_export_op = ctx->eops; s->s_xattr = ctx->xattr; s->s_time_gran = 1; root = new_inode(s); diff --git a/fs/namespace.c b/fs/namespace.c index 851af89e8d72..64deda6f5b2c 100644 --- a/fs/namespace.c +++ b/fs/namespace.c @@ -32,6 +32,7 @@ #include <linux/fs_context.h> #include <linux/shmem_fs.h> #include <linux/mnt_idmapping.h> +#include <linux/pidfs.h> #include <linux/nospec.h> #include "pnode.h" @@ -2736,8 +2737,13 @@ static struct mount *__do_loopback(struct path *old_path, int recurse) if (IS_MNT_UNBINDABLE(old)) return mnt; - if (!check_mnt(old) && old_path->dentry->d_op != &ns_dentry_operations) - return mnt; + if (!check_mnt(old)) { + const struct dentry_operations *d_op = old_path->dentry->d_op; + + if (d_op != &ns_dentry_operations && + d_op != &pidfs_dentry_operations) + return mnt; + } if (!recurse && has_locked_children(old, old_path->dentry)) return mnt; diff --git a/fs/pidfs.c b/fs/pidfs.c index 618abb1fa1b8..049352f973de 100644 --- a/fs/pidfs.c +++ b/fs/pidfs.c @@ -1,5 +1,6 @@ // SPDX-License-Identifier: GPL-2.0 #include <linux/anon_inodes.h> +#include <linux/exportfs.h> #include <linux/file.h> #include <linux/fs.h> #include <linux/cgroup.h> @@ -23,6 +24,97 @@ #include "internal.h" #include "mount.h" +static struct rb_root pidfs_ino_tree = RB_ROOT; + +#if BITS_PER_LONG == 32 +static inline unsigned long pidfs_ino(u64 ino) +{ + return lower_32_bits(ino); +} + +/* On 32 bit the generation number are the upper 32 bits. */ +static inline u32 pidfs_gen(u64 ino) +{ + return upper_32_bits(ino); +} + +#else + +/* On 64 bit simply return ino. */ +static inline unsigned long pidfs_ino(u64 ino) +{ + return ino; +} + +/* On 64 bit the generation number is 0. */ +static inline u32 pidfs_gen(u64 ino) +{ + return 0; +} +#endif + +static int pidfs_ino_cmp(struct rb_node *a, const struct rb_node *b) +{ + struct pid *pid_a = rb_entry(a, struct pid, pidfs_node); + struct pid *pid_b = rb_entry(b, struct pid, pidfs_node); + u64 pid_ino_a = pid_a->ino; + u64 pid_ino_b = pid_b->ino; + + if (pid_ino_a < pid_ino_b) + return -1; + if (pid_ino_a > pid_ino_b) + return 1; + return 0; +} + +void pidfs_add_pid(struct pid *pid) +{ + static u64 pidfs_ino_nr = 2; + + /* + * On 64 bit nothing special happens. The 64bit number assigned + * to struct pid is the inode number. + * + * On 32 bit the 64 bit number assigned to struct pid is split + * into two 32 bit numbers. The lower 32 bits are used as the + * inode number and the upper 32 bits are used as the inode + * generation number. + * + * On 32 bit pidfs_ino() will return the lower 32 bit. When + * pidfs_ino() returns zero a wrap around happened. When a + * wraparound happens the 64 bit number will be incremented by 2 + * so inode numbering starts at 2 again. + * + * On 64 bit comparing two pidfds is as simple as comparing + * inode numbers. + * + * When a wraparound happens on 32 bit multiple pidfds with the + * same inode number are likely to exist (This isn't a problem + * since before pidfs pidfds used the anonymous inode meaning + * all pidfds had the same inode number.). Userspace can + * reconstruct the 64 bit identifier by retrieving both the + * inode number and the inode generation number to compare or + * use file handles. + */ + if (pidfs_ino(pidfs_ino_nr) == 0) + pidfs_ino_nr += 2; + + pid->ino = pidfs_ino_nr; + pid->stashed = NULL; + pidfs_ino_nr++; + + write_seqcount_begin(&pidmap_lock_seq); + rb_find_add_rcu(&pid->pidfs_node, &pidfs_ino_tree, pidfs_ino_cmp); + write_seqcount_end(&pidmap_lock_seq); +} + +void pidfs_remove_pid(struct pid *pid) +{ + write_seqcount_begin(&pidmap_lock_seq); + rb_erase(&pid->pidfs_node, &pidfs_ino_tree); + write_seqcount_end(&pidmap_lock_seq); +} + #ifdef CONFIG_PROC_FS /** * pidfd_show_fdinfo - print information about a pidfd @@ -190,6 +282,27 @@ static long pidfd_info(struct task_struct *task, unsigned int cmd, unsigned long return 0; } +static bool pidfs_ioctl_valid(unsigned int cmd) +{ + switch (cmd) { + case FS_IOC_GETVERSION: + case PIDFD_GET_CGROUP_NAMESPACE: + case PIDFD_GET_INFO: + case PIDFD_GET_IPC_NAMESPACE: + case PIDFD_GET_MNT_NAMESPACE: + case PIDFD_GET_NET_NAMESPACE: + case PIDFD_GET_PID_FOR_CHILDREN_NAMESPACE: + case PIDFD_GET_TIME_NAMESPACE: + case PIDFD_GET_TIME_FOR_CHILDREN_NAMESPACE: + case PIDFD_GET_UTS_NAMESPACE: + case PIDFD_GET_USER_NAMESPACE: + case PIDFD_GET_PID_NAMESPACE: + return true; + } + + return false; +} + static long pidfd_ioctl(struct file *file, unsigned int cmd, unsigned long arg) { struct task_struct *task __free(put_task) = NULL; @@ -198,6 +311,17 @@ static long pidfd_ioctl(struct file *file, unsigned int cmd, unsigned long arg) struct ns_common *ns_common = NULL; struct pid_namespace *pid_ns; + if (!pidfs_ioctl_valid(cmd)) + return -ENOIOCTLCMD; + + if (cmd == FS_IOC_GETVERSION) { + if (!arg) + return -EINVAL; + + __u32 __user *argp = (__u32 __user *)arg; + return put_user(file_inode(file)->i_generation, argp); + } + task = get_pid_task(pid, PIDTYPE_PID); if (!task) return -ESRCH; @@ -318,40 +442,6 @@ struct pid *pidfd_pid(const struct file *file) static struct vfsmount *pidfs_mnt __ro_after_init; -#if BITS_PER_LONG == 32 -/* - * Provide a fallback mechanism for 32-bit systems so processes remain - * reliably comparable by inode number even on those systems. - */ -static DEFINE_IDA(pidfd_inum_ida); - -static int pidfs_inum(struct pid *pid, unsigned long *ino) -{ - int ret; - - ret = ida_alloc_range(&pidfd_inum_ida, RESERVED_PIDS + 1, - UINT_MAX, GFP_ATOMIC); - if (ret < 0) - return -ENOSPC; - - *ino = ret; - return 0; -} - -static inline void pidfs_free_inum(unsigned long ino) -{ - if (ino > 0) - ida_free(&pidfd_inum_ida, ino); -} -#else -static inline int pidfs_inum(struct pid *pid, unsigned long *ino) -{ - *ino = pid->ino; - return 0; -} -#define pidfs_free_inum(ino) ((void)(ino)) -#endif - /* * The vfs falls back to simple_setattr() if i_op->setattr() isn't * implemented. Let's reject it completely until we have a clean @@ -403,7 +493,6 @@ static void pidfs_evict_inode(struct inode *inode) clear_inode(inode); put_pid(pid); - pidfs_free_inum(inode->i_ino); } static const struct super_operations pidfs_sops = { @@ -421,25 +510,149 @@ static char *pidfs_dname(struct dentry *dentry, char *buffer, int buflen) return dynamic_dname(buffer, buflen, "anon_inode:[pidfd]"); } -static const struct dentry_operations pidfs_dentry_operations = { +const struct dentry_operations pidfs_dentry_operations = { .d_delete = always_delete_dentry, .d_dname = pidfs_dname, .d_prune = stashed_dentry_prune, }; +static int pidfs_encode_fh(struct inode *inode, u32 *fh, int *max_len, + struct inode *parent) +{ + const struct pid *pid = inode->i_private; + + if (*max_len < 2) { + *max_len = 2; + return FILEID_INVALID; + } + + *max_len = 2; + *(u64 *)fh = pid->ino; + return FILEID_KERNFS; +} + +static int pidfs_ino_find(const void *key, const struct rb_node *node) +{ + const u64 pid_ino = *(u64 *)key; + const struct pid *pid = rb_entry(node, struct pid, pidfs_node); + + if (pid_ino < pid->ino) + return -1; + if (pid_ino > pid->ino) + return 1; + return 0; +} + +/* Find a struct pid based on the inode number. */ +static struct pid *pidfs_ino_get_pid(u64 ino) +{ + struct pid *pid; + struct rb_node *node; + unsigned int seq; + + guard(rcu)(); + do { + seq = read_seqcount_begin(&pidmap_lock_seq); + node = rb_find_rcu(&ino, &pidfs_ino_tree, pidfs_ino_find); + if (node) + break; + } while (read_seqcount_retry(&pidmap_lock_seq, seq)); + + if (!node) + return NULL; + + pid = rb_entry(node, struct pid, pidfs_node); + + /* Within our pid namespace hierarchy? */ + if (pid_vnr(pid) == 0) + return NULL; + + return get_pid(pid); +} + +static struct dentry *pidfs_fh_to_dentry(struct super_block *sb, + struct fid *fid, int fh_len, + int fh_type) +{ + int ret; + u64 pid_ino; + struct path path; + struct pid *pid; + + if (fh_len < 2) + return NULL; + + switch (fh_type) { + case FILEID_KERNFS: + pid_ino = *(u64 *)fid; + break; + default: + return NULL; + } + + pid = pidfs_ino_get_pid(pid_ino); + if (!pid) + return NULL; + + ret = path_from_stashed(&pid->stashed, pidfs_mnt, pid, &path); + if (ret < 0) + return ERR_PTR(ret); + + mntput(path.mnt); + return path.dentry; +} + +/* + * Make sure that we reject any nonsensical flags that users pass via + * open_by_handle_at(). Note that PIDFD_THREAD is defined as O_EXCL, and + * PIDFD_NONBLOCK as O_NONBLOCK. + */ +#define VALID_FILE_HANDLE_OPEN_FLAGS \ + (O_RDONLY | O_WRONLY | O_RDWR | O_NONBLOCK | O_CLOEXEC | O_EXCL) + +static int pidfs_export_permission(struct handle_to_path_ctx *ctx, + unsigned int oflags) +{ + if (oflags & ~(VALID_FILE_HANDLE_OPEN_FLAGS | O_LARGEFILE)) + return -EINVAL; + + /* + * pidfd_ino_get_pid() will verify that the struct pid is part + * of the caller's pid namespace hierarchy. No further + * permission checks are needed. + */ + return 0; +} + +static struct file *pidfs_export_open(struct path *path, unsigned int oflags) +{ + /* + * Clear O_LARGEFILE as open_by_handle_at() forces it and raise + * O_RDWR as pidfds always are. + */ + oflags &= ~O_LARGEFILE; + return dentry_open(path, oflags | O_RDWR, current_cred()); +} + +static const struct export_operations pidfs_export_operations = { + .encode_fh = pidfs_encode_fh, + .fh_to_dentry = pidfs_fh_to_dentry, + .open = pidfs_export_open, + .permission = pidfs_export_permission, +}; + static int pidfs_init_inode(struct inode *inode, void *data) { + const struct pid *pid = data; + inode->i_private = data; inode->i_flags |= S_PRIVATE; inode->i_mode |= S_IRWXU; inode->i_op = &pidfs_inode_operations; inode->i_fop = &pidfs_file_operations; - /* - * Inode numbering for pidfs start at RESERVED_PIDS + 1. This - * avoids collisions with the root inode which is 1 for pseudo - * filesystems. - */ - return pidfs_inum(data, &inode->i_ino); + inode->i_ino = pidfs_ino(pid->ino); + inode->i_generation = pidfs_gen(pid->ino); + return 0; } static void pidfs_put_data(void *data) @@ -462,6 +675,7 @@ static int pidfs_init_fs_context(struct fs_context *fc) return -ENOMEM; ctx->ops = &pidfs_sops; + ctx->eops = &pidfs_export_operations; ctx->dops = &pidfs_dentry_operations; fc->s_fs_info = (void *)&pidfs_stashed_ops; return 0; |