summaryrefslogtreecommitdiff
path: root/fs
diff options
context:
space:
mode:
Diffstat (limited to 'fs')
-rw-r--r--fs/fhandle.c115
-rw-r--r--fs/libfs.c1
-rw-r--r--fs/namespace.c10
-rw-r--r--fs/pidfs.c298
4 files changed, 321 insertions, 103 deletions
diff --git a/fs/fhandle.c b/fs/fhandle.c
index ec9145047dfc..3e092ae6d142 100644
--- a/fs/fhandle.c
+++ b/fs/fhandle.c
@@ -187,17 +187,6 @@ static int get_path_from_fd(int fd, struct path *root)
return 0;
}
-enum handle_to_path_flags {
- HANDLE_CHECK_PERMS = (1 << 0),
- HANDLE_CHECK_SUBTREE = (1 << 1),
-};
-
-struct handle_to_path_ctx {
- struct path root;
- enum handle_to_path_flags flags;
- unsigned int fh_flags;
-};
-
static int vfs_dentry_acceptable(void *context, struct dentry *dentry)
{
struct handle_to_path_ctx *ctx = context;
@@ -261,50 +250,55 @@ static int do_handle_to_path(struct file_handle *handle, struct path *path,
{
int handle_dwords;
struct vfsmount *mnt = ctx->root.mnt;
+ struct dentry *dentry;
/* change the handle size to multiple of sizeof(u32) */
handle_dwords = handle->handle_bytes >> 2;
- path->dentry = exportfs_decode_fh_raw(mnt,
- (struct fid *)handle->f_handle,
- handle_dwords, handle->handle_type,
- ctx->fh_flags,
- vfs_dentry_acceptable, ctx);
- if (IS_ERR_OR_NULL(path->dentry)) {
- if (path->dentry == ERR_PTR(-ENOMEM))
+ dentry = exportfs_decode_fh_raw(mnt, (struct fid *)handle->f_handle,
+ handle_dwords, handle->handle_type,
+ ctx->fh_flags, vfs_dentry_acceptable,
+ ctx);
+ if (IS_ERR_OR_NULL(dentry)) {
+ if (dentry == ERR_PTR(-ENOMEM))
return -ENOMEM;
return -ESTALE;
}
+ path->dentry = dentry;
path->mnt = mntget(mnt);
return 0;
}
-/*
- * Allow relaxed permissions of file handles if the caller has the
- * ability to mount the filesystem or create a bind-mount of the
- * provided @mountdirfd.
- *
- * In both cases the caller may be able to get an unobstructed way to
- * the encoded file handle. If the caller is only able to create a
- * bind-mount we need to verify that there are no locked mounts on top
- * of it that could prevent us from getting to the encoded file.
- *
- * In principle, locked mounts can prevent the caller from mounting the
- * filesystem but that only applies to procfs and sysfs neither of which
- * support decoding file handles.
- */
-static inline bool may_decode_fh(struct handle_to_path_ctx *ctx,
- unsigned int o_flags)
+static inline int may_decode_fh(struct handle_to_path_ctx *ctx,
+ unsigned int o_flags)
{
struct path *root = &ctx->root;
+ if (capable(CAP_DAC_READ_SEARCH))
+ return 0;
+
/*
- * Restrict to O_DIRECTORY to provide a deterministic API that avoids a
- * confusing api in the face of disconnected non-dir dentries.
+ * Allow relaxed permissions of file handles if the caller has
+ * the ability to mount the filesystem or create a bind-mount of
+ * the provided @mountdirfd.
+ *
+ * In both cases the caller may be able to get an unobstructed
+ * way to the encoded file handle. If the caller is only able to
+ * create a bind-mount we need to verify that there are no
+ * locked mounts on top of it that could prevent us from getting
+ * to the encoded file.
+ *
+ * In principle, locked mounts can prevent the caller from
+ * mounting the filesystem but that only applies to procfs and
+ * sysfs neither of which support decoding file handles.
+ *
+ * Restrict to O_DIRECTORY to provide a deterministic API that
+ * avoids a confusing api in the face of disconnected non-dir
+ * dentries.
*
* There's only one dentry for each directory inode (VFS rule)...
*/
if (!(o_flags & O_DIRECTORY))
- return false;
+ return -EPERM;
if (ns_capable(root->mnt->mnt_sb->s_user_ns, CAP_SYS_ADMIN))
ctx->flags = HANDLE_CHECK_PERMS;
@@ -314,14 +308,14 @@ static inline bool may_decode_fh(struct handle_to_path_ctx *ctx,
!has_locked_children(real_mount(root->mnt), root->dentry))
ctx->flags = HANDLE_CHECK_PERMS | HANDLE_CHECK_SUBTREE;
else
- return false;
+ return -EPERM;
/* Are we able to override DAC permissions? */
if (!ns_capable(current_user_ns(), CAP_DAC_READ_SEARCH))
- return false;
+ return -EPERM;
ctx->fh_flags = EXPORT_FH_DIR_ONLY;
- return true;
+ return 0;
}
static int handle_to_path(int mountdirfd, struct file_handle __user *ufh,
@@ -331,15 +325,19 @@ static int handle_to_path(int mountdirfd, struct file_handle __user *ufh,
struct file_handle f_handle;
struct file_handle *handle = NULL;
struct handle_to_path_ctx ctx = {};
+ const struct export_operations *eops;
retval = get_path_from_fd(mountdirfd, &ctx.root);
if (retval)
goto out_err;
- if (!capable(CAP_DAC_READ_SEARCH) && !may_decode_fh(&ctx, o_flags)) {
- retval = -EPERM;
+ eops = ctx.root.mnt->mnt_sb->s_export_op;
+ if (eops && eops->permission)
+ retval = eops->permission(&ctx, o_flags);
+ else
+ retval = may_decode_fh(&ctx, o_flags);
+ if (retval)
goto out_path;
- }
if (copy_from_user(&f_handle, ufh, sizeof(struct file_handle))) {
retval = -EFAULT;
@@ -398,29 +396,28 @@ static long do_handle_open(int mountdirfd, struct file_handle __user *ufh,
int open_flag)
{
long retval = 0;
- struct path path;
+ struct path path __free(path_put) = {};
struct file *file;
- int fd;
+ const struct export_operations *eops;
retval = handle_to_path(mountdirfd, ufh, &path, open_flag);
if (retval)
return retval;
- fd = get_unused_fd_flags(open_flag);
- if (fd < 0) {
- path_put(&path);
+ CLASS(get_unused_fd, fd)(O_CLOEXEC);
+ if (fd < 0)
return fd;
- }
- file = file_open_root(&path, "", open_flag, 0);
- if (IS_ERR(file)) {
- put_unused_fd(fd);
- retval = PTR_ERR(file);
- } else {
- retval = fd;
- fd_install(fd, file);
- }
- path_put(&path);
- return retval;
+
+ eops = path.mnt->mnt_sb->s_export_op;
+ if (eops->open)
+ file = eops->open(&path, open_flag);
+ else
+ file = file_open_root(&path, "", open_flag, 0);
+ if (IS_ERR(file))
+ return PTR_ERR(file);
+
+ fd_install(fd, file);
+ return take_fd(fd);
}
/**
diff --git a/fs/libfs.c b/fs/libfs.c
index 748ac5923154..2890a9c4a414 100644
--- a/fs/libfs.c
+++ b/fs/libfs.c
@@ -673,6 +673,7 @@ static int pseudo_fs_fill_super(struct super_block *s, struct fs_context *fc)
s->s_blocksize_bits = PAGE_SHIFT;
s->s_magic = ctx->magic;
s->s_op = ctx->ops ?: &simple_super_operations;
+ s->s_export_op = ctx->eops;
s->s_xattr = ctx->xattr;
s->s_time_gran = 1;
root = new_inode(s);
diff --git a/fs/namespace.c b/fs/namespace.c
index 851af89e8d72..64deda6f5b2c 100644
--- a/fs/namespace.c
+++ b/fs/namespace.c
@@ -32,6 +32,7 @@
#include <linux/fs_context.h>
#include <linux/shmem_fs.h>
#include <linux/mnt_idmapping.h>
+#include <linux/pidfs.h>
#include <linux/nospec.h>
#include "pnode.h"
@@ -2736,8 +2737,13 @@ static struct mount *__do_loopback(struct path *old_path, int recurse)
if (IS_MNT_UNBINDABLE(old))
return mnt;
- if (!check_mnt(old) && old_path->dentry->d_op != &ns_dentry_operations)
- return mnt;
+ if (!check_mnt(old)) {
+ const struct dentry_operations *d_op = old_path->dentry->d_op;
+
+ if (d_op != &ns_dentry_operations &&
+ d_op != &pidfs_dentry_operations)
+ return mnt;
+ }
if (!recurse && has_locked_children(old, old_path->dentry))
return mnt;
diff --git a/fs/pidfs.c b/fs/pidfs.c
index 618abb1fa1b8..049352f973de 100644
--- a/fs/pidfs.c
+++ b/fs/pidfs.c
@@ -1,5 +1,6 @@
// SPDX-License-Identifier: GPL-2.0
#include <linux/anon_inodes.h>
+#include <linux/exportfs.h>
#include <linux/file.h>
#include <linux/fs.h>
#include <linux/cgroup.h>
@@ -23,6 +24,97 @@
#include "internal.h"
#include "mount.h"
+static struct rb_root pidfs_ino_tree = RB_ROOT;
+
+#if BITS_PER_LONG == 32
+static inline unsigned long pidfs_ino(u64 ino)
+{
+ return lower_32_bits(ino);
+}
+
+/* On 32 bit the generation number are the upper 32 bits. */
+static inline u32 pidfs_gen(u64 ino)
+{
+ return upper_32_bits(ino);
+}
+
+#else
+
+/* On 64 bit simply return ino. */
+static inline unsigned long pidfs_ino(u64 ino)
+{
+ return ino;
+}
+
+/* On 64 bit the generation number is 0. */
+static inline u32 pidfs_gen(u64 ino)
+{
+ return 0;
+}
+#endif
+
+static int pidfs_ino_cmp(struct rb_node *a, const struct rb_node *b)
+{
+ struct pid *pid_a = rb_entry(a, struct pid, pidfs_node);
+ struct pid *pid_b = rb_entry(b, struct pid, pidfs_node);
+ u64 pid_ino_a = pid_a->ino;
+ u64 pid_ino_b = pid_b->ino;
+
+ if (pid_ino_a < pid_ino_b)
+ return -1;
+ if (pid_ino_a > pid_ino_b)
+ return 1;
+ return 0;
+}
+
+void pidfs_add_pid(struct pid *pid)
+{
+ static u64 pidfs_ino_nr = 2;
+
+ /*
+ * On 64 bit nothing special happens. The 64bit number assigned
+ * to struct pid is the inode number.
+ *
+ * On 32 bit the 64 bit number assigned to struct pid is split
+ * into two 32 bit numbers. The lower 32 bits are used as the
+ * inode number and the upper 32 bits are used as the inode
+ * generation number.
+ *
+ * On 32 bit pidfs_ino() will return the lower 32 bit. When
+ * pidfs_ino() returns zero a wrap around happened. When a
+ * wraparound happens the 64 bit number will be incremented by 2
+ * so inode numbering starts at 2 again.
+ *
+ * On 64 bit comparing two pidfds is as simple as comparing
+ * inode numbers.
+ *
+ * When a wraparound happens on 32 bit multiple pidfds with the
+ * same inode number are likely to exist (This isn't a problem
+ * since before pidfs pidfds used the anonymous inode meaning
+ * all pidfds had the same inode number.). Userspace can
+ * reconstruct the 64 bit identifier by retrieving both the
+ * inode number and the inode generation number to compare or
+ * use file handles.
+ */
+ if (pidfs_ino(pidfs_ino_nr) == 0)
+ pidfs_ino_nr += 2;
+
+ pid->ino = pidfs_ino_nr;
+ pid->stashed = NULL;
+ pidfs_ino_nr++;
+
+ write_seqcount_begin(&pidmap_lock_seq);
+ rb_find_add_rcu(&pid->pidfs_node, &pidfs_ino_tree, pidfs_ino_cmp);
+ write_seqcount_end(&pidmap_lock_seq);
+}
+
+void pidfs_remove_pid(struct pid *pid)
+{
+ write_seqcount_begin(&pidmap_lock_seq);
+ rb_erase(&pid->pidfs_node, &pidfs_ino_tree);
+ write_seqcount_end(&pidmap_lock_seq);
+}
+
#ifdef CONFIG_PROC_FS
/**
* pidfd_show_fdinfo - print information about a pidfd
@@ -190,6 +282,27 @@ static long pidfd_info(struct task_struct *task, unsigned int cmd, unsigned long
return 0;
}
+static bool pidfs_ioctl_valid(unsigned int cmd)
+{
+ switch (cmd) {
+ case FS_IOC_GETVERSION:
+ case PIDFD_GET_CGROUP_NAMESPACE:
+ case PIDFD_GET_INFO:
+ case PIDFD_GET_IPC_NAMESPACE:
+ case PIDFD_GET_MNT_NAMESPACE:
+ case PIDFD_GET_NET_NAMESPACE:
+ case PIDFD_GET_PID_FOR_CHILDREN_NAMESPACE:
+ case PIDFD_GET_TIME_NAMESPACE:
+ case PIDFD_GET_TIME_FOR_CHILDREN_NAMESPACE:
+ case PIDFD_GET_UTS_NAMESPACE:
+ case PIDFD_GET_USER_NAMESPACE:
+ case PIDFD_GET_PID_NAMESPACE:
+ return true;
+ }
+
+ return false;
+}
+
static long pidfd_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
{
struct task_struct *task __free(put_task) = NULL;
@@ -198,6 +311,17 @@ static long pidfd_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
struct ns_common *ns_common = NULL;
struct pid_namespace *pid_ns;
+ if (!pidfs_ioctl_valid(cmd))
+ return -ENOIOCTLCMD;
+
+ if (cmd == FS_IOC_GETVERSION) {
+ if (!arg)
+ return -EINVAL;
+
+ __u32 __user *argp = (__u32 __user *)arg;
+ return put_user(file_inode(file)->i_generation, argp);
+ }
+
task = get_pid_task(pid, PIDTYPE_PID);
if (!task)
return -ESRCH;
@@ -318,40 +442,6 @@ struct pid *pidfd_pid(const struct file *file)
static struct vfsmount *pidfs_mnt __ro_after_init;
-#if BITS_PER_LONG == 32
-/*
- * Provide a fallback mechanism for 32-bit systems so processes remain
- * reliably comparable by inode number even on those systems.
- */
-static DEFINE_IDA(pidfd_inum_ida);
-
-static int pidfs_inum(struct pid *pid, unsigned long *ino)
-{
- int ret;
-
- ret = ida_alloc_range(&pidfd_inum_ida, RESERVED_PIDS + 1,
- UINT_MAX, GFP_ATOMIC);
- if (ret < 0)
- return -ENOSPC;
-
- *ino = ret;
- return 0;
-}
-
-static inline void pidfs_free_inum(unsigned long ino)
-{
- if (ino > 0)
- ida_free(&pidfd_inum_ida, ino);
-}
-#else
-static inline int pidfs_inum(struct pid *pid, unsigned long *ino)
-{
- *ino = pid->ino;
- return 0;
-}
-#define pidfs_free_inum(ino) ((void)(ino))
-#endif
-
/*
* The vfs falls back to simple_setattr() if i_op->setattr() isn't
* implemented. Let's reject it completely until we have a clean
@@ -403,7 +493,6 @@ static void pidfs_evict_inode(struct inode *inode)
clear_inode(inode);
put_pid(pid);
- pidfs_free_inum(inode->i_ino);
}
static const struct super_operations pidfs_sops = {
@@ -421,25 +510,149 @@ static char *pidfs_dname(struct dentry *dentry, char *buffer, int buflen)
return dynamic_dname(buffer, buflen, "anon_inode:[pidfd]");
}
-static const struct dentry_operations pidfs_dentry_operations = {
+const struct dentry_operations pidfs_dentry_operations = {
.d_delete = always_delete_dentry,
.d_dname = pidfs_dname,
.d_prune = stashed_dentry_prune,
};
+static int pidfs_encode_fh(struct inode *inode, u32 *fh, int *max_len,
+ struct inode *parent)
+{
+ const struct pid *pid = inode->i_private;
+
+ if (*max_len < 2) {
+ *max_len = 2;
+ return FILEID_INVALID;
+ }
+
+ *max_len = 2;
+ *(u64 *)fh = pid->ino;
+ return FILEID_KERNFS;
+}
+
+static int pidfs_ino_find(const void *key, const struct rb_node *node)
+{
+ const u64 pid_ino = *(u64 *)key;
+ const struct pid *pid = rb_entry(node, struct pid, pidfs_node);
+
+ if (pid_ino < pid->ino)
+ return -1;
+ if (pid_ino > pid->ino)
+ return 1;
+ return 0;
+}
+
+/* Find a struct pid based on the inode number. */
+static struct pid *pidfs_ino_get_pid(u64 ino)
+{
+ struct pid *pid;
+ struct rb_node *node;
+ unsigned int seq;
+
+ guard(rcu)();
+ do {
+ seq = read_seqcount_begin(&pidmap_lock_seq);
+ node = rb_find_rcu(&ino, &pidfs_ino_tree, pidfs_ino_find);
+ if (node)
+ break;
+ } while (read_seqcount_retry(&pidmap_lock_seq, seq));
+
+ if (!node)
+ return NULL;
+
+ pid = rb_entry(node, struct pid, pidfs_node);
+
+ /* Within our pid namespace hierarchy? */
+ if (pid_vnr(pid) == 0)
+ return NULL;
+
+ return get_pid(pid);
+}
+
+static struct dentry *pidfs_fh_to_dentry(struct super_block *sb,
+ struct fid *fid, int fh_len,
+ int fh_type)
+{
+ int ret;
+ u64 pid_ino;
+ struct path path;
+ struct pid *pid;
+
+ if (fh_len < 2)
+ return NULL;
+
+ switch (fh_type) {
+ case FILEID_KERNFS:
+ pid_ino = *(u64 *)fid;
+ break;
+ default:
+ return NULL;
+ }
+
+ pid = pidfs_ino_get_pid(pid_ino);
+ if (!pid)
+ return NULL;
+
+ ret = path_from_stashed(&pid->stashed, pidfs_mnt, pid, &path);
+ if (ret < 0)
+ return ERR_PTR(ret);
+
+ mntput(path.mnt);
+ return path.dentry;
+}
+
+/*
+ * Make sure that we reject any nonsensical flags that users pass via
+ * open_by_handle_at(). Note that PIDFD_THREAD is defined as O_EXCL, and
+ * PIDFD_NONBLOCK as O_NONBLOCK.
+ */
+#define VALID_FILE_HANDLE_OPEN_FLAGS \
+ (O_RDONLY | O_WRONLY | O_RDWR | O_NONBLOCK | O_CLOEXEC | O_EXCL)
+
+static int pidfs_export_permission(struct handle_to_path_ctx *ctx,
+ unsigned int oflags)
+{
+ if (oflags & ~(VALID_FILE_HANDLE_OPEN_FLAGS | O_LARGEFILE))
+ return -EINVAL;
+
+ /*
+ * pidfd_ino_get_pid() will verify that the struct pid is part
+ * of the caller's pid namespace hierarchy. No further
+ * permission checks are needed.
+ */
+ return 0;
+}
+
+static struct file *pidfs_export_open(struct path *path, unsigned int oflags)
+{
+ /*
+ * Clear O_LARGEFILE as open_by_handle_at() forces it and raise
+ * O_RDWR as pidfds always are.
+ */
+ oflags &= ~O_LARGEFILE;
+ return dentry_open(path, oflags | O_RDWR, current_cred());
+}
+
+static const struct export_operations pidfs_export_operations = {
+ .encode_fh = pidfs_encode_fh,
+ .fh_to_dentry = pidfs_fh_to_dentry,
+ .open = pidfs_export_open,
+ .permission = pidfs_export_permission,
+};
+
static int pidfs_init_inode(struct inode *inode, void *data)
{
+ const struct pid *pid = data;
+
inode->i_private = data;
inode->i_flags |= S_PRIVATE;
inode->i_mode |= S_IRWXU;
inode->i_op = &pidfs_inode_operations;
inode->i_fop = &pidfs_file_operations;
- /*
- * Inode numbering for pidfs start at RESERVED_PIDS + 1. This
- * avoids collisions with the root inode which is 1 for pseudo
- * filesystems.
- */
- return pidfs_inum(data, &inode->i_ino);
+ inode->i_ino = pidfs_ino(pid->ino);
+ inode->i_generation = pidfs_gen(pid->ino);
+ return 0;
}
static void pidfs_put_data(void *data)
@@ -462,6 +675,7 @@ static int pidfs_init_fs_context(struct fs_context *fc)
return -ENOMEM;
ctx->ops = &pidfs_sops;
+ ctx->eops = &pidfs_export_operations;
ctx->dops = &pidfs_dentry_operations;
fc->s_fs_info = (void *)&pidfs_stashed_ops;
return 0;