From 8649c322f75c96e7ced2fec201e123b2b073bf09 Mon Sep 17 00:00:00 2001 From: Sargun Dhillon Date: Tue, 7 Jan 2020 09:59:25 -0800 Subject: pid: Implement pidfd_getfd syscall This syscall allows for the retrieval of file descriptors from other processes, based on their pidfd. This is possible using ptrace, and injection of parasitic code to inject code which leverages SCM_RIGHTS to move file descriptors between a tracee and a tracer. Unfortunately, ptrace comes with a high cost of requiring the process to be stopped, and breaks debuggers. This does not require stopping the process under manipulation. One reason to use this is to allow sandboxers to take actions on file descriptors on the behalf of another process. For example, this can be combined with seccomp-bpf's user notification to do on-demand fd extraction and take privileged actions. One such privileged action is binding a socket to a privileged port. /* prototype */ /* flags is currently reserved and should be set to 0 */ int sys_pidfd_getfd(int pidfd, int fd, unsigned int flags); /* testing */ Ran self-test suite on x86_64 Signed-off-by: Sargun Dhillon Acked-by: Christian Brauner Reviewed-by: Arnd Bergmann Link: https://lore.kernel.org/r/20200107175927.4558-3-sargun@sargun.me Signed-off-by: Christian Brauner --- kernel/pid.c | 90 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 90 insertions(+) (limited to 'kernel/pid.c') diff --git a/kernel/pid.c b/kernel/pid.c index 2278e249141d..0f4ecb57214c 100644 --- a/kernel/pid.c +++ b/kernel/pid.c @@ -578,3 +578,93 @@ void __init pid_idr_init(void) init_pid_ns.pid_cachep = KMEM_CACHE(pid, SLAB_HWCACHE_ALIGN | SLAB_PANIC | SLAB_ACCOUNT); } + +static struct file *__pidfd_fget(struct task_struct *task, int fd) +{ + struct file *file; + int ret; + + ret = mutex_lock_killable(&task->signal->cred_guard_mutex); + if (ret) + return ERR_PTR(ret); + + if (ptrace_may_access(task, PTRACE_MODE_ATTACH_REALCREDS)) + file = fget_task(task, fd); + else + file = ERR_PTR(-EPERM); + + mutex_unlock(&task->signal->cred_guard_mutex); + + return file ?: ERR_PTR(-EBADF); +} + +static int pidfd_getfd(struct pid *pid, int fd) +{ + struct task_struct *task; + struct file *file; + int ret; + + task = get_pid_task(pid, PIDTYPE_PID); + if (!task) + return -ESRCH; + + file = __pidfd_fget(task, fd); + put_task_struct(task); + if (IS_ERR(file)) + return PTR_ERR(file); + + ret = security_file_receive(file); + if (ret) { + fput(file); + return ret; + } + + ret = get_unused_fd_flags(O_CLOEXEC); + if (ret < 0) + fput(file); + else + fd_install(ret, file); + + return ret; +} + +/** + * sys_pidfd_getfd() - Get a file descriptor from another process + * + * @pidfd: the pidfd file descriptor of the process + * @fd: the file descriptor number to get + * @flags: flags on how to get the fd (reserved) + * + * This syscall gets a copy of a file descriptor from another process + * based on the pidfd, and file descriptor number. It requires that + * the calling process has the ability to ptrace the process represented + * by the pidfd. The process which is having its file descriptor copied + * is otherwise unaffected. + * + * Return: On success, a cloexec file descriptor is returned. + * On error, a negative errno number will be returned. + */ +SYSCALL_DEFINE3(pidfd_getfd, int, pidfd, int, fd, + unsigned int, flags) +{ + struct pid *pid; + struct fd f; + int ret; + + /* flags is currently unused - make sure it's unset */ + if (flags) + return -EINVAL; + + f = fdget(pidfd); + if (!f.file) + return -EBADF; + + pid = pidfd_pid(f.file); + if (IS_ERR(pid)) + ret = PTR_ERR(pid); + else + ret = pidfd_getfd(pid, fd); + + fdput(f); + return ret; +} -- cgit v1.2.3 From b26ebfe12f34f372cf041c6f801fa49c3fb382c5 Mon Sep 17 00:00:00 2001 From: Corey Minyard Date: Fri, 6 Mar 2020 11:23:14 -0600 Subject: pid: Fix error return value in some cases Recent changes to alloc_pid() allow the pid number to be specified on the command line. If set_tid_size is set, then the code scanning the levels will hard-set retval to -EPERM, overriding it's previous -ENOMEM value. After the code scanning the levels, there are error returns that do not set retval, assuming it is still set to -ENOMEM. So set retval back to -ENOMEM after scanning the levels. Fixes: 49cb2fc42ce4 ("fork: extend clone3() to support setting a PID") Signed-off-by: Corey Minyard Acked-by: Christian Brauner Cc: Andrei Vagin Cc: Dmitry Safonov <0x7f454c46@gmail.com> Cc: Oleg Nesterov Cc: Adrian Reber Cc: # 5.5 Link: https://lore.kernel.org/r/20200306172314.12232-1-minyard@acm.org [christian.brauner@ubuntu.com: fixup commit message] Signed-off-by: Christian Brauner --- kernel/pid.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'kernel/pid.c') diff --git a/kernel/pid.c b/kernel/pid.c index 0f4ecb57214c..19645b25b77c 100644 --- a/kernel/pid.c +++ b/kernel/pid.c @@ -247,6 +247,8 @@ struct pid *alloc_pid(struct pid_namespace *ns, pid_t *set_tid, tmp = tmp->parent; } + retval = -ENOMEM; + if (unlikely(is_child_reaper(pid))) { if (pid_ns_prepare_proc(ns)) goto out_free; -- cgit v1.2.3 From 10dab84caf400f2f5f8b010ebb0c7c4272ec5093 Mon Sep 17 00:00:00 2001 From: Christian Brauner Date: Sun, 8 Mar 2020 14:29:17 +0100 Subject: pid: make ENOMEM return value more obvious The alloc_pid() codepath used to be simpler. With the introducation of the ability to choose specific pids in 49cb2fc42ce4 ("fork: extend clone3() to support setting a PID") it got more complex. It hasn't been super obvious that ENOMEM is returned when the pid namespace init process/child subreaper of the pid namespace has died. As can be seen from multiple attempts to improve this see e.g. [1] and most recently [2]. We regressed returning ENOMEM in [3] and [2] restored it. Let's add a comment on top explaining that this is historic and documented behavior and cannot easily be changed. [1]: 35f71bc0a09a ("fork: report pid reservation failure properly") [2]: b26ebfe12f34 ("pid: Fix error return value in some cases") [3]: 49cb2fc42ce4 ("fork: extend clone3() to support setting a PID") Signed-off-by: Christian Brauner --- kernel/pid.c | 8 ++++++++ 1 file changed, 8 insertions(+) (limited to 'kernel/pid.c') diff --git a/kernel/pid.c b/kernel/pid.c index 19645b25b77c..647b4bb457b5 100644 --- a/kernel/pid.c +++ b/kernel/pid.c @@ -247,6 +247,14 @@ struct pid *alloc_pid(struct pid_namespace *ns, pid_t *set_tid, tmp = tmp->parent; } + /* + * ENOMEM is not the most obvious choice especially for the case + * where the child subreaper has already exited and the pid + * namespace denies the creation of any new processes. But ENOMEM + * is what we have exposed to userspace for a long time and it is + * documented behavior for pid namespaces. So we can't easily + * change it even if there were an error code better suited. + */ retval = -ENOMEM; if (unlikely(is_child_reaper(pid))) { -- cgit v1.2.3