From 58fa4a410fc31afe08d0d0c6b6d8860c22ec17c2 Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Wed, 16 Jan 2019 14:15:20 +0100 Subject: ipc: introduce ksys_ipc()/compat_ksys_ipc() for s390 The sys_ipc() and compat_ksys_ipc() functions are meant to only be used from the system call table, not called by another function. Introduce ksys_*() interfaces for this purpose, as we have done for many other system calls. Link: https://lore.kernel.org/lkml/20190116131527.2071570-3-arnd@arndb.de Signed-off-by: Arnd Bergmann Reviewed-by: Heiko Carstens [heiko.carstens@de.ibm.com: compile fix for !CONFIG_COMPAT] Signed-off-by: Heiko Carstens Signed-off-by: Martin Schwidefsky --- include/linux/syscalls.h | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'include/linux/syscalls.h') diff --git a/include/linux/syscalls.h b/include/linux/syscalls.h index 257cccba3062..fb63045a0fb6 100644 --- a/include/linux/syscalls.h +++ b/include/linux/syscalls.h @@ -1185,6 +1185,10 @@ unsigned long ksys_mmap_pgoff(unsigned long addr, unsigned long len, unsigned long prot, unsigned long flags, unsigned long fd, unsigned long pgoff); ssize_t ksys_readahead(int fd, loff_t offset, size_t count); +int ksys_ipc(unsigned int call, int first, unsigned long second, + unsigned long third, void __user * ptr, long fifth); +int compat_ksys_ipc(u32 call, int first, int second, + u32 third, u32 ptr, u32 fifth); /* * The following kernel syscall equivalents are just wrappers to fs-internal -- cgit v1.2.3 From 275f22148e8720e84b180d9e0cdf8abfd69bac5b Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Mon, 31 Dec 2018 22:22:40 +0100 Subject: ipc: rename old-style shmctl/semctl/msgctl syscalls The behavior of these system calls is slightly different between architectures, as determined by the CONFIG_ARCH_WANT_IPC_PARSE_VERSION symbol. Most architectures that implement the split IPC syscalls don't set that symbol and only get the modern version, but alpha, arm, microblaze, mips-n32, mips-n64 and xtensa expect the caller to pass the IPC_64 flag. For the architectures that so far only implement sys_ipc(), i.e. m68k, mips-o32, powerpc, s390, sh, sparc, and x86-32, we want the new behavior when adding the split syscalls, so we need to distinguish between the two groups of architectures. The method I picked for this distinction is to have a separate system call entry point: sys_old_*ctl() now uses ipc_parse_version, while sys_*ctl() does not. The system call tables of the five architectures are changed accordingly. As an additional benefit, we no longer need the configuration specific definition for ipc_parse_version(), it always does the same thing now, but simply won't get called on architectures with the modern interface. A small downside is that on architectures that do set ARCH_WANT_IPC_PARSE_VERSION, we now have an extra set of entry points that are never called. They only add a few bytes of bloat, so it seems better to keep them compared to adding yet another Kconfig symbol. I considered adding new syscall numbers for the IPC_64 variants for consistency, but decided against that for now. Signed-off-by: Arnd Bergmann --- include/linux/syscalls.h | 3 +++ 1 file changed, 3 insertions(+) (limited to 'include/linux/syscalls.h') diff --git a/include/linux/syscalls.h b/include/linux/syscalls.h index fb63045a0fb6..938d8908b9e0 100644 --- a/include/linux/syscalls.h +++ b/include/linux/syscalls.h @@ -717,6 +717,7 @@ asmlinkage long sys_mq_getsetattr(mqd_t mqdes, const struct mq_attr __user *mqst /* ipc/msg.c */ asmlinkage long sys_msgget(key_t key, int msgflg); +asmlinkage long sys_old_msgctl(int msqid, int cmd, struct msqid_ds __user *buf); asmlinkage long sys_msgctl(int msqid, int cmd, struct msqid_ds __user *buf); asmlinkage long sys_msgrcv(int msqid, struct msgbuf __user *msgp, size_t msgsz, long msgtyp, int msgflg); @@ -726,6 +727,7 @@ asmlinkage long sys_msgsnd(int msqid, struct msgbuf __user *msgp, /* ipc/sem.c */ asmlinkage long sys_semget(key_t key, int nsems, int semflg); asmlinkage long sys_semctl(int semid, int semnum, int cmd, unsigned long arg); +asmlinkage long sys_old_semctl(int semid, int semnum, int cmd, unsigned long arg); asmlinkage long sys_semtimedop(int semid, struct sembuf __user *sops, unsigned nsops, const struct __kernel_timespec __user *timeout); @@ -734,6 +736,7 @@ asmlinkage long sys_semop(int semid, struct sembuf __user *sops, /* ipc/shm.c */ asmlinkage long sys_shmget(key_t key, size_t size, int flag); +asmlinkage long sys_old_shmctl(int shmid, int cmd, struct shmid_ds __user *buf); asmlinkage long sys_shmctl(int shmid, int cmd, struct shmid_ds __user *buf); asmlinkage long sys_shmat(int shmid, char __user *shmaddr, int shmflg); asmlinkage long sys_shmdt(char __user *shmaddr); -- cgit v1.2.3 From 50b93f30f6d8672f9ec80e90af94d733f11a20e0 Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Tue, 1 Jan 2019 17:34:39 +0100 Subject: time: fix sys_timer_settime prototype A small typo has crept into the y2038 conversion of the timer_settime system call. So far this was completely harmless, but once we start using the new version, this has to be fixed. Fixes: 6ff847350702 ("time: Change types to new y2038 safe __kernel_itimerspec") Signed-off-by: Arnd Bergmann --- include/linux/syscalls.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux/syscalls.h') diff --git a/include/linux/syscalls.h b/include/linux/syscalls.h index 938d8908b9e0..baa4b70b02d3 100644 --- a/include/linux/syscalls.h +++ b/include/linux/syscalls.h @@ -591,7 +591,7 @@ asmlinkage long sys_timer_gettime(timer_t timer_id, asmlinkage long sys_timer_getoverrun(timer_t timer_id); asmlinkage long sys_timer_settime(timer_t timer_id, int flags, const struct __kernel_itimerspec __user *new_setting, - struct itimerspec __user *old_setting); + struct __kernel_itimerspec __user *old_setting); asmlinkage long sys_timer_delete(timer_t timer_id); asmlinkage long sys_clock_settime(clockid_t which_clock, const struct __kernel_timespec __user *tp); -- cgit v1.2.3 From 3876ced476c8ec17265d1739467e726ada88b660 Mon Sep 17 00:00:00 2001 From: Deepa Dinamani Date: Mon, 2 Jul 2018 22:44:22 -0700 Subject: timex: change syscalls to use struct __kernel_timex struct timex is not y2038 safe. Switch all the syscall apis to use y2038 safe __kernel_timex. Note that sys_adjtimex() does not have a y2038 safe solution. C libraries can implement it by calling clock_adjtime(CLOCK_REALTIME, ...). Signed-off-by: Deepa Dinamani Signed-off-by: Arnd Bergmann --- include/linux/syscalls.h | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'include/linux/syscalls.h') diff --git a/include/linux/syscalls.h b/include/linux/syscalls.h index baa4b70b02d3..09330d5bda0c 100644 --- a/include/linux/syscalls.h +++ b/include/linux/syscalls.h @@ -54,7 +54,7 @@ struct __sysctl_args; struct sysinfo; struct timespec; struct timeval; -struct timex; +struct __kernel_timex; struct timezone; struct tms; struct utimbuf; @@ -695,7 +695,7 @@ asmlinkage long sys_gettimeofday(struct timeval __user *tv, struct timezone __user *tz); asmlinkage long sys_settimeofday(struct timeval __user *tv, struct timezone __user *tz); -asmlinkage long sys_adjtimex(struct timex __user *txc_p); +asmlinkage long sys_adjtimex(struct __kernel_timex __user *txc_p); /* kernel/timer.c */ asmlinkage long sys_getpid(void); @@ -870,7 +870,7 @@ asmlinkage long sys_open_by_handle_at(int mountdirfd, struct file_handle __user *handle, int flags); asmlinkage long sys_clock_adjtime(clockid_t which_clock, - struct timex __user *tx); + struct __kernel_timex __user *tx); asmlinkage long sys_syncfs(int fd); asmlinkage long sys_setns(int fd, int nstype); asmlinkage long sys_sendmmsg(int fd, struct mmsghdr __user *msg, -- cgit v1.2.3 From 8dabe7245bbc134f2cfcc12cde75c019dab924cc Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Mon, 7 Jan 2019 00:33:08 +0100 Subject: y2038: syscalls: rename y2038 compat syscalls A lot of system calls that pass a time_t somewhere have an implementation using a COMPAT_SYSCALL_DEFINEx() on 64-bit architectures, and have been reworked so that this implementation can now be used on 32-bit architectures as well. The missing step is to redefine them using the regular SYSCALL_DEFINEx() to get them out of the compat namespace and make it possible to build them on 32-bit architectures. Any system call that ends in 'time' gets a '32' suffix on its name for that version, while the others get a '_time32' suffix, to distinguish them from the normal version, which takes a 64-bit time argument in the future. In this step, only 64-bit architectures are changed, doing this rename first lets us avoid touching the 32-bit architectures twice. Acked-by: Catalin Marinas Signed-off-by: Arnd Bergmann --- include/linux/syscalls.h | 57 ++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 57 insertions(+) (limited to 'include/linux/syscalls.h') diff --git a/include/linux/syscalls.h b/include/linux/syscalls.h index 09330d5bda0c..94369f5bd8e5 100644 --- a/include/linux/syscalls.h +++ b/include/linux/syscalls.h @@ -297,6 +297,11 @@ asmlinkage long sys_io_getevents(aio_context_t ctx_id, long nr, struct io_event __user *events, struct __kernel_timespec __user *timeout); +asmlinkage long sys_io_getevents_time32(__u32 ctx_id, + __s32 min_nr, + __s32 nr, + struct io_event __user *events, + struct old_timespec32 __user *timeout); asmlinkage long sys_io_pgetevents(aio_context_t ctx_id, long min_nr, long nr, @@ -522,11 +527,19 @@ asmlinkage long sys_timerfd_settime(int ufd, int flags, const struct __kernel_itimerspec __user *utmr, struct __kernel_itimerspec __user *otmr); asmlinkage long sys_timerfd_gettime(int ufd, struct __kernel_itimerspec __user *otmr); +asmlinkage long sys_timerfd_gettime32(int ufd, + struct old_itimerspec32 __user *otmr); +asmlinkage long sys_timerfd_settime32(int ufd, int flags, + const struct old_itimerspec32 __user *utmr, + struct old_itimerspec32 __user *otmr); /* fs/utimes.c */ asmlinkage long sys_utimensat(int dfd, const char __user *filename, struct __kernel_timespec __user *utimes, int flags); +asmlinkage long sys_utimensat_time32(unsigned int dfd, + const char __user *filename, + struct old_timespec32 __user *t, int flags); /* kernel/acct.c */ asmlinkage long sys_acct(const char __user *name); @@ -555,6 +568,9 @@ asmlinkage long sys_unshare(unsigned long unshare_flags); asmlinkage long sys_futex(u32 __user *uaddr, int op, u32 val, struct __kernel_timespec __user *utime, u32 __user *uaddr2, u32 val3); +asmlinkage long sys_futex_time32(u32 __user *uaddr, int op, u32 val, + struct old_timespec32 __user *utime, u32 __user *uaddr2, + u32 val3); asmlinkage long sys_get_robust_list(int pid, struct robust_list_head __user * __user *head_ptr, size_t __user *len_ptr); @@ -564,6 +580,8 @@ asmlinkage long sys_set_robust_list(struct robust_list_head __user *head, /* kernel/hrtimer.c */ asmlinkage long sys_nanosleep(struct __kernel_timespec __user *rqtp, struct __kernel_timespec __user *rmtp); +asmlinkage long sys_nanosleep_time32(struct old_timespec32 __user *rqtp, + struct old_timespec32 __user *rmtp); /* kernel/itimer.c */ asmlinkage long sys_getitimer(int which, struct itimerval __user *value); @@ -602,6 +620,20 @@ asmlinkage long sys_clock_getres(clockid_t which_clock, asmlinkage long sys_clock_nanosleep(clockid_t which_clock, int flags, const struct __kernel_timespec __user *rqtp, struct __kernel_timespec __user *rmtp); +asmlinkage long sys_timer_gettime32(timer_t timer_id, + struct old_itimerspec32 __user *setting); +asmlinkage long sys_timer_settime32(timer_t timer_id, int flags, + struct old_itimerspec32 __user *new, + struct old_itimerspec32 __user *old); +asmlinkage long sys_clock_settime32(clockid_t which_clock, + struct old_timespec32 __user *tp); +asmlinkage long sys_clock_gettime32(clockid_t which_clock, + struct old_timespec32 __user *tp); +asmlinkage long sys_clock_getres_time32(clockid_t which_clock, + struct old_timespec32 __user *tp); +asmlinkage long sys_clock_nanosleep_time32(clockid_t which_clock, int flags, + struct old_timespec32 __user *rqtp, + struct old_timespec32 __user *rmtp); /* kernel/printk.c */ asmlinkage long sys_syslog(int type, char __user *buf, int len); @@ -627,6 +659,8 @@ asmlinkage long sys_sched_get_priority_max(int policy); asmlinkage long sys_sched_get_priority_min(int policy); asmlinkage long sys_sched_rr_get_interval(pid_t pid, struct __kernel_timespec __user *interval); +asmlinkage long sys_sched_rr_get_interval_time32(pid_t pid, + struct old_timespec32 __user *interval); /* kernel/signal.c */ asmlinkage long sys_restart_syscall(void); @@ -696,6 +730,7 @@ asmlinkage long sys_gettimeofday(struct timeval __user *tv, asmlinkage long sys_settimeofday(struct timeval __user *tv, struct timezone __user *tz); asmlinkage long sys_adjtimex(struct __kernel_timex __user *txc_p); +asmlinkage long sys_adjtimex_time32(struct old_timex32 __user *txc_p); /* kernel/timer.c */ asmlinkage long sys_getpid(void); @@ -714,6 +749,14 @@ asmlinkage long sys_mq_timedsend(mqd_t mqdes, const char __user *msg_ptr, size_t asmlinkage long sys_mq_timedreceive(mqd_t mqdes, char __user *msg_ptr, size_t msg_len, unsigned int __user *msg_prio, const struct __kernel_timespec __user *abs_timeout); asmlinkage long sys_mq_notify(mqd_t mqdes, const struct sigevent __user *notification); asmlinkage long sys_mq_getsetattr(mqd_t mqdes, const struct mq_attr __user *mqstat, struct mq_attr __user *omqstat); +asmlinkage long sys_mq_timedreceive_time32(mqd_t mqdes, + char __user *u_msg_ptr, + unsigned int msg_len, unsigned int __user *u_msg_prio, + const struct old_timespec32 __user *u_abs_timeout); +asmlinkage long sys_mq_timedsend_time32(mqd_t mqdes, + const char __user *u_msg_ptr, + unsigned int msg_len, unsigned int msg_prio, + const struct old_timespec32 __user *u_abs_timeout); /* ipc/msg.c */ asmlinkage long sys_msgget(key_t key, int msgflg); @@ -731,6 +774,9 @@ asmlinkage long sys_old_semctl(int semid, int semnum, int cmd, unsigned long arg asmlinkage long sys_semtimedop(int semid, struct sembuf __user *sops, unsigned nsops, const struct __kernel_timespec __user *timeout); +asmlinkage long sys_semtimedop_time32(int semid, struct sembuf __user *sops, + unsigned nsops, + const struct old_timespec32 __user *timeout); asmlinkage long sys_semop(int semid, struct sembuf __user *sops, unsigned nsops); @@ -871,6 +917,8 @@ asmlinkage long sys_open_by_handle_at(int mountdirfd, int flags); asmlinkage long sys_clock_adjtime(clockid_t which_clock, struct __kernel_timex __user *tx); +asmlinkage long sys_clock_adjtime32(clockid_t which_clock, + struct old_timex32 __user *tx); asmlinkage long sys_syncfs(int fd); asmlinkage long sys_setns(int fd, int nstype); asmlinkage long sys_sendmmsg(int fd, struct mmsghdr __user *msg, @@ -1006,6 +1054,7 @@ asmlinkage long sys_alarm(unsigned int seconds); asmlinkage long sys_getpgrp(void); asmlinkage long sys_pause(void); asmlinkage long sys_time(time_t __user *tloc); +asmlinkage long sys_time32(old_time32_t __user *tloc); #ifdef __ARCH_WANT_SYS_UTIME asmlinkage long sys_utime(char __user *filename, struct utimbuf __user *times); @@ -1014,6 +1063,13 @@ asmlinkage long sys_utimes(char __user *filename, asmlinkage long sys_futimesat(int dfd, const char __user *filename, struct timeval __user *utimes); #endif +asmlinkage long sys_futimesat_time32(unsigned int dfd, + const char __user *filename, + struct old_timeval32 __user *t); +asmlinkage long sys_utime32(const char __user *filename, + struct old_utimbuf32 __user *t); +asmlinkage long sys_utimes_time32(const char __user *filename, + struct old_timeval32 __user *t); asmlinkage long sys_creat(const char __user *pathname, umode_t mode); asmlinkage long sys_getdents(unsigned int fd, struct linux_dirent __user *dirent, @@ -1038,6 +1094,7 @@ asmlinkage long sys_fork(void); /* obsolete: kernel/time/time.c */ asmlinkage long sys_stime(time_t __user *tptr); +asmlinkage long sys_stime32(old_time32_t __user *tptr); /* obsolete: kernel/signal.c */ asmlinkage long sys_sigpending(old_sigset_t __user *uset); -- cgit v1.2.3 From 2b188cc1bb857a9d4701ae59aa7768b5124e262e Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Mon, 7 Jan 2019 10:46:33 -0700 Subject: Add io_uring IO interface The submission queue (SQ) and completion queue (CQ) rings are shared between the application and the kernel. This eliminates the need to copy data back and forth to submit and complete IO. IO submissions use the io_uring_sqe data structure, and completions are generated in the form of io_uring_cqe data structures. The SQ ring is an index into the io_uring_sqe array, which makes it possible to submit a batch of IOs without them being contiguous in the ring. The CQ ring is always contiguous, as completion events are inherently unordered, and hence any io_uring_cqe entry can point back to an arbitrary submission. Two new system calls are added for this: io_uring_setup(entries, params) Sets up an io_uring instance for doing async IO. On success, returns a file descriptor that the application can mmap to gain access to the SQ ring, CQ ring, and io_uring_sqes. io_uring_enter(fd, to_submit, min_complete, flags, sigset, sigsetsize) Initiates IO against the rings mapped to this fd, or waits for them to complete, or both. The behavior is controlled by the parameters passed in. If 'to_submit' is non-zero, then we'll try and submit new IO. If IORING_ENTER_GETEVENTS is set, the kernel will wait for 'min_complete' events, if they aren't already available. It's valid to set IORING_ENTER_GETEVENTS and 'min_complete' == 0 at the same time, this allows the kernel to return already completed events without waiting for them. This is useful only for polling, as for IRQ driven IO, the application can just check the CQ ring without entering the kernel. With this setup, it's possible to do async IO with a single system call. Future developments will enable polled IO with this interface, and polled submission as well. The latter will enable an application to do IO without doing ANY system calls at all. For IRQ driven IO, an application only needs to enter the kernel for completions if it wants to wait for them to occur. Each io_uring is backed by a workqueue, to support buffered async IO as well. We will only punt to an async context if the command would need to wait for IO on the device side. Any data that can be accessed directly in the page cache is done inline. This avoids the slowness issue of usual threadpools, since cached data is accessed as quickly as a sync interface. Sample application: http://git.kernel.dk/cgit/fio/plain/t/io_uring.c Reviewed-by: Hannes Reinecke Signed-off-by: Jens Axboe --- include/linux/syscalls.h | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'include/linux/syscalls.h') diff --git a/include/linux/syscalls.h b/include/linux/syscalls.h index 257cccba3062..3072dbaa7869 100644 --- a/include/linux/syscalls.h +++ b/include/linux/syscalls.h @@ -69,6 +69,7 @@ struct file_handle; struct sigaltstack; struct rseq; union bpf_attr; +struct io_uring_params; #include #include @@ -309,6 +310,11 @@ asmlinkage long sys_io_pgetevents_time32(aio_context_t ctx_id, struct io_event __user *events, struct old_timespec32 __user *timeout, const struct __aio_sigset *sig); +asmlinkage long sys_io_uring_setup(u32 entries, + struct io_uring_params __user *p); +asmlinkage long sys_io_uring_enter(unsigned int fd, u32 to_submit, + u32 min_complete, u32 flags, + const sigset_t __user *sig, size_t sigsz); /* fs/xattr.c */ asmlinkage long sys_setxattr(const char __user *path, const char __user *name, -- cgit v1.2.3 From edafccee56ff31678a091ddb7219aba9b28bc3cb Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Wed, 9 Jan 2019 09:16:05 -0700 Subject: io_uring: add support for pre-mapped user IO buffers If we have fixed user buffers, we can map them into the kernel when we setup the io_uring. That avoids the need to do get_user_pages() for each and every IO. To utilize this feature, the application must call io_uring_register() after having setup an io_uring instance, passing in IORING_REGISTER_BUFFERS as the opcode. The argument must be a pointer to an iovec array, and the nr_args should contain how many iovecs the application wishes to map. If successful, these buffers are now mapped into the kernel, eligible for IO. To use these fixed buffers, the application must use the IORING_OP_READ_FIXED and IORING_OP_WRITE_FIXED opcodes, and then set sqe->index to the desired buffer index. sqe->addr..sqe->addr+seq->len must point to somewhere inside the indexed buffer. The application may register buffers throughout the lifetime of the io_uring instance. It can call io_uring_register() with IORING_UNREGISTER_BUFFERS as the opcode to unregister the current set of buffers, and then register a new set. The application need not unregister buffers explicitly before shutting down the io_uring instance. It's perfectly valid to setup a larger buffer, and then sometimes only use parts of it for an IO. As long as the range is within the originally mapped region, it will work just fine. For now, buffers must not be file backed. If file backed buffers are passed in, the registration will fail with -1/EOPNOTSUPP. This restriction may be relaxed in the future. RLIMIT_MEMLOCK is used to check how much memory we can pin. A somewhat arbitrary 1G per buffer size is also imposed. Reviewed-by: Hannes Reinecke Signed-off-by: Jens Axboe --- include/linux/syscalls.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/linux/syscalls.h') diff --git a/include/linux/syscalls.h b/include/linux/syscalls.h index 3072dbaa7869..3681c05ac538 100644 --- a/include/linux/syscalls.h +++ b/include/linux/syscalls.h @@ -315,6 +315,8 @@ asmlinkage long sys_io_uring_setup(u32 entries, asmlinkage long sys_io_uring_enter(unsigned int fd, u32 to_submit, u32 min_complete, u32 flags, const sigset_t __user *sig, size_t sigsz); +asmlinkage long sys_io_uring_register(unsigned int fd, unsigned int op, + void __user *arg, unsigned int nr_args); /* fs/xattr.c */ asmlinkage long sys_setxattr(const char __user *path, const char __user *name, -- cgit v1.2.3 From 3eb39f47934f9d5a3027fe00d906a45fe3a15fad Mon Sep 17 00:00:00 2001 From: Christian Brauner Date: Mon, 19 Nov 2018 00:51:56 +0100 Subject: signal: add pidfd_send_signal() syscall The kill() syscall operates on process identifiers (pid). After a process has exited its pid can be reused by another process. If a caller sends a signal to a reused pid it will end up signaling the wrong process. This issue has often surfaced and there has been a push to address this problem [1]. This patch uses file descriptors (fd) from proc/ as stable handles on struct pid. Even if a pid is recycled the handle will not change. The fd can be used to send signals to the process it refers to. Thus, the new syscall pidfd_send_signal() is introduced to solve this problem. Instead of pids it operates on process fds (pidfd). /* prototype and argument /* long pidfd_send_signal(int pidfd, int sig, siginfo_t *info, unsigned int flags); /* syscall number 424 */ The syscall number was chosen to be 424 to align with Arnd's rework in his y2038 to minimize merge conflicts (cf. [25]). In addition to the pidfd and signal argument it takes an additional siginfo_t and flags argument. If the siginfo_t argument is NULL then pidfd_send_signal() is equivalent to kill(, ). If it is not NULL pidfd_send_signal() is equivalent to rt_sigqueueinfo(). The flags argument is added to allow for future extensions of this syscall. It currently needs to be passed as 0. Failing to do so will cause EINVAL. /* pidfd_send_signal() replaces multiple pid-based syscalls */ The pidfd_send_signal() syscall currently takes on the job of rt_sigqueueinfo(2) and parts of the functionality of kill(2), Namely, when a positive pid is passed to kill(2). It will however be possible to also replace tgkill(2) and rt_tgsigqueueinfo(2) if this syscall is extended. /* sending signals to threads (tid) and process groups (pgid) */ Specifically, the pidfd_send_signal() syscall does currently not operate on process groups or threads. This is left for future extensions. In order to extend the syscall to allow sending signal to threads and process groups appropriately named flags (e.g. PIDFD_TYPE_PGID, and PIDFD_TYPE_TID) should be added. This implies that the flags argument will determine what is signaled and not the file descriptor itself. Put in other words, grouping in this api is a property of the flags argument not a property of the file descriptor (cf. [13]). Clarification for this has been requested by Eric (cf. [19]). When appropriate extensions through the flags argument are added then pidfd_send_signal() can additionally replace the part of kill(2) which operates on process groups as well as the tgkill(2) and rt_tgsigqueueinfo(2) syscalls. How such an extension could be implemented has been very roughly sketched in [14], [15], and [16]. However, this should not be taken as a commitment to a particular implementation. There might be better ways to do it. Right now this is intentionally left out to keep this patchset as simple as possible (cf. [4]). /* naming */ The syscall had various names throughout iterations of this patchset: - procfd_signal() - procfd_send_signal() - taskfd_send_signal() In the last round of reviews it was pointed out that given that if the flags argument decides the scope of the signal instead of different types of fds it might make sense to either settle for "procfd_" or "pidfd_" as prefix. The community was willing to accept either (cf. [17] and [18]). Given that one developer expressed strong preference for the "pidfd_" prefix (cf. [13]) and with other developers less opinionated about the name we should settle for "pidfd_" to avoid further bikeshedding. The "_send_signal" suffix was chosen to reflect the fact that the syscall takes on the job of multiple syscalls. It is therefore intentional that the name is not reminiscent of neither kill(2) nor rt_sigqueueinfo(2). Not the fomer because it might imply that pidfd_send_signal() is a replacement for kill(2), and not the latter because it is a hassle to remember the correct spelling - especially for non-native speakers - and because it is not descriptive enough of what the syscall actually does. The name "pidfd_send_signal" makes it very clear that its job is to send signals. /* zombies */ Zombies can be signaled just as any other process. No special error will be reported since a zombie state is an unreliable state (cf. [3]). However, this can be added as an extension through the @flags argument if the need ever arises. /* cross-namespace signals */ The patch currently enforces that the signaler and signalee either are in the same pid namespace or that the signaler's pid namespace is an ancestor of the signalee's pid namespace. This is done for the sake of simplicity and because it is unclear to what values certain members of struct siginfo_t would need to be set to (cf. [5], [6]). /* compat syscalls */ It became clear that we would like to avoid adding compat syscalls (cf. [7]). The compat syscall handling is now done in kernel/signal.c itself by adding __copy_siginfo_from_user_generic() which lets us avoid compat syscalls (cf. [8]). It should be noted that the addition of __copy_siginfo_from_user_any() is caused by a bug in the original implementation of rt_sigqueueinfo(2) (cf. 12). With upcoming rework for syscall handling things might improve significantly (cf. [11]) and __copy_siginfo_from_user_any() will not gain any additional callers. /* testing */ This patch was tested on x64 and x86. /* userspace usage */ An asciinema recording for the basic functionality can be found under [9]. With this patch a process can be killed via: #define _GNU_SOURCE #include #include #include #include #include #include #include #include #include #include static inline int do_pidfd_send_signal(int pidfd, int sig, siginfo_t *info, unsigned int flags) { #ifdef __NR_pidfd_send_signal return syscall(__NR_pidfd_send_signal, pidfd, sig, info, flags); #else return -ENOSYS; #endif } int main(int argc, char *argv[]) { int fd, ret, saved_errno, sig; if (argc < 3) exit(EXIT_FAILURE); fd = open(argv[1], O_DIRECTORY | O_CLOEXEC); if (fd < 0) { printf("%s - Failed to open \"%s\"\n", strerror(errno), argv[1]); exit(EXIT_FAILURE); } sig = atoi(argv[2]); printf("Sending signal %d to process %s\n", sig, argv[1]); ret = do_pidfd_send_signal(fd, sig, NULL, 0); saved_errno = errno; close(fd); errno = saved_errno; if (ret < 0) { printf("%s - Failed to send signal %d to process %s\n", strerror(errno), sig, argv[1]); exit(EXIT_FAILURE); } exit(EXIT_SUCCESS); } /* Q&A * Given that it seems the same questions get asked again by people who are * late to the party it makes sense to add a Q&A section to the commit * message so it's hopefully easier to avoid duplicate threads. * * For the sake of progress please consider these arguments settled unless * there is a new point that desperately needs to be addressed. Please make * sure to check the links to the threads in this commit message whether * this has not already been covered. */ Q-01: (Florian Weimer [20], Andrew Morton [21]) What happens when the target process has exited? A-01: Sending the signal will fail with ESRCH (cf. [22]). Q-02: (Andrew Morton [21]) Is the task_struct pinned by the fd? A-02: No. A reference to struct pid is kept. struct pid - as far as I understand - was created exactly for the reason to not require to pin struct task_struct (cf. [22]). Q-03: (Andrew Morton [21]) Does the entire procfs directory remain visible? Just one entry within it? A-03: The same thing that happens right now when you hold a file descriptor to /proc/ open (cf. [22]). Q-04: (Andrew Morton [21]) Does the pid remain reserved? A-04: No. This patchset guarantees a stable handle not that pids are not recycled (cf. [22]). Q-05: (Andrew Morton [21]) Do attempts to signal that fd return errors? A-05: See {Q,A}-01. Q-06: (Andrew Morton [22]) Is there a cleaner way of obtaining the fd? Another syscall perhaps. A-06: Userspace can already trivially retrieve file descriptors from procfs so this is something that we will need to support anyway. Hence, there's no immediate need to add another syscalls just to make pidfd_send_signal() not dependent on the presence of procfs. However, adding a syscalls to get such file descriptors is planned for a future patchset (cf. [22]). Q-07: (Andrew Morton [21] and others) This fd-for-a-process sounds like a handy thing and people may well think up other uses for it in the future, probably unrelated to signals. Are the code and the interface designed to permit such future applications? A-07: Yes (cf. [22]). Q-08: (Andrew Morton [21] and others) Now I think about it, why a new syscall? This thing is looking rather like an ioctl? A-08: This has been extensively discussed. It was agreed that a syscall is preferred for a variety or reasons. Here are just a few taken from prior threads. Syscalls are safer than ioctl()s especially when signaling to fds. Processes are a core kernel concept so a syscall seems more appropriate. The layout of the syscall with its four arguments would require the addition of a custom struct for the ioctl() thereby causing at least the same amount or even more complexity for userspace than a simple syscall. The new syscall will replace multiple other pid-based syscalls (see description above). The file-descriptors-for-processes concept introduced with this syscall will be extended with other syscalls in the future. See also [22], [23] and various other threads already linked in here. Q-09: (Florian Weimer [24]) What happens if you use the new interface with an O_PATH descriptor? A-09: pidfds opened as O_PATH fds cannot be used to send signals to a process (cf. [2]). Signaling processes through pidfds is the equivalent of writing to a file. Thus, this is not an operation that operates "purely at the file descriptor level" as required by the open(2) manpage. See also [4]. /* References */ [1]: https://lore.kernel.org/lkml/20181029221037.87724-1-dancol@google.com/ [2]: https://lore.kernel.org/lkml/874lbtjvtd.fsf@oldenburg2.str.redhat.com/ [3]: https://lore.kernel.org/lkml/20181204132604.aspfupwjgjx6fhva@brauner.io/ [4]: https://lore.kernel.org/lkml/20181203180224.fkvw4kajtbvru2ku@brauner.io/ [5]: https://lore.kernel.org/lkml/20181121213946.GA10795@mail.hallyn.com/ [6]: https://lore.kernel.org/lkml/20181120103111.etlqp7zop34v6nv4@brauner.io/ [7]: https://lore.kernel.org/lkml/36323361-90BD-41AF-AB5B-EE0D7BA02C21@amacapital.net/ [8]: https://lore.kernel.org/lkml/87tvjxp8pc.fsf@xmission.com/ [9]: https://asciinema.org/a/IQjuCHew6bnq1cr78yuMv16cy [11]: https://lore.kernel.org/lkml/F53D6D38-3521-4C20-9034-5AF447DF62FF@amacapital.net/ [12]: https://lore.kernel.org/lkml/87zhtjn8ck.fsf@xmission.com/ [13]: https://lore.kernel.org/lkml/871s6u9z6u.fsf@xmission.com/ [14]: https://lore.kernel.org/lkml/20181206231742.xxi4ghn24z4h2qki@brauner.io/ [15]: https://lore.kernel.org/lkml/20181207003124.GA11160@mail.hallyn.com/ [16]: https://lore.kernel.org/lkml/20181207015423.4miorx43l3qhppfz@brauner.io/ [17]: https://lore.kernel.org/lkml/CAGXu5jL8PciZAXvOvCeCU3wKUEB_dU-O3q0tDw4uB_ojMvDEew@mail.gmail.com/ [18]: https://lore.kernel.org/lkml/20181206222746.GB9224@mail.hallyn.com/ [19]: https://lore.kernel.org/lkml/20181208054059.19813-1-christian@brauner.io/ [20]: https://lore.kernel.org/lkml/8736rebl9s.fsf@oldenburg.str.redhat.com/ [21]: https://lore.kernel.org/lkml/20181228152012.dbf0508c2508138efc5f2bbe@linux-foundation.org/ [22]: https://lore.kernel.org/lkml/20181228233725.722tdfgijxcssg76@brauner.io/ [23]: https://lwn.net/Articles/773459/ [24]: https://lore.kernel.org/lkml/8736rebl9s.fsf@oldenburg.str.redhat.com/ [25]: https://lore.kernel.org/lkml/CAK8P3a0ej9NcJM8wXNPbcGUyOUZYX+VLoDFdbenW3s3114oQZw@mail.gmail.com/ Cc: "Eric W. Biederman" Cc: Jann Horn Cc: Andy Lutomirsky Cc: Andrew Morton Cc: Oleg Nesterov Cc: Al Viro Cc: Florian Weimer Signed-off-by: Christian Brauner Reviewed-by: Tycho Andersen Reviewed-by: Kees Cook Reviewed-by: David Howells Acked-by: Arnd Bergmann Acked-by: Thomas Gleixner Acked-by: Serge Hallyn Acked-by: Aleksa Sarai --- include/linux/syscalls.h | 3 +++ 1 file changed, 3 insertions(+) (limited to 'include/linux/syscalls.h') diff --git a/include/linux/syscalls.h b/include/linux/syscalls.h index 257cccba3062..5eb2e351675e 100644 --- a/include/linux/syscalls.h +++ b/include/linux/syscalls.h @@ -926,6 +926,9 @@ asmlinkage long sys_statx(int dfd, const char __user *path, unsigned flags, unsigned mask, struct statx __user *buffer); asmlinkage long sys_rseq(struct rseq __user *rseq, uint32_t rseq_len, int flags, uint32_t sig); +asmlinkage long sys_pidfd_send_signal(int pidfd, int sig, + siginfo_t __user *info, + unsigned int flags); /* * Architecture-specific system calls -- cgit v1.2.3