From 444ebb5768c5c43aadfc60111fecd6c4f946e77b Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Mon, 24 Feb 2020 11:32:43 +0300 Subject: splice: make do_splice public Make do_splice(), so other kernel parts can reuse it Signed-off-by: Pavel Begunkov Signed-off-by: Jens Axboe --- include/linux/splice.h | 3 +++ 1 file changed, 3 insertions(+) (limited to 'include') diff --git a/include/linux/splice.h b/include/linux/splice.h index 74b4911ac16d..ebbbfea48aa0 100644 --- a/include/linux/splice.h +++ b/include/linux/splice.h @@ -78,6 +78,9 @@ extern ssize_t add_to_pipe(struct pipe_inode_info *, struct pipe_buffer *); extern ssize_t splice_direct_to_actor(struct file *, struct splice_desc *, splice_direct_actor *); +extern long do_splice(struct file *in, loff_t __user *off_in, + struct file *out, loff_t __user *off_out, + size_t len, unsigned int flags); /* * for dynamic pipe sizing -- cgit v1.2.3 From 7d67af2c013402537385dae343a2d0f6a4cb3bfd Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Mon, 24 Feb 2020 11:32:45 +0300 Subject: io_uring: add splice(2) support Add support for splice(2). - output file is specified as sqe->fd, so it's handled by generic code - hash_reg_file handled by generic code as well - len is 32bit, but should be fine - the fd_in is registered file, when SPLICE_F_FD_IN_FIXED is set, which is a splice flag (i.e. sqe->splice_flags). Signed-off-by: Pavel Begunkov Signed-off-by: Jens Axboe --- include/uapi/linux/io_uring.h | 14 +++++++++++++- 1 file changed, 13 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/include/uapi/linux/io_uring.h b/include/uapi/linux/io_uring.h index 3f7961c1c243..08891cc1c1e7 100644 --- a/include/uapi/linux/io_uring.h +++ b/include/uapi/linux/io_uring.h @@ -23,7 +23,10 @@ struct io_uring_sqe { __u64 off; /* offset into file */ __u64 addr2; }; - __u64 addr; /* pointer to buffer or iovecs */ + union { + __u64 addr; /* pointer to buffer or iovecs */ + __u64 splice_off_in; + }; __u32 len; /* buffer size or number of iovecs */ union { __kernel_rwf_t rw_flags; @@ -37,6 +40,7 @@ struct io_uring_sqe { __u32 open_flags; __u32 statx_flags; __u32 fadvise_advice; + __u32 splice_flags; }; __u64 user_data; /* data to be passed back at completion time */ union { @@ -45,6 +49,7 @@ struct io_uring_sqe { __u16 buf_index; /* personality to use, if used */ __u16 personality; + __s32 splice_fd_in; }; __u64 __pad2[3]; }; @@ -113,6 +118,7 @@ enum { IORING_OP_RECV, IORING_OP_OPENAT2, IORING_OP_EPOLL_CTL, + IORING_OP_SPLICE, /* this goes last, obviously */ IORING_OP_LAST, @@ -128,6 +134,12 @@ enum { */ #define IORING_TIMEOUT_ABS (1U << 0) +/* + * sqe->splice_flags + * extends splice(2) flags + */ +#define SPLICE_F_FD_IN_FIXED (1U << 31) /* the last bit of __u32 */ + /* * IO completion data structure (Completion Queue Entry) */ -- cgit v1.2.3 From d7718a9d25a61442da8ee8aeeff6a0097f0ccfd6 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Fri, 14 Feb 2020 22:23:12 -0700 Subject: io_uring: use poll driven retry for files that support it Currently io_uring tries any request in a non-blocking manner, if it can, and then retries from a worker thread if we get -EAGAIN. Now that we have a new and fancy poll based retry backend, use that to retry requests if the file supports it. This means that, for example, an IORING_OP_RECVMSG on a socket no longer requires an async thread to complete the IO. If we get -EAGAIN reading from the socket in a non-blocking manner, we arm a poll handler for notification on when the socket becomes readable. When it does, the pending read is executed directly by the task again, through the io_uring task work handlers. Not only is this faster and more efficient, it also means we're not generating potentially tons of async threads that just sit and block, waiting for the IO to complete. The feature is marked with IORING_FEAT_FAST_POLL, meaning that async pollable IO is fast, and that pollother_op is fast as well. Signed-off-by: Jens Axboe --- include/trace/events/io_uring.h | 103 ++++++++++++++++++++++++++++++++++++++++ include/uapi/linux/io_uring.h | 1 + 2 files changed, 104 insertions(+) (limited to 'include') diff --git a/include/trace/events/io_uring.h b/include/trace/events/io_uring.h index 27bd9e4f927b..9f0d3b7d56b0 100644 --- a/include/trace/events/io_uring.h +++ b/include/trace/events/io_uring.h @@ -357,6 +357,109 @@ TRACE_EVENT(io_uring_submit_sqe, __entry->force_nonblock, __entry->sq_thread) ); +TRACE_EVENT(io_uring_poll_arm, + + TP_PROTO(void *ctx, u8 opcode, u64 user_data, int mask, int events), + + TP_ARGS(ctx, opcode, user_data, mask, events), + + TP_STRUCT__entry ( + __field( void *, ctx ) + __field( u8, opcode ) + __field( u64, user_data ) + __field( int, mask ) + __field( int, events ) + ), + + TP_fast_assign( + __entry->ctx = ctx; + __entry->opcode = opcode; + __entry->user_data = user_data; + __entry->mask = mask; + __entry->events = events; + ), + + TP_printk("ring %p, op %d, data 0x%llx, mask 0x%x, events 0x%x", + __entry->ctx, __entry->opcode, + (unsigned long long) __entry->user_data, + __entry->mask, __entry->events) +); + +TRACE_EVENT(io_uring_poll_wake, + + TP_PROTO(void *ctx, u8 opcode, u64 user_data, int mask), + + TP_ARGS(ctx, opcode, user_data, mask), + + TP_STRUCT__entry ( + __field( void *, ctx ) + __field( u8, opcode ) + __field( u64, user_data ) + __field( int, mask ) + ), + + TP_fast_assign( + __entry->ctx = ctx; + __entry->opcode = opcode; + __entry->user_data = user_data; + __entry->mask = mask; + ), + + TP_printk("ring %p, op %d, data 0x%llx, mask 0x%x", + __entry->ctx, __entry->opcode, + (unsigned long long) __entry->user_data, + __entry->mask) +); + +TRACE_EVENT(io_uring_task_add, + + TP_PROTO(void *ctx, u8 opcode, u64 user_data, int mask), + + TP_ARGS(ctx, opcode, user_data, mask), + + TP_STRUCT__entry ( + __field( void *, ctx ) + __field( u8, opcode ) + __field( u64, user_data ) + __field( int, mask ) + ), + + TP_fast_assign( + __entry->ctx = ctx; + __entry->opcode = opcode; + __entry->user_data = user_data; + __entry->mask = mask; + ), + + TP_printk("ring %p, op %d, data 0x%llx, mask %x", + __entry->ctx, __entry->opcode, + (unsigned long long) __entry->user_data, + __entry->mask) +); + +TRACE_EVENT(io_uring_task_run, + + TP_PROTO(void *ctx, u8 opcode, u64 user_data), + + TP_ARGS(ctx, opcode, user_data), + + TP_STRUCT__entry ( + __field( void *, ctx ) + __field( u8, opcode ) + __field( u64, user_data ) + ), + + TP_fast_assign( + __entry->ctx = ctx; + __entry->opcode = opcode; + __entry->user_data = user_data; + ), + + TP_printk("ring %p, op %d, data 0x%llx", + __entry->ctx, __entry->opcode, + (unsigned long long) __entry->user_data) +); + #endif /* _TRACE_IO_URING_H */ /* This part must be outside protection */ diff --git a/include/uapi/linux/io_uring.h b/include/uapi/linux/io_uring.h index 08891cc1c1e7..53b36311cdac 100644 --- a/include/uapi/linux/io_uring.h +++ b/include/uapi/linux/io_uring.h @@ -216,6 +216,7 @@ struct io_uring_params { #define IORING_FEAT_SUBMIT_STABLE (1U << 2) #define IORING_FEAT_RW_CUR_POS (1U << 3) #define IORING_FEAT_CUR_PERSONALITY (1U << 4) +#define IORING_FEAT_FAST_POLL (1U << 5) /* * io_uring_register(2) opcodes and arguments -- cgit v1.2.3 From ddf0322db79c5984dc1a1db890f946dd19b7d6d9 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Sun, 23 Feb 2020 16:41:33 -0700 Subject: io_uring: add IORING_OP_PROVIDE_BUFFERS IORING_OP_PROVIDE_BUFFERS uses the buffer registration infrastructure to support passing in an addr/len that is associated with a buffer ID and buffer group ID. The group ID is used to index and lookup the buffers, while the buffer ID can be used to notify the application which buffer in the group was used. The addr passed in is the starting buffer address, and length is each buffer length. A number of buffers to add with can be specified, in which case addr is incremented by length for each addition, and each buffer increments the buffer ID specified. No validation is done of the buffer ID. If the application provides buffers within the same group with identical buffer IDs, then it'll have a hard time telling which buffer ID was used. The only restriction is that the buffer ID can be a max of 16-bits in size, so USHRT_MAX is the maximum ID that can be used. Signed-off-by: Jens Axboe --- include/uapi/linux/io_uring.h | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/include/uapi/linux/io_uring.h b/include/uapi/linux/io_uring.h index 53b36311cdac..bc34a57a660b 100644 --- a/include/uapi/linux/io_uring.h +++ b/include/uapi/linux/io_uring.h @@ -45,8 +45,13 @@ struct io_uring_sqe { __u64 user_data; /* data to be passed back at completion time */ union { struct { - /* index into fixed buffers, if used */ - __u16 buf_index; + /* pack this to avoid bogus arm OABI complaints */ + union { + /* index into fixed buffers, if used */ + __u16 buf_index; + /* for grouped buffer selection */ + __u16 buf_group; + } __attribute__((packed)); /* personality to use, if used */ __u16 personality; __s32 splice_fd_in; @@ -119,6 +124,7 @@ enum { IORING_OP_OPENAT2, IORING_OP_EPOLL_CTL, IORING_OP_SPLICE, + IORING_OP_PROVIDE_BUFFERS, /* this goes last, obviously */ IORING_OP_LAST, -- cgit v1.2.3 From bcda7baaa3f15c7a95db3c024bb046d6e298f76b Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Sun, 23 Feb 2020 16:42:51 -0700 Subject: io_uring: support buffer selection for OP_READ and OP_RECV If a server process has tons of pending socket connections, generally it uses epoll to wait for activity. When the socket is ready for reading (or writing), the task can select a buffer and issue a recv/send on the given fd. Now that we have fast (non-async thread) support, a task can have tons of pending reads or writes pending. But that means they need buffers to back that data, and if the number of connections is high enough, having them preallocated for all possible connections is unfeasible. With IORING_OP_PROVIDE_BUFFERS, an application can register buffers to use for any request. The request then sets IOSQE_BUFFER_SELECT in the sqe, and a given group ID in sqe->buf_group. When the fd becomes ready, a free buffer from the specified group is selected. If none are available, the request is terminated with -ENOBUFS. If successful, the CQE on completion will contain the buffer ID chosen in the cqe->flags member, encoded as: (buffer_id << IORING_CQE_BUFFER_SHIFT) | IORING_CQE_F_BUFFER; Once a buffer has been consumed by a request, it is no longer available and must be registered again with IORING_OP_PROVIDE_BUFFERS. Requests need to support this feature. For now, IORING_OP_READ and IORING_OP_RECV support it. This is checked on SQE submission, a CQE with res == -EOPNOTSUPP will be posted if attempted on unsupported requests. Signed-off-by: Jens Axboe --- include/uapi/linux/io_uring.h | 14 ++++++++++++++ 1 file changed, 14 insertions(+) (limited to 'include') diff --git a/include/uapi/linux/io_uring.h b/include/uapi/linux/io_uring.h index bc34a57a660b..9b263d9b24e6 100644 --- a/include/uapi/linux/io_uring.h +++ b/include/uapi/linux/io_uring.h @@ -66,6 +66,7 @@ enum { IOSQE_IO_LINK_BIT, IOSQE_IO_HARDLINK_BIT, IOSQE_ASYNC_BIT, + IOSQE_BUFFER_SELECT_BIT, }; /* @@ -81,6 +82,8 @@ enum { #define IOSQE_IO_HARDLINK (1U << IOSQE_IO_HARDLINK_BIT) /* always go async */ #define IOSQE_ASYNC (1U << IOSQE_ASYNC_BIT) +/* select buffer from sqe->buf_group */ +#define IOSQE_BUFFER_SELECT (1U << IOSQE_BUFFER_SELECT_BIT) /* * io_uring_setup() flags @@ -155,6 +158,17 @@ struct io_uring_cqe { __u32 flags; }; +/* + * cqe->flags + * + * IORING_CQE_F_BUFFER If set, the upper 16 bits are the buffer ID + */ +#define IORING_CQE_F_BUFFER (1U << 0) + +enum { + IORING_CQE_BUFFER_SHIFT = 16, +}; + /* * Magic offsets for the application to mmap the data it needs */ -- cgit v1.2.3 From 0a384abfae66651b28e4bbe16883b1ff046ba3b3 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Thu, 27 Feb 2020 08:11:20 -0700 Subject: net: abstract out normal and compat msghdr import This splits it into two parts, one that imports the message, and one that imports the iovec. This allows a caller to only do the first part, and import the iovec manually afterwards. No functional changes in this patch. Acked-by: David Miller Signed-off-by: Jens Axboe --- include/linux/socket.h | 4 ++++ include/net/compat.h | 3 +++ 2 files changed, 7 insertions(+) (limited to 'include') diff --git a/include/linux/socket.h b/include/linux/socket.h index 2d2313403101..fc59ac825561 100644 --- a/include/linux/socket.h +++ b/include/linux/socket.h @@ -391,6 +391,10 @@ extern int recvmsg_copy_msghdr(struct msghdr *msg, struct user_msghdr __user *umsg, unsigned flags, struct sockaddr __user **uaddr, struct iovec **iov); +extern int __copy_msghdr_from_user(struct msghdr *kmsg, + struct user_msghdr __user *umsg, + struct sockaddr __user **save_addr, + struct iovec __user **uiov, size_t *nsegs); /* helpers which do the actual work for syscalls */ extern int __sys_recvfrom(int fd, void __user *ubuf, size_t size, diff --git a/include/net/compat.h b/include/net/compat.h index f277653c7e17..e341260642fe 100644 --- a/include/net/compat.h +++ b/include/net/compat.h @@ -38,6 +38,9 @@ struct compat_cmsghdr { #define compat_mmsghdr mmsghdr #endif /* defined(CONFIG_COMPAT) */ +int __get_compat_msghdr(struct msghdr *kmsg, struct compat_msghdr __user *umsg, + struct sockaddr __user **save_addr, compat_uptr_t *ptr, + compat_size_t *len); int get_compat_msghdr(struct msghdr *, struct compat_msghdr __user *, struct sockaddr __user **, struct iovec **); struct sock_fprog __user *get_compat_bpf_fprog(char __user *optval); -- cgit v1.2.3 From 067524e914cb23e20d59480b318fe2625eaee7c8 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Mon, 2 Mar 2020 16:32:28 -0700 Subject: io_uring: provide means of removing buffers We have IORING_OP_PROVIDE_BUFFERS, but the only way to remove buffers is to trigger IO on them. The usual case of shrinking a buffer pool would be to just not replenish the buffers when IO completes, and instead just free it. But it may be nice to have a way to manually remove a number of buffers from a given group, and IORING_OP_REMOVE_BUFFERS provides that functionality. Signed-off-by: Jens Axboe --- include/uapi/linux/io_uring.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include') diff --git a/include/uapi/linux/io_uring.h b/include/uapi/linux/io_uring.h index 9b263d9b24e6..cef4c0c0f26b 100644 --- a/include/uapi/linux/io_uring.h +++ b/include/uapi/linux/io_uring.h @@ -128,6 +128,7 @@ enum { IORING_OP_EPOLL_CTL, IORING_OP_SPLICE, IORING_OP_PROVIDE_BUFFERS, + IORING_OP_REMOVE_BUFFERS, /* this goes last, obviously */ IORING_OP_LAST, -- cgit v1.2.3 From bbbdeb4720a0759ec90e3bcb20ad28d19e531346 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Wed, 11 Mar 2020 07:45:46 -0600 Subject: io_uring: dual license io_uring.h uapi header This just syncs the header it with the liburing version, so there's no confusion on the license of the header parts. Signed-off-by: Jens Axboe --- include/uapi/linux/io_uring.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include') diff --git a/include/uapi/linux/io_uring.h b/include/uapi/linux/io_uring.h index cef4c0c0f26b..6d9d2b1cc523 100644 --- a/include/uapi/linux/io_uring.h +++ b/include/uapi/linux/io_uring.h @@ -1,4 +1,4 @@ -/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */ +/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note OR MIT */ /* * Header file for the io_uring interface. * -- cgit v1.2.3 From 9f5834c868e901b00f1bfe4d0052b5906b4a2b7f Mon Sep 17 00:00:00 2001 From: Lukas Bulwahn Date: Sat, 21 Mar 2020 12:19:07 +0100 Subject: io_uring: make spdxcheck.py happy Commit bbbdeb4720a0 ("io_uring: dual license io_uring.h uapi header") uses a nested SPDX-License-Identifier to dual license the header. Since then, ./scripts/spdxcheck.py complains: include/uapi/linux/io_uring.h: 1:60 Missing parentheses: OR Add parentheses to make spdxcheck.py happy. Signed-off-by: Lukas Bulwahn Signed-off-by: Jens Axboe --- include/uapi/linux/io_uring.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include') diff --git a/include/uapi/linux/io_uring.h b/include/uapi/linux/io_uring.h index 6d9d2b1cc523..e48d746b8e2a 100644 --- a/include/uapi/linux/io_uring.h +++ b/include/uapi/linux/io_uring.h @@ -1,4 +1,4 @@ -/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note OR MIT */ +/* SPDX-License-Identifier: (GPL-2.0 WITH Linux-syscall-note) OR MIT */ /* * Header file for the io_uring interface. * -- cgit v1.2.3