author    Linus Torvalds <torvalds@linux-foundation.org>  2025-07-28 16:30:12 -0700
committer Linus Torvalds <torvalds@linux-foundation.org>  2025-07-28 16:30:12 -0700
commit    c3018a2c6adae9b32f7b9259f5b38257ba9a758e (patch)
tree      3a91bbbc8ef5d0f81a60d9322d1fcefd4720a1c3
parent    e5cf61fa6e2fb9ae6339eaa892612488c966baaf (diff)
parent    d9f595b9a65e9c9eb03e21f3db98fde158d128db (diff)
Merge tag 'for-6.17/io_uring-20250728' of git://git.kernel.dk/linux
Pull io_uring updates from Jens Axboe:

 - Optimization to avoid reference counts on non-cloned registered
   buffers. This is how these buffers were handled prior to having
   cloning support, and we can still use that approach as long as the
   buffers haven't been cloned to another ring.

 - Cleanup and improvement for uring_cmd, where btrfs was the only user
   of storing allocated data for the lifetime of the uring_cmd. Clean
   that up so we can get rid of the need to do that.

 - Avoid unnecessary memory copies in uring_cmd usage. This is
   particularly important as a lot of uring_cmd usage necessitates the
   use of 128b SQEs.

 - A few updates for recv multishot, where it's now possible to add
   fairness limits for limiting how much is transferred for each retry
   loop. Additionally, recv multishot now supports an overall cap as
   well, where once reached the multishot recv will terminate. The
   latter is useful for buffer management and juggling many recv
   streams at the same time.

 - Add support for returning the TX timestamps via a new socket
   command. This feature can work in either singleshot or multishot
   mode, where the latter triggers a completion whenever new timestamps
   are available. This is an alternative to using the existing error
   queue.

 - Add support for an io_uring "mock" file, which is the start of being
   able to do 100% targeted testing in terms of exercising io_uring
   request handling. The idea is to have a file type that can be
   anything the tester would like, and behave exactly how you want it
   to behave in terms of hitting the code paths you want.

 - Improve zcrx by using sgtables to de-duplicate and improve dma
   address handling.

 - Prep work for supporting larger pages for zcrx.

 - Various little improvements and fixes.

* tag 'for-6.17/io_uring-20250728' of git://git.kernel.dk/linux: (42 commits)
  io_uring/zcrx: fix leaking pages on sg init fail
  io_uring/zcrx: don't leak pages on account failure
  io_uring/zcrx: fix null ifq on area destruction
  io_uring: fix breakage in EXPERT menu
  io_uring/cmd: remove struct io_uring_cmd_data
  btrfs/ioctl: store btrfs_uring_encoded_data in io_btrfs_cmd
  io_uring/cmd: introduce IORING_URING_CMD_REISSUE flag
  io_uring/zcrx: account area memory
  io_uring: export io_[un]account_mem
  io_uring/net: Support multishot receive len cap
  io_uring: deduplicate wakeup handling
  io_uring/net: cast min_not_zero() type
  io_uring/poll: cleanup apoll freeing
  io_uring/net: allow multishot receive per-invocation cap
  io_uring/net: move io_sr_msg->retry_flags to io_sr_msg->flags
  io_uring/net: use passed in 'len' in io_recv_buf_select()
  io_uring/zcrx: prepare fallback for larger pages
  io_uring/zcrx: assert area type in io_zcrx_iov_page
  io_uring/zcrx: allocate sgtable for umem areas
  io_uring/zcrx: introduce io_populate_area_dma
  ...
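As a rough illustration of the new recv multishot caps (a sketch only, not a
liburing API guarantee: per the io_recvmsg_prep() changes below, the per-loop
cap rides in sqe->len and the overall byte limit in sqe->optlen, which shares
the file_index union slot; helper names are from liburing):

    #include <liburing.h>

    /*
     * Arm a provided-buffer multishot recv where each retry loop transfers
     * at most per_loop bytes and the request terminates once total bytes
     * have been received overall. Either cap may be 0 to disable it.
     */
    static int arm_capped_recv(struct io_uring *ring, int fd, int bgid,
                               unsigned per_loop, unsigned total)
    {
            struct io_uring_sqe *sqe = io_uring_get_sqe(ring);

            if (!sqe)
                    return -EAGAIN;
            io_uring_prep_recv_multishot(sqe, fd, NULL, 0, 0);
            sqe->flags |= IOSQE_BUFFER_SELECT;
            sqe->buf_group = bgid;
            sqe->len = per_loop;    /* per-invocation mshot limit */
            sqe->optlen = total;    /* overall mshot byte limit */
            return io_uring_submit(ring);
    }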
-rw-r--r--  MAINTAINERS                              |   1
-rw-r--r--  fs/btrfs/ioctl.c                         |  38
-rw-r--r--  include/linux/io_uring/cmd.h             |  11
-rw-r--r--  include/linux/io_uring_types.h           |   5
-rw-r--r--  include/net/sock.h                       |   4
-rw-r--r--  include/uapi/linux/io_uring.h            |  19
-rw-r--r--  include/uapi/linux/io_uring/mock_file.h  |  47
-rw-r--r--  init/Kconfig                             |  13
-rw-r--r--  io_uring/Makefile                        |   1
-rw-r--r--  io_uring/cmd_net.c                       |  82
-rw-r--r--  io_uring/io_uring.c                      |  90
-rw-r--r--  io_uring/io_uring.h                      |  28
-rw-r--r--  io_uring/mock_file.c                     | 363
-rw-r--r--  io_uring/net.c                           |  79
-rw-r--r--  io_uring/nop.c                           |   8
-rw-r--r--  io_uring/opdef.c                         |   1
-rw-r--r--  io_uring/opdef.h                         |   1
-rw-r--r--  io_uring/poll.c                          |  44
-rw-r--r--  io_uring/poll.h                          |   1
-rw-r--r--  io_uring/rsrc.c                          |  10
-rw-r--r--  io_uring/rsrc.h                          |   2
-rw-r--r--  io_uring/rw.c                            |   2
-rw-r--r--  io_uring/uring_cmd.c                     |  93
-rw-r--r--  io_uring/uring_cmd.h                     |   9
-rw-r--r--  io_uring/zcrx.c                          | 267
-rw-r--r--  io_uring/zcrx.h                          |   2
-rw-r--r--  net/socket.c                             |  46
27 files changed, 1029 insertions, 238 deletions
diff --git a/MAINTAINERS b/MAINTAINERS
index 70cb485b2afa..6a918dd74974 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -12687,6 +12687,7 @@ F: include/linux/io_uring.h
F: include/linux/io_uring_types.h
F: include/trace/events/io_uring.h
F: include/uapi/linux/io_uring.h
+F: include/uapi/linux/io_uring/
F: io_uring/
IPMI SUBSYSTEM
diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c
index 895009b147ac..7e13de2bdcbf 100644
--- a/fs/btrfs/ioctl.c
+++ b/fs/btrfs/ioctl.c
@@ -4607,6 +4607,13 @@ out_acct:
return ret;
}
+struct btrfs_uring_encoded_data {
+ struct btrfs_ioctl_encoded_io_args args;
+ struct iovec iovstack[UIO_FASTIOV];
+ struct iovec *iov;
+ struct iov_iter iter;
+};
+
/*
* Context that's attached to an encoded read io_uring command, in cmd->pdu. It
* contains the fields in btrfs_uring_read_extent that are necessary to finish
@@ -4628,6 +4635,7 @@ struct btrfs_uring_priv {
};
struct io_btrfs_cmd {
+ struct btrfs_uring_encoded_data *data;
struct btrfs_uring_priv *priv;
};
@@ -4686,6 +4694,7 @@ out:
kfree(priv->pages);
kfree(priv->iov);
kfree(priv);
+ kfree(bc->data);
}
void btrfs_uring_read_extent_endio(void *ctx, int err)
@@ -4769,13 +4778,6 @@ out_fail:
return ret;
}
-struct btrfs_uring_encoded_data {
- struct btrfs_ioctl_encoded_io_args args;
- struct iovec iovstack[UIO_FASTIOV];
- struct iovec *iov;
- struct iov_iter iter;
-};
-
static int btrfs_uring_encoded_read(struct io_uring_cmd *cmd, unsigned int issue_flags)
{
size_t copy_end_kernel = offsetofend(struct btrfs_ioctl_encoded_io_args, flags);
@@ -4791,7 +4793,11 @@ static int btrfs_uring_encoded_read(struct io_uring_cmd *cmd, unsigned int issue
struct extent_state *cached_state = NULL;
u64 start, lockend;
void __user *sqe_addr;
- struct btrfs_uring_encoded_data *data = io_uring_cmd_get_async_data(cmd)->op_data;
+ struct io_btrfs_cmd *bc = io_uring_cmd_to_pdu(cmd, struct io_btrfs_cmd);
+ struct btrfs_uring_encoded_data *data = NULL;
+
+ if (cmd->flags & IORING_URING_CMD_REISSUE)
+ data = bc->data;
if (!capable(CAP_SYS_ADMIN)) {
ret = -EPERM;
@@ -4821,7 +4827,7 @@ static int btrfs_uring_encoded_read(struct io_uring_cmd *cmd, unsigned int issue
goto out_acct;
}
- io_uring_cmd_get_async_data(cmd)->op_data = data;
+ bc->data = data;
if (issue_flags & IO_URING_F_COMPAT) {
#if defined(CONFIG_64BIT) && defined(CONFIG_COMPAT)
@@ -4919,6 +4925,9 @@ out_acct:
add_rchar(current, ret);
inc_syscr(current);
+ if (ret != -EIOCBQUEUED && ret != -EAGAIN)
+ kfree(data);
+
return ret;
}
@@ -4929,7 +4938,11 @@ static int btrfs_uring_encoded_write(struct io_uring_cmd *cmd, unsigned int issu
struct file *file;
ssize_t ret;
void __user *sqe_addr;
- struct btrfs_uring_encoded_data *data = io_uring_cmd_get_async_data(cmd)->op_data;
+ struct io_btrfs_cmd *bc = io_uring_cmd_to_pdu(cmd, struct io_btrfs_cmd);
+ struct btrfs_uring_encoded_data *data = NULL;
+
+ if (cmd->flags & IORING_URING_CMD_REISSUE)
+ data = bc->data;
if (!capable(CAP_SYS_ADMIN)) {
ret = -EPERM;
@@ -4951,7 +4964,7 @@ static int btrfs_uring_encoded_write(struct io_uring_cmd *cmd, unsigned int issu
goto out_acct;
}
- io_uring_cmd_get_async_data(cmd)->op_data = data;
+ bc->data = data;
if (issue_flags & IO_URING_F_COMPAT) {
#if defined(CONFIG_64BIT) && defined(CONFIG_COMPAT)
@@ -5041,6 +5054,9 @@ out_acct:
if (ret > 0)
add_wchar(current, ret);
inc_syscw(current);
+
+ if (ret != -EAGAIN)
+ kfree(data);
return ret;
}
diff --git a/include/linux/io_uring/cmd.h b/include/linux/io_uring/cmd.h
index 53408124c1e5..cfa6d0c0c322 100644
--- a/include/linux/io_uring/cmd.h
+++ b/include/linux/io_uring/cmd.h
@@ -8,6 +8,8 @@
/* only top 8 bits of sqe->uring_cmd_flags for kernel internal use */
#define IORING_URING_CMD_CANCELABLE (1U << 30)
+/* io_uring_cmd is being issued again */
+#define IORING_URING_CMD_REISSUE (1U << 31)
struct io_uring_cmd {
struct file *file;
@@ -19,10 +21,6 @@ struct io_uring_cmd {
u8 pdu[32]; /* available inline for free use */
};
-struct io_uring_cmd_data {
- void *op_data;
-};
-
static inline const void *io_uring_sqe_cmd(const struct io_uring_sqe *sqe)
{
return sqe->cmd;
@@ -135,11 +133,6 @@ static inline struct task_struct *io_uring_cmd_get_task(struct io_uring_cmd *cmd
return cmd_to_io_kiocb(cmd)->tctx->task;
}
-static inline struct io_uring_cmd_data *io_uring_cmd_get_async_data(struct io_uring_cmd *cmd)
-{
- return cmd_to_io_kiocb(cmd)->async_data;
-}
-
/*
* Return uring_cmd's context reference as its context handle for driver to
* track per-context resource, such as registered kernel IO buffer
diff --git a/include/linux/io_uring_types.h b/include/linux/io_uring_types.h
index a7efcec2e3d0..80a178f3d896 100644
--- a/include/linux/io_uring_types.h
+++ b/include/linux/io_uring_types.h
@@ -26,6 +26,8 @@ enum io_uring_cmd_flags {
IO_URING_F_MULTISHOT = 4,
/* executed by io-wq */
IO_URING_F_IOWQ = 8,
+ /* executed inline from syscall */
+ IO_URING_F_INLINE = 16,
/* int's last bit, sign checks are usually faster than a bit test */
IO_URING_F_NONBLOCK = INT_MIN,
@@ -502,6 +504,7 @@ enum {
REQ_F_BUF_NODE_BIT,
REQ_F_HAS_METADATA_BIT,
REQ_F_IMPORT_BUFFER_BIT,
+ REQ_F_SQE_COPIED_BIT,
/* not a real bit, just to check we're not overflowing the space */
__REQ_F_LAST_BIT,
@@ -591,6 +594,8 @@ enum {
* For SEND_ZC, whether to import buffers (i.e. the first issue).
*/
REQ_F_IMPORT_BUFFER = IO_REQ_FLAG(REQ_F_IMPORT_BUFFER_BIT),
+ /* ->sqe_copy() has been called, if necessary */
+ REQ_F_SQE_COPIED = IO_REQ_FLAG(REQ_F_SQE_COPIED_BIT),
};
typedef void (*io_req_tw_func_t)(struct io_kiocb *req, io_tw_token_t tw);
diff --git a/include/net/sock.h b/include/net/sock.h
index 4c37015b7cf7..131f3662426f 100644
--- a/include/net/sock.h
+++ b/include/net/sock.h
@@ -2677,6 +2677,10 @@ void __sock_recv_timestamp(struct msghdr *msg, struct sock *sk,
void __sock_recv_wifi_status(struct msghdr *msg, struct sock *sk,
struct sk_buff *skb);
+bool skb_has_tx_timestamp(struct sk_buff *skb, const struct sock *sk);
+int skb_get_tx_timestamp(struct sk_buff *skb, struct sock *sk,
+ struct timespec64 *ts);
+
static inline void
sock_recv_timestamp(struct msghdr *msg, struct sock *sk, struct sk_buff *skb)
{
diff --git a/include/uapi/linux/io_uring.h b/include/uapi/linux/io_uring.h
index cfd17e382082..b8a0e70ee2fd 100644
--- a/include/uapi/linux/io_uring.h
+++ b/include/uapi/linux/io_uring.h
@@ -50,7 +50,7 @@ struct io_uring_sqe {
};
__u32 len; /* buffer size or number of iovecs */
union {
- __kernel_rwf_t rw_flags;
+ __u32 rw_flags;
__u32 fsync_flags;
__u16 poll_events; /* compatibility */
__u32 poll32_events; /* word-reversed for BE */
@@ -449,6 +449,7 @@ enum io_uring_msg_ring_flags {
#define IORING_NOP_FILE (1U << 1)
#define IORING_NOP_FIXED_FILE (1U << 2)
#define IORING_NOP_FIXED_BUFFER (1U << 3)
+#define IORING_NOP_TW (1U << 4)
/*
* IO completion data structure (Completion Queue Entry)
@@ -968,6 +969,22 @@ enum io_uring_socket_op {
SOCKET_URING_OP_SIOCOUTQ,
SOCKET_URING_OP_GETSOCKOPT,
SOCKET_URING_OP_SETSOCKOPT,
+ SOCKET_URING_OP_TX_TIMESTAMP,
+};
+
+/*
+ * SOCKET_URING_OP_TX_TIMESTAMP definitions
+ */
+
+#define IORING_TIMESTAMP_HW_SHIFT 16
+/* The cqe->flags bit from which the timestamp type is stored */
+#define IORING_TIMESTAMP_TYPE_SHIFT (IORING_TIMESTAMP_HW_SHIFT + 1)
+/* The cqe->flags flag signifying whether it's a hardware timestamp */
+#define IORING_CQE_F_TSTAMP_HW ((__u32)1 << IORING_TIMESTAMP_HW_SHIFT)
+
+struct io_timespec {
+ __u64 tv_sec;
+ __u64 tv_nsec;
};
/* Zero copy receive refill queue entry */
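For reference, a hedged sketch of consuming SOCKET_URING_OP_TX_TIMESTAMP
completions per the definitions above; the layout (tskey in cqe->res, hw bit
and tstamp type in cqe->flags, struct io_timespec in the big-CQE tail) follows
io_process_timestamp_skb() in io_uring/cmd_net.c, and the decode assumes no
flag bits are set above the type field:

    #include <linux/io_uring.h>
    #include <stdio.h>

    /* Decode one 32-byte timestamp CQE; IORING_CQE_F_MORE in cqe->flags
     * signals that more timestamps may follow on a multishot request. */
    static void print_tx_tstamp(const struct io_uring_cqe *cqe)
    {
            const struct io_timespec *ts = (const void *)cqe->big_cqe;
            unsigned int type = cqe->flags >> IORING_TIMESTAMP_TYPE_SHIFT;
            int hw = !!(cqe->flags & IORING_CQE_F_TSTAMP_HW);

            printf("tskey=%u type=%u %s %llu.%09llu\n",
                   (unsigned int)cqe->res, type, hw ? "hw" : "sw",
                   (unsigned long long)ts->tv_sec,
                   (unsigned long long)ts->tv_nsec);
    }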
diff --git a/include/uapi/linux/io_uring/mock_file.h b/include/uapi/linux/io_uring/mock_file.h
new file mode 100644
index 000000000000..debeee8e4527
--- /dev/null
+++ b/include/uapi/linux/io_uring/mock_file.h
@@ -0,0 +1,47 @@
+#ifndef LINUX_IO_URING_MOCK_FILE_H
+#define LINUX_IO_URING_MOCK_FILE_H
+
+#include <linux/types.h>
+
+enum {
+ IORING_MOCK_FEAT_CMD_COPY,
+ IORING_MOCK_FEAT_RW_ZERO,
+ IORING_MOCK_FEAT_RW_NOWAIT,
+ IORING_MOCK_FEAT_RW_ASYNC,
+ IORING_MOCK_FEAT_POLL,
+
+ IORING_MOCK_FEAT_END,
+};
+
+struct io_uring_mock_probe {
+ __u64 features;
+ __u64 __resv[9];
+};
+
+enum {
+ IORING_MOCK_CREATE_F_SUPPORT_NOWAIT = 1,
+ IORING_MOCK_CREATE_F_POLL = 2,
+};
+
+struct io_uring_mock_create {
+ __u32 out_fd;
+ __u32 flags;
+ __u64 file_size;
+ __u64 rw_delay_ns;
+ __u64 __resv[13];
+};
+
+enum {
+ IORING_MOCK_MGR_CMD_PROBE,
+ IORING_MOCK_MGR_CMD_CREATE,
+};
+
+enum {
+ IORING_MOCK_CMD_COPY_REGBUF,
+};
+
+enum {
+ IORING_MOCK_COPY_FROM = 1,
+};
+
+#endif
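A minimal probe sketch against this header (an illustration, not a stable
recipe): it assumes the manager node appears as /dev/io_uring_mock per the
miscdevice name in io_uring/mock_file.c, that the caller has CAP_SYS_ADMIN,
and that raw IORING_OP_URING_CMD SQE setup via liburing's io_uring_prep_rw()
is acceptable:

    #include <errno.h>
    #include <fcntl.h>
    #include <string.h>
    #include <unistd.h>
    #include <liburing.h>
    #include <linux/io_uring/mock_file.h>

    static int probe_mock(struct io_uring *ring, struct io_uring_mock_probe *mp)
    {
            struct io_uring_sqe *sqe;
            struct io_uring_cqe *cqe;
            int mgr, ret;

            mgr = open("/dev/io_uring_mock", O_RDONLY);
            if (mgr < 0)
                    return -errno;

            /* io_probe_mock() rejects any non-zero input */
            memset(mp, 0, sizeof(*mp));
            sqe = io_uring_get_sqe(ring);
            io_uring_prep_rw(IORING_OP_URING_CMD, sqe, mgr, mp, sizeof(*mp), 0);
            sqe->cmd_op = IORING_MOCK_MGR_CMD_PROBE;

            io_uring_submit(ring);
            ret = io_uring_wait_cqe(ring, &cqe);
            if (!ret) {
                    ret = cqe->res;
                    io_uring_cqe_seen(ring, cqe);
            }
            close(mgr);
            return ret;
    }

On success, mp->features holds IORING_MOCK_FEAT_END, i.e. the number of
feature bits the running kernel knows about.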
diff --git a/init/Kconfig b/init/Kconfig
index 666783eb50ab..80bd36d31007 100644
--- a/init/Kconfig
+++ b/init/Kconfig
@@ -1794,7 +1794,7 @@ config IO_URING
config GCOV_PROFILE_URING
bool "Enable GCOV profiling on the io_uring subsystem"
- depends on GCOV_KERNEL
+ depends on IO_URING && GCOV_KERNEL
help
Enable GCOV profiling on the io_uring subsystem, to facilitate
code coverage testing.
@@ -1805,6 +1805,17 @@ config GCOV_PROFILE_URING
the io_uring subsystem, hence this should only be enabled for
specific test purposes.
+config IO_URING_MOCK_FILE
+ tristate "Enable io_uring mock files (Experimental)" if EXPERT
+ default n
+ depends on IO_URING
+ help
Enable mock files for io_uring subsystem testing. The ABI might
+ still change, so it's still experimental and should only be enabled
+ for specific test purposes.
+
+ If unsure, say N.
+
config ADVISE_SYSCALLS
bool "Enable madvise/fadvise syscalls" if EXPERT
default y
diff --git a/io_uring/Makefile b/io_uring/Makefile
index d97c6b51d584..b3f1bd492804 100644
--- a/io_uring/Makefile
+++ b/io_uring/Makefile
@@ -21,3 +21,4 @@ obj-$(CONFIG_EPOLL) += epoll.o
obj-$(CONFIG_NET_RX_BUSY_POLL) += napi.o
obj-$(CONFIG_NET) += net.o cmd_net.o
obj-$(CONFIG_PROC_FS) += fdinfo.o
+obj-$(CONFIG_IO_URING_MOCK_FILE) += mock_file.o
diff --git a/io_uring/cmd_net.c b/io_uring/cmd_net.c
index e99170c7d41a..3866fe6ff541 100644
--- a/io_uring/cmd_net.c
+++ b/io_uring/cmd_net.c
@@ -1,5 +1,6 @@
#include <asm/ioctls.h>
#include <linux/io_uring/net.h>
+#include <linux/errqueue.h>
#include <net/sock.h>
#include "uring_cmd.h"
@@ -51,6 +52,85 @@ static inline int io_uring_cmd_setsockopt(struct socket *sock,
optlen);
}
+static bool io_process_timestamp_skb(struct io_uring_cmd *cmd, struct sock *sk,
+ struct sk_buff *skb, unsigned issue_flags)
+{
+ struct sock_exterr_skb *serr = SKB_EXT_ERR(skb);
+ struct io_uring_cqe cqe[2];
+ struct io_timespec *iots;
+ struct timespec64 ts;
+ u32 tstype, tskey;
+ int ret;
+
+ BUILD_BUG_ON(sizeof(struct io_uring_cqe) != sizeof(struct io_timespec));
+
+ ret = skb_get_tx_timestamp(skb, sk, &ts);
+ if (ret < 0)
+ return false;
+
+ tskey = serr->ee.ee_data;
+ tstype = serr->ee.ee_info;
+
+ cqe->user_data = 0;
+ cqe->res = tskey;
+ cqe->flags = IORING_CQE_F_MORE;
+ cqe->flags |= tstype << IORING_TIMESTAMP_TYPE_SHIFT;
+ if (ret == SOF_TIMESTAMPING_TX_HARDWARE)
+ cqe->flags |= IORING_CQE_F_TSTAMP_HW;
+
+ iots = (struct io_timespec *)&cqe[1];
+ iots->tv_sec = ts.tv_sec;
+ iots->tv_nsec = ts.tv_nsec;
+ return io_uring_cmd_post_mshot_cqe32(cmd, issue_flags, cqe);
+}
+
+static int io_uring_cmd_timestamp(struct socket *sock,
+ struct io_uring_cmd *cmd,
+ unsigned int issue_flags)
+{
+ struct sock *sk = sock->sk;
+ struct sk_buff_head *q = &sk->sk_error_queue;
+ struct sk_buff *skb, *tmp;
+ struct sk_buff_head list;
+ int ret;
+
+ if (!(issue_flags & IO_URING_F_CQE32))
+ return -EINVAL;
+ ret = io_cmd_poll_multishot(cmd, issue_flags, EPOLLERR);
+ if (unlikely(ret))
+ return ret;
+
+ if (skb_queue_empty_lockless(q))
+ return -EAGAIN;
+ __skb_queue_head_init(&list);
+
+ scoped_guard(spinlock_irq, &q->lock) {
+ skb_queue_walk_safe(q, skb, tmp) {
+ /* don't support skbs with payload */
+ if (!skb_has_tx_timestamp(skb, sk) || skb->len)
+ continue;
+ __skb_unlink(skb, q);
+ __skb_queue_tail(&list, skb);
+ }
+ }
+
+ while (1) {
+ skb = skb_peek(&list);
+ if (!skb)
+ break;
+ if (!io_process_timestamp_skb(cmd, sk, skb, issue_flags))
+ break;
+ __skb_dequeue(&list);
+ consume_skb(skb);
+ }
+
+ if (!skb_queue_empty(&list)) {
+ scoped_guard(spinlock_irqsave, &q->lock)
+ skb_queue_splice(&list, q);
+ }
+ return -EAGAIN;
+}
+
int io_uring_cmd_sock(struct io_uring_cmd *cmd, unsigned int issue_flags)
{
struct socket *sock = cmd->file->private_data;
@@ -76,6 +156,8 @@ int io_uring_cmd_sock(struct io_uring_cmd *cmd, unsigned int issue_flags)
return io_uring_cmd_getsockopt(sock, cmd, issue_flags);
case SOCKET_URING_OP_SETSOCKOPT:
return io_uring_cmd_setsockopt(sock, cmd, issue_flags);
+ case SOCKET_URING_OP_TX_TIMESTAMP:
+ return io_uring_cmd_timestamp(sock, cmd, issue_flags);
default:
return -EOPNOTSUPP;
}
diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c
index 5111ec040c53..4ef69dd58734 100644
--- a/io_uring/io_uring.c
+++ b/io_uring/io_uring.c
@@ -114,11 +114,11 @@
#define IO_REQ_LINK_FLAGS (REQ_F_LINK | REQ_F_HARDLINK)
#define IO_REQ_CLEAN_FLAGS (REQ_F_BUFFER_SELECTED | REQ_F_NEED_CLEANUP | \
- REQ_F_POLLED | REQ_F_INFLIGHT | REQ_F_CREDS | \
- REQ_F_ASYNC_DATA)
+ REQ_F_INFLIGHT | REQ_F_CREDS | REQ_F_ASYNC_DATA)
#define IO_REQ_CLEAN_SLOW_FLAGS (REQ_F_REFCOUNT | IO_REQ_LINK_FLAGS | \
- REQ_F_REISSUE | IO_REQ_CLEAN_FLAGS)
+ REQ_F_REISSUE | REQ_F_POLLED | \
+ IO_REQ_CLEAN_FLAGS)
#define IO_TCTX_REFS_CACHE_NR (1U << 10)
@@ -147,7 +147,7 @@ static bool io_uring_try_cancel_requests(struct io_ring_ctx *ctx,
bool cancel_all,
bool is_sqpoll_thread);
-static void io_queue_sqe(struct io_kiocb *req);
+static void io_queue_sqe(struct io_kiocb *req, unsigned int extra_flags);
static void __io_req_caches_free(struct io_ring_ctx *ctx);
static __read_mostly DEFINE_STATIC_KEY_FALSE(io_key_has_sqarray);
@@ -392,11 +392,6 @@ static void io_clean_op(struct io_kiocb *req)
if (def->cleanup)
def->cleanup(req);
}
- if ((req->flags & REQ_F_POLLED) && req->apoll) {
- kfree(req->apoll->double_poll);
- kfree(req->apoll);
- req->apoll = NULL;
- }
if (req->flags & REQ_F_INFLIGHT)
atomic_dec(&req->tctx->inflight_tracked);
if (req->flags & REQ_F_CREDS)
@@ -793,6 +788,21 @@ bool io_cqe_cache_refill(struct io_ring_ctx *ctx, bool overflow)
return true;
}
+static bool io_fill_cqe_aux32(struct io_ring_ctx *ctx,
+ struct io_uring_cqe src_cqe[2])
+{
+ struct io_uring_cqe *cqe;
+
+ if (WARN_ON_ONCE(!(ctx->flags & IORING_SETUP_CQE32)))
+ return false;
+ if (unlikely(!io_get_cqe(ctx, &cqe)))
+ return false;
+
+ memcpy(cqe, src_cqe, 2 * sizeof(*cqe));
+ trace_io_uring_complete(ctx, NULL, cqe);
+ return true;
+}
+
static bool io_fill_cqe_aux(struct io_ring_ctx *ctx, u64 user_data, s32 res,
u32 cflags)
{
@@ -904,6 +914,31 @@ bool io_req_post_cqe(struct io_kiocb *req, s32 res, u32 cflags)
return posted;
}
+/*
+ * A helper for multishot requests posting additional CQEs.
+ * Should only be used from a task_work including IO_URING_F_MULTISHOT.
+ */
+bool io_req_post_cqe32(struct io_kiocb *req, struct io_uring_cqe cqe[2])
+{
+ struct io_ring_ctx *ctx = req->ctx;
+ bool posted;
+
+ lockdep_assert(!io_wq_current_is_worker());
+ lockdep_assert_held(&ctx->uring_lock);
+
+ cqe[0].user_data = req->cqe.user_data;
+ if (!ctx->lockless_cq) {
+ spin_lock(&ctx->completion_lock);
+ posted = io_fill_cqe_aux32(ctx, cqe);
+ spin_unlock(&ctx->completion_lock);
+ } else {
+ posted = io_fill_cqe_aux32(ctx, cqe);
+ }
+
+ ctx->submit_state.cq_flush = true;
+ return posted;
+}
+
static void io_req_complete_post(struct io_kiocb *req, unsigned issue_flags)
{
struct io_ring_ctx *ctx = req->ctx;
@@ -1377,7 +1412,7 @@ void io_req_task_submit(struct io_kiocb *req, io_tw_token_t tw)
else if (req->flags & REQ_F_FORCE_ASYNC)
io_queue_iowq(req);
else
- io_queue_sqe(req);
+ io_queue_sqe(req, 0);
}
void io_req_task_queue_fail(struct io_kiocb *req, int ret)
@@ -1938,14 +1973,34 @@ struct file *io_file_get_normal(struct io_kiocb *req, int fd)
return file;
}
-static void io_queue_async(struct io_kiocb *req, int ret)
+static int io_req_sqe_copy(struct io_kiocb *req, unsigned int issue_flags)
+{
+ const struct io_cold_def *def = &io_cold_defs[req->opcode];
+
+ if (req->flags & REQ_F_SQE_COPIED)
+ return 0;
+ req->flags |= REQ_F_SQE_COPIED;
+ if (!def->sqe_copy)
+ return 0;
+ if (WARN_ON_ONCE(!(issue_flags & IO_URING_F_INLINE)))
+ return -EFAULT;
+ def->sqe_copy(req);
+ return 0;
+}
+
+static void io_queue_async(struct io_kiocb *req, unsigned int issue_flags, int ret)
__must_hold(&req->ctx->uring_lock)
{
if (ret != -EAGAIN || (req->flags & REQ_F_NOWAIT)) {
+fail:
io_req_defer_failed(req, ret);
return;
}
+ ret = io_req_sqe_copy(req, issue_flags);
+ if (unlikely(ret))
+ goto fail;
+
switch (io_arm_poll_handler(req, 0)) {
case IO_APOLL_READY:
io_kbuf_recycle(req, 0);
@@ -1960,19 +2015,21 @@ static void io_queue_async(struct io_kiocb *req, int ret)
}
}
-static inline void io_queue_sqe(struct io_kiocb *req)
+static inline void io_queue_sqe(struct io_kiocb *req, unsigned int extra_flags)
__must_hold(&req->ctx->uring_lock)
{
+ unsigned int issue_flags = IO_URING_F_NONBLOCK |
+ IO_URING_F_COMPLETE_DEFER | extra_flags;
int ret;
- ret = io_issue_sqe(req, IO_URING_F_NONBLOCK|IO_URING_F_COMPLETE_DEFER);
+ ret = io_issue_sqe(req, issue_flags);
/*
* We async punt it if the file wasn't marked NOWAIT, or if the file
* doesn't support non-blocking read/write attempts
*/
if (unlikely(ret))
- io_queue_async(req, ret);
+ io_queue_async(req, issue_flags, ret);
}
static void io_queue_sqe_fallback(struct io_kiocb *req)
@@ -1987,6 +2044,8 @@ static void io_queue_sqe_fallback(struct io_kiocb *req)
req->flags |= REQ_F_LINK;
io_req_defer_failed(req, req->cqe.res);
} else {
+ /* can't fail with IO_URING_F_INLINE */
+ io_req_sqe_copy(req, IO_URING_F_INLINE);
if (unlikely(req->ctx->drain_active))
io_drain_req(req);
else
@@ -2198,6 +2257,7 @@ static inline int io_submit_sqe(struct io_ring_ctx *ctx, struct io_kiocb *req,
*/
if (unlikely(link->head)) {
trace_io_uring_link(req, link->last);
+ io_req_sqe_copy(req, IO_URING_F_INLINE);
link->last->link = req;
link->last = req;
@@ -2221,7 +2281,7 @@ fallback:
return 0;
}
- io_queue_sqe(req);
+ io_queue_sqe(req, IO_URING_F_INLINE);
return 0;
}
diff --git a/io_uring/io_uring.h b/io_uring/io_uring.h
index 66c1ca73f55e..abc6de227f74 100644
--- a/io_uring/io_uring.h
+++ b/io_uring/io_uring.h
@@ -81,6 +81,7 @@ void io_req_defer_failed(struct io_kiocb *req, s32 res);
bool io_post_aux_cqe(struct io_ring_ctx *ctx, u64 user_data, s32 res, u32 cflags);
void io_add_aux_cqe(struct io_ring_ctx *ctx, u64 user_data, s32 res, u32 cflags);
bool io_req_post_cqe(struct io_kiocb *req, s32 res, u32 cflags);
+bool io_req_post_cqe32(struct io_kiocb *req, struct io_uring_cqe src_cqe[2]);
void __io_commit_cqring_flush(struct io_ring_ctx *ctx);
void io_req_track_inflight(struct io_kiocb *req);
@@ -293,11 +294,22 @@ static inline void io_commit_cqring(struct io_ring_ctx *ctx)
smp_store_release(&ctx->rings->cq.tail, ctx->cached_cq_tail);
}
+static inline void __io_wq_wake(struct wait_queue_head *wq)
+{
+ /*
+ * Pass in EPOLLIN|EPOLL_URING_WAKE as the poll wakeup key. The latter
+ * set in the mask so that if we recurse back into our own poll
+ * waitqueue handlers, we know we have a dependency between eventfd or
+ * epoll and should terminate multishot poll at that point.
+ */
+ if (wq_has_sleeper(wq))
+ __wake_up(wq, TASK_NORMAL, 0, poll_to_key(EPOLL_URING_WAKE | EPOLLIN));
+}
+
static inline void io_poll_wq_wake(struct io_ring_ctx *ctx)
{
- if (wq_has_sleeper(&ctx->poll_wq))
- __wake_up(&ctx->poll_wq, TASK_NORMAL, 0,
- poll_to_key(EPOLL_URING_WAKE | EPOLLIN));
+ __io_wq_wake(&ctx->poll_wq);
}
static inline void io_cqring_wake(struct io_ring_ctx *ctx)
@@ -306,15 +318,9 @@ static inline void io_cqring_wake(struct io_ring_ctx *ctx)
* Trigger waitqueue handler on all waiters on our waitqueue. This
* won't necessarily wake up all the tasks, io_should_wake() will make
* that decision.
- *
- * Pass in EPOLLIN|EPOLL_URING_WAKE as the poll wakeup key. The latter
- * set in the mask so that if we recurse back into our own poll
- * waitqueue handlers, we know we have a dependency between eventfd or
- * epoll and should terminate multishot poll at that point.
*/
- if (wq_has_sleeper(&ctx->cq_wait))
- __wake_up(&ctx->cq_wait, TASK_NORMAL, 0,
- poll_to_key(EPOLL_URING_WAKE | EPOLLIN));
+
+ __io_wq_wake(&ctx->cq_wait);
}
static inline bool io_sqring_full(struct io_ring_ctx *ctx)
diff --git a/io_uring/mock_file.c b/io_uring/mock_file.c
new file mode 100644
index 000000000000..45d3735b2708
--- /dev/null
+++ b/io_uring/mock_file.c
@@ -0,0 +1,363 @@
+#include <linux/device.h>
+#include <linux/init.h>
+#include <linux/kernel.h>
+#include <linux/miscdevice.h>
+#include <linux/module.h>
+#include <linux/anon_inodes.h>
+#include <linux/ktime.h>
+#include <linux/hrtimer.h>
+#include <linux/poll.h>
+
+#include <linux/io_uring/cmd.h>
+#include <linux/io_uring_types.h>
+#include <uapi/linux/io_uring/mock_file.h>
+
+struct io_mock_iocb {
+ struct kiocb *iocb;
+ struct hrtimer timer;
+ int res;
+};
+
+struct io_mock_file {
+ size_t size;
+ u64 rw_delay_ns;
+ bool pollable;
+ struct wait_queue_head poll_wq;
+};
+
+#define IO_VALID_COPY_CMD_FLAGS IORING_MOCK_COPY_FROM
+
+static int io_copy_regbuf(struct iov_iter *reg_iter, void __user *ubuf)
+{
+ size_t ret, copied = 0;
+ size_t buflen = PAGE_SIZE;
+ void *tmp_buf;
+
+ tmp_buf = kzalloc(buflen, GFP_KERNEL);
+ if (!tmp_buf)
+ return -ENOMEM;
+
+ while (iov_iter_count(reg_iter)) {
+ size_t len = min(iov_iter_count(reg_iter), buflen);
+
+ if (iov_iter_rw(reg_iter) == ITER_SOURCE) {
+ ret = copy_from_iter(tmp_buf, len, reg_iter);
+ if (ret <= 0)
+ break;
+ if (copy_to_user(ubuf, tmp_buf, ret))
+ break;
+ } else {
+ if (copy_from_user(tmp_buf, ubuf, len))
+ break;
+ ret = copy_to_iter(tmp_buf, len, reg_iter);
+ if (ret <= 0)
+ break;
+ }
+ ubuf += ret;
+ copied += ret;
+ }
+
+ kfree(tmp_buf);
+ return copied;
+}
+
+static int io_cmd_copy_regbuf(struct io_uring_cmd *cmd, unsigned int issue_flags)
+{
+ const struct io_uring_sqe *sqe = cmd->sqe;
+ const struct iovec __user *iovec;
+ unsigned flags, iovec_len;
+ struct iov_iter iter;
+ void __user *ubuf;
+ int dir, ret;
+
+ ubuf = u64_to_user_ptr(READ_ONCE(sqe->addr3));
+ iovec = u64_to_user_ptr(READ_ONCE(sqe->addr));
+ iovec_len = READ_ONCE(sqe->len);
+ flags = READ_ONCE(sqe->file_index);
+
+ if (unlikely(sqe->ioprio || sqe->__pad1))
+ return -EINVAL;
+ if (flags & ~IO_VALID_COPY_CMD_FLAGS)
+ return -EINVAL;
+
+ dir = (flags & IORING_MOCK_COPY_FROM) ? ITER_SOURCE : ITER_DEST;
+ ret = io_uring_cmd_import_fixed_vec(cmd, iovec, iovec_len, dir, &iter,
+ issue_flags);
+ if (ret)
+ return ret;
+ ret = io_copy_regbuf(&iter, ubuf);
+ return ret ? ret : -EFAULT;
+}
+
+static int io_mock_cmd(struct io_uring_cmd *cmd, unsigned int issue_flags)
+{
+ switch (cmd->cmd_op) {
+ case IORING_MOCK_CMD_COPY_REGBUF:
+ return io_cmd_copy_regbuf(cmd, issue_flags);
+ }
+ return -ENOTSUPP;
+}
+
+static enum hrtimer_restart io_mock_rw_timer_expired(struct hrtimer *timer)
+{
+ struct io_mock_iocb *mio = container_of(timer, struct io_mock_iocb, timer);
+ struct kiocb *iocb = mio->iocb;
+
+ WRITE_ONCE(iocb->private, NULL);
+ iocb->ki_complete(iocb, mio->res);
+ kfree(mio);
+ return HRTIMER_NORESTART;
+}
+
+static ssize_t io_mock_delay_rw(struct kiocb *iocb, size_t len)
+{
+ struct io_mock_file *mf = iocb->ki_filp->private_data;
+ struct io_mock_iocb *mio;
+
+ mio = kzalloc(sizeof(*mio), GFP_KERNEL);
+ if (!mio)
+ return -ENOMEM;
+
+ mio->iocb = iocb;
+ mio->res = len;
+ hrtimer_setup(&mio->timer, io_mock_rw_timer_expired,
+ CLOCK_MONOTONIC, HRTIMER_MODE_REL);
+ hrtimer_start(&mio->timer, ns_to_ktime(mf->rw_delay_ns),
+ HRTIMER_MODE_REL);
+ return -EIOCBQUEUED;
+}
+
+static ssize_t io_mock_read_iter(struct kiocb *iocb, struct iov_iter *to)
+{
+ struct io_mock_file *mf = iocb->ki_filp->private_data;
+ size_t len = iov_iter_count(to);
+ size_t nr_zeroed;
+
+ if (iocb->ki_pos + len > mf->size)
+ return -EINVAL;
+ nr_zeroed = iov_iter_zero(len, to);
+ if (!mf->rw_delay_ns || nr_zeroed != len)
+ return nr_zeroed;
+
+ return io_mock_delay_rw(iocb, len);
+}
+
+static ssize_t io_mock_write_iter(struct kiocb *iocb, struct iov_iter *from)
+{
+ struct io_mock_file *mf = iocb->ki_filp->private_data;
+ size_t len = iov_iter_count(from);
+
+ if (iocb->ki_pos + len > mf->size)
+ return -EINVAL;
+ if (!mf->rw_delay_ns) {
+ iov_iter_advance(from, len);
+ return len;
+ }
+
+ return io_mock_delay_rw(iocb, len);
+}
+
+static loff_t io_mock_llseek(struct file *file, loff_t offset, int whence)
+{
+ struct io_mock_file *mf = file->private_data;
+
+ return fixed_size_llseek(file, offset, whence, mf->size);
+}
+
+static __poll_t io_mock_poll(struct file *file, struct poll_table_struct *pt)
+{
+ struct io_mock_file *mf = file->private_data;
+ __poll_t mask = 0;
+
+ poll_wait(file, &mf->poll_wq, pt);
+
+ mask |= EPOLLOUT | EPOLLWRNORM;
+ mask |= EPOLLIN | EPOLLRDNORM;
+ return mask;
+}
+
+static int io_mock_release(struct inode *inode, struct file *file)
+{
+ struct io_mock_file *mf = file->private_data;
+
+ kfree(mf);
+ return 0;
+}
+
+static const struct file_operations io_mock_fops = {
+ .owner = THIS_MODULE,
+ .release = io_mock_release,
+ .uring_cmd = io_mock_cmd,
+ .read_iter = io_mock_read_iter,
+ .write_iter = io_mock_write_iter,
+ .llseek = io_mock_llseek,
+};
+
+static const struct file_operations io_mock_poll_fops = {
+ .owner = THIS_MODULE,
+ .release = io_mock_release,
+ .uring_cmd = io_mock_cmd,
+ .read_iter = io_mock_read_iter,
+ .write_iter = io_mock_write_iter,
+ .llseek = io_mock_llseek,
+ .poll = io_mock_poll,
+};
+
+#define IO_VALID_CREATE_FLAGS (IORING_MOCK_CREATE_F_SUPPORT_NOWAIT | \
+ IORING_MOCK_CREATE_F_POLL)
+
+static int io_create_mock_file(struct io_uring_cmd *cmd, unsigned int issue_flags)
+{
+ const struct file_operations *fops = &io_mock_fops;
+ const struct io_uring_sqe *sqe = cmd->sqe;
+ struct io_uring_mock_create mc, __user *uarg;
+ struct io_mock_file *mf = NULL;
+ struct file *file = NULL;
+ size_t uarg_size;
+ int fd = -1, ret;
+
+ /*
+ * It's a testing only driver that allows exercising edge cases
+ * that wouldn't be possible to hit otherwise.
+ */
+ add_taint(TAINT_TEST, LOCKDEP_STILL_OK);
+
+ uarg = u64_to_user_ptr(READ_ONCE(sqe->addr));
+ uarg_size = READ_ONCE(sqe->len);
+
+ if (sqe->ioprio || sqe->__pad1 || sqe->addr3 || sqe->file_index)
+ return -EINVAL;
+ if (uarg_size != sizeof(mc))
+ return -EINVAL;
+
+ memset(&mc, 0, sizeof(mc));
+ if (copy_from_user(&mc, uarg, uarg_size))
+ return -EFAULT;
+ if (!mem_is_zero(mc.__resv, sizeof(mc.__resv)))
+ return -EINVAL;
+ if (mc.flags & ~IO_VALID_CREATE_FLAGS)
+ return -EINVAL;
+ if (mc.file_size > SZ_1G)
+ return -EINVAL;
+ if (mc.rw_delay_ns > NSEC_PER_SEC)
+ return -EINVAL;
+
+ mf = kzalloc(sizeof(*mf), GFP_KERNEL_ACCOUNT);
+ if (!mf)
+ return -ENOMEM;
+
+ ret = fd = get_unused_fd_flags(O_RDWR | O_CLOEXEC);
+ if (fd < 0)
+ goto fail;
+
+ init_waitqueue_head(&mf->poll_wq);
+ mf->size = mc.file_size;
+ mf->rw_delay_ns = mc.rw_delay_ns;
+ if (mc.flags & IORING_MOCK_CREATE_F_POLL) {
+ fops = &io_mock_poll_fops;
+ mf->pollable = true;
+ }
+
+ file = anon_inode_create_getfile("[io_uring_mock]", fops,
+ mf, O_RDWR | O_CLOEXEC, NULL);
+ if (IS_ERR(file)) {
+ ret = PTR_ERR(file);
+ goto fail;
+ }
+
+ file->f_mode |= FMODE_READ | FMODE_CAN_READ |
+ FMODE_WRITE | FMODE_CAN_WRITE |
+ FMODE_LSEEK;
+ if (mc.flags & IORING_MOCK_CREATE_F_SUPPORT_NOWAIT)
+ file->f_mode |= FMODE_NOWAIT;
+
+ mc.out_fd = fd;
+ if (copy_to_user(uarg, &mc, uarg_size)) {
+ fput(file);
+ ret = -EFAULT;
+ goto fail;
+ }
+
+ fd_install(fd, file);
+ return 0;
+fail:
+ if (fd >= 0)
+ put_unused_fd(fd);
+ kfree(mf);
+ return ret;
+}
+
+static int io_probe_mock(struct io_uring_cmd *cmd)
+{
+ const struct io_uring_sqe *sqe = cmd->sqe;
+ struct io_uring_mock_probe mp, __user *uarg;
+ size_t uarg_size;
+
+ uarg = u64_to_user_ptr(READ_ONCE(sqe->addr));
+ uarg_size = READ_ONCE(sqe->len);
+
+ if (sqe->ioprio || sqe->__pad1 || sqe->addr3 || sqe->file_index ||
+ uarg_size != sizeof(mp))
+ return -EINVAL;
+
+ memset(&mp, 0, sizeof(mp));
+ if (copy_from_user(&mp, uarg, uarg_size))
+ return -EFAULT;
+ if (!mem_is_zero(&mp, sizeof(mp)))
+ return -EINVAL;
+
+ mp.features = IORING_MOCK_FEAT_END;
+
+ if (copy_to_user(uarg, &mp, uarg_size))
+ return -EFAULT;
+ return 0;
+}
+
+static int iou_mock_mgr_cmd(struct io_uring_cmd *cmd, unsigned int issue_flags)
+{
+ if (!capable(CAP_SYS_ADMIN))
+ return -EPERM;
+
+ switch (cmd->cmd_op) {
+ case IORING_MOCK_MGR_CMD_PROBE:
+ return io_probe_mock(cmd);
+ case IORING_MOCK_MGR_CMD_CREATE:
+ return io_create_mock_file(cmd, issue_flags);
+ }
+ return -EOPNOTSUPP;
+}
+
+static const struct file_operations iou_mock_dev_fops = {
+ .owner = THIS_MODULE,
+ .uring_cmd = iou_mock_mgr_cmd,
+};
+
+static struct miscdevice iou_mock_miscdev = {
+ .minor = MISC_DYNAMIC_MINOR,
+ .name = "io_uring_mock",
+ .fops = &iou_mock_dev_fops,
+};
+
+static int __init io_mock_init(void)
+{
+ int ret;
+
+ ret = misc_register(&iou_mock_miscdev);
+ if (ret < 0) {
+ pr_err("Could not initialize io_uring mock device\n");
+ return ret;
+ }
+ return 0;
+}
+
+static void __exit io_mock_exit(void)
+{
+ misc_deregister(&iou_mock_miscdev);
+}
+
+module_init(io_mock_init)
+module_exit(io_mock_exit)
+
+MODULE_AUTHOR("Pavel Begunkov <asml.silence@gmail.com>");
+MODULE_DESCRIPTION("io_uring mock file");
+MODULE_LICENSE("GPL");
diff --git a/io_uring/net.c b/io_uring/net.c
index bec8c6ed0a93..35585bdc59f3 100644
--- a/io_uring/net.c
+++ b/io_uring/net.c
@@ -75,15 +75,29 @@ struct io_sr_msg {
u16 flags;
/* initialised and used only by !msg send variants */
u16 buf_group;
- unsigned short retry_flags;
+ /* per-invocation mshot limit */
+ unsigned mshot_len;
+ /* overall mshot byte limit */
+ unsigned mshot_total_len;
void __user *msg_control;
/* used only for send zerocopy */
struct io_kiocb *notif;
};
+/*
+ * The UAPI flags are the lower 8 bits, as that's all sqe->ioprio will hold
+ * anyway. Use the upper 8 bits for internal uses.
+ */
enum sr_retry_flags {
- IO_SR_MSG_RETRY = 1,
- IO_SR_MSG_PARTIAL_MAP = 2,
+ IORING_RECV_RETRY = (1U << 15),
+ IORING_RECV_PARTIAL_MAP = (1U << 14),
+ IORING_RECV_MSHOT_CAP = (1U << 13),
+ IORING_RECV_MSHOT_LIM = (1U << 12),
+ IORING_RECV_MSHOT_DONE = (1U << 11),
+
+ IORING_RECV_RETRY_CLEAR = IORING_RECV_RETRY | IORING_RECV_PARTIAL_MAP,
+ IORING_RECV_NO_RETRY = IORING_RECV_RETRY | IORING_RECV_PARTIAL_MAP |
+ IORING_RECV_MSHOT_CAP | IORING_RECV_MSHOT_DONE,
};
/*
@@ -192,8 +206,8 @@ static inline void io_mshot_prep_retry(struct io_kiocb *req,
req->flags &= ~REQ_F_BL_EMPTY;
sr->done_io = 0;
- sr->retry_flags = 0;
- sr->len = 0; /* get from the provided buffer */
+ sr->flags &= ~IORING_RECV_RETRY_CLEAR;
+ sr->len = sr->mshot_len;
}
static int io_net_import_vec(struct io_kiocb *req, struct io_async_msghdr *iomsg,
@@ -402,7 +416,6 @@ int io_sendmsg_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
struct io_sr_msg *sr = io_kiocb_to_cmd(req, struct io_sr_msg);
sr->done_io = 0;
- sr->retry_flags = 0;
sr->len = READ_ONCE(sqe->len);
sr->flags = READ_ONCE(sqe->ioprio);
if (sr->flags & ~SENDMSG_FLAGS)
@@ -756,9 +769,8 @@ int io_recvmsg_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
struct io_sr_msg *sr = io_kiocb_to_cmd(req, struct io_sr_msg);
sr->done_io = 0;
- sr->retry_flags = 0;
- if (unlikely(sqe->file_index || sqe->addr2))
+ if (unlikely(sqe->addr2))
return -EINVAL;
sr->umsg = u64_to_user_ptr(READ_ONCE(sqe->addr));
@@ -783,15 +795,25 @@ int io_recvmsg_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
sr->buf_group = req->buf_index;
req->buf_list = NULL;
}
+ sr->mshot_total_len = sr->mshot_len = 0;
if (sr->flags & IORING_RECV_MULTISHOT) {
if (!(req->flags & REQ_F_BUFFER_SELECT))
return -EINVAL;
if (sr->msg_flags & MSG_WAITALL)
return -EINVAL;
- if (req->opcode == IORING_OP_RECV && sr->len)
+ if (req->opcode == IORING_OP_RECV) {
+ sr->mshot_len = sr->len;
+ sr->mshot_total_len = READ_ONCE(sqe->optlen);
+ if (sr->mshot_total_len)
+ sr->flags |= IORING_RECV_MSHOT_LIM;
+ } else if (sqe->optlen) {
return -EINVAL;
+ }
req->flags |= REQ_F_APOLL_MULTISHOT;
+ } else if (sqe->optlen) {
+ return -EINVAL;
}
+
if (sr->flags & IORING_RECVSEND_BUNDLE) {
if (req->opcode == IORING_OP_RECVMSG)
return -EINVAL;
@@ -823,13 +845,28 @@ static inline bool io_recv_finish(struct io_kiocb *req, int *ret,
if (kmsg->msg.msg_inq > 0)
cflags |= IORING_CQE_F_SOCK_NONEMPTY;
+ if (*ret > 0 && sr->flags & IORING_RECV_MSHOT_LIM) {
+ /*
+ * If sr->len hits zero, the limit has been reached. Mark
+ * mshot as finished, and flag MSHOT_DONE as well to prevent
+ * a potential bundle from being retried.
+ */
+ sr->mshot_total_len -= min_t(int, *ret, sr->mshot_total_len);
+ if (!sr->mshot_total_len) {
+ sr->flags |= IORING_RECV_MSHOT_DONE;
+ mshot_finished = true;
+ }
+ }
+
if (sr->flags & IORING_RECVSEND_BUNDLE) {
size_t this_ret = *ret - sr->done_io;
cflags |= io_put_kbufs(req, this_ret, io_bundle_nbufs(kmsg, this_ret),
issue_flags);
- if (sr->retry_flags & IO_SR_MSG_RETRY)
+ if (sr->flags & IORING_RECV_RETRY)
cflags = req->cqe.flags | (cflags & CQE_F_MASK);
+ if (sr->mshot_len && *ret >= sr->mshot_len)
+ sr->flags |= IORING_RECV_MSHOT_CAP;
/* bundle with no more immediate buffers, we're done */
if (req->flags & REQ_F_BL_EMPTY)
goto finish;
@@ -837,12 +874,13 @@ static inline bool io_recv_finish(struct io_kiocb *req, int *ret,
* If more is available AND it was a full transfer, retry and
* append to this one
*/
- if (!sr->retry_flags && kmsg->msg.msg_inq > 1 && this_ret > 0 &&
+ if (!(sr->flags & IORING_RECV_NO_RETRY) &&
+ kmsg->msg.msg_inq > 1 && this_ret > 0 &&
!iov_iter_count(&kmsg->msg.msg_iter)) {
req->cqe.flags = cflags & ~CQE_F_MASK;
sr->len = kmsg->msg.msg_inq;
sr->done_io += this_ret;
- sr->retry_flags |= IO_SR_MSG_RETRY;
+ sr->flags |= IORING_RECV_RETRY;
return false;
}
} else {
@@ -859,10 +897,13 @@ static inline bool io_recv_finish(struct io_kiocb *req, int *ret,
io_mshot_prep_retry(req, kmsg);
/* Known not-empty or unknown state, retry */
if (cflags & IORING_CQE_F_SOCK_NONEMPTY || kmsg->msg.msg_inq < 0) {
- if (sr->nr_multishot_loops++ < MULTISHOT_MAX_RETRY)
+ if (sr->nr_multishot_loops++ < MULTISHOT_MAX_RETRY &&
+ !(sr->flags & IORING_RECV_MSHOT_CAP)) {
return false;
+ }
/* mshot retries exceeded, force a requeue */
sr->nr_multishot_loops = 0;
+ sr->flags &= ~IORING_RECV_MSHOT_CAP;
if (issue_flags & IO_URING_F_MULTISHOT)
*ret = IOU_REQUEUE;
}
@@ -1075,9 +1116,14 @@ static int io_recv_buf_select(struct io_kiocb *req, struct io_async_msghdr *kmsg
arg.mode |= KBUF_MODE_FREE;
}
- if (kmsg->msg.msg_inq > 1)
- arg.max_len = min_not_zero(sr->len, kmsg->msg.msg_inq);
+ if (*len)
+ arg.max_len = *len;
+ else if (kmsg->msg.msg_inq > 1)
+ arg.max_len = min_not_zero(*len, (size_t) kmsg->msg.msg_inq);
+ /* if mshot limited, ensure we don't go over */
+ if (sr->flags & IORING_RECV_MSHOT_LIM)
+ arg.max_len = min_not_zero(arg.max_len, sr->mshot_total_len);
ret = io_buffers_peek(req, &arg);
if (unlikely(ret < 0))
return ret;
@@ -1088,7 +1134,7 @@ static int io_recv_buf_select(struct io_kiocb *req, struct io_async_msghdr *kmsg
req->flags |= REQ_F_NEED_CLEANUP;
}
if (arg.partial_map)
- sr->retry_flags |= IO_SR_MSG_PARTIAL_MAP;
+ sr->flags |= IORING_RECV_PARTIAL_MAP;
/* special case 1 vec, can be a fast path */
if (ret == 1) {
@@ -1283,7 +1329,6 @@ int io_send_zc_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
int ret;
zc->done_io = 0;
- zc->retry_flags = 0;
if (unlikely(READ_ONCE(sqe->__pad2[0]) || READ_ONCE(sqe->addr3)))
return -EINVAL;
diff --git a/io_uring/nop.c b/io_uring/nop.c
index 6ac2de761fd3..20ed0f85b1c2 100644
--- a/io_uring/nop.c
+++ b/io_uring/nop.c
@@ -20,7 +20,8 @@ struct io_nop {
};
#define NOP_FLAGS (IORING_NOP_INJECT_RESULT | IORING_NOP_FIXED_FILE | \
- IORING_NOP_FIXED_BUFFER | IORING_NOP_FILE)
+ IORING_NOP_FIXED_BUFFER | IORING_NOP_FILE | \
+ IORING_NOP_TW)
int io_nop_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
{
@@ -68,5 +69,10 @@ done:
if (ret < 0)
req_set_fail(req);
io_req_set_res(req, nop->result, 0);
+ if (nop->flags & IORING_NOP_TW) {
+ req->io_task_work.func = io_req_task_complete;
+ io_req_task_work_add(req);
+ return IOU_ISSUE_SKIP_COMPLETE;
+ }
return IOU_COMPLETE;
}
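IORING_NOP_TW forces the nop to complete via task_work rather than inline,
which is handy for exercising the deferred completion path in tests. A sketch,
assuming sqe->nop_flags is the uapi union slot that io_nop_prep() reads the
flags from:

    #include <liburing.h>

    static void queue_tw_nop(struct io_uring *ring)
    {
            struct io_uring_sqe *sqe = io_uring_get_sqe(ring);

            io_uring_prep_nop(sqe);
            sqe->nop_flags = IORING_NOP_TW;    /* complete via task_work */
    }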
diff --git a/io_uring/opdef.c b/io_uring/opdef.c
index 6de6229207a8..9568785810d9 100644
--- a/io_uring/opdef.c
+++ b/io_uring/opdef.c
@@ -760,6 +760,7 @@ const struct io_cold_def io_cold_defs[] = {
},
[IORING_OP_URING_CMD] = {
.name = "URING_CMD",
+ .sqe_copy = io_uring_cmd_sqe_copy,
.cleanup = io_uring_cmd_cleanup,
},
[IORING_OP_SEND_ZC] = {
diff --git a/io_uring/opdef.h b/io_uring/opdef.h
index 719a52104abe..c2f0907ed78c 100644
--- a/io_uring/opdef.h
+++ b/io_uring/opdef.h
@@ -38,6 +38,7 @@ struct io_issue_def {
struct io_cold_def {
const char *name;
+ void (*sqe_copy)(struct io_kiocb *);
void (*cleanup)(struct io_kiocb *);
void (*fail)(struct io_kiocb *);
};
diff --git a/io_uring/poll.c b/io_uring/poll.c
index 20e9b46a4adf..c786e587563b 100644
--- a/io_uring/poll.c
+++ b/io_uring/poll.c
@@ -667,33 +667,18 @@ static struct async_poll *io_req_alloc_apoll(struct io_kiocb *req,
return apoll;
}
-int io_arm_poll_handler(struct io_kiocb *req, unsigned issue_flags)
+int io_arm_apoll(struct io_kiocb *req, unsigned issue_flags, __poll_t mask)
{
- const struct io_issue_def *def = &io_issue_defs[req->opcode];
struct async_poll *apoll;
struct io_poll_table ipt;
- __poll_t mask = POLLPRI | POLLERR | EPOLLET;
int ret;
- if (!def->pollin && !def->pollout)
- return IO_APOLL_ABORTED;
+ mask |= EPOLLET;
if (!io_file_can_poll(req))
return IO_APOLL_ABORTED;
if (!(req->flags & REQ_F_APOLL_MULTISHOT))
mask |= EPOLLONESHOT;
- if (def->pollin) {
- mask |= EPOLLIN | EPOLLRDNORM;
-
- /* If reading from MSG_ERRQUEUE using recvmsg, ignore POLLIN */
- if (req->flags & REQ_F_CLEAR_POLLIN)
- mask &= ~EPOLLIN;
- } else {
- mask |= EPOLLOUT | EPOLLWRNORM;
- }
- if (def->poll_exclusive)
- mask |= EPOLLEXCLUSIVE;
-
apoll = io_req_alloc_apoll(req, issue_flags);
if (!apoll)
return IO_APOLL_ABORTED;
@@ -710,6 +695,31 @@ int io_arm_poll_handler(struct io_kiocb *req, unsigned issue_flags)
return IO_APOLL_OK;
}
+int io_arm_poll_handler(struct io_kiocb *req, unsigned issue_flags)
+{
+ const struct io_issue_def *def = &io_issue_defs[req->opcode];
+ __poll_t mask = POLLPRI | POLLERR;
+
+ if (!def->pollin && !def->pollout)
+ return IO_APOLL_ABORTED;
+ if (!io_file_can_poll(req))
+ return IO_APOLL_ABORTED;
+
+ if (def->pollin) {
+ mask |= EPOLLIN | EPOLLRDNORM;
+
+ /* If reading from MSG_ERRQUEUE using recvmsg, ignore POLLIN */
+ if (req->flags & REQ_F_CLEAR_POLLIN)
+ mask &= ~EPOLLIN;
+ } else {
+ mask |= EPOLLOUT | EPOLLWRNORM;
+ }
+ if (def->poll_exclusive)
+ mask |= EPOLLEXCLUSIVE;
+
+ return io_arm_apoll(req, issue_flags, mask);
+}
+
/*
* Returns true if we found and killed one or more poll requests
*/
diff --git a/io_uring/poll.h b/io_uring/poll.h
index 27e2db2ed4ae..c8438286dfa0 100644
--- a/io_uring/poll.h
+++ b/io_uring/poll.h
@@ -41,6 +41,7 @@ int io_poll_remove(struct io_kiocb *req, unsigned int issue_flags);
struct io_cancel_data;
int io_poll_cancel(struct io_ring_ctx *ctx, struct io_cancel_data *cd,
unsigned issue_flags);
+int io_arm_apoll(struct io_kiocb *req, unsigned issue_flags, __poll_t mask);
int io_arm_poll_handler(struct io_kiocb *req, unsigned issue_flags);
bool io_poll_remove_all(struct io_ring_ctx *ctx, struct io_uring_task *tctx,
bool cancel_all);
diff --git a/io_uring/rsrc.c b/io_uring/rsrc.c
index f2b31fb68992..f75f5e43fa4a 100644
--- a/io_uring/rsrc.c
+++ b/io_uring/rsrc.c
@@ -55,7 +55,7 @@ int __io_account_mem(struct user_struct *user, unsigned long nr_pages)
return 0;
}
-static void io_unaccount_mem(struct io_ring_ctx *ctx, unsigned long nr_pages)
+void io_unaccount_mem(struct io_ring_ctx *ctx, unsigned long nr_pages)
{
if (ctx->user)
__io_unaccount_mem(ctx->user, nr_pages);
@@ -64,7 +64,7 @@ static void io_unaccount_mem(struct io_ring_ctx *ctx, unsigned long nr_pages)
atomic64_sub(nr_pages, &ctx->mm_account->pinned_vm);
}
-static int io_account_mem(struct io_ring_ctx *ctx, unsigned long nr_pages)
+int io_account_mem(struct io_ring_ctx *ctx, unsigned long nr_pages)
{
int ret;
@@ -138,8 +138,10 @@ static void io_free_imu(struct io_ring_ctx *ctx, struct io_mapped_ubuf *imu)
static void io_buffer_unmap(struct io_ring_ctx *ctx, struct io_mapped_ubuf *imu)
{
- if (!refcount_dec_and_test(&imu->refs))
- return;
+ if (unlikely(refcount_read(&imu->refs) > 1)) {
+ if (!refcount_dec_and_test(&imu->refs))
+ return;
+ }
if (imu->acct_pages)
io_unaccount_mem(ctx, imu->acct_pages);
diff --git a/io_uring/rsrc.h b/io_uring/rsrc.h
index 25e7e998dcfd..a3ca6ba66596 100644
--- a/io_uring/rsrc.h
+++ b/io_uring/rsrc.h
@@ -120,6 +120,8 @@ int io_files_update(struct io_kiocb *req, unsigned int issue_flags);
int io_files_update_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe);
int __io_account_mem(struct user_struct *user, unsigned long nr_pages);
+int io_account_mem(struct io_ring_ctx *ctx, unsigned long nr_pages);
+void io_unaccount_mem(struct io_ring_ctx *ctx, unsigned long nr_pages);
static inline void __io_unaccount_mem(struct user_struct *user,
unsigned long nr_pages)
diff --git a/io_uring/rw.c b/io_uring/rw.c
index 710d8cd53ebb..52a5b950b2e5 100644
--- a/io_uring/rw.c
+++ b/io_uring/rw.c
@@ -288,7 +288,7 @@ static int __io_prep_rw(struct io_kiocb *req, const struct io_uring_sqe *sqe,
rw->addr = READ_ONCE(sqe->addr);
rw->len = READ_ONCE(sqe->len);
- rw->flags = READ_ONCE(sqe->rw_flags);
+ rw->flags = (__force rwf_t) READ_ONCE(sqe->rw_flags);
attr_type_mask = READ_ONCE(sqe->attr_type_mask);
if (attr_type_mask) {
diff --git a/io_uring/uring_cmd.c b/io_uring/uring_cmd.c
index 929cad6ee326..053bac89b6c0 100644
--- a/io_uring/uring_cmd.c
+++ b/io_uring/uring_cmd.c
@@ -12,6 +12,7 @@
#include "alloc_cache.h"
#include "rsrc.h"
#include "uring_cmd.h"
+#include "poll.h"
void io_cmd_cache_free(const void *entry)
{
@@ -25,12 +26,6 @@ static void io_req_uring_cleanup(struct io_kiocb *req, unsigned int issue_flags)
{
struct io_uring_cmd *ioucmd = io_kiocb_to_cmd(req, struct io_uring_cmd);
struct io_async_cmd *ac = req->async_data;
- struct io_uring_cmd_data *cache = &ac->data;
-
- if (cache->op_data) {
- kfree(cache->op_data);
- cache->op_data = NULL;
- }
if (issue_flags & IO_URING_F_UNLOCKED)
return;
@@ -39,7 +34,7 @@ static void io_req_uring_cleanup(struct io_kiocb *req, unsigned int issue_flags)
if (ac->vec.nr > IO_VEC_CACHE_SOFT_CAP)
io_vec_free(&ac->vec);
- if (io_alloc_cache_put(&req->ctx->cmd_cache, cache)) {
+ if (io_alloc_cache_put(&req->ctx->cmd_cache, ac)) {
ioucmd->sqe = NULL;
req->async_data = NULL;
req->flags &= ~(REQ_F_ASYNC_DATA|REQ_F_NEED_CLEANUP);
@@ -136,6 +131,9 @@ void __io_uring_cmd_do_in_task(struct io_uring_cmd *ioucmd,
{
struct io_kiocb *req = cmd_to_io_kiocb(ioucmd);
+ if (WARN_ON_ONCE(req->flags & REQ_F_APOLL_MULTISHOT))
+ return;
+
ioucmd->task_work_cb = task_work_cb;
req->io_task_work.func = io_uring_cmd_work;
__io_req_task_work_add(req, flags);
@@ -158,6 +156,9 @@ void io_uring_cmd_done(struct io_uring_cmd *ioucmd, ssize_t ret, u64 res2,
{
struct io_kiocb *req = cmd_to_io_kiocb(ioucmd);
+ if (WARN_ON_ONCE(req->flags & REQ_F_APOLL_MULTISHOT))
+ return;
+
io_uring_cmd_del_cancelable(ioucmd, issue_flags);
if (ret < 0)
@@ -181,35 +182,10 @@ void io_uring_cmd_done(struct io_uring_cmd *ioucmd, ssize_t ret, u64 res2,
}
EXPORT_SYMBOL_GPL(io_uring_cmd_done);
-static int io_uring_cmd_prep_setup(struct io_kiocb *req,
- const struct io_uring_sqe *sqe)
-{
- struct io_uring_cmd *ioucmd = io_kiocb_to_cmd(req, struct io_uring_cmd);
- struct io_async_cmd *ac;
-
- /* see io_uring_cmd_get_async_data() */
- BUILD_BUG_ON(offsetof(struct io_async_cmd, data) != 0);
-
- ac = io_uring_alloc_async_data(&req->ctx->cmd_cache, req);
- if (!ac)
- return -ENOMEM;
- ac->data.op_data = NULL;
-
- /*
- * Unconditionally cache the SQE for now - this is only needed for
- * requests that go async, but prep handlers must ensure that any
- * sqe data is stable beyond prep. Since uring_cmd is special in
- * that it doesn't read in per-op data, play it safe and ensure that
- * any SQE data is stable beyond prep. This can later get relaxed.
- */
- memcpy(ac->sqes, sqe, uring_sqe_size(req->ctx));
- ioucmd->sqe = ac->sqes;
- return 0;
-}
-
int io_uring_cmd_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
{
struct io_uring_cmd *ioucmd = io_kiocb_to_cmd(req, struct io_uring_cmd);
+ struct io_async_cmd *ac;
if (sqe->__pad1)
return -EINVAL;
@@ -223,7 +199,23 @@ int io_uring_cmd_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
ioucmd->cmd_op = READ_ONCE(sqe->cmd_op);
- return io_uring_cmd_prep_setup(req, sqe);
+ ac = io_uring_alloc_async_data(&req->ctx->cmd_cache, req);
+ if (!ac)
+ return -ENOMEM;
+ ioucmd->sqe = sqe;
+ return 0;
+}
+
+void io_uring_cmd_sqe_copy(struct io_kiocb *req)
+{
+ struct io_uring_cmd *ioucmd = io_kiocb_to_cmd(req, struct io_uring_cmd);
+ struct io_async_cmd *ac = req->async_data;
+
+ /* Should not happen, as REQ_F_SQE_COPIED covers this */
+ if (WARN_ON_ONCE(ioucmd->sqe == ac->sqes))
+ return;
+ memcpy(ac->sqes, ioucmd->sqe, uring_sqe_size(req->ctx));
+ ioucmd->sqe = ac->sqes;
}
int io_uring_cmd(struct io_kiocb *req, unsigned int issue_flags)
@@ -259,7 +251,11 @@ int io_uring_cmd(struct io_kiocb *req, unsigned int issue_flags)
}
ret = file->f_op->uring_cmd(ioucmd, issue_flags);
- if (ret == -EAGAIN || ret == -EIOCBQUEUED)
+ if (ret == -EAGAIN) {
+ ioucmd->flags |= IORING_URING_CMD_REISSUE;
+ return ret;
+ }
+ if (ret == -EIOCBQUEUED)
return ret;
if (ret < 0)
req_set_fail(req);
@@ -310,3 +306,30 @@ void io_uring_cmd_issue_blocking(struct io_uring_cmd *ioucmd)
io_req_queue_iowq(req);
}
+
+int io_cmd_poll_multishot(struct io_uring_cmd *cmd,
+ unsigned int issue_flags, __poll_t mask)
+{
+ struct io_kiocb *req = cmd_to_io_kiocb(cmd);
+ int ret;
+
+ if (likely(req->flags & REQ_F_APOLL_MULTISHOT))
+ return 0;
+
+ req->flags |= REQ_F_APOLL_MULTISHOT;
+ mask &= ~EPOLLONESHOT;
+
+ ret = io_arm_apoll(req, issue_flags, mask);
+ return ret == IO_APOLL_OK ? -EIOCBQUEUED : -ECANCELED;
+}
+
+bool io_uring_cmd_post_mshot_cqe32(struct io_uring_cmd *cmd,
+ unsigned int issue_flags,
+ struct io_uring_cqe cqe[2])
+{
+ struct io_kiocb *req = cmd_to_io_kiocb(cmd);
+
+ if (WARN_ON_ONCE(!(issue_flags & IO_URING_F_MULTISHOT)))
+ return false;
+ return io_req_post_cqe32(req, cqe);
+}
diff --git a/io_uring/uring_cmd.h b/io_uring/uring_cmd.h
index e6a5142c890e..041aef8a8aa3 100644
--- a/io_uring/uring_cmd.h
+++ b/io_uring/uring_cmd.h
@@ -4,16 +4,23 @@
#include <linux/io_uring_types.h>
struct io_async_cmd {
- struct io_uring_cmd_data data;
struct iou_vec vec;
struct io_uring_sqe sqes[2];
};
int io_uring_cmd(struct io_kiocb *req, unsigned int issue_flags);
int io_uring_cmd_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe);
+void io_uring_cmd_sqe_copy(struct io_kiocb *req);
void io_uring_cmd_cleanup(struct io_kiocb *req);
bool io_uring_try_cancel_uring_cmd(struct io_ring_ctx *ctx,
struct io_uring_task *tctx, bool cancel_all);
+bool io_uring_cmd_post_mshot_cqe32(struct io_uring_cmd *cmd,
+ unsigned int issue_flags,
+ struct io_uring_cqe cqe[2]);
+
void io_cmd_cache_free(const void *entry);
+
+int io_cmd_poll_multishot(struct io_uring_cmd *cmd,
+ unsigned int issue_flags, __poll_t mask);
diff --git a/io_uring/zcrx.c b/io_uring/zcrx.c
index 4a7011c799f0..e5ff49f3425e 100644
--- a/io_uring/zcrx.c
+++ b/io_uring/zcrx.c
@@ -44,9 +44,40 @@ static inline struct page *io_zcrx_iov_page(const struct net_iov *niov)
{
struct io_zcrx_area *area = io_zcrx_iov_to_area(niov);
+ lockdep_assert(!area->mem.is_dmabuf);
+
return area->mem.pages[net_iov_idx(niov)];
}
+static int io_populate_area_dma(struct io_zcrx_ifq *ifq,
+ struct io_zcrx_area *area,
+ struct sg_table *sgt, unsigned long off)
+{
+ struct scatterlist *sg;
+ unsigned i, niov_idx = 0;
+
+ for_each_sgtable_dma_sg(sgt, sg, i) {
+ dma_addr_t dma = sg_dma_address(sg);
+ unsigned long sg_len = sg_dma_len(sg);
+ unsigned long sg_off = min(sg_len, off);
+
+ off -= sg_off;
+ sg_len -= sg_off;
+ dma += sg_off;
+
+ while (sg_len && niov_idx < area->nia.num_niovs) {
+ struct net_iov *niov = &area->nia.niovs[niov_idx];
+
+ if (net_mp_niov_set_dma_addr(niov, dma))
+ return -EFAULT;
+ sg_len -= PAGE_SIZE;
+ dma += PAGE_SIZE;
+ niov_idx++;
+ }
+ }
+ return 0;
+}
+
static void io_release_dmabuf(struct io_zcrx_mem *mem)
{
if (!IS_ENABLED(CONFIG_DMA_SHARED_BUFFER))
@@ -123,33 +154,27 @@ err:
static int io_zcrx_map_area_dmabuf(struct io_zcrx_ifq *ifq, struct io_zcrx_area *area)
{
- unsigned long off = area->mem.dmabuf_offset;
- struct scatterlist *sg;
- unsigned i, niov_idx = 0;
-
if (!IS_ENABLED(CONFIG_DMA_SHARED_BUFFER))
return -EINVAL;
+ return io_populate_area_dma(ifq, area, area->mem.sgt,
+ area->mem.dmabuf_offset);
+}
- for_each_sgtable_dma_sg(area->mem.sgt, sg, i) {
- dma_addr_t dma = sg_dma_address(sg);
- unsigned long sg_len = sg_dma_len(sg);
- unsigned long sg_off = min(sg_len, off);
-
- off -= sg_off;
- sg_len -= sg_off;
- dma += sg_off;
+static unsigned long io_count_account_pages(struct page **pages, unsigned nr_pages)
+{
+ struct folio *last_folio = NULL;
+ unsigned long res = 0;
+ int i;
- while (sg_len && niov_idx < area->nia.num_niovs) {
- struct net_iov *niov = &area->nia.niovs[niov_idx];
+ for (i = 0; i < nr_pages; i++) {
+ struct folio *folio = page_folio(pages[i]);
- if (net_mp_niov_set_dma_addr(niov, dma))
- return 0;
- sg_len -= PAGE_SIZE;
- dma += PAGE_SIZE;
- niov_idx++;
- }
+ if (folio == last_folio)
+ continue;
+ last_folio = folio;
+ res += 1UL << folio_order(folio);
}
- return niov_idx;
+ return res;
}
static int io_import_umem(struct io_zcrx_ifq *ifq,
@@ -157,7 +182,7 @@ static int io_import_umem(struct io_zcrx_ifq *ifq,
struct io_uring_zcrx_area_reg *area_reg)
{
struct page **pages;
- int nr_pages;
+ int nr_pages, ret;
if (area_reg->dmabuf_fd)
return -EINVAL;
@@ -168,10 +193,23 @@ static int io_import_umem(struct io_zcrx_ifq *ifq,
if (IS_ERR(pages))
return PTR_ERR(pages);
+ ret = sg_alloc_table_from_pages(&mem->page_sg_table, pages, nr_pages,
+ 0, nr_pages << PAGE_SHIFT,
+ GFP_KERNEL_ACCOUNT);
+ if (ret) {
+ unpin_user_pages(pages, nr_pages);
+ return ret;
+ }
+
+ mem->account_pages = io_count_account_pages(pages, nr_pages);
+ ret = io_account_mem(ifq->ctx, mem->account_pages);
+ if (ret < 0)
+ mem->account_pages = 0;
+
mem->pages = pages;
mem->nr_folios = nr_pages;
mem->size = area_reg->len;
- return 0;
+ return ret;
}
static void io_release_area_mem(struct io_zcrx_mem *mem)
@@ -182,6 +220,7 @@ static void io_release_area_mem(struct io_zcrx_mem *mem)
}
if (mem->pages) {
unpin_user_pages(mem->pages, mem->nr_folios);
+ sg_free_table(&mem->page_sg_table);
kvfree(mem->pages);
}
}
@@ -203,84 +242,54 @@ static int io_import_area(struct io_zcrx_ifq *ifq,
return io_import_umem(ifq, mem, area_reg);
}
-static void io_zcrx_unmap_umem(struct io_zcrx_ifq *ifq,
- struct io_zcrx_area *area, int nr_mapped)
+static void io_zcrx_unmap_area(struct io_zcrx_ifq *ifq,
+ struct io_zcrx_area *area)
{
int i;
- for (i = 0; i < nr_mapped; i++) {
- netmem_ref netmem = net_iov_to_netmem(&area->nia.niovs[i]);
- dma_addr_t dma = page_pool_get_dma_addr_netmem(netmem);
-
- dma_unmap_page_attrs(ifq->dev, dma, PAGE_SIZE,
- DMA_FROM_DEVICE, IO_DMA_ATTR);
- }
-}
-
-static void __io_zcrx_unmap_area(struct io_zcrx_ifq *ifq,
- struct io_zcrx_area *area, int nr_mapped)
-{
- int i;
-
- if (area->mem.is_dmabuf)
- io_release_dmabuf(&area->mem);
- else
- io_zcrx_unmap_umem(ifq, area, nr_mapped);
+ guard(mutex)(&ifq->dma_lock);
+ if (!area->is_mapped)
+ return;
+ area->is_mapped = false;
for (i = 0; i < area->nia.num_niovs; i++)
net_mp_niov_set_dma_addr(&area->nia.niovs[i], 0);
-}
-static void io_zcrx_unmap_area(struct io_zcrx_ifq *ifq, struct io_zcrx_area *area)
-{
- guard(mutex)(&ifq->dma_lock);
-
- if (area->is_mapped)
- __io_zcrx_unmap_area(ifq, area, area->nia.num_niovs);
- area->is_mapped = false;
+ if (area->mem.is_dmabuf) {
+ io_release_dmabuf(&area->mem);
+ } else {
+ dma_unmap_sgtable(ifq->dev, &area->mem.page_sg_table,
+ DMA_FROM_DEVICE, IO_DMA_ATTR);
+ }
}
-static int io_zcrx_map_area_umem(struct io_zcrx_ifq *ifq, struct io_zcrx_area *area)
+static unsigned io_zcrx_map_area_umem(struct io_zcrx_ifq *ifq, struct io_zcrx_area *area)
{
- int i;
-
- for (i = 0; i < area->nia.num_niovs; i++) {
- struct net_iov *niov = &area->nia.niovs[i];
- dma_addr_t dma;
+ int ret;
- dma = dma_map_page_attrs(ifq->dev, area->mem.pages[i], 0,
- PAGE_SIZE, DMA_FROM_DEVICE, IO_DMA_ATTR);
- if (dma_mapping_error(ifq->dev, dma))
- break;
- if (net_mp_niov_set_dma_addr(niov, dma)) {
- dma_unmap_page_attrs(ifq->dev, dma, PAGE_SIZE,
- DMA_FROM_DEVICE, IO_DMA_ATTR);
- break;
- }
- }
- return i;
+ ret = dma_map_sgtable(ifq->dev, &area->mem.page_sg_table,
+ DMA_FROM_DEVICE, IO_DMA_ATTR);
+ if (ret < 0)
+ return ret;
+ return io_populate_area_dma(ifq, area, &area->mem.page_sg_table, 0);
}
static int io_zcrx_map_area(struct io_zcrx_ifq *ifq, struct io_zcrx_area *area)
{
- unsigned nr;
+ int ret;
guard(mutex)(&ifq->dma_lock);
if (area->is_mapped)
return 0;
if (area->mem.is_dmabuf)
- nr = io_zcrx_map_area_dmabuf(ifq, area);
+ ret = io_zcrx_map_area_dmabuf(ifq, area);
else
- nr = io_zcrx_map_area_umem(ifq, area);
-
- if (nr != area->nia.num_niovs) {
- __io_zcrx_unmap_area(ifq, area, nr);
- return -EINVAL;
- }
+ ret = io_zcrx_map_area_umem(ifq, area);
- area->is_mapped = true;
- return 0;
+ if (ret == 0)
+ area->is_mapped = true;
+ return ret;
}
static void io_zcrx_sync_for_device(const struct page_pool *pool,
@@ -370,10 +379,12 @@ static void io_free_rbuf_ring(struct io_zcrx_ifq *ifq)
static void io_zcrx_free_area(struct io_zcrx_area *area)
{
- if (area->ifq)
- io_zcrx_unmap_area(area->ifq, area);
+ io_zcrx_unmap_area(area->ifq, area);
io_release_area_mem(&area->mem);
+ if (area->mem.account_pages)
+ io_unaccount_mem(area->ifq->ctx, area->mem.account_pages);
+
kvfree(area->freelist);
kvfree(area->nia.niovs);
kvfree(area->user_refs);
@@ -401,6 +412,7 @@ static int io_zcrx_create_area(struct io_zcrx_ifq *ifq,
area = kzalloc(sizeof(*area), GFP_KERNEL);
if (!area)
goto err;
+ area->ifq = ifq;
ret = io_import_area(ifq, &area->mem, area_reg);
if (ret)
@@ -435,7 +447,6 @@ static int io_zcrx_create_area(struct io_zcrx_ifq *ifq,
}
area->free_count = nr_iovs;
- area->ifq = ifq;
/* we're only supporting one area per ifq for now */
area->area_id = 0;
area_reg->rq_area_token = (u64)area->area_id << IORING_ZCRX_AREA_SHIFT;
@@ -943,9 +954,54 @@ static struct net_iov *io_zcrx_alloc_fallback(struct io_zcrx_area *area)
return niov;
}
+struct io_copy_cache {
+ struct page *page;
+ unsigned long offset;
+ size_t size;
+};
+
+static ssize_t io_copy_page(struct io_copy_cache *cc, struct page *src_page,
+ unsigned int src_offset, size_t len)
+{
+ size_t copied = 0;
+
+ len = min(len, cc->size);
+
+ while (len) {
+ void *src_addr, *dst_addr;
+ struct page *dst_page = cc->page;
+ unsigned dst_offset = cc->offset;
+ size_t n = len;
+
+ if (folio_test_partial_kmap(page_folio(dst_page)) ||
+ folio_test_partial_kmap(page_folio(src_page))) {
+ dst_page = nth_page(dst_page, dst_offset / PAGE_SIZE);
+ dst_offset = offset_in_page(dst_offset);
+ src_page = nth_page(src_page, src_offset / PAGE_SIZE);
+ src_offset = offset_in_page(src_offset);
+ n = min(PAGE_SIZE - src_offset, PAGE_SIZE - dst_offset);
+ n = min(n, len);
+ }
+
+ dst_addr = kmap_local_page(dst_page) + dst_offset;
+ src_addr = kmap_local_page(src_page) + src_offset;
+
+ memcpy(dst_addr, src_addr, n);
+
+ kunmap_local(src_addr);
+ kunmap_local(dst_addr);
+
+ cc->size -= n;
+ cc->offset += n;
+ len -= n;
+ copied += n;
+ }
+ return copied;
+}
+
static ssize_t io_zcrx_copy_chunk(struct io_kiocb *req, struct io_zcrx_ifq *ifq,
- void *src_base, struct page *src_page,
- unsigned int src_offset, size_t len)
+ struct page *src_page, unsigned int src_offset,
+ size_t len)
{
struct io_zcrx_area *area = ifq->area;
size_t copied = 0;
@@ -955,11 +1011,9 @@ static ssize_t io_zcrx_copy_chunk(struct io_kiocb *req, struct io_zcrx_ifq *ifq,
return -EFAULT;
while (len) {
- size_t copy_size = min_t(size_t, PAGE_SIZE, len);
- const int dst_off = 0;
+ struct io_copy_cache cc;
struct net_iov *niov;
- struct page *dst_page;
- void *dst_addr;
+ size_t n;
niov = io_zcrx_alloc_fallback(area);
if (!niov) {
@@ -967,27 +1021,22 @@ static ssize_t io_zcrx_copy_chunk(struct io_kiocb *req, struct io_zcrx_ifq *ifq,
break;
}
- dst_page = io_zcrx_iov_page(niov);
- dst_addr = kmap_local_page(dst_page);
- if (src_page)
- src_base = kmap_local_page(src_page);
-
- memcpy(dst_addr, src_base + src_offset, copy_size);
+ cc.page = io_zcrx_iov_page(niov);
+ cc.offset = 0;
+ cc.size = PAGE_SIZE;
- if (src_page)
- kunmap_local(src_base);
- kunmap_local(dst_addr);
+ n = io_copy_page(&cc, src_page, src_offset, len);
- if (!io_zcrx_queue_cqe(req, niov, ifq, dst_off, copy_size)) {
+ if (!io_zcrx_queue_cqe(req, niov, ifq, 0, n)) {
io_zcrx_return_niov(niov);
ret = -ENOSPC;
break;
}
io_zcrx_get_niov_uref(niov);
- src_offset += copy_size;
- len -= copy_size;
- copied += copy_size;
+ src_offset += n;
+ len -= n;
+ copied += n;
}
return copied ? copied : ret;
@@ -997,19 +1046,8 @@ static int io_zcrx_copy_frag(struct io_kiocb *req, struct io_zcrx_ifq *ifq,
const skb_frag_t *frag, int off, int len)
{
struct page *page = skb_frag_page(frag);
- u32 p_off, p_len, t, copied = 0;
- int ret = 0;
- off += skb_frag_off(frag);
-
- skb_frag_foreach_page(frag, off, len,
- page, p_off, p_len, t) {
- ret = io_zcrx_copy_chunk(req, ifq, NULL, page, p_off, p_len);
- if (ret < 0)
- return copied ? copied : ret;
- copied += ret;
- }
- return copied;
+ return io_zcrx_copy_chunk(req, ifq, page, off + skb_frag_off(frag), len);
}
static int io_zcrx_recv_frag(struct io_kiocb *req, struct io_zcrx_ifq *ifq,
@@ -1066,8 +1104,9 @@ io_zcrx_recv_skb(read_descriptor_t *desc, struct sk_buff *skb,
size_t to_copy;
to_copy = min_t(size_t, skb_headlen(skb) - offset, len);
- copied = io_zcrx_copy_chunk(req, ifq, skb->data, NULL,
- offset, to_copy);
+ copied = io_zcrx_copy_chunk(req, ifq, virt_to_page(skb->data),
+ offset_in_page(skb->data) + offset,
+ to_copy);
if (copied < 0) {
ret = copied;
goto out;
diff --git a/io_uring/zcrx.h b/io_uring/zcrx.h
index 2f5e26389f22..109c4ca36434 100644
--- a/io_uring/zcrx.h
+++ b/io_uring/zcrx.h
@@ -14,6 +14,8 @@ struct io_zcrx_mem {
struct page **pages;
unsigned long nr_folios;
+ struct sg_table page_sg_table;
+ unsigned long account_pages;
struct dma_buf_attachment *attach;
struct dma_buf *dmabuf;
diff --git a/net/socket.c b/net/socket.c
index 9a0e720f0859..2cab805943c0 100644
--- a/net/socket.c
+++ b/net/socket.c
@@ -843,6 +843,52 @@ static void put_ts_pktinfo(struct msghdr *msg, struct sk_buff *skb,
sizeof(ts_pktinfo), &ts_pktinfo);
}
+bool skb_has_tx_timestamp(struct sk_buff *skb, const struct sock *sk)
+{
+ const struct sock_exterr_skb *serr = SKB_EXT_ERR(skb);
+ u32 tsflags = READ_ONCE(sk->sk_tsflags);
+
+ if (serr->ee.ee_errno != ENOMSG ||
+ serr->ee.ee_origin != SO_EE_ORIGIN_TIMESTAMPING)
+ return false;
+
+ /* software time stamp available and wanted */
+ if ((tsflags & SOF_TIMESTAMPING_SOFTWARE) && skb->tstamp)
+ return true;
+ /* hardware time stamps available and wanted */
+ return (tsflags & SOF_TIMESTAMPING_RAW_HARDWARE) &&
+ skb_hwtstamps(skb)->hwtstamp;
+}
+
+int skb_get_tx_timestamp(struct sk_buff *skb, struct sock *sk,
+ struct timespec64 *ts)
+{
+ u32 tsflags = READ_ONCE(sk->sk_tsflags);
+ ktime_t hwtstamp;
+ int if_index = 0;
+
+ if ((tsflags & SOF_TIMESTAMPING_SOFTWARE) &&
+ ktime_to_timespec64_cond(skb->tstamp, ts))
+ return SOF_TIMESTAMPING_TX_SOFTWARE;
+
+ if (!(tsflags & SOF_TIMESTAMPING_RAW_HARDWARE) ||
+ skb_is_swtx_tstamp(skb, false))
+ return -ENOENT;
+
+ if (skb_shinfo(skb)->tx_flags & SKBTX_HW_TSTAMP_NETDEV)
+ hwtstamp = get_timestamp(sk, skb, &if_index);
+ else
+ hwtstamp = skb_hwtstamps(skb)->hwtstamp;
+
+ if (tsflags & SOF_TIMESTAMPING_BIND_PHC)
+ hwtstamp = ptp_convert_timestamp(&hwtstamp,
+ READ_ONCE(sk->sk_bind_phc));
+ if (!ktime_to_timespec64_cond(hwtstamp, ts))
+ return -ENOENT;
+
+ return SOF_TIMESTAMPING_TX_HARDWARE;
+}
+
/*
* called from sock_recv_timestamp() if sock_flag(sk, SOCK_RCVTSTAMP)
*/