Diffstat (limited to 'io_uring')
-rw-r--r--   io_uring/eventfd.c |  1 +
-rw-r--r--   io_uring/kbuf.c    |  9 +++++++--
-rw-r--r--   io_uring/kbuf.h    |  8 +++++++-
-rw-r--r--   io_uring/napi.c    | 29 ++++++++++++++++++++++++-------
-rw-r--r--   io_uring/napi.h    |  8 +++++---
-rw-r--r--   io_uring/timeout.c | 35 ++++++++++++++++++++++++-------------
-rw-r--r--   io_uring/tw.c      | 12 +++++++++++-
-rw-r--r--   io_uring/wait.c    |  6 +++++-
-rw-r--r--   io_uring/zcrx.c    |  3 +--
9 files changed, 81 insertions, 30 deletions
diff --git a/io_uring/eventfd.c b/io_uring/eventfd.c
index 3da028500f76..d656cc2a0b9b 100644
--- a/io_uring/eventfd.c
+++ b/io_uring/eventfd.c
@@ -43,6 +43,7 @@ static void io_eventfd_do_signal(struct rcu_head *rcu)
 {
 	struct io_ev_fd *ev_fd = container_of(rcu, struct io_ev_fd, rcu);
 
+	atomic_andnot(BIT(IO_EVENTFD_OP_SIGNAL_BIT), &ev_fd->ops);
 	eventfd_signal_mask(ev_fd->cq_ev_fd, EPOLL_URING_WAKE);
 	io_eventfd_put(ev_fd);
 }
diff --git a/io_uring/kbuf.c b/io_uring/kbuf.c
index 8da2ff798170..63061aa1cab9 100644
--- a/io_uring/kbuf.c
+++ b/io_uring/kbuf.c
@@ -47,7 +47,7 @@ static bool io_kbuf_inc_commit(struct io_buffer_list *bl, int len)
 		this_len = min_t(u32, len, buf_len);
 		buf_len -= this_len;
 		/* Stop looping for invalid buffer length of 0 */
-		if (buf_len || !this_len) {
+		if (buf_len > bl->min_left_sub_one || !this_len) {
 			WRITE_ONCE(buf->addr, READ_ONCE(buf->addr) + this_len);
 			WRITE_ONCE(buf->len, buf_len);
 			return false;
@@ -637,6 +637,10 @@ int io_register_pbuf_ring(struct io_ring_ctx *ctx, void __user *arg)
 	if (reg.ring_entries >= 65536)
 		return -EINVAL;
 
+	/* minimum left byte count is a property of incremental buffers */
+	if (!(reg.flags & IOU_PBUF_RING_INC) && reg.min_left)
+		return -EINVAL;
+
 	bl = io_buffer_get_list(ctx, reg.bgid);
 	if (bl) {
 		/* if mapped buffer ring OR classic exists, don't allow */
@@ -680,10 +684,11 @@
 	}
 #endif
 
-	bl->nr_entries = reg.ring_entries;
 	bl->mask = reg.ring_entries - 1;
 	bl->flags |= IOBL_BUF_RING;
 	bl->buf_ring = br;
+	if (reg.min_left)
+		bl->min_left_sub_one = reg.min_left - 1;
 	if (reg.flags & IOU_PBUF_RING_INC)
 		bl->flags |= IOBL_INC;
 	ret = io_buffer_add_list(ctx, bl, reg.bgid);
diff --git a/io_uring/kbuf.h b/io_uring/kbuf.h
index bf15e26520d3..401773e1ef80 100644
--- a/io_uring/kbuf.h
+++ b/io_uring/kbuf.h
@@ -27,12 +27,18 @@ struct io_buffer_list {
 	__u16 bgid;
 
 	/* below is for ring provided buffers */
-	__u16 nr_entries;
 	__u16 head;
 	__u16 mask;
 	__u16 flags;
 
+	/*
+	 * minimum required amount to be left to reuse an incrementally
+	 * consumed buffer. If less than this is left at consumption time,
+	 * buffer is done and head is incremented to the next buffer.
+	 */
+	__u32 min_left_sub_one;
+
 	struct io_mapped_region region;
 };
 
 struct io_buffer {
diff --git a/io_uring/napi.c b/io_uring/napi.c
index 4a10de03e426..bfc771445912 100644
--- a/io_uring/napi.c
+++ b/io_uring/napi.c
@@ -38,7 +38,8 @@ static inline ktime_t net_to_ktime(unsigned long t)
 	return ns_to_ktime(t << 10);
 }
 
-int __io_napi_add_id(struct io_ring_ctx *ctx, unsigned int napi_id)
+int __io_napi_add_id(struct io_ring_ctx *ctx, unsigned int napi_id,
+		     unsigned int mode)
 {
 	struct hlist_head *hash_list;
 	struct io_napi_entry *e;
@@ -69,6 +70,11 @@ int __io_napi_add_id(struct io_ring_ctx *ctx, unsigned int napi_id)
 	 * kfree()
 	 */
 	spin_lock(&ctx->napi_lock);
+	if (unlikely(READ_ONCE(ctx->napi_track_mode) != mode)) {
+		spin_unlock(&ctx->napi_lock);
+		kfree(e);
+		return -EINVAL;
+	}
 	if (unlikely(io_napi_hash_find(hash_list, napi_id))) {
 		spin_unlock(&ctx->napi_lock);
 		kfree(e);
@@ -196,9 +202,14 @@ __io_napi_do_busy_loop(struct io_ring_ctx *ctx,
 			       bool (*loop_end)(void *, unsigned long),
 			       void *loop_end_arg)
 {
-	if (READ_ONCE(ctx->napi_track_mode) == IO_URING_NAPI_TRACKING_STATIC)
+	switch (READ_ONCE(ctx->napi_track_mode)) {
+	case IO_URING_NAPI_TRACKING_STATIC:
 		return static_tracking_do_busy_loop(ctx, loop_end, loop_end_arg);
-	return dynamic_tracking_do_busy_loop(ctx, loop_end, loop_end_arg);
+	case IO_URING_NAPI_TRACKING_DYNAMIC:
+		return dynamic_tracking_do_busy_loop(ctx, loop_end, loop_end_arg);
+	default:
+		return false;
+	}
 }
 
 static void io_napi_blocking_busy_loop(struct io_ring_ctx *ctx,
@@ -273,11 +284,13 @@ static int io_napi_register_napi(struct io_ring_ctx *ctx,
 	default:
 		return -EINVAL;
 	}
-	/* clean the napi list for new settings */
+	WRITE_ONCE(ctx->napi_track_mode, IO_URING_NAPI_TRACKING_INACTIVE);
 	io_napi_free(ctx);
-	WRITE_ONCE(ctx->napi_track_mode, napi->op_param);
+	/* cap NAPI at 10 msec of spin time */
+	napi->busy_poll_to = min(10000, napi->busy_poll_to);
 	WRITE_ONCE(ctx->napi_busy_poll_dt, napi->busy_poll_to * NSEC_PER_USEC);
 	WRITE_ONCE(ctx->napi_prefer_busy_poll, !!napi->prefer_busy_poll);
+	WRITE_ONCE(ctx->napi_track_mode, napi->op_param);
 	return 0;
 }
 
@@ -313,7 +326,8 @@ int io_register_napi(struct io_ring_ctx *ctx, void __user *arg)
 	case IO_URING_NAPI_STATIC_ADD_ID:
 		if (curr.op_param != IO_URING_NAPI_TRACKING_STATIC)
 			return -EINVAL;
-		return __io_napi_add_id(ctx, napi.op_param);
+		return __io_napi_add_id(ctx, napi.op_param,
+					IO_URING_NAPI_TRACKING_STATIC);
 	case IO_URING_NAPI_STATIC_DEL_ID:
 		if (curr.op_param != IO_URING_NAPI_TRACKING_STATIC)
 			return -EINVAL;
@@ -341,9 +355,10 @@ int io_unregister_napi(struct io_ring_ctx *ctx, void __user *arg)
 	if (arg && copy_to_user(arg, &curr, sizeof(curr)))
 		return -EFAULT;
 
+	WRITE_ONCE(ctx->napi_track_mode, IO_URING_NAPI_TRACKING_INACTIVE);
 	WRITE_ONCE(ctx->napi_busy_poll_dt, 0);
 	WRITE_ONCE(ctx->napi_prefer_busy_poll, false);
-	WRITE_ONCE(ctx->napi_track_mode, IO_URING_NAPI_TRACKING_INACTIVE);
+	io_napi_free(ctx);
 	return 0;
 }
diff --git a/io_uring/napi.h b/io_uring/napi.h
index fa742f42e09b..e0aecccc5065 100644
--- a/io_uring/napi.h
+++ b/io_uring/napi.h
@@ -15,7 +15,8 @@ void io_napi_free(struct io_ring_ctx *ctx);
 int io_register_napi(struct io_ring_ctx *ctx, void __user *arg);
 int io_unregister_napi(struct io_ring_ctx *ctx, void __user *arg);
 
-int __io_napi_add_id(struct io_ring_ctx *ctx, unsigned int napi_id);
+int __io_napi_add_id(struct io_ring_ctx *ctx, unsigned int napi_id,
+		     unsigned int mode);
 
 void __io_napi_busy_loop(struct io_ring_ctx *ctx, struct io_wait_queue *iowq);
 int io_napi_sqpoll_busy_poll(struct io_ring_ctx *ctx);
@@ -43,13 +44,14 @@ static inline void io_napi_add(struct io_kiocb *req)
 {
 	struct io_ring_ctx *ctx = req->ctx;
 	struct socket *sock;
+	unsigned int mode = IO_URING_NAPI_TRACKING_DYNAMIC;
 
-	if (READ_ONCE(ctx->napi_track_mode) != IO_URING_NAPI_TRACKING_DYNAMIC)
+	if (READ_ONCE(ctx->napi_track_mode) != mode)
 		return;
 
 	sock = sock_from_file(req->file);
 	if (sock && sock->sk)
-		__io_napi_add_id(ctx, READ_ONCE(sock->sk->sk_napi_id));
+		__io_napi_add_id(ctx, READ_ONCE(sock->sk->sk_napi_id), mode);
 }
 
 #else
diff --git a/io_uring/timeout.c b/io_uring/timeout.c
index 4cfdfc519770..e2595cae2b07 100644
--- a/io_uring/timeout.c
+++ b/io_uring/timeout.c
@@ -3,6 +3,7 @@
 #include <linux/errno.h>
 #include <linux/file.h>
 #include <linux/io_uring.h>
+#include <linux/time_namespace.h>
 
 #include <trace/events/io_uring.h>
 
@@ -35,6 +36,22 @@ struct io_timeout_rem {
 	bool				ltimeout;
 };
 
+static clockid_t io_flags_to_clock(unsigned flags)
+{
+	switch (flags & IORING_TIMEOUT_CLOCK_MASK) {
+	case IORING_TIMEOUT_BOOTTIME:
+		return CLOCK_BOOTTIME;
+	case IORING_TIMEOUT_REALTIME:
+		return CLOCK_REALTIME;
+	default:
+		/* can't happen, vetted at prep time */
+		WARN_ON_ONCE(1);
+		fallthrough;
+	case 0:
+		return CLOCK_MONOTONIC;
+	}
+}
+
 static int io_parse_user_time(ktime_t *time, u64 arg, unsigned flags)
 {
 	struct timespec64 ts;
@@ -43,7 +60,7 @@ static int io_parse_user_time(ktime_t *time, u64 arg, unsigned flags)
 		*time = ns_to_ktime(arg);
 		if (*time < 0)
 			return -EINVAL;
-		return 0;
+		goto out;
 	}
 
 	if (get_timespec64(&ts, u64_to_user_ptr(arg)))
@@ -51,6 +68,9 @@ static int io_parse_user_time(ktime_t *time, u64 arg, unsigned flags)
 	if (ts.tv_sec < 0 || ts.tv_nsec < 0)
 		return -EINVAL;
 	*time = timespec64_to_ktime(ts);
+out:
+	if (flags & IORING_TIMEOUT_ABS)
+		*time = timens_ktime_to_host(io_flags_to_clock(flags), *time);
 	return 0;
 }
 
@@ -399,18 +419,7 @@ static enum hrtimer_restart io_link_timeout_fn(struct hrtimer *timer)
 
 static clockid_t io_timeout_get_clock(struct io_timeout_data *data)
 {
-	switch (data->flags & IORING_TIMEOUT_CLOCK_MASK) {
-	case IORING_TIMEOUT_BOOTTIME:
-		return CLOCK_BOOTTIME;
-	case IORING_TIMEOUT_REALTIME:
-		return CLOCK_REALTIME;
-	default:
-		/* can't happen, vetted at prep time */
-		WARN_ON_ONCE(1);
-		fallthrough;
-	case 0:
-		return CLOCK_MONOTONIC;
-	}
+	return io_flags_to_clock(data->flags);
 }
 
 static int io_linked_timeout_update(struct io_ring_ctx *ctx, __u64 user_data,
diff --git a/io_uring/tw.c b/io_uring/tw.c
index fdff81eebc95..023d5e6bc491 100644
--- a/io_uring/tw.c
+++ b/io_uring/tw.c
@@ -273,8 +273,18 @@ void io_req_task_work_add_remote(struct io_kiocb *req, unsigned flags)
 
 void __cold io_move_task_work_from_local(struct io_ring_ctx *ctx)
 {
-	struct llist_node *node = llist_del_all(&ctx->work_llist);
+	struct llist_node *node;
 
+	/*
+	 * Running the work items may utilize ->retry_llist as a means
+	 * for capping the number of task_work entries run at the same
+	 * time. But that list can potentially race with moving the work
+	 * from here, if the task is exiting. As any normal task_work
+	 * running holds ->uring_lock already, just guard this slow path
+	 * with ->uring_lock to avoid racing on ->retry_llist.
+	 */
+	guard(mutex)(&ctx->uring_lock);
+	node = llist_del_all(&ctx->work_llist);
 	__io_fallback_tw(node, false);
 	node = llist_del_all(&ctx->retry_llist);
 	__io_fallback_tw(node, false);
diff --git a/io_uring/wait.c b/io_uring/wait.c
index 91df86ce0d18..ec01e78a216d 100644
--- a/io_uring/wait.c
+++ b/io_uring/wait.c
@@ -5,6 +5,7 @@
 #include <linux/kernel.h>
 #include <linux/sched/signal.h>
 #include <linux/io_uring.h>
+#include <linux/time_namespace.h>
 
 #include <trace/events/io_uring.h>
 
@@ -229,7 +230,10 @@ int io_cqring_wait(struct io_ring_ctx *ctx, int min_events, u32 flags,
 
 	if (ext_arg->ts_set) {
 		iowq.timeout = timespec64_to_ktime(ext_arg->ts);
-		if (!(flags & IORING_ENTER_ABS_TIMER))
+		if (flags & IORING_ENTER_ABS_TIMER)
+			iowq.timeout = timens_ktime_to_host(ctx->clockid,
+							    iowq.timeout);
+		else
 			iowq.timeout = ktime_add(iowq.timeout, start_time);
 	}
 
diff --git a/io_uring/zcrx.c b/io_uring/zcrx.c
index 7b93c87b8371..19837e0b5e91 100644
--- a/io_uring/zcrx.c
+++ b/io_uring/zcrx.c
@@ -495,10 +495,9 @@ static int io_zcrx_create_area(struct io_zcrx_ifq *ifq,
 	for (i = 0; i < nr_iovs; i++) {
 		struct net_iov *niov = &area->nia.niovs[i];
 
-		niov->owner = &area->nia;
+		net_iov_init(niov, &area->nia, NET_IOV_IOURING);
 		area->freelist[i] = i;
 		atomic_set(&area->user_refs[i], 0);
-		niov->type = NET_IOV_IOURING;
 	}
 
 	if (ifq->dev) {
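Note on the kbuf.c/kbuf.h change above: with incremental consumption (IOU_PBUF_RING_INC), a partially consumed ring buffer was previously recycled no matter how little space remained. The new per-ring min_left threshold retires the buffer (advancing the ring head) once the remainder drops below that size; min_left - 1 is stored so the hot path keeps a single compare, and the default of 0 preserves the old behavior. Below is a standalone C sketch that mirrors the retirement rule in io_kbuf_inc_commit(); demo_buf, demo_consume and the values in main are hypothetical illustration names, not kernel code.

#include <stdbool.h>
#include <stdint.h>

/* Hypothetical single-buffer mirror of the kernel's incremental
 * consumption state: where the unconsumed region starts, and how
 * many bytes of it are left. */
struct demo_buf {
	uint64_t addr;	/* start of the unconsumed region */
	uint32_t len;	/* bytes left in this buffer */
};

/*
 * Consume up to 'len' bytes. Returns true when the buffer should be
 * retired (ring head advances), false when enough remains to reuse it.
 * 'min_left_sub_one' stores min_left - 1, so "remainder >= min_left"
 * becomes the single compare "buf_len > min_left_sub_one", and the
 * default of 0 keeps the old "reuse while any bytes remain" rule.
 */
static bool demo_consume(struct demo_buf *buf, uint32_t min_left_sub_one,
			 uint32_t len)
{
	uint32_t this_len = buf->len < len ? buf->len : len;
	uint32_t buf_len = buf->len - this_len;

	if (buf_len > min_left_sub_one || !this_len) {
		/* worth reusing (or zero-length guard): shrink in place */
		buf->addr += this_len;
		buf->len = buf_len;
		return false;
	}
	/* remainder below min_left: buffer is done */
	return true;
}

int main(void)
{
	struct demo_buf b = { .addr = 0x1000, .len = 100 };

	/* with min_left = 32 (stored as 31), an 80-byte read leaves
	 * 20 bytes, below the threshold, so the buffer is retired */
	return demo_consume(&b, 31, 80) ? 0 : 1;
}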
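Note on the timeout.c and wait.c changes above: user-supplied absolute timeouts (IORING_TIMEOUT_ABS on timeout SQEs, IORING_ENTER_ABS_TIMER for CQ waits) are now passed through timens_ktime_to_host(), so a task inside a time namespace with a shifted CLOCK_MONOTONIC or CLOCK_BOOTTIME gets its deadline converted to the host clock that the hrtimer is actually armed on. Below is a liburing-based sketch of the userspace pattern this affects, assuming liburing is available; arm_abs_timeout is a hypothetical helper, and the namespace conversion itself happens in the kernel.

#define _GNU_SOURCE
#include <errno.h>
#include <liburing.h>
#include <time.h>

/* Arm an absolute timeout 50ms from now on CLOCK_BOOTTIME. Inside a
 * time namespace this timespec is namespace-relative; with the patch
 * the kernel maps it to host time via timens_ktime_to_host() before
 * arming the hrtimer, rather than comparing it against the host clock
 * directly. */
static int arm_abs_timeout(struct io_uring *ring)
{
	struct io_uring_sqe *sqe;
	struct timespec now;
	struct __kernel_timespec ts;

	clock_gettime(CLOCK_BOOTTIME, &now);
	ts.tv_sec = now.tv_sec;
	ts.tv_nsec = now.tv_nsec + 50 * 1000 * 1000;
	if (ts.tv_nsec >= 1000000000) {
		ts.tv_sec++;
		ts.tv_nsec -= 1000000000;
	}

	sqe = io_uring_get_sqe(ring);
	if (!sqe)
		return -ENOMEM;
	io_uring_prep_timeout(sqe, &ts, 0,
			      IORING_TIMEOUT_ABS | IORING_TIMEOUT_BOOTTIME);
	return io_uring_submit(ring);
}

Before the change, a request like this from inside a time namespace armed the hrtimer with the namespace-relative value, so the timeout could expire far earlier or later than the intended deadline.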
