From d42eb05e60fea31de49897d63a1d73f933303bd4 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Thu, 15 Jan 2026 08:24:02 -0700 Subject: io_uring: add support for BPF filtering for opcode restrictions Add support for loading classic BPF programs with io_uring to provide fine-grained filtering of SQE operations. Unlike IORING_REGISTER_RESTRICTIONS which only allows bitmap-based allow/deny of opcodes, BPF filters can inspect request attributes and make dynamic decisions. The filter is registered via IORING_REGISTER_BPF_FILTER with a struct io_uring_bpf: struct io_uring_bpf_filter { __u32 opcode; /* io_uring opcode to filter */ __u32 flags; __u32 filter_len; /* number of BPF instructions */ __u32 resv; __u64 filter_ptr; /* pointer to BPF filter */ __u64 resv2[5]; }; enum { IO_URING_BPF_CMD_FILTER = 1, }; struct io_uring_bpf { __u16 cmd_type; /* IO_URING_BPF_* values */ __u16 cmd_flags; /* none so far */ __u32 resv; union { struct io_uring_bpf_filter filter; }; }; and the filters get supplied a struct io_uring_bpf_ctx: struct io_uring_bpf_ctx { __u64 user_data; __u8 opcode; __u8 sqe_flags; __u8 pdu_size; __u8 pad[5]; }; where it's possible to filter on opcode and sqe_flags, with pdu_size indicating how much extra data is being passed in beyond the pad field. This will used for specific finer grained filtering inside an opcode. An example of that for sockets is in one of the following patches. Anything the opcode supports can end up in this struct, populated by the opcode itself, and hence can be filtered for. Filters have the following semantics: - Return 1 to allow the request - Return 0 to deny the request with -EACCES - Multiple filters can be stacked per opcode. All filters must return 1 for the opcode to be allowed. - Filters are evaluated in registration order (most recent first) The implementation uses classic BPF (cBPF) rather than eBPF for as that's required for containers, and since they can be used by any user in the system. Signed-off-by: Jens Axboe --- include/linux/io_uring_types.h | 9 +++++++++ 1 file changed, 9 insertions(+) (limited to 'include/linux') diff --git a/include/linux/io_uring_types.h b/include/linux/io_uring_types.h index dc6bd6940a0d..74bf98362876 100644 --- a/include/linux/io_uring_types.h +++ b/include/linux/io_uring_types.h @@ -219,9 +219,18 @@ struct io_rings { struct io_uring_cqe cqes[] ____cacheline_aligned_in_smp; }; +struct io_bpf_filter; +struct io_bpf_filters { + refcount_t refs; /* ref for ->bpf_filters */ + spinlock_t lock; /* protects ->bpf_filters modifications */ + struct io_bpf_filter __rcu **filters; + struct rcu_head rcu_head; +}; + struct io_restriction { DECLARE_BITMAP(register_op, IORING_REGISTER_LAST); DECLARE_BITMAP(sqe_op, IORING_OP_LAST); + struct io_bpf_filters *bpf_filters; u8 sqe_flags_allowed; u8 sqe_flags_required; /* IORING_OP_* restrictions exist */ -- cgit v1.2.3 From e7c30675a7fb79d94400987865a3bd620458ca1a Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Sat, 17 Jan 2026 08:27:23 -0700 Subject: io_uring/bpf_filter: cache lookup table in ctx->bpf_filters Currently a few pointer dereferences need to be made to both check if BPF filters are installed, and then also to retrieve the actual filter for the opcode. Cache the table in ctx->bpf_filters to avoid that. Add a bit of debug info on ring exit to show if we ever got this wrong. Small risk of that given that the table is currently only updated in one spot, but once task forking is enabled, that will add one more spot. Reviewed-by: Christian Brauner Signed-off-by: Jens Axboe --- include/linux/io_uring_types.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/linux') diff --git a/include/linux/io_uring_types.h b/include/linux/io_uring_types.h index 74bf98362876..7617df247238 100644 --- a/include/linux/io_uring_types.h +++ b/include/linux/io_uring_types.h @@ -287,6 +287,8 @@ struct io_ring_ctx { struct task_struct *submitter_task; struct io_rings *rings; + /* cache of ->restrictions.bpf_filters->filters */ + struct io_bpf_filter __rcu **bpf_filters; struct percpu_ref refs; clockid_t clockid; -- cgit v1.2.3 From 9fd99788f3e5a129908c242bb29946077ca46611 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Fri, 16 Jan 2026 15:28:58 -0700 Subject: io_uring: add task fork hook Called when copy_process() is called to copy state to a new child. Right now this is just a stub, but will be used shortly to properly handle fork'ing of task based io_uring restrictions. Reviewed-by: Christian Brauner (Microsoft) Signed-off-by: Jens Axboe --- include/linux/io_uring.h | 14 +++++++++++++- include/linux/sched.h | 1 + 2 files changed, 14 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/io_uring.h b/include/linux/io_uring.h index 85fe4e6b275c..d1aa4edfc2a5 100644 --- a/include/linux/io_uring.h +++ b/include/linux/io_uring.h @@ -12,6 +12,7 @@ void __io_uring_free(struct task_struct *tsk); void io_uring_unreg_ringfd(void); const char *io_uring_get_opcode(u8 opcode); bool io_is_uring_fops(struct file *file); +int __io_uring_fork(struct task_struct *tsk); static inline void io_uring_files_cancel(void) { @@ -25,9 +26,16 @@ static inline void io_uring_task_cancel(void) } static inline void io_uring_free(struct task_struct *tsk) { - if (tsk->io_uring) + if (tsk->io_uring || tsk->io_uring_restrict) __io_uring_free(tsk); } +static inline int io_uring_fork(struct task_struct *tsk) +{ + if (tsk->io_uring_restrict) + return __io_uring_fork(tsk); + + return 0; +} #else static inline void io_uring_task_cancel(void) { @@ -46,6 +54,10 @@ static inline bool io_is_uring_fops(struct file *file) { return false; } +static inline int io_uring_fork(struct task_struct *tsk) +{ + return 0; +} #endif #endif diff --git a/include/linux/sched.h b/include/linux/sched.h index d395f2810fac..9abbd11bb87c 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -1190,6 +1190,7 @@ struct task_struct { #ifdef CONFIG_IO_URING struct io_uring_task *io_uring; + struct io_restriction *io_uring_restrict; #endif /* Namespaces: */ -- cgit v1.2.3 From ed82f35b926b2e505c14b7006473614b8f58b4f4 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Thu, 8 Jan 2026 10:18:31 -0700 Subject: io_uring: allow registration of per-task restrictions Currently io_uring supports restricting operations on a per-ring basis. To use those, the ring must be setup in a disabled state by setting IORING_SETUP_R_DISABLED. Then restrictions can be set for the ring, and the ring can then be enabled. This commit adds support for IORING_REGISTER_RESTRICTIONS with ring_fd == -1, like the other "blind" register opcodes which work on the task rather than a specific ring. This allows registration of the same kind of restrictions as can been done on a specific ring, but with the task itself. Once done, any ring created will inherit these restrictions. If a restriction filter is registered with a task, then it's inherited on fork for its children. Children may only further restrict operations, not extend them. Inheriting restrictions include both the classic IORING_REGISTER_RESTRICTIONS based restrictions, as well as the BPF filters that have been registered with the task via IORING_REGISTER_BPF_FILTER. Signed-off-by: Jens Axboe --- include/linux/io_uring_types.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/linux') diff --git a/include/linux/io_uring_types.h b/include/linux/io_uring_types.h index 7617df247238..510d801b9a55 100644 --- a/include/linux/io_uring_types.h +++ b/include/linux/io_uring_types.h @@ -231,6 +231,8 @@ struct io_restriction { DECLARE_BITMAP(register_op, IORING_REGISTER_LAST); DECLARE_BITMAP(sqe_op, IORING_OP_LAST); struct io_bpf_filters *bpf_filters; + /* ->bpf_filters needs COW on modification */ + bool bpf_filters_cow; u8 sqe_flags_allowed; u8 sqe_flags_required; /* IORING_OP_* restrictions exist */ -- cgit v1.2.3