From b5709f6d26d65f6bb9711f4b5f98469fd507cb5b Mon Sep 17 00:00:00 2001 From: Amery Hung Date: Wed, 3 Dec 2025 15:37:44 -0800 Subject: bpf: Support associating BPF program with struct_ops Add a new BPF command BPF_PROG_ASSOC_STRUCT_OPS to allow associating a BPF program with a struct_ops map. This command takes a file descriptor of a struct_ops map and a BPF program and set prog->aux->st_ops_assoc to the kdata of the struct_ops map. The command does not accept a struct_ops program nor a non-struct_ops map. Programs of a struct_ops map is automatically associated with the map during map update. If a program is shared between two struct_ops maps, prog->aux->st_ops_assoc will be poisoned to indicate that the associated struct_ops is ambiguous. The pointer, once poisoned, cannot be reset since we have lost track of associated struct_ops. For other program types, the associated struct_ops map, once set, cannot be changed later. This restriction may be lifted in the future if there is a use case. A kernel helper bpf_prog_get_assoc_struct_ops() can be used to retrieve the associated struct_ops pointer. The returned pointer, if not NULL, is guaranteed to be valid and point to a fully updated struct_ops struct. For struct_ops program reused in multiple struct_ops map, the return will be NULL. prog->aux->st_ops_assoc is protected by bumping the refcount for non-struct_ops programs and RCU for struct_ops programs. Since it would be inefficient to track programs associated with a struct_ops map, every non-struct_ops program will bump the refcount of the map to make sure st_ops_assoc stays valid. For a struct_ops program, it is protected by RCU as map_free will wait for an RCU grace period before disassociating the program with the map. The helper must be called in BPF program context or RCU read-side critical section. struct_ops implementers should note that the struct_ops returned may not be initialized nor attached yet. The struct_ops implementer will be responsible for tracking and checking the state of the associated struct_ops map if the use case expects an initialized or attached struct_ops. Signed-off-by: Amery Hung Signed-off-by: Andrii Nakryiko Acked-by: Andrii Nakryiko Acked-by: Martin KaFai Lau Link: https://lore.kernel.org/bpf/20251203233748.668365-3-ameryhung@gmail.com --- include/linux/bpf.h | 16 ++++++++++++++++ include/uapi/linux/bpf.h | 17 +++++++++++++++++ 2 files changed, 33 insertions(+) (limited to 'include') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 6498be4c44f8..28d8d6b7bb1e 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -1739,6 +1739,8 @@ struct bpf_prog_aux { struct rcu_head rcu; }; struct bpf_stream stream[2]; + struct mutex st_ops_assoc_mutex; + struct bpf_map __rcu *st_ops_assoc; }; struct bpf_prog { @@ -2041,6 +2043,9 @@ static inline void bpf_module_put(const void *data, struct module *owner) module_put(owner); } int bpf_struct_ops_link_create(union bpf_attr *attr); +int bpf_prog_assoc_struct_ops(struct bpf_prog *prog, struct bpf_map *map); +void bpf_prog_disassoc_struct_ops(struct bpf_prog *prog); +void *bpf_prog_get_assoc_struct_ops(const struct bpf_prog_aux *aux); u32 bpf_struct_ops_id(const void *kdata); #ifdef CONFIG_NET @@ -2088,6 +2093,17 @@ static inline int bpf_struct_ops_link_create(union bpf_attr *attr) { return -EOPNOTSUPP; } +static inline int bpf_prog_assoc_struct_ops(struct bpf_prog *prog, struct bpf_map *map) +{ + return -EOPNOTSUPP; +} +static inline void bpf_prog_disassoc_struct_ops(struct bpf_prog *prog) +{ +} +static inline void *bpf_prog_get_assoc_struct_ops(const struct bpf_prog_aux *aux) +{ + return NULL; +} static inline void bpf_map_struct_ops_info_fill(struct bpf_map_info *info, struct bpf_map *map) { } diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index f8d8513eda27..84ced3ed2d21 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -918,6 +918,16 @@ union bpf_iter_link_info { * Number of bytes read from the stream on success, or -1 if an * error occurred (in which case, *errno* is set appropriately). * + * BPF_PROG_ASSOC_STRUCT_OPS + * Description + * Associate a BPF program with a struct_ops map. The struct_ops + * map is identified by *map_fd* and the BPF program is + * identified by *prog_fd*. + * + * Return + * 0 on success or -1 if an error occurred (in which case, + * *errno* is set appropriately). + * * NOTES * eBPF objects (maps and programs) can be shared between processes. * @@ -974,6 +984,7 @@ enum bpf_cmd { BPF_PROG_BIND_MAP, BPF_TOKEN_CREATE, BPF_PROG_STREAM_READ_BY_FD, + BPF_PROG_ASSOC_STRUCT_OPS, __MAX_BPF_CMD, }; @@ -1894,6 +1905,12 @@ union bpf_attr { __u32 prog_fd; } prog_stream_read; + struct { + __u32 map_fd; + __u32 prog_fd; + __u32 flags; + } prog_assoc_struct_ops; + } __attribute__((aligned(8))); /* The description below is an attempt at providing documentation to eBPF -- cgit v1.2.3 From 93f0d09697613beba922a387d21a09a41eeefef5 Mon Sep 17 00:00:00 2001 From: Puranjay Mohan Date: Fri, 19 Dec 2025 10:44:17 -0800 Subject: bpf: move recursion detection logic to helpers BPF programs detect recursion by doing atomic inc/dec on a per-cpu active counter from the trampoline. Create two helpers for operations on this active counter, this makes it easy to changes the recursion detection logic in future. This commit makes no functional changes. Acked-by: Yonghong Song Signed-off-by: Puranjay Mohan Link: https://lore.kernel.org/r/20251219184422.2899902-2-puranjay@kernel.org Signed-off-by: Alexei Starovoitov --- include/linux/bpf.h | 10 ++++++++++ 1 file changed, 10 insertions(+) (limited to 'include') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index bb3847caeae1..2da986136d26 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -2004,6 +2004,16 @@ struct bpf_struct_ops_common_value { enum bpf_struct_ops_state state; }; +static inline bool bpf_prog_get_recursion_context(struct bpf_prog *prog) +{ + return this_cpu_inc_return(*(prog->active)) == 1; +} + +static inline void bpf_prog_put_recursion_context(struct bpf_prog *prog) +{ + this_cpu_dec(*(prog->active)); +} + #if defined(CONFIG_BPF_JIT) && defined(CONFIG_BPF_SYSCALL) /* This macro helps developer to register a struct_ops type and generate * type information correctly. Developers should use this macro to register -- cgit v1.2.3 From c3e34f88f9992866a1fb510850921a8fe299a97b Mon Sep 17 00:00:00 2001 From: Puranjay Mohan Date: Fri, 19 Dec 2025 10:44:18 -0800 Subject: bpf: arm64: Optimize recursion detection by not using atomics BPF programs detect recursion using a per-CPU 'active' flag in struct bpf_prog. The trampoline currently sets/clears this flag with atomic operations. On some arm64 platforms (e.g., Neoverse V2 with LSE), per-CPU atomic operations are relatively slow. Unlike x86_64 - where per-CPU updates can avoid cross-core atomicity, arm64 LSE atomics are always atomic across all cores, which is unnecessary overhead for strictly per-CPU state. This patch removes atomics from the recursion detection path on arm64 by changing 'active' to a per-CPU array of four u8 counters, one per context: {NMI, hard-irq, soft-irq, normal}. The running context uses a non-atomic increment/decrement on its element. After increment, recursion is detected by reading the array as a u32 and verifying that only the expected element changed; any change in another element indicates inter-context recursion, and a value > 1 in the same element indicates same-context recursion. For example, starting from {0,0,0,0}, a normal-context trigger changes the array to {0,0,0,1}. If an NMI arrives on the same CPU and triggers the program, the array becomes {1,0,0,1}. When the NMI context checks the u32 against the expected mask for normal (0x00000001), it observes 0x01000001 and correctly reports recursion. Same-context recursion is detected analogously. Acked-by: Yonghong Song Signed-off-by: Puranjay Mohan Link: https://lore.kernel.org/r/20251219184422.2899902-3-puranjay@kernel.org Signed-off-by: Alexei Starovoitov --- include/linux/bpf.h | 32 +++++++++++++++++++++++++++++--- 1 file changed, 29 insertions(+), 3 deletions(-) (limited to 'include') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 2da986136d26..da6a00dd313f 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -1746,6 +1746,8 @@ struct bpf_prog_aux { struct bpf_map __rcu *st_ops_assoc; }; +#define BPF_NR_CONTEXTS 4 /* normal, softirq, hardirq, NMI */ + struct bpf_prog { u16 pages; /* Number of allocated pages */ u16 jited:1, /* Is our filter JIT'ed? */ @@ -1772,7 +1774,7 @@ struct bpf_prog { u8 tag[BPF_TAG_SIZE]; }; struct bpf_prog_stats __percpu *stats; - int __percpu *active; + u8 __percpu *active; /* u8[BPF_NR_CONTEXTS] for recursion protection */ unsigned int (*bpf_func)(const void *ctx, const struct bpf_insn *insn); struct bpf_prog_aux *aux; /* Auxiliary fields */ @@ -2006,12 +2008,36 @@ struct bpf_struct_ops_common_value { static inline bool bpf_prog_get_recursion_context(struct bpf_prog *prog) { - return this_cpu_inc_return(*(prog->active)) == 1; +#ifdef CONFIG_ARM64 + u8 rctx = interrupt_context_level(); + u8 *active = this_cpu_ptr(prog->active); + u32 val; + + preempt_disable(); + active[rctx]++; + val = le32_to_cpu(*(__le32 *)active); + preempt_enable(); + if (val != BIT(rctx * 8)) + return false; + + return true; +#else + return this_cpu_inc_return(*(int __percpu *)(prog->active)) == 1; +#endif } static inline void bpf_prog_put_recursion_context(struct bpf_prog *prog) { - this_cpu_dec(*(prog->active)); +#ifdef CONFIG_ARM64 + u8 rctx = interrupt_context_level(); + u8 *active = this_cpu_ptr(prog->active); + + preempt_disable(); + active[rctx]--; + preempt_enable(); +#else + this_cpu_dec(*(int __percpu *)(prog->active)); +#endif } #if defined(CONFIG_BPF_JIT) && defined(CONFIG_BPF_SYSCALL) -- cgit v1.2.3 From 4221de8c410e7adb1114a4374c78fa4269175343 Mon Sep 17 00:00:00 2001 From: Roman Gushchin Date: Mon, 22 Dec 2025 20:41:51 -0800 Subject: mm: declare memcg_page_state_output() in memcontrol.h To use memcg_page_state_output() in bpf_memcontrol.c move the declaration from v1-specific memcontrol-v1.h to memcontrol.h. Signed-off-by: Roman Gushchin Acked-by: Michal Hocko Acked-by: Shakeel Butt Link: https://lore.kernel.org/r/20251223044156.208250-2-roman.gushchin@linux.dev Signed-off-by: Alexei Starovoitov --- include/linux/memcontrol.h | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'include') diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h index 0651865a4564..7bef427d5a82 100644 --- a/include/linux/memcontrol.h +++ b/include/linux/memcontrol.h @@ -950,6 +950,7 @@ static inline void mod_memcg_page_state(struct page *page, } unsigned long memcg_page_state(struct mem_cgroup *memcg, int idx); +unsigned long memcg_page_state_output(struct mem_cgroup *memcg, int item); unsigned long lruvec_page_state(struct lruvec *lruvec, enum node_stat_item idx); unsigned long lruvec_page_state_local(struct lruvec *lruvec, enum node_stat_item idx); @@ -1373,6 +1374,11 @@ static inline unsigned long memcg_page_state(struct mem_cgroup *memcg, int idx) return 0; } +static inline unsigned long memcg_page_state_output(struct mem_cgroup *memcg, int item) +{ + return 0; +} + static inline unsigned long lruvec_page_state(struct lruvec *lruvec, enum node_stat_item idx) { -- cgit v1.2.3 From 99430ab8b804c26b8a0dec93fcbfe75469f3edc7 Mon Sep 17 00:00:00 2001 From: Roman Gushchin Date: Mon, 22 Dec 2025 20:41:54 -0800 Subject: mm: introduce BPF kfuncs to access memcg statistics and events Introduce BPF kfuncs to conveniently access memcg data: - bpf_mem_cgroup_vm_events(), - bpf_mem_cgroup_memory_events(), - bpf_mem_cgroup_usage(), - bpf_mem_cgroup_page_state(), - bpf_mem_cgroup_flush_stats(). These functions are useful for implementing BPF OOM policies, but also can be used to accelerate access to the memcg data. Reading it through cgroupfs is much more expensive, roughly 5x, mostly because of the need to convert the data into the text and back. JP Kobryn: An experiment was setup to compare the performance of a program that uses the traditional method of reading memory.stat vs a program using the new kfuncs. The control program opens up the root memory.stat file and for 1M iterations reads, converts the string values to numeric data, then seeks back to the beginning. The experimental program sets up the requisite libbpf objects and for 1M iterations invokes a bpf program which uses the kfuncs to fetch all available stats for node_stat_item, memcg_stat_item, and vm_event_item types. The results showed a significant perf benefit on the experimental side, outperforming the control side by a margin of 93%. In kernel mode, elapsed time was reduced by 80%, while in user mode, over 99% of time was saved. control: elapsed time real 0m38.318s user 0m25.131s sys 0m13.070s experiment: elapsed time real 0m2.789s user 0m0.187s sys 0m2.512s control: perf data 33.43% a.out libc.so.6 [.] __vfscanf_internal 6.88% a.out [kernel.kallsyms] [k] vsnprintf 6.33% a.out libc.so.6 [.] _IO_fgets 5.51% a.out [kernel.kallsyms] [k] format_decode 4.31% a.out libc.so.6 [.] __GI_____strtoull_l_internal 3.78% a.out [kernel.kallsyms] [k] string 3.53% a.out [kernel.kallsyms] [k] number 2.71% a.out libc.so.6 [.] _IO_sputbackc 2.41% a.out [kernel.kallsyms] [k] strlen 1.98% a.out a.out [.] main 1.70% a.out libc.so.6 [.] _IO_getline_info 1.51% a.out libc.so.6 [.] __isoc99_sscanf 1.47% a.out [kernel.kallsyms] [k] memory_stat_format 1.47% a.out [kernel.kallsyms] [k] memcpy_orig 1.41% a.out [kernel.kallsyms] [k] seq_buf_printf experiment: perf data 10.55% memcgstat bpf_prog_..._query [k] bpf_prog_16aab2f19fa982a7_query 6.90% memcgstat [kernel.kallsyms] [k] memcg_page_state_output 3.55% memcgstat [kernel.kallsyms] [k] _raw_spin_lock 3.12% memcgstat [kernel.kallsyms] [k] memcg_events 2.87% memcgstat [kernel.kallsyms] [k] __memcg_slab_post_alloc_hook 2.73% memcgstat [kernel.kallsyms] [k] kmem_cache_free 2.70% memcgstat [kernel.kallsyms] [k] entry_SYSRETQ_unsafe_stack 2.25% memcgstat [kernel.kallsyms] [k] __memcg_slab_free_hook 2.06% memcgstat [kernel.kallsyms] [k] get_page_from_freelist Signed-off-by: Roman Gushchin Co-developed-by: JP Kobryn Signed-off-by: JP Kobryn Link: https://lore.kernel.org/r/20251223044156.208250-5-roman.gushchin@linux.dev Signed-off-by: Alexei Starovoitov --- include/linux/memcontrol.h | 14 ++++++++++++++ 1 file changed, 14 insertions(+) (limited to 'include') diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h index 7bef427d5a82..6a5d65487b70 100644 --- a/include/linux/memcontrol.h +++ b/include/linux/memcontrol.h @@ -949,8 +949,12 @@ static inline void mod_memcg_page_state(struct page *page, rcu_read_unlock(); } +unsigned long memcg_events(struct mem_cgroup *memcg, int event); +unsigned long mem_cgroup_usage(struct mem_cgroup *memcg, bool swap); unsigned long memcg_page_state(struct mem_cgroup *memcg, int idx); unsigned long memcg_page_state_output(struct mem_cgroup *memcg, int item); +bool memcg_stat_item_valid(int idx); +bool memcg_vm_event_item_valid(enum vm_event_item idx); unsigned long lruvec_page_state(struct lruvec *lruvec, enum node_stat_item idx); unsigned long lruvec_page_state_local(struct lruvec *lruvec, enum node_stat_item idx); @@ -1379,6 +1383,16 @@ static inline unsigned long memcg_page_state_output(struct mem_cgroup *memcg, in return 0; } +static inline bool memcg_stat_item_valid(int idx) +{ + return false; +} + +static inline bool memcg_vm_event_item_valid(enum vm_event_item idx) +{ + return false; +} + static inline unsigned long lruvec_page_state(struct lruvec *lruvec, enum node_stat_item idx) { -- cgit v1.2.3 From b8467290edab4bafae352bf3f317055669a1a458 Mon Sep 17 00:00:00 2001 From: Puranjay Mohan Date: Mon, 22 Dec 2025 11:50:18 -0800 Subject: bpf: arena: make arena kfuncs any context safe Make arena related kfuncs any context safe by the following changes: bpf_arena_alloc_pages() and bpf_arena_reserve_pages(): Replace the usage of the mutex with a rqspinlock for range tree and use kmalloc_nolock() wherever needed. Use free_pages_nolock() to free pages from any context. apply_range_set/clear_cb() with apply_to_page_range() has already made populating the vm_area in bpf_arena_alloc_pages() any context safe. bpf_arena_free_pages(): defer the main logic to a workqueue if it is called from a non-sleepable context. specialize_kfunc() is used to replace the sleepable arena_free_pages() with bpf_arena_free_pages_non_sleepable() when the verifier detects the call is from a non-sleepable context. In the non-sleepable case, arena_free_pages() queues the address and the page count to be freed to a lock-less list of struct arena_free_spans and raises an irq_work. The irq_work handler calls schedules_work() as it is safe to be called from irq context. arena_free_worker() (the work queue handler) iterates these spans and clears ptes, flushes tlb, zaps pages, and calls __free_page(). Signed-off-by: Puranjay Mohan Link: https://lore.kernel.org/r/20251222195022.431211-4-puranjay@kernel.org Signed-off-by: Alexei Starovoitov --- include/linux/bpf.h | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) (limited to 'include') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index da6a00dd313f..4e7d72dfbcd4 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -673,6 +673,22 @@ void bpf_map_free_internal_structs(struct bpf_map *map, void *obj); int bpf_dynptr_from_file_sleepable(struct file *file, u32 flags, struct bpf_dynptr *ptr__uninit); +#if defined(CONFIG_MMU) && defined(CONFIG_64BIT) +void *bpf_arena_alloc_pages_non_sleepable(void *p__map, void *addr__ign, u32 page_cnt, int node_id, + u64 flags); +void bpf_arena_free_pages_non_sleepable(void *p__map, void *ptr__ign, u32 page_cnt); +#else +static inline void *bpf_arena_alloc_pages_non_sleepable(void *p__map, void *addr__ign, u32 page_cnt, + int node_id, u64 flags) +{ + return NULL; +} + +static inline void bpf_arena_free_pages_non_sleepable(void *p__map, void *ptr__ign, u32 page_cnt) +{ +} +#endif + extern const struct bpf_map_ops bpf_map_offload_ops; /* bpf_type_flag contains a set of flags that are applicable to the values of -- cgit v1.2.3 From 7646c7afd9a95db0b0cb4ad066ed90f6024da67d Mon Sep 17 00:00:00 2001 From: Puranjay Mohan Date: Fri, 2 Jan 2026 10:00:28 -0800 Subject: bpf: Remove redundant KF_TRUSTED_ARGS flag from all kfuncs Now that KF_TRUSTED_ARGS is the default for all kfuncs, remove the explicit KF_TRUSTED_ARGS flag from all kfunc definitions and remove the flag itself. Acked-by: Eduard Zingerman Reviewed-by: Emil Tsalapatis Signed-off-by: Puranjay Mohan Link: https://lore.kernel.org/r/20260102180038.2708325-3-puranjay@kernel.org Signed-off-by: Alexei Starovoitov --- include/linux/bpf.h | 2 +- include/linux/btf.h | 3 +-- 2 files changed, 2 insertions(+), 3 deletions(-) (limited to 'include') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 4e7d72dfbcd4..9efb2ddf331c 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -753,7 +753,7 @@ enum bpf_type_flag { MEM_ALLOC = BIT(11 + BPF_BASE_TYPE_BITS), /* PTR was passed from the kernel in a trusted context, and may be - * passed to KF_TRUSTED_ARGS kfuncs or BPF helper functions. + * passed to kfuncs or BPF helper functions. * Confusingly, this is _not_ the opposite of PTR_UNTRUSTED above. * PTR_UNTRUSTED refers to a kptr that was read directly from a map * without invoking bpf_kptr_xchg(). What we really need to know is diff --git a/include/linux/btf.h b/include/linux/btf.h index f06976ffb63f..691f09784933 100644 --- a/include/linux/btf.h +++ b/include/linux/btf.h @@ -34,7 +34,7 @@ * * And the following kfunc: * - * BTF_ID_FLAGS(func, bpf_task_acquire, KF_ACQUIRE | KF_TRUSTED_ARGS) + * BTF_ID_FLAGS(func, bpf_task_acquire, KF_ACQUIRE) * * All invocations to the kfunc must pass the unmodified, unwalked task: * @@ -66,7 +66,6 @@ * return 0; * } */ -#define KF_TRUSTED_ARGS (1 << 4) /* kfunc only takes trusted pointer arguments */ #define KF_SLEEPABLE (1 << 5) /* kfunc may sleep */ #define KF_DESTRUCTIVE (1 << 6) /* kfunc performs destructive actions */ #define KF_RCU (1 << 7) /* kfunc takes either rcu or trusted pointer arguments */ -- cgit v1.2.3 From 817593af7b9ba56c23d9dd1858232c5493a14f55 Mon Sep 17 00:00:00 2001 From: Puranjay Mohan Date: Fri, 2 Jan 2026 12:02:27 -0800 Subject: bpf: syscall: Introduce memcg enter/exit helpers Introduce bpf_map_memcg_enter() and bpf_map_memcg_exit() helpers to reduce code duplication in memcg context management. bpf_map_memcg_enter() gets the memcg from the map, sets it as active, and returns both the previous and the now active memcg. bpf_map_memcg_exit() restores the previous active memcg and releases the reference obtained during enter. Signed-off-by: Puranjay Mohan Link: https://lore.kernel.org/r/20260102200230.25168-2-puranjay@kernel.org Signed-off-by: Alexei Starovoitov --- include/linux/bpf.h | 15 +++++++++++++++ 1 file changed, 15 insertions(+) (limited to 'include') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 9efb2ddf331c..4e9667ed6630 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -2608,6 +2608,10 @@ struct bpf_prog *bpf_prog_get_curr_or_next(u32 *id); int bpf_map_alloc_pages(const struct bpf_map *map, int nid, unsigned long nr_pages, struct page **page_array); #ifdef CONFIG_MEMCG +void bpf_map_memcg_enter(const struct bpf_map *map, struct mem_cgroup **old_memcg, + struct mem_cgroup **new_memcg); +void bpf_map_memcg_exit(struct mem_cgroup *old_memcg, + struct mem_cgroup *memcg); void *bpf_map_kmalloc_node(const struct bpf_map *map, size_t size, gfp_t flags, int node); void *bpf_map_kmalloc_nolock(const struct bpf_map *map, size_t size, gfp_t flags, @@ -2632,6 +2636,17 @@ void __percpu *bpf_map_alloc_percpu(const struct bpf_map *map, size_t size, kvcalloc(_n, _size, _flags) #define bpf_map_alloc_percpu(_map, _size, _align, _flags) \ __alloc_percpu_gfp(_size, _align, _flags) +static inline void bpf_map_memcg_enter(const struct bpf_map *map, struct mem_cgroup **old_memcg, + struct mem_cgroup **new_memcg) +{ + *new_memcg = NULL; + *old_memcg = NULL; +} + +static inline void bpf_map_memcg_exit(struct mem_cgroup *old_memcg, + struct mem_cgroup *memcg) +{ +} #endif static inline int -- cgit v1.2.3 From a069190b590e108223cd841a1c2d0bfb92230ecc Mon Sep 17 00:00:00 2001 From: Puranjay Mohan Date: Fri, 2 Jan 2026 14:15:12 -0800 Subject: bpf: Replace __opt annotation with __nullable for kfuncs The __opt annotation was originally introduced specifically for buffer/size argument pairs in bpf_dynptr_slice() and bpf_dynptr_slice_rdwr(), allowing the buffer pointer to be NULL while still validating the size as a constant. The __nullable annotation serves the same purpose but is more general and is already used throughout the BPF subsystem for raw tracepoints, struct_ops, and other kfuncs. This patch unifies the two annotations by replacing __opt with __nullable. The key change is in the verifier's get_kfunc_ptr_arg_type() function, where mem/size pair detection is now performed before the nullable check. This ensures that buffer/size pairs are correctly classified as KF_ARG_PTR_TO_MEM_SIZE even when the buffer is nullable, while adding an !arg_mem_size condition to the nullable check prevents interference with mem/size pair handling. When processing KF_ARG_PTR_TO_MEM_SIZE arguments, the verifier now uses is_kfunc_arg_nullable() instead of the removed is_kfunc_arg_optional() to determine whether to skip size validation for NULL buffers. This is the first documentation added for the __nullable annotation, which has been in use since it was introduced but was previously undocumented. No functional changes to verifier behavior - nullable buffer/size pairs continue to work exactly as before. Acked-by: Eduard Zingerman Signed-off-by: Puranjay Mohan Link: https://lore.kernel.org/r/20260102221513.1961781-1-puranjay@kernel.org Signed-off-by: Alexei Starovoitov --- include/linux/bpf.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 4e9667ed6630..a63e47d2109c 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -1434,7 +1434,7 @@ bool __bpf_dynptr_is_rdonly(const struct bpf_dynptr_kern *ptr); int __bpf_dynptr_write(const struct bpf_dynptr_kern *dst, u64 offset, void *src, u64 len, u64 flags); void *bpf_dynptr_slice_rdwr(const struct bpf_dynptr *p, u64 offset, - void *buffer__opt, u64 buffer__szk); + void *buffer__nullable, u64 buffer__szk); static inline int bpf_dynptr_check_off_len(const struct bpf_dynptr_kern *ptr, u64 offset, u64 len) { -- cgit v1.2.3 From ea180ffbd27ce5abf2a06329fe1fc8d20dc9becf Mon Sep 17 00:00:00 2001 From: Roman Gushchin Date: Mon, 5 Jan 2026 20:23:13 -0800 Subject: mm: drop mem_cgroup_usage() declaration from memcontrol.h mem_cgroup_usage() is not used outside of memcg-v1 code, the declaration was added by a mistake. Signed-off-by: Roman Gushchin Link: https://lore.kernel.org/r/20260106042313.140256-1-roman.gushchin@linux.dev Signed-off-by: Alexei Starovoitov --- include/linux/memcontrol.h | 1 - 1 file changed, 1 deletion(-) (limited to 'include') diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h index 6a5d65487b70..229ac9835adb 100644 --- a/include/linux/memcontrol.h +++ b/include/linux/memcontrol.h @@ -950,7 +950,6 @@ static inline void mod_memcg_page_state(struct page *page, } unsigned long memcg_events(struct mem_cgroup *memcg, int event); -unsigned long mem_cgroup_usage(struct mem_cgroup *memcg, bool swap); unsigned long memcg_page_state(struct mem_cgroup *memcg, int idx); unsigned long memcg_page_state_output(struct mem_cgroup *memcg, int item); bool memcg_stat_item_valid(int idx); -- cgit v1.2.3 From 2b421662c7887a0649fe409155a1f101562d0fa9 Mon Sep 17 00:00:00 2001 From: Leon Hwang Date: Wed, 7 Jan 2026 10:20:16 +0800 Subject: bpf: Introduce BPF_F_CPU and BPF_F_ALL_CPUS flags Introduce BPF_F_CPU and BPF_F_ALL_CPUS flags and check them for following APIs: * 'map_lookup_elem()' * 'map_update_elem()' * 'generic_map_lookup_batch()' * 'generic_map_update_batch()' And, get the correct value size for these APIs. Acked-by: Andrii Nakryiko Signed-off-by: Leon Hwang Link: https://lore.kernel.org/r/20260107022022.12843-2-leon.hwang@linux.dev Signed-off-by: Alexei Starovoitov --- include/linux/bpf.h | 23 ++++++++++++++++++++++- include/uapi/linux/bpf.h | 2 ++ 2 files changed, 24 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index a63e47d2109c..108bab1bda9d 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -3915,14 +3915,35 @@ bpf_prog_update_insn_ptrs(struct bpf_prog *prog, u32 *offsets, void *image) } #endif +static inline bool bpf_map_supports_cpu_flags(enum bpf_map_type map_type) +{ + return false; +} + static inline int bpf_map_check_op_flags(struct bpf_map *map, u64 flags, u64 allowed_flags) { - if (flags & ~allowed_flags) + u32 cpu; + + if ((u32)flags & ~allowed_flags) return -EINVAL; if ((flags & BPF_F_LOCK) && !btf_record_has_field(map->record, BPF_SPIN_LOCK)) return -EINVAL; + if (!(flags & BPF_F_CPU) && flags >> 32) + return -EINVAL; + + if (flags & (BPF_F_CPU | BPF_F_ALL_CPUS)) { + if (!bpf_map_supports_cpu_flags(map->map_type)) + return -EINVAL; + if ((flags & BPF_F_CPU) && (flags & BPF_F_ALL_CPUS)) + return -EINVAL; + + cpu = flags >> 32; + if ((flags & BPF_F_CPU) && cpu >= num_possible_cpus()) + return -ERANGE; + } + return 0; } diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index 84ced3ed2d21..2a2ade4be60f 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -1384,6 +1384,8 @@ enum { BPF_NOEXIST = 1, /* create new element if it didn't exist */ BPF_EXIST = 2, /* update existing element */ BPF_F_LOCK = 4, /* spin_lock-ed map_lookup/map_update */ + BPF_F_CPU = 8, /* cpu flag for percpu maps, upper 32-bit of flags is a cpu number */ + BPF_F_ALL_CPUS = 16, /* update value across all CPUs for percpu maps */ }; /* flags for BPF_MAP_CREATE command */ -- cgit v1.2.3 From 8eb76cb03f0f6c2bd7b15cf45dcffcd6bd07a360 Mon Sep 17 00:00:00 2001 From: Leon Hwang Date: Wed, 7 Jan 2026 10:20:17 +0800 Subject: bpf: Add BPF_F_CPU and BPF_F_ALL_CPUS flags support for percpu_array maps Introduce support for the BPF_F_ALL_CPUS flag in percpu_array maps to allow updating values for all CPUs with a single value for both update_elem and update_batch APIs. Introduce support for the BPF_F_CPU flag in percpu_array maps to allow: * update value for specified CPU for both update_elem and update_batch APIs. * lookup value for specified CPU for both lookup_elem and lookup_batch APIs. The BPF_F_CPU flag is passed via: * map_flags of lookup_elem and update_elem APIs along with embedded cpu info. * elem_flags of lookup_batch and update_batch APIs along with embedded cpu info. Signed-off-by: Leon Hwang Link: https://lore.kernel.org/r/20260107022022.12843-3-leon.hwang@linux.dev Signed-off-by: Alexei Starovoitov --- include/linux/bpf.h | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 108bab1bda9d..dc92d001d978 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -2848,7 +2848,7 @@ int map_set_for_each_callback_args(struct bpf_verifier_env *env, struct bpf_func_state *callee); int bpf_percpu_hash_copy(struct bpf_map *map, void *key, void *value); -int bpf_percpu_array_copy(struct bpf_map *map, void *key, void *value); +int bpf_percpu_array_copy(struct bpf_map *map, void *key, void *value, u64 flags); int bpf_percpu_hash_update(struct bpf_map *map, void *key, void *value, u64 flags); int bpf_percpu_array_update(struct bpf_map *map, void *key, void *value, @@ -3917,7 +3917,12 @@ bpf_prog_update_insn_ptrs(struct bpf_prog *prog, u32 *offsets, void *image) static inline bool bpf_map_supports_cpu_flags(enum bpf_map_type map_type) { - return false; + switch (map_type) { + case BPF_MAP_TYPE_PERCPU_ARRAY: + return true; + default: + return false; + } } static inline int bpf_map_check_op_flags(struct bpf_map *map, u64 flags, u64 allowed_flags) -- cgit v1.2.3 From c6936161fd55d0c6ffde01da726e66ff5f2934e8 Mon Sep 17 00:00:00 2001 From: Leon Hwang Date: Wed, 7 Jan 2026 10:20:18 +0800 Subject: bpf: Add BPF_F_CPU and BPF_F_ALL_CPUS flags support for percpu_hash and lru_percpu_hash maps Introduce BPF_F_ALL_CPUS flag support for percpu_hash and lru_percpu_hash maps to allow updating values for all CPUs with a single value for both update_elem and update_batch APIs. Introduce BPF_F_CPU flag support for percpu_hash and lru_percpu_hash maps to allow: * update value for specified CPU for both update_elem and update_batch APIs. * lookup value for specified CPU for both lookup_elem and lookup_batch APIs. The BPF_F_CPU flag is passed via: * map_flags along with embedded cpu info. * elem_flags along with embedded cpu info. Signed-off-by: Leon Hwang Link: https://lore.kernel.org/r/20260107022022.12843-4-leon.hwang@linux.dev Signed-off-by: Alexei Starovoitov --- include/linux/bpf.h | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index dc92d001d978..f5c259dcc425 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -2847,7 +2847,7 @@ int map_set_for_each_callback_args(struct bpf_verifier_env *env, struct bpf_func_state *caller, struct bpf_func_state *callee); -int bpf_percpu_hash_copy(struct bpf_map *map, void *key, void *value); +int bpf_percpu_hash_copy(struct bpf_map *map, void *key, void *value, u64 flags); int bpf_percpu_array_copy(struct bpf_map *map, void *key, void *value, u64 flags); int bpf_percpu_hash_update(struct bpf_map *map, void *key, void *value, u64 flags); @@ -3919,6 +3919,8 @@ static inline bool bpf_map_supports_cpu_flags(enum bpf_map_type map_type) { switch (map_type) { case BPF_MAP_TYPE_PERCPU_ARRAY: + case BPF_MAP_TYPE_PERCPU_HASH: + case BPF_MAP_TYPE_LRU_PERCPU_HASH: return true; default: return false; -- cgit v1.2.3 From 47c79f05aa0d289f760caf5ad6521fd8dbae37a1 Mon Sep 17 00:00:00 2001 From: Leon Hwang Date: Wed, 7 Jan 2026 10:20:20 +0800 Subject: bpf: Add BPF_F_CPU and BPF_F_ALL_CPUS flags support for percpu_cgroup_storage maps Introduce BPF_F_ALL_CPUS flag support for percpu_cgroup_storage maps to allow updating values for all CPUs with a single value for update_elem API. Introduce BPF_F_CPU flag support for percpu_cgroup_storage maps to allow: * update value for specified CPU for update_elem API. * lookup value for specified CPU for lookup_elem API. The BPF_F_CPU flag is passed via map_flags along with embedded cpu info. Signed-off-by: Leon Hwang Link: https://lore.kernel.org/r/20260107022022.12843-6-leon.hwang@linux.dev Signed-off-by: Alexei Starovoitov --- include/linux/bpf-cgroup.h | 4 ++-- include/linux/bpf.h | 1 + 2 files changed, 3 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/include/linux/bpf-cgroup.h b/include/linux/bpf-cgroup.h index d1eb5c7729cb..2f535331f926 100644 --- a/include/linux/bpf-cgroup.h +++ b/include/linux/bpf-cgroup.h @@ -172,7 +172,7 @@ void bpf_cgroup_storage_link(struct bpf_cgroup_storage *storage, void bpf_cgroup_storage_unlink(struct bpf_cgroup_storage *storage); int bpf_cgroup_storage_assign(struct bpf_prog_aux *aux, struct bpf_map *map); -int bpf_percpu_cgroup_storage_copy(struct bpf_map *map, void *key, void *value); +int bpf_percpu_cgroup_storage_copy(struct bpf_map *map, void *key, void *value, u64 flags); int bpf_percpu_cgroup_storage_update(struct bpf_map *map, void *key, void *value, u64 flags); @@ -470,7 +470,7 @@ static inline struct bpf_cgroup_storage *bpf_cgroup_storage_alloc( static inline void bpf_cgroup_storage_free( struct bpf_cgroup_storage *storage) {} static inline int bpf_percpu_cgroup_storage_copy(struct bpf_map *map, void *key, - void *value) { + void *value, u64 flags) { return 0; } static inline int bpf_percpu_cgroup_storage_update(struct bpf_map *map, diff --git a/include/linux/bpf.h b/include/linux/bpf.h index f5c259dcc425..5936f8e2996f 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -3921,6 +3921,7 @@ static inline bool bpf_map_supports_cpu_flags(enum bpf_map_type map_type) case BPF_MAP_TYPE_PERCPU_ARRAY: case BPF_MAP_TYPE_PERCPU_HASH: case BPF_MAP_TYPE_LRU_PERCPU_HASH: + case BPF_MAP_TYPE_PERCPU_CGROUP_STORAGE: return true; default: return false; -- cgit v1.2.3 From 8c3070e159ba00424f0389ead694cacd85af260e Mon Sep 17 00:00:00 2001 From: Donglin Peng Date: Fri, 9 Jan 2026 20:59:58 +0800 Subject: btf: Optimize type lookup with binary search Improve btf_find_by_name_kind() performance by adding binary search support for sorted types. Falls back to linear search for compatibility. Signed-off-by: Donglin Peng Signed-off-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/20260109130003.3313716-7-dolinux.peng@gmail.com --- include/linux/btf.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include') diff --git a/include/linux/btf.h b/include/linux/btf.h index 691f09784933..78dc79810c7d 100644 --- a/include/linux/btf.h +++ b/include/linux/btf.h @@ -219,6 +219,7 @@ bool btf_is_module(const struct btf *btf); bool btf_is_vmlinux(const struct btf *btf); struct module *btf_try_get_module(const struct btf *btf); u32 btf_nr_types(const struct btf *btf); +u32 btf_named_start_id(const struct btf *btf, bool own); struct btf *btf_base_btf(const struct btf *btf); bool btf_type_is_i32(const struct btf_type *t); bool btf_type_is_i64(const struct btf_type *t); -- cgit v1.2.3 From 276f3b6daf6024ae2742afd161e7418a5584a660 Mon Sep 17 00:00:00 2001 From: Jiri Olsa Date: Mon, 12 Jan 2026 13:11:56 +0100 Subject: arm64/ftrace,bpf: Fix partial regs after bpf_prog_run Mahe reported issue with bpf_override_return helper not working when executed from kprobe.multi bpf program on arm. The problem is that on arm we use alternate storage for pt_regs object that is passed to bpf_prog_run and if any register is changed (which is the case of bpf_override_return) it's not propagated back to actual pt_regs object. Fixing this by introducing and calling ftrace_partial_regs_update function to propagate the values of changed registers (ip and stack). Reported-by: Mahe Tardy Signed-off-by: Jiri Olsa Signed-off-by: Andrii Nakryiko Reviewed-by: Steven Rostedt (Google) Acked-by: Will Deacon Link: https://lore.kernel.org/bpf/20260112121157.854473-1-jolsa@kernel.org --- include/linux/ftrace_regs.h | 25 +++++++++++++++++++++++++ 1 file changed, 25 insertions(+) (limited to 'include') diff --git a/include/linux/ftrace_regs.h b/include/linux/ftrace_regs.h index 15627ceea9bc..386fa48c4a95 100644 --- a/include/linux/ftrace_regs.h +++ b/include/linux/ftrace_regs.h @@ -33,6 +33,31 @@ struct ftrace_regs; #define ftrace_regs_get_frame_pointer(fregs) \ frame_pointer(&arch_ftrace_regs(fregs)->regs) +static __always_inline void +ftrace_partial_regs_update(struct ftrace_regs *fregs, struct pt_regs *regs) { } + +#else + +/* + * ftrace_partial_regs_update - update the original ftrace_regs from regs + * @fregs: The ftrace_regs to update from @regs + * @regs: The partial regs from ftrace_partial_regs() that was updated + * + * Some architectures have the partial regs living in the ftrace_regs + * structure, whereas other architectures need to make a different copy + * of the @regs. If a partial @regs is retrieved by ftrace_partial_regs() and + * if the code using @regs updates a field (like the instruction pointer or + * stack pointer) it may need to propagate that change to the original @fregs + * it retrieved the partial @regs from. Use this function to guarantee that + * update happens. + */ +static __always_inline void +ftrace_partial_regs_update(struct ftrace_regs *fregs, struct pt_regs *regs) +{ + ftrace_regs_set_instruction_pointer(fregs, instruction_pointer(regs)); + ftrace_regs_set_return_value(fregs, regs_return_value(regs)); +} + #endif /* HAVE_ARCH_FTRACE_REGS */ /* This can be overridden by the architectures */ -- cgit v1.2.3 From f81c07a6e98e3171d0c4c5ab79f5aeff71b42c44 Mon Sep 17 00:00:00 2001 From: Qiliang Yuan Date: Tue, 20 Jan 2026 10:32:34 +0800 Subject: bpf/verifier: Optimize ID mapping reset in states_equal Currently, reset_idmap_scratch() performs a 4.7KB memset() in every states_equal() call. Optimize this by using a counter to track used ID mappings, replacing the O(N) memset() with an O(1) reset and bounding the search loop in check_ids(). Signed-off-by: Qiliang Yuan Signed-off-by: Andrii Nakryiko Acked-by: Eduard Zingerman Link: https://lore.kernel.org/bpf/20260120023234.77673-1-realwujing@gmail.com --- include/linux/bpf_verifier.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include') diff --git a/include/linux/bpf_verifier.h b/include/linux/bpf_verifier.h index 130bcbd66f60..8355b585cd18 100644 --- a/include/linux/bpf_verifier.h +++ b/include/linux/bpf_verifier.h @@ -692,6 +692,7 @@ struct bpf_id_pair { struct bpf_idmap { u32 tmp_id_gen; + u32 cnt; struct bpf_id_pair map[BPF_ID_MAP_SIZE]; }; -- cgit v1.2.3 From ea073d1818e228440275cc90047b4ef0fddd6eb5 Mon Sep 17 00:00:00 2001 From: Ihor Solodrai Date: Tue, 20 Jan 2026 14:26:26 -0800 Subject: bpf: Refactor btf_kfunc_id_set_contains btf_kfunc_id_set_contains() is called by fetch_kfunc_meta() in the BPF verifier to get the kfunc flags stored in the .BTF_ids ELF section. If it returns NULL instead of a valid pointer, it's interpreted as an illegal kfunc usage failing the verification. There are two potential reasons for btf_kfunc_id_set_contains() to return NULL: 1. Provided kfunc BTF id is not present in relevant kfunc id sets. 2. The kfunc is not allowed, as determined by the program type specific filter [1]. The filter functions accept a pointer to `struct bpf_prog`, so they might implicitly depend on earlier stages of verification, when bpf_prog members are set. For example, bpf_qdisc_kfunc_filter() in linux/net/sched/bpf_qdisc.c inspects prog->aux->st_ops [2], which is initialized in: check_attach_btf_id() -> check_struct_ops_btf_id() So far this hasn't been an issue, because fetch_kfunc_meta() is the only caller of btf_kfunc_id_set_contains(). However in subsequent patches of this series it is necessary to inspect kfunc flags earlier in BPF verifier, in the add_kfunc_call(). To resolve this, refactor btf_kfunc_id_set_contains() into two interface functions: * btf_kfunc_flags() that simply returns pointer to kfunc_flags without applying the filters * btf_kfunc_is_allowed() that both checks for kfunc_flags existence (which is a requirement for a kfunc to be allowed) and applies the prog filters See [3] for the previous version of this patch. [1] https://lore.kernel.org/all/20230519225157.760788-7-aditi.ghag@isovalent.com/ [2] https://lore.kernel.org/all/20250409214606.2000194-4-ameryhung@gmail.com/ [3] https://lore.kernel.org/bpf/20251029190113.3323406-3-ihor.solodrai@linux.dev/ Reviewed-by: Eduard Zingerman Signed-off-by: Ihor Solodrai Link: https://lore.kernel.org/r/20260120222638.3976562-2-ihor.solodrai@linux.dev Signed-off-by: Alexei Starovoitov --- include/linux/btf.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/include/linux/btf.h b/include/linux/btf.h index 78dc79810c7d..a2f4f383f5b6 100644 --- a/include/linux/btf.h +++ b/include/linux/btf.h @@ -575,8 +575,8 @@ const char *btf_name_by_offset(const struct btf *btf, u32 offset); const char *btf_str_by_offset(const struct btf *btf, u32 offset); struct btf *btf_parse_vmlinux(void); struct btf *bpf_prog_get_target_btf(const struct bpf_prog *prog); -u32 *btf_kfunc_id_set_contains(const struct btf *btf, u32 kfunc_btf_id, - const struct bpf_prog *prog); +u32 *btf_kfunc_flags(const struct btf *btf, u32 kfunc_btf_id, const struct bpf_prog *prog); +bool btf_kfunc_is_allowed(const struct btf *btf, u32 kfunc_btf_id, const struct bpf_prog *prog); u32 *btf_kfunc_is_modify_return(const struct btf *btf, u32 kfunc_btf_id, const struct bpf_prog *prog); int register_btf_kfunc_id_set(enum bpf_prog_type prog_type, -- cgit v1.2.3 From 64e1360524b9ef5835714669b5876e122a23e6fc Mon Sep 17 00:00:00 2001 From: Ihor Solodrai Date: Tue, 20 Jan 2026 14:26:28 -0800 Subject: bpf: Verifier support for KF_IMPLICIT_ARGS A kernel function bpf_foo marked with KF_IMPLICIT_ARGS flag is expected to have two associated types in BTF: * `bpf_foo` with a function prototype that omits implicit arguments * `bpf_foo_impl` with a function prototype that matches the kernel declaration of `bpf_foo`, but doesn't have a ksym associated with its name In order to support kfuncs with implicit arguments, the verifier has to know how to resolve a call of `bpf_foo` to the correct BTF function prototype and address. To implement this, in add_kfunc_call() kfunc flags are checked for KF_IMPLICIT_ARGS. For such kfuncs a BTF func prototype is adjusted to the one found for `bpf_foo_impl` (func_name + "_impl" suffix, by convention) function in BTF. This effectively changes the signature of the `bpf_foo` kfunc in the context of verification: from one without implicit args to the one with full argument list. The values of implicit arguments by design are provided by the verifier, and so they can only be of particular types. In this patch the only allowed implicit arg type is a pointer to struct bpf_prog_aux. In order for the verifier to correctly set an implicit bpf_prog_aux arg value at runtime, is_kfunc_arg_prog() is extended to check for the arg type. At a point when prog arg is determined in check_kfunc_args() the kfunc with implicit args already has a prototype with full argument list, so the existing value patch mechanism just works. If a new kfunc with KF_IMPLICIT_ARG is declared for an existing kfunc that uses a __prog argument (a legacy case), the prototype substitution works in exactly the same way, assuming the kfunc follows the _impl naming convention. The difference is only in how _impl prototype is added to the BTF, which is not the verifier's concern. See a subsequent resolve_btfids patch for details. __prog suffix is still supported at this point, but will be removed in a subsequent patch, after current users are moved to KF_IMPLICIT_ARGS. Introduction of KF_IMPLICIT_ARGS revealed an issue with zero-extension tracking, because an explicit rX = 0 in place of the verifier-supplied argument is now absent if the arg is implicit (the BPF prog doesn't pass a dummy NULL anymore). To mitigate this, reset the subreg_def of all caller saved registers in check_kfunc_call() [1]. [1] https://lore.kernel.org/bpf/b4a760ef828d40dac7ea6074d39452bb0dc82caa.camel@gmail.com/ Acked-by: Eduard Zingerman Signed-off-by: Ihor Solodrai Link: https://lore.kernel.org/r/20260120222638.3976562-4-ihor.solodrai@linux.dev Signed-off-by: Alexei Starovoitov --- include/linux/btf.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include') diff --git a/include/linux/btf.h b/include/linux/btf.h index a2f4f383f5b6..48108471c5b1 100644 --- a/include/linux/btf.h +++ b/include/linux/btf.h @@ -78,6 +78,7 @@ #define KF_ARENA_RET (1 << 13) /* kfunc returns an arena pointer */ #define KF_ARENA_ARG1 (1 << 14) /* kfunc takes an arena pointer as its first argument */ #define KF_ARENA_ARG2 (1 << 15) /* kfunc takes an arena pointer as its second argument */ +#define KF_IMPLICIT_ARGS (1 << 16) /* kfunc has implicit arguments supplied by the verifier */ /* * Tag marking a kernel function as a kfunc. This is meant to minimize the -- cgit v1.2.3 From 82f3b142c99cf44c7b1e70b7720169c646b9760f Mon Sep 17 00:00:00 2001 From: Kumar Kartikeya Dwivedi Date: Thu, 22 Jan 2026 03:59:11 -0800 Subject: rqspinlock: Fix TAS fallback lock entry creation The TAS fallback can be invoked directly when queued spin locks are disabled, and through the slow path when paravirt is enabled for queued spin locks. In the latter case, the res_spin_lock macro will attempt the fast path and already hold the entry when entering the slow path. This will lead to creation of extraneous entries that are not released, which may cause false positives for deadlock detection. Fix this by always preceding invocation of the TAS fallback in every case with the grabbing of the held lock entry, and add a comment to make note of this. Fixes: c9102a68c070 ("rqspinlock: Add a test-and-set fallback") Reported-by: Amery Hung Signed-off-by: Kumar Kartikeya Dwivedi Tested-by: Amery Hung Link: https://lore.kernel.org/r/20260122115911.3668985-1-memxor@gmail.com Signed-off-by: Alexei Starovoitov --- include/asm-generic/rqspinlock.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include') diff --git a/include/asm-generic/rqspinlock.h b/include/asm-generic/rqspinlock.h index 0f2dcbbfee2f..5c5cf2f7fc39 100644 --- a/include/asm-generic/rqspinlock.h +++ b/include/asm-generic/rqspinlock.h @@ -191,7 +191,7 @@ static __always_inline int res_spin_lock(rqspinlock_t *lock) #else -#define res_spin_lock(lock) resilient_tas_spin_lock(lock) +#define res_spin_lock(lock) ({ grab_held_lock_entry(lock); resilient_tas_spin_lock(lock); }) #endif /* CONFIG_QUEUED_SPINLOCKS */ -- cgit v1.2.3 From 2d419c44658f75e7655794341a95c0687830f3df Mon Sep 17 00:00:00 2001 From: Menglong Dong Date: Sat, 24 Jan 2026 14:19:56 +0800 Subject: bpf: add fsession support The fsession is something that similar to kprobe session. It allow to attach a single BPF program to both the entry and the exit of the target functions. Introduce the struct bpf_fsession_link, which allows to add the link to both the fentry and fexit progs_hlist of the trampoline. Signed-off-by: Menglong Dong Co-developed-by: Leon Hwang Signed-off-by: Leon Hwang Link: https://lore.kernel.org/r/20260124062008.8657-2-dongml2@chinatelecom.cn Signed-off-by: Alexei Starovoitov --- include/linux/bpf.h | 19 +++++++++++++++++++ include/uapi/linux/bpf.h | 1 + 2 files changed, 20 insertions(+) (limited to 'include') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 5936f8e2996f..41228b0add52 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -1309,6 +1309,7 @@ enum bpf_tramp_prog_type { BPF_TRAMP_MODIFY_RETURN, BPF_TRAMP_MAX, BPF_TRAMP_REPLACE, /* more than MAX */ + BPF_TRAMP_FSESSION, }; struct bpf_tramp_image { @@ -1875,6 +1876,11 @@ struct bpf_tracing_link { struct bpf_prog *tgt_prog; }; +struct bpf_fsession_link { + struct bpf_tracing_link link; + struct bpf_tramp_link fexit; +}; + struct bpf_raw_tp_link { struct bpf_link link; struct bpf_raw_event_map *btp; @@ -2169,6 +2175,19 @@ static inline void bpf_struct_ops_desc_release(struct bpf_struct_ops_desc *st_op #endif +static inline int bpf_fsession_cnt(struct bpf_tramp_links *links) +{ + struct bpf_tramp_links fentries = links[BPF_TRAMP_FENTRY]; + int cnt = 0; + + for (int i = 0; i < links[BPF_TRAMP_FENTRY].nr_links; i++) { + if (fentries.links[i]->link.prog->expected_attach_type == BPF_TRACE_FSESSION) + cnt++; + } + + return cnt; +} + int bpf_prog_ctx_arg_info_init(struct bpf_prog *prog, const struct bpf_ctx_arg_aux *info, u32 cnt); diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index 2a2ade4be60f..44e7dbc278e3 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -1145,6 +1145,7 @@ enum bpf_attach_type { BPF_NETKIT_PEER, BPF_TRACE_KPROBE_SESSION, BPF_TRACE_UPROBE_SESSION, + BPF_TRACE_FSESSION, __MAX_BPF_ATTACH_TYPE }; -- cgit v1.2.3 From 27d89baa6da8e5e546585c53a959176d1302d46e Mon Sep 17 00:00:00 2001 From: Menglong Dong Date: Sat, 24 Jan 2026 14:19:59 +0800 Subject: bpf: support fsession for bpf_session_is_return If fsession exists, we will use the bit (1 << BPF_TRAMP_IS_RETURN_SHIFT) in ((u64 *)ctx)[-1] to store the "is_return" flag. The logic of bpf_session_is_return() for fsession is implemented in the verifier by inline following code: bool bpf_session_is_return(void *ctx) { return (((u64 *)ctx)[-1] >> BPF_TRAMP_IS_RETURN_SHIFT) & 1; } Signed-off-by: Menglong Dong Co-developed-by: Leon Hwang Signed-off-by: Leon Hwang Link: https://lore.kernel.org/r/20260124062008.8657-5-dongml2@chinatelecom.cn Signed-off-by: Alexei Starovoitov --- include/linux/bpf.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 41228b0add52..29eecd79352e 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -1229,6 +1229,8 @@ enum { #endif }; +#define BPF_TRAMP_IS_RETURN_SHIFT 63 + struct bpf_tramp_links { struct bpf_tramp_link *links[BPF_MAX_TRAMP_LINKS]; int nr_links; -- cgit v1.2.3 From eeee4239dbb155a98f8ba737324ac081acde8417 Mon Sep 17 00:00:00 2001 From: Menglong Dong Date: Sat, 24 Jan 2026 14:20:00 +0800 Subject: bpf: support fsession for bpf_session_cookie Implement session cookie for fsession. The session cookies will be stored in the stack, and the layout of the stack will look like this: return value -> 8 bytes argN -> 8 bytes ... arg1 -> 8 bytes nr_args -> 8 bytes ip (optional) -> 8 bytes cookie2 -> 8 bytes cookie1 -> 8 bytes The offset of the cookie for the current bpf program, which is in 8-byte units, is stored in the "(((u64 *)ctx)[-1] >> BPF_TRAMP_COOKIE_INDEX_SHIFT) & 0xFF". Therefore, we can get the session cookie with ((u64 *)ctx)[-offset]. Implement and inline the bpf_session_cookie() for the fsession in the verifier. Signed-off-by: Menglong Dong Link: https://lore.kernel.org/r/20260124062008.8657-6-dongml2@chinatelecom.cn Signed-off-by: Alexei Starovoitov --- include/linux/bpf.h | 15 +++++++++++++++ 1 file changed, 15 insertions(+) (limited to 'include') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 29eecd79352e..4427c6e98331 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -1229,6 +1229,7 @@ enum { #endif }; +#define BPF_TRAMP_COOKIE_INDEX_SHIFT 8 #define BPF_TRAMP_IS_RETURN_SHIFT 63 struct bpf_tramp_links { @@ -1782,6 +1783,7 @@ struct bpf_prog { enforce_expected_attach_type:1, /* Enforce expected_attach_type checking at attach time */ call_get_stack:1, /* Do we call bpf_get_stack() or bpf_get_stackid() */ call_get_func_ip:1, /* Do we call get_func_ip() */ + call_session_cookie:1, /* Do we call bpf_session_cookie() */ tstamp_type_access:1, /* Accessed __sk_buff->tstamp_type */ sleepable:1; /* BPF program is sleepable */ enum bpf_prog_type type; /* Type of BPF program */ @@ -2190,6 +2192,19 @@ static inline int bpf_fsession_cnt(struct bpf_tramp_links *links) return cnt; } +static inline int bpf_fsession_cookie_cnt(struct bpf_tramp_links *links) +{ + struct bpf_tramp_links fentries = links[BPF_TRAMP_FENTRY]; + int cnt = 0; + + for (int i = 0; i < links[BPF_TRAMP_FENTRY].nr_links; i++) { + if (fentries.links[i]->link.prog->call_session_cookie) + cnt++; + } + + return cnt; +} + int bpf_prog_ctx_arg_info_init(struct bpf_prog *prog, const struct bpf_ctx_arg_aux *info, u32 cnt); -- cgit v1.2.3 From 752b807028e63f1473b84eb1350e131eca5e5249 Mon Sep 17 00:00:00 2001 From: Matt Bobrowski Date: Tue, 27 Jan 2026 08:51:10 +0000 Subject: bpf: add new BPF_CGROUP_ITER_CHILDREN control option Currently, the BPF cgroup iterator supports walking descendants in either pre-order (BPF_CGROUP_ITER_DESCENDANTS_PRE) or post-order (BPF_CGROUP_ITER_DESCENDANTS_POST). These modes perform an exhaustive depth-first search (DFS) of the hierarchy. In scenarios where a BPF program may need to inspect only the direct children of a given parent cgroup, a full DFS is unnecessarily expensive. This patch introduces a new BPF cgroup iterator control option, BPF_CGROUP_ITER_CHILDREN. This control option restricts the traversal to the immediate children of a specified parent cgroup, allowing for more targeted and efficient iteration, particularly when exhaustive depth-first search (DFS) traversal is not required. Signed-off-by: Matt Bobrowski Link: https://lore.kernel.org/r/20260127085112.3608687-1-mattbobrowski@google.com Signed-off-by: Alexei Starovoitov --- include/uapi/linux/bpf.h | 8 ++++++++ 1 file changed, 8 insertions(+) (limited to 'include') diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index 44e7dbc278e3..c8d400b7680a 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -119,6 +119,14 @@ enum bpf_cgroup_iter_order { BPF_CGROUP_ITER_DESCENDANTS_PRE, /* walk descendants in pre-order. */ BPF_CGROUP_ITER_DESCENDANTS_POST, /* walk descendants in post-order. */ BPF_CGROUP_ITER_ANCESTORS_UP, /* walk ancestors upward. */ + /* + * Walks the immediate children of the specified parent + * cgroup_subsys_state. Unlike BPF_CGROUP_ITER_DESCENDANTS_PRE, + * BPF_CGROUP_ITER_DESCENDANTS_POST, and BPF_CGROUP_ITER_ANCESTORS_UP + * the iterator does not include the specified parent as one of the + * returned iterator elements. + */ + BPF_CGROUP_ITER_CHILDREN, }; union bpf_iter_link_info { -- cgit v1.2.3 From b40cc5adaa80e1471095a62d78233b611d7a558c Mon Sep 17 00:00:00 2001 From: Jiayuan Chen Date: Sat, 24 Jan 2026 19:32:43 +0800 Subject: bpf, sockmap: Fix incorrect copied_seq calculation A socket using sockmap has its own independent receive queue: ingress_msg. This queue may contain data from its own protocol stack or from other sockets. The issue is that when reading from ingress_msg, we update tp->copied_seq by default. However, if the data is not from its own protocol stack, tcp->rcv_nxt is not increased. Later, if we convert this socket to a native socket, reading from this socket may fail because copied_seq might be significantly larger than rcv_nxt. This fix also addresses the syzkaller-reported bug referenced in the Closes tag. This patch marks the skmsg objects in ingress_msg. When reading, we update copied_seq only if the data is from its own protocol stack. FD1:read() -- FD1->copied_seq++ | [read data] | [enqueue data] v [sockmap] -> ingress to self -> ingress_msg queue FD1 native stack ------> ^ -- FD1->rcv_nxt++ -> redirect to other | [enqueue data] | | | ingress to FD1 v ^ ... | [sockmap] FD2 native stack Closes: https://syzkaller.appspot.com/bug?extid=06dbd397158ec0ea4983 Fixes: 04919bed948dc ("tcp: Introduce tcp_read_skb()") Reviewed-by: Jakub Sitnicki Reviewed-by: John Fastabend Signed-off-by: Jiayuan Chen Link: https://lore.kernel.org/r/20260124113314.113584-2-jiayuan.chen@linux.dev Signed-off-by: Alexei Starovoitov --- include/linux/skmsg.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include') diff --git a/include/linux/skmsg.h b/include/linux/skmsg.h index 49847888c287..dfdc158ab88c 100644 --- a/include/linux/skmsg.h +++ b/include/linux/skmsg.h @@ -141,6 +141,8 @@ int sk_msg_memcopy_from_iter(struct sock *sk, struct iov_iter *from, struct sk_msg *msg, u32 bytes); int sk_msg_recvmsg(struct sock *sk, struct sk_psock *psock, struct msghdr *msg, int len, int flags); +int __sk_msg_recvmsg(struct sock *sk, struct sk_psock *psock, struct msghdr *msg, + int len, int flags, int *copied_from_self); bool sk_msg_is_readable(struct sock *sk); static inline void sk_msg_check_to_free(struct sk_msg *msg, u32 i, u32 bytes) -- cgit v1.2.3 From 929e30f9312514902133c45e51c79088421ab084 Mon Sep 17 00:00:00 2001 From: Jiayuan Chen Date: Sat, 24 Jan 2026 19:32:44 +0800 Subject: bpf, sockmap: Fix FIONREAD for sockmap A socket using sockmap has its own independent receive queue: ingress_msg. This queue may contain data from its own protocol stack or from other sockets. Therefore, for sockmap, relying solely on copied_seq and rcv_nxt to calculate FIONREAD is not enough. This patch adds a new msg_tot_len field in the psock structure to record the data length in ingress_msg. Additionally, we implement new ioctl interfaces for TCP and UDP to intercept FIONREAD operations. Note that we intentionally do not include sk_receive_queue data in the FIONREAD result. Data in sk_receive_queue has not yet been processed by the BPF verdict program, and may be redirected to other sockets or dropped. Including it would create semantic ambiguity since this data may never be readable by the user. Unix and VSOCK sockets have similar issues, but fixing them is outside the scope of this patch as it would require more intrusive changes. Previous work by John Fastabend made some efforts towards FIONREAD support: commit e5c6de5fa025 ("bpf, sockmap: Incorrectly handling copied_seq") Although the current patch is based on the previous work by John Fastabend, it is acceptable for our Fixes tag to point to the same commit. FD1:read() -- FD1->copied_seq++ | [read data] | [enqueue data] v [sockmap] -> ingress to self -> ingress_msg queue FD1 native stack ------> ^ -- FD1->rcv_nxt++ -> redirect to other | [enqueue data] | | | ingress to FD1 v ^ ... | [sockmap] FD2 native stack Fixes: 04919bed948dc ("tcp: Introduce tcp_read_skb()") Signed-off-by: Jiayuan Chen Reviewed-by: Jakub Sitnicki Link: https://lore.kernel.org/r/20260124113314.113584-3-jiayuan.chen@linux.dev Signed-off-by: Alexei Starovoitov --- include/linux/skmsg.h | 68 +++++++++++++++++++++++++++++++++++++++++++++++++-- 1 file changed, 66 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/include/linux/skmsg.h b/include/linux/skmsg.h index dfdc158ab88c..829b281d6c9c 100644 --- a/include/linux/skmsg.h +++ b/include/linux/skmsg.h @@ -97,6 +97,8 @@ struct sk_psock { struct sk_buff_head ingress_skb; struct list_head ingress_msg; spinlock_t ingress_lock; + /** @msg_tot_len: Total bytes queued in ingress_msg list. */ + u32 msg_tot_len; unsigned long state; struct list_head link; spinlock_t link_lock; @@ -321,6 +323,27 @@ static inline void sock_drop(struct sock *sk, struct sk_buff *skb) kfree_skb(skb); } +static inline u32 sk_psock_get_msg_len_nolock(struct sk_psock *psock) +{ + /* Used by ioctl to read msg_tot_len only; lock-free for performance */ + return READ_ONCE(psock->msg_tot_len); +} + +static inline void sk_psock_msg_len_add_locked(struct sk_psock *psock, int diff) +{ + /* Use WRITE_ONCE to ensure correct read in sk_psock_get_msg_len_nolock(). + * ingress_lock should be held to prevent concurrent updates to msg_tot_len + */ + WRITE_ONCE(psock->msg_tot_len, psock->msg_tot_len + diff); +} + +static inline void sk_psock_msg_len_add(struct sk_psock *psock, int diff) +{ + spin_lock_bh(&psock->ingress_lock); + sk_psock_msg_len_add_locked(psock, diff); + spin_unlock_bh(&psock->ingress_lock); +} + static inline bool sk_psock_queue_msg(struct sk_psock *psock, struct sk_msg *msg) { @@ -329,6 +352,7 @@ static inline bool sk_psock_queue_msg(struct sk_psock *psock, spin_lock_bh(&psock->ingress_lock); if (sk_psock_test_state(psock, SK_PSOCK_TX_ENABLED)) { list_add_tail(&msg->list, &psock->ingress_msg); + sk_psock_msg_len_add_locked(psock, msg->sg.size); ret = true; } else { sk_msg_free(psock->sk, msg); @@ -345,18 +369,25 @@ static inline struct sk_msg *sk_psock_dequeue_msg(struct sk_psock *psock) spin_lock_bh(&psock->ingress_lock); msg = list_first_entry_or_null(&psock->ingress_msg, struct sk_msg, list); - if (msg) + if (msg) { list_del(&msg->list); + sk_psock_msg_len_add_locked(psock, -msg->sg.size); + } spin_unlock_bh(&psock->ingress_lock); return msg; } +static inline struct sk_msg *sk_psock_peek_msg_locked(struct sk_psock *psock) +{ + return list_first_entry_or_null(&psock->ingress_msg, struct sk_msg, list); +} + static inline struct sk_msg *sk_psock_peek_msg(struct sk_psock *psock) { struct sk_msg *msg; spin_lock_bh(&psock->ingress_lock); - msg = list_first_entry_or_null(&psock->ingress_msg, struct sk_msg, list); + msg = sk_psock_peek_msg_locked(psock); spin_unlock_bh(&psock->ingress_lock); return msg; } @@ -523,6 +554,39 @@ static inline bool sk_psock_strp_enabled(struct sk_psock *psock) return !!psock->saved_data_ready; } +/* for tcp only, sk is locked */ +static inline ssize_t sk_psock_msg_inq(struct sock *sk) +{ + struct sk_psock *psock; + ssize_t inq = 0; + + psock = sk_psock_get(sk); + if (likely(psock)) { + inq = sk_psock_get_msg_len_nolock(psock); + sk_psock_put(sk, psock); + } + return inq; +} + +/* for udp only, sk is not locked */ +static inline ssize_t sk_msg_first_len(struct sock *sk) +{ + struct sk_psock *psock; + struct sk_msg *msg; + ssize_t inq = 0; + + psock = sk_psock_get(sk); + if (likely(psock)) { + spin_lock_bh(&psock->ingress_lock); + msg = sk_psock_peek_msg_locked(psock); + if (msg) + inq = msg->sg.size; + spin_unlock_bh(&psock->ingress_lock); + sk_psock_put(sk, psock); + } + return inq; +} + #if IS_ENABLED(CONFIG_NET_SOCK_MSG) #define BPF_F_STRPARSER (1UL << 1) -- cgit v1.2.3 From ae23bc81ddf7c17b663c4ed1b21e35527b0a7131 Mon Sep 17 00:00:00 2001 From: Guillaume Gonnet Date: Tue, 27 Jan 2026 17:02:00 +0100 Subject: bpf: Fix tcx/netkit detach permissions when prog fd isn't given This commit fixes a security issue where BPF_PROG_DETACH on tcx or netkit devices could be executed by any user when no program fd was provided, bypassing permission checks. The fix adds a capability check for CAP_NET_ADMIN or CAP_SYS_ADMIN in this case. Fixes: e420bed02507 ("bpf: Add fd-based tcx multi-prog infra with link support") Signed-off-by: Guillaume Gonnet Link: https://lore.kernel.org/r/20260127160200.10395-1-ggonnet.linux@gmail.com Signed-off-by: Alexei Starovoitov --- include/linux/bpf.h | 5 +++++ include/linux/bpf_mprog.h | 10 ++++++++++ 2 files changed, 15 insertions(+) (limited to 'include') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 4427c6e98331..9272a237cced 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -3362,6 +3362,11 @@ static inline void bpf_prog_report_arena_violation(bool write, unsigned long add } #endif /* CONFIG_BPF_SYSCALL */ +static inline bool bpf_net_capable(void) +{ + return capable(CAP_NET_ADMIN) || capable(CAP_SYS_ADMIN); +} + static __always_inline int bpf_probe_read_kernel_common(void *dst, u32 size, const void *unsafe_ptr) { diff --git a/include/linux/bpf_mprog.h b/include/linux/bpf_mprog.h index 929225f7b095..0b9f4caeeb0a 100644 --- a/include/linux/bpf_mprog.h +++ b/include/linux/bpf_mprog.h @@ -340,4 +340,14 @@ static inline bool bpf_mprog_supported(enum bpf_prog_type type) return false; } } + +static inline bool bpf_mprog_detach_empty(enum bpf_prog_type type) +{ + switch (type) { + case BPF_PROG_TYPE_SCHED_CLS: + return bpf_net_capable(); + default: + return false; + } +} #endif /* __BPF_MPROG_H */ -- cgit v1.2.3 From 4be42c92220128b3128854a2d6b0f0ad0bcedbdb Mon Sep 17 00:00:00 2001 From: Jiri Olsa Date: Tue, 30 Dec 2025 15:50:02 +0100 Subject: ftrace,bpf: Remove FTRACE_OPS_FL_JMP ftrace_ops flag MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit At the moment the we allow the jmp attach only for ftrace_ops that has FTRACE_OPS_FL_JMP set. This conflicts with following changes where we use single ftrace_ops object for all direct call sites, so all could be be attached via just call or jmp. We already limit the jmp attach support with config option and bit (LSB) set on the trampoline address. It turns out that's actually enough to limit the jmp attach for architecture and only for chosen addresses (with LSB bit set). Each user of register_ftrace_direct or modify_ftrace_direct can set the trampoline bit (LSB) to indicate it has to be attached by jmp. The bpf trampoline generation code uses trampoline flags to generate jmp-attach specific code and ftrace inner code uses the trampoline bit (LSB) to handle return from jmp attachment, so there's no harm to remove the FTRACE_OPS_FL_JMP bit. The fexit/fmodret performance stays the same (did not drop), current code: fentry : 77.904 ± 0.546M/s fexit : 62.430 ± 0.554M/s fmodret : 66.503 ± 0.902M/s with this change: fentry : 80.472 ± 0.061M/s fexit : 63.995 ± 0.127M/s fmodret : 67.362 ± 0.175M/s Fixes: 25e4e3565d45 ("ftrace: Introduce FTRACE_OPS_FL_JMP") Signed-off-by: Jiri Olsa Signed-off-by: Andrii Nakryiko Acked-by: Andrii Nakryiko Reviewed-by: Steven Rostedt (Google) Link: https://lore.kernel.org/bpf/20251230145010.103439-2-jolsa@kernel.org --- include/linux/ftrace.h | 1 - 1 file changed, 1 deletion(-) (limited to 'include') diff --git a/include/linux/ftrace.h b/include/linux/ftrace.h index a3a8989e3268..cc869c59c1a6 100644 --- a/include/linux/ftrace.h +++ b/include/linux/ftrace.h @@ -359,7 +359,6 @@ enum { FTRACE_OPS_FL_DIRECT = BIT(17), FTRACE_OPS_FL_SUBOP = BIT(18), FTRACE_OPS_FL_GRAPH = BIT(19), - FTRACE_OPS_FL_JMP = BIT(20), }; #ifndef CONFIG_DYNAMIC_FTRACE_WITH_ARGS -- cgit v1.2.3 From 0e860d07c29d70205d5ad48456ac7a133ccfb293 Mon Sep 17 00:00:00 2001 From: Jiri Olsa Date: Tue, 30 Dec 2025 15:50:04 +0100 Subject: ftrace: Export some of hash related functions We are going to use these functions in following changes. Signed-off-by: Jiri Olsa Signed-off-by: Andrii Nakryiko Reviewed-by: Steven Rostedt (Google) Link: https://lore.kernel.org/bpf/20251230145010.103439-4-jolsa@kernel.org --- include/linux/ftrace.h | 9 +++++++++ 1 file changed, 9 insertions(+) (limited to 'include') diff --git a/include/linux/ftrace.h b/include/linux/ftrace.h index cc869c59c1a6..a77b57a4aba2 100644 --- a/include/linux/ftrace.h +++ b/include/linux/ftrace.h @@ -82,6 +82,7 @@ static inline void early_trace_init(void) { } struct module; struct ftrace_hash; +struct ftrace_func_entry; #if defined(CONFIG_FUNCTION_TRACER) && defined(CONFIG_MODULES) && \ defined(CONFIG_DYNAMIC_FTRACE) @@ -405,6 +406,14 @@ enum ftrace_ops_cmd { typedef int (*ftrace_ops_func_t)(struct ftrace_ops *op, enum ftrace_ops_cmd cmd); #ifdef CONFIG_DYNAMIC_FTRACE + +#define FTRACE_HASH_DEFAULT_BITS 10 + +struct ftrace_hash *alloc_ftrace_hash(int size_bits); +void free_ftrace_hash(struct ftrace_hash *hash); +struct ftrace_func_entry *add_ftrace_hash_entry_direct(struct ftrace_hash *hash, + unsigned long ip, unsigned long direct); + /* The hash used to know what functions callbacks trace */ struct ftrace_ops_hash { struct ftrace_hash __rcu *notrace_hash; -- cgit v1.2.3 From 05dc5e9c1fe156fd9dddc4c2f81e8fc6c7e50eb5 Mon Sep 17 00:00:00 2001 From: Jiri Olsa Date: Tue, 30 Dec 2025 15:50:05 +0100 Subject: ftrace: Add update_ftrace_direct_add function Adding update_ftrace_direct_add function that adds all entries (ip -> addr) provided in hash argument to direct ftrace ops and updates its attachments. The difference to current register_ftrace_direct is - hash argument that allows to register multiple ip -> direct entries at once - we can call update_ftrace_direct_add multiple times on the same ftrace_ops object, becase after first registration with register_ftrace_function_nolock, it uses ftrace_update_ops to update the ftrace_ops object This change will allow us to have simple ftrace_ops for all bpf direct interface users in following changes. Signed-off-by: Jiri Olsa Signed-off-by: Andrii Nakryiko Reviewed-by: Steven Rostedt (Google) Link: https://lore.kernel.org/bpf/20251230145010.103439-5-jolsa@kernel.org --- include/linux/ftrace.h | 7 +++++++ 1 file changed, 7 insertions(+) (limited to 'include') diff --git a/include/linux/ftrace.h b/include/linux/ftrace.h index a77b57a4aba2..43a6cfaf5aae 100644 --- a/include/linux/ftrace.h +++ b/include/linux/ftrace.h @@ -543,6 +543,8 @@ int unregister_ftrace_direct(struct ftrace_ops *ops, unsigned long addr, int modify_ftrace_direct(struct ftrace_ops *ops, unsigned long addr); int modify_ftrace_direct_nolock(struct ftrace_ops *ops, unsigned long addr); +int update_ftrace_direct_add(struct ftrace_ops *ops, struct ftrace_hash *hash); + void ftrace_stub_direct_tramp(void); #else @@ -569,6 +571,11 @@ static inline int modify_ftrace_direct_nolock(struct ftrace_ops *ops, unsigned l return -ENODEV; } +static inline int update_ftrace_direct_add(struct ftrace_ops *ops, struct ftrace_hash *hash) +{ + return -ENODEV; +} + /* * This must be implemented by the architecture. * It is the way the ftrace direct_ops helper, when called -- cgit v1.2.3 From 8d2c1233f37149e4d1223d06ca7054a7f88c0a24 Mon Sep 17 00:00:00 2001 From: Jiri Olsa Date: Tue, 30 Dec 2025 15:50:06 +0100 Subject: ftrace: Add update_ftrace_direct_del function Adding update_ftrace_direct_del function that removes all entries (ip -> addr) provided in hash argument to direct ftrace ops and updates its attachments. The difference to current unregister_ftrace_direct is - hash argument that allows to unregister multiple ip -> direct entries at once - we can call update_ftrace_direct_del multiple times on the same ftrace_ops object, becase we do not need to unregister all entries at once, we can do it gradualy with the help of ftrace_update_ops function This change will allow us to have simple ftrace_ops for all bpf direct interface users in following changes. Signed-off-by: Jiri Olsa Signed-off-by: Andrii Nakryiko Reviewed-by: Steven Rostedt (Google) Link: https://lore.kernel.org/bpf/20251230145010.103439-6-jolsa@kernel.org --- include/linux/ftrace.h | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'include') diff --git a/include/linux/ftrace.h b/include/linux/ftrace.h index 43a6cfaf5aae..cfb957731433 100644 --- a/include/linux/ftrace.h +++ b/include/linux/ftrace.h @@ -544,6 +544,7 @@ int modify_ftrace_direct(struct ftrace_ops *ops, unsigned long addr); int modify_ftrace_direct_nolock(struct ftrace_ops *ops, unsigned long addr); int update_ftrace_direct_add(struct ftrace_ops *ops, struct ftrace_hash *hash); +int update_ftrace_direct_del(struct ftrace_ops *ops, struct ftrace_hash *hash); void ftrace_stub_direct_tramp(void); @@ -576,6 +577,11 @@ static inline int update_ftrace_direct_add(struct ftrace_ops *ops, struct ftrace return -ENODEV; } +static inline int update_ftrace_direct_del(struct ftrace_ops *ops, struct ftrace_hash *hash) +{ + return -ENODEV; +} + /* * This must be implemented by the architecture. * It is the way the ftrace direct_ops helper, when called -- cgit v1.2.3 From e93672f770d72f4c33d1dd9fb0633f95948c0cc9 Mon Sep 17 00:00:00 2001 From: Jiri Olsa Date: Tue, 30 Dec 2025 15:50:07 +0100 Subject: ftrace: Add update_ftrace_direct_mod function Adding update_ftrace_direct_mod function that modifies all entries (ip -> direct) provided in hash argument to direct ftrace ops and updates its attachments. The difference to current modify_ftrace_direct is: - hash argument that allows to modify multiple ip -> direct entries at once This change will allow us to have simple ftrace_ops for all bpf direct interface users in following changes. Signed-off-by: Jiri Olsa Signed-off-by: Andrii Nakryiko Reviewed-by: Steven Rostedt (Google) Link: https://lore.kernel.org/bpf/20251230145010.103439-7-jolsa@kernel.org --- include/linux/ftrace.h | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'include') diff --git a/include/linux/ftrace.h b/include/linux/ftrace.h index cfb957731433..b8461e541b9d 100644 --- a/include/linux/ftrace.h +++ b/include/linux/ftrace.h @@ -545,6 +545,7 @@ int modify_ftrace_direct_nolock(struct ftrace_ops *ops, unsigned long addr); int update_ftrace_direct_add(struct ftrace_ops *ops, struct ftrace_hash *hash); int update_ftrace_direct_del(struct ftrace_ops *ops, struct ftrace_hash *hash); +int update_ftrace_direct_mod(struct ftrace_ops *ops, struct ftrace_hash *hash, bool do_direct_lock); void ftrace_stub_direct_tramp(void); @@ -582,6 +583,11 @@ static inline int update_ftrace_direct_del(struct ftrace_ops *ops, struct ftrace return -ENODEV; } +static inline int update_ftrace_direct_mod(struct ftrace_ops *ops, struct ftrace_hash *hash, bool do_direct_lock) +{ + return -ENODEV; +} + /* * This must be implemented by the architecture. * It is the way the ftrace direct_ops helper, when called -- cgit v1.2.3 From 7d0452497c292153e690652e6df218fead21185f Mon Sep 17 00:00:00 2001 From: Jiri Olsa Date: Tue, 30 Dec 2025 15:50:08 +0100 Subject: bpf: Add trampoline ip hash table Following changes need to lookup trampoline based on its ip address, adding hash table for that. Signed-off-by: Jiri Olsa Signed-off-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/20251230145010.103439-8-jolsa@kernel.org --- include/linux/bpf.h | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 9272a237cced..5524f9429e76 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -1329,14 +1329,17 @@ struct bpf_tramp_image { }; struct bpf_trampoline { - /* hlist for trampoline_table */ - struct hlist_node hlist; + /* hlist for trampoline_key_table */ + struct hlist_node hlist_key; + /* hlist for trampoline_ip_table */ + struct hlist_node hlist_ip; struct ftrace_ops *fops; /* serializes access to fields of this trampoline */ struct mutex mutex; refcount_t refcnt; u32 flags; u64 key; + unsigned long ip; struct { struct btf_func_model model; void *addr; -- cgit v1.2.3 From 956747efd82aa60e0ac3e6aef2b9a17adda6f3b1 Mon Sep 17 00:00:00 2001 From: Jiri Olsa Date: Tue, 30 Dec 2025 15:50:09 +0100 Subject: ftrace: Factor ftrace_ops ops_func interface We are going to remove "ftrace_ops->private == bpf_trampoline" setup in following changes. Adding ip argument to ftrace_ops_func_t callback function, so we can use it to look up the trampoline. Signed-off-by: Jiri Olsa Signed-off-by: Andrii Nakryiko Reviewed-by: Steven Rostedt (Google) Link: https://lore.kernel.org/bpf/20251230145010.103439-9-jolsa@kernel.org --- include/linux/ftrace.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/ftrace.h b/include/linux/ftrace.h index b8461e541b9d..705db0a6d995 100644 --- a/include/linux/ftrace.h +++ b/include/linux/ftrace.h @@ -403,7 +403,7 @@ enum ftrace_ops_cmd { * Negative on failure. The return value is dependent on the * callback. */ -typedef int (*ftrace_ops_func_t)(struct ftrace_ops *op, enum ftrace_ops_cmd cmd); +typedef int (*ftrace_ops_func_t)(struct ftrace_ops *op, unsigned long ip, enum ftrace_ops_cmd cmd); #ifdef CONFIG_DYNAMIC_FTRACE -- cgit v1.2.3 From 0f0c332992b8a5d2ae7b611b94c4e02ef8d54b97 Mon Sep 17 00:00:00 2001 From: Jiri Olsa Date: Fri, 30 Jan 2026 09:12:07 +0100 Subject: bpf: Allow sleepable programs to use tail calls Allowing sleepable programs to use tail calls. Making sure we can't mix sleepable and non-sleepable bpf programs in tail call map (BPF_MAP_TYPE_PROG_ARRAY) and allowing it to be used in sleepable programs. Sleepable programs can be preempted and sleep which might bring new source of race conditions, but both direct and indirect tail calls should not be affected. Direct tail calls work by patching direct jump to callee into bpf caller program, so no problem there. We atomically switch from nop to jump instruction. Indirect tail call reads the callee from the map and then jumps to it. The callee bpf program can't disappear (be released) from the caller, because it is executed under rcu lock (rcu_read_lock_trace). Signed-off-by: Jiri Olsa Acked-by: Leon Hwang Link: https://lore.kernel.org/r/20260130081208.1130204-2-jolsa@kernel.org Signed-off-by: Alexei Starovoitov --- include/linux/bpf.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 5524f9429e76..3b0ceb759075 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -287,6 +287,7 @@ struct bpf_map_owner { enum bpf_prog_type type; bool jited; bool xdp_has_frags; + bool sleepable; u64 storage_cookie[MAX_BPF_CGROUP_STORAGE_TYPE]; const struct btf_type *attach_func_proto; enum bpf_attach_type expected_attach_type; -- cgit v1.2.3 From 8798902f2b8bcae6f90229a1a1496b48ddda2972 Mon Sep 17 00:00:00 2001 From: Leon Hwang Date: Sat, 31 Jan 2026 22:49:48 +0800 Subject: bpf: Add bpf_jit_supports_fsession() The added fsession does not prevent running on those architectures, that haven't added fsession support. For example, try to run fsession tests on arm64: test_fsession_basic:PASS:fsession_test__open_and_load 0 nsec test_fsession_basic:PASS:fsession_attach 0 nsec check_result:FAIL:test_run_opts err unexpected error: -14 (errno 14) In order to prevent such errors, add bpf_jit_supports_fsession() to guard those architectures. Fixes: 2d419c44658f ("bpf: add fsession support") Acked-by: Puranjay Mohan Tested-by: Puranjay Mohan Signed-off-by: Leon Hwang Link: https://lore.kernel.org/r/20260131144950.16294-2-leon.hwang@linux.dev Signed-off-by: Alexei Starovoitov --- include/linux/filter.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include') diff --git a/include/linux/filter.h b/include/linux/filter.h index fd54fed8f95f..4e1cb4f91f49 100644 --- a/include/linux/filter.h +++ b/include/linux/filter.h @@ -1167,6 +1167,7 @@ bool bpf_jit_supports_arena(void); bool bpf_jit_supports_insn(struct bpf_insn *insn, bool in_arena); bool bpf_jit_supports_private_stack(void); bool bpf_jit_supports_timed_may_goto(void); +bool bpf_jit_supports_fsession(void); u64 bpf_arch_uaddress_limit(void); void arch_bpf_stack_walk(bool (*consume_fn)(void *cookie, u64 ip, u64 sp, u64 bp), void *cookie); u64 arch_bpf_timed_may_goto(void); -- cgit v1.2.3 From e3aa56b3ac175bccb8fe60d652a3df2ea6a68a1e Mon Sep 17 00:00:00 2001 From: Leon Hwang Date: Sat, 31 Jan 2026 22:49:49 +0800 Subject: bpf, arm64: Add fsession support Implement fsession support in the arm64 BPF JIT trampoline. Extend the trampoline stack layout to store function metadata and session cookies, and pass the appropriate metadata to fentry and fexit programs. This mirrors the existing x86 behavior and enables session cookies on arm64. Acked-by: Puranjay Mohan Tested-by: Puranjay Mohan Signed-off-by: Leon Hwang Link: https://lore.kernel.org/r/20260131144950.16294-3-leon.hwang@linux.dev Signed-off-by: Alexei Starovoitov --- include/linux/bpf.h | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 3b0ceb759075..cd9b96434904 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -2196,13 +2196,18 @@ static inline int bpf_fsession_cnt(struct bpf_tramp_links *links) return cnt; } +static inline bool bpf_prog_calls_session_cookie(struct bpf_tramp_link *link) +{ + return link->link.prog->call_session_cookie; +} + static inline int bpf_fsession_cookie_cnt(struct bpf_tramp_links *links) { struct bpf_tramp_links fentries = links[BPF_TRAMP_FENTRY]; int cnt = 0; for (int i = 0; i < links[BPF_TRAMP_FENTRY].nr_links; i++) { - if (fentries.links[i]->link.prog->call_session_cookie) + if (bpf_prog_calls_session_cookie(fentries.links[i])) cnt++; } -- cgit v1.2.3 From b2a0aa3a87396483b468b7c81be2fddb29171d74 Mon Sep 17 00:00:00 2001 From: Puranjay Mohan Date: Tue, 3 Feb 2026 08:50:58 -0800 Subject: bpf: Clear singular ids for scalars in is_state_visited() The verifier assigns ids to scalar registers/stack slots when they are linked through a mov or stack spill/fill instruction. These ids are later used to propagate newly found bounds from one register to all registers that share the same id. The verifier also compares the ids of these registers in current state and cached state when making pruning decisions. When an ID becomes singular (i.e., only a single register or stack slot has that ID), it can no longer participate in bounds propagation. During comparisons between current and cached states for pruning decisions, however, such stale IDs can prevent pruning of otherwise equivalent states. Find and clear all singular ids before caching a state in is_state_visited(). struct bpf_idset which is currently unused has been repurposed for this use case. Acked-by: Eduard Zingerman Signed-off-by: Puranjay Mohan Link: https://lore.kernel.org/r/20260203165102.2302462-3-puranjay@kernel.org Signed-off-by: Alexei Starovoitov --- include/linux/bpf_verifier.h | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/include/linux/bpf_verifier.h b/include/linux/bpf_verifier.h index 8355b585cd18..746025df82c8 100644 --- a/include/linux/bpf_verifier.h +++ b/include/linux/bpf_verifier.h @@ -697,8 +697,11 @@ struct bpf_idmap { }; struct bpf_idset { - u32 count; - u32 ids[BPF_ID_MAP_SIZE]; + u32 num_ids; + struct { + u32 id; + u32 cnt; + } entries[BPF_ID_MAP_SIZE]; }; /* see verifier.c:compute_scc_callchain() */ -- cgit v1.2.3 From 9d21199842247ab05c675fb9b6c6ca393a5c0024 Mon Sep 17 00:00:00 2001 From: Tianci Cao Date: Wed, 4 Feb 2026 19:15:02 +0800 Subject: bpf: Add bitwise tracking for BPF_END This patch implements bitwise tracking (tnum analysis) for BPF_END (byte swap) operation. Currently, the BPF verifier does not track value for BPF_END operation, treating the result as completely unknown. This limits the verifier's ability to prove safety of programs that perform endianness conversions, which are common in networking code. For example, the following code pattern for port number validation: int test(struct pt_regs *ctx) { __u64 x = bpf_get_prandom_u32(); x &= 0x3f00; // Range: [0, 0x3f00], var_off: (0x0; 0x3f00) x = bswap16(x); // Should swap to range [0, 0x3f], var_off: (0x0; 0x3f) if (x > 0x3f) goto trap; return 0; trap: return *(u64 *)NULL; // Should be unreachable } Currently generates verifier output: 1: (54) w0 &= 16128 ; R0=scalar(smin=smin32=0,smax=umax=smax32=umax32=16128,var_off=(0x0; 0x3f00)) 2: (d7) r0 = bswap16 r0 ; R0=scalar() 3: (25) if r0 > 0x3f goto pc+2 ; R0=scalar(smin=smin32=0,smax=umax=smax32=umax32=63,var_off=(0x0; 0x3f)) Without this patch, even though the verifier knows `x` has certain bits set, after bswap16, it loses all tracking information and treats port as having a completely unknown value [0, 65535]. According to the BPF instruction set[1], there are 3 kinds of BPF_END: 1. `bswap(16|32|64)`: opcode=0xd7 (BPF_END | BPF_ALU64 | BPF_TO_LE) - do unconditional swap 2. `le(16|32|64)`: opcode=0xd4 (BPF_END | BPF_ALU | BPF_TO_LE) - on big-endian: do swap - on little-endian: truncation (16/32-bit) or no-op (64-bit) 3. `be(16|32|64)`: opcode=0xdc (BPF_END | BPF_ALU | BPF_TO_BE) - on little-endian: do swap - on big-endian: truncation (16/32-bit) or no-op (64-bit) Since BPF_END operations are inherently bit-wise permutations, tnum (bitwise tracking) offers the most efficient and precise mechanism for value analysis. By implementing `tnum_bswap16`, `tnum_bswap32`, and `tnum_bswap64`, we can derive exact `var_off` values concisely, directly reflecting the bit-level changes. Here is the overview of changes: 1. In `tnum_bswap(16|32|64)` (kernel/bpf/tnum.c): Call `swab(16|32|64)` function on the value and mask of `var_off`, and do truncation for 16/32-bit cases. 2. In `adjust_scalar_min_max_vals` (kernel/bpf/verifier.c): Call helper function `scalar_byte_swap`. - Only do byte swap when * alu64 (unconditional swap) OR * switching between big-endian and little-endian machines. - If need do byte swap: * Firstly call `tnum_bswap(16|32|64)` to update `var_off`. * Then reset the bound since byte swap scrambles the range. - For 16/32-bit cases, truncate dst register to match the swapped size. This enables better verification of networking code that frequently uses byte swaps for protocol processing, reducing false positive rejections. [1] https://www.kernel.org/doc/Documentation/bpf/standardization/instruction-set.rst Co-developed-by: Shenghao Yuan Signed-off-by: Shenghao Yuan Co-developed-by: Yazhou Tang Signed-off-by: Yazhou Tang Signed-off-by: Tianci Cao Acked-by: Eduard Zingerman Link: https://lore.kernel.org/r/20260204111503.77871-2-ziye@zju.edu.cn Signed-off-by: Alexei Starovoitov --- include/linux/tnum.h | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'include') diff --git a/include/linux/tnum.h b/include/linux/tnum.h index c52b862dad45..fa4654ffb621 100644 --- a/include/linux/tnum.h +++ b/include/linux/tnum.h @@ -63,6 +63,11 @@ struct tnum tnum_union(struct tnum t1, struct tnum t2); /* Return @a with all but the lowest @size bytes cleared */ struct tnum tnum_cast(struct tnum a, u8 size); +/* Swap the bytes of a tnum */ +struct tnum tnum_bswap16(struct tnum a); +struct tnum tnum_bswap32(struct tnum a); +struct tnum tnum_bswap64(struct tnum a); + /* Returns true if @a is a known constant */ static inline bool tnum_is_const(struct tnum a) { -- cgit v1.2.3 From 7a433e519364c3c19643e5c857f4fbfaebec441c Mon Sep 17 00:00:00 2001 From: Puranjay Mohan Date: Wed, 4 Feb 2026 07:17:37 -0800 Subject: bpf: Support negative offsets, BPF_SUB, and alu32 for linked register tracking Previously, the verifier only tracked positive constant deltas between linked registers using BPF_ADD. This limitation meant patterns like: r1 = r0; r1 += -4; if r1 s>= 0 goto l0_%=; // r1 >= 0 implies r0 >= 4 // verifier couldn't propagate bounds back to r0 if r0 != 0 goto l0_%=; r0 /= 0; // Verifier thinks this is reachable l0_%=: Similar limitation exists for 32-bit registers. With this change, the verifier can now track negative deltas in reg->off enabling bound propagation for the above pattern. For alu32, we make sure the destination register has the upper 32 bits as 0s before creating the link. BPF_ADD_CONST is split into BPF_ADD_CONST64 and BPF_ADD_CONST32, the latter is used in case of alu32 and sync_linked_regs uses this to zext the result if known_reg has this flag. Signed-off-by: Puranjay Mohan Acked-by: Eduard Zingerman Link: https://lore.kernel.org/r/20260204151741.2678118-2-puranjay@kernel.org Signed-off-by: Alexei Starovoitov --- include/linux/bpf_verifier.h | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/bpf_verifier.h b/include/linux/bpf_verifier.h index 746025df82c8..ef8e45a362d9 100644 --- a/include/linux/bpf_verifier.h +++ b/include/linux/bpf_verifier.h @@ -147,8 +147,12 @@ struct bpf_reg_state { * registers. Example: * r1 = r2; both will have r1->id == r2->id == N * r1 += 10; r1->id == N | BPF_ADD_CONST and r1->off == 10 + * r3 = r2; both will have r3->id == r2->id == N + * w3 += 10; r3->id == N | BPF_ADD_CONST32 and r3->off == 10 */ -#define BPF_ADD_CONST (1U << 31) +#define BPF_ADD_CONST64 (1U << 31) +#define BPF_ADD_CONST32 (1U << 30) +#define BPF_ADD_CONST (BPF_ADD_CONST64 | BPF_ADD_CONST32) u32 id; /* PTR_TO_SOCKET and PTR_TO_TCP_SOCK could be a ptr returned * from a pointer-cast helper, bpf_sk_fullsock() and -- cgit v1.2.3 From 0ccef7079ea8d5f7b896b14be7e400022ff240c6 Mon Sep 17 00:00:00 2001 From: Amery Hung Date: Thu, 5 Feb 2026 14:28:59 -0800 Subject: bpf: Select bpf_local_storage_map_bucket based on bpf_local_storage A later bpf_local_storage refactor will acquire all locks before performing any update. To simplified the number of locks needed to take in bpf_local_storage_map_update(), determine the bucket based on the local_storage an selem belongs to instead of the selem pointer. Currently, when a new selem needs to be created to replace the old selem in bpf_local_storage_map_update(), locks of both buckets need to be acquired to prevent racing. This can be simplified if the two selem belongs to the same bucket so that only one bucket needs to be locked. Therefore, instead of hashing selem, hashing the local_storage pointer the selem belongs. Performance wise, this is slightly better as update now requires locking one bucket. It should not change the level of contention on one bucket as the pointers to local storages of selems in a map are just as unique as pointers to selems. Acked-by: Alexei Starovoitov Signed-off-by: Amery Hung Signed-off-by: Martin KaFai Lau Link: https://patch.msgid.link/20260205222916.1788211-2-ameryhung@gmail.com --- include/linux/bpf_local_storage.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include') diff --git a/include/linux/bpf_local_storage.h b/include/linux/bpf_local_storage.h index 66432248cd81..2638487425b8 100644 --- a/include/linux/bpf_local_storage.h +++ b/include/linux/bpf_local_storage.h @@ -179,6 +179,7 @@ void bpf_selem_link_storage_nolock(struct bpf_local_storage *local_storage, void bpf_selem_unlink(struct bpf_local_storage_elem *selem, bool reuse_now); void bpf_selem_link_map(struct bpf_local_storage_map *smap, + struct bpf_local_storage *local_storage, struct bpf_local_storage_elem *selem); struct bpf_local_storage_elem * -- cgit v1.2.3 From fd103ffc57c9a3b8b76bc852ffae5eb630a6ded4 Mon Sep 17 00:00:00 2001 From: Amery Hung Date: Thu, 5 Feb 2026 14:29:01 -0800 Subject: bpf: Convert bpf_selem_link_map to failable To prepare for changing bpf_local_storage_map_bucket::lock to rqspinlock, convert bpf_selem_link_map() to failable. It still always succeeds and returns 0 until the change happens. No functional change. Acked-by: Alexei Starovoitov Signed-off-by: Amery Hung Signed-off-by: Martin KaFai Lau Link: https://patch.msgid.link/20260205222916.1788211-4-ameryhung@gmail.com --- include/linux/bpf_local_storage.h | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'include') diff --git a/include/linux/bpf_local_storage.h b/include/linux/bpf_local_storage.h index 2638487425b8..709506e982a6 100644 --- a/include/linux/bpf_local_storage.h +++ b/include/linux/bpf_local_storage.h @@ -178,9 +178,9 @@ void bpf_selem_link_storage_nolock(struct bpf_local_storage *local_storage, void bpf_selem_unlink(struct bpf_local_storage_elem *selem, bool reuse_now); -void bpf_selem_link_map(struct bpf_local_storage_map *smap, - struct bpf_local_storage *local_storage, - struct bpf_local_storage_elem *selem); +int bpf_selem_link_map(struct bpf_local_storage_map *smap, + struct bpf_local_storage *local_storage, + struct bpf_local_storage_elem *selem); struct bpf_local_storage_elem * bpf_selem_alloc(struct bpf_local_storage_map *smap, void *owner, void *value, -- cgit v1.2.3 From 403e935f915896243ff93f9a2ff44e5bb6619032 Mon Sep 17 00:00:00 2001 From: Amery Hung Date: Thu, 5 Feb 2026 14:29:02 -0800 Subject: bpf: Convert bpf_selem_unlink to failable To prepare changing both bpf_local_storage_map_bucket::lock and bpf_local_storage::lock to rqspinlock, convert bpf_selem_unlink() to failable. It still always succeeds and returns 0 until the change happens. No functional change. Open code bpf_selem_unlink_storage() in the only caller, bpf_selem_unlink(), since unlink_map and unlink_storage must be done together after all the necessary locks are acquired. For bpf_local_storage_map_free(), ignore the return from bpf_selem_unlink() for now. A later patch will allow it to unlink selems even when failing to acquire locks. Acked-by: Alexei Starovoitov Signed-off-by: Amery Hung Signed-off-by: Martin KaFai Lau Link: https://patch.msgid.link/20260205222916.1788211-5-ameryhung@gmail.com --- include/linux/bpf_local_storage.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/bpf_local_storage.h b/include/linux/bpf_local_storage.h index 709506e982a6..f74e0f7656a1 100644 --- a/include/linux/bpf_local_storage.h +++ b/include/linux/bpf_local_storage.h @@ -176,7 +176,7 @@ int bpf_local_storage_map_check_btf(const struct bpf_map *map, void bpf_selem_link_storage_nolock(struct bpf_local_storage *local_storage, struct bpf_local_storage_elem *selem); -void bpf_selem_unlink(struct bpf_local_storage_elem *selem, bool reuse_now); +int bpf_selem_unlink(struct bpf_local_storage_elem *selem, bool reuse_now); int bpf_selem_link_map(struct bpf_local_storage_map *smap, struct bpf_local_storage *local_storage, -- cgit v1.2.3 From 8dabe34b9d5b1135b084268396ed2267107e64c1 Mon Sep 17 00:00:00 2001 From: Amery Hung Date: Thu, 5 Feb 2026 14:29:03 -0800 Subject: bpf: Change local_storage->lock and b->lock to rqspinlock Change bpf_local_storage::lock and bpf_local_storage_map_bucket::lock from raw_spin_lock to rqspinlock. Finally, propagate errors from raw_res_spin_lock_irqsave() to syscall return or BPF helper return. In bpf_local_storage_destroy(), ignore return from raw_res_spin_lock_irqsave() for now. A later patch will correctly handle errors correctly in bpf_local_storage_destroy() so that it can unlink selems even when failing to acquire locks. For __bpf_local_storage_map_cache(), instead of handling the error, skip updating the cache. Acked-by: Alexei Starovoitov Signed-off-by: Amery Hung Signed-off-by: Martin KaFai Lau Link: https://patch.msgid.link/20260205222916.1788211-6-ameryhung@gmail.com --- include/linux/bpf_local_storage.h | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/include/linux/bpf_local_storage.h b/include/linux/bpf_local_storage.h index f74e0f7656a1..fa50b7afee18 100644 --- a/include/linux/bpf_local_storage.h +++ b/include/linux/bpf_local_storage.h @@ -15,12 +15,13 @@ #include #include #include +#include #define BPF_LOCAL_STORAGE_CACHE_SIZE 16 struct bpf_local_storage_map_bucket { struct hlist_head list; - raw_spinlock_t lock; + rqspinlock_t lock; }; /* Thp map is not the primary owner of a bpf_local_storage_elem. @@ -94,7 +95,7 @@ struct bpf_local_storage { * bpf_local_storage_elem. */ struct rcu_head rcu; - raw_spinlock_t lock; /* Protect adding/removing from the "list" */ + rqspinlock_t lock; /* Protect adding/removing from the "list" */ bool use_kmalloc_nolock; }; -- cgit v1.2.3 From 3417dffb58331d1e7e4f3e30ec95cc0f8114ff10 Mon Sep 17 00:00:00 2001 From: Amery Hung Date: Thu, 5 Feb 2026 14:29:06 -0800 Subject: bpf: Remove unused percpu counter from bpf_local_storage_map_free Percpu locks have been removed from cgroup and task local storage. Now that all local storage no longer use percpu variables as locks preventing recursion, there is no need to pass them to bpf_local_storage_map_free(). Remove the argument from the function. Acked-by: Alexei Starovoitov Signed-off-by: Amery Hung Signed-off-by: Martin KaFai Lau Link: https://patch.msgid.link/20260205222916.1788211-9-ameryhung@gmail.com --- include/linux/bpf_local_storage.h | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'include') diff --git a/include/linux/bpf_local_storage.h b/include/linux/bpf_local_storage.h index fa50b7afee18..fba3354988d3 100644 --- a/include/linux/bpf_local_storage.h +++ b/include/linux/bpf_local_storage.h @@ -166,8 +166,7 @@ bpf_local_storage_lookup(struct bpf_local_storage *local_storage, void bpf_local_storage_destroy(struct bpf_local_storage *local_storage); void bpf_local_storage_map_free(struct bpf_map *map, - struct bpf_local_storage_cache *cache, - int __percpu *busy_counter); + struct bpf_local_storage_cache *cache); int bpf_local_storage_map_check_btf(const struct bpf_map *map, const struct btf *btf, -- cgit v1.2.3 From c8be3da14718f1e732afcb61e8ee5b78e8d93808 Mon Sep 17 00:00:00 2001 From: Amery Hung Date: Thu, 5 Feb 2026 14:29:07 -0800 Subject: bpf: Prepare for bpf_selem_unlink_nofail() The next patch will introduce bpf_selem_unlink_nofail() to handle rqspinlock errors. bpf_selem_unlink_nofail() will allow an selem to be partially unlinked from map or local storage. Save memory allocation method in selem so that later an selem can be correctly freed even when SDATA(selem)->smap is init to NULL. In addition, keep track of memory charge to the owner in local storage so that later bpf_selem_unlink_nofail() can return the correct memory charge to the owner. Updating local_storage->mem_charge is protected by local_storage->lock. Finally, extract miscellaneous tasks performed when unlinking an selem from local_storage into bpf_selem_unlink_storage_nolock_misc(). It will be reused by bpf_selem_unlink_nofail(). This patch also takes the chance to remove local_storage->smap, which is no longer used since commit f484f4a3e058 ("bpf: Replace bpf memory allocator with kmalloc_nolock() in local storage"). Acked-by: Alexei Starovoitov Signed-off-by: Amery Hung Signed-off-by: Martin KaFai Lau Link: https://patch.msgid.link/20260205222916.1788211-10-ameryhung@gmail.com --- include/linux/bpf_local_storage.h | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/include/linux/bpf_local_storage.h b/include/linux/bpf_local_storage.h index fba3354988d3..a34ed7fa81d8 100644 --- a/include/linux/bpf_local_storage.h +++ b/include/linux/bpf_local_storage.h @@ -80,7 +80,8 @@ struct bpf_local_storage_elem { * after raw_spin_unlock */ }; - /* 8 bytes hole */ + bool use_kmalloc_nolock; + /* 7 bytes hole */ /* The data is stored in another cacheline to minimize * the number of cachelines access during a cache hit. */ @@ -89,13 +90,13 @@ struct bpf_local_storage_elem { struct bpf_local_storage { struct bpf_local_storage_data __rcu *cache[BPF_LOCAL_STORAGE_CACHE_SIZE]; - struct bpf_local_storage_map __rcu *smap; struct hlist_head list; /* List of bpf_local_storage_elem */ void *owner; /* The object that owns the above "list" of * bpf_local_storage_elem. */ struct rcu_head rcu; rqspinlock_t lock; /* Protect adding/removing from the "list" */ + u64 mem_charge; /* Copy of mem charged to owner. Protected by "lock" */ bool use_kmalloc_nolock; }; -- cgit v1.2.3 From 5d800f87d0a5ea1b156c47a4b9fd128479335153 Mon Sep 17 00:00:00 2001 From: Amery Hung Date: Thu, 5 Feb 2026 14:29:08 -0800 Subject: bpf: Support lockless unlink when freeing map or local storage Introduce bpf_selem_unlink_nofail() to properly handle errors returned from rqspinlock in bpf_local_storage_map_free() and bpf_local_storage_destroy() where the operation must succeeds. The idea of bpf_selem_unlink_nofail() is to allow an selem to be partially linked and use atomic operation on a bit field, selem->state, to determine when and who can free the selem if any unlink under lock fails. An selem initially is fully linked to a map and a local storage. Under normal circumstances, bpf_selem_unlink_nofail() will be able to grab locks and unlink a selem from map and local storage in sequeunce, just like bpf_selem_unlink(), and then free it after an RCU grace period. However, if any of the lock attempts fails, it will only clear SDATA(selem)->smap or selem->local_storage depending on the caller and set SELEM_MAP_UNLINKED or SELEM_STORAGE_UNLINKED according to the caller. Then, after both map_free() and destroy() see the selem and the state becomes SELEM_UNLINKED, one of two racing caller can succeed in cmpxchg the state from SELEM_UNLINKED to SELEM_TOFREE, ensuring no double free or memory leak. To make sure bpf_obj_free_fields() is done only once and when map is still present, it is called when unlinking an selem from b->list under b->lock. To make sure uncharging memory is done only when the owner is still present in map_free(), block destroy() from returning until there is no pending map_free(). Since smap may not be valid in destroy(), bpf_selem_unlink_nofail() skips bpf_selem_unlink_storage_nolock_misc() when called from destroy(). This is okay as bpf_local_storage_destroy() will return the remaining amount of memory charge tracked by mem_charge to the owner to uncharge. It is also safe to skip clearing local_storage->owner and owner_storage as the owner is being freed and no users or bpf programs should be able to reference the owner and using local_storage. Finally, access of selem, SDATA(selem)->smap and selem->local_storage are racy. Callers will protect these fields with RCU. Acked-by: Alexei Starovoitov Co-developed-by: Martin KaFai Lau Signed-off-by: Amery Hung Signed-off-by: Martin KaFai Lau Link: https://patch.msgid.link/20260205222916.1788211-11-ameryhung@gmail.com --- include/linux/bpf_local_storage.h | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/bpf_local_storage.h b/include/linux/bpf_local_storage.h index a34ed7fa81d8..69a5d8aa765d 100644 --- a/include/linux/bpf_local_storage.h +++ b/include/linux/bpf_local_storage.h @@ -68,6 +68,11 @@ struct bpf_local_storage_data { u8 data[] __aligned(8); }; +#define SELEM_MAP_UNLINKED (1 << 0) +#define SELEM_STORAGE_UNLINKED (1 << 1) +#define SELEM_UNLINKED (SELEM_MAP_UNLINKED | SELEM_STORAGE_UNLINKED) +#define SELEM_TOFREE (1 << 2) + /* Linked to bpf_local_storage and bpf_local_storage_map */ struct bpf_local_storage_elem { struct hlist_node map_node; /* Linked to bpf_local_storage_map */ @@ -80,8 +85,9 @@ struct bpf_local_storage_elem { * after raw_spin_unlock */ }; + atomic_t state; bool use_kmalloc_nolock; - /* 7 bytes hole */ + /* 3 bytes hole */ /* The data is stored in another cacheline to minimize * the number of cachelines access during a cache hit. */ @@ -97,6 +103,7 @@ struct bpf_local_storage { struct rcu_head rcu; rqspinlock_t lock; /* Protect adding/removing from the "list" */ u64 mem_charge; /* Copy of mem charged to owner. Protected by "lock" */ + refcount_t owner_refcnt;/* Used to pin owner when map_free is uncharging */ bool use_kmalloc_nolock; }; -- cgit v1.2.3 From 0be08389c7f2f9db0194ee666d2e4870886e6ffb Mon Sep 17 00:00:00 2001 From: Amery Hung Date: Thu, 5 Feb 2026 14:29:09 -0800 Subject: bpf: Switch to bpf_selem_unlink_nofail in bpf_local_storage_{map_free, destroy} Take care of rqspinlock error in bpf_local_storage_{map_free, destroy}() properly by switching to bpf_selem_unlink_nofail(). Both functions iterate their own RCU-protected list of selems and call bpf_selem_unlink_nofail(). In map_free(), to prevent infinite loop when both map_free() and destroy() fail to remove a selem from b->list (extremely unlikely), switch to hlist_for_each_entry_rcu(). In destroy(), also switch to hlist_for_each_entry_rcu() since we no longer iterate local_storage->list under local_storage->lock. bpf_selem_unlink() now becomes dedicated to helpers and syscalls paths so reuse_now should always be false. Remove it from the argument and hardcode it. Acked-by: Alexei Starovoitov Co-developed-by: Martin KaFai Lau Signed-off-by: Amery Hung Signed-off-by: Martin KaFai Lau Link: https://patch.msgid.link/20260205222916.1788211-12-ameryhung@gmail.com --- include/linux/bpf_local_storage.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/include/linux/bpf_local_storage.h b/include/linux/bpf_local_storage.h index 69a5d8aa765d..85efa9772530 100644 --- a/include/linux/bpf_local_storage.h +++ b/include/linux/bpf_local_storage.h @@ -171,7 +171,7 @@ bpf_local_storage_lookup(struct bpf_local_storage *local_storage, return SDATA(selem); } -void bpf_local_storage_destroy(struct bpf_local_storage *local_storage); +u32 bpf_local_storage_destroy(struct bpf_local_storage *local_storage); void bpf_local_storage_map_free(struct bpf_map *map, struct bpf_local_storage_cache *cache); @@ -184,7 +184,7 @@ int bpf_local_storage_map_check_btf(const struct bpf_map *map, void bpf_selem_link_storage_nolock(struct bpf_local_storage *local_storage, struct bpf_local_storage_elem *selem); -int bpf_selem_unlink(struct bpf_local_storage_elem *selem, bool reuse_now); +int bpf_selem_unlink(struct bpf_local_storage_elem *selem); int bpf_selem_link_map(struct bpf_local_storage_map *smap, struct bpf_local_storage *local_storage, -- cgit v1.2.3