From a3125bc01884431d30d731461634c8295b6f0529 Mon Sep 17 00:00:00 2001 From: Yazhou Tang Date: Wed, 4 Mar 2026 16:32:27 +0800 Subject: bpf: Reset register ID for BPF_END value tracking When a register undergoes a BPF_END (byte swap) operation, its scalar value is mutated in-place. If this register previously shared a scalar ID with another register (e.g., after an `r1 = r0` assignment), this tie must be broken. Currently, the verifier misses resetting `dst_reg->id` to 0 for BPF_END. Consequently, if a conditional jump checks the swapped register, the verifier incorrectly propagates the learned bounds to the linked register, leading to false confidence in the linked register's value and potentially allowing out-of-bounds memory accesses. Fix this by explicitly resetting `dst_reg->id` to 0 in the BPF_END case to break the scalar tie, similar to how BPF_NEG handles it via `__mark_reg_known`. Fixes: 9d2119984224 ("bpf: Add bitwise tracking for BPF_END") Closes: https://lore.kernel.org/bpf/AMBPR06MB108683CFEB1CB8D9E02FC95ECF17EA@AMBPR06MB10868.eurprd06.prod.outlook.com/ Link: https://lore.kernel.org/bpf/4be25f7442a52244d0dd1abb47bc6750e57984c9.camel@gmail.com/ Reported-by: Guillaume Laporte Co-developed-by: Tianci Cao Signed-off-by: Tianci Cao Co-developed-by: Shenghao Yuan Signed-off-by: Shenghao Yuan Signed-off-by: Yazhou Tang Acked-by: Eduard Zingerman Link: https://lore.kernel.org/r/20260304083228.142016-2-tangyazhou@zju.edu.cn Signed-off-by: Alexei Starovoitov --- kernel/bpf/verifier.c | 7 +++++++ 1 file changed, 7 insertions(+) (limited to 'kernel') diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 159b25f8269d..df22bfc572e2 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -15910,6 +15910,13 @@ static void scalar_byte_swap(struct bpf_reg_state *dst_reg, struct bpf_insn *ins /* Apply bswap if alu64 or switch between big-endian and little-endian machines */ bool need_bswap = alu64 || (to_le == is_big_endian); + /* + * If the register is mutated, manually reset its scalar ID to break + * any existing ties and avoid incorrect bounds propagation. + */ + if (need_bswap || insn->imm == 16 || insn->imm == 32) + dst_reg->id = 0; + if (need_bswap) { if (insn->imm == 16) dst_reg->var_off = tnum_bswap16(dst_reg->var_off); -- cgit v1.2.3 From 2321a9596d2260310267622e0ad8fbfa6f95378f Mon Sep 17 00:00:00 2001 From: Sachin Kumar Date: Mon, 9 Mar 2026 18:25:42 +0000 Subject: bpf: Fix constant blinding for PROBE_MEM32 stores BPF_ST | BPF_PROBE_MEM32 immediate stores are not handled by bpf_jit_blind_insn(), allowing user-controlled 32-bit immediates to survive unblinded into JIT-compiled native code when bpf_jit_harden >= 1. The root cause is that convert_ctx_accesses() rewrites BPF_ST|BPF_MEM to BPF_ST|BPF_PROBE_MEM32 for arena pointer stores during verification, before bpf_jit_blind_constants() runs during JIT compilation. The blinding switch only matches BPF_ST|BPF_MEM (mode 0x60), not BPF_ST|BPF_PROBE_MEM32 (mode 0xa0). The instruction falls through unblinded. Add BPF_ST|BPF_PROBE_MEM32 cases to bpf_jit_blind_insn() alongside the existing BPF_ST|BPF_MEM cases. The blinding transformation is identical: load the blinded immediate into BPF_REG_AX via mov+xor, then convert the immediate store to a register store (BPF_STX). The rewritten STX instruction must preserve the BPF_PROBE_MEM32 mode so the architecture JIT emits the correct arena addressing (R12-based on x86-64). Cannot use the BPF_STX_MEM() macro here because it hardcodes BPF_MEM mode; construct the instruction directly instead. Fixes: 6082b6c328b5 ("bpf: Recognize addr_space_cast instruction in the verifier.") Reviewed-by: Puranjay Mohan Reviewed-by: Emil Tsalapatis Signed-off-by: Sachin Kumar Acked-by: Daniel Borkmann Link: https://lore.kernel.org/r/Y6IT5VvNRchPBLI5D7JZHBzZrU9rb0ycRJPJzJSXGj7kJlX8RJwZFSM2YZjcDxoQKABkxt1T8Os2gi23PYyFuQe6KkZGWVyfz8K5afdy9ak=@protonmail.com Signed-off-by: Alexei Starovoitov --- kernel/bpf/core.c | 21 +++++++++++++++++++++ 1 file changed, 21 insertions(+) (limited to 'kernel') diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c index 3ece2da55625..9e126be33755 100644 --- a/kernel/bpf/core.c +++ b/kernel/bpf/core.c @@ -1422,6 +1422,27 @@ static int bpf_jit_blind_insn(const struct bpf_insn *from, *to++ = BPF_ALU64_IMM(BPF_XOR, BPF_REG_AX, imm_rnd); *to++ = BPF_STX_MEM(from->code, from->dst_reg, BPF_REG_AX, from->off); break; + + case BPF_ST | BPF_PROBE_MEM32 | BPF_DW: + case BPF_ST | BPF_PROBE_MEM32 | BPF_W: + case BPF_ST | BPF_PROBE_MEM32 | BPF_H: + case BPF_ST | BPF_PROBE_MEM32 | BPF_B: + *to++ = BPF_ALU64_IMM(BPF_MOV, BPF_REG_AX, imm_rnd ^ + from->imm); + *to++ = BPF_ALU64_IMM(BPF_XOR, BPF_REG_AX, imm_rnd); + /* + * Cannot use BPF_STX_MEM() macro here as it + * hardcodes BPF_MEM mode, losing PROBE_MEM32 + * and breaking arena addressing in the JIT. + */ + *to++ = (struct bpf_insn) { + .code = BPF_STX | BPF_PROBE_MEM32 | + BPF_SIZE(from->code), + .dst_reg = from->dst_reg, + .src_reg = BPF_REG_AX, + .off = from->off, + }; + break; } out: return to - to_buff; -- cgit v1.2.3 From 4b9ce671960627b2505b3f64742544ae9801df97 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Mon, 9 Mar 2026 13:55:46 +0100 Subject: perf: Make sure to use pmu_ctx->pmu for groups Oliver reported that x86_pmu_del() ended up doing an out-of-bound memory access when group_sched_in() fails and needs to roll back. This *should* be handled by the transaction callbacks, but he found that when the group leader is a software event, the transaction handlers of the wrong PMU are used. Despite the move_group case in perf_event_open() and group_sched_in() using pmu_ctx->pmu. Turns out, inherit uses event->pmu to clone the events, effectively undoing the move_group case for all inherited contexts. Fix this by also making inherit use pmu_ctx->pmu, ensuring all inherited counters end up in the same pmu context. Similarly, __perf_event_read() should use equally use pmu_ctx->pmu for the group case. Fixes: bd2756811766 ("perf: Rewrite core context handling") Reported-by: Oliver Rosenberg Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Ian Rogers Link: https://patch.msgid.link/20260309133713.GB606826@noisy.programming.kicks-ass.net --- kernel/events/core.c | 19 ++++++++----------- 1 file changed, 8 insertions(+), 11 deletions(-) (limited to 'kernel') diff --git a/kernel/events/core.c b/kernel/events/core.c index 1f5699b339ec..89b40e439717 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -4813,7 +4813,7 @@ static void __perf_event_read(void *info) struct perf_event *sub, *event = data->event; struct perf_event_context *ctx = event->ctx; struct perf_cpu_context *cpuctx = this_cpu_ptr(&perf_cpu_context); - struct pmu *pmu = event->pmu; + struct pmu *pmu; /* * If this is a task context, we need to check whether it is @@ -4825,7 +4825,7 @@ static void __perf_event_read(void *info) if (ctx->task && cpuctx->task_ctx != ctx) return; - raw_spin_lock(&ctx->lock); + guard(raw_spinlock)(&ctx->lock); ctx_time_update_event(ctx, event); perf_event_update_time(event); @@ -4833,25 +4833,22 @@ static void __perf_event_read(void *info) perf_event_update_sibling_time(event); if (event->state != PERF_EVENT_STATE_ACTIVE) - goto unlock; + return; if (!data->group) { - pmu->read(event); + perf_pmu_read(event); data->ret = 0; - goto unlock; + return; } + pmu = event->pmu_ctx->pmu; pmu->start_txn(pmu, PERF_PMU_TXN_READ); - pmu->read(event); - + perf_pmu_read(event); for_each_sibling_event(sub, event) perf_pmu_read(sub); data->ret = pmu->commit_txn(pmu); - -unlock: - raw_spin_unlock(&ctx->lock); } static inline u64 perf_event_count(struct perf_event *event, bool self) @@ -14744,7 +14741,7 @@ inherit_event(struct perf_event *parent_event, get_ctx(child_ctx); child_event->ctx = child_ctx; - pmu_ctx = find_get_pmu_context(child_event->pmu, child_ctx, child_event); + pmu_ctx = find_get_pmu_context(parent_event->pmu_ctx->pmu, child_ctx, child_event); if (IS_ERR(pmu_ctx)) { free_event(child_event); return ERR_CAST(pmu_ctx); -- cgit v1.2.3 From 6f770b73d0311a5b099277653199bb6421c4fed2 Mon Sep 17 00:00:00 2001 From: Shigeru Yoshida Date: Sun, 15 Mar 2026 17:27:49 +0900 Subject: dma: swiotlb: add KMSAN annotations to swiotlb_bounce() When a device performs DMA to a bounce buffer, KMSAN is unaware of the write and does not mark the data as initialized. When swiotlb_bounce() later copies the bounce buffer back to the original buffer, memcpy propagates the uninitialized shadow to the original buffer, causing false positive uninit-value reports. Fix this by calling kmsan_unpoison_memory() on the bounce buffer before copying it back in the DMA_FROM_DEVICE path, so that memcpy naturally propagates initialized shadow to the destination. Suggested-by: Alexander Potapenko Link: https://lore.kernel.org/CAG_fn=WUGta-paG1BgsGRoAR+fmuCgh3xo=R3XdzOt_-DqSdHw@mail.gmail.com/ Fixes: 7ade4f10779c ("dma: kmsan: unpoison DMA mappings") Signed-off-by: Shigeru Yoshida Signed-off-by: Marek Szyprowski Link: https://lore.kernel.org/r/20260315082750.2375581-1-syoshida@redhat.com --- kernel/dma/swiotlb.c | 21 +++++++++++++++++++-- 1 file changed, 19 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/dma/swiotlb.c b/kernel/dma/swiotlb.c index d8e6f1d889d5..9fd73700ddcf 100644 --- a/kernel/dma/swiotlb.c +++ b/kernel/dma/swiotlb.c @@ -30,6 +30,7 @@ #include #include #include +#include #include #include #include @@ -901,10 +902,19 @@ static void swiotlb_bounce(struct device *dev, phys_addr_t tlb_addr, size_t size local_irq_save(flags); page = pfn_to_page(pfn); - if (dir == DMA_TO_DEVICE) + if (dir == DMA_TO_DEVICE) { + /* + * Ideally, kmsan_check_highmem_page() + * could be used here to detect infoleaks, + * but callers may map uninitialized buffers + * that will be written by the device, + * causing false positives. + */ memcpy_from_page(vaddr, page, offset, sz); - else + } else { + kmsan_unpoison_memory(vaddr, sz); memcpy_to_page(page, offset, vaddr, sz); + } local_irq_restore(flags); size -= sz; @@ -913,8 +923,15 @@ static void swiotlb_bounce(struct device *dev, phys_addr_t tlb_addr, size_t size offset = 0; } } else if (dir == DMA_TO_DEVICE) { + /* + * Ideally, kmsan_check_memory() could be used here to detect + * infoleaks (uninitialized data being sent to device), but + * callers may map uninitialized buffers that will be written + * by the device, causing false positives. + */ memcpy(vaddr, phys_to_virt(orig_addr), size); } else { + kmsan_unpoison_memory(vaddr, size); memcpy(phys_to_virt(orig_addr), vaddr, size); } } -- cgit v1.2.3 From 146bd2a87a65aa407bb17fac70d8d583d19aba06 Mon Sep 17 00:00:00 2001 From: Kumar Kartikeya Dwivedi Date: Thu, 12 Mar 2026 13:53:07 -0700 Subject: bpf: Release module BTF IDR before module unload Gregory reported in [0] that the global_map_resize test when run in repeatedly ends up failing during program load. This stems from the fact that BTF reference has not dropped to zero after the previous run's module is unloaded, and the older module's BTF is still discoverable and visible. Later, in libbpf, load_module_btfs() will find the ID for this stale BTF, open its fd, and then it will be used during program load where later steps taking module reference using btf_try_get_module() fail since the underlying module for the BTF is gone. Logically, once a module is unloaded, it's associated BTF artifacts should become hidden. The BTF object inside the kernel may still remain alive as long its reference counts are alive, but it should no longer be discoverable. To fix this, let us call btf_free_id() from the MODULE_STATE_GOING case for the module unload to free the BTF associated IDR entry, and disable its discovery once module unload returns to user space. If a race happens during unload, the outcome is non-deterministic anyway. However, user space should be able to rely on the guarantee that once it has synchronously established a successful module unload, no more stale artifacts associated with this module can be obtained subsequently. Note that we must be careful to not invoke btf_free_id() in btf_put() when btf_is_module() is true now. There could be a window where the module unload drops a non-terminal reference, frees the IDR, but the same ID gets reused and the second unconditional btf_free_id() ends up releasing an unrelated entry. To avoid a special case for btf_is_module() case, set btf->id to zero to make btf_free_id() idempotent, such that we can unconditionally invoke it from btf_put(), and also from the MODULE_STATE_GOING case. Since zero is an invalid IDR, the idr_remove() should be a noop. Note that we can be sure that by the time we reach final btf_put() for btf_is_module() case, the btf_free_id() is already done, since the module itself holds the BTF reference, and it will call this function for the BTF before dropping its own reference. [0]: https://lore.kernel.org/bpf/cover.1773170190.git.grbell@redhat.com Fixes: 36e68442d1af ("bpf: Load and verify kernel module BTFs") Acked-by: Martin KaFai Lau Suggested-by: Martin KaFai Lau Reported-by: Gregory Bell Reviewed-by: Emil Tsalapatis Signed-off-by: Kumar Kartikeya Dwivedi Link: https://lore.kernel.org/r/20260312205307.1346991-1-memxor@gmail.com Signed-off-by: Alexei Starovoitov --- kernel/bpf/btf.c | 24 ++++++++++++++++++++---- 1 file changed, 20 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/btf.c b/kernel/bpf/btf.c index 4872d2a6c42d..71f9143fe90f 100644 --- a/kernel/bpf/btf.c +++ b/kernel/bpf/btf.c @@ -1787,7 +1787,16 @@ static void btf_free_id(struct btf *btf) * of the _bh() version. */ spin_lock_irqsave(&btf_idr_lock, flags); - idr_remove(&btf_idr, btf->id); + if (btf->id) { + idr_remove(&btf_idr, btf->id); + /* + * Clear the id here to make this function idempotent, since it will get + * called a couple of times for module BTFs: on module unload, and then + * the final btf_put(). btf_alloc_id() starts IDs with 1, so we can use + * 0 as sentinel value. + */ + WRITE_ONCE(btf->id, 0); + } spin_unlock_irqrestore(&btf_idr_lock, flags); } @@ -8115,7 +8124,7 @@ static void bpf_btf_show_fdinfo(struct seq_file *m, struct file *filp) { const struct btf *btf = filp->private_data; - seq_printf(m, "btf_id:\t%u\n", btf->id); + seq_printf(m, "btf_id:\t%u\n", READ_ONCE(btf->id)); } #endif @@ -8197,7 +8206,7 @@ int btf_get_info_by_fd(const struct btf *btf, if (copy_from_user(&info, uinfo, info_copy)) return -EFAULT; - info.id = btf->id; + info.id = READ_ONCE(btf->id); ubtf = u64_to_user_ptr(info.btf); btf_copy = min_t(u32, btf->data_size, info.btf_size); if (copy_to_user(ubtf, btf->data, btf_copy)) @@ -8260,7 +8269,7 @@ int btf_get_fd_by_id(u32 id) u32 btf_obj_id(const struct btf *btf) { - return btf->id; + return READ_ONCE(btf->id); } bool btf_is_kernel(const struct btf *btf) @@ -8382,6 +8391,13 @@ static int btf_module_notify(struct notifier_block *nb, unsigned long op, if (btf_mod->module != module) continue; + /* + * For modules, we do the freeing of BTF IDR as soon as + * module goes away to disable BTF discovery, since the + * btf_try_get_module() on such BTFs will fail. This may + * be called again on btf_put(), but it's ok to do so. + */ + btf_free_id(btf_mod->btf); list_del(&btf_mod->list); if (btf_mod->sysfs_attr) sysfs_remove_bin_file(btf_kobj, btf_mod->sysfs_attr); -- cgit v1.2.3 From eca58535b154e6951327319afda94ac80eae7dc3 Mon Sep 17 00:00:00 2001 From: Leon Romanovsky Date: Mon, 16 Mar 2026 21:06:45 +0200 Subject: dma-debug: Allow multiple invocations of overlapping entries Repeated DMA mappings with DMA_ATTR_CPU_CACHE_CLEAN trigger the following splat. This prevents using the attribute in cases where a DMA region is shared and reused more than seven times. ------------[ cut here ]------------ DMA-API: exceeded 7 overlapping mappings of cacheline 0x000000000438c440 WARNING: kernel/dma/debug.c:467 at add_dma_entry+0x219/0x280, CPU#4: ibv_rc_pingpong/1644 Modules linked in: xt_conntrack xt_MASQUERADE nf_conntrack_netlink nfnetlink iptable_nat nf_nat xt_addrtype br_netfilter rpcsec_gss_krb5 auth_rpcgss oid_registry overlay mlx5_fwctl zram zsmalloc mlx5_ib fuse rpcrdma rdma_ucm ib_uverbs ib_iser libiscsi scsi_transport_iscsi ib_umad rdma_cm ib_ipoib iw_cm ib_cm mlx5_core ib_core CPU: 4 UID: 2733 PID: 1644 Comm: ibv_rc_pingpong Not tainted 6.19.0+ #129 PREEMPT Hardware name: QEMU Standard PC (Q35 + ICH9, 2009), BIOS rel-1.13.0-0-gf21b5a4aeb02-prebuilt.qemu.org 04/01/2014 RIP: 0010:add_dma_entry+0x221/0x280 Code: c0 0f 84 f2 fe ff ff 83 e8 01 89 05 6d 99 11 01 e9 e4 fe ff ff 0f 8e 1f ff ff ff 48 8d 3d 07 ef 2d 01 be 07 00 00 00 48 89 e2 <67> 48 0f b9 3a e9 06 ff ff ff 48 c7 c7 98 05 2b 82 c6 05 72 92 28 RSP: 0018:ff1100010e657970 EFLAGS: 00010002 RAX: 0000000000000007 RBX: ff1100010234eb00 RCX: 0000000000000000 RDX: ff1100010e657970 RSI: 0000000000000007 RDI: ffffffff82678660 RBP: 000000000438c440 R08: 0000000000000228 R09: 0000000000000000 R10: 00000000000001be R11: 000000000000089d R12: 0000000000000800 R13: 00000000ffffffef R14: 0000000000000202 R15: ff1100010234eb00 FS: 00007fb15f3f6740(0000) GS:ff110008dcc19000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 CR2: 00007fb15f32d3a0 CR3: 0000000116f59001 CR4: 0000000000373eb0 Call Trace: debug_dma_map_sg+0x1b4/0x390 __dma_map_sg_attrs+0x6d/0x1a0 dma_map_sgtable+0x19/0x30 ib_umem_get+0x284/0x3b0 [ib_uverbs] mlx5_ib_reg_user_mr+0x68/0x2a0 [mlx5_ib] ib_uverbs_reg_mr+0x17f/0x2a0 [ib_uverbs] ib_uverbs_handler_UVERBS_METHOD_INVOKE_WRITE+0xc2/0x130 [ib_uverbs] ib_uverbs_cmd_verbs+0xa0b/0xae0 [ib_uverbs] ? ib_uverbs_handler_UVERBS_METHOD_QUERY_PORT_SPEED+0xe0/0xe0 [ib_uverbs] ? mmap_region+0x7a/0xb0 ? do_mmap+0x3b8/0x5c0 ib_uverbs_ioctl+0xa7/0x110 [ib_uverbs] __x64_sys_ioctl+0x14f/0x8b0 ? ksys_mmap_pgoff+0xc5/0x190 do_syscall_64+0x8c/0xbf0 entry_SYSCALL_64_after_hwframe+0x4b/0x53 RIP: 0033:0x7fb15f5e4eed Code: 04 25 28 00 00 00 48 89 45 c8 31 c0 48 8d 45 10 c7 45 b0 10 00 00 00 48 89 45 b8 48 8d 45 d0 48 89 45 c0 b8 10 00 00 00 0f 05 <89> c2 3d 00 f0 ff ff 77 1a 48 8b 45 c8 64 48 2b 04 25 28 00 00 00 RSP: 002b:00007ffe09a5c540 EFLAGS: 00000246 ORIG_RAX: 0000000000000010 RAX: ffffffffffffffda RBX: 00007ffe09a5c5d0 RCX: 00007fb15f5e4eed RDX: 00007ffe09a5c5f0 RSI: 00000000c0181b01 RDI: 0000000000000003 RBP: 00007ffe09a5c590 R08: 0000000000000028 R09: 00007ffe09a5c794 R10: 0000000000000001 R11: 0000000000000246 R12: 00007ffe09a5c794 R13: 000000000000000c R14: 0000000025a49170 R15: 000000000000000c ---[ end trace 0000000000000000 ]--- Fixes: 61868dc55a11 ("dma-mapping: add DMA_ATTR_CPU_CACHE_CLEAN") Signed-off-by: Leon Romanovsky Signed-off-by: Marek Szyprowski Link: https://lore.kernel.org/r/20260316-dma-debug-overlap-v3-1-1dde90a7f08b@nvidia.com --- kernel/dma/debug.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/dma/debug.c b/kernel/dma/debug.c index 86f87e43438c..be207be74996 100644 --- a/kernel/dma/debug.c +++ b/kernel/dma/debug.c @@ -453,7 +453,7 @@ static int active_cacheline_set_overlap(phys_addr_t cln, int overlap) return overlap; } -static void active_cacheline_inc_overlap(phys_addr_t cln) +static void active_cacheline_inc_overlap(phys_addr_t cln, bool is_cache_clean) { int overlap = active_cacheline_read_overlap(cln); @@ -462,7 +462,7 @@ static void active_cacheline_inc_overlap(phys_addr_t cln) /* If we overflowed the overlap counter then we're potentially * leaking dma-mappings. */ - WARN_ONCE(overlap > ACTIVE_CACHELINE_MAX_OVERLAP, + WARN_ONCE(!is_cache_clean && overlap > ACTIVE_CACHELINE_MAX_OVERLAP, pr_fmt("exceeded %d overlapping mappings of cacheline %pa\n"), ACTIVE_CACHELINE_MAX_OVERLAP, &cln); } @@ -495,7 +495,7 @@ static int active_cacheline_insert(struct dma_debug_entry *entry, if (rc == -EEXIST) { struct dma_debug_entry *existing; - active_cacheline_inc_overlap(cln); + active_cacheline_inc_overlap(cln, entry->is_cache_clean); existing = radix_tree_lookup(&dma_active_cacheline, cln); /* A lookup failure here after we got -EEXIST is unexpected. */ WARN_ON(!existing); -- cgit v1.2.3 From 9bb0a4d6a4433b75274204b083dac8e515d2007d Mon Sep 17 00:00:00 2001 From: Leon Romanovsky Date: Mon, 16 Mar 2026 21:06:47 +0200 Subject: dma-mapping: Clarify valid conditions for CPU cache line overlap Rename the DMA_ATTR_CPU_CACHE_CLEAN attribute to better reflect that it is debugging aid to inform DMA core code that CPU cache line overlaps are allowed, and refine the documentation describing its use. Signed-off-by: Leon Romanovsky Signed-off-by: Marek Szyprowski Link: https://lore.kernel.org/r/20260316-dma-debug-overlap-v3-3-1dde90a7f08b@nvidia.com --- kernel/dma/debug.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/dma/debug.c b/kernel/dma/debug.c index be207be74996..83e1cfe05f08 100644 --- a/kernel/dma/debug.c +++ b/kernel/dma/debug.c @@ -601,7 +601,7 @@ static void add_dma_entry(struct dma_debug_entry *entry, unsigned long attrs) unsigned long flags; int rc; - entry->is_cache_clean = !!(attrs & DMA_ATTR_CPU_CACHE_CLEAN); + entry->is_cache_clean = attrs & DMA_ATTR_DEBUGGING_IGNORE_CACHELINES; bucket = get_hash_bucket(entry, &flags); hash_bucket_add(bucket, entry); -- cgit v1.2.3 From e6a58fa2556203a7f6731b4071705dc81cca5ca5 Mon Sep 17 00:00:00 2001 From: Leon Romanovsky Date: Mon, 16 Mar 2026 21:06:48 +0200 Subject: dma-mapping: Introduce DMA require coherency attribute The mapping buffers which carry this attribute require DMA coherent system. This means that they can't take SWIOTLB path, can perform CPU cache overlap and doesn't perform cache flushing. Signed-off-by: Leon Romanovsky Signed-off-by: Marek Szyprowski Link: https://lore.kernel.org/r/20260316-dma-debug-overlap-v3-4-1dde90a7f08b@nvidia.com --- kernel/dma/debug.c | 3 ++- kernel/dma/mapping.c | 6 ++++++ 2 files changed, 8 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/dma/debug.c b/kernel/dma/debug.c index 83e1cfe05f08..0677918f06a8 100644 --- a/kernel/dma/debug.c +++ b/kernel/dma/debug.c @@ -601,7 +601,8 @@ static void add_dma_entry(struct dma_debug_entry *entry, unsigned long attrs) unsigned long flags; int rc; - entry->is_cache_clean = attrs & DMA_ATTR_DEBUGGING_IGNORE_CACHELINES; + entry->is_cache_clean = attrs & (DMA_ATTR_DEBUGGING_IGNORE_CACHELINES | + DMA_ATTR_REQUIRE_COHERENT); bucket = get_hash_bucket(entry, &flags); hash_bucket_add(bucket, entry); diff --git a/kernel/dma/mapping.c b/kernel/dma/mapping.c index 3928a509c44c..6d3dd0bd3a88 100644 --- a/kernel/dma/mapping.c +++ b/kernel/dma/mapping.c @@ -164,6 +164,9 @@ dma_addr_t dma_map_phys(struct device *dev, phys_addr_t phys, size_t size, if (WARN_ON_ONCE(!dev->dma_mask)) return DMA_MAPPING_ERROR; + if (!dev_is_dma_coherent(dev) && (attrs & DMA_ATTR_REQUIRE_COHERENT)) + return DMA_MAPPING_ERROR; + if (dma_map_direct(dev, ops) || (!is_mmio && arch_dma_map_phys_direct(dev, phys + size))) addr = dma_direct_map_phys(dev, phys, size, dir, attrs); @@ -235,6 +238,9 @@ static int __dma_map_sg_attrs(struct device *dev, struct scatterlist *sg, BUG_ON(!valid_dma_direction(dir)); + if (!dev_is_dma_coherent(dev) && (attrs & DMA_ATTR_REQUIRE_COHERENT)) + return -EOPNOTSUPP; + if (WARN_ON_ONCE(!dev->dma_mask)) return 0; -- cgit v1.2.3 From 2536617f20ddc7c2f4cef59b549aa45d166b03b1 Mon Sep 17 00:00:00 2001 From: Leon Romanovsky Date: Mon, 16 Mar 2026 21:06:49 +0200 Subject: dma-direct: prevent SWIOTLB path when DMA_ATTR_REQUIRE_COHERENT is set DMA_ATTR_REQUIRE_COHERENT indicates that SWIOTLB must not be used. Ensure the SWIOTLB path is declined whenever the DMA direct path is selected. Signed-off-by: Leon Romanovsky Signed-off-by: Marek Szyprowski Link: https://lore.kernel.org/r/20260316-dma-debug-overlap-v3-5-1dde90a7f08b@nvidia.com --- kernel/dma/direct.h | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/dma/direct.h b/kernel/dma/direct.h index e89f175e9c2d..6184ff303f08 100644 --- a/kernel/dma/direct.h +++ b/kernel/dma/direct.h @@ -84,7 +84,7 @@ static inline dma_addr_t dma_direct_map_phys(struct device *dev, dma_addr_t dma_addr; if (is_swiotlb_force_bounce(dev)) { - if (attrs & DMA_ATTR_MMIO) + if (attrs & (DMA_ATTR_MMIO | DMA_ATTR_REQUIRE_COHERENT)) return DMA_MAPPING_ERROR; return swiotlb_map(dev, phys, size, dir, attrs); @@ -98,7 +98,8 @@ static inline dma_addr_t dma_direct_map_phys(struct device *dev, dma_addr = phys_to_dma(dev, phys); if (unlikely(!dma_capable(dev, dma_addr, size, true)) || dma_kmalloc_needs_bounce(dev, size, dir)) { - if (is_swiotlb_active(dev)) + if (is_swiotlb_active(dev) && + !(attrs & DMA_ATTR_REQUIRE_COHERENT)) return swiotlb_map(dev, phys, size, dir, attrs); goto err_overflow; @@ -123,7 +124,7 @@ static inline void dma_direct_unmap_phys(struct device *dev, dma_addr_t addr, { phys_addr_t phys; - if (attrs & DMA_ATTR_MMIO) + if (attrs & (DMA_ATTR_MMIO | DMA_ATTR_REQUIRE_COHERENT)) /* nothing to do: uncached and no swiotlb */ return; -- cgit v1.2.3 From 6c2128505f61b504c79a20b89596feba61388112 Mon Sep 17 00:00:00 2001 From: Ihor Solodrai Date: Thu, 19 Mar 2026 17:08:08 -0700 Subject: bpf: Fix exception exit lock checking for subprogs process_bpf_exit_full() passes check_lock = !curframe to check_resource_leak(), which is false in cases when bpf_throw() is called from a static subprog. This makes check_resource_leak() to skip validation of active_rcu_locks, active_preempt_locks, and active_irq_id on exception exits from subprogs. At runtime bpf_throw() unwinds the stack via ORC without releasing any user-acquired locks, which may cause various issues as the result. Fix by setting check_lock = true for exception exits regardless of curframe, since exceptions bypass all intermediate frame cleanup. Update the error message prefix to "bpf_throw" for exception exits to distinguish them from normal BPF_EXIT. Fix reject_subprog_with_rcu_read_lock test which was previously passing for the wrong reason. Test program returned directly from the subprog call without closing the RCU section, so the error was triggered by the unclosed RCU lock on normal exit, not by bpf_throw. Update __msg annotations for affected tests to match the new "bpf_throw" error prefix. The spin_lock case is not affected because they are already checked [1] at the call site in do_check_insn() before bpf_throw can run. [1] https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/tree/kernel/bpf/verifier.c?h=v7.0-rc4#n21098 Assisted-by: Claude:claude-opus-4-6 Fixes: f18b03fabaa9 ("bpf: Implement BPF exceptions") Signed-off-by: Ihor Solodrai Acked-by: Yonghong Song Acked-by: Kumar Kartikeya Dwivedi Link: https://lore.kernel.org/r/20260320000809.643798-1-ihor.solodrai@linux.dev Signed-off-by: Alexei Starovoitov --- kernel/bpf/verifier.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index df22bfc572e2..5c0e6809024f 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -20911,7 +20911,8 @@ static int process_bpf_exit_full(struct bpf_verifier_env *env, * state when it exits. */ int err = check_resource_leak(env, exception_exit, - !env->cur_state->curframe, + exception_exit || !env->cur_state->curframe, + exception_exit ? "bpf_throw" : "BPF_EXIT instruction in main prog"); if (err) return err; -- cgit v1.2.3 From c77b30bd1dcb61f66c640ff7d2757816210c7cb0 Mon Sep 17 00:00:00 2001 From: Jenny Guanni Qu Date: Wed, 11 Mar 2026 01:11:15 +0000 Subject: bpf: Fix undefined behavior in interpreter sdiv/smod for INT_MIN The BPF interpreter's signed 32-bit division and modulo handlers use the kernel abs() macro on s32 operands. The abs() macro documentation (include/linux/math.h) explicitly states the result is undefined when the input is the type minimum. When DST contains S32_MIN (0x80000000), abs((s32)DST) triggers undefined behavior and returns S32_MIN unchanged on arm64/x86. This value is then sign-extended to u64 as 0xFFFFFFFF80000000, causing do_div() to compute the wrong result. The verifier's abstract interpretation (scalar32_min_max_sdiv) computes the mathematically correct result for range tracking, creating a verifier/interpreter mismatch that can be exploited for out-of-bounds map value access. Introduce abs_s32() which handles S32_MIN correctly by casting to u32 before negating, avoiding signed overflow entirely. Replace all 8 abs((s32)...) call sites in the interpreter's sdiv32/smod32 handlers. s32 is the only affected case -- the s64 division/modulo handlers do not use abs(). Fixes: ec0e2da95f72 ("bpf: Support new signed div/mod instructions.") Acked-by: Yonghong Song Acked-by: Mykyta Yatsenko Signed-off-by: Jenny Guanni Qu Link: https://lore.kernel.org/r/20260311011116.2108005-2-qguanni@gmail.com Signed-off-by: Alexei Starovoitov --- kernel/bpf/core.c | 22 ++++++++++++++-------- 1 file changed, 14 insertions(+), 8 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c index 9e126be33755..7b675a451ec8 100644 --- a/kernel/bpf/core.c +++ b/kernel/bpf/core.c @@ -1757,6 +1757,12 @@ bool bpf_opcode_in_insntable(u8 code) } #ifndef CONFIG_BPF_JIT_ALWAYS_ON +/* Absolute value of s32 without undefined behavior for S32_MIN */ +static u32 abs_s32(s32 x) +{ + return x >= 0 ? (u32)x : -(u32)x; +} + /** * ___bpf_prog_run - run eBPF program on a given context * @regs: is the array of MAX_BPF_EXT_REG eBPF pseudo-registers @@ -1921,8 +1927,8 @@ select_insn: DST = do_div(AX, (u32) SRC); break; case 1: - AX = abs((s32)DST); - AX = do_div(AX, abs((s32)SRC)); + AX = abs_s32((s32)DST); + AX = do_div(AX, abs_s32((s32)SRC)); if ((s32)DST < 0) DST = (u32)-AX; else @@ -1949,8 +1955,8 @@ select_insn: DST = do_div(AX, (u32) IMM); break; case 1: - AX = abs((s32)DST); - AX = do_div(AX, abs((s32)IMM)); + AX = abs_s32((s32)DST); + AX = do_div(AX, abs_s32((s32)IMM)); if ((s32)DST < 0) DST = (u32)-AX; else @@ -1976,8 +1982,8 @@ select_insn: DST = (u32) AX; break; case 1: - AX = abs((s32)DST); - do_div(AX, abs((s32)SRC)); + AX = abs_s32((s32)DST); + do_div(AX, abs_s32((s32)SRC)); if (((s32)DST < 0) == ((s32)SRC < 0)) DST = (u32)AX; else @@ -2003,8 +2009,8 @@ select_insn: DST = (u32) AX; break; case 1: - AX = abs((s32)DST); - do_div(AX, abs((s32)IMM)); + AX = abs_s32((s32)DST); + do_div(AX, abs_s32((s32)IMM)); if (((s32)DST < 0) == ((s32)IMM < 0)) DST = (u32)AX; else -- cgit v1.2.3 From c845894ebd6fb43226b3118d6b017942550910c5 Mon Sep 17 00:00:00 2001 From: Daniel Wade Date: Sat, 14 Mar 2026 13:15:20 +1100 Subject: bpf: Fix unsound scalar forking in maybe_fork_scalars() for BPF_OR maybe_fork_scalars() is called for both BPF_AND and BPF_OR when the source operand is a constant. When dst has signed range [-1, 0], it forks the verifier state: the pushed path gets dst = 0, the current path gets dst = -1. For BPF_AND this is correct: 0 & K == 0. For BPF_OR this is wrong: 0 | K == K, not 0. The pushed path therefore tracks dst as 0 when the runtime value is K, producing an exploitable verifier/runtime divergence that allows out-of-bounds map access. Fix this by passing env->insn_idx (instead of env->insn_idx + 1) to push_stack(), so the pushed path re-executes the ALU instruction with dst = 0 and naturally computes the correct result for any opcode. Fixes: bffacdb80b93 ("bpf: Recognize special arithmetic shift in the verifier") Signed-off-by: Daniel Wade Reviewed-by: Amery Hung Acked-by: Eduard Zingerman Link: https://lore.kernel.org/r/20260314021521.128361-2-danjwade95@gmail.com Signed-off-by: Alexei Starovoitov --- kernel/bpf/verifier.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 5c0e6809024f..62377bcb66fd 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -15999,7 +15999,7 @@ static int maybe_fork_scalars(struct bpf_verifier_env *env, struct bpf_insn *ins else return 0; - branch = push_stack(env, env->insn_idx + 1, env->insn_idx, false); + branch = push_stack(env, env->insn_idx, env->insn_idx, false); if (IS_ERR(branch)) return PTR_ERR(branch); -- cgit v1.2.3 From bc308be380c136800e1e94c6ce49cb53141d6506 Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Thu, 19 Mar 2026 22:15:06 +0100 Subject: bpf: Fix sync_linked_regs regarding BPF_ADD_CONST32 zext propagation Jenny reported that in sync_linked_regs() the BPF_ADD_CONST32 flag is checked on known_reg (the register narrowed by a conditional branch) instead of reg (the linked target register created by an alu32 operation). Example case with reg: 1. r6 = bpf_get_prandom_u32() 2. r7 = r6 (linked, same id) 3. w7 += 5 (alu32 -- r7 gets BPF_ADD_CONST32, zero-extended by CPU) 4. if w6 < 0xFFFFFFFC goto safe (narrows r6 to [0xFFFFFFFC, 0xFFFFFFFF]) 5. sync_linked_regs() propagates to r7 but does NOT call zext_32_to_64() 6. Verifier thinks r7 is [0x100000001, 0x100000004] instead of [1, 4] Since known_reg above does not have BPF_ADD_CONST32 set above, zext_32_to_64() is never called on alu32-derived linked registers. This causes the verifier to track incorrect 64-bit bounds, while the CPU correctly zero-extends the 32-bit result. The code checking known_reg->id was correct however (see scalars_alu32_wrap selftest case), but the real fix needs to handle both directions - zext propagation should be done when either register has BPF_ADD_CONST32, since the linked relationship involves a 32-bit operation regardless of which side has the flag. Example case with known_reg (exercised also by scalars_alu32_wrap): 1. r1 = r0; w1 += 0x100 (alu32 -- r1 gets BPF_ADD_CONST32) 2. if r1 > 0x80 - known_reg = r1 (has BPF_ADD_CONST32), reg = r0 (doesn't) Hence, fix it by checking for (reg->id | known_reg->id) & BPF_ADD_CONST32. Moreover, sync_linked_regs() also has a soundness issue when two linked registers used different ALU widths: one with BPF_ADD_CONST32 and the other with BPF_ADD_CONST64. The delta relationship between linked registers assumes the same arithmetic width though. When one register went through alu32 (CPU zero-extends the 32-bit result) and the other went through alu64 (no zero-extension), the propagation produces incorrect bounds. Example: r6 = bpf_get_prandom_u32() // fully unknown if r6 >= 0x100000000 goto out // constrain r6 to [0, U32_MAX] r7 = r6 w7 += 1 // alu32: r7.id = N | BPF_ADD_CONST32 r8 = r6 r8 += 2 // alu64: r8.id = N | BPF_ADD_CONST64 if r7 < 0xFFFFFFFF goto out // narrows r7 to [0xFFFFFFFF, 0xFFFFFFFF] At the branch on r7, sync_linked_regs() runs with known_reg=r7 (BPF_ADD_CONST32) and reg=r8 (BPF_ADD_CONST64). The delta path computes: r8 = r7 + (delta_r8 - delta_r7) = 0xFFFFFFFF + (2 - 1) = 0x100000000 Then, because known_reg->id has BPF_ADD_CONST32, zext_32_to_64(r8) is called, truncating r8 to [0, 0]. But r8 used a 64-bit ALU op -- the CPU does NOT zero-extend it. The actual CPU value of r8 is 0xFFFFFFFE + 2 = 0x100000000, not 0. The verifier now underestimates r8's 64-bit bounds, which is a soundness violation. Fix sync_linked_regs() by skipping propagation when the two registers have mixed ALU widths (one BPF_ADD_CONST32, the other BPF_ADD_CONST64). Lastly, fix regsafe() used for path pruning: the existing checks used "& BPF_ADD_CONST" to test for offset linkage, which treated BPF_ADD_CONST32 and BPF_ADD_CONST64 as equivalent. Fixes: 7a433e519364 ("bpf: Support negative offsets, BPF_SUB, and alu32 for linked register tracking") Reported-by: Jenny Guanni Qu Co-developed-by: Puranjay Mohan Signed-off-by: Puranjay Mohan Signed-off-by: Daniel Borkmann Acked-by: Eduard Zingerman Link: https://lore.kernel.org/r/20260319211507.213816-1-daniel@iogearbox.net Signed-off-by: Alexei Starovoitov --- kernel/bpf/verifier.c | 21 +++++++++++++++------ 1 file changed, 15 insertions(+), 6 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 62377bcb66fd..f108c01ff6d0 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -17415,6 +17415,12 @@ static void sync_linked_regs(struct bpf_verifier_env *env, struct bpf_verifier_s continue; if ((reg->id & ~BPF_ADD_CONST) != (known_reg->id & ~BPF_ADD_CONST)) continue; + /* + * Skip mixed 32/64-bit links: the delta relationship doesn't + * hold across different ALU widths. + */ + if (((reg->id ^ known_reg->id) & BPF_ADD_CONST) == BPF_ADD_CONST) + continue; if ((!(reg->id & BPF_ADD_CONST) && !(known_reg->id & BPF_ADD_CONST)) || reg->off == known_reg->off) { s32 saved_subreg_def = reg->subreg_def; @@ -17442,7 +17448,7 @@ static void sync_linked_regs(struct bpf_verifier_env *env, struct bpf_verifier_s scalar32_min_max_add(reg, &fake_reg); scalar_min_max_add(reg, &fake_reg); reg->var_off = tnum_add(reg->var_off, fake_reg.var_off); - if (known_reg->id & BPF_ADD_CONST32) + if ((reg->id | known_reg->id) & BPF_ADD_CONST32) zext_32_to_64(reg); reg_bounds_sync(reg); } @@ -19870,11 +19876,14 @@ static bool regsafe(struct bpf_verifier_env *env, struct bpf_reg_state *rold, * Also verify that new value satisfies old value range knowledge. */ - /* ADD_CONST mismatch: different linking semantics */ - if ((rold->id & BPF_ADD_CONST) && !(rcur->id & BPF_ADD_CONST)) - return false; - - if (rold->id && !(rold->id & BPF_ADD_CONST) && (rcur->id & BPF_ADD_CONST)) + /* + * ADD_CONST flags must match exactly: BPF_ADD_CONST32 and + * BPF_ADD_CONST64 have different linking semantics in + * sync_linked_regs() (alu32 zero-extends, alu64 does not), + * so pruning across different flag types is unsafe. + */ + if (rold->id && + (rold->id & BPF_ADD_CONST) != (rcur->id & BPF_ADD_CONST)) return false; /* Both have offset linkage: offsets must match */ -- cgit v1.2.3 From edca33a56297d5741ccf867669debec116681987 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Mon, 16 Mar 2026 13:07:34 -0400 Subject: tracing: Fix failure to read user space from system call trace events The system call trace events call trace_user_fault_read() to read the user space part of some system calls. This is done by grabbing a per-cpu buffer, disabling migration, enabling preemption, calling copy_from_user(), disabling preemption, enabling migration and checking if the task was preempted while preemption was enabled. If it was, the buffer is considered corrupted and it tries again. There's a safety mechanism that will fail out of this loop if it fails 100 times (with a warning). That warning message was triggered in some pi_futex stress tests. Enabling the sched_switch trace event and traceoff_on_warning, showed the problem: pi_mutex_hammer-1375 [006] d..21 138.981648: sched_switch: prev_comm=pi_mutex_hammer prev_pid=1375 prev_prio=95 prev_state=R+ ==> next_comm=migration/6 next_pid=47 next_prio=0 migration/6-47 [006] d..2. 138.981651: sched_switch: prev_comm=migration/6 prev_pid=47 prev_prio=0 prev_state=S ==> next_comm=pi_mutex_hammer next_pid=1375 next_prio=95 pi_mutex_hammer-1375 [006] d..21 138.981656: sched_switch: prev_comm=pi_mutex_hammer prev_pid=1375 prev_prio=95 prev_state=R+ ==> next_comm=migration/6 next_pid=47 next_prio=0 migration/6-47 [006] d..2. 138.981659: sched_switch: prev_comm=migration/6 prev_pid=47 prev_prio=0 prev_state=S ==> next_comm=pi_mutex_hammer next_pid=1375 next_prio=95 pi_mutex_hammer-1375 [006] d..21 138.981664: sched_switch: prev_comm=pi_mutex_hammer prev_pid=1375 prev_prio=95 prev_state=R+ ==> next_comm=migration/6 next_pid=47 next_prio=0 migration/6-47 [006] d..2. 138.981667: sched_switch: prev_comm=migration/6 prev_pid=47 prev_prio=0 prev_state=S ==> next_comm=pi_mutex_hammer next_pid=1375 next_prio=95 pi_mutex_hammer-1375 [006] d..21 138.981671: sched_switch: prev_comm=pi_mutex_hammer prev_pid=1375 prev_prio=95 prev_state=R+ ==> next_comm=migration/6 next_pid=47 next_prio=0 migration/6-47 [006] d..2. 138.981675: sched_switch: prev_comm=migration/6 prev_pid=47 prev_prio=0 prev_state=S ==> next_comm=pi_mutex_hammer next_pid=1375 next_prio=95 pi_mutex_hammer-1375 [006] d..21 138.981679: sched_switch: prev_comm=pi_mutex_hammer prev_pid=1375 prev_prio=95 prev_state=R+ ==> next_comm=migration/6 next_pid=47 next_prio=0 migration/6-47 [006] d..2. 138.981682: sched_switch: prev_comm=migration/6 prev_pid=47 prev_prio=0 prev_state=S ==> next_comm=pi_mutex_hammer next_pid=1375 next_prio=95 pi_mutex_hammer-1375 [006] d..21 138.981687: sched_switch: prev_comm=pi_mutex_hammer prev_pid=1375 prev_prio=95 prev_state=R+ ==> next_comm=migration/6 next_pid=47 next_prio=0 migration/6-47 [006] d..2. 138.981690: sched_switch: prev_comm=migration/6 prev_pid=47 prev_prio=0 prev_state=S ==> next_comm=pi_mutex_hammer next_pid=1375 next_prio=95 pi_mutex_hammer-1375 [006] d..21 138.981695: sched_switch: prev_comm=pi_mutex_hammer prev_pid=1375 prev_prio=95 prev_state=R+ ==> next_comm=migration/6 next_pid=47 next_prio=0 migration/6-47 [006] d..2. 138.981698: sched_switch: prev_comm=migration/6 prev_pid=47 prev_prio=0 prev_state=S ==> next_comm=pi_mutex_hammer next_pid=1375 next_prio=95 pi_mutex_hammer-1375 [006] d..21 138.981703: sched_switch: prev_comm=pi_mutex_hammer prev_pid=1375 prev_prio=95 prev_state=R+ ==> next_comm=migration/6 next_pid=47 next_prio=0 migration/6-47 [006] d..2. 138.981706: sched_switch: prev_comm=migration/6 prev_pid=47 prev_prio=0 prev_state=S ==> next_comm=pi_mutex_hammer next_pid=1375 next_prio=95 pi_mutex_hammer-1375 [006] d..21 138.981711: sched_switch: prev_comm=pi_mutex_hammer prev_pid=1375 prev_prio=95 prev_state=R+ ==> next_comm=migration/6 next_pid=47 next_prio=0 migration/6-47 [006] d..2. 138.981714: sched_switch: prev_comm=migration/6 prev_pid=47 prev_prio=0 prev_state=S ==> next_comm=pi_mutex_hammer next_pid=1375 next_prio=95 pi_mutex_hammer-1375 [006] d..21 138.981719: sched_switch: prev_comm=pi_mutex_hammer prev_pid=1375 prev_prio=95 prev_state=R+ ==> next_comm=migration/6 next_pid=47 next_prio=0 migration/6-47 [006] d..2. 138.981722: sched_switch: prev_comm=migration/6 prev_pid=47 prev_prio=0 prev_state=S ==> next_comm=pi_mutex_hammer next_pid=1375 next_prio=95 pi_mutex_hammer-1375 [006] d..21 138.981727: sched_switch: prev_comm=pi_mutex_hammer prev_pid=1375 prev_prio=95 prev_state=R+ ==> next_comm=migration/6 next_pid=47 next_prio=0 migration/6-47 [006] d..2. 138.981730: sched_switch: prev_comm=migration/6 prev_pid=47 prev_prio=0 prev_state=S ==> next_comm=pi_mutex_hammer next_pid=1375 next_prio=95 pi_mutex_hammer-1375 [006] d..21 138.981735: sched_switch: prev_comm=pi_mutex_hammer prev_pid=1375 prev_prio=95 prev_state=R+ ==> next_comm=migration/6 next_pid=47 next_prio=0 migration/6-47 [006] d..2. 138.981738: sched_switch: prev_comm=migration/6 prev_pid=47 prev_prio=0 prev_state=S ==> next_comm=pi_mutex_hammer next_pid=1375 next_prio=95 What happened was the task 1375 was flagged to be migrated. When preemption was enabled, the migration thread woke up to migrate that task, but failed because migration for that task was disabled. This caused the loop to fail to exit because the task scheduled out while trying to read user space. Every time the task enabled preemption the migration thread would schedule in, try to migrate the task, fail and let the task continue. But because the loop would only enable preemption with migration disabled, it would always fail because each time it enabled preemption to read user space, the migration thread would try to migrate it. To solve this, when the loop fails to read user space without being scheduled out, enabled and disable preemption with migration enabled. This will allow the migration task to successfully migrate the task and the next loop should succeed to read user space without being scheduled out. Cc: stable@vger.kernel.org Cc: Masami Hiramatsu Cc: Mathieu Desnoyers Link: https://patch.msgid.link/20260316130734.1858a998@gandalf.local.home Fixes: 64cf7d058a005 ("tracing: Have trace_marker use per-cpu data to read user space") Signed-off-by: Steven Rostedt (Google) --- kernel/trace/trace.c | 17 +++++++++++++++++ 1 file changed, 17 insertions(+) (limited to 'kernel') diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index ebd996f8710e..bb4a62f4b953 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -6783,6 +6783,23 @@ char *trace_user_fault_read(struct trace_user_buf_info *tinfo, */ do { + /* + * It is possible that something is trying to migrate this + * task. What happens then, is when preemption is enabled, + * the migration thread will preempt this task, try to + * migrate it, fail, then let it run again. That will + * cause this to loop again and never succeed. + * On failures, enabled and disable preemption with + * migration enabled, to allow the migration thread to + * migrate this task. + */ + if (trys) { + preempt_enable_notrace(); + preempt_disable_notrace(); + cpu = smp_processor_id(); + buffer = per_cpu_ptr(tinfo->tbuf, cpu)->buf; + } + /* * If for some reason, copy_from_user() always causes a context * switch, this would then cause an infinite loop. -- cgit v1.2.3 From 07183aac4a6828e474f00b37c9d795d0d99e18a7 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Wed, 18 Mar 2026 18:55:12 -0400 Subject: tracing: Fix trace_marker copy link list updates When the "copy_trace_marker" option is enabled for an instance, anything written into /sys/kernel/tracing/trace_marker is also copied into that instances buffer. When the option is set, that instance's trace_array descriptor is added to the marker_copies link list. This list is protected by RCU, as all iterations uses an RCU protected list traversal. When the instance is deleted, all the flags that were enabled are cleared. This also clears the copy_trace_marker flag and removes the trace_array descriptor from the list. The issue is after the flags are called, a direct call to update_marker_trace() is performed to clear the flag. This function returns true if the state of the flag changed and false otherwise. If it returns true here, synchronize_rcu() is called to make sure all readers see that its removed from the list. But since the flag was already cleared, the state does not change and the synchronization is never called, leaving a possible UAF bug. Move the clearing of all flags below the updating of the copy_trace_marker option which then makes sure the synchronization is performed. Also use the flag for checking the state in update_marker_trace() instead of looking at if the list is empty. Cc: stable@vger.kernel.org Cc: Masami Hiramatsu Cc: Mathieu Desnoyers Link: https://patch.msgid.link/20260318185512.1b6c7db4@gandalf.local.home Fixes: 7b382efd5e8a ("tracing: Allow the top level trace_marker to write into another instances") Reported-by: Sasha Levin Closes: https://lore.kernel.org/all/20260225133122.237275-1-sashal@kernel.org/ Signed-off-by: Steven Rostedt (Google) --- kernel/trace/trace.c | 19 ++++++++++--------- 1 file changed, 10 insertions(+), 9 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index bb4a62f4b953..a626211ceb9a 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -555,7 +555,7 @@ static bool update_marker_trace(struct trace_array *tr, int enabled) lockdep_assert_held(&event_mutex); if (enabled) { - if (!list_empty(&tr->marker_list)) + if (tr->trace_flags & TRACE_ITER(COPY_MARKER)) return false; list_add_rcu(&tr->marker_list, &marker_copies); @@ -563,10 +563,10 @@ static bool update_marker_trace(struct trace_array *tr, int enabled) return true; } - if (list_empty(&tr->marker_list)) + if (!(tr->trace_flags & TRACE_ITER(COPY_MARKER))) return false; - list_del_init(&tr->marker_list); + list_del_rcu(&tr->marker_list); tr->trace_flags &= ~TRACE_ITER(COPY_MARKER); return true; } @@ -9761,18 +9761,19 @@ static int __remove_instance(struct trace_array *tr) list_del(&tr->list); - /* Disable all the flags that were enabled coming in */ - for (i = 0; i < TRACE_FLAGS_MAX_SIZE; i++) { - if ((1ULL << i) & ZEROED_TRACE_FLAGS) - set_tracer_flag(tr, 1ULL << i, 0); - } - if (printk_trace == tr) update_printk_trace(&global_trace); + /* Must be done before disabling all the flags */ if (update_marker_trace(tr, 0)) synchronize_rcu(); + /* Disable all the flags that were enabled coming in */ + for (i = 0; i < TRACE_FLAGS_MAX_SIZE; i++) { + if ((1ULL << i) & ZEROED_TRACE_FLAGS) + set_tracer_flag(tr, 1ULL << i, 0); + } + tracing_set_nop(tr); clear_ftrace_function_probes(tr); event_trace_del_tracer(tr); -- cgit v1.2.3 From f35dbac6942171dc4ce9398d1d216a59224590a9 Mon Sep 17 00:00:00 2001 From: "Masami Hiramatsu (Google)" Date: Thu, 19 Mar 2026 18:12:19 +0900 Subject: ring-buffer: Fix to update per-subbuf entries of persistent ring buffer Since the validation loop in rb_meta_validate_events() updates the same cpu_buffer->head_page->entries, the other subbuf entries are not updated. Fix to use head_page to update the entries field, since it is the cursor in this loop. Cc: stable@vger.kernel.org Cc: Mathieu Desnoyers Cc: Ian Rogers Fixes: 5f3b6e839f3c ("ring-buffer: Validate boot range memory events") Link: https://patch.msgid.link/177391153882.193994.17158784065013676533.stgit@mhiramat.tok.corp.google.com Signed-off-by: Masami Hiramatsu (Google) Signed-off-by: Steven Rostedt (Google) --- kernel/trace/ring_buffer.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c index 17d0ea0cc3e6..170170bd83bd 100644 --- a/kernel/trace/ring_buffer.c +++ b/kernel/trace/ring_buffer.c @@ -2053,7 +2053,7 @@ static void rb_meta_validate_events(struct ring_buffer_per_cpu *cpu_buffer) entries += ret; entry_bytes += local_read(&head_page->page->commit); - local_set(&cpu_buffer->head_page->entries, ret); + local_set(&head_page->entries, ret); if (head_page == cpu_buffer->commit_page) break; -- cgit v1.2.3 From 50b35c9e50a865600344ab1d8f9a8b3384d7e63d Mon Sep 17 00:00:00 2001 From: Jiri Olsa Date: Thu, 12 Mar 2026 13:37:38 +0100 Subject: ftrace: Use hash argument for tmp_ops in update_ftrace_direct_mod The modify logic registers temporary ftrace_ops object (tmp_ops) to trigger the slow path for all direct callers to be able to safely modify attached addresses. At the moment we use ops->func_hash for tmp_ops filter, which represents all the systems attachments. It's faster to use just the passed hash filter, which contains only the modified sites and is always a subset of the ops->func_hash. Cc: Alexei Starovoitov Cc: Daniel Borkmann Cc: Andrii Nakryiko Cc: Menglong Dong Cc: Song Liu Link: https://patch.msgid.link/20260312123738.129926-1-jolsa@kernel.org Fixes: e93672f770d7 ("ftrace: Add update_ftrace_direct_mod function") Signed-off-by: Jiri Olsa Signed-off-by: Steven Rostedt (Google) --- kernel/trace/ftrace.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index 8df69e702706..413310912609 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -6606,9 +6606,9 @@ int update_ftrace_direct_mod(struct ftrace_ops *ops, struct ftrace_hash *hash, b if (!orig_hash) goto unlock; - /* Enable the tmp_ops to have the same functions as the direct ops */ + /* Enable the tmp_ops to have the same functions as the hash object. */ ftrace_ops_init(&tmp_ops); - tmp_ops.func_hash = ops->func_hash; + tmp_ops.func_hash->filter_hash = hash; err = register_ftrace_function_nolock(&tmp_ops); if (err) -- cgit v1.2.3 From 175b45ed343a9c547b5f45293d3ea08d38a7b6f4 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Sat, 14 Mar 2026 04:12:58 -0700 Subject: srcu: Use raw spinlocks so call_srcu() can be used under preempt_disable() Tree SRCU has used non-raw spinlocks for many years, motivated by a desire to avoid unnecessary real-time latency and the absence of any reason to use raw spinlocks. However, the recent use of SRCU in tracing as the underlying implementation of RCU Tasks Trace means that call_srcu() is invoked from preemption-disabled regions of code, which in turn requires that any locks acquired by call_srcu() or its callees must be raw spinlocks. This commit therefore converts SRCU's spinlocks to raw spinlocks. [boqun: Add Fixes tag] Reported-by: Kumar Kartikeya Dwivedi Fixes: c27cea4416a3 ("rcu: Re-implement RCU Tasks Trace in terms of SRCU-fast") Signed-off-by: Paul E. McKenney Signed-off-by: Boqun Feng Cc: Sebastian Andrzej Siewior --- kernel/rcu/rcu.h | 9 +++ kernel/rcu/srcutree.c | 174 ++++++++++++++++++++------------------------------ 2 files changed, 78 insertions(+), 105 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/rcu.h b/kernel/rcu/rcu.h index dc5d614b372c..9b10b57b79ad 100644 --- a/kernel/rcu/rcu.h +++ b/kernel/rcu/rcu.h @@ -502,6 +502,15 @@ do { \ ___locked; \ }) +#define raw_spin_trylock_irqsave_rcu_node(p, flags) \ +({ \ + bool ___locked = raw_spin_trylock_irqsave(&ACCESS_PRIVATE(p, lock), flags); \ + \ + if (___locked) \ + smp_mb__after_unlock_lock(); \ + ___locked; \ +}) + #define raw_lockdep_assert_held_rcu_node(p) \ lockdep_assert_held(&ACCESS_PRIVATE(p, lock)) diff --git a/kernel/rcu/srcutree.c b/kernel/rcu/srcutree.c index aef8e91ad33e..2328827f8775 100644 --- a/kernel/rcu/srcutree.c +++ b/kernel/rcu/srcutree.c @@ -77,42 +77,6 @@ static void srcu_reschedule(struct srcu_struct *ssp, unsigned long delay); static void process_srcu(struct work_struct *work); static void srcu_delay_timer(struct timer_list *t); -/* Wrappers for lock acquisition and release, see raw_spin_lock_rcu_node(). */ -#define spin_lock_rcu_node(p) \ -do { \ - spin_lock(&ACCESS_PRIVATE(p, lock)); \ - smp_mb__after_unlock_lock(); \ -} while (0) - -#define spin_unlock_rcu_node(p) spin_unlock(&ACCESS_PRIVATE(p, lock)) - -#define spin_lock_irq_rcu_node(p) \ -do { \ - spin_lock_irq(&ACCESS_PRIVATE(p, lock)); \ - smp_mb__after_unlock_lock(); \ -} while (0) - -#define spin_unlock_irq_rcu_node(p) \ - spin_unlock_irq(&ACCESS_PRIVATE(p, lock)) - -#define spin_lock_irqsave_rcu_node(p, flags) \ -do { \ - spin_lock_irqsave(&ACCESS_PRIVATE(p, lock), flags); \ - smp_mb__after_unlock_lock(); \ -} while (0) - -#define spin_trylock_irqsave_rcu_node(p, flags) \ -({ \ - bool ___locked = spin_trylock_irqsave(&ACCESS_PRIVATE(p, lock), flags); \ - \ - if (___locked) \ - smp_mb__after_unlock_lock(); \ - ___locked; \ -}) - -#define spin_unlock_irqrestore_rcu_node(p, flags) \ - spin_unlock_irqrestore(&ACCESS_PRIVATE(p, lock), flags) \ - /* * Initialize SRCU per-CPU data. Note that statically allocated * srcu_struct structures might already have srcu_read_lock() and @@ -131,7 +95,7 @@ static void init_srcu_struct_data(struct srcu_struct *ssp) */ for_each_possible_cpu(cpu) { sdp = per_cpu_ptr(ssp->sda, cpu); - spin_lock_init(&ACCESS_PRIVATE(sdp, lock)); + raw_spin_lock_init(&ACCESS_PRIVATE(sdp, lock)); rcu_segcblist_init(&sdp->srcu_cblist); sdp->srcu_cblist_invoking = false; sdp->srcu_gp_seq_needed = ssp->srcu_sup->srcu_gp_seq; @@ -186,7 +150,7 @@ static bool init_srcu_struct_nodes(struct srcu_struct *ssp, gfp_t gfp_flags) /* Each pass through this loop initializes one srcu_node structure. */ srcu_for_each_node_breadth_first(ssp, snp) { - spin_lock_init(&ACCESS_PRIVATE(snp, lock)); + raw_spin_lock_init(&ACCESS_PRIVATE(snp, lock)); BUILD_BUG_ON(ARRAY_SIZE(snp->srcu_have_cbs) != ARRAY_SIZE(snp->srcu_data_have_cbs)); for (i = 0; i < ARRAY_SIZE(snp->srcu_have_cbs); i++) { @@ -242,7 +206,7 @@ static int init_srcu_struct_fields(struct srcu_struct *ssp, bool is_static) if (!ssp->srcu_sup) return -ENOMEM; if (!is_static) - spin_lock_init(&ACCESS_PRIVATE(ssp->srcu_sup, lock)); + raw_spin_lock_init(&ACCESS_PRIVATE(ssp->srcu_sup, lock)); ssp->srcu_sup->srcu_size_state = SRCU_SIZE_SMALL; ssp->srcu_sup->node = NULL; mutex_init(&ssp->srcu_sup->srcu_cb_mutex); @@ -394,20 +358,20 @@ static void srcu_transition_to_big(struct srcu_struct *ssp) /* Double-checked locking on ->srcu_size-state. */ if (smp_load_acquire(&ssp->srcu_sup->srcu_size_state) != SRCU_SIZE_SMALL) return; - spin_lock_irqsave_rcu_node(ssp->srcu_sup, flags); + raw_spin_lock_irqsave_rcu_node(ssp->srcu_sup, flags); if (smp_load_acquire(&ssp->srcu_sup->srcu_size_state) != SRCU_SIZE_SMALL) { - spin_unlock_irqrestore_rcu_node(ssp->srcu_sup, flags); + raw_spin_unlock_irqrestore_rcu_node(ssp->srcu_sup, flags); return; } __srcu_transition_to_big(ssp); - spin_unlock_irqrestore_rcu_node(ssp->srcu_sup, flags); + raw_spin_unlock_irqrestore_rcu_node(ssp->srcu_sup, flags); } /* * Check to see if the just-encountered contention event justifies * a transition to SRCU_SIZE_BIG. */ -static void spin_lock_irqsave_check_contention(struct srcu_struct *ssp) +static void raw_spin_lock_irqsave_check_contention(struct srcu_struct *ssp) { unsigned long j; @@ -429,16 +393,16 @@ static void spin_lock_irqsave_check_contention(struct srcu_struct *ssp) * to SRCU_SIZE_BIG. But only if the srcutree.convert_to_big module * parameter permits this. */ -static void spin_lock_irqsave_sdp_contention(struct srcu_data *sdp, unsigned long *flags) +static void raw_spin_lock_irqsave_sdp_contention(struct srcu_data *sdp, unsigned long *flags) { struct srcu_struct *ssp = sdp->ssp; - if (spin_trylock_irqsave_rcu_node(sdp, *flags)) + if (raw_spin_trylock_irqsave_rcu_node(sdp, *flags)) return; - spin_lock_irqsave_rcu_node(ssp->srcu_sup, *flags); - spin_lock_irqsave_check_contention(ssp); - spin_unlock_irqrestore_rcu_node(ssp->srcu_sup, *flags); - spin_lock_irqsave_rcu_node(sdp, *flags); + raw_spin_lock_irqsave_rcu_node(ssp->srcu_sup, *flags); + raw_spin_lock_irqsave_check_contention(ssp); + raw_spin_unlock_irqrestore_rcu_node(ssp->srcu_sup, *flags); + raw_spin_lock_irqsave_rcu_node(sdp, *flags); } /* @@ -447,12 +411,12 @@ static void spin_lock_irqsave_sdp_contention(struct srcu_data *sdp, unsigned lon * to SRCU_SIZE_BIG. But only if the srcutree.convert_to_big module * parameter permits this. */ -static void spin_lock_irqsave_ssp_contention(struct srcu_struct *ssp, unsigned long *flags) +static void raw_spin_lock_irqsave_ssp_contention(struct srcu_struct *ssp, unsigned long *flags) { - if (spin_trylock_irqsave_rcu_node(ssp->srcu_sup, *flags)) + if (raw_spin_trylock_irqsave_rcu_node(ssp->srcu_sup, *flags)) return; - spin_lock_irqsave_rcu_node(ssp->srcu_sup, *flags); - spin_lock_irqsave_check_contention(ssp); + raw_spin_lock_irqsave_rcu_node(ssp->srcu_sup, *flags); + raw_spin_lock_irqsave_check_contention(ssp); } /* @@ -470,13 +434,13 @@ static void check_init_srcu_struct(struct srcu_struct *ssp) /* The smp_load_acquire() pairs with the smp_store_release(). */ if (!rcu_seq_state(smp_load_acquire(&ssp->srcu_sup->srcu_gp_seq_needed))) /*^^^*/ return; /* Already initialized. */ - spin_lock_irqsave_rcu_node(ssp->srcu_sup, flags); + raw_spin_lock_irqsave_rcu_node(ssp->srcu_sup, flags); if (!rcu_seq_state(ssp->srcu_sup->srcu_gp_seq_needed)) { - spin_unlock_irqrestore_rcu_node(ssp->srcu_sup, flags); + raw_spin_unlock_irqrestore_rcu_node(ssp->srcu_sup, flags); return; } init_srcu_struct_fields(ssp, true); - spin_unlock_irqrestore_rcu_node(ssp->srcu_sup, flags); + raw_spin_unlock_irqrestore_rcu_node(ssp->srcu_sup, flags); } /* @@ -742,9 +706,9 @@ void cleanup_srcu_struct(struct srcu_struct *ssp) unsigned long delay; struct srcu_usage *sup = ssp->srcu_sup; - spin_lock_irq_rcu_node(ssp->srcu_sup); + raw_spin_lock_irq_rcu_node(ssp->srcu_sup); delay = srcu_get_delay(ssp); - spin_unlock_irq_rcu_node(ssp->srcu_sup); + raw_spin_unlock_irq_rcu_node(ssp->srcu_sup); if (WARN_ON(!delay)) return; /* Just leak it! */ if (WARN_ON(srcu_readers_active(ssp))) @@ -960,7 +924,7 @@ static void srcu_gp_end(struct srcu_struct *ssp) mutex_lock(&sup->srcu_cb_mutex); /* End the current grace period. */ - spin_lock_irq_rcu_node(sup); + raw_spin_lock_irq_rcu_node(sup); idx = rcu_seq_state(sup->srcu_gp_seq); WARN_ON_ONCE(idx != SRCU_STATE_SCAN2); if (srcu_gp_is_expedited(ssp)) @@ -971,7 +935,7 @@ static void srcu_gp_end(struct srcu_struct *ssp) gpseq = rcu_seq_current(&sup->srcu_gp_seq); if (ULONG_CMP_LT(sup->srcu_gp_seq_needed_exp, gpseq)) WRITE_ONCE(sup->srcu_gp_seq_needed_exp, gpseq); - spin_unlock_irq_rcu_node(sup); + raw_spin_unlock_irq_rcu_node(sup); mutex_unlock(&sup->srcu_gp_mutex); /* A new grace period can start at this point. But only one. */ @@ -983,7 +947,7 @@ static void srcu_gp_end(struct srcu_struct *ssp) } else { idx = rcu_seq_ctr(gpseq) % ARRAY_SIZE(snp->srcu_have_cbs); srcu_for_each_node_breadth_first(ssp, snp) { - spin_lock_irq_rcu_node(snp); + raw_spin_lock_irq_rcu_node(snp); cbs = false; last_lvl = snp >= sup->level[rcu_num_lvls - 1]; if (last_lvl) @@ -998,7 +962,7 @@ static void srcu_gp_end(struct srcu_struct *ssp) else mask = snp->srcu_data_have_cbs[idx]; snp->srcu_data_have_cbs[idx] = 0; - spin_unlock_irq_rcu_node(snp); + raw_spin_unlock_irq_rcu_node(snp); if (cbs) srcu_schedule_cbs_snp(ssp, snp, mask, cbdelay); } @@ -1008,27 +972,27 @@ static void srcu_gp_end(struct srcu_struct *ssp) if (!(gpseq & counter_wrap_check)) for_each_possible_cpu(cpu) { sdp = per_cpu_ptr(ssp->sda, cpu); - spin_lock_irq_rcu_node(sdp); + raw_spin_lock_irq_rcu_node(sdp); if (ULONG_CMP_GE(gpseq, sdp->srcu_gp_seq_needed + 100)) sdp->srcu_gp_seq_needed = gpseq; if (ULONG_CMP_GE(gpseq, sdp->srcu_gp_seq_needed_exp + 100)) sdp->srcu_gp_seq_needed_exp = gpseq; - spin_unlock_irq_rcu_node(sdp); + raw_spin_unlock_irq_rcu_node(sdp); } /* Callback initiation done, allow grace periods after next. */ mutex_unlock(&sup->srcu_cb_mutex); /* Start a new grace period if needed. */ - spin_lock_irq_rcu_node(sup); + raw_spin_lock_irq_rcu_node(sup); gpseq = rcu_seq_current(&sup->srcu_gp_seq); if (!rcu_seq_state(gpseq) && ULONG_CMP_LT(gpseq, sup->srcu_gp_seq_needed)) { srcu_gp_start(ssp); - spin_unlock_irq_rcu_node(sup); + raw_spin_unlock_irq_rcu_node(sup); srcu_reschedule(ssp, 0); } else { - spin_unlock_irq_rcu_node(sup); + raw_spin_unlock_irq_rcu_node(sup); } /* Transition to big if needed. */ @@ -1059,19 +1023,19 @@ static void srcu_funnel_exp_start(struct srcu_struct *ssp, struct srcu_node *snp if (WARN_ON_ONCE(rcu_seq_done(&ssp->srcu_sup->srcu_gp_seq, s)) || (!srcu_invl_snp_seq(sgsne) && ULONG_CMP_GE(sgsne, s))) return; - spin_lock_irqsave_rcu_node(snp, flags); + raw_spin_lock_irqsave_rcu_node(snp, flags); sgsne = snp->srcu_gp_seq_needed_exp; if (!srcu_invl_snp_seq(sgsne) && ULONG_CMP_GE(sgsne, s)) { - spin_unlock_irqrestore_rcu_node(snp, flags); + raw_spin_unlock_irqrestore_rcu_node(snp, flags); return; } WRITE_ONCE(snp->srcu_gp_seq_needed_exp, s); - spin_unlock_irqrestore_rcu_node(snp, flags); + raw_spin_unlock_irqrestore_rcu_node(snp, flags); } - spin_lock_irqsave_ssp_contention(ssp, &flags); + raw_spin_lock_irqsave_ssp_contention(ssp, &flags); if (ULONG_CMP_LT(ssp->srcu_sup->srcu_gp_seq_needed_exp, s)) WRITE_ONCE(ssp->srcu_sup->srcu_gp_seq_needed_exp, s); - spin_unlock_irqrestore_rcu_node(ssp->srcu_sup, flags); + raw_spin_unlock_irqrestore_rcu_node(ssp->srcu_sup, flags); } /* @@ -1109,12 +1073,12 @@ static void srcu_funnel_gp_start(struct srcu_struct *ssp, struct srcu_data *sdp, for (snp = snp_leaf; snp != NULL; snp = snp->srcu_parent) { if (WARN_ON_ONCE(rcu_seq_done(&sup->srcu_gp_seq, s)) && snp != snp_leaf) return; /* GP already done and CBs recorded. */ - spin_lock_irqsave_rcu_node(snp, flags); + raw_spin_lock_irqsave_rcu_node(snp, flags); snp_seq = snp->srcu_have_cbs[idx]; if (!srcu_invl_snp_seq(snp_seq) && ULONG_CMP_GE(snp_seq, s)) { if (snp == snp_leaf && snp_seq == s) snp->srcu_data_have_cbs[idx] |= sdp->grpmask; - spin_unlock_irqrestore_rcu_node(snp, flags); + raw_spin_unlock_irqrestore_rcu_node(snp, flags); if (snp == snp_leaf && snp_seq != s) { srcu_schedule_cbs_sdp(sdp, do_norm ? SRCU_INTERVAL : 0); return; @@ -1129,11 +1093,11 @@ static void srcu_funnel_gp_start(struct srcu_struct *ssp, struct srcu_data *sdp, sgsne = snp->srcu_gp_seq_needed_exp; if (!do_norm && (srcu_invl_snp_seq(sgsne) || ULONG_CMP_LT(sgsne, s))) WRITE_ONCE(snp->srcu_gp_seq_needed_exp, s); - spin_unlock_irqrestore_rcu_node(snp, flags); + raw_spin_unlock_irqrestore_rcu_node(snp, flags); } /* Top of tree, must ensure the grace period will be started. */ - spin_lock_irqsave_ssp_contention(ssp, &flags); + raw_spin_lock_irqsave_ssp_contention(ssp, &flags); if (ULONG_CMP_LT(sup->srcu_gp_seq_needed, s)) { /* * Record need for grace period s. Pair with load @@ -1160,7 +1124,7 @@ static void srcu_funnel_gp_start(struct srcu_struct *ssp, struct srcu_data *sdp, else if (list_empty(&sup->work.work.entry)) list_add(&sup->work.work.entry, &srcu_boot_list); } - spin_unlock_irqrestore_rcu_node(sup, flags); + raw_spin_unlock_irqrestore_rcu_node(sup, flags); } /* @@ -1172,9 +1136,9 @@ static bool try_check_zero(struct srcu_struct *ssp, int idx, int trycount) { unsigned long curdelay; - spin_lock_irq_rcu_node(ssp->srcu_sup); + raw_spin_lock_irq_rcu_node(ssp->srcu_sup); curdelay = !srcu_get_delay(ssp); - spin_unlock_irq_rcu_node(ssp->srcu_sup); + raw_spin_unlock_irq_rcu_node(ssp->srcu_sup); for (;;) { if (srcu_readers_active_idx_check(ssp, idx)) @@ -1285,12 +1249,12 @@ static bool srcu_should_expedite(struct srcu_struct *ssp) return false; /* If the local srcu_data structure has callbacks, not idle. */ sdp = raw_cpu_ptr(ssp->sda); - spin_lock_irqsave_rcu_node(sdp, flags); + raw_spin_lock_irqsave_rcu_node(sdp, flags); if (rcu_segcblist_pend_cbs(&sdp->srcu_cblist)) { - spin_unlock_irqrestore_rcu_node(sdp, flags); + raw_spin_unlock_irqrestore_rcu_node(sdp, flags); return false; /* Callbacks already present, so not idle. */ } - spin_unlock_irqrestore_rcu_node(sdp, flags); + raw_spin_unlock_irqrestore_rcu_node(sdp, flags); /* * No local callbacks, so probabilistically probe global state. @@ -1350,7 +1314,7 @@ static unsigned long srcu_gp_start_if_needed(struct srcu_struct *ssp, sdp = per_cpu_ptr(ssp->sda, get_boot_cpu_id()); else sdp = raw_cpu_ptr(ssp->sda); - spin_lock_irqsave_sdp_contention(sdp, &flags); + raw_spin_lock_irqsave_sdp_contention(sdp, &flags); if (rhp) rcu_segcblist_enqueue(&sdp->srcu_cblist, rhp); /* @@ -1410,7 +1374,7 @@ static unsigned long srcu_gp_start_if_needed(struct srcu_struct *ssp, sdp->srcu_gp_seq_needed_exp = s; needexp = true; } - spin_unlock_irqrestore_rcu_node(sdp, flags); + raw_spin_unlock_irqrestore_rcu_node(sdp, flags); /* Ensure that snp node tree is fully initialized before traversing it */ if (ss_state < SRCU_SIZE_WAIT_BARRIER) @@ -1522,7 +1486,7 @@ static void __synchronize_srcu(struct srcu_struct *ssp, bool do_norm) /* * Make sure that later code is ordered after the SRCU grace - * period. This pairs with the spin_lock_irq_rcu_node() + * period. This pairs with the raw_spin_lock_irq_rcu_node() * in srcu_invoke_callbacks(). Unlike Tree RCU, this is needed * because the current CPU might have been totally uninvolved with * (and thus unordered against) that grace period. @@ -1701,7 +1665,7 @@ static void srcu_barrier_cb(struct rcu_head *rhp) */ static void srcu_barrier_one_cpu(struct srcu_struct *ssp, struct srcu_data *sdp) { - spin_lock_irq_rcu_node(sdp); + raw_spin_lock_irq_rcu_node(sdp); atomic_inc(&ssp->srcu_sup->srcu_barrier_cpu_cnt); sdp->srcu_barrier_head.func = srcu_barrier_cb; debug_rcu_head_queue(&sdp->srcu_barrier_head); @@ -1710,7 +1674,7 @@ static void srcu_barrier_one_cpu(struct srcu_struct *ssp, struct srcu_data *sdp) debug_rcu_head_unqueue(&sdp->srcu_barrier_head); atomic_dec(&ssp->srcu_sup->srcu_barrier_cpu_cnt); } - spin_unlock_irq_rcu_node(sdp); + raw_spin_unlock_irq_rcu_node(sdp); } /** @@ -1761,7 +1725,7 @@ static void srcu_expedite_current_cb(struct rcu_head *rhp) bool needcb = false; struct srcu_data *sdp = container_of(rhp, struct srcu_data, srcu_ec_head); - spin_lock_irqsave_sdp_contention(sdp, &flags); + raw_spin_lock_irqsave_sdp_contention(sdp, &flags); if (sdp->srcu_ec_state == SRCU_EC_IDLE) { WARN_ON_ONCE(1); } else if (sdp->srcu_ec_state == SRCU_EC_PENDING) { @@ -1771,7 +1735,7 @@ static void srcu_expedite_current_cb(struct rcu_head *rhp) sdp->srcu_ec_state = SRCU_EC_PENDING; needcb = true; } - spin_unlock_irqrestore_rcu_node(sdp, flags); + raw_spin_unlock_irqrestore_rcu_node(sdp, flags); // If needed, requeue ourselves as an expedited SRCU callback. if (needcb) __call_srcu(sdp->ssp, &sdp->srcu_ec_head, srcu_expedite_current_cb, false); @@ -1795,7 +1759,7 @@ void srcu_expedite_current(struct srcu_struct *ssp) migrate_disable(); sdp = this_cpu_ptr(ssp->sda); - spin_lock_irqsave_sdp_contention(sdp, &flags); + raw_spin_lock_irqsave_sdp_contention(sdp, &flags); if (sdp->srcu_ec_state == SRCU_EC_IDLE) { sdp->srcu_ec_state = SRCU_EC_PENDING; needcb = true; @@ -1804,7 +1768,7 @@ void srcu_expedite_current(struct srcu_struct *ssp) } else { WARN_ON_ONCE(sdp->srcu_ec_state != SRCU_EC_REPOST); } - spin_unlock_irqrestore_rcu_node(sdp, flags); + raw_spin_unlock_irqrestore_rcu_node(sdp, flags); // If needed, queue an expedited SRCU callback. if (needcb) __call_srcu(ssp, &sdp->srcu_ec_head, srcu_expedite_current_cb, false); @@ -1848,17 +1812,17 @@ static void srcu_advance_state(struct srcu_struct *ssp) */ idx = rcu_seq_state(smp_load_acquire(&ssp->srcu_sup->srcu_gp_seq)); /* ^^^ */ if (idx == SRCU_STATE_IDLE) { - spin_lock_irq_rcu_node(ssp->srcu_sup); + raw_spin_lock_irq_rcu_node(ssp->srcu_sup); if (ULONG_CMP_GE(ssp->srcu_sup->srcu_gp_seq, ssp->srcu_sup->srcu_gp_seq_needed)) { WARN_ON_ONCE(rcu_seq_state(ssp->srcu_sup->srcu_gp_seq)); - spin_unlock_irq_rcu_node(ssp->srcu_sup); + raw_spin_unlock_irq_rcu_node(ssp->srcu_sup); mutex_unlock(&ssp->srcu_sup->srcu_gp_mutex); return; } idx = rcu_seq_state(READ_ONCE(ssp->srcu_sup->srcu_gp_seq)); if (idx == SRCU_STATE_IDLE) srcu_gp_start(ssp); - spin_unlock_irq_rcu_node(ssp->srcu_sup); + raw_spin_unlock_irq_rcu_node(ssp->srcu_sup); if (idx != SRCU_STATE_IDLE) { mutex_unlock(&ssp->srcu_sup->srcu_gp_mutex); return; /* Someone else started the grace period. */ @@ -1872,10 +1836,10 @@ static void srcu_advance_state(struct srcu_struct *ssp) return; /* readers present, retry later. */ } srcu_flip(ssp); - spin_lock_irq_rcu_node(ssp->srcu_sup); + raw_spin_lock_irq_rcu_node(ssp->srcu_sup); rcu_seq_set_state(&ssp->srcu_sup->srcu_gp_seq, SRCU_STATE_SCAN2); ssp->srcu_sup->srcu_n_exp_nodelay = 0; - spin_unlock_irq_rcu_node(ssp->srcu_sup); + raw_spin_unlock_irq_rcu_node(ssp->srcu_sup); } if (rcu_seq_state(READ_ONCE(ssp->srcu_sup->srcu_gp_seq)) == SRCU_STATE_SCAN2) { @@ -1913,7 +1877,7 @@ static void srcu_invoke_callbacks(struct work_struct *work) ssp = sdp->ssp; rcu_cblist_init(&ready_cbs); - spin_lock_irq_rcu_node(sdp); + raw_spin_lock_irq_rcu_node(sdp); WARN_ON_ONCE(!rcu_segcblist_segempty(&sdp->srcu_cblist, RCU_NEXT_TAIL)); rcu_segcblist_advance(&sdp->srcu_cblist, rcu_seq_current(&ssp->srcu_sup->srcu_gp_seq)); @@ -1924,7 +1888,7 @@ static void srcu_invoke_callbacks(struct work_struct *work) */ if (sdp->srcu_cblist_invoking || !rcu_segcblist_ready_cbs(&sdp->srcu_cblist)) { - spin_unlock_irq_rcu_node(sdp); + raw_spin_unlock_irq_rcu_node(sdp); return; /* Someone else on the job or nothing to do. */ } @@ -1932,7 +1896,7 @@ static void srcu_invoke_callbacks(struct work_struct *work) sdp->srcu_cblist_invoking = true; rcu_segcblist_extract_done_cbs(&sdp->srcu_cblist, &ready_cbs); len = ready_cbs.len; - spin_unlock_irq_rcu_node(sdp); + raw_spin_unlock_irq_rcu_node(sdp); rhp = rcu_cblist_dequeue(&ready_cbs); for (; rhp != NULL; rhp = rcu_cblist_dequeue(&ready_cbs)) { debug_rcu_head_unqueue(rhp); @@ -1947,11 +1911,11 @@ static void srcu_invoke_callbacks(struct work_struct *work) * Update counts, accelerate new callbacks, and if needed, * schedule another round of callback invocation. */ - spin_lock_irq_rcu_node(sdp); + raw_spin_lock_irq_rcu_node(sdp); rcu_segcblist_add_len(&sdp->srcu_cblist, -len); sdp->srcu_cblist_invoking = false; more = rcu_segcblist_ready_cbs(&sdp->srcu_cblist); - spin_unlock_irq_rcu_node(sdp); + raw_spin_unlock_irq_rcu_node(sdp); /* An SRCU barrier or callbacks from previous nesting work pending */ if (more) srcu_schedule_cbs_sdp(sdp, 0); @@ -1965,7 +1929,7 @@ static void srcu_reschedule(struct srcu_struct *ssp, unsigned long delay) { bool pushgp = true; - spin_lock_irq_rcu_node(ssp->srcu_sup); + raw_spin_lock_irq_rcu_node(ssp->srcu_sup); if (ULONG_CMP_GE(ssp->srcu_sup->srcu_gp_seq, ssp->srcu_sup->srcu_gp_seq_needed)) { if (!WARN_ON_ONCE(rcu_seq_state(ssp->srcu_sup->srcu_gp_seq))) { /* All requests fulfilled, time to go idle. */ @@ -1975,7 +1939,7 @@ static void srcu_reschedule(struct srcu_struct *ssp, unsigned long delay) /* Outstanding request and no GP. Start one. */ srcu_gp_start(ssp); } - spin_unlock_irq_rcu_node(ssp->srcu_sup); + raw_spin_unlock_irq_rcu_node(ssp->srcu_sup); if (pushgp) queue_delayed_work(rcu_gp_wq, &ssp->srcu_sup->work, delay); @@ -1995,9 +1959,9 @@ static void process_srcu(struct work_struct *work) ssp = sup->srcu_ssp; srcu_advance_state(ssp); - spin_lock_irq_rcu_node(ssp->srcu_sup); + raw_spin_lock_irq_rcu_node(ssp->srcu_sup); curdelay = srcu_get_delay(ssp); - spin_unlock_irq_rcu_node(ssp->srcu_sup); + raw_spin_unlock_irq_rcu_node(ssp->srcu_sup); if (curdelay) { WRITE_ONCE(sup->reschedule_count, 0); } else { -- cgit v1.2.3 From 61bbcfb50514a8a94e035a7349697a3790ab4783 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Fri, 20 Mar 2026 20:29:20 -0700 Subject: srcu: Push srcu_node allocation to GP when non-preemptible When the srcutree.convert_to_big and srcutree.big_cpu_lim kernel boot parameters specify initialization-time allocation of the srcu_node tree for statically allocated srcu_struct structures (for example, in DEFINE_SRCU() at build time instead of init_srcu_struct() at runtime), init_srcu_struct_nodes() will attempt to dynamically allocate this tree at the first run-time update-side use of this srcu_struct structure, but while holding a raw spinlock. Because the memory allocator can acquire non-raw spinlocks, this can result in lockdep splats. This commit therefore uses the same SRCU_SIZE_ALLOC trick that is used when the first run-time update-side use of this srcu_struct structure happens before srcu_init() is called. The actual allocation then takes place from workqueue context at the ends of upcoming SRCU grace periods. [boqun: Adjust the sha1 of the Fixes tag] Fixes: 175b45ed343a ("srcu: Use raw spinlocks so call_srcu() can be used under preempt_disable()") Signed-off-by: Paul E. McKenney Signed-off-by: Boqun Feng --- kernel/rcu/srcutree.c | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/srcutree.c b/kernel/rcu/srcutree.c index 2328827f8775..678bd9a73875 100644 --- a/kernel/rcu/srcutree.c +++ b/kernel/rcu/srcutree.c @@ -227,9 +227,12 @@ static int init_srcu_struct_fields(struct srcu_struct *ssp, bool is_static) ssp->srcu_sup->srcu_gp_seq_needed_exp = SRCU_GP_SEQ_INITIAL_VAL; ssp->srcu_sup->srcu_last_gp_end = ktime_get_mono_fast_ns(); if (READ_ONCE(ssp->srcu_sup->srcu_size_state) == SRCU_SIZE_SMALL && SRCU_SIZING_IS_INIT()) { - if (!init_srcu_struct_nodes(ssp, is_static ? GFP_ATOMIC : GFP_KERNEL)) + if (!preemptible()) + WRITE_ONCE(ssp->srcu_sup->srcu_size_state, SRCU_SIZE_ALLOC); + else if (init_srcu_struct_nodes(ssp, GFP_KERNEL)) + WRITE_ONCE(ssp->srcu_sup->srcu_size_state, SRCU_SIZE_BIG); + else goto err_free_sda; - WRITE_ONCE(ssp->srcu_sup->srcu_size_state, SRCU_SIZE_BIG); } ssp->srcu_sup->srcu_ssp = ssp; smp_store_release(&ssp->srcu_sup->srcu_gp_seq_needed, -- cgit v1.2.3 From 7c405fb3279b39244b260b54f1bd6488689ae235 Mon Sep 17 00:00:00 2001 From: Boqun Feng Date: Wed, 18 Mar 2026 17:56:21 -0700 Subject: rcu: Use an intermediate irq_work to start process_srcu() Since commit c27cea4416a3 ("rcu: Re-implement RCU Tasks Trace in terms of SRCU-fast") we switched to SRCU in BPF. However as BPF instrument can happen basically everywhere (including where a scheduler lock is held), call_srcu() now needs to avoid acquiring scheduler lock because otherwise it could cause deadlock [1]. Fix this by following what the previous RCU Tasks Trace did: using an irq_work to delay the queuing of the work to start process_srcu(). [boqun: Apply Joel's feedback] [boqun: Apply Andrea's test feedback] Reported-by: Andrea Righi Closes: https://lore.kernel.org/all/abjzvz_tL_siV17s@gpd4/ Fixes: commit c27cea4416a3 ("rcu: Re-implement RCU Tasks Trace in terms of SRCU-fast") Link: https://lore.kernel.org/rcu/3c4c5a29-24ea-492d-aeee-e0d9605b4183@nvidia.com/ [1] Suggested-by: Zqiang Tested-by: Andrea Righi Tested-by: Paul E. McKenney Tested-by: Joel Fernandes Signed-off-by: Boqun Feng --- kernel/rcu/srcutree.c | 30 ++++++++++++++++++++++++++++-- 1 file changed, 28 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/srcutree.c b/kernel/rcu/srcutree.c index 678bd9a73875..0d01cd8c4b4a 100644 --- a/kernel/rcu/srcutree.c +++ b/kernel/rcu/srcutree.c @@ -19,6 +19,7 @@ #include #include #include +#include #include #include #include @@ -75,6 +76,7 @@ static bool __read_mostly srcu_init_done; static void srcu_invoke_callbacks(struct work_struct *work); static void srcu_reschedule(struct srcu_struct *ssp, unsigned long delay); static void process_srcu(struct work_struct *work); +static void srcu_irq_work(struct irq_work *work); static void srcu_delay_timer(struct timer_list *t); /* @@ -216,6 +218,7 @@ static int init_srcu_struct_fields(struct srcu_struct *ssp, bool is_static) mutex_init(&ssp->srcu_sup->srcu_barrier_mutex); atomic_set(&ssp->srcu_sup->srcu_barrier_cpu_cnt, 0); INIT_DELAYED_WORK(&ssp->srcu_sup->work, process_srcu); + init_irq_work(&ssp->srcu_sup->irq_work, srcu_irq_work); ssp->srcu_sup->sda_is_static = is_static; if (!is_static) { ssp->sda = alloc_percpu(struct srcu_data); @@ -716,6 +719,8 @@ void cleanup_srcu_struct(struct srcu_struct *ssp) return; /* Just leak it! */ if (WARN_ON(srcu_readers_active(ssp))) return; /* Just leak it! */ + /* Wait for irq_work to finish first as it may queue a new work. */ + irq_work_sync(&sup->irq_work); flush_delayed_work(&sup->work); for_each_possible_cpu(cpu) { struct srcu_data *sdp = per_cpu_ptr(ssp->sda, cpu); @@ -1121,9 +1126,13 @@ static void srcu_funnel_gp_start(struct srcu_struct *ssp, struct srcu_data *sdp, // it isn't. And it does not have to be. After all, it // can only be executed during early boot when there is only // the one boot CPU running with interrupts still disabled. + // + // Use an irq_work here to avoid acquiring runqueue lock with + // srcu rcu_node::lock held. BPF instrument could introduce the + // opposite dependency, hence we need to break the possible + // locking dependency here. if (likely(srcu_init_done)) - queue_delayed_work(rcu_gp_wq, &sup->work, - !!srcu_get_delay(ssp)); + irq_work_queue(&sup->irq_work); else if (list_empty(&sup->work.work.entry)) list_add(&sup->work.work.entry, &srcu_boot_list); } @@ -1982,6 +1991,23 @@ static void process_srcu(struct work_struct *work) srcu_reschedule(ssp, curdelay); } +static void srcu_irq_work(struct irq_work *work) +{ + struct srcu_struct *ssp; + struct srcu_usage *sup; + unsigned long delay; + unsigned long flags; + + sup = container_of(work, struct srcu_usage, irq_work); + ssp = sup->srcu_ssp; + + raw_spin_lock_irqsave_rcu_node(ssp->srcu_sup, flags); + delay = srcu_get_delay(ssp); + raw_spin_unlock_irqrestore_rcu_node(ssp->srcu_sup, flags); + + queue_delayed_work(rcu_gp_wq, &sup->work, !!delay); +} + void srcutorture_get_gp_data(struct srcu_struct *ssp, int *flags, unsigned long *gp_seq) { -- cgit v1.2.3 From a6fc88b22bc8d12ad52e8412c667ec0f5bf055af Mon Sep 17 00:00:00 2001 From: Joel Fernandes Date: Mon, 23 Mar 2026 20:14:18 -0400 Subject: srcu: Use irq_work to start GP in tiny SRCU Tiny SRCU's srcu_gp_start_if_needed() directly calls schedule_work(), which acquires the workqueue pool->lock. This causes a lockdep splat when call_srcu() is called with a scheduler lock held, due to: call_srcu() [holding pi_lock] srcu_gp_start_if_needed() schedule_work() -> pool->lock workqueue_init() / create_worker() [holding pool->lock] wake_up_process() -> try_to_wake_up() -> pi_lock Also add irq_work_sync() to cleanup_srcu_struct() to prevent a use-after-free if a queued irq_work fires after cleanup begins. Tested with rcutorture SRCU-T and no lockdep warnings. [ Thanks to Boqun for similar fix in patch "rcu: Use an intermediate irq_work to start process_srcu()" ] Signed-off-by: Joel Fernandes Reviewed-by: Paul E. McKenney Signed-off-by: Boqun Feng --- kernel/rcu/srcutiny.c | 19 ++++++++++++++++++- 1 file changed, 18 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/rcu/srcutiny.c b/kernel/rcu/srcutiny.c index 3450c3751ef7..a2e2d516e51b 100644 --- a/kernel/rcu/srcutiny.c +++ b/kernel/rcu/srcutiny.c @@ -9,6 +9,7 @@ */ #include +#include #include #include #include @@ -41,6 +42,7 @@ static int init_srcu_struct_fields(struct srcu_struct *ssp) ssp->srcu_idx_max = 0; INIT_WORK(&ssp->srcu_work, srcu_drive_gp); INIT_LIST_HEAD(&ssp->srcu_work.entry); + init_irq_work(&ssp->srcu_irq_work, srcu_tiny_irq_work); return 0; } @@ -84,6 +86,7 @@ EXPORT_SYMBOL_GPL(init_srcu_struct); void cleanup_srcu_struct(struct srcu_struct *ssp) { WARN_ON(ssp->srcu_lock_nesting[0] || ssp->srcu_lock_nesting[1]); + irq_work_sync(&ssp->srcu_irq_work); flush_work(&ssp->srcu_work); WARN_ON(ssp->srcu_gp_running); WARN_ON(ssp->srcu_gp_waiting); @@ -177,6 +180,20 @@ void srcu_drive_gp(struct work_struct *wp) } EXPORT_SYMBOL_GPL(srcu_drive_gp); +/* + * Use an irq_work to defer schedule_work() to avoid acquiring the workqueue + * pool->lock while the caller might hold scheduler locks, causing lockdep + * splats due to workqueue_init() doing a wakeup. + */ +void srcu_tiny_irq_work(struct irq_work *irq_work) +{ + struct srcu_struct *ssp; + + ssp = container_of(irq_work, struct srcu_struct, srcu_irq_work); + schedule_work(&ssp->srcu_work); +} +EXPORT_SYMBOL_GPL(srcu_tiny_irq_work); + static void srcu_gp_start_if_needed(struct srcu_struct *ssp) { unsigned long cookie; @@ -189,7 +206,7 @@ static void srcu_gp_start_if_needed(struct srcu_struct *ssp) WRITE_ONCE(ssp->srcu_idx_max, cookie); if (!READ_ONCE(ssp->srcu_gp_running)) { if (likely(srcu_init_done)) - schedule_work(&ssp->srcu_work); + irq_work_queue(&ssp->srcu_irq_work); else if (list_empty(&ssp->srcu_work.entry)) list_add(&ssp->srcu_work.entry, &srcu_boot_list); } -- cgit v1.2.3