From 23099792bb6fafe4aa4063e261a17e2bb743026f Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Thu, 31 Jul 2025 01:47:30 +0200 Subject: bpf: Add cookie object to bpf maps [ Upstream commit 12df58ad294253ac1d8df0c9bb9cf726397a671d ] Add a cookie to BPF maps to uniquely identify BPF maps for the timespan when the node is up. This is different to comparing a pointer or BPF map id which could get rolled over and reused. Signed-off-by: Daniel Borkmann Link: https://lore.kernel.org/r/20250730234733.530041-1-daniel@iogearbox.net Signed-off-by: Alexei Starovoitov Signed-off-by: Sasha Levin --- kernel/bpf/syscall.c | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'kernel') diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index 6f309248f13f..6d4d08f57ad3 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -31,6 +31,7 @@ #include #include #include +#include #define IS_FD_ARRAY(map) ((map)->map_type == BPF_MAP_TYPE_PERF_EVENT_ARRAY || \ (map)->map_type == BPF_MAP_TYPE_CGROUP_ARRAY || \ @@ -43,6 +44,7 @@ #define BPF_OBJ_FLAG_MASK (BPF_F_RDONLY | BPF_F_WRONLY) DEFINE_PER_CPU(int, bpf_prog_active); +DEFINE_COOKIE(bpf_map_cookie); static DEFINE_IDR(prog_idr); static DEFINE_SPINLOCK(prog_idr_lock); static DEFINE_IDR(map_idr); @@ -886,6 +888,10 @@ static int map_create(union bpf_attr *attr) if (err < 0) goto free_map; + preempt_disable(); + map->cookie = gen_cookie_next(&bpf_map_cookie); + preempt_enable(); + atomic64_set(&map->refcnt, 1); atomic64_set(&map->usercnt, 1); mutex_init(&map->freeze_mutex); -- cgit v1.2.3 From c04992612ed441c1970e58dbbdd01771f76e2c5e Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Mon, 1 Sep 2025 17:19:45 -0400 Subject: bpf: Move bpf map owner out of common struct [ Upstream commit fd1c98f0ef5cbcec842209776505d9e70d8fcd53 ] Given this is only relevant for BPF tail call maps, it is adding up space and penalizing other map types. We also need to extend this with further objects to track / compare to. Therefore, lets move this out into a separate structure and dynamically allocate it only for BPF tail call maps. Signed-off-by: Daniel Borkmann Link: https://lore.kernel.org/r/20250730234733.530041-2-daniel@iogearbox.net Signed-off-by: Alexei Starovoitov Signed-off-by: Sasha Levin --- kernel/bpf/arraymap.c | 1 - kernel/bpf/core.c | 58 ++++++++++++++++++++++++++++++++++++++------------- kernel/bpf/syscall.c | 16 +++++++------- 3 files changed, 51 insertions(+), 24 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/arraymap.c b/kernel/bpf/arraymap.c index 2788da290c21..dc42970dda97 100644 --- a/kernel/bpf/arraymap.c +++ b/kernel/bpf/arraymap.c @@ -1044,7 +1044,6 @@ static struct bpf_map *prog_array_map_alloc(union bpf_attr *attr) INIT_WORK(&aux->work, prog_array_map_clear_deferred); INIT_LIST_HEAD(&aux->poke_progs); mutex_init(&aux->poke_mutex); - spin_lock_init(&aux->owner.lock); map = array_map_alloc(attr); if (IS_ERR(map)) { diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c index 1ded3eb492b8..aa3487e24454 100644 --- a/kernel/bpf/core.c +++ b/kernel/bpf/core.c @@ -1832,31 +1832,59 @@ static unsigned int __bpf_prog_ret0_warn(const void *ctx, } #endif -bool bpf_prog_array_compatible(struct bpf_array *array, - const struct bpf_prog *fp) +static bool __bpf_prog_map_compatible(struct bpf_map *map, + const struct bpf_prog *fp) { - bool ret; + enum bpf_prog_type prog_type = fp->aux->dst_prog ? 
fp->aux->dst_prog->type : fp->type; + struct bpf_prog_aux *aux = fp->aux; + bool ret = false; if (fp->kprobe_override) - return false; - - spin_lock(&array->aux->owner.lock); + return ret; - if (!array->aux->owner.type) { - /* There's no owner yet where we could check for - * compatibility. - */ - array->aux->owner.type = fp->type; - array->aux->owner.jited = fp->jited; + spin_lock(&map->owner_lock); + /* There's no owner yet where we could check for compatibility. */ + if (!map->owner) { + map->owner = bpf_map_owner_alloc(map); + if (!map->owner) + goto err; + map->owner->type = prog_type; + map->owner->jited = fp->jited; + /* Note: xdp_has_frags doesn't exist in aux yet in our branch */ + /* map->owner->xdp_has_frags = aux->xdp_has_frags; */ + map->owner->attach_func_proto = aux->attach_func_proto; ret = true; } else { - ret = array->aux->owner.type == fp->type && - array->aux->owner.jited == fp->jited; + ret = map->owner->type == prog_type && + map->owner->jited == fp->jited; + /* Note: xdp_has_frags check would go here when available */ + /* && map->owner->xdp_has_frags == aux->xdp_has_frags; */ + if (ret && + map->owner->attach_func_proto != aux->attach_func_proto) { + switch (prog_type) { + case BPF_PROG_TYPE_TRACING: + case BPF_PROG_TYPE_LSM: + case BPF_PROG_TYPE_EXT: + case BPF_PROG_TYPE_STRUCT_OPS: + ret = false; + break; + default: + break; + } + } } - spin_unlock(&array->aux->owner.lock); +err: + spin_unlock(&map->owner_lock); return ret; } +bool bpf_prog_array_compatible(struct bpf_array *array, + const struct bpf_prog *fp) +{ + struct bpf_map *map = &array->map; + return __bpf_prog_map_compatible(map, fp); +} + static int bpf_check_tail_call(const struct bpf_prog *fp) { struct bpf_prog_aux *aux = fp->aux; diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index 6d4d08f57ad3..b80d125dcea9 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -477,6 +477,7 @@ static void bpf_map_free_deferred(struct work_struct *work) security_bpf_map_free(map); bpf_map_release_memcg(map); + bpf_map_owner_free(map); /* implementation dependent freeing */ map->ops->map_free(map); } @@ -576,17 +577,15 @@ static unsigned long bpf_map_memory_footprint(const struct bpf_map *map) static void bpf_map_show_fdinfo(struct seq_file *m, struct file *filp) { - const struct bpf_map *map = filp->private_data; - const struct bpf_array *array; + struct bpf_map *map = filp->private_data; u32 type = 0, jited = 0; - if (map->map_type == BPF_MAP_TYPE_PROG_ARRAY) { - array = container_of(map, struct bpf_array, map); - spin_lock(&array->aux->owner.lock); - type = array->aux->owner.type; - jited = array->aux->owner.jited; - spin_unlock(&array->aux->owner.lock); + spin_lock(&map->owner_lock); + if (map->owner) { + type = map->owner->type; + jited = map->owner->jited; } + spin_unlock(&map->owner_lock); seq_printf(m, "map_type:\t%u\n" @@ -895,6 +894,7 @@ static int map_create(union bpf_attr *attr) atomic64_set(&map->refcnt, 1); atomic64_set(&map->usercnt, 1); mutex_init(&map->freeze_mutex); + spin_lock_init(&map->owner_lock); map->spin_lock_off = -EINVAL; map->timer_off = -EINVAL; -- cgit v1.2.3 From c1c74584b9b4043c52e41fec415226e582d266a3 Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Thu, 31 Jul 2025 01:47:33 +0200 Subject: bpf: Fix oob access in cgroup local storage [ Upstream commit abad3d0bad72a52137e0c350c59542d75ae4f513 ] Lonial reported that an out-of-bounds access in cgroup local storage can be crafted via tail calls. 
Given two programs each utilizing a cgroup local storage with a different value size, and one program doing a tail call into the other. The verifier will validate each of the indivial programs just fine. However, in the runtime context the bpf_cg_run_ctx holds an bpf_prog_array_item which contains the BPF program as well as any cgroup local storage flavor the program uses. Helpers such as bpf_get_local_storage() pick this up from the runtime context: ctx = container_of(current->bpf_ctx, struct bpf_cg_run_ctx, run_ctx); storage = ctx->prog_item->cgroup_storage[stype]; if (stype == BPF_CGROUP_STORAGE_SHARED) ptr = &READ_ONCE(storage->buf)->data[0]; else ptr = this_cpu_ptr(storage->percpu_buf); For the second program which was called from the originally attached one, this means bpf_get_local_storage() will pick up the former program's map, not its own. With mismatching sizes, this can result in an unintended out-of-bounds access. To fix this issue, we need to extend bpf_map_owner with an array of storage_cookie[] to match on i) the exact maps from the original program if the second program was using bpf_get_local_storage(), or ii) allow the tail call combination if the second program was not using any of the cgroup local storage maps. Fixes: 7d9c3427894f ("bpf: Make cgroup storages shared between programs on the same cgroup") Reported-by: Lonial Con Signed-off-by: Daniel Borkmann Link: https://lore.kernel.org/r/20250730234733.530041-4-daniel@iogearbox.net Signed-off-by: Alexei Starovoitov Signed-off-by: Sasha Levin --- kernel/bpf/core.c | 15 +++++++++++++++ 1 file changed, 15 insertions(+) (limited to 'kernel') diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c index aa3487e24454..73a1c66e5417 100644 --- a/kernel/bpf/core.c +++ b/kernel/bpf/core.c @@ -1837,7 +1837,9 @@ static bool __bpf_prog_map_compatible(struct bpf_map *map, { enum bpf_prog_type prog_type = fp->aux->dst_prog ? fp->aux->dst_prog->type : fp->type; struct bpf_prog_aux *aux = fp->aux; + enum bpf_cgroup_storage_type i; bool ret = false; + u64 cookie; if (fp->kprobe_override) return ret; @@ -1853,12 +1855,25 @@ static bool __bpf_prog_map_compatible(struct bpf_map *map, /* Note: xdp_has_frags doesn't exist in aux yet in our branch */ /* map->owner->xdp_has_frags = aux->xdp_has_frags; */ map->owner->attach_func_proto = aux->attach_func_proto; + for_each_cgroup_storage_type(i) { + map->owner->storage_cookie[i] = + aux->cgroup_storage[i] ? + aux->cgroup_storage[i]->cookie : 0; + } ret = true; } else { ret = map->owner->type == prog_type && map->owner->jited == fp->jited; /* Note: xdp_has_frags check would go here when available */ /* && map->owner->xdp_has_frags == aux->xdp_has_frags; */ + for_each_cgroup_storage_type(i) { + if (!ret) + break; + cookie = aux->cgroup_storage[i] ? + aux->cgroup_storage[i]->cookie : 0; + ret = map->owner->storage_cookie[i] == cookie || + !cookie; + } if (ret && map->owner->attach_func_proto != aux->attach_func_proto) { switch (prog_type) { -- cgit v1.2.3 From c824d766e472d7357f7ddca150875b6e01530607 Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" Date: Sat, 6 Sep 2025 12:25:54 -0400 Subject: cpufreq/sched: Explicitly synchronize limits_changed flag handling [ Upstream commit 79443a7e9da3c9f68290a8653837e23aba0fa89f ] The handling of the limits_changed flag in struct sugov_policy needs to be explicitly synchronized to ensure that cpufreq policy limits updates will not be missed in some cases. 
Without that synchronization it is theoretically possible that the limits_changed update in sugov_should_update_freq() will be reordered with respect to the reads of the policy limits in cpufreq_driver_resolve_freq() and in that case, if the limits_changed update in sugov_limits() clobbers the one in sugov_should_update_freq(), the new policy limits may not take effect for a long time. Likewise, the limits_changed update in sugov_limits() may theoretically get reordered with respect to the updates of the policy limits in cpufreq_set_policy() and if sugov_should_update_freq() runs between them, the policy limits change may be missed. To ensure that the above situations will not take place, add memory barriers preventing the reordering in question from taking place and add READ_ONCE() and WRITE_ONCE() annotations around all of the limits_changed flag updates to prevent the compiler from messing up with that code. Fixes: 600f5badb78c ("cpufreq: schedutil: Don't skip freq update when limits change") Cc: 5.3+ # 5.3+ Signed-off-by: Rafael J. Wysocki Reviewed-by: Christian Loehle Link: https://patch.msgid.link/3376719.44csPzL39Z@rjwysocki.net [ bw_min => bw_dl ] Signed-off-by: Sasha Levin Signed-off-by: Greg Kroah-Hartman --- kernel/sched/cpufreq_schedutil.c | 28 ++++++++++++++++++++++++---- 1 file changed, 24 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/cpufreq_schedutil.c b/kernel/sched/cpufreq_schedutil.c index 519f742d44f4..954a85b8c275 100644 --- a/kernel/sched/cpufreq_schedutil.c +++ b/kernel/sched/cpufreq_schedutil.c @@ -89,9 +89,20 @@ static bool sugov_should_update_freq(struct sugov_policy *sg_policy, u64 time) if (!cpufreq_this_cpu_can_update(sg_policy->policy)) return false; - if (unlikely(sg_policy->limits_changed)) { - sg_policy->limits_changed = false; + if (unlikely(READ_ONCE(sg_policy->limits_changed))) { + WRITE_ONCE(sg_policy->limits_changed, false); sg_policy->need_freq_update = true; + + /* + * The above limits_changed update must occur before the reads + * of policy limits in cpufreq_driver_resolve_freq() or a policy + * limits update might be missed, so use a memory barrier to + * ensure it. + * + * This pairs with the write memory barrier in sugov_limits(). + */ + smp_mb(); + return true; } @@ -326,7 +337,7 @@ static inline bool sugov_cpu_is_busy(struct sugov_cpu *sg_cpu) { return false; } static inline void ignore_dl_rate_limit(struct sugov_cpu *sg_cpu) { if (cpu_bw_dl(cpu_rq(sg_cpu->cpu)) > sg_cpu->bw_dl) - sg_cpu->sg_policy->limits_changed = true; + WRITE_ONCE(sg_cpu->sg_policy->limits_changed, true); } static inline bool sugov_update_single_common(struct sugov_cpu *sg_cpu, @@ -826,7 +837,16 @@ static void sugov_limits(struct cpufreq_policy *policy) mutex_unlock(&sg_policy->work_lock); } - sg_policy->limits_changed = true; + /* + * The limits_changed update below must take place before the updates + * of policy limits in cpufreq_set_policy() or a policy limits update + * might be missed, so use a memory barrier to ensure it. + * + * This pairs with the memory barrier in sugov_should_update_freq(). 
+ */ + smp_wmb(); + + WRITE_ONCE(sg_policy->limits_changed, true); } struct cpufreq_governor schedutil_gov = { -- cgit v1.2.3 From 1cdb41d4f08a6c8081c51dab2e78c0b0412006e6 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Sun, 7 Sep 2025 20:23:21 -0400 Subject: tracing: Do not add length to print format in synthetic events [ Upstream commit e1a453a57bc76be678bd746f84e3d73f378a9511 ] The following causes a vsnprintf fault: # echo 's:wake_lat char[] wakee; u64 delta;' >> /sys/kernel/tracing/dynamic_events # echo 'hist:keys=pid:ts=common_timestamp.usecs if !(common_flags & 0x18)' > /sys/kernel/tracing/events/sched/sched_waking/trigger # echo 'hist:keys=next_pid:delta=common_timestamp.usecs-$ts:onmatch(sched.sched_waking).trace(wake_lat,next_comm,$delta)' > /sys/kernel/tracing/events/sched/sched_switch/trigger Because the synthetic event's "wakee" field is created as a dynamic string (even though the string copied is not). The print format to print the dynamic string changed from "%*s" to "%s" because another location (__set_synth_event_print_fmt()) exported this to user space, and user space did not need that. But it is still used in print_synth_event(), and the output looks like: -0 [001] d..5. 193.428167: wake_lat: wakee=(efault)sshd-sessiondelta=155 sshd-session-879 [001] d..5. 193.811080: wake_lat: wakee=(efault)kworker/u34:5delta=58 -0 [002] d..5. 193.811198: wake_lat: wakee=(efault)bashdelta=91 bash-880 [002] d..5. 193.811371: wake_lat: wakee=(efault)kworker/u35:2delta=21 -0 [001] d..5. 193.811516: wake_lat: wakee=(efault)sshd-sessiondelta=129 sshd-session-879 [001] d..5. 193.967576: wake_lat: wakee=(efault)kworker/u34:5delta=50 The length isn't needed as the string is always nul terminated. Just print the string and not add the length (which was hard coded to the max string length anyway). Cc: stable@vger.kernel.org Cc: Mathieu Desnoyers Cc: Tom Zanussi Cc: Douglas Raillard Acked-by: Masami Hiramatsu (Google) Link: https://lore.kernel.org/20250407154139.69955768@gandalf.local.home Fixes: 4d38328eb442d ("tracing: Fix synth event printk format for str fields"); Signed-off-by: Steven Rostedt (Google) [ offset calculations instead of union-based data structures ] Signed-off-by: Sasha Levin Signed-off-by: Greg Kroah-Hartman --- kernel/trace/trace_events_synth.c | 2 -- 1 file changed, 2 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace_events_synth.c b/kernel/trace/trace_events_synth.c index ab54810bd8d9..62d146254f47 100644 --- a/kernel/trace/trace_events_synth.c +++ b/kernel/trace/trace_events_synth.c @@ -364,13 +364,11 @@ static enum print_line_t print_synth_event(struct trace_iterator *iter, str_field = (char *)entry + data_offset; trace_seq_printf(s, print_fmt, se->fields[i]->name, - STR_VAR_LEN_MAX, str_field, i == se->n_fields - 1 ? "" : " "); n_u64++; } else { trace_seq_printf(s, print_fmt, se->fields[i]->name, - STR_VAR_LEN_MAX, (char *)&entry->fields[n_u64], i == se->n_fields - 1 ? "" : " "); n_u64 += STR_VAR_LEN_MAX / sizeof(u64); -- cgit v1.2.3 From 9a38cd92493c2e7a89379f6e1c248c3766cf731a Mon Sep 17 00:00:00 2001 From: Luo Gengkun Date: Tue, 19 Aug 2025 10:51:52 +0000 Subject: tracing: Fix tracing_marker may trigger page fault during preempt_disable [ Upstream commit 3d62ab32df065e4a7797204a918f6489ddb8a237 ] Both tracing_mark_write and tracing_mark_raw_write call __copy_from_user_inatomic during preempt_disable. But in some case, __copy_from_user_inatomic may trigger page fault, and will call schedule() subtly. 
And if a task is migrated to other cpu, the following warning will be trigger: if (RB_WARN_ON(cpu_buffer, !local_read(&cpu_buffer->committing))) An example can illustrate this issue: process flow CPU --------------------------------------------------------------------- tracing_mark_raw_write(): cpu:0 ... ring_buffer_lock_reserve(): cpu:0 ... cpu = raw_smp_processor_id() cpu:0 cpu_buffer = buffer->buffers[cpu] cpu:0 ... ... __copy_from_user_inatomic(): cpu:0 ... # page fault do_mem_abort(): cpu:0 ... # Call schedule schedule() cpu:0 ... # the task schedule to cpu1 __buffer_unlock_commit(): cpu:1 ... ring_buffer_unlock_commit(): cpu:1 ... cpu = raw_smp_processor_id() cpu:1 cpu_buffer = buffer->buffers[cpu] cpu:1 As shown above, the process will acquire cpuid twice and the return values are not the same. To fix this problem using copy_from_user_nofault instead of __copy_from_user_inatomic, as the former performs 'access_ok' before copying. Link: https://lore.kernel.org/20250819105152.2766363-1-luogengkun@huaweicloud.com Fixes: 656c7f0d2d2b ("tracing: Replace kmap with copy_from_user() in trace_marker writing") Signed-off-by: Luo Gengkun Reviewed-by: Masami Hiramatsu (Google) Signed-off-by: Steven Rostedt (Google) Signed-off-by: Sasha Levin --- kernel/trace/trace.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 7af8bbc57531..a6040a707abb 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -7233,7 +7233,7 @@ tracing_mark_write(struct file *filp, const char __user *ubuf, entry = ring_buffer_event_data(event); entry->ip = _THIS_IP_; - len = __copy_from_user_inatomic(&entry->buf, ubuf, cnt); + len = copy_from_user_nofault(&entry->buf, ubuf, cnt); if (len) { memcpy(&entry->buf, FAULTED_STR, FAULTED_SIZE); cnt = FAULTED_SIZE; @@ -7308,7 +7308,7 @@ tracing_mark_raw_write(struct file *filp, const char __user *ubuf, entry = ring_buffer_event_data(event); - len = __copy_from_user_inatomic(&entry->id, ubuf, cnt); + len = copy_from_user_nofault(&entry->id, ubuf, cnt); if (len) { entry->id = -1; memcpy(&entry->buf, FAULTED_STR, FAULTED_SIZE); -- cgit v1.2.3 From e7ddb59a63cb8edff094d835ebd03dc44cc17c3e Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Fri, 3 Sep 2021 11:24:17 -0400 Subject: genirq: Provide new interfaces for affinity hints [ Upstream commit 65c7cdedeb3026fabcc967a7aae2f755ad4d0783 ] The discussion about removing the side effect of irq_set_affinity_hint() of actually applying the cpumask (if not NULL) as affinity to the interrupt, unearthed a few unpleasantries: 1) The modular perf drivers rely on the current behaviour for the very wrong reasons. 2) While none of the other drivers prevents user space from changing the affinity, a cursorily inspection shows that there are at least expectations in some drivers. #1 needs to be cleaned up anyway, so that's not a problem #2 might result in subtle regressions especially when irqbalanced (which nowadays ignores the affinity hint) is disabled. Provide new interfaces: irq_update_affinity_hint() - Only sets the affinity hint pointer irq_set_affinity_and_hint() - Set the pointer and apply the affinity to the interrupt Make irq_set_affinity_hint() a wrapper around irq_apply_affinity_hint() and document it to be phased out. 
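For driver authors, the difference between the three entry points can be modelled with a small standalone sketch (plain C, hypothetical names that mirror the helpers above; not the kernel implementation):

    #include <stdbool.h>
    #include <stdio.h>

    /* toy stand-in for an irq descriptor: hint pointer plus applied affinity */
    struct fake_irq { const char *hint; const char *affinity; };

    static int apply_hint(struct fake_irq *d, const char *m, bool setaffinity)
    {
        d->hint = m;                  /* always record the hint pointer */
        if (m && setaffinity)
            d->affinity = m;          /* optionally apply it as the affinity */
        return 0;
    }

    /* new: only sets the hint, leaves the affinity untouched */
    static int update_affinity_hint(struct fake_irq *d, const char *m)
    {
        return apply_hint(d, m, false);
    }

    /* new: sets the hint and applies it to the interrupt */
    static int set_affinity_and_hint(struct fake_irq *d, const char *m)
    {
        return apply_hint(d, m, true);
    }

    /* legacy entry point, kept as a wrapper while callers are migrated */
    static int set_affinity_hint(struct fake_irq *d, const char *m)
    {
        return apply_hint(d, m, true);
    }

    int main(void)
    {
        struct fake_irq a = { 0 }, b = { 0 }, c = { 0 };

        update_affinity_hint(&a, "0-3");
        set_affinity_and_hint(&b, "4-7");
        set_affinity_hint(&c, "0");
        printf("a: hint=%s affinity=%s\n", a.hint, a.affinity ? a.affinity : "(unchanged)");
        printf("b: hint=%s affinity=%s\n", b.hint, b.affinity);
        printf("c: hint=%s affinity=%s\n", c.hint, c.affinity);
        return 0;
    }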
Signed-off-by: Thomas Gleixner Signed-off-by: Nitesh Narayan Lal Signed-off-by: Thomas Gleixner Reviewed-by: Ming Lei Link: https://lore.kernel.org/r/20210501021832.743094-1-jesse.brandeburg@intel.com Link: https://lore.kernel.org/r/20210903152430.244937-2-nitesh@redhat.com Stable-dep-of: 915470e1b44e ("i40e: fix IRQ freeing in i40e_vsi_request_irq_msix error path") Signed-off-by: Sasha Levin --- kernel/irq/manage.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c index b46fbfbb929f..ce0433446a8e 100644 --- a/kernel/irq/manage.c +++ b/kernel/irq/manage.c @@ -501,7 +501,8 @@ int irq_force_affinity(unsigned int irq, const struct cpumask *cpumask) } EXPORT_SYMBOL_GPL(irq_force_affinity); -int irq_set_affinity_hint(unsigned int irq, const struct cpumask *m) +int __irq_apply_affinity_hint(unsigned int irq, const struct cpumask *m, + bool setaffinity) { unsigned long flags; struct irq_desc *desc = irq_get_desc_lock(irq, &flags, IRQ_GET_DESC_CHECK_GLOBAL); @@ -510,12 +511,11 @@ int irq_set_affinity_hint(unsigned int irq, const struct cpumask *m) return -EINVAL; desc->affinity_hint = m; irq_put_desc_unlock(desc, flags); - /* set the initial affinity to prevent every interrupt being on CPU0 */ - if (m) + if (m && setaffinity) __irq_set_affinity(irq, m, false); return 0; } -EXPORT_SYMBOL_GPL(irq_set_affinity_hint); +EXPORT_SYMBOL_GPL(__irq_apply_affinity_hint); static void irq_affinity_notify(struct work_struct *work) { -- cgit v1.2.3 From 95b76ebeb0f14df284fae8e8bb52b4e3d51e6160 Mon Sep 17 00:00:00 2001 From: Jiapeng Chong Date: Fri, 22 Mar 2024 15:04:41 +0800 Subject: hrtimer: Remove unused function [ Upstream commit 82ccdf062a64f3c4ac575c16179ce68edbbbe8e4 ] The function is defined, but not called anywhere: kernel/time/hrtimer.c:1880:20: warning: unused function '__hrtimer_peek_ahead_timers'. Remove it. 
Reported-by: Abaci Robot Signed-off-by: Jiapeng Chong Signed-off-by: Thomas Gleixner Link: https://lore.kernel.org/r/20240322070441.29646-1-jiapeng.chong@linux.alibaba.com Closes: https://bugzilla.openanolis.cn/show_bug.cgi?id=8611 Stable-dep-of: e895f8e29119 ("hrtimers: Unconditionally update target CPU base after offline timer migration") Signed-off-by: Sasha Levin --- kernel/time/hrtimer.c | 20 +------------------- 1 file changed, 1 insertion(+), 19 deletions(-) (limited to 'kernel') diff --git a/kernel/time/hrtimer.c b/kernel/time/hrtimer.c index 2e4b63f3c6dd..a8fbf4b1ea19 100644 --- a/kernel/time/hrtimer.c +++ b/kernel/time/hrtimer.c @@ -1873,25 +1873,7 @@ retry: tick_program_event(expires_next, 1); pr_warn_once("hrtimer: interrupt took %llu ns\n", ktime_to_ns(delta)); } - -/* called with interrupts disabled */ -static inline void __hrtimer_peek_ahead_timers(void) -{ - struct tick_device *td; - - if (!hrtimer_hres_active()) - return; - - td = this_cpu_ptr(&tick_cpu_device); - if (td && td->evtdev) - hrtimer_interrupt(td->evtdev); -} - -#else /* CONFIG_HIGH_RES_TIMERS */ - -static inline void __hrtimer_peek_ahead_timers(void) { } - -#endif /* !CONFIG_HIGH_RES_TIMERS */ +#endif /* !CONFIG_HIGH_RES_TIMERS */ /* * Called from run_local_timers in hardirq context every jiffy -- cgit v1.2.3 From e90b685c5f2a8494e365809a870d78d6c0a6204e Mon Sep 17 00:00:00 2001 From: Jiapeng Chong Date: Thu, 18 Apr 2024 10:30:00 +0800 Subject: hrtimer: Rename __hrtimer_hres_active() to hrtimer_hres_active() [ Upstream commit b7c8e1f8a7b4352c1d0b4310686385e3cf6c104a ] The function hrtimer_hres_active() are defined in the hrtimer.c file, but not called elsewhere, so rename __hrtimer_hres_active() to hrtimer_hres_active() and remove the old hrtimer_hres_active() function. kernel/time/hrtimer.c:653:19: warning: unused function 'hrtimer_hres_active'. Fixes: 82ccdf062a64 ("hrtimer: Remove unused function") Reported-by: Abaci Robot Signed-off-by: Jiapeng Chong Signed-off-by: Thomas Gleixner Reviewed-by: Anna-Maria Behnsen Link: https://lore.kernel.org/r/20240418023000.130324-1-jiapeng.chong@linux.alibaba.com Closes: https://bugzilla.openanolis.cn/show_bug.cgi?id=8778 Stable-dep-of: e895f8e29119 ("hrtimers: Unconditionally update target CPU base after offline timer migration") Signed-off-by: Sasha Levin --- kernel/time/hrtimer.c | 21 ++++++++------------- 1 file changed, 8 insertions(+), 13 deletions(-) (limited to 'kernel') diff --git a/kernel/time/hrtimer.c b/kernel/time/hrtimer.c index a8fbf4b1ea19..74a71b3a064d 100644 --- a/kernel/time/hrtimer.c +++ b/kernel/time/hrtimer.c @@ -631,17 +631,12 @@ static inline ktime_t hrtimer_update_base(struct hrtimer_cpu_base *base) /* * Is the high resolution mode active ? */ -static inline int __hrtimer_hres_active(struct hrtimer_cpu_base *cpu_base) +static inline int hrtimer_hres_active(struct hrtimer_cpu_base *cpu_base) { return IS_ENABLED(CONFIG_HIGH_RES_TIMERS) ? cpu_base->hres_active : 0; } -static inline int hrtimer_hres_active(void) -{ - return __hrtimer_hres_active(this_cpu_ptr(&hrtimer_bases)); -} - static void __hrtimer_reprogram(struct hrtimer_cpu_base *cpu_base, struct hrtimer *next_timer, ktime_t expires_next) @@ -665,7 +660,7 @@ static void __hrtimer_reprogram(struct hrtimer_cpu_base *cpu_base, * set. So we'd effectively block all timers until the T2 event * fires. 
*/ - if (!__hrtimer_hres_active(cpu_base) || cpu_base->hang_detected) + if (!hrtimer_hres_active(cpu_base) || cpu_base->hang_detected) return; tick_program_event(expires_next, 1); @@ -776,12 +771,12 @@ static void retrigger_next_event(void *arg) * function call will take care of the reprogramming in case the * CPU was in a NOHZ idle sleep. */ - if (!__hrtimer_hres_active(base) && !tick_nohz_active) + if (!hrtimer_hres_active(base) && !tick_nohz_active) return; raw_spin_lock(&base->lock); hrtimer_update_base(base); - if (__hrtimer_hres_active(base)) + if (hrtimer_hres_active(base)) hrtimer_force_reprogram(base, 0); else hrtimer_update_next_event(base); @@ -938,7 +933,7 @@ void clock_was_set(unsigned int bases) cpumask_var_t mask; int cpu; - if (!__hrtimer_hres_active(cpu_base) && !tick_nohz_active) + if (!hrtimer_hres_active(cpu_base) && !tick_nohz_active) goto out_timerfd; if (!zalloc_cpumask_var(&mask, GFP_KERNEL)) { @@ -1489,7 +1484,7 @@ u64 hrtimer_get_next_event(void) raw_spin_lock_irqsave(&cpu_base->lock, flags); - if (!__hrtimer_hres_active(cpu_base)) + if (!hrtimer_hres_active(cpu_base)) expires = __hrtimer_get_next_event(cpu_base, HRTIMER_ACTIVE_ALL); raw_spin_unlock_irqrestore(&cpu_base->lock, flags); @@ -1512,7 +1507,7 @@ u64 hrtimer_next_event_without(const struct hrtimer *exclude) raw_spin_lock_irqsave(&cpu_base->lock, flags); - if (__hrtimer_hres_active(cpu_base)) { + if (hrtimer_hres_active(cpu_base)) { unsigned int active; if (!cpu_base->softirq_activated) { @@ -1884,7 +1879,7 @@ void hrtimer_run_queues(void) unsigned long flags; ktime_t now; - if (__hrtimer_hres_active(cpu_base)) + if (hrtimer_hres_active(cpu_base)) return; /* -- cgit v1.2.3 From 24a65b46cd663ca8063c457c470bfaed2dc454d4 Mon Sep 17 00:00:00 2001 From: Xiongfeng Wang Date: Tue, 5 Aug 2025 16:10:25 +0800 Subject: hrtimers: Unconditionally update target CPU base after offline timer migration [ Upstream commit e895f8e29119c8c966ea794af9e9100b10becb88 ] When testing softirq based hrtimers on an ARM32 board, with high resolution mode and NOHZ inactive, softirq based hrtimers fail to expire after being moved away from an offline CPU: CPU0 CPU1 hrtimer_start(..., HRTIMER_MODE_SOFT); cpu_down(CPU1) ... hrtimers_cpu_dying() // Migrate timers to CPU0 smp_call_function_single(CPU0, returgger_next_event); retrigger_next_event() if (!highres && !nohz) return; As retrigger_next_event() is a NOOP when both high resolution timers and NOHZ are inactive CPU0's hrtimer_cpu_base::softirq_expires_next is not updated and the migrated softirq timers never expire unless there is a softirq based hrtimer queued on CPU0 later. Fix this by removing the hrtimer_hres_active() and tick_nohz_active() check in retrigger_next_event(), which enforces a full update of the CPU base. As this is not a fast path the extra cost does not matter. 
[ tglx: Massaged change log ] Fixes: 5c0930ccaad5 ("hrtimers: Push pending hrtimers away from outgoing CPU earlier") Co-developed-by: Frederic Weisbecker Signed-off-by: Frederic Weisbecker Signed-off-by: Xiongfeng Wang Signed-off-by: Thomas Gleixner Link: https://lore.kernel.org/all/20250805081025.54235-1-wangxiongfeng2@huawei.com Signed-off-by: Sasha Levin --- kernel/time/hrtimer.c | 11 +++-------- 1 file changed, 3 insertions(+), 8 deletions(-) (limited to 'kernel') diff --git a/kernel/time/hrtimer.c b/kernel/time/hrtimer.c index 74a71b3a064d..7e2ed34e9803 100644 --- a/kernel/time/hrtimer.c +++ b/kernel/time/hrtimer.c @@ -770,10 +770,10 @@ static void retrigger_next_event(void *arg) * of the next expiring timer is enough. The return from the SMP * function call will take care of the reprogramming in case the * CPU was in a NOHZ idle sleep. + * + * In periodic low resolution mode, the next softirq expiration + * must also be updated. */ - if (!hrtimer_hres_active(base) && !tick_nohz_active) - return; - raw_spin_lock(&base->lock); hrtimer_update_base(base); if (hrtimer_hres_active(base)) @@ -2229,11 +2229,6 @@ int hrtimers_cpu_dying(unsigned int dying_cpu) &new_base->clock_base[i]); } - /* - * The migration might have changed the first expiring softirq - * timer on this CPU. Update it. - */ - __hrtimer_get_next_event(new_base, HRTIMER_ACTIVE_SOFT); /* Tell the other CPU to retrigger the next event */ smp_call_function_single(ncpu, retrigger_next_event, NULL, 0); -- cgit v1.2.3 From f2795d1b92506e3adf52a298f7181032a1525e04 Mon Sep 17 00:00:00 2001 From: Chen Ridong Date: Tue, 19 Aug 2025 01:07:24 +0000 Subject: cgroup: split cgroup_destroy_wq into 3 workqueues MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit [ Upstream commit 79f919a89c9d06816dbdbbd168fa41d27411a7f9 ] A hung task can occur during [1] LTP cgroup testing when repeatedly mounting/unmounting perf_event and net_prio controllers with systemd.unified_cgroup_hierarchy=1. The hang manifests in cgroup_lock_and_drain_offline() during root destruction. Related case: cgroup_fj_function_perf_event cgroup_fj_function.sh perf_event cgroup_fj_function_net_prio cgroup_fj_function.sh net_prio Call Trace: cgroup_lock_and_drain_offline+0x14c/0x1e8 cgroup_destroy_root+0x3c/0x2c0 css_free_rwork_fn+0x248/0x338 process_one_work+0x16c/0x3b8 worker_thread+0x22c/0x3b0 kthread+0xec/0x100 ret_from_fork+0x10/0x20 Root Cause: CPU0 CPU1 mount perf_event umount net_prio cgroup1_get_tree cgroup_kill_sb rebind_subsystems // root destruction enqueues // cgroup_destroy_wq // kill all perf_event css // one perf_event css A is dying // css A offline enqueues cgroup_destroy_wq // root destruction will be executed first css_free_rwork_fn cgroup_destroy_root cgroup_lock_and_drain_offline // some perf descendants are dying // cgroup_destroy_wq max_active = 1 // waiting for css A to die Problem scenario: 1. CPU0 mounts perf_event (rebind_subsystems) 2. CPU1 unmounts net_prio (cgroup_kill_sb), queuing root destruction work 3. A dying perf_event CSS gets queued for offline after root destruction 4. 
Root destruction waits for offline completion, but offline work is blocked behind root destruction in cgroup_destroy_wq (max_active=1) Solution: Split cgroup_destroy_wq into three dedicated workqueues: cgroup_offline_wq – Handles CSS offline operations cgroup_release_wq – Manages resource release cgroup_free_wq – Performs final memory deallocation This separation eliminates blocking in the CSS free path while waiting for offline operations to complete. [1] https://github.com/linux-test-project/ltp/blob/master/runtest/controllers Fixes: 334c3679ec4b ("cgroup: reimplement rebind_subsystems() using cgroup_apply_control() and friends") Reported-by: Gao Yingjie Signed-off-by: Chen Ridong Suggested-by: Teju Heo Signed-off-by: Tejun Heo Signed-off-by: Sasha Levin --- kernel/cgroup/cgroup.c | 43 ++++++++++++++++++++++++++++++++++++------- 1 file changed, 36 insertions(+), 7 deletions(-) (limited to 'kernel') diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c index 1a3b2e1436db..e5fe4ffff7cd 100644 --- a/kernel/cgroup/cgroup.c +++ b/kernel/cgroup/cgroup.c @@ -122,8 +122,31 @@ DEFINE_PERCPU_RWSEM(cgroup_threadgroup_rwsem); * of concurrent destructions. Use a separate workqueue so that cgroup * destruction work items don't end up filling up max_active of system_wq * which may lead to deadlock. + * + * A cgroup destruction should enqueue work sequentially to: + * cgroup_offline_wq: use for css offline work + * cgroup_release_wq: use for css release work + * cgroup_free_wq: use for free work + * + * Rationale for using separate workqueues: + * The cgroup root free work may depend on completion of other css offline + * operations. If all tasks were enqueued to a single workqueue, this could + * create a deadlock scenario where: + * - Free work waits for other css offline work to complete. + * - But other css offline work is queued after free work in the same queue. + * + * Example deadlock scenario with single workqueue (cgroup_destroy_wq): + * 1. umount net_prio + * 2. net_prio root destruction enqueues work to cgroup_destroy_wq (CPUx) + * 3. perf_event CSS A offline enqueues work to same cgroup_destroy_wq (CPUx) + * 4. net_prio cgroup_destroy_root->cgroup_lock_and_drain_offline. + * 5. net_prio root destruction blocks waiting for perf_event CSS A offline, + * which can never complete as it's behind in the same queue and + * workqueue's max_active is 1. 
*/ -static struct workqueue_struct *cgroup_destroy_wq; +static struct workqueue_struct *cgroup_offline_wq; +static struct workqueue_struct *cgroup_release_wq; +static struct workqueue_struct *cgroup_free_wq; /* generate an array of cgroup subsystem pointers */ #define SUBSYS(_x) [_x ## _cgrp_id] = &_x ## _cgrp_subsys, @@ -5263,7 +5286,7 @@ static void css_release_work_fn(struct work_struct *work) mutex_unlock(&cgroup_mutex); INIT_RCU_WORK(&css->destroy_rwork, css_free_rwork_fn); - queue_rcu_work(cgroup_destroy_wq, &css->destroy_rwork); + queue_rcu_work(cgroup_free_wq, &css->destroy_rwork); } static void css_release(struct percpu_ref *ref) @@ -5272,7 +5295,7 @@ static void css_release(struct percpu_ref *ref) container_of(ref, struct cgroup_subsys_state, refcnt); INIT_WORK(&css->destroy_work, css_release_work_fn); - queue_work(cgroup_destroy_wq, &css->destroy_work); + queue_work(cgroup_release_wq, &css->destroy_work); } static void init_and_link_css(struct cgroup_subsys_state *css, @@ -5394,7 +5417,7 @@ err_list_del: err_free_css: list_del_rcu(&css->rstat_css_node); INIT_RCU_WORK(&css->destroy_rwork, css_free_rwork_fn); - queue_rcu_work(cgroup_destroy_wq, &css->destroy_rwork); + queue_rcu_work(cgroup_free_wq, &css->destroy_rwork); return ERR_PTR(err); } @@ -5631,7 +5654,7 @@ static void css_killed_ref_fn(struct percpu_ref *ref) if (atomic_dec_and_test(&css->online_cnt)) { INIT_WORK(&css->destroy_work, css_killed_work_fn); - queue_work(cgroup_destroy_wq, &css->destroy_work); + queue_work(cgroup_offline_wq, &css->destroy_work); } } @@ -6008,8 +6031,14 @@ static int __init cgroup_wq_init(void) * We would prefer to do this in cgroup_init() above, but that * is called before init_workqueues(): so leave this until after. */ - cgroup_destroy_wq = alloc_workqueue("cgroup_destroy", 0, 1); - BUG_ON(!cgroup_destroy_wq); + cgroup_offline_wq = alloc_workqueue("cgroup_offline", 0, 1); + BUG_ON(!cgroup_offline_wq); + + cgroup_release_wq = alloc_workqueue("cgroup_release", 0, 1); + BUG_ON(!cgroup_release_wq); + + cgroup_free_wq = alloc_workqueue("cgroup_free", 0, 1); + BUG_ON(!cgroup_free_wq); return 0; } core_initcall(cgroup_wq_init); -- cgit v1.2.3 From d51c6b51981fa760aa28bbfffe5b5a4ea1b59d7b Mon Sep 17 00:00:00 2001 From: Leon Hwang Date: Wed, 10 Sep 2025 20:57:39 +0800 Subject: bpf: Reject bpf_timer for PREEMPT_RT [ Upstream commit e25ddfb388c8b7e5f20e3bf38d627fb485003781 ] When enable CONFIG_PREEMPT_RT, the kernel will warn when run timer selftests by './test_progs -t timer': BUG: sleeping function called from invalid context at kernel/locking/spinlock_rt.c:48 In order to avoid such warning, reject bpf_timer in verifier when PREEMPT_RT is enabled. Signed-off-by: Leon Hwang Link: https://lore.kernel.org/r/20250910125740.52172-2-leon.hwang@linux.dev Signed-off-by: Alexei Starovoitov Signed-off-by: Sasha Levin --- kernel/bpf/verifier.c | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'kernel') diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 89b4fa815a9b..4b7c9a60a735 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -5071,6 +5071,10 @@ static int process_timer_func(struct bpf_verifier_env *env, int regno, verbose(env, "verifier bug. 
Two map pointers in a timer helper\n"); return -EFAULT; } + if (IS_ENABLED(CONFIG_PREEMPT_RT)) { + verbose(env, "bpf_timer cannot be used for PREEMPT_RT.\n"); + return -EOPNOTSUPP; + } meta->map_uid = reg->map_uid; meta->map_ptr = map; return 0; -- cgit v1.2.3 From 0d41604d2d53c1abe27fefb54b37a8f6642a4d74 Mon Sep 17 00:00:00 2001 From: "Masami Hiramatsu (Google)" Date: Fri, 19 Sep 2025 10:15:56 +0900 Subject: tracing: dynevent: Add a missing lockdown check on dynevent commit 456c32e3c4316654f95f9d49c12cbecfb77d5660 upstream. Since dynamic_events interface on tracefs is compatible with kprobe_events and uprobe_events, it should also check the lockdown status and reject if it is set. Link: https://lore.kernel.org/all/175824455687.45175.3734166065458520748.stgit@devnote2/ Fixes: 17911ff38aa5 ("tracing: Add locked_down checks to the open calls of files created for tracefs") Signed-off-by: Masami Hiramatsu (Google) Cc: stable@vger.kernel.org Signed-off-by: Greg Kroah-Hartman --- kernel/trace/trace_dynevent.c | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'kernel') diff --git a/kernel/trace/trace_dynevent.c b/kernel/trace/trace_dynevent.c index 6d0e9f869ad6..3d8ffa81a1fa 100644 --- a/kernel/trace/trace_dynevent.c +++ b/kernel/trace/trace_dynevent.c @@ -239,6 +239,10 @@ static int dyn_event_open(struct inode *inode, struct file *file) { int ret; + ret = security_locked_down(LOCKDOWN_TRACEFS); + if (ret) + return ret; + ret = tracing_check_open_get_tr(NULL); if (ret) return ret; -- cgit v1.2.3 From 0b515a2839980c94294fe5c6767a6953d465c779 Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" Date: Tue, 9 Sep 2025 13:44:14 +0200 Subject: smp: Fix up and expand the smp_call_function_many() kerneldoc [ Upstream commit ccf09357ffef2ab472369ab9cdf470c9bc9b821a ] The smp_call_function_many() kerneldoc comment got out of sync with the function definition (bool parameter "wait" is incorrectly described as a bitmask in it), so fix it up by copying the "wait" description from the smp_call_function() kerneldoc and add information regarding the handling of the local CPU to it. Fixes: 49b3bd213a9f ("smp: Fix all kernel-doc warnings") Signed-off-by: Rafael J. Wysocki Signed-off-by: Thomas Gleixner Signed-off-by: Sasha Levin --- kernel/smp.c | 11 +++++------ 1 file changed, 5 insertions(+), 6 deletions(-) (limited to 'kernel') diff --git a/kernel/smp.c b/kernel/smp.c index b60525b34ab0..387df30ca560 100644 --- a/kernel/smp.c +++ b/kernel/smp.c @@ -976,16 +976,15 @@ static void smp_call_function_many_cond(const struct cpumask *mask, * @mask: The set of cpus to run on (only runs on online subset). * @func: The function to run. This must be fast and non-blocking. * @info: An arbitrary pointer to pass to the function. - * @wait: Bitmask that controls the operation. If %SCF_WAIT is set, wait - * (atomically) until function has completed on other CPUs. If - * %SCF_RUN_LOCAL is set, the function will also be run locally - * if the local CPU is set in the @cpumask. - * - * If @wait is true, then returns once @func has returned. + * @wait: If true, wait (atomically) until function has completed + * on other CPUs. * * You must not call this function with disabled interrupts or from a * hardware interrupt handler or from a bottom half handler. Preemption * must be disabled when calling this function. + * + * @func is not called on the local CPU even if @mask contains it. Consider + * using on_each_cpu_cond_mask() instead if this is not desirable. 
*/ void smp_call_function_many(const struct cpumask *mask, smp_call_func_t func, void *info, bool wait) -- cgit v1.2.3 From e28616ca3d67e745ecb2f10eba4a626e1fc9a203 Mon Sep 17 00:00:00 2001 From: KaFai Wan Date: Wed, 8 Oct 2025 18:26:26 +0800 Subject: bpf: Avoid RCU context warning when unpinning htab with internal structs [ Upstream commit 4f375ade6aa9f37fd72d7a78682f639772089eed ] When unpinning a BPF hash table (htab or htab_lru) that contains internal structures (timer, workqueue, or task_work) in its values, a BUG warning is triggered: BUG: sleeping function called from invalid context at kernel/bpf/hashtab.c:244 in_atomic(): 1, irqs_disabled(): 0, non_block: 0, pid: 14, name: ksoftirqd/0 ... The issue arises from the interaction between BPF object unpinning and RCU callback mechanisms: 1. BPF object unpinning uses ->free_inode() which schedules cleanup via call_rcu(), deferring the actual freeing to an RCU callback that executes within the RCU_SOFTIRQ context. 2. During cleanup of hash tables containing internal structures, htab_map_free_internal_structs() is invoked, which includes cond_resched() or cond_resched_rcu() calls to yield the CPU during potentially long operations. However, cond_resched() or cond_resched_rcu() cannot be safely called from atomic RCU softirq context, leading to the BUG warning when attempting to reschedule. Fix this by changing from ->free_inode() to ->destroy_inode() and rename bpf_free_inode() to bpf_destroy_inode() for BPF objects (prog, map, link). This allows direct inode freeing without RCU callback scheduling, avoiding the invalid context warning. Reported-by: Le Chen Closes: https://lore.kernel.org/all/1444123482.1827743.1750996347470.JavaMail.zimbra@sjtu.edu.cn/ Fixes: 68134668c17f ("bpf: Add map side support for bpf timers.") Suggested-by: Alexei Starovoitov Signed-off-by: KaFai Wan Acked-by: Yonghong Song Link: https://lore.kernel.org/r/20251008102628.808045-2-kafai.wan@linux.dev Signed-off-by: Alexei Starovoitov Signed-off-by: Sasha Levin --- kernel/bpf/inode.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/inode.c b/kernel/bpf/inode.c index 5a8d9f7467bf..849df8268af5 100644 --- a/kernel/bpf/inode.c +++ b/kernel/bpf/inode.c @@ -610,7 +610,7 @@ static int bpf_show_options(struct seq_file *m, struct dentry *root) return 0; } -static void bpf_free_inode(struct inode *inode) +static void bpf_destroy_inode(struct inode *inode) { enum bpf_type type; @@ -625,7 +625,7 @@ static const struct super_operations bpf_super_ops = { .statfs = simple_statfs, .drop_inode = generic_delete_inode, .show_options = bpf_show_options, - .free_inode = bpf_free_inode, + .destroy_inode = bpf_destroy_inode, }; enum { -- cgit v1.2.3 From 83b594504d64f71ded63371923701ef4c8caaa26 Mon Sep 17 00:00:00 2001 From: Simon Schuster Date: Mon, 1 Sep 2025 15:09:50 +0200 Subject: copy_sighand: Handle architectures where sizeof(unsigned long) < sizeof(u64) commit 04ff48239f46e8b493571e260bd0e6c3a6400371 upstream. With the introduction of clone3 in commit 7f192e3cd316 ("fork: add clone3") the effective bit width of clone_flags on all architectures was increased from 32-bit to 64-bit. However, the signature of the copy_* helper functions (e.g., copy_sighand) used by copy_process was not adapted. As such, they truncate the flags on any 32-bit architectures that supports clone3 (arc, arm, csky, m68k, microblaze, mips32, openrisc, parisc32, powerpc32, riscv32, x86-32 and xtensa). 
For copy_sighand with CLONE_CLEAR_SIGHAND being an actual u64 constant, this triggers an observable bug in kernel selftest clone3_clear_sighand: if (clone_flags & CLONE_CLEAR_SIGHAND) in function copy_sighand within fork.c will always fail given: unsigned long /* == uint32_t */ clone_flags #define CLONE_CLEAR_SIGHAND 0x100000000ULL This commit fixes the bug by always passing clone_flags to copy_sighand via their declared u64 type, invariant of architecture-dependent integer sizes. Fixes: b612e5df4587 ("clone3: add CLONE_CLEAR_SIGHAND") Cc: stable@vger.kernel.org # linux-5.5+ Signed-off-by: Simon Schuster Link: https://lore.kernel.org/20250901-nios2-implement-clone3-v2-1-53fcf5577d57@siemens-energy.com Acked-by: David Hildenbrand Reviewed-by: Lorenzo Stoakes Reviewed-by: Arnd Bergmann Signed-off-by: Christian Brauner Signed-off-by: Greg Kroah-Hartman --- kernel/fork.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/fork.c b/kernel/fork.c index 2fd9c431bf45..2c99d39e2bc0 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -1595,7 +1595,7 @@ static int copy_io(unsigned long clone_flags, struct task_struct *tsk) return 0; } -static int copy_sighand(unsigned long clone_flags, struct task_struct *tsk) +static int copy_sighand(u64 clone_flags, struct task_struct *tsk) { struct sighand_struct *sig; -- cgit v1.2.3 From 95dd33361061f808d1f68616d69ada639e737cfa Mon Sep 17 00:00:00 2001 From: Yuan Chen Date: Tue, 14 Oct 2025 07:37:14 -0400 Subject: tracing: Fix race condition in kprobe initialization causing NULL pointer dereference MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit [ Upstream commit 9cf9aa7b0acfde7545c1a1d912576e9bab28dc6f ] There is a critical race condition in kprobe initialization that can lead to NULL pointer dereference and kernel crash. [1135630.084782] Unable to handle kernel paging request at virtual address 0000710a04630000 ... 
[1135630.260314] pstate: 404003c9 (nZcv DAIF +PAN -UAO) [1135630.269239] pc : kprobe_perf_func+0x30/0x260 [1135630.277643] lr : kprobe_dispatcher+0x44/0x60 [1135630.286041] sp : ffffaeff4977fa40 [1135630.293441] x29: ffffaeff4977fa40 x28: ffffaf015340e400 [1135630.302837] x27: 0000000000000000 x26: 0000000000000000 [1135630.312257] x25: ffffaf029ed108a8 x24: ffffaf015340e528 [1135630.321705] x23: ffffaeff4977fc50 x22: ffffaeff4977fc50 [1135630.331154] x21: 0000000000000000 x20: ffffaeff4977fc50 [1135630.340586] x19: ffffaf015340e400 x18: 0000000000000000 [1135630.349985] x17: 0000000000000000 x16: 0000000000000000 [1135630.359285] x15: 0000000000000000 x14: 0000000000000000 [1135630.368445] x13: 0000000000000000 x12: 0000000000000000 [1135630.377473] x11: 0000000000000000 x10: 0000000000000000 [1135630.386411] x9 : 0000000000000000 x8 : 0000000000000000 [1135630.395252] x7 : 0000000000000000 x6 : 0000000000000000 [1135630.403963] x5 : 0000000000000000 x4 : 0000000000000000 [1135630.412545] x3 : 0000710a04630000 x2 : 0000000000000006 [1135630.421021] x1 : ffffaeff4977fc50 x0 : 0000710a04630000 [1135630.429410] Call trace: [1135630.434828] kprobe_perf_func+0x30/0x260 [1135630.441661] kprobe_dispatcher+0x44/0x60 [1135630.448396] aggr_pre_handler+0x70/0xc8 [1135630.454959] kprobe_breakpoint_handler+0x140/0x1e0 [1135630.462435] brk_handler+0xbc/0xd8 [1135630.468437] do_debug_exception+0x84/0x138 [1135630.475074] el1_dbg+0x18/0x8c [1135630.480582] security_file_permission+0x0/0xd0 [1135630.487426] vfs_write+0x70/0x1c0 [1135630.493059] ksys_write+0x5c/0xc8 [1135630.498638] __arm64_sys_write+0x24/0x30 [1135630.504821] el0_svc_common+0x78/0x130 [1135630.510838] el0_svc_handler+0x38/0x78 [1135630.516834] el0_svc+0x8/0x1b0 kernel/trace/trace_kprobe.c: 1308 0xffff3df8995039ec : ldr x21, [x24,#120] include/linux/compiler.h: 294 0xffff3df8995039f0 : ldr x1, [x21,x0] kernel/trace/trace_kprobe.c 1308: head = this_cpu_ptr(call->perf_events); 1309: if (hlist_empty(head)) 1310: return 0; crash> struct trace_event_call -o struct trace_event_call { ... [120] struct hlist_head *perf_events; //(call->perf_event) ... } crash> struct trace_event_call ffffaf015340e528 struct trace_event_call { ... perf_events = 0xffff0ad5fa89f088, //this value is correct, but x21 = 0 ... } Race Condition Analysis: The race occurs between kprobe activation and perf_events initialization: CPU0 CPU1 ==== ==== perf_kprobe_init perf_trace_event_init tp_event->perf_events = list;(1) tp_event->class->reg (2)← KPROBE ACTIVE Debug exception triggers ... kprobe_dispatcher kprobe_perf_func (tk->tp.flags & TP_FLAG_PROFILE) head = this_cpu_ptr(call->perf_events)(3) (perf_events is still NULL) Problem: 1. CPU0 executes (1) assigning tp_event->perf_events = list 2. CPU0 executes (2) enabling kprobe functionality via class->reg() 3. CPU1 triggers and reaches kprobe_dispatcher 4. CPU1 checks TP_FLAG_PROFILE - condition passes (step 2 completed) 5. CPU1 calls kprobe_perf_func() and crashes at (3) because call->perf_events is still NULL CPU1 sees that kprobe functionality is enabled but does not see that perf_events has been assigned. Add pairing read and write memory barriers to guarantee that if CPU1 sees that kprobe functionality is enabled, it must also see that perf_events has been assigned. 
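The required ordering can be reproduced in a minimal userspace analogue (C11 atomics with hypothetical names; the patch below uses smp_store_release()/smp_load_acquire() for the same effect):

    #include <pthread.h>
    #include <stdatomic.h>
    #include <stdio.h>

    #define TP_FLAG_PROFILE 0x2

    static int *perf_events;     /* plain data, published through the flag */
    static atomic_uint flags;    /* stands in for tp->event->flags */

    static void *enable(void *arg)
    {
        static int list[4];

        (void)arg;
        perf_events = list;                            /* step 1: set up data  */
        atomic_store_explicit(&flags, TP_FLAG_PROFILE,
                              memory_order_release);   /* step 2: publish flag */
        return NULL;
    }

    static void *probe_hit(void *arg)
    {
        unsigned int f = atomic_load_explicit(&flags, memory_order_acquire);

        (void)arg;
        /* the acquire pairs with the release above: if the flag is seen,
         * the earlier write to perf_events is guaranteed to be seen too */
        if (f & TP_FLAG_PROFILE)
            printf("head=%p\n", (void *)perf_events);  /* never NULL here */
        return NULL;
    }

    int main(void)
    {
        pthread_t a, b;

        pthread_create(&a, NULL, enable, NULL);
        pthread_create(&b, NULL, probe_hit, NULL);
        pthread_join(a, NULL);
        pthread_join(b, NULL);
        return 0;
    }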
Link: https://lore.kernel.org/all/20251001022025.44626-1-chenyuan_fl@163.com/ Fixes: 50d780560785 ("tracing/kprobes: Add probe handler dispatcher to support perf and ftrace concurrent use") Cc: stable@vger.kernel.org Signed-off-by: Yuan Chen Signed-off-by: Masami Hiramatsu (Google) [ Drop fprobe changes + context ] Signed-off-by: Sasha Levin Signed-off-by: Greg Kroah-Hartman --- kernel/trace/trace_kprobe.c | 11 +++++++---- kernel/trace/trace_probe.h | 9 +++++++-- kernel/trace/trace_uprobe.c | 12 ++++++++---- 3 files changed, 22 insertions(+), 10 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace_kprobe.c b/kernel/trace/trace_kprobe.c index e062f4efec8d..03d4ac41d903 100644 --- a/kernel/trace/trace_kprobe.c +++ b/kernel/trace/trace_kprobe.c @@ -1722,14 +1722,15 @@ static int kprobe_register(struct trace_event_call *event, static int kprobe_dispatcher(struct kprobe *kp, struct pt_regs *regs) { struct trace_kprobe *tk = container_of(kp, struct trace_kprobe, rp.kp); + unsigned int flags = trace_probe_load_flag(&tk->tp); int ret = 0; raw_cpu_inc(*tk->nhit); - if (trace_probe_test_flag(&tk->tp, TP_FLAG_TRACE)) + if (flags & TP_FLAG_TRACE) kprobe_trace_func(tk, regs); #ifdef CONFIG_PERF_EVENTS - if (trace_probe_test_flag(&tk->tp, TP_FLAG_PROFILE)) + if (flags & TP_FLAG_PROFILE) ret = kprobe_perf_func(tk, regs); #endif return ret; @@ -1741,6 +1742,7 @@ kretprobe_dispatcher(struct kretprobe_instance *ri, struct pt_regs *regs) { struct kretprobe *rp = get_kretprobe(ri); struct trace_kprobe *tk; + unsigned int flags; /* * There is a small chance that get_kretprobe(ri) returns NULL when @@ -1753,10 +1755,11 @@ kretprobe_dispatcher(struct kretprobe_instance *ri, struct pt_regs *regs) tk = container_of(rp, struct trace_kprobe, rp); raw_cpu_inc(*tk->nhit); - if (trace_probe_test_flag(&tk->tp, TP_FLAG_TRACE)) + flags = trace_probe_load_flag(&tk->tp); + if (flags & TP_FLAG_TRACE) kretprobe_trace_func(tk, ri, regs); #ifdef CONFIG_PERF_EVENTS - if (trace_probe_test_flag(&tk->tp, TP_FLAG_PROFILE)) + if (flags & TP_FLAG_PROFILE) kretprobe_perf_func(tk, ri, regs); #endif return 0; /* We don't tweak kernel, so just return 0 */ diff --git a/kernel/trace/trace_probe.h b/kernel/trace/trace_probe.h index 82e1df8aefcb..b08aa3946868 100644 --- a/kernel/trace/trace_probe.h +++ b/kernel/trace/trace_probe.h @@ -258,16 +258,21 @@ struct event_file_link { struct list_head list; }; +static inline unsigned int trace_probe_load_flag(struct trace_probe *tp) +{ + return smp_load_acquire(&tp->event->flags); +} + static inline bool trace_probe_test_flag(struct trace_probe *tp, unsigned int flag) { - return !!(tp->event->flags & flag); + return !!(trace_probe_load_flag(tp) & flag); } static inline void trace_probe_set_flag(struct trace_probe *tp, unsigned int flag) { - tp->event->flags |= flag; + smp_store_release(&tp->event->flags, tp->event->flags | flag); } static inline void trace_probe_clear_flag(struct trace_probe *tp, diff --git a/kernel/trace/trace_uprobe.c b/kernel/trace/trace_uprobe.c index 322d56661d04..707c5373476a 100644 --- a/kernel/trace/trace_uprobe.c +++ b/kernel/trace/trace_uprobe.c @@ -1485,6 +1485,7 @@ static int uprobe_dispatcher(struct uprobe_consumer *con, struct pt_regs *regs) struct uprobe_dispatch_data udd; struct uprobe_cpu_buffer *ucb; int dsize, esize; + unsigned int flags; int ret = 0; @@ -1505,11 +1506,12 @@ static int uprobe_dispatcher(struct uprobe_consumer *con, struct pt_regs *regs) ucb = uprobe_buffer_get(); store_trace_args(ucb->buf, &tu->tp, regs, esize, dsize); - if 
(trace_probe_test_flag(&tu->tp, TP_FLAG_TRACE)) + flags = trace_probe_load_flag(&tu->tp); + if (flags & TP_FLAG_TRACE) ret |= uprobe_trace_func(tu, regs, ucb, dsize); #ifdef CONFIG_PERF_EVENTS - if (trace_probe_test_flag(&tu->tp, TP_FLAG_PROFILE)) + if (flags & TP_FLAG_PROFILE) ret |= uprobe_perf_func(tu, regs, ucb, dsize); #endif uprobe_buffer_put(ucb); @@ -1523,6 +1525,7 @@ static int uretprobe_dispatcher(struct uprobe_consumer *con, struct uprobe_dispatch_data udd; struct uprobe_cpu_buffer *ucb; int dsize, esize; + unsigned int flags; tu = container_of(con, struct trace_uprobe, consumer); @@ -1540,11 +1543,12 @@ static int uretprobe_dispatcher(struct uprobe_consumer *con, ucb = uprobe_buffer_get(); store_trace_args(ucb->buf, &tu->tp, regs, esize, dsize); - if (trace_probe_test_flag(&tu->tp, TP_FLAG_TRACE)) + flags = trace_probe_load_flag(&tu->tp); + if (flags & TP_FLAG_TRACE) uretprobe_trace_func(tu, func, regs, ucb, dsize); #ifdef CONFIG_PERF_EVENTS - if (trace_probe_test_flag(&tu->tp, TP_FLAG_PROFILE)) + if (flags & TP_FLAG_PROFILE) uretprobe_perf_func(tu, func, regs, ucb, dsize); #endif uprobe_buffer_put(ucb); -- cgit v1.2.3 From e035ca130ff7f5655f7c63caaeacaf0828f85cce Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Wed, 8 Oct 2025 15:29:33 +0000 Subject: minmax: make generic MIN() and MAX() macros available everywhere [ Upstream commit 1a251f52cfdc417c84411a056bc142cbd77baef4 ] This just standardizes the use of MIN() and MAX() macros, with the very traditional semantics. The goal is to use these for C constant expressions and for top-level / static initializers, and so be able to simplify the min()/max() macros. These macro names were used by various kernel code - they are very traditional, after all - and all such users have been fixed up, with a few different approaches: - trivial duplicated macro definitions have been removed Note that 'trivial' here means that it's obviously kernel code that already included all the major kernel headers, and thus gets the new generic MIN/MAX macros automatically. - non-trivial duplicated macro definitions are guarded with #ifndef This is the "yes, they define their own versions, but no, the include situation is not entirely obvious, and maybe they don't get the generic version automatically" case. - strange use case #1 A couple of drivers decided that the way they want to describe their versioning is with #define MAJ 1 #define MIN 2 #define DRV_VERSION __stringify(MAJ) "." __stringify(MIN) which adds zero value and I just did my Alexander the Great impersonation, and rewrote that pointless Gordian knot as #define DRV_VERSION "1.2" instead. - strange use case #2 A couple of drivers thought that it's a good idea to have a random 'MIN' or 'MAX' define for a value or index into a table, rather than the traditional macro that takes arguments. These values were re-written as C enum's instead. The new function-line macros only expand when followed by an open parenthesis, and thus don't clash with enum use. Happily, there weren't really all that many of these cases, and a lot of users already had the pattern of using '#ifndef' guarding (or in one case just using '#undef MIN') before defining their own private version that does the same thing. I left such cases alone. 
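For reference, the traditional semantics being standardized look like the following (a standalone illustration with local definitions; kernel code now picks the generic versions up from the common headers):

    #include <stdio.h>

    #define MIN(a, b) ((a) < (b) ? (a) : (b))
    #define MAX(a, b) ((a) > (b) ? (a) : (b))

    /* the point of the generic macros: usable in constant expressions
     * and in top-level / static initializers */
    static char buf[MAX(16, 64)];
    enum { LO = MIN(8, 12) };

    int main(void)
    {
        printf("sizeof(buf)=%zu LO=%d\n", sizeof(buf), LO);
        return 0;
    }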
Cc: David Laight Cc: Lorenzo Stoakes Signed-off-by: Linus Torvalds Signed-off-by: Eliav Farber Signed-off-by: Greg Kroah-Hartman --- kernel/trace/preemptirq_delay_test.c | 2 -- 1 file changed, 2 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/preemptirq_delay_test.c b/kernel/trace/preemptirq_delay_test.c index 8af92dbe98f0..acb0c971a408 100644 --- a/kernel/trace/preemptirq_delay_test.c +++ b/kernel/trace/preemptirq_delay_test.c @@ -34,8 +34,6 @@ MODULE_PARM_DESC(cpu_affinity, "Cpu num test is running on"); static struct completion done; -#define MIN(x, y) ((x) < (y) ? (x) : (y)) - static void busy_wait(ulong time) { u64 start, end; -- cgit v1.2.3 From c3b654021931dc806ba086c549e8756c3f204a67 Mon Sep 17 00:00:00 2001 From: gaoxiang17 Date: Sat, 2 Aug 2025 10:21:23 +0800 Subject: pid: Add a judgment for ns null in pid_nr_ns [ Upstream commit 006568ab4c5ca2309ceb36fa553e390b4aa9c0c7 ] __task_pid_nr_ns ns = task_active_pid_ns(current); pid_nr_ns(rcu_dereference(*task_pid_ptr(task, type)), ns); if (pid && ns->level <= pid->level) { Sometimes null is returned for task_active_pid_ns. Then it will trigger kernel panic in pid_nr_ns. For example: Unable to handle kernel NULL pointer dereference at virtual address 0000000000000058 Mem abort info: ESR = 0x0000000096000007 EC = 0x25: DABT (current EL), IL = 32 bits SET = 0, FnV = 0 EA = 0, S1PTW = 0 FSC = 0x07: level 3 translation fault Data abort info: ISV = 0, ISS = 0x00000007, ISS2 = 0x00000000 CM = 0, WnR = 0, TnD = 0, TagAccess = 0 GCS = 0, Overlay = 0, DirtyBit = 0, Xs = 0 user pgtable: 4k pages, 39-bit VAs, pgdp=00000002175aa000 [0000000000000058] pgd=08000002175ab003, p4d=08000002175ab003, pud=08000002175ab003, pmd=08000002175be003, pte=0000000000000000 pstate: 834000c5 (Nzcv daIF +PAN -UAO +TCO +DIT -SSBS BTYPE=--) pc : __task_pid_nr_ns+0x74/0xd0 lr : __task_pid_nr_ns+0x24/0xd0 sp : ffffffc08001bd10 x29: ffffffc08001bd10 x28: ffffffd4422b2000 x27: 0000000000000001 x26: ffffffd442821168 x25: ffffffd442821000 x24: 00000f89492eab31 x23: 00000000000000c0 x22: ffffff806f5693c0 x21: ffffff806f5693c0 x20: 0000000000000001 x19: 0000000000000000 x18: 0000000000000000 x17: 00000000529c6ef0 x16: 00000000529c6ef0 x15: 00000000023a1adc x14: 0000000000000003 x13: 00000000007ef6d8 x12: 001167c391c78800 x11: 00ffffffffffffff x10: 0000000000000000 x9 : 0000000000000001 x8 : ffffff80816fa3c0 x7 : 0000000000000000 x6 : 49534d702d535449 x5 : ffffffc080c4c2c0 x4 : ffffffd43ee128c8 x3 : ffffffd43ee124dc x2 : 0000000000000000 x1 : 0000000000000001 x0 : ffffff806f5693c0 Call trace: __task_pid_nr_ns+0x74/0xd0 ... __handle_irq_event_percpu+0xd4/0x284 handle_irq_event+0x48/0xb0 handle_fasteoi_irq+0x160/0x2d8 generic_handle_domain_irq+0x44/0x60 gic_handle_irq+0x4c/0x114 call_on_irq_stack+0x3c/0x74 do_interrupt_handler+0x4c/0x84 el1_interrupt+0x34/0x58 el1h_64_irq_handler+0x18/0x24 el1h_64_irq+0x68/0x6c account_kernel_stack+0x60/0x144 exit_task_stack_account+0x1c/0x80 do_exit+0x7e4/0xaf8 ... 
 get_signal+0x7bc/0x8d8
 do_notify_resume+0x128/0x828
 el0_svc+0x6c/0x70
 el0t_64_sync_handler+0x68/0xbc
 el0t_64_sync+0x1a8/0x1ac
Code: 35fffe54 911a02a8 f9400108 b4000128 (b9405a69)
---[ end trace 0000000000000000 ]---
Kernel panic - not syncing: Oops: Fatal exception in interrupt

Signed-off-by: gaoxiang17
Link: https://lore.kernel.org/20250802022123.3536934-1-gxxa03070307@gmail.com
Reviewed-by: Baoquan He
Signed-off-by: Christian Brauner
Signed-off-by: Sasha Levin
---
 kernel/pid.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/pid.c b/kernel/pid.c
index efe87db44683..61f6649568b2 100644
--- a/kernel/pid.c
+++ b/kernel/pid.c
@@ -474,7 +474,7 @@ pid_t pid_nr_ns(struct pid *pid, struct pid_namespace *ns)
 	struct upid *upid;
 	pid_t nr = 0;
 
-	if (pid && ns->level <= pid->level) {
+	if (pid && ns && ns->level <= pid->level) {
 		upid = &pid->numbers[ns->level];
 		if (upid->ns == ns)
 			nr = upid->nr;
-- cgit v1.2.3


From 15fda76f7a57a0ed319863e45c019d21ca229624 Mon Sep 17 00:00:00 2001
From: Ingo Molnar
Date: Fri, 8 Mar 2024 12:18:16 +0100
Subject: sched/balancing: Rename newidle_balance() => sched_balance_newidle()

[ Upstream commit 7d058285cd77cc1411c91efd1b1673530bb1bee8 ]

Standardize scheduler load-balancing function names on the
sched_balance_() prefix.

Signed-off-by: Ingo Molnar
Reviewed-by: Shrikanth Hegde
Link: https://lore.kernel.org/r/20240308111819.1101550-11-mingo@kernel.org
Stable-dep-of: 17e3e88ed0b6 ("sched/fair: Fix pelt lost idle time detection")
Signed-off-by: Sasha Levin
---
 kernel/sched/fair.c | 16 ++++++++--------
 1 file changed, 8 insertions(+), 8 deletions(-)

(limited to 'kernel')

diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index ea707ee9ddac..03b809308712 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -3959,7 +3959,7 @@ static inline unsigned long cfs_rq_load_avg(struct cfs_rq *cfs_rq)
 	return cfs_rq->avg.load_avg;
 }
 
-static int newidle_balance(struct rq *this_rq, struct rq_flags *rf);
+static int sched_balance_newidle(struct rq *this_rq, struct rq_flags *rf);
 
 static inline unsigned long task_util(struct task_struct *p)
 {
@@ -4291,7 +4291,7 @@ attach_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se) {}
 static inline void
 detach_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se) {}
 
-static inline int newidle_balance(struct rq *rq, struct rq_flags *rf)
+static inline int sched_balance_newidle(struct rq *rq, struct rq_flags *rf)
 {
 	return 0;
 }
@@ -7280,7 +7280,7 @@ balance_fair(struct rq *rq, struct task_struct *prev, struct rq_flags *rf)
 	if (rq->nr_running)
 		return 1;
 
-	return newidle_balance(rq, rf) != 0;
+	return sched_balance_newidle(rq, rf) != 0;
 }
 #endif /* CONFIG_SMP */
 
@@ -7616,10 +7616,10 @@ idle:
 	if (!rf)
 		return NULL;
 
-	new_tasks = newidle_balance(rq, rf);
+	new_tasks = sched_balance_newidle(rq, rf);
 
 	/*
-	 * Because newidle_balance() releases (and re-acquires) rq->lock, it is
+	 * Because sched_balance_newidle() releases (and re-acquires) rq->lock, it is
 	 * possible for any higher priority task to appear. In that case we
 	 * must re-start the pick_next_entity() loop.
 	 */
@@ -10427,7 +10427,7 @@ out_one_pinned:
 	ld_moved = 0;
 
 	/*
-	 * newidle_balance() disregards balance intervals, so we could
+	 * sched_balance_newidle() disregards balance intervals, so we could
 	 * repeatedly reach this code, which would lead to balance_interval
 	 * skyrocketing in a short amount of time. Skip the balance_interval
 	 * increase logic to avoid that.
@@ -11155,7 +11155,7 @@ static inline void nohz_newidle_balance(struct rq *this_rq) { }
 #endif /* CONFIG_NO_HZ_COMMON */
 
 /*
- * newidle_balance is called by schedule() if this_cpu is about to become
+ * sched_balance_newidle is called by schedule() if this_cpu is about to become
 * idle. Attempts to pull tasks from other CPUs.
 *
 * Returns:
@@ -11163,7 +11163,7 @@ static inline void nohz_newidle_balance(struct rq *this_rq) { }
 * 0 - failed, no new tasks
 * > 0 - success, new (fair) tasks present
 */
-static int newidle_balance(struct rq *this_rq, struct rq_flags *rf)
+static int sched_balance_newidle(struct rq *this_rq, struct rq_flags *rf)
 {
 	unsigned long next_balance = jiffies + HZ;
 	int this_cpu = this_rq->cpu;
-- cgit v1.2.3


From fdccb3adc59d70fe43a1a6c2d04bc7ebf3898a51 Mon Sep 17 00:00:00 2001
From: Vincent Guittot
Date: Wed, 8 Oct 2025 15:12:14 +0200
Subject: sched/fair: Fix pelt lost idle time detection

[ Upstream commit 17e3e88ed0b6318fde0d1c14df1a804711cab1b5 ]

The check for some lost idle pelt time should always be done when
pick_next_task_fair() fails to pick a task, and not only when we call it
from the fair fast-path. The case happens when the last running task on
the rq is an RT or DL task. When the latter goes to sleep and the /Sum of
util_sum of the rq is at the max value, we don't account for the lost
idle time whereas we should.

Fixes: 67692435c411 ("sched: Rework pick_next_task() slow-path")
Signed-off-by: Vincent Guittot
Signed-off-by: Peter Zijlstra (Intel)
Signed-off-by: Sasha Levin
---
 kernel/sched/fair.c | 26 +++++++++++++-------------
 1 file changed, 13 insertions(+), 13 deletions(-)

(limited to 'kernel')

diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 03b809308712..87f32cf8aa02 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -7613,21 +7613,21 @@ done: __maybe_unused;
 	return p;
 
 idle:
-	if (!rf)
-		return NULL;
-
-	new_tasks = sched_balance_newidle(rq, rf);
+	if (rf) {
+		new_tasks = sched_balance_newidle(rq, rf);
 
-	/*
-	 * Because sched_balance_newidle() releases (and re-acquires) rq->lock, it is
-	 * possible for any higher priority task to appear. In that case we
-	 * must re-start the pick_next_entity() loop.
-	 */
-	if (new_tasks < 0)
-		return RETRY_TASK;
+		/*
+		 * Because sched_balance_newidle() releases (and re-acquires)
+		 * rq->lock, it is possible for any higher priority task to
+		 * appear. In that case we must re-start the pick_next_entity()
+		 * loop.
+		 */
+		if (new_tasks < 0)
+			return RETRY_TASK;
 
-	if (new_tasks > 0)
-		goto again;
+		if (new_tasks > 0)
+			goto again;
+	}
 
 	/*
 	 * rq is about to be idle, check if we need to update the
-- cgit v1.2.3


From 6012804a778606e9b83b50dd342ea5558812ed0f Mon Sep 17 00:00:00 2001
From: Xiao Liang
Date: Mon, 20 Oct 2025 11:39:57 -0400
Subject: padata: Reset next CPU when reorder sequence wraps around

[ Upstream commit 501302d5cee0d8e8ec2c4a5919c37e0df9abc99b ]

When seq_nr wraps around, the next reorder job with seq 0 is hashed to
the first CPU in padata_do_serial(). Correspondingly, pd->cpu needs to
be reset to the first CPU when pd->processed wraps around. Otherwise,
if the number of used CPUs is not a power of 2, padata_find_next() will
be checking the wrong list, hence a deadlock.
Fixes: 6fc4dbcf0276 ("padata: Replace delayed timer with immediate workqueue in padata_reorder")
Cc:
Signed-off-by: Xiao Liang
Signed-off-by: Herbert Xu
[ moved fix from padata_reorder() to padata_find_next() and adapted
  cpumask_next_wrap() to 4-argument signature ]
Signed-off-by: Sasha Levin
Signed-off-by: Greg Kroah-Hartman
---
 kernel/padata.c | 6 +++++-
 1 file changed, 5 insertions(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/padata.c b/kernel/padata.c
index b443e19e64cf..5453f5750906 100644
--- a/kernel/padata.c
+++ b/kernel/padata.c
@@ -282,7 +282,11 @@ static struct padata_priv *padata_find_next(struct parallel_data *pd,
 	if (remove_object) {
 		list_del_init(&padata->list);
 		++pd->processed;
-		pd->cpu = cpumask_next_wrap(cpu, pd->cpumask.pcpu, -1, false);
+		/* When sequence wraps around, reset to the first CPU. */
+		if (unlikely(pd->processed == 0))
+			pd->cpu = cpumask_first(pd->cpumask.pcpu);
+		else
+			pd->cpu = cpumask_next_wrap(cpu, pd->cpumask.pcpu, -1, false);
 	}
 
 	spin_unlock(&reorder->lock);
-- cgit v1.2.3
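To illustrate the bookkeeping fixed above, here is a standalone sketch
(plain userspace C, not kernel code). It assumes the serial side picks the
reorder CPU as roughly seq_nr modulo the number of used CPUs, a simplified
version of the hashing done in padata_do_serial(). With a CPU count that
is not a power of two, a walker that only steps round-robin drifts away
from that hash at the 32-bit wrap unless it is reset to the first CPU,
which is what the cpumask_first() branch in the patch does.

	#include <stdio.h>

	#define NR_USED_CPUS 3	/* not a power of two */

	/* CPU index the serialization side expects for a given sequence number. */
	static unsigned int hash_cpu(unsigned int seq_nr)
	{
		return seq_nr % NR_USED_CPUS;
	}

	int main(void)
	{
		unsigned int seq_nr = (unsigned int)-2;		/* two jobs before the wrap */
		unsigned int old_walker = hash_cpu(seq_nr);	/* round-robin only */
		unsigned int new_walker = hash_cpu(seq_nr);	/* resets on wrap-around */

		for (int i = 0; i < 4; i++) {
			unsigned int want = hash_cpu(seq_nr);

			printf("seq %10u: job on cpu %u, old walker %u%s, new walker %u\n",
			       seq_nr, want, old_walker,
			       want == old_walker ? "" : " (stale)", new_walker);

			/* pre-fix behaviour: unconditional round-robin step */
			old_walker = (old_walker + 1) % NR_USED_CPUS;

			seq_nr++;			/* next job's sequence number */
			if (seq_nr == 0)		/* post-fix: reset on wrap-around */
				new_walker = 0;
			else
				new_walker = (new_walker + 1) % NR_USED_CPUS;
		}
		return 0;
	}

Once the counter wraps to 0, the old walker points one CPU past the list
that actually holds the next job, so padata_find_next() would never find
it; the reset keeps the walker and the hash aligned.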