From 1c0908d8e441631f5b8ba433523cf39339ee2ba0 Mon Sep 17 00:00:00 2001 From: Mel Gorman Date: Fri, 2 Dec 2022 10:02:23 +0000 Subject: rtmutex: Add acquire semantics for rtmutex lock acquisition slow path Jan Kara reported the following bug triggering on 6.0.5-rt14 running dbench on XFS on arm64. kernel BUG at fs/inode.c:625! Internal error: Oops - BUG: 0 [#1] PREEMPT_RT SMP CPU: 11 PID: 6611 Comm: dbench Tainted: G E 6.0.0-rt14-rt+ #1 pc : clear_inode+0xa0/0xc0 lr : clear_inode+0x38/0xc0 Call trace: clear_inode+0xa0/0xc0 evict+0x160/0x180 iput+0x154/0x240 do_unlinkat+0x184/0x300 __arm64_sys_unlinkat+0x48/0xc0 el0_svc_common.constprop.4+0xe4/0x2c0 do_el0_svc+0xac/0x100 el0_svc+0x78/0x200 el0t_64_sync_handler+0x9c/0xc0 el0t_64_sync+0x19c/0x1a0 It also affects 6.1-rc7-rt5 and affects a preempt-rt fork of 5.14 so this is likely a bug that existed forever and only became visible when ARM support was added to preempt-rt. The same problem does not occur on x86-64 and he also reported that converting sb->s_inode_wblist_lock to raw_spinlock_t makes the problem disappear indicating that the RT spinlock variant is the problem. Which in turn means that RT mutexes on ARM64 and any other weakly ordered architecture are affected by this independent of RT. Will Deacon observed: "I'd be more inclined to be suspicious of the slowpath tbh, as we need to make sure that we have acquire semantics on all paths where the lock can be taken. Looking at the rtmutex code, this really isn't obvious to me -- for example, try_to_take_rt_mutex() appears to be able to return via the 'takeit' label without acquire semantics and it looks like we might be relying on the caller's subsequent _unlock_ of the wait_lock for ordering, but that will give us release semantics which aren't correct." Sebastian Andrzej Siewior prototyped a fix that does work based on that comment but it was a little bit overkill and added some fences that should not be necessary. The lock owner is updated with an IRQ-safe raw spinlock held, but the spin_unlock does not provide acquire semantics which are needed when acquiring a mutex. Adds the necessary acquire semantics for lock owner updates in the slow path acquisition and the waiter bit logic. It successfully completed 10 iterations of the dbench workload while the vanilla kernel fails on the first iteration. [ bigeasy@linutronix.de: Initial prototype fix ] Fixes: 700318d1d7b38 ("locking/rtmutex: Use acquire/release semantics") Fixes: 23f78d4a03c5 ("[PATCH] pi-futex: rt mutex core") Reported-by: Jan Kara Signed-off-by: Mel Gorman Signed-off-by: Thomas Gleixner Cc: stable@vger.kernel.org Link: https://lore.kernel.org/r/20221202100223.6mevpbl7i6x5udfd@techsingularity.net --- kernel/locking/rtmutex.c | 55 ++++++++++++++++++++++++++++++++++++-------- kernel/locking/rtmutex_api.c | 6 ++--- 2 files changed, 49 insertions(+), 12 deletions(-) (limited to 'kernel') diff --git a/kernel/locking/rtmutex.c b/kernel/locking/rtmutex.c index 7779ee8abc2a..010cf4e6d0b8 100644 --- a/kernel/locking/rtmutex.c +++ b/kernel/locking/rtmutex.c @@ -89,15 +89,31 @@ static inline int __ww_mutex_check_kill(struct rt_mutex *lock, * set this bit before looking at the lock. */ -static __always_inline void -rt_mutex_set_owner(struct rt_mutex_base *lock, struct task_struct *owner) +static __always_inline struct task_struct * +rt_mutex_owner_encode(struct rt_mutex_base *lock, struct task_struct *owner) { unsigned long val = (unsigned long)owner; if (rt_mutex_has_waiters(lock)) val |= RT_MUTEX_HAS_WAITERS; - WRITE_ONCE(lock->owner, (struct task_struct *)val); + return (struct task_struct *)val; +} + +static __always_inline void +rt_mutex_set_owner(struct rt_mutex_base *lock, struct task_struct *owner) +{ + /* + * lock->wait_lock is held but explicit acquire semantics are needed + * for a new lock owner so WRITE_ONCE is insufficient. + */ + xchg_acquire(&lock->owner, rt_mutex_owner_encode(lock, owner)); +} + +static __always_inline void rt_mutex_clear_owner(struct rt_mutex_base *lock) +{ + /* lock->wait_lock is held so the unlock provides release semantics. */ + WRITE_ONCE(lock->owner, rt_mutex_owner_encode(lock, NULL)); } static __always_inline void clear_rt_mutex_waiters(struct rt_mutex_base *lock) @@ -106,7 +122,8 @@ static __always_inline void clear_rt_mutex_waiters(struct rt_mutex_base *lock) ((unsigned long)lock->owner & ~RT_MUTEX_HAS_WAITERS); } -static __always_inline void fixup_rt_mutex_waiters(struct rt_mutex_base *lock) +static __always_inline void +fixup_rt_mutex_waiters(struct rt_mutex_base *lock, bool acquire_lock) { unsigned long owner, *p = (unsigned long *) &lock->owner; @@ -172,8 +189,21 @@ static __always_inline void fixup_rt_mutex_waiters(struct rt_mutex_base *lock) * still set. */ owner = READ_ONCE(*p); - if (owner & RT_MUTEX_HAS_WAITERS) - WRITE_ONCE(*p, owner & ~RT_MUTEX_HAS_WAITERS); + if (owner & RT_MUTEX_HAS_WAITERS) { + /* + * See rt_mutex_set_owner() and rt_mutex_clear_owner() on + * why xchg_acquire() is used for updating owner for + * locking and WRITE_ONCE() for unlocking. + * + * WRITE_ONCE() would work for the acquire case too, but + * in case that the lock acquisition failed it might + * force other lockers into the slow path unnecessarily. + */ + if (acquire_lock) + xchg_acquire(p, owner & ~RT_MUTEX_HAS_WAITERS); + else + WRITE_ONCE(*p, owner & ~RT_MUTEX_HAS_WAITERS); + } } /* @@ -208,6 +238,13 @@ static __always_inline void mark_rt_mutex_waiters(struct rt_mutex_base *lock) owner = *p; } while (cmpxchg_relaxed(p, owner, owner | RT_MUTEX_HAS_WAITERS) != owner); + + /* + * The cmpxchg loop above is relaxed to avoid back-to-back ACQUIRE + * operations in the event of contention. Ensure the successful + * cmpxchg is visible. + */ + smp_mb__after_atomic(); } /* @@ -1243,7 +1280,7 @@ static int __sched __rt_mutex_slowtrylock(struct rt_mutex_base *lock) * try_to_take_rt_mutex() sets the lock waiters bit * unconditionally. Clean this up. */ - fixup_rt_mutex_waiters(lock); + fixup_rt_mutex_waiters(lock, true); return ret; } @@ -1604,7 +1641,7 @@ static int __sched __rt_mutex_slowlock(struct rt_mutex_base *lock, * try_to_take_rt_mutex() sets the waiter bit * unconditionally. We might have to fix that up. */ - fixup_rt_mutex_waiters(lock); + fixup_rt_mutex_waiters(lock, true); trace_contention_end(lock, ret); @@ -1719,7 +1756,7 @@ static void __sched rtlock_slowlock_locked(struct rt_mutex_base *lock) * try_to_take_rt_mutex() sets the waiter bit unconditionally. * We might have to fix that up: */ - fixup_rt_mutex_waiters(lock); + fixup_rt_mutex_waiters(lock, true); debug_rt_mutex_free_waiter(&waiter); trace_contention_end(lock, 0); diff --git a/kernel/locking/rtmutex_api.c b/kernel/locking/rtmutex_api.c index 900220941caa..cb9fdff76a8a 100644 --- a/kernel/locking/rtmutex_api.c +++ b/kernel/locking/rtmutex_api.c @@ -267,7 +267,7 @@ void __sched rt_mutex_init_proxy_locked(struct rt_mutex_base *lock, void __sched rt_mutex_proxy_unlock(struct rt_mutex_base *lock) { debug_rt_mutex_proxy_unlock(lock); - rt_mutex_set_owner(lock, NULL); + rt_mutex_clear_owner(lock); } /** @@ -382,7 +382,7 @@ int __sched rt_mutex_wait_proxy_lock(struct rt_mutex_base *lock, * try_to_take_rt_mutex() sets the waiter bit unconditionally. We might * have to fix that up. */ - fixup_rt_mutex_waiters(lock); + fixup_rt_mutex_waiters(lock, true); raw_spin_unlock_irq(&lock->wait_lock); return ret; @@ -438,7 +438,7 @@ bool __sched rt_mutex_cleanup_proxy_lock(struct rt_mutex_base *lock, * try_to_take_rt_mutex() sets the waiter bit unconditionally. We might * have to fix that up. */ - fixup_rt_mutex_waiters(lock); + fixup_rt_mutex_waiters(lock, false); raw_spin_unlock_irq(&lock->wait_lock); -- cgit v1.2.3 From cc074822465d18a2d39e0b3e2b48b6766a568db2 Mon Sep 17 00:00:00 2001 From: Hou Tao Date: Sat, 17 Dec 2022 14:21:44 +0800 Subject: bpf: Define sock security related BTF IDs under CONFIG_SECURITY_NETWORK There are warnings reported from resolve_btfids when building vmlinux with CONFIG_SECURITY_NETWORK disabled: WARN: resolve_btfids: unresolved symbol bpf_lsm_sk_free_security WARN: resolve_btfids: unresolved symbol bpf_lsm_sk_alloc_security So only define BTF IDs for these LSM hooks when CONFIG_SECURITY_NETWORK is enabled. Fixes: c0c852dd1876 ("bpf: Do not mark certain LSM hook arguments as trusted") Signed-off-by: Hou Tao Signed-off-by: Daniel Borkmann Acked-by: Yonghong Song Link: https://lore.kernel.org/bpf/20221217062144.2507222-1-houtao@huaweicloud.com --- kernel/bpf/bpf_lsm.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'kernel') diff --git a/kernel/bpf/bpf_lsm.c b/kernel/bpf/bpf_lsm.c index 9ea42a45da47..a4a41ee3e80b 100644 --- a/kernel/bpf/bpf_lsm.c +++ b/kernel/bpf/bpf_lsm.c @@ -351,8 +351,10 @@ BTF_ID(func, bpf_lsm_bpf_prog_alloc_security) BTF_ID(func, bpf_lsm_bpf_prog_free_security) BTF_ID(func, bpf_lsm_file_alloc_security) BTF_ID(func, bpf_lsm_file_free_security) +#ifdef CONFIG_SECURITY_NETWORK BTF_ID(func, bpf_lsm_sk_alloc_security) BTF_ID(func, bpf_lsm_sk_free_security) +#endif /* CONFIG_SECURITY_NETWORK */ BTF_ID(func, bpf_lsm_task_free) BTF_SET_END(untrusted_lsm_hooks) -- cgit v1.2.3 From 8374bfd5a3c90a5b250f7c087c4d2b8ac467b12e Mon Sep 17 00:00:00 2001 From: Hao Sun Date: Thu, 22 Dec 2022 10:44:13 +0800 Subject: bpf: fix nullness propagation for reg to reg comparisons After befae75856ab, the verifier would propagate null information after JEQ/JNE, e.g., if two pointers, one is maybe_null and the other is not, the former would be marked as non-null in eq path. However, as comment "PTR_TO_BTF_ID points to a kernel struct that does not need to be null checked by the BPF program ... The verifier must keep this in mind and can make no assumptions about null or non-null when doing branch ...". If one pointer is maybe_null and the other is PTR_TO_BTF, the former is incorrectly marked non-null. The following BPF prog can trigger a null-ptr-deref, also see this report for more details[1]: 0: (18) r1 = map_fd ; R1_w=map_ptr(ks=4, vs=4) 2: (79) r6 = *(u64 *)(r1 +8) ; R6_w=bpf_map->inner_map_data ; R6 is PTR_TO_BTF_ID ; equals to null at runtime 3: (bf) r2 = r10 4: (07) r2 += -4 5: (62) *(u32 *)(r2 +0) = 0 6: (85) call bpf_map_lookup_elem#1 ; R0_w=map_value_or_null 7: (1d) if r6 == r0 goto pc+1 8: (95) exit ; from 7 to 9: R0=map_value R6=ptr_bpf_map 9: (61) r0 = *(u32 *)(r0 +0) ; null-ptr-deref 10: (95) exit So, make the verifier propagate nullness information for reg to reg comparisons only if neither reg is PTR_TO_BTF_ID. [1] https://lore.kernel.org/bpf/CACkBjsaFJwjC5oiw-1KXvcazywodwXo4zGYsRHwbr2gSG9WcSw@mail.gmail.com/T/#u Fixes: befae75856ab ("bpf: propagate nullness information for reg to reg comparisons") Signed-off-by: Hao Sun Acked-by: Yonghong Song Link: https://lore.kernel.org/r/20221222024414.29539-1-sunhao.th@gmail.com Signed-off-by: Martin KaFai Lau --- kernel/bpf/verifier.c | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index a5255a0dcbb6..243d06ce6842 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -11822,10 +11822,17 @@ static int check_cond_jmp_op(struct bpf_verifier_env *env, * register B - not null * for JNE A, B, ... - A is not null in the false branch; * for JEQ A, B, ... - A is not null in the true branch. + * + * Since PTR_TO_BTF_ID points to a kernel struct that does + * not need to be null checked by the BPF program, i.e., + * could be null even without PTR_MAYBE_NULL marking, so + * only propagate nullness when neither reg is that type. */ if (!is_jmp32 && BPF_SRC(insn->code) == BPF_X && __is_pointer_value(false, src_reg) && __is_pointer_value(false, dst_reg) && - type_may_be_null(src_reg->type) != type_may_be_null(dst_reg->type)) { + type_may_be_null(src_reg->type) != type_may_be_null(dst_reg->type) && + base_type(src_reg->type) != PTR_TO_BTF_ID && + base_type(dst_reg->type) != PTR_TO_BTF_ID) { eq_branch_regs = NULL; switch (opcode) { case BPF_JEQ: -- cgit v1.2.3 From e2d371484653ac83b970d3ebcf343383f39f8b6b Mon Sep 17 00:00:00 2001 From: Ravi Bangoria Date: Fri, 18 Nov 2022 10:45:39 +0530 Subject: perf core: Return error pointer if inherit_event() fails to find pmu_ctx inherit_event() returns NULL only when it finds orphaned events otherwise it returns either valid child_event pointer or an error pointer. Follow the same when it fails to find pmu_ctx. Fixes: bd2756811766 ("perf: Rewrite core context handling") Reported-by: Dan Carpenter Signed-off-by: Ravi Bangoria Signed-off-by: Peter Zijlstra (Intel) Link: https://lkml.kernel.org/r/20221118051539.820-1-ravi.bangoria@amd.com --- kernel/events/core.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/events/core.c b/kernel/events/core.c index eacc3702654d..4bd2434251f0 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -13231,7 +13231,7 @@ inherit_event(struct perf_event *parent_event, pmu_ctx = find_get_pmu_context(child_event->pmu, child_ctx, child_event); if (IS_ERR(pmu_ctx)) { free_event(child_event); - return NULL; + return ERR_CAST(pmu_ctx); } child_event->pmu_ctx = pmu_ctx; -- cgit v1.2.3 From f841b682baef90ee144df8b12e2c76aa460717c1 Mon Sep 17 00:00:00 2001 From: Chengming Zhou Date: Wed, 7 Dec 2022 20:40:23 +0800 Subject: perf/core: Fix cgroup events tracking We encounter perf warnings when using cgroup events like: cd /sys/fs/cgroup mkdir test perf stat -e cycles -a -G test Which then triggers: WARNING: CPU: 0 PID: 690 at kernel/events/core.c:849 perf_cgroup_switch+0xb2/0xc0 Call Trace: __schedule+0x4ae/0x9f0 ? _raw_spin_unlock_irqrestore+0x23/0x40 ? __cond_resched+0x18/0x20 preempt_schedule_common+0x2d/0x70 __cond_resched+0x18/0x20 wait_for_completion+0x2f/0x160 ? cpu_stop_queue_work+0x9e/0x130 affine_move_task+0x18a/0x4f0 WARNING: CPU: 0 PID: 690 at kernel/events/core.c:829 ctx_sched_in+0x1cf/0x1e0 Call Trace: ? ctx_sched_out+0xb7/0x1b0 perf_cgroup_switch+0x88/0xc0 __schedule+0x4ae/0x9f0 ? _raw_spin_unlock_irqrestore+0x23/0x40 ? __cond_resched+0x18/0x20 preempt_schedule_common+0x2d/0x70 __cond_resched+0x18/0x20 wait_for_completion+0x2f/0x160 ? cpu_stop_queue_work+0x9e/0x130 affine_move_task+0x18a/0x4f0 The above two warnings are not complete here since I remove other unimportant information. The problem is caused by the perf cgroup events tracking: CPU0 CPU1 perf_event_open() perf_event_alloc() account_event() account_event_cpu() atomic_inc(perf_cgroup_events) __perf_event_task_sched_out() if (atomic_read(perf_cgroup_events)) perf_cgroup_switch() // kernel/events/core.c:849 WARN_ON_ONCE(cpuctx->ctx.nr_cgroups == 0) if (READ_ONCE(cpuctx->cgrp) == cgrp) // false return perf_ctx_lock() ctx_sched_out() cpuctx->cgrp = cgrp ctx_sched_in() perf_cgroup_set_timestamp() // kernel/events/core.c:829 WARN_ON_ONCE(!ctx->nr_cgroups) perf_ctx_unlock() perf_install_in_context() cpu_function_call() __perf_install_in_context() add_event_to_ctx() list_add_event() perf_cgroup_event_enable() ctx->nr_cgroups++ cpuctx->cgrp = X We can see from above that we wrongly use percpu atomic perf_cgroup_events to check if we need to perf_cgroup_switch(), which should only be used when we know this CPU has cgroup events enabled. The commit bd2756811766 ("perf: Rewrite core context handling") change to have only one context per-CPU, so we can just use cpuctx->cgrp to check if this CPU has cgroup events enabled. So percpu atomic perf_cgroup_events is not needed. Fixes: bd2756811766 ("perf: Rewrite core context handling") Signed-off-by: Chengming Zhou Signed-off-by: Peter Zijlstra (Intel) Tested-by: Ravi Bangoria Link: https://lkml.kernel.org/r/20221207124023.66252-1-zhouchengming@bytedance.com --- kernel/events/core.c | 42 ++++++++++-------------------------------- 1 file changed, 10 insertions(+), 32 deletions(-) (limited to 'kernel') diff --git a/kernel/events/core.c b/kernel/events/core.c index 4bd2434251f0..37c0f04d7a00 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -380,7 +380,6 @@ enum event_type_t { /* * perf_sched_events : >0 events exist - * perf_cgroup_events: >0 per-cpu cgroup events exist on this cpu */ static void perf_sched_delayed(struct work_struct *work); @@ -389,7 +388,6 @@ static DECLARE_DELAYED_WORK(perf_sched_work, perf_sched_delayed); static DEFINE_MUTEX(perf_sched_mutex); static atomic_t perf_sched_count; -static DEFINE_PER_CPU(atomic_t, perf_cgroup_events); static DEFINE_PER_CPU(struct pmu_event_list, pmu_sb_events); static atomic_t nr_mmap_events __read_mostly; @@ -844,9 +842,16 @@ static void perf_cgroup_switch(struct task_struct *task) struct perf_cpu_context *cpuctx = this_cpu_ptr(&perf_cpu_context); struct perf_cgroup *cgrp; - cgrp = perf_cgroup_from_task(task, NULL); + /* + * cpuctx->cgrp is set when the first cgroup event enabled, + * and is cleared when the last cgroup event disabled. + */ + if (READ_ONCE(cpuctx->cgrp) == NULL) + return; WARN_ON_ONCE(cpuctx->ctx.nr_cgroups == 0); + + cgrp = perf_cgroup_from_task(task, NULL); if (READ_ONCE(cpuctx->cgrp) == cgrp) return; @@ -3631,8 +3636,7 @@ void __perf_event_task_sched_out(struct task_struct *task, * to check if we have to switch out PMU state. * cgroup event are system-wide mode only */ - if (atomic_read(this_cpu_ptr(&perf_cgroup_events))) - perf_cgroup_switch(next); + perf_cgroup_switch(next); } static bool perf_less_group_idx(const void *l, const void *r) @@ -4974,15 +4978,6 @@ static void unaccount_pmu_sb_event(struct perf_event *event) detach_sb_event(event); } -static void unaccount_event_cpu(struct perf_event *event, int cpu) -{ - if (event->parent) - return; - - if (is_cgroup_event(event)) - atomic_dec(&per_cpu(perf_cgroup_events, cpu)); -} - #ifdef CONFIG_NO_HZ_FULL static DEFINE_SPINLOCK(nr_freq_lock); #endif @@ -5048,8 +5043,6 @@ static void unaccount_event(struct perf_event *event) schedule_delayed_work(&perf_sched_work, HZ); } - unaccount_event_cpu(event, event->cpu); - unaccount_pmu_sb_event(event); } @@ -11679,15 +11672,6 @@ static void account_pmu_sb_event(struct perf_event *event) attach_sb_event(event); } -static void account_event_cpu(struct perf_event *event, int cpu) -{ - if (event->parent) - return; - - if (is_cgroup_event(event)) - atomic_inc(&per_cpu(perf_cgroup_events, cpu)); -} - /* Freq events need the tick to stay alive (see perf_event_task_tick). */ static void account_freq_event_nohz(void) { @@ -11775,8 +11759,6 @@ static void account_event(struct perf_event *event) } enabled: - account_event_cpu(event, event->cpu); - account_pmu_sb_event(event); } @@ -12822,13 +12804,11 @@ static void __perf_pmu_remove(struct perf_event_context *ctx, perf_event_groups_for_cpu_pmu(event, groups, cpu, pmu) { perf_remove_from_context(event, 0); - unaccount_event_cpu(event, cpu); put_pmu_ctx(event->pmu_ctx); list_add(&event->migrate_entry, events); for_each_sibling_event(sibling, event) { perf_remove_from_context(sibling, 0); - unaccount_event_cpu(sibling, cpu); put_pmu_ctx(sibling->pmu_ctx); list_add(&sibling->migrate_entry, events); } @@ -12847,7 +12827,6 @@ static void __perf_pmu_install_event(struct pmu *pmu, if (event->state >= PERF_EVENT_STATE_OFF) event->state = PERF_EVENT_STATE_INACTIVE; - account_event_cpu(event, cpu); perf_install_in_context(ctx, event, cpu); } @@ -13742,8 +13721,7 @@ static int __perf_cgroup_move(void *info) struct task_struct *task = info; preempt_disable(); - if (atomic_read(this_cpu_ptr(&perf_cgroup_events))) - perf_cgroup_switch(task); + perf_cgroup_switch(task); preempt_enable(); return 0; -- cgit v1.2.3 From a551844e345ba2a1c533dee4b55cb0efddb1bcda Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Mon, 19 Dec 2022 15:40:04 +0100 Subject: perf: Fix use-after-free in error path The syscall error path has a use-after-free; put_pmu_ctx() will reference ctx, therefore we must ensure ctx is destroyed after pmu_ctx is. Fixes: bd2756811766 ("perf: Rewrite core context handling") Reported-by: syzbot+b8e8c01c8ade4fe6e48f@syzkaller.appspotmail.com Signed-off-by: Peter Zijlstra (Intel) Tested-by: Chengming Zhou Link: https://lkml.kernel.org/r/Y6B3xEgkbmFUCeni@hirez.programming.kicks-ass.net --- kernel/events/core.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/events/core.c b/kernel/events/core.c index 37c0f04d7a00..63d674c9b70e 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -12671,7 +12671,8 @@ SYSCALL_DEFINE5(perf_event_open, return event_fd; err_context: - /* event->pmu_ctx freed by free_event() */ + put_pmu_ctx(event->pmu_ctx); + event->pmu_ctx = NULL; /* _free_event() */ err_locked: mutex_unlock(&ctx->mutex); perf_unpin_context(ctx); @@ -12784,6 +12785,7 @@ perf_event_create_kernel_counter(struct perf_event_attr *attr, int cpu, err_pmu_ctx: put_pmu_ctx(pmu_ctx); + event->pmu_ctx = NULL; /* _free_event() */ err_unlock: mutex_unlock(&ctx->mutex); perf_unpin_context(ctx); -- cgit v1.2.3 From 0a041ebca4956292cadfb14a63ace3a9c1dcb0a3 Mon Sep 17 00:00:00 2001 From: Namhyung Kim Date: Tue, 20 Dec 2022 14:31:40 -0800 Subject: perf/core: Call LSM hook after copying perf_event_attr It passes the attr struct to the security_perf_event_open() but it's not initialized yet. Fixes: da97e18458fb ("perf_event: Add support for LSM and SELinux checks") Signed-off-by: Namhyung Kim Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Joel Fernandes (Google) Cc: stable@vger.kernel.org Link: https://lkml.kernel.org/r/20221220223140.4020470-1-namhyung@kernel.org --- kernel/events/core.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/events/core.c b/kernel/events/core.c index 63d674c9b70e..d56328e5080e 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -12321,12 +12321,12 @@ SYSCALL_DEFINE5(perf_event_open, if (flags & ~PERF_FLAG_ALL) return -EINVAL; - /* Do we allow access to perf_event_open(2) ? */ - err = security_perf_event_open(&attr, PERF_SECURITY_OPEN); + err = perf_copy_attr(attr_uptr, &attr); if (err) return err; - err = perf_copy_attr(attr_uptr, &attr); + /* Do we allow access to perf_event_open(2) ? */ + err = security_perf_event_open(&attr, PERF_SECURITY_OPEN); if (err) return err; -- cgit v1.2.3 From 94cd8fa09f5f1ebdd4e90964b08b7f2cc4b36c43 Mon Sep 17 00:00:00 2001 From: Mathieu Desnoyers Date: Wed, 14 Dec 2022 17:20:08 -0500 Subject: futex: Fix futex_waitv() hrtimer debug object leak on kcalloc error In a scenario where kcalloc() fails to allocate memory, the futex_waitv system call immediately returns -ENOMEM without invoking destroy_hrtimer_on_stack(). When CONFIG_DEBUG_OBJECTS_TIMERS=y, this results in leaking a timer debug object. Fixes: bf69bad38cf6 ("futex: Implement sys_futex_waitv()") Signed-off-by: Mathieu Desnoyers Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Davidlohr Bueso Cc: stable@vger.kernel.org Cc: stable@vger.kernel.org # v5.16+ Link: https://lore.kernel.org/r/20221214222008.200393-1-mathieu.desnoyers@efficios.com --- kernel/futex/syscalls.c | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/futex/syscalls.c b/kernel/futex/syscalls.c index 086a22d1adb7..a8074079b09e 100644 --- a/kernel/futex/syscalls.c +++ b/kernel/futex/syscalls.c @@ -286,19 +286,22 @@ SYSCALL_DEFINE5(futex_waitv, struct futex_waitv __user *, waiters, } futexv = kcalloc(nr_futexes, sizeof(*futexv), GFP_KERNEL); - if (!futexv) - return -ENOMEM; + if (!futexv) { + ret = -ENOMEM; + goto destroy_timer; + } ret = futex_parse_waitv(futexv, waiters, nr_futexes); if (!ret) ret = futex_wait_multiple(futexv, nr_futexes, timeout ? &to : NULL); + kfree(futexv); + +destroy_timer: if (timeout) { hrtimer_cancel(&to.timer); destroy_hrtimer_on_stack(&to.timer); } - - kfree(futexv); return ret; } -- cgit v1.2.3 From 9ed1d9aeef5842ecacb660fce933613b58af1e00 Mon Sep 17 00:00:00 2001 From: Chuang Wang Date: Sat, 24 Dec 2022 21:31:46 +0800 Subject: bpf: Fix panic due to wrong pageattr of im->image In the scenario where livepatch and kretfunc coexist, the pageattr of im->image is rox after arch_prepare_bpf_trampoline in bpf_trampoline_update, and then modify_fentry or register_fentry returns -EAGAIN from bpf_tramp_ftrace_ops_func, the BPF_TRAMP_F_ORIG_STACK flag will be configured, and arch_prepare_bpf_trampoline will be re-executed. At this time, because the pageattr of im->image is rox, arch_prepare_bpf_trampoline will read and write im->image, which causes a fault. as follows: insmod livepatch-sample.ko # samples/livepatch/livepatch-sample.c bpftrace -e 'kretfunc:cmdline_proc_show {}' BUG: unable to handle page fault for address: ffffffffa0206000 PGD 322d067 P4D 322d067 PUD 322e063 PMD 1297e067 PTE d428061 Oops: 0003 [#1] PREEMPT SMP PTI CPU: 2 PID: 270 Comm: bpftrace Tainted: G E K 6.1.0 #5 RIP: 0010:arch_prepare_bpf_trampoline+0xed/0x8c0 RSP: 0018:ffffc90001083ad8 EFLAGS: 00010202 RAX: ffffffffa0206000 RBX: 0000000000000020 RCX: 0000000000000000 RDX: ffffffffa0206001 RSI: ffffffffa0206000 RDI: 0000000000000030 RBP: ffffc90001083b70 R08: 0000000000000066 R09: ffff88800f51b400 R10: 000000002e72c6e5 R11: 00000000d0a15080 R12: ffff8880110a68c8 R13: 0000000000000000 R14: ffff88800f51b400 R15: ffffffff814fec10 FS: 00007f87bc0dc780(0000) GS:ffff88803e600000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 CR2: ffffffffa0206000 CR3: 0000000010b70000 CR4: 00000000000006e0 DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400 Call Trace: bpf_trampoline_update+0x25a/0x6b0 __bpf_trampoline_link_prog+0x101/0x240 bpf_trampoline_link_prog+0x2d/0x50 bpf_tracing_prog_attach+0x24c/0x530 bpf_raw_tp_link_attach+0x73/0x1d0 __sys_bpf+0x100e/0x2570 __x64_sys_bpf+0x1c/0x30 do_syscall_64+0x5b/0x80 entry_SYSCALL_64_after_hwframe+0x63/0xcd With this patch, when modify_fentry or register_fentry returns -EAGAIN from bpf_tramp_ftrace_ops_func, the pageattr of im->image will be reset to nx+rw. Cc: stable@vger.kernel.org Fixes: 00963a2e75a8 ("bpf: Support bpf_trampoline on functions with IPMODIFY (e.g. livepatch)") Signed-off-by: Chuang Wang Acked-by: Jiri Olsa Acked-by: Song Liu Link: https://lore.kernel.org/r/20221224133146.780578-1-nashuiliang@gmail.com Signed-off-by: Alexei Starovoitov --- kernel/bpf/trampoline.c | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'kernel') diff --git a/kernel/bpf/trampoline.c b/kernel/bpf/trampoline.c index 11f5ec0b8016..d0ed7d6f5eec 100644 --- a/kernel/bpf/trampoline.c +++ b/kernel/bpf/trampoline.c @@ -488,6 +488,10 @@ again: /* reset fops->func and fops->trampoline for re-register */ tr->fops->func = NULL; tr->fops->trampoline = 0; + + /* reset im->image memory attr for arch_prepare_bpf_trampoline */ + set_memory_nx((long)im->image, 1); + set_memory_rw((long)im->image, 1); goto again; } #endif -- cgit v1.2.3 From 7ff94f276f8ea05df82eb115225e9b26f47a3347 Mon Sep 17 00:00:00 2001 From: Kui-Feng Lee Date: Fri, 16 Dec 2022 14:18:54 -0800 Subject: bpf: keep a reference to the mm, in case the task is dead. Fix the system crash that happens when a task iterator travel through vma of tasks. In task iterators, we used to access mm by following the pointer on the task_struct; however, the death of a task will clear the pointer, even though we still hold the task_struct. That can cause an unexpected crash for a null pointer when an iterator is visiting a task that dies during the visit. Keeping a reference of mm on the iterator ensures we always have a valid pointer to mm. Co-developed-by: Song Liu Signed-off-by: Song Liu Signed-off-by: Kui-Feng Lee Reported-by: Nathan Slingerland Acked-by: Yonghong Song Link: https://lore.kernel.org/r/20221216221855.4122288-2-kuifeng@meta.com Signed-off-by: Alexei Starovoitov --- kernel/bpf/task_iter.c | 39 +++++++++++++++++++++++++++------------ 1 file changed, 27 insertions(+), 12 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/task_iter.c b/kernel/bpf/task_iter.c index c2a2182ce570..c4ab9d6cdbe9 100644 --- a/kernel/bpf/task_iter.c +++ b/kernel/bpf/task_iter.c @@ -438,6 +438,7 @@ struct bpf_iter_seq_task_vma_info { */ struct bpf_iter_seq_task_common common; struct task_struct *task; + struct mm_struct *mm; struct vm_area_struct *vma; u32 tid; unsigned long prev_vm_start; @@ -456,16 +457,19 @@ task_vma_seq_get_next(struct bpf_iter_seq_task_vma_info *info) enum bpf_task_vma_iter_find_op op; struct vm_area_struct *curr_vma; struct task_struct *curr_task; + struct mm_struct *curr_mm; u32 saved_tid = info->tid; /* If this function returns a non-NULL vma, it holds a reference to - * the task_struct, and holds read lock on vma->mm->mmap_lock. + * the task_struct, holds a refcount on mm->mm_users, and holds + * read lock on vma->mm->mmap_lock. * If this function returns NULL, it does not hold any reference or * lock. */ if (info->task) { curr_task = info->task; curr_vma = info->vma; + curr_mm = info->mm; /* In case of lock contention, drop mmap_lock to unblock * the writer. * @@ -504,13 +508,15 @@ task_vma_seq_get_next(struct bpf_iter_seq_task_vma_info *info) * 4.2) VMA2 and VMA2' covers different ranges, process * VMA2'. */ - if (mmap_lock_is_contended(curr_task->mm)) { + if (mmap_lock_is_contended(curr_mm)) { info->prev_vm_start = curr_vma->vm_start; info->prev_vm_end = curr_vma->vm_end; op = task_vma_iter_find_vma; - mmap_read_unlock(curr_task->mm); - if (mmap_read_lock_killable(curr_task->mm)) + mmap_read_unlock(curr_mm); + if (mmap_read_lock_killable(curr_mm)) { + mmput(curr_mm); goto finish; + } } else { op = task_vma_iter_next_vma; } @@ -535,42 +541,47 @@ again: op = task_vma_iter_find_vma; } - if (!curr_task->mm) + curr_mm = get_task_mm(curr_task); + if (!curr_mm) goto next_task; - if (mmap_read_lock_killable(curr_task->mm)) + if (mmap_read_lock_killable(curr_mm)) { + mmput(curr_mm); goto finish; + } } switch (op) { case task_vma_iter_first_vma: - curr_vma = find_vma(curr_task->mm, 0); + curr_vma = find_vma(curr_mm, 0); break; case task_vma_iter_next_vma: - curr_vma = find_vma(curr_task->mm, curr_vma->vm_end); + curr_vma = find_vma(curr_mm, curr_vma->vm_end); break; case task_vma_iter_find_vma: /* We dropped mmap_lock so it is necessary to use find_vma * to find the next vma. This is similar to the mechanism * in show_smaps_rollup(). */ - curr_vma = find_vma(curr_task->mm, info->prev_vm_end - 1); + curr_vma = find_vma(curr_mm, info->prev_vm_end - 1); /* case 1) and 4.2) above just use curr_vma */ /* check for case 2) or case 4.1) above */ if (curr_vma && curr_vma->vm_start == info->prev_vm_start && curr_vma->vm_end == info->prev_vm_end) - curr_vma = find_vma(curr_task->mm, curr_vma->vm_end); + curr_vma = find_vma(curr_mm, curr_vma->vm_end); break; } if (!curr_vma) { /* case 3) above, or case 2) 4.1) with vma->next == NULL */ - mmap_read_unlock(curr_task->mm); + mmap_read_unlock(curr_mm); + mmput(curr_mm); goto next_task; } info->task = curr_task; info->vma = curr_vma; + info->mm = curr_mm; return curr_vma; next_task: @@ -579,6 +590,7 @@ next_task: put_task_struct(curr_task); info->task = NULL; + info->mm = NULL; info->tid++; goto again; @@ -587,6 +599,7 @@ finish: put_task_struct(curr_task); info->task = NULL; info->vma = NULL; + info->mm = NULL; return NULL; } @@ -658,7 +671,9 @@ static void task_vma_seq_stop(struct seq_file *seq, void *v) */ info->prev_vm_start = ~0UL; info->prev_vm_end = info->vma->vm_end; - mmap_read_unlock(info->task->mm); + mmap_read_unlock(info->mm); + mmput(info->mm); + info->mm = NULL; put_task_struct(info->task); info->task = NULL; } -- cgit v1.2.3 From 45435d8da71f9f3e6860e6e6ea9667b6ec17ec64 Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Fri, 23 Dec 2022 10:28:44 -0800 Subject: bpf: Always use maximal size for copy_array() Instead of counting on prior allocations to have sized allocations to the next kmalloc bucket size, always perform a krealloc that is at least ksize(dst) in size (which is a no-op), so the size can be correctly tracked by all the various allocation size trackers (KASAN, __alloc_size, etc). Reported-by: Hyunwoo Kim Link: https://lore.kernel.org/bpf/20221223094551.GA1439509@ubuntu Fixes: ceb35b666d42 ("bpf/verifier: Use kmalloc_size_roundup() to match ksize() usage") Cc: Alexei Starovoitov Cc: Daniel Borkmann Cc: John Fastabend Cc: Andrii Nakryiko Cc: Martin KaFai Lau Cc: Song Liu Cc: Yonghong Song Cc: KP Singh Cc: Stanislav Fomichev Cc: Hao Luo Cc: Jiri Olsa Cc: bpf@vger.kernel.org Signed-off-by: Kees Cook Link: https://lore.kernel.org/r/20221223182836.never.866-kees@kernel.org Signed-off-by: Alexei Starovoitov --- kernel/bpf/verifier.c | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 243d06ce6842..85f96c1e9f62 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -1054,6 +1054,8 @@ static void print_insn_state(struct bpf_verifier_env *env, */ static void *copy_array(void *dst, const void *src, size_t n, size_t size, gfp_t flags) { + size_t alloc_bytes; + void *orig = dst; size_t bytes; if (ZERO_OR_NULL_PTR(src)) @@ -1062,11 +1064,11 @@ static void *copy_array(void *dst, const void *src, size_t n, size_t size, gfp_t if (unlikely(check_mul_overflow(n, size, &bytes))) return NULL; - if (ksize(dst) < ksize(src)) { - kfree(dst); - dst = kmalloc_track_caller(kmalloc_size_roundup(bytes), flags); - if (!dst) - return NULL; + alloc_bytes = max(ksize(orig), kmalloc_size_roundup(bytes)); + dst = krealloc(orig, alloc_bytes, flags); + if (!dst) { + kfree(orig); + return NULL; } memcpy(dst, src, bytes); -- cgit v1.2.3 From 5b24ac2dfd3eb3e36f794af3aa7f2828b19035bd Mon Sep 17 00:00:00 2001 From: Max Filippov Date: Thu, 22 Dec 2022 23:28:21 -0800 Subject: kcsan: test: don't put the expect array on the stack Size of the 'expect' array in the __report_matches is 1536 bytes, which is exactly the default frame size warning limit of the xtensa architecture. As a result allmodconfig xtensa kernel builds with the gcc that does not support the compiler plugins (which otherwise would push the said warning limit to 2K) fail with the following message: kernel/kcsan/kcsan_test.c:257:1: error: the frame size of 1680 bytes is larger than 1536 bytes Fix it by dynamically allocating the 'expect' array. Signed-off-by: Max Filippov Reviewed-by: Marco Elver Tested-by: Marco Elver --- kernel/kcsan/kcsan_test.c | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/kcsan/kcsan_test.c b/kernel/kcsan/kcsan_test.c index dcec1b743c69..a60c561724be 100644 --- a/kernel/kcsan/kcsan_test.c +++ b/kernel/kcsan/kcsan_test.c @@ -159,7 +159,7 @@ static bool __report_matches(const struct expect_report *r) const bool is_assert = (r->access[0].type | r->access[1].type) & KCSAN_ACCESS_ASSERT; bool ret = false; unsigned long flags; - typeof(observed.lines) expect; + typeof(*observed.lines) *expect; const char *end; char *cur; int i; @@ -168,6 +168,10 @@ static bool __report_matches(const struct expect_report *r) if (!report_available()) return false; + expect = kmalloc(sizeof(observed.lines), GFP_KERNEL); + if (WARN_ON(!expect)) + return false; + /* Generate expected report contents. */ /* Title */ @@ -253,6 +257,7 @@ static bool __report_matches(const struct expect_report *r) strstr(observed.lines[2], expect[1]))); out: spin_unlock_irqrestore(&observed.lock, flags); + kfree(expect); return ret; } -- cgit v1.2.3 From f3cb80804b8295323919e031281768ba3bf5f8da Mon Sep 17 00:00:00 2001 From: Randy Dunlap Date: Mon, 2 Jan 2023 19:28:49 -0800 Subject: time: Fix various kernel-doc problems Clean up kernel-doc complaints about function names and non-kernel-doc comments in kernel/time/. Fixes these warnings: kernel/time/time.c:479: warning: expecting prototype for set_normalized_timespec(). Prototype was for set_normalized_timespec64() instead kernel/time/time.c:553: warning: expecting prototype for msecs_to_jiffies(). Prototype was for __msecs_to_jiffies() instead kernel/time/timekeeping.c:1595: warning: contents before sections kernel/time/timekeeping.c:1705: warning: This comment starts with '/**', but isn't a kernel-doc comment. * We have three kinds of time sources to use for sleep time kernel/time/timekeeping.c:1726: warning: This comment starts with '/**', but isn't a kernel-doc comment. * 1) can be determined whether to use or not only when doing kernel/time/tick-oneshot.c:21: warning: missing initial short description on line: * tick_program_event kernel/time/tick-oneshot.c:107: warning: expecting prototype for tick_check_oneshot_mode(). Prototype was for tick_oneshot_mode_active() instead Signed-off-by: Randy Dunlap Signed-off-by: Ingo Molnar Link: https://lore.kernel.org/r/20230103032849.12723-1-rdunlap@infradead.org --- kernel/time/tick-oneshot.c | 4 ++-- kernel/time/time.c | 8 ++++---- kernel/time/timekeeping.c | 8 ++++---- 3 files changed, 10 insertions(+), 10 deletions(-) (limited to 'kernel') diff --git a/kernel/time/tick-oneshot.c b/kernel/time/tick-oneshot.c index 475ecceda768..5e2c2c26b3cc 100644 --- a/kernel/time/tick-oneshot.c +++ b/kernel/time/tick-oneshot.c @@ -18,7 +18,7 @@ #include "tick-internal.h" /** - * tick_program_event + * tick_program_event - program the CPU local timer device for the next event */ int tick_program_event(ktime_t expires, int force) { @@ -99,7 +99,7 @@ int tick_switch_to_oneshot(void (*handler)(struct clock_event_device *)) } /** - * tick_check_oneshot_mode - check whether the system is in oneshot mode + * tick_oneshot_mode_active - check whether the system is in oneshot mode * * returns 1 when either nohz or highres are enabled. otherwise 0. */ diff --git a/kernel/time/time.c b/kernel/time/time.c index 526257b3727c..f4198af60fee 100644 --- a/kernel/time/time.c +++ b/kernel/time/time.c @@ -462,7 +462,7 @@ struct __kernel_old_timeval ns_to_kernel_old_timeval(s64 nsec) EXPORT_SYMBOL(ns_to_kernel_old_timeval); /** - * set_normalized_timespec - set timespec sec and nsec parts and normalize + * set_normalized_timespec64 - set timespec sec and nsec parts and normalize * * @ts: pointer to timespec variable to be set * @sec: seconds to set @@ -526,7 +526,7 @@ struct timespec64 ns_to_timespec64(s64 nsec) EXPORT_SYMBOL(ns_to_timespec64); /** - * msecs_to_jiffies: - convert milliseconds to jiffies + * __msecs_to_jiffies: - convert milliseconds to jiffies * @m: time in milliseconds * * conversion is done as follows: @@ -541,12 +541,12 @@ EXPORT_SYMBOL(ns_to_timespec64); * handling any 32-bit overflows. * for the details see __msecs_to_jiffies() * - * msecs_to_jiffies() checks for the passed in value being a constant + * __msecs_to_jiffies() checks for the passed in value being a constant * via __builtin_constant_p() allowing gcc to eliminate most of the * code, __msecs_to_jiffies() is called if the value passed does not * allow constant folding and the actual conversion must be done at * runtime. - * the _msecs_to_jiffies helpers are the HZ dependent conversion + * The _msecs_to_jiffies helpers are the HZ dependent conversion * routines found in include/linux/jiffies.h */ unsigned long __msecs_to_jiffies(const unsigned int m) diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c index f72b9f1de178..5579ead449f2 100644 --- a/kernel/time/timekeeping.c +++ b/kernel/time/timekeeping.c @@ -1590,10 +1590,10 @@ void __weak read_persistent_clock64(struct timespec64 *ts) /** * read_persistent_wall_and_boot_offset - Read persistent clock, and also offset * from the boot. + * @wall_time: current time as returned by persistent clock + * @boot_offset: offset that is defined as wall_time - boot_time * * Weak dummy function for arches that do not yet support it. - * @wall_time: - current time as returned by persistent clock - * @boot_offset: - offset that is defined as wall_time - boot_time * * The default function calculates offset based on the current value of * local_clock(). This way architectures that support sched_clock() but don't @@ -1701,7 +1701,7 @@ static void __timekeeping_inject_sleeptime(struct timekeeper *tk, } #if defined(CONFIG_PM_SLEEP) && defined(CONFIG_RTC_HCTOSYS_DEVICE) -/** +/* * We have three kinds of time sources to use for sleep time * injection, the preference order is: * 1) non-stop clocksource @@ -1722,7 +1722,7 @@ bool timekeeping_rtc_skipresume(void) return !suspend_timing_needed; } -/** +/* * 1) can be determined whether to use or not only when doing * timekeeping_resume() which is invoked after rtc_suspend(), * so we can't skip rtc_suspend() surely if system has 1). -- cgit v1.2.3 From 7fb3ff22ad8772bbf0e3ce1ef3eb7b09f431807f Mon Sep 17 00:00:00 2001 From: Yair Podemsky Date: Wed, 30 Nov 2022 14:51:21 +0200 Subject: sched/core: Fix arch_scale_freq_tick() on tickless systems In order for the scheduler to be frequency invariant we measure the ratio between the maximum CPU frequency and the actual CPU frequency. During long tickless periods of time the calculations that keep track of that might overflow, in the function scale_freq_tick(): if (check_shl_overflow(acnt, 2*SCHED_CAPACITY_SHIFT, &acnt)) goto error; eventually forcing the kernel to disable the feature for all CPUs, and show the warning message: "Scheduler frequency invariance went wobbly, disabling!". Let's avoid that by limiting the frequency invariant calculations to CPUs with regular tick. Fixes: e2b0d619b400 ("x86, sched: check for counters overflow in frequency invariant accounting") Suggested-by: "Peter Zijlstra (Intel)" Signed-off-by: Yair Podemsky Signed-off-by: Ingo Molnar Reviewed-by: Valentin Schneider Acked-by: Giovanni Gherdovich Link: https://lore.kernel.org/r/20221130125121.34407-1-ypodemsk@redhat.com --- kernel/sched/core.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 25b582b6ee5f..965d813c28ad 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -5504,7 +5504,9 @@ void scheduler_tick(void) unsigned long thermal_pressure; u64 resched_latency; - arch_scale_freq_tick(); + if (housekeeping_cpu(cpu, HK_TYPE_TICK)) + arch_scale_freq_tick(); + sched_clock_tick(); rq_lock(rq, &rf); -- cgit v1.2.3 From 87ca4f9efbd7cc649ff43b87970888f2812945b8 Mon Sep 17 00:00:00 2001 From: Waiman Long Date: Fri, 30 Dec 2022 23:11:19 -0500 Subject: sched/core: Fix use-after-free bug in dup_user_cpus_ptr() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Since commit 07ec77a1d4e8 ("sched: Allow task CPU affinity to be restricted on asymmetric systems"), the setting and clearing of user_cpus_ptr are done under pi_lock for arm64 architecture. However, dup_user_cpus_ptr() accesses user_cpus_ptr without any lock protection. Since sched_setaffinity() can be invoked from another process, the process being modified may be undergoing fork() at the same time. When racing with the clearing of user_cpus_ptr in __set_cpus_allowed_ptr_locked(), it can lead to user-after-free and possibly double-free in arm64 kernel. Commit 8f9ea86fdf99 ("sched: Always preserve the user requested cpumask") fixes this problem as user_cpus_ptr, once set, will never be cleared in a task's lifetime. However, this bug was re-introduced in commit 851a723e45d1 ("sched: Always clear user_cpus_ptr in do_set_cpus_allowed()") which allows the clearing of user_cpus_ptr in do_set_cpus_allowed(). This time, it will affect all arches. Fix this bug by always clearing the user_cpus_ptr of the newly cloned/forked task before the copying process starts and check the user_cpus_ptr state of the source task under pi_lock. Note to stable, this patch won't be applicable to stable releases. Just copy the new dup_user_cpus_ptr() function over. Fixes: 07ec77a1d4e8 ("sched: Allow task CPU affinity to be restricted on asymmetric systems") Fixes: 851a723e45d1 ("sched: Always clear user_cpus_ptr in do_set_cpus_allowed()") Reported-by: David Wang 王标 Signed-off-by: Waiman Long Signed-off-by: Ingo Molnar Reviewed-by: Peter Zijlstra Cc: stable@vger.kernel.org Link: https://lore.kernel.org/r/20221231041120.440785-2-longman@redhat.com --- kernel/sched/core.c | 34 +++++++++++++++++++++++++++++----- 1 file changed, 29 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 965d813c28ad..f9f6e5413dcf 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -2612,19 +2612,43 @@ void do_set_cpus_allowed(struct task_struct *p, const struct cpumask *new_mask) int dup_user_cpus_ptr(struct task_struct *dst, struct task_struct *src, int node) { + cpumask_t *user_mask; unsigned long flags; - if (!src->user_cpus_ptr) + /* + * Always clear dst->user_cpus_ptr first as their user_cpus_ptr's + * may differ by now due to racing. + */ + dst->user_cpus_ptr = NULL; + + /* + * This check is racy and losing the race is a valid situation. + * It is not worth the extra overhead of taking the pi_lock on + * every fork/clone. + */ + if (data_race(!src->user_cpus_ptr)) return 0; - dst->user_cpus_ptr = kmalloc_node(cpumask_size(), GFP_KERNEL, node); - if (!dst->user_cpus_ptr) + user_mask = kmalloc_node(cpumask_size(), GFP_KERNEL, node); + if (!user_mask) return -ENOMEM; - /* Use pi_lock to protect content of user_cpus_ptr */ + /* + * Use pi_lock to protect content of user_cpus_ptr + * + * Though unlikely, user_cpus_ptr can be reset to NULL by a concurrent + * do_set_cpus_allowed(). + */ raw_spin_lock_irqsave(&src->pi_lock, flags); - cpumask_copy(dst->user_cpus_ptr, src->user_cpus_ptr); + if (src->user_cpus_ptr) { + swap(dst->user_cpus_ptr, user_mask); + cpumask_copy(dst->user_cpus_ptr, src->user_cpus_ptr); + } raw_spin_unlock_irqrestore(&src->pi_lock, flags); + + if (unlikely(user_mask)) + kfree(user_mask); + return 0; } -- cgit v1.2.3 From 9a5418bc48babb313d2a62df29ebe21ce8c06c59 Mon Sep 17 00:00:00 2001 From: Waiman Long Date: Fri, 30 Dec 2022 23:11:20 -0500 Subject: sched/core: Use kfree_rcu() in do_set_cpus_allowed() Commit 851a723e45d1 ("sched: Always clear user_cpus_ptr in do_set_cpus_allowed()") may call kfree() if user_cpus_ptr was previously set. Unfortunately, some of the callers of do_set_cpus_allowed() may have pi_lock held when calling it. So the following splats may be printed especially when running with a PREEMPT_RT kernel: WARNING: possible circular locking dependency detected BUG: sleeping function called from invalid context To avoid these problems, kfree_rcu() is used instead. An internal cpumask_rcuhead union is created for the sole purpose of facilitating the use of kfree_rcu() to free the cpumask. Since user_cpus_ptr is not being used in non-SMP configs, the newly introduced alloc_user_cpus_ptr() helper will return NULL in this case and sched_setaffinity() is modified to handle this special case. Fixes: 851a723e45d1 ("sched: Always clear user_cpus_ptr in do_set_cpus_allowed()") Suggested-by: Peter Zijlstra Signed-off-by: Waiman Long Signed-off-by: Ingo Molnar Reviewed-by: Peter Zijlstra Link: https://lore.kernel.org/r/20221231041120.440785-3-longman@redhat.com --- kernel/sched/core.c | 33 +++++++++++++++++++++++++++++---- 1 file changed, 29 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/core.c b/kernel/sched/core.c index f9f6e5413dcf..bb1ee6d7bdde 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -2604,9 +2604,29 @@ void do_set_cpus_allowed(struct task_struct *p, const struct cpumask *new_mask) .user_mask = NULL, .flags = SCA_USER, /* clear the user requested mask */ }; + union cpumask_rcuhead { + cpumask_t cpumask; + struct rcu_head rcu; + }; __do_set_cpus_allowed(p, &ac); - kfree(ac.user_mask); + + /* + * Because this is called with p->pi_lock held, it is not possible + * to use kfree() here (when PREEMPT_RT=y), therefore punt to using + * kfree_rcu(). + */ + kfree_rcu((union cpumask_rcuhead *)ac.user_mask, rcu); +} + +static cpumask_t *alloc_user_cpus_ptr(int node) +{ + /* + * See do_set_cpus_allowed() above for the rcu_head usage. + */ + int size = max_t(int, cpumask_size(), sizeof(struct rcu_head)); + + return kmalloc_node(size, GFP_KERNEL, node); } int dup_user_cpus_ptr(struct task_struct *dst, struct task_struct *src, @@ -2629,7 +2649,7 @@ int dup_user_cpus_ptr(struct task_struct *dst, struct task_struct *src, if (data_race(!src->user_cpus_ptr)) return 0; - user_mask = kmalloc_node(cpumask_size(), GFP_KERNEL, node); + user_mask = alloc_user_cpus_ptr(node); if (!user_mask) return -ENOMEM; @@ -3605,6 +3625,11 @@ static inline bool rq_has_pinned_tasks(struct rq *rq) return false; } +static inline cpumask_t *alloc_user_cpus_ptr(int node) +{ + return NULL; +} + #endif /* !CONFIG_SMP */ static void @@ -8265,8 +8290,8 @@ long sched_setaffinity(pid_t pid, const struct cpumask *in_mask) if (retval) goto out_put_task; - user_mask = kmalloc(cpumask_size(), GFP_KERNEL); - if (!user_mask) { + user_mask = alloc_user_cpus_ptr(NUMA_NO_NODE); + if (IS_ENABLED(CONFIG_SMP) && !user_mask) { retval = -ENOMEM; goto out_put_task; } -- cgit v1.2.3 From da35048f2600633a7f9ba5fa7d6e3b1d0195938b Mon Sep 17 00:00:00 2001 From: Nicholas Piggin Date: Thu, 12 Jan 2023 20:54:26 +1000 Subject: kallsyms: Fix scheduling with interrupts disabled in self-test kallsyms_on_each* may schedule so must not be called with interrupts disabled. The iteration function could disable interrupts, but this also changes lookup_symbol() to match the change to the other timing code. Reported-by: Erhard F. Link: https://lore.kernel.org/all/bug-216902-206035@https.bugzilla.kernel.org%2F/ Reported-by: kernel test robot Link: https://lore.kernel.org/oe-lkp/202212251728.8d0872ff-oliver.sang@intel.com Fixes: 30f3bb09778d ("kallsyms: Add self-test facility") Tested-by: "Erhard F." Signed-off-by: Nicholas Piggin Signed-off-by: Luis Chamberlain --- kernel/kallsyms_selftest.c | 21 ++++++--------------- 1 file changed, 6 insertions(+), 15 deletions(-) (limited to 'kernel') diff --git a/kernel/kallsyms_selftest.c b/kernel/kallsyms_selftest.c index f35d9cc1aab1..bfbc12da3326 100644 --- a/kernel/kallsyms_selftest.c +++ b/kernel/kallsyms_selftest.c @@ -157,14 +157,11 @@ static void test_kallsyms_compression_ratio(void) static int lookup_name(void *data, const char *name, struct module *mod, unsigned long addr) { u64 t0, t1, t; - unsigned long flags; struct test_stat *stat = (struct test_stat *)data; - local_irq_save(flags); - t0 = sched_clock(); + t0 = ktime_get_ns(); (void)kallsyms_lookup_name(name); - t1 = sched_clock(); - local_irq_restore(flags); + t1 = ktime_get_ns(); t = t1 - t0; if (t < stat->min) @@ -234,18 +231,15 @@ static int find_symbol(void *data, const char *name, struct module *mod, unsigne static void test_perf_kallsyms_on_each_symbol(void) { u64 t0, t1; - unsigned long flags; struct test_stat stat; memset(&stat, 0, sizeof(stat)); stat.max = INT_MAX; stat.name = stub_name; stat.perf = 1; - local_irq_save(flags); - t0 = sched_clock(); + t0 = ktime_get_ns(); kallsyms_on_each_symbol(find_symbol, &stat); - t1 = sched_clock(); - local_irq_restore(flags); + t1 = ktime_get_ns(); pr_info("kallsyms_on_each_symbol() traverse all: %lld ns\n", t1 - t0); } @@ -270,17 +264,14 @@ static int match_symbol(void *data, unsigned long addr) static void test_perf_kallsyms_on_each_match_symbol(void) { u64 t0, t1; - unsigned long flags; struct test_stat stat; memset(&stat, 0, sizeof(stat)); stat.max = INT_MAX; stat.name = stub_name; - local_irq_save(flags); - t0 = sched_clock(); + t0 = ktime_get_ns(); kallsyms_on_each_match_symbol(match_symbol, stat.name, &stat); - t1 = sched_clock(); - local_irq_restore(flags); + t1 = ktime_get_ns(); pr_info("kallsyms_on_each_match_symbol() traverse all: %lld ns\n", t1 - t0); } -- cgit v1.2.3