From ee3e3ac05c2631ce1f12d88c9cc9a092f8fe947a Mon Sep 17 00:00:00 2001 From: Mathieu Desnoyers Date: Tue, 22 Nov 2022 15:39:05 -0500 Subject: rseq: Introduce extensible rseq ABI Introduce the extensible rseq ABI, where the feature size supported by the kernel and the required alignment are communicated to user-space through ELF auxiliary vectors. This allows user-space to call rseq registration with a rseq_len of either 32 bytes for the original struct rseq size (which includes padding), or larger. If rseq_len is larger than 32 bytes, then it must be large enough to contain the feature size communicated to user-space through ELF auxiliary vectors. Signed-off-by: Mathieu Desnoyers Signed-off-by: Peter Zijlstra (Intel) Link: https://lore.kernel.org/r/20221122203932.231377-4-mathieu.desnoyers@efficios.com --- include/linux/sched.h | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'include/linux') diff --git a/include/linux/sched.h b/include/linux/sched.h index 853d08f7562b..e0bc020a63a9 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -1302,6 +1302,7 @@ struct task_struct { #ifdef CONFIG_RSEQ struct rseq __user *rseq; + u32 rseq_len; u32 rseq_sig; /* * RmW on rseq_event_mask must be performed atomically @@ -2352,10 +2353,12 @@ static inline void rseq_fork(struct task_struct *t, unsigned long clone_flags) { if (clone_flags & CLONE_VM) { t->rseq = NULL; + t->rseq_len = 0; t->rseq_sig = 0; t->rseq_event_mask = 0; } else { t->rseq = current->rseq; + t->rseq_len = current->rseq_len; t->rseq_sig = current->rseq_sig; t->rseq_event_mask = current->rseq_event_mask; } @@ -2364,6 +2367,7 @@ static inline void rseq_fork(struct task_struct *t, unsigned long clone_flags) static inline void rseq_execve(struct task_struct *t) { t->rseq = NULL; + t->rseq_len = 0; t->rseq_sig = 0; t->rseq_event_mask = 0; } -- cgit v1.2.3 From af7f588d8f7355bc4298dd1962d7826358fc95f0 Mon Sep 17 00:00:00 2001 From: Mathieu Desnoyers Date: Tue, 22 Nov 2022 15:39:09 -0500 Subject: sched: Introduce per-memory-map concurrency ID MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This feature allows the scheduler to expose a per-memory map concurrency ID to user-space. This concurrency ID is within the possible cpus range, and is temporarily (and uniquely) assigned while threads are actively running within a memory map. If a memory map has fewer threads than cores, or is limited to run on few cores concurrently through sched affinity or cgroup cpusets, the concurrency IDs will be values close to 0, thus allowing efficient use of user-space memory for per-cpu data structures. This feature is meant to be exposed by a new rseq thread area field. The primary purpose of this feature is to do the heavy-lifting needed by memory allocators to allow them to use per-cpu data structures efficiently in the following situations: - Single-threaded applications, - Multi-threaded applications on large systems (many cores) with limited cpu affinity mask, - Multi-threaded applications on large systems (many cores) with restricted cgroup cpuset per container. One of the key concern from scheduler maintainers is the overhead associated with additional spin locks or atomic operations in the scheduler fast-path. This is why the following optimization is implemented. On context switch between threads belonging to the same memory map, transfer the mm_cid from prev to next without any atomic ops. This takes care of use-cases involving frequent context switch between threads belonging to the same memory map. Additional optimizations can be done if the spin locks added when context switching between threads belonging to different memory maps end up being a performance bottleneck. Those are left out of this patch though. A performance impact would have to be clearly demonstrated to justify the added complexity. The credit goes to Paul Turner (Google) for the original virtual cpu id idea. This feature is implemented based on the discussions with Paul Turner and Peter Oskolkov (Google), but I took the liberty to implement scheduler fast-path optimizations and my own NUMA-awareness scheme. The rumor has it that Google have been running a rseq vcpu_id extension internally in production for a year. The tcmalloc source code indeed has comments hinting at a vcpu_id prototype extension to the rseq system call [1]. The following benchmarks do not show any significant overhead added to the scheduler context switch by this feature: * perf bench sched messaging (process) Baseline: 86.5±0.3 ms With mm_cid: 86.7±2.6 ms * perf bench sched messaging (threaded) Baseline: 84.3±3.0 ms With mm_cid: 84.7±2.6 ms * hackbench (process) Baseline: 82.9±2.7 ms With mm_cid: 82.9±2.9 ms * hackbench (threaded) Baseline: 85.2±2.6 ms With mm_cid: 84.4±2.9 ms [1] https://github.com/google/tcmalloc/blob/master/tcmalloc/internal/linux_syscall_support.h#L26 Signed-off-by: Mathieu Desnoyers Signed-off-by: Peter Zijlstra (Intel) Link: https://lore.kernel.org/r/20221122203932.231377-8-mathieu.desnoyers@efficios.com --- include/linux/mm.h | 25 +++++++++++++++++++++++++ include/linux/mm_types.h | 43 ++++++++++++++++++++++++++++++++++++++++++- include/linux/sched.h | 5 +++++ 3 files changed, 72 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/mm.h b/include/linux/mm.h index f3f196e4d66d..cf008c26a883 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -1976,6 +1976,31 @@ struct zap_details { /* Set in unmap_vmas() to indicate a final unmap call. Only used by hugetlb */ #define ZAP_FLAG_UNMAP ((__force zap_flags_t) BIT(1)) +#ifdef CONFIG_SCHED_MM_CID +void sched_mm_cid_before_execve(struct task_struct *t); +void sched_mm_cid_after_execve(struct task_struct *t); +void sched_mm_cid_fork(struct task_struct *t); +void sched_mm_cid_exit_signals(struct task_struct *t); +static inline int task_mm_cid(struct task_struct *t) +{ + return t->mm_cid; +} +#else +static inline void sched_mm_cid_before_execve(struct task_struct *t) { } +static inline void sched_mm_cid_after_execve(struct task_struct *t) { } +static inline void sched_mm_cid_fork(struct task_struct *t) { } +static inline void sched_mm_cid_exit_signals(struct task_struct *t) { } +static inline int task_mm_cid(struct task_struct *t) +{ + /* + * Use the processor id as a fall-back when the mm cid feature is + * disabled. This provides functional per-cpu data structure accesses + * in user-space, althrough it won't provide the memory usage benefits. + */ + return raw_smp_processor_id(); +} +#endif + #ifdef CONFIG_MMU extern bool can_do_mlock(void); #else diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h index 3b8475007734..1c3bf76063d2 100644 --- a/include/linux/mm_types.h +++ b/include/linux/mm_types.h @@ -645,7 +645,18 @@ struct mm_struct { * &struct mm_struct is freed. */ atomic_t mm_count; - +#ifdef CONFIG_SCHED_MM_CID + /** + * @cid_lock: Protect cid bitmap updates vs lookups. + * + * Prevent situations where updates to the cid bitmap happen + * concurrently with lookups. Those can lead to situations + * where a lookup cannot find a free bit simply because it was + * unlucky enough to load, non-atomically, bitmap words as they + * were being concurrently updated by the updaters. + */ + raw_spinlock_t cid_lock; +#endif #ifdef CONFIG_MMU atomic_long_t pgtables_bytes; /* PTE page table pages */ #endif @@ -909,6 +920,36 @@ static inline void vma_iter_init(struct vma_iterator *vmi, vmi->mas.node = MAS_START; } +#ifdef CONFIG_SCHED_MM_CID +/* Accessor for struct mm_struct's cidmask. */ +static inline cpumask_t *mm_cidmask(struct mm_struct *mm) +{ + unsigned long cid_bitmap = (unsigned long)mm; + + cid_bitmap += offsetof(struct mm_struct, cpu_bitmap); + /* Skip cpu_bitmap */ + cid_bitmap += cpumask_size(); + return (struct cpumask *)cid_bitmap; +} + +static inline void mm_init_cid(struct mm_struct *mm) +{ + raw_spin_lock_init(&mm->cid_lock); + cpumask_clear(mm_cidmask(mm)); +} + +static inline unsigned int mm_cid_size(void) +{ + return cpumask_size(); +} +#else /* CONFIG_SCHED_MM_CID */ +static inline void mm_init_cid(struct mm_struct *mm) { } +static inline unsigned int mm_cid_size(void) +{ + return 0; +} +#endif /* CONFIG_SCHED_MM_CID */ + struct mmu_gather; extern void tlb_gather_mmu(struct mmu_gather *tlb, struct mm_struct *mm); extern void tlb_gather_mmu_fullmm(struct mmu_gather *tlb, struct mm_struct *mm); diff --git a/include/linux/sched.h b/include/linux/sched.h index e0bc020a63a9..4df2b3e76b30 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -1311,6 +1311,11 @@ struct task_struct { unsigned long rseq_event_mask; #endif +#ifdef CONFIG_SCHED_MM_CID + int mm_cid; /* Current cid in mm */ + int mm_cid_active; /* Whether cid bitmap is active */ +#endif + struct tlbflush_unmap_batch tlb_ubc; union { -- cgit v1.2.3 From c89970202a1153b2fc230e89f90c180bd5bcbcef Mon Sep 17 00:00:00 2001 From: Nicholas Piggin Date: Tue, 20 Dec 2022 17:07:05 +1000 Subject: cputime: remove cputime_to_nsecs fallback The archs that use cputime_to_nsecs() internally provide their own definition and don't need the fallback. cputime_to_usecs() unused except in this fallback, and is not defined anywhere. This removes the final remnant of the cputime_t code from the kernel. Signed-off-by: Nicholas Piggin Signed-off-by: Peter Zijlstra (Intel) Acked-by: Alexander Gordeev Link: https://lore.kernel.org/r/20221220070705.2958959-1-npiggin@gmail.com --- include/linux/sched/cputime.h | 9 --------- 1 file changed, 9 deletions(-) (limited to 'include/linux') diff --git a/include/linux/sched/cputime.h b/include/linux/sched/cputime.h index ce3c58286062..5f8fd5b24a2e 100644 --- a/include/linux/sched/cputime.h +++ b/include/linux/sched/cputime.h @@ -8,15 +8,6 @@ * cputime accounting APIs: */ -#ifdef CONFIG_VIRT_CPU_ACCOUNTING_NATIVE -#include - -#ifndef cputime_to_nsecs -# define cputime_to_nsecs(__ct) \ - (cputime_to_usecs(__ct) * NSEC_PER_USEC) -#endif -#endif /* CONFIG_VIRT_CPU_ACCOUNTING_NATIVE */ - #ifdef CONFIG_VIRT_CPU_ACCOUNTING_GEN extern bool task_cputime(struct task_struct *t, u64 *utime, u64 *stime); -- cgit v1.2.3 From 28c8e088427ad30b4260953f3b6f908972b77c2d Mon Sep 17 00:00:00 2001 From: Mathieu Desnoyers Date: Wed, 4 Jan 2023 14:20:54 -0500 Subject: rseq: Increase AT_VECTOR_SIZE_BASE to match rseq auxvec entries Two new auxiliary vector entries are introduced for rseq without matching increment of the AT_VECTOR_SIZE_BASE, which causes failures with CONFIG_HARDENED_USERCOPY=y. Fixes: 317c8194e6ae ("rseq: Introduce feature size and alignment ELF auxiliary vector entries") Reported-by: Nathan Chancellor Signed-off-by: Mathieu Desnoyers Signed-off-by: Peter Zijlstra (Intel) Tested-by: Nathan Chancellor Link: https://lore.kernel.org/r/20230104192054.34046-1-mathieu.desnoyers@efficios.com --- include/linux/auxvec.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/auxvec.h b/include/linux/auxvec.h index f68d0ec2d740..407f7005e6d6 100644 --- a/include/linux/auxvec.h +++ b/include/linux/auxvec.h @@ -4,6 +4,6 @@ #include -#define AT_VECTOR_SIZE_BASE 20 /* NEW_AUX_ENT entries in auxiliary table */ +#define AT_VECTOR_SIZE_BASE 22 /* NEW_AUX_ENT entries in auxiliary table */ /* number of "#define AT_.*" above, minus {AT_NULL, AT_IGNORE, AT_NOTELF} */ #endif /* _LINUX_AUXVEC_H */ -- cgit v1.2.3 From 0c5ffc3d7b15978c6b184938cd6b8af06e436424 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Thu, 12 Jan 2023 20:43:26 +0100 Subject: cpuidle, dt: Push RCU-idle into driver Doing RCU-idle outside the driver, only to then temporarily enable it again before going idle is suboptimal. Notably: this converts all dt_init_idle_driver() and __CPU_PM_CPU_IDLE_ENTER() users for they are inextrably intertwined. Signed-off-by: Peter Zijlstra (Intel) Signed-off-by: Ingo Molnar Tested-by: Tony Lindgren Tested-by: Ulf Hansson Acked-by: Rafael J. Wysocki Acked-by: Frederic Weisbecker Link: https://lore.kernel.org/r/20230112195540.068981667@infradead.org --- include/linux/cpuidle.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/linux') diff --git a/include/linux/cpuidle.h b/include/linux/cpuidle.h index fce476275e16..0ddc11e44302 100644 --- a/include/linux/cpuidle.h +++ b/include/linux/cpuidle.h @@ -289,7 +289,9 @@ extern s64 cpuidle_governor_latency_req(unsigned int cpu); if (!is_retention) \ __ret = cpu_pm_enter(); \ if (!__ret) { \ + ct_idle_enter(); \ __ret = low_level_idle_enter(state); \ + ct_idle_exit(); \ if (!is_retention) \ cpu_pm_exit(); \ } \ -- cgit v1.2.3 From a01353cf1896ea5b8a7bbc5e2b2d38feed8b7aaa Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Thu, 12 Jan 2023 20:43:27 +0100 Subject: cpuidle: Fix ct_idle_*() usage The whole disable-RCU, enable-IRQS dance is very intricate since changing IRQ state is traced, which depends on RCU. Add two helpers for the cpuidle case that mirror the entry code: ct_cpuidle_enter() ct_cpuidle_exit() And fix all the cases where the enter/exit dance was buggy. Signed-off-by: Peter Zijlstra (Intel) Signed-off-by: Ingo Molnar Tested-by: Tony Lindgren Tested-by: Ulf Hansson Acked-by: Rafael J. Wysocki Acked-by: Frederic Weisbecker Link: https://lore.kernel.org/r/20230112195540.130014793@infradead.org --- include/linux/clockchips.h | 4 ++-- include/linux/cpuidle.h | 34 ++++++++++++++++++++++++++++++++-- 2 files changed, 34 insertions(+), 4 deletions(-) (limited to 'include/linux') diff --git a/include/linux/clockchips.h b/include/linux/clockchips.h index 8ae9a95ebf5b..9aac31d856f3 100644 --- a/include/linux/clockchips.h +++ b/include/linux/clockchips.h @@ -211,7 +211,7 @@ extern int tick_receive_broadcast(void); extern void tick_setup_hrtimer_broadcast(void); extern int tick_check_broadcast_expired(void); # else -static inline int tick_check_broadcast_expired(void) { return 0; } +static __always_inline int tick_check_broadcast_expired(void) { return 0; } static inline void tick_setup_hrtimer_broadcast(void) { } # endif @@ -219,7 +219,7 @@ static inline void tick_setup_hrtimer_broadcast(void) { } static inline void clockevents_suspend(void) { } static inline void clockevents_resume(void) { } -static inline int tick_check_broadcast_expired(void) { return 0; } +static __always_inline int tick_check_broadcast_expired(void) { return 0; } static inline void tick_setup_hrtimer_broadcast(void) { } #endif /* !CONFIG_GENERIC_CLOCKEVENTS */ diff --git a/include/linux/cpuidle.h b/include/linux/cpuidle.h index 0ddc11e44302..630c879143c7 100644 --- a/include/linux/cpuidle.h +++ b/include/linux/cpuidle.h @@ -14,6 +14,7 @@ #include #include #include +#include #define CPUIDLE_STATE_MAX 10 #define CPUIDLE_NAME_LEN 16 @@ -115,6 +116,35 @@ struct cpuidle_device { DECLARE_PER_CPU(struct cpuidle_device *, cpuidle_devices); DECLARE_PER_CPU(struct cpuidle_device, cpuidle_dev); +static __always_inline void ct_cpuidle_enter(void) +{ + lockdep_assert_irqs_disabled(); + /* + * Idle is allowed to (temporary) enable IRQs. It + * will return with IRQs disabled. + * + * Trace IRQs enable here, then switch off RCU, and have + * arch_cpu_idle() use raw_local_irq_enable(). Note that + * ct_idle_enter() relies on lockdep IRQ state, so switch that + * last -- this is very similar to the entry code. + */ + trace_hardirqs_on_prepare(); + lockdep_hardirqs_on_prepare(); + instrumentation_end(); + ct_idle_enter(); + lockdep_hardirqs_on(_RET_IP_); +} + +static __always_inline void ct_cpuidle_exit(void) +{ + /* + * Carefully undo the above. + */ + lockdep_hardirqs_off(_RET_IP_); + ct_idle_exit(); + instrumentation_begin(); +} + /**************************** * CPUIDLE DRIVER INTERFACE * ****************************/ @@ -289,9 +319,9 @@ extern s64 cpuidle_governor_latency_req(unsigned int cpu); if (!is_retention) \ __ret = cpu_pm_enter(); \ if (!__ret) { \ - ct_idle_enter(); \ + ct_cpuidle_enter(); \ __ret = low_level_idle_enter(state); \ - ct_idle_exit(); \ + ct_cpuidle_exit(); \ if (!is_retention) \ cpu_pm_exit(); \ } \ -- cgit v1.2.3 From 2b5a0e425e6e319b1978db1e9564f6af4228a567 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Thu, 12 Jan 2023 20:43:31 +0100 Subject: objtool/idle: Validate __cpuidle code as noinstr Idle code is very like entry code in that RCU isn't available. As such, add a little validation. Signed-off-by: Peter Zijlstra (Intel) Signed-off-by: Ingo Molnar Tested-by: Tony Lindgren Tested-by: Ulf Hansson Acked-by: Geert Uytterhoeven Acked-by: Rafael J. Wysocki Acked-by: Frederic Weisbecker Link: https://lore.kernel.org/r/20230112195540.373461409@infradead.org --- include/linux/compiler_types.h | 8 ++++++-- include/linux/cpu.h | 3 --- 2 files changed, 6 insertions(+), 5 deletions(-) (limited to 'include/linux') diff --git a/include/linux/compiler_types.h b/include/linux/compiler_types.h index 7c1afe0f4129..d7858901f035 100644 --- a/include/linux/compiler_types.h +++ b/include/linux/compiler_types.h @@ -232,11 +232,15 @@ struct ftrace_likely_data { #endif /* Section for code which can't be instrumented at all */ -#define noinstr \ - noinline notrace __attribute((__section__(".noinstr.text"))) \ +#define __noinstr_section(section) \ + noinline notrace __attribute((__section__(section))) \ __no_kcsan __no_sanitize_address __no_profile __no_sanitize_coverage \ __no_sanitize_memory +#define noinstr __noinstr_section(".noinstr.text") + +#define __cpuidle __noinstr_section(".cpuidle.text") + #endif /* __KERNEL__ */ #endif /* __ASSEMBLY__ */ diff --git a/include/linux/cpu.h b/include/linux/cpu.h index 314802f98b9d..f83e4519c5f0 100644 --- a/include/linux/cpu.h +++ b/include/linux/cpu.h @@ -176,9 +176,6 @@ void __noreturn cpu_startup_entry(enum cpuhp_state state); void cpu_idle_poll_ctrl(bool enable); -/* Attach to any functions which should be considered cpuidle. */ -#define __cpuidle __section(".cpuidle.text") - bool cpu_in_idle(unsigned long pc); void arch_cpu_idle(void); -- cgit v1.2.3 From e4df1511e1f4bcaa0d590aa7bbffe8bbbd6dfb49 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Thu, 12 Jan 2023 20:43:41 +0100 Subject: cpuidle, sched: Remove instrumentation from TIF_{POLLING_NRFLAG,NEED_RESCHED} objtool pointed out that various idle-TIF management methods have instrumentation: vmlinux.o: warning: objtool: mwait_idle+0x5: call to current_set_polling_and_test() leaves .noinstr.text section vmlinux.o: warning: objtool: acpi_processor_ffh_cstate_enter+0xc5: call to current_set_polling_and_test() leaves .noinstr.text section vmlinux.o: warning: objtool: cpu_idle_poll.isra.0+0x73: call to test_ti_thread_flag() leaves .noinstr.text section vmlinux.o: warning: objtool: intel_idle+0xbc: call to current_set_polling_and_test() leaves .noinstr.text section vmlinux.o: warning: objtool: intel_idle_irq+0xea: call to current_set_polling_and_test() leaves .noinstr.text section vmlinux.o: warning: objtool: intel_idle_s2idle+0xb4: call to current_set_polling_and_test() leaves .noinstr.text section vmlinux.o: warning: objtool: intel_idle+0xa6: call to current_clr_polling() leaves .noinstr.text section vmlinux.o: warning: objtool: intel_idle_irq+0xbf: call to current_clr_polling() leaves .noinstr.text section vmlinux.o: warning: objtool: intel_idle_s2idle+0xa1: call to current_clr_polling() leaves .noinstr.text section vmlinux.o: warning: objtool: mwait_idle+0xe: call to __current_set_polling() leaves .noinstr.text section vmlinux.o: warning: objtool: acpi_processor_ffh_cstate_enter+0xc5: call to __current_set_polling() leaves .noinstr.text section vmlinux.o: warning: objtool: cpu_idle_poll.isra.0+0x73: call to test_ti_thread_flag() leaves .noinstr.text section vmlinux.o: warning: objtool: intel_idle+0xbc: call to __current_set_polling() leaves .noinstr.text section vmlinux.o: warning: objtool: intel_idle_irq+0xea: call to __current_set_polling() leaves .noinstr.text section vmlinux.o: warning: objtool: intel_idle_s2idle+0xb4: call to __current_set_polling() leaves .noinstr.text section vmlinux.o: warning: objtool: cpu_idle_poll.isra.0+0x73: call to test_ti_thread_flag() leaves .noinstr.text section vmlinux.o: warning: objtool: intel_idle_s2idle+0x73: call to test_ti_thread_flag.constprop.0() leaves .noinstr.text section vmlinux.o: warning: objtool: intel_idle_irq+0x91: call to test_ti_thread_flag.constprop.0() leaves .noinstr.text section vmlinux.o: warning: objtool: intel_idle+0x78: call to test_ti_thread_flag.constprop.0() leaves .noinstr.text section vmlinux.o: warning: objtool: acpi_safe_halt+0xf: call to test_ti_thread_flag.constprop.0() leaves .noinstr.text section Remove the instrumentation, because these methods are used in low-level cpuidle code moving between states, that should not be instrumented. Signed-off-by: Peter Zijlstra (Intel) Signed-off-by: Ingo Molnar Tested-by: Tony Lindgren Tested-by: Ulf Hansson Acked-by: Rafael J. Wysocki Acked-by: Frederic Weisbecker Link: https://lore.kernel.org/r/20230112195540.988741683@infradead.org --- include/linux/sched/idle.h | 40 ++++++++++++++++++++++++++++++---------- include/linux/thread_info.h | 18 +++++++++++++++++- 2 files changed, 47 insertions(+), 11 deletions(-) (limited to 'include/linux') diff --git a/include/linux/sched/idle.h b/include/linux/sched/idle.h index d73d314d59c6..478084f9105e 100644 --- a/include/linux/sched/idle.h +++ b/include/linux/sched/idle.h @@ -23,12 +23,37 @@ static inline void wake_up_if_idle(int cpu) { } */ #ifdef TIF_POLLING_NRFLAG -static inline void __current_set_polling(void) +#ifdef _ASM_GENERIC_BITOPS_INSTRUMENTED_ATOMIC_H + +static __always_inline void __current_set_polling(void) { - set_thread_flag(TIF_POLLING_NRFLAG); + arch_set_bit(TIF_POLLING_NRFLAG, + (unsigned long *)(¤t_thread_info()->flags)); } -static inline bool __must_check current_set_polling_and_test(void) +static __always_inline void __current_clr_polling(void) +{ + arch_clear_bit(TIF_POLLING_NRFLAG, + (unsigned long *)(¤t_thread_info()->flags)); +} + +#else + +static __always_inline void __current_set_polling(void) +{ + set_bit(TIF_POLLING_NRFLAG, + (unsigned long *)(¤t_thread_info()->flags)); +} + +static __always_inline void __current_clr_polling(void) +{ + clear_bit(TIF_POLLING_NRFLAG, + (unsigned long *)(¤t_thread_info()->flags)); +} + +#endif /* _ASM_GENERIC_BITOPS_INSTRUMENTED_ATOMIC_H */ + +static __always_inline bool __must_check current_set_polling_and_test(void) { __current_set_polling(); @@ -41,12 +66,7 @@ static inline bool __must_check current_set_polling_and_test(void) return unlikely(tif_need_resched()); } -static inline void __current_clr_polling(void) -{ - clear_thread_flag(TIF_POLLING_NRFLAG); -} - -static inline bool __must_check current_clr_polling_and_test(void) +static __always_inline bool __must_check current_clr_polling_and_test(void) { __current_clr_polling(); @@ -73,7 +93,7 @@ static inline bool __must_check current_clr_polling_and_test(void) } #endif -static inline void current_clr_polling(void) +static __always_inline void current_clr_polling(void) { __current_clr_polling(); diff --git a/include/linux/thread_info.h b/include/linux/thread_info.h index 9f392ec76f2b..c02646884fa8 100644 --- a/include/linux/thread_info.h +++ b/include/linux/thread_info.h @@ -177,7 +177,23 @@ static __always_inline unsigned long read_ti_thread_flags(struct thread_info *ti clear_ti_thread_flag(task_thread_info(t), TIF_##fl) #endif /* !CONFIG_GENERIC_ENTRY */ -#define tif_need_resched() test_thread_flag(TIF_NEED_RESCHED) +#ifdef _ASM_GENERIC_BITOPS_INSTRUMENTED_NON_ATOMIC_H + +static __always_inline bool tif_need_resched(void) +{ + return arch_test_bit(TIF_NEED_RESCHED, + (unsigned long *)(¤t_thread_info()->flags)); +} + +#else + +static __always_inline bool tif_need_resched(void) +{ + return test_bit(TIF_NEED_RESCHED, + (unsigned long *)(¤t_thread_info()->flags)); +} + +#endif /* _ASM_GENERIC_BITOPS_INSTRUMENTED_NON_ATOMIC_H */ #ifndef CONFIG_HAVE_ARCH_WITHIN_STACK_FRAMES static inline int arch_within_stack_frames(const void * const stack, -- cgit v1.2.3 From 6a123d6ae6ea930b9bb3c595ceac2b2f93039f67 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Thu, 12 Jan 2023 20:43:46 +0100 Subject: cpuidle, ACPI: Make noinstr clean objtool found cases where ACPI methods called out into instrumentation code: vmlinux.o: warning: objtool: io_idle+0xc: call to __inb.isra.0() leaves .noinstr.text section vmlinux.o: warning: objtool: acpi_idle_enter+0xfe: call to num_online_cpus() leaves .noinstr.text section vmlinux.o: warning: objtool: acpi_idle_enter+0x115: call to acpi_idle_fallback_to_c1.isra.0() leaves .noinstr.text section Fix this by: marking the IO in/out, acpi_idle_fallback_to_c1() and num_online_cpus() methods as __always_inline. Signed-off-by: Peter Zijlstra (Intel) Signed-off-by: Ingo Molnar Tested-by: Tony Lindgren Tested-by: Ulf Hansson Acked-by: Rafael J. Wysocki Acked-by: Frederic Weisbecker Link: https://lore.kernel.org/r/20230112195541.294846301@infradead.org --- include/linux/cpumask.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/cpumask.h b/include/linux/cpumask.h index c2aa0aa26b45..d45e5de13721 100644 --- a/include/linux/cpumask.h +++ b/include/linux/cpumask.h @@ -1017,9 +1017,9 @@ static inline const struct cpumask *get_cpu_mask(unsigned int cpu) * concurrent CPU hotplug operations unless invoked from a cpuhp_lock held * region. */ -static inline unsigned int num_online_cpus(void) +static __always_inline unsigned int num_online_cpus(void) { - return atomic_read(&__num_online_cpus); + return arch_atomic_read(&__num_online_cpus); } #define num_possible_cpus() cpumask_weight(cpu_possible_mask) #define num_present_cpus() cpumask_weight(cpu_present_mask) -- cgit v1.2.3 From 408b961146be4c1a776ce285c3c289afab15298a Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Thu, 12 Jan 2023 20:43:48 +0100 Subject: tracing: WARN on rcuidle ARCH_WANTS_NO_INSTR (a superset of CONFIG_GENERIC_ENTRY) disallows any and all tracing when RCU isn't enabled. Signed-off-by: Peter Zijlstra (Intel) Signed-off-by: Ingo Molnar Tested-by: Tony Lindgren Tested-by: Ulf Hansson Acked-by: Rafael J. Wysocki Acked-by: Frederic Weisbecker Link: https://lore.kernel.org/r/20230112195541.416110581@infradead.org --- include/linux/tracepoint.h | 15 +++++++++++++-- 1 file changed, 13 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/tracepoint.h b/include/linux/tracepoint.h index 4b33b95eb8be..552f80b8362f 100644 --- a/include/linux/tracepoint.h +++ b/include/linux/tracepoint.h @@ -177,6 +177,17 @@ static inline struct tracepoint *tracepoint_ptr_deref(tracepoint_ptr_t *p) #define __DO_TRACE_CALL(name, args) __traceiter_##name(NULL, args) #endif /* CONFIG_HAVE_STATIC_CALL */ +/* + * ARCH_WANTS_NO_INSTR archs are expected to have sanitized entry and idle + * code that disallow any/all tracing/instrumentation when RCU isn't watching. + */ +#ifdef CONFIG_ARCH_WANTS_NO_INSTR +#define RCUIDLE_COND(rcuidle) (rcuidle) +#else +/* srcu can't be used from NMI */ +#define RCUIDLE_COND(rcuidle) (rcuidle && in_nmi()) +#endif + /* * it_func[0] is never NULL because there is at least one element in the array * when the array itself is non NULL. @@ -188,8 +199,8 @@ static inline struct tracepoint *tracepoint_ptr_deref(tracepoint_ptr_t *p) if (!(cond)) \ return; \ \ - /* srcu can't be used from NMI */ \ - WARN_ON_ONCE(rcuidle && in_nmi()); \ + if (WARN_ON_ONCE(RCUIDLE_COND(rcuidle))) \ + return; \ \ /* keep srcu and sched-rcu usage consistent */ \ preempt_disable_notrace(); \ -- cgit v1.2.3 From f176d4ccb30747231831f66779697afa6d738b3c Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Thu, 12 Jan 2023 20:43:59 +0100 Subject: sched/core: Always inline __this_cpu_preempt_check() Quite a few unnecessary instrumentation calls are generated via the no-op __this_cpu_preempt_check() call, if it gets uninlined by the compiler: vmlinux.o: warning: objtool: in_entry_stack+0x9: call to __this_cpu_preempt_check() leaves .noinstr.text section vmlinux.o: warning: objtool: default_do_nmi+0x10: call to __this_cpu_preempt_check() leaves .noinstr.text section vmlinux.o: warning: objtool: fpu_idle_fpregs+0x41: call to __this_cpu_preempt_check() leaves .noinstr.text section vmlinux.o: warning: objtool: kvm_read_and_reset_apf_flags+0x1: call to __this_cpu_preempt_check() leaves .noinstr.text section vmlinux.o: warning: objtool: lockdep_hardirqs_on+0xb0: call to __this_cpu_preempt_check() leaves .noinstr.text section vmlinux.o: warning: objtool: lockdep_hardirqs_off+0xae: call to __this_cpu_preempt_check() leaves .noinstr.text section vmlinux.o: warning: objtool: irqentry_nmi_enter+0x69: call to __this_cpu_preempt_check() leaves .noinstr.text section vmlinux.o: warning: objtool: irqentry_nmi_exit+0x32: call to __this_cpu_preempt_check() leaves .noinstr.text section vmlinux.o: warning: objtool: acpi_processor_ffh_cstate_enter+0x9: call to __this_cpu_preempt_check() leaves .noinstr.text section vmlinux.o: warning: objtool: acpi_idle_enter+0x43: call to __this_cpu_preempt_check() leaves .noinstr.text section vmlinux.o: warning: objtool: acpi_idle_enter_s2idle+0x45: call to __this_cpu_preempt_check() leaves .noinstr.text section Mark it __always_inline. Signed-off-by: Peter Zijlstra (Intel) Signed-off-by: Ingo Molnar Tested-by: Tony Lindgren Tested-by: Ulf Hansson Acked-by: Rafael J. Wysocki Acked-by: Frederic Weisbecker Link: https://lore.kernel.org/r/20230112195542.089981974@infradead.org --- include/linux/percpu-defs.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/percpu-defs.h b/include/linux/percpu-defs.h index af1071535de8..e60727be79c4 100644 --- a/include/linux/percpu-defs.h +++ b/include/linux/percpu-defs.h @@ -310,7 +310,7 @@ extern void __bad_size_call_parameter(void); #ifdef CONFIG_DEBUG_PREEMPT extern void __this_cpu_preempt_check(const char *op); #else -static inline void __this_cpu_preempt_check(const char *op) { } +static __always_inline void __this_cpu_preempt_check(const char *op) { } #endif #define __pcpu_size_call_return(stem, variable) \ -- cgit v1.2.3 From 0e985e9d22864e29d5d2b3d909ad15134d7f6d46 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Thu, 12 Jan 2023 20:44:04 +0100 Subject: cpuidle: Add comments about noinstr/__cpuidle usage Add a few words on noinstr / __cpuidle usage. Signed-off-by: Peter Zijlstra (Intel) Signed-off-by: Ingo Molnar Link: https://lore.kernel.org/r/20230112195542.397238052@infradead.org --- include/linux/compiler_types.h | 10 ++++++++++ 1 file changed, 10 insertions(+) (limited to 'include/linux') diff --git a/include/linux/compiler_types.h b/include/linux/compiler_types.h index d7858901f035..dea5bf5bd09c 100644 --- a/include/linux/compiler_types.h +++ b/include/linux/compiler_types.h @@ -239,6 +239,16 @@ struct ftrace_likely_data { #define noinstr __noinstr_section(".noinstr.text") +/* + * The __cpuidle section is used twofold: + * + * 1) the original use -- identifying if a CPU is 'stuck' in idle state based + * on it's instruction pointer. See cpu_in_idle(). + * + * 2) supressing instrumentation around where cpuidle disables RCU; where the + * function isn't strictly required for #1, this is interchangeable with + * noinstr. + */ #define __cpuidle __noinstr_section(".cpuidle.text") #endif /* __KERNEL__ */ -- cgit v1.2.3 From 19235e47279894b033a3ec5cf2732de634862b3a Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Tue, 17 Jan 2023 11:26:29 +0100 Subject: cpuidle, arm64: Fix the ARM64 cpuidle logic The recent cpuidle changes started triggering RCU splats on Juno development boards: | ============================= | WARNING: suspicious RCU usage | ----------------------------- | include/trace/events/ipi.h:19 suspicious rcu_dereference_check() usage! Fix cpuidle on ARM64: - ... by introducing a new 'is_rcu' flag to the cpuidle helpers & make ARM64 use it, as ARM64 wants to keep RCU active longer and wants to do the ct_cpuidle_enter()/exit() dance itself. - Also update the PSCI driver accordingly. - This also removes the last known RCU_NONIDLE() user as a bonus. Reported-by: Mark Rutland Signed-off-by: Peter Zijlstra Signed-off-by: Ingo Molnar Tested-by: Sudeep Holla Tested-by: Mark Rutland Reviewed-by: Mark Rutland Link: https://lore.kernel.org/r/Y8Z31UbzG3LJgAXE@hirez.programming.kicks-ass.net -- --- include/linux/cpuidle.h | 22 +++++++++++++++------- 1 file changed, 15 insertions(+), 7 deletions(-) (limited to 'include/linux') diff --git a/include/linux/cpuidle.h b/include/linux/cpuidle.h index 630c879143c7..3183aeb7f5b4 100644 --- a/include/linux/cpuidle.h +++ b/include/linux/cpuidle.h @@ -307,7 +307,7 @@ extern s64 cpuidle_governor_latency_req(unsigned int cpu); #define __CPU_PM_CPU_IDLE_ENTER(low_level_idle_enter, \ idx, \ state, \ - is_retention) \ + is_retention, is_rcu) \ ({ \ int __ret = 0; \ \ @@ -319,9 +319,11 @@ extern s64 cpuidle_governor_latency_req(unsigned int cpu); if (!is_retention) \ __ret = cpu_pm_enter(); \ if (!__ret) { \ - ct_cpuidle_enter(); \ + if (!is_rcu) \ + ct_cpuidle_enter(); \ __ret = low_level_idle_enter(state); \ - ct_cpuidle_exit(); \ + if (!is_rcu) \ + ct_cpuidle_exit(); \ if (!is_retention) \ cpu_pm_exit(); \ } \ @@ -330,15 +332,21 @@ extern s64 cpuidle_governor_latency_req(unsigned int cpu); }) #define CPU_PM_CPU_IDLE_ENTER(low_level_idle_enter, idx) \ - __CPU_PM_CPU_IDLE_ENTER(low_level_idle_enter, idx, idx, 0) + __CPU_PM_CPU_IDLE_ENTER(low_level_idle_enter, idx, idx, 0, 0) #define CPU_PM_CPU_IDLE_ENTER_RETENTION(low_level_idle_enter, idx) \ - __CPU_PM_CPU_IDLE_ENTER(low_level_idle_enter, idx, idx, 1) + __CPU_PM_CPU_IDLE_ENTER(low_level_idle_enter, idx, idx, 1, 0) #define CPU_PM_CPU_IDLE_ENTER_PARAM(low_level_idle_enter, idx, state) \ - __CPU_PM_CPU_IDLE_ENTER(low_level_idle_enter, idx, state, 0) + __CPU_PM_CPU_IDLE_ENTER(low_level_idle_enter, idx, state, 0, 0) + +#define CPU_PM_CPU_IDLE_ENTER_PARAM_RCU(low_level_idle_enter, idx, state) \ + __CPU_PM_CPU_IDLE_ENTER(low_level_idle_enter, idx, state, 0, 1) #define CPU_PM_CPU_IDLE_ENTER_RETENTION_PARAM(low_level_idle_enter, idx, state) \ - __CPU_PM_CPU_IDLE_ENTER(low_level_idle_enter, idx, state, 1) + __CPU_PM_CPU_IDLE_ENTER(low_level_idle_enter, idx, state, 1, 0) + +#define CPU_PM_CPU_IDLE_ENTER_RETENTION_PARAM_RCU(low_level_idle_enter, idx, state) \ + __CPU_PM_CPU_IDLE_ENTER(low_level_idle_enter, idx, state, 1, 1) #endif /* _LINUX_CPUIDLE_H */ -- cgit v1.2.3 From 5a5d7e9badd2cb8065db171961bd30bd3595e4b6 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Thu, 26 Jan 2023 16:08:31 +0100 Subject: cpuidle: lib/bug: Disable rcu_is_watching() during WARN/BUG In order to avoid WARN/BUG from generating nested or even recursive warnings, force rcu_is_watching() true during WARN/lockdep_rcu_suspicious(). Notably things like unwinding the stack can trigger rcu_dereference() warnings, which then triggers more unwinding which then triggers more warnings etc.. Signed-off-by: Peter Zijlstra (Intel) Signed-off-by: Ingo Molnar Link: https://lore.kernel.org/r/20230126151323.408156109@infradead.org --- include/linux/context_tracking.h | 27 +++++++++++++++++++++++++++ 1 file changed, 27 insertions(+) (limited to 'include/linux') diff --git a/include/linux/context_tracking.h b/include/linux/context_tracking.h index dcef4a9e4d63..d4afa8508a80 100644 --- a/include/linux/context_tracking.h +++ b/include/linux/context_tracking.h @@ -130,9 +130,36 @@ static __always_inline unsigned long ct_state_inc(int incby) return arch_atomic_add_return(incby, this_cpu_ptr(&context_tracking.state)); } +static __always_inline bool warn_rcu_enter(void) +{ + bool ret = false; + + /* + * Horrible hack to shut up recursive RCU isn't watching fail since + * lots of the actual reporting also relies on RCU. + */ + preempt_disable_notrace(); + if (rcu_dynticks_curr_cpu_in_eqs()) { + ret = true; + ct_state_inc(RCU_DYNTICKS_IDX); + } + + return ret; +} + +static __always_inline void warn_rcu_exit(bool rcu) +{ + if (rcu) + ct_state_inc(RCU_DYNTICKS_IDX); + preempt_enable_notrace(); +} + #else static inline void ct_idle_enter(void) { } static inline void ct_idle_exit(void) { } + +static __always_inline bool warn_rcu_enter(void) { return false; } +static __always_inline void warn_rcu_exit(bool rcu) { } #endif /* !CONFIG_CONTEXT_TRACKING_IDLE */ #endif -- cgit v1.2.3 From d099dbfd330686a8c09cd8944bcc77a56f9e7815 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Thu, 26 Jan 2023 16:08:32 +0100 Subject: cpuidle: tracing: Warn about !rcu_is_watching() When using noinstr, WARN when tracing hits when RCU is disabled. Suggested-by: Steven Rostedt (Google) Signed-off-by: Peter Zijlstra (Intel) Signed-off-by: Ingo Molnar Link: https://lore.kernel.org/r/20230126151323.466670589@infradead.org --- include/linux/trace_recursion.h | 18 ++++++++++++++++++ 1 file changed, 18 insertions(+) (limited to 'include/linux') diff --git a/include/linux/trace_recursion.h b/include/linux/trace_recursion.h index c303f7a114e9..d48cd92d2364 100644 --- a/include/linux/trace_recursion.h +++ b/include/linux/trace_recursion.h @@ -135,6 +135,21 @@ extern void ftrace_record_recursion(unsigned long ip, unsigned long parent_ip); # define do_ftrace_record_recursion(ip, pip) do { } while (0) #endif +#ifdef CONFIG_ARCH_WANTS_NO_INSTR +# define trace_warn_on_no_rcu(ip) \ + ({ \ + bool __ret = !rcu_is_watching(); \ + if (__ret && !trace_recursion_test(TRACE_RECORD_RECURSION_BIT)) { \ + trace_recursion_set(TRACE_RECORD_RECURSION_BIT); \ + WARN_ONCE(true, "RCU not on for: %pS\n", (void *)ip); \ + trace_recursion_clear(TRACE_RECORD_RECURSION_BIT); \ + } \ + __ret; \ + }) +#else +# define trace_warn_on_no_rcu(ip) false +#endif + /* * Preemption is promised to be disabled when return bit >= 0. */ @@ -144,6 +159,9 @@ static __always_inline int trace_test_and_set_recursion(unsigned long ip, unsign unsigned int val = READ_ONCE(current->trace_recursion); int bit; + if (trace_warn_on_no_rcu(ip)) + return -1; + bit = trace_get_context_bit() + start; if (unlikely(val & (1 << bit))) { /* -- cgit v1.2.3 From 8739c6811572b087decd561f96382087402cc343 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Thu, 26 Jan 2023 16:08:36 +0100 Subject: sched/clock/x86: Mark sched_clock() noinstr In order to use sched_clock() from noinstr code, mark it and all it's implenentations noinstr. The whole pvclock thing (used by KVM/Xen) is a bit of a pain, since it calls out to watchdogs, create a pvclock_clocksource_read_nowd() variant doesn't do that and can be noinstr. Signed-off-by: Peter Zijlstra (Intel) Signed-off-by: Ingo Molnar Link: https://lore.kernel.org/r/20230126151323.702003578@infradead.org --- include/linux/math64.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/math64.h b/include/linux/math64.h index 8958f4c005c1..8b9191a2849e 100644 --- a/include/linux/math64.h +++ b/include/linux/math64.h @@ -161,7 +161,7 @@ static inline u64 mul_u32_u32(u32 a, u32 b) #if defined(CONFIG_ARCH_SUPPORTS_INT128) && defined(__SIZEOF_INT128__) #ifndef mul_u64_u32_shr -static inline u64 mul_u64_u32_shr(u64 a, u32 mul, unsigned int shift) +static __always_inline u64 mul_u64_u32_shr(u64 a, u32 mul, unsigned int shift) { return (u64)(((unsigned __int128)a * mul) >> shift); } @@ -177,7 +177,7 @@ static inline u64 mul_u64_u64_shr(u64 a, u64 mul, unsigned int shift) #else #ifndef mul_u64_u32_shr -static inline u64 mul_u64_u32_shr(u64 a, u32 mul, unsigned int shift) +static __always_inline u64 mul_u64_u32_shr(u64 a, u32 mul, unsigned int shift) { u32 ah, al; u64 ret; -- cgit v1.2.3 From 776f22913b8e50011004c6ae43004711dab7efa5 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Thu, 26 Jan 2023 16:08:37 +0100 Subject: sched/clock: Make local_clock() noinstr With sched_clock() noinstr, provide a noinstr implementation of local_clock(). Signed-off-by: Peter Zijlstra (Intel) Signed-off-by: Ingo Molnar Link: https://lore.kernel.org/r/20230126151323.760767043@infradead.org --- include/linux/sched/clock.h | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) (limited to 'include/linux') diff --git a/include/linux/sched/clock.h b/include/linux/sched/clock.h index 867d588314e0..ca008f7d3615 100644 --- a/include/linux/sched/clock.h +++ b/include/linux/sched/clock.h @@ -45,7 +45,7 @@ static inline u64 cpu_clock(int cpu) return sched_clock(); } -static inline u64 local_clock(void) +static __always_inline u64 local_clock(void) { return sched_clock(); } @@ -79,10 +79,8 @@ static inline u64 cpu_clock(int cpu) return sched_clock_cpu(cpu); } -static inline u64 local_clock(void) -{ - return sched_clock_cpu(raw_smp_processor_id()); -} +extern u64 local_clock(void); + #endif #ifdef CONFIG_IRQ_TIME_ACCOUNTING -- cgit v1.2.3