From 11b1b8bc2b98e21ddf47e08b56c21502c685b2c3 Mon Sep 17 00:00:00 2001 From: Tianchen Ding Date: Wed, 6 Mar 2024 10:21:32 +0800 Subject: sched/eevdf: Always update V if se->on_rq when reweighting reweight_eevdf() needs the latest V to do accurate calculation for new ve and vd. So update V unconditionally when se is runnable. Fixes: eab03c23c2a1 ("sched/eevdf: Fix vruntime adjustment on reweight") Suggested-by: Abel Wu Signed-off-by: Tianchen Ding Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Abel Wu Tested-by: K Prateek Nayak Tested-by: Chen Yu Link: https://lore.kernel.org/r/20240306022133.81008-2-dtcccc@linux.alibaba.com --- kernel/sched/fair.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 03be0d1330a6..5551ce2af73e 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -3790,9 +3790,8 @@ static void reweight_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, if (se->on_rq) { /* commit outstanding execution time */ - if (curr) - update_curr(cfs_rq); - else + update_curr(cfs_rq); + if (!curr) __dequeue_entity(cfs_rq, se); update_load_sub(&cfs_rq->load, se->load.weight); } -- cgit v1.2.3 From afae8002b4fd3560c8f5f1567f3c3202c30a70fa Mon Sep 17 00:00:00 2001 From: Tianchen Ding Date: Wed, 6 Mar 2024 10:21:33 +0800 Subject: sched/eevdf: Fix miscalculation in reweight_entity() when se is not curr reweight_eevdf() only keeps V unchanged inside itself. When se != cfs_rq->curr, it would be dequeued from rb tree first. So that V is changed and the result is wrong. Pass the original V to reweight_eevdf() to fix this issue. Fixes: eab03c23c2a1 ("sched/eevdf: Fix vruntime adjustment on reweight") Signed-off-by: Tianchen Ding [peterz: flip if() condition for clarity] Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Abel Wu Link: https://lkml.kernel.org/r/20240306022133.81008-3-dtcccc@linux.alibaba.com --- kernel/sched/fair.c | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 5551ce2af73e..6d266917d38d 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -3676,11 +3676,10 @@ static inline void dequeue_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se) { } #endif -static void reweight_eevdf(struct cfs_rq *cfs_rq, struct sched_entity *se, +static void reweight_eevdf(struct sched_entity *se, u64 avruntime, unsigned long weight) { unsigned long old_weight = se->load.weight; - u64 avruntime = avg_vruntime(cfs_rq); s64 vlag, vslice; /* @@ -3787,24 +3786,26 @@ static void reweight_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, unsigned long weight) { bool curr = cfs_rq->curr == se; + u64 avruntime; if (se->on_rq) { /* commit outstanding execution time */ update_curr(cfs_rq); + avruntime = avg_vruntime(cfs_rq); if (!curr) __dequeue_entity(cfs_rq, se); update_load_sub(&cfs_rq->load, se->load.weight); } dequeue_load_avg(cfs_rq, se); - if (!se->on_rq) { + if (se->on_rq) { + reweight_eevdf(se, avruntime, weight); + } else { /* * Because we keep se->vlag = V - v_i, while: lag_i = w_i*(V - v_i), * we need to scale se->vlag when w_i changes. */ se->vlag = div_s64(se->vlag * se->load.weight, weight); - } else { - reweight_eevdf(cfs_rq, se, weight); } update_load_set(&se->load, weight); -- cgit v1.2.3 From 1560d1f6eb6b398bddd80c16676776c0325fe5fe Mon Sep 17 00:00:00 2001 From: Xuewen Yan Date: Mon, 22 Apr 2024 16:22:38 +0800 Subject: sched/eevdf: Prevent vlag from going out of bounds in reweight_eevdf() It was possible to have pick_eevdf() return NULL, which then causes a NULL-deref. This turned out to be due to entity_eligible() returning falsely negative because of a s64 multiplcation overflow. Specifically, reweight_eevdf() computes the vlag without considering the limit placed upon vlag as update_entity_lag() does, and then the scaling multiplication (remember that weight is 20bit fixed point) can overflow. This then leads to the new vruntime being weird which then causes the above entity_eligible() to go side-ways and claim nothing is eligible. Thus limit the range of vlag accordingly. All this was quite rare, but fatal when it does happen. Closes: https://lore.kernel.org/all/ZhuYyrh3mweP_Kd8@nz.home/ Closes: https://lore.kernel.org/all/CA+9S74ih+45M_2TPUY_mPPVDhNvyYfy1J1ftSix+KjiTVxg8nw@mail.gmail.com/ Closes: https://lore.kernel.org/lkml/202401301012.2ed95df0-oliver.sang@intel.com/ Fixes: eab03c23c2a1 ("sched/eevdf: Fix vruntime adjustment on reweight") Reported-by: Sergei Trofimovich Reported-by: Igor Raits Reported-by: Breno Leitao Reported-by: kernel test robot Reported-by: Yujie Liu Signed-off-by: Xuewen Yan Reviewed-and-tested-by: Chen Yu Signed-off-by: Peter Zijlstra (Intel) Link: https://lore.kernel.org/r/20240422082238.5784-1-xuewen.yan@unisoc.com --- kernel/sched/fair.c | 18 ++++++++++++------ 1 file changed, 12 insertions(+), 6 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 6d266917d38d..c62805dbd608 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -696,15 +696,21 @@ u64 avg_vruntime(struct cfs_rq *cfs_rq) * * XXX could add max_slice to the augmented data to track this. */ -static void update_entity_lag(struct cfs_rq *cfs_rq, struct sched_entity *se) +static s64 entity_lag(u64 avruntime, struct sched_entity *se) { - s64 lag, limit; + s64 vlag, limit; + + vlag = avruntime - se->vruntime; + limit = calc_delta_fair(max_t(u64, 2*se->slice, TICK_NSEC), se); + + return clamp(vlag, -limit, limit); +} +static void update_entity_lag(struct cfs_rq *cfs_rq, struct sched_entity *se) +{ SCHED_WARN_ON(!se->on_rq); - lag = avg_vruntime(cfs_rq) - se->vruntime; - limit = calc_delta_fair(max_t(u64, 2*se->slice, TICK_NSEC), se); - se->vlag = clamp(lag, -limit, limit); + se->vlag = entity_lag(avg_vruntime(cfs_rq), se); } /* @@ -3760,7 +3766,7 @@ static void reweight_eevdf(struct sched_entity *se, u64 avruntime, * = V - vl' */ if (avruntime != se->vruntime) { - vlag = (s64)(avruntime - se->vruntime); + vlag = entity_lag(avruntime, se); vlag = div_s64(vlag * old_weight, weight); se->vruntime = avruntime - vlag; } -- cgit v1.2.3 From d99e3140a4d33e26066183ff727d8f02f56bec64 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Thu, 21 Mar 2024 14:24:43 +0000 Subject: mm: turn folio_test_hugetlb into a PageType The current folio_test_hugetlb() can be fooled by a concurrent folio split into returning true for a folio which has never belonged to hugetlbfs. This can't happen if the caller holds a refcount on it, but we have a few places (memory-failure, compaction, procfs) which do not and should not take a speculative reference. Since hugetlb pages do not use individual page mapcounts (they are always fully mapped and use the entire_mapcount field to record the number of mappings), the PageType field is available now that page_mapcount() ignores the value in this field. In compaction and with CONFIG_DEBUG_VM enabled, the current implementation can result in an oops, as reported by Luis. This happens since 9c5ccf2db04b ("mm: remove HUGETLB_PAGE_DTOR") effectively added some VM_BUG_ON() checks in the PageHuge() testing path. [willy@infradead.org: update vmcoreinfo] Link: https://lkml.kernel.org/r/ZgGZUvsdhaT1Va-T@casper.infradead.org Link: https://lkml.kernel.org/r/20240321142448.1645400-6-willy@infradead.org Fixes: 9c5ccf2db04b ("mm: remove HUGETLB_PAGE_DTOR") Signed-off-by: Matthew Wilcox (Oracle) Reviewed-by: David Hildenbrand Acked-by: Vlastimil Babka Reported-by: Luis Chamberlain Closes: https://bugzilla.kernel.org/show_bug.cgi?id=218227 Cc: Miaohe Lin Cc: Muchun Song Cc: Oscar Salvador Cc: Signed-off-by: Andrew Morton --- kernel/vmcore_info.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/vmcore_info.c b/kernel/vmcore_info.c index f95516cd45bb..23c125c2e243 100644 --- a/kernel/vmcore_info.c +++ b/kernel/vmcore_info.c @@ -205,11 +205,10 @@ static int __init crash_save_vmcoreinfo_init(void) VMCOREINFO_NUMBER(PG_head_mask); #define PAGE_BUDDY_MAPCOUNT_VALUE (~PG_buddy) VMCOREINFO_NUMBER(PAGE_BUDDY_MAPCOUNT_VALUE); -#ifdef CONFIG_HUGETLB_PAGE - VMCOREINFO_NUMBER(PG_hugetlb); +#define PAGE_HUGETLB_MAPCOUNT_VALUE (~PG_hugetlb) + VMCOREINFO_NUMBER(PAGE_HUGETLB_MAPCOUNT_VALUE); #define PAGE_OFFLINE_MAPCOUNT_VALUE (~PG_offline) VMCOREINFO_NUMBER(PAGE_OFFLINE_MAPCOUNT_VALUE); -#endif #ifdef CONFIG_KALLSYMS VMCOREINFO_SYMBOL(kallsyms_names); -- cgit v1.2.3 From fe42754b94a42d08cf9501790afc25c4f6a5f631 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Fri, 19 Apr 2024 17:05:54 -0700 Subject: cpu: Re-enable CPU mitigations by default for !X86 architectures Rename x86's to CPU_MITIGATIONS, define it in generic code, and force it on for all architectures exception x86. A recent commit to turn mitigations off by default if SPECULATION_MITIGATIONS=n kinda sorta missed that "cpu_mitigations" is completely generic, whereas SPECULATION_MITIGATIONS is x86-specific. Rename x86's SPECULATIVE_MITIGATIONS instead of keeping both and have it select CPU_MITIGATIONS, as having two configs for the same thing is unnecessary and confusing. This will also allow x86 to use the knob to manage mitigations that aren't strictly related to speculative execution. Use another Kconfig to communicate to common code that CPU_MITIGATIONS is already defined instead of having x86's menu depend on the common CPU_MITIGATIONS. This allows keeping a single point of contact for all of x86's mitigations, and it's not clear that other architectures *want* to allow disabling mitigations at compile-time. Fixes: f337a6a21e2f ("x86/cpu: Actually turn off mitigations by default for SPECULATION_MITIGATIONS=n") Closes: https://lkml.kernel.org/r/20240413115324.53303a68%40canb.auug.org.au Reported-by: Stephen Rothwell Reported-by: Michael Ellerman Reported-by: Geert Uytterhoeven Signed-off-by: Sean Christopherson Signed-off-by: Borislav Petkov (AMD) Acked-by: Josh Poimboeuf Acked-by: Borislav Petkov (AMD) Cc: stable@vger.kernel.org Link: https://lore.kernel.org/r/20240420000556.2645001-2-seanjc@google.com --- kernel/cpu.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/cpu.c b/kernel/cpu.c index 07ad53b7f119..bb0ff275fb46 100644 --- a/kernel/cpu.c +++ b/kernel/cpu.c @@ -3207,8 +3207,8 @@ enum cpu_mitigations { }; static enum cpu_mitigations cpu_mitigations __ro_after_init = - IS_ENABLED(CONFIG_SPECULATION_MITIGATIONS) ? CPU_MITIGATIONS_AUTO : - CPU_MITIGATIONS_OFF; + IS_ENABLED(CONFIG_CPU_MITIGATIONS) ? CPU_MITIGATIONS_AUTO : + CPU_MITIGATIONS_OFF; static int __init mitigations_parse_cmdline(char *arg) { -- cgit v1.2.3 From ce0abef6a1d540acef85068e0e82bdf1fbeeb0e9 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Fri, 19 Apr 2024 17:05:55 -0700 Subject: cpu: Ignore "mitigations" kernel parameter if CPU_MITIGATIONS=n Explicitly disallow enabling mitigations at runtime for kernels that were built with CONFIG_CPU_MITIGATIONS=n, as some architectures may omit code entirely if mitigations are disabled at compile time. E.g. on x86, a large pile of Kconfigs are buried behind CPU_MITIGATIONS, and trying to provide sane behavior for retroactively enabling mitigations is extremely difficult, bordering on impossible. E.g. page table isolation and call depth tracking require build-time support, BHI mitigations will still be off without additional kernel parameters, etc. [ bp: Touchups. ] Signed-off-by: Sean Christopherson Signed-off-by: Borislav Petkov (AMD) Acked-by: Borislav Petkov (AMD) Link: https://lore.kernel.org/r/20240420000556.2645001-3-seanjc@google.com --- kernel/cpu.c | 14 ++++++++++---- 1 file changed, 10 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/cpu.c b/kernel/cpu.c index bb0ff275fb46..63447eb85dab 100644 --- a/kernel/cpu.c +++ b/kernel/cpu.c @@ -3196,6 +3196,7 @@ void __init boot_cpu_hotplug_init(void) this_cpu_write(cpuhp_state.target, CPUHP_ONLINE); } +#ifdef CONFIG_CPU_MITIGATIONS /* * These are used for a global "mitigations=" cmdline option for toggling * optional CPU mitigations. @@ -3206,9 +3207,7 @@ enum cpu_mitigations { CPU_MITIGATIONS_AUTO_NOSMT, }; -static enum cpu_mitigations cpu_mitigations __ro_after_init = - IS_ENABLED(CONFIG_CPU_MITIGATIONS) ? CPU_MITIGATIONS_AUTO : - CPU_MITIGATIONS_OFF; +static enum cpu_mitigations cpu_mitigations __ro_after_init = CPU_MITIGATIONS_AUTO; static int __init mitigations_parse_cmdline(char *arg) { @@ -3224,7 +3223,6 @@ static int __init mitigations_parse_cmdline(char *arg) return 0; } -early_param("mitigations", mitigations_parse_cmdline); /* mitigations=off */ bool cpu_mitigations_off(void) @@ -3239,3 +3237,11 @@ bool cpu_mitigations_auto_nosmt(void) return cpu_mitigations == CPU_MITIGATIONS_AUTO_NOSMT; } EXPORT_SYMBOL_GPL(cpu_mitigations_auto_nosmt); +#else +static int __init mitigations_parse_cmdline(char *arg) +{ + pr_crit("Kernel compiled without mitigations, ignoring 'mitigations'; system may still be vulnerable\n"); + return 0; +} +#endif +early_param("mitigations", mitigations_parse_cmdline); -- cgit v1.2.3 From 2e5449f4f21a1b0bd9beec4c4b580eb1f9b9ed7f Mon Sep 17 00:00:00 2001 From: Tetsuo Handa Date: Sat, 27 Apr 2024 15:27:58 +0900 Subject: profiling: Remove create_prof_cpu_mask(). create_prof_cpu_mask() is no longer used after commit 1f44a225777e ("s390: convert interrupt handling to use generic hardirq"). Signed-off-by: Tetsuo Handa Signed-off-by: Linus Torvalds --- kernel/profile.c | 43 ------------------------------------------- 1 file changed, 43 deletions(-) (limited to 'kernel') diff --git a/kernel/profile.c b/kernel/profile.c index 8a77769bc4b4..2b775cc5c28f 100644 --- a/kernel/profile.c +++ b/kernel/profile.c @@ -344,49 +344,6 @@ void profile_tick(int type) #include #include -static int prof_cpu_mask_proc_show(struct seq_file *m, void *v) -{ - seq_printf(m, "%*pb\n", cpumask_pr_args(prof_cpu_mask)); - return 0; -} - -static int prof_cpu_mask_proc_open(struct inode *inode, struct file *file) -{ - return single_open(file, prof_cpu_mask_proc_show, NULL); -} - -static ssize_t prof_cpu_mask_proc_write(struct file *file, - const char __user *buffer, size_t count, loff_t *pos) -{ - cpumask_var_t new_value; - int err; - - if (!zalloc_cpumask_var(&new_value, GFP_KERNEL)) - return -ENOMEM; - - err = cpumask_parse_user(buffer, count, new_value); - if (!err) { - cpumask_copy(prof_cpu_mask, new_value); - err = count; - } - free_cpumask_var(new_value); - return err; -} - -static const struct proc_ops prof_cpu_mask_proc_ops = { - .proc_open = prof_cpu_mask_proc_open, - .proc_read = seq_read, - .proc_lseek = seq_lseek, - .proc_release = single_release, - .proc_write = prof_cpu_mask_proc_write, -}; - -void create_prof_cpu_mask(void) -{ - /* create /proc/irq/prof_cpu_mask */ - proc_create("irq/prof_cpu_mask", 0600, NULL, &prof_cpu_mask_proc_ops); -} - /* * This function accesses profiling information. The returned data is * binary: the sampling step and the actual contents of the profile -- cgit v1.2.3 From 5097cbcb38e6e0d2627c9dde1985e91d2c9f880e Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Thu, 11 Apr 2024 16:39:05 +0200 Subject: sched/isolation: Prevent boot crash when the boot CPU is nohz_full Documentation/timers/no_hz.rst states that the "nohz_full=" mask must not include the boot CPU, which is no longer true after: 08ae95f4fd3b ("nohz_full: Allow the boot CPU to be nohz_full"). However after: aae17ebb53cd ("workqueue: Avoid using isolated cpus' timers on queue_delayed_work") the kernel will crash at boot time in this case; housekeeping_any_cpu() returns an invalid CPU number until smp_init() brings the first housekeeping CPU up. Change housekeeping_any_cpu() to check the result of cpumask_any_and() and return smp_processor_id() in this case. This is just the simple and backportable workaround which fixes the symptom, but smp_processor_id() at boot time should be safe at least for type == HK_TYPE_TIMER, this more or less matches the tick_do_timer_boot_cpu logic. There is no worry about cpu_down(); tick_nohz_cpu_down() will not allow to offline tick_do_timer_cpu (the 1st online housekeeping CPU). Fixes: aae17ebb53cd ("workqueue: Avoid using isolated cpus' timers on queue_delayed_work") Reported-by: Chris von Recklinghausen Signed-off-by: Oleg Nesterov Signed-off-by: Thomas Gleixner Signed-off-by: Ingo Molnar Reviewed-by: Phil Auld Acked-by: Frederic Weisbecker Link: https://lore.kernel.org/r/20240411143905.GA19288@redhat.com Closes: https://lore.kernel.org/all/20240402105847.GA24832@redhat.com/ --- kernel/sched/isolation.c | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/sched/isolation.c b/kernel/sched/isolation.c index 373d42c707bc..2a262d3ecb3d 100644 --- a/kernel/sched/isolation.c +++ b/kernel/sched/isolation.c @@ -46,7 +46,16 @@ int housekeeping_any_cpu(enum hk_type type) if (cpu < nr_cpu_ids) return cpu; - return cpumask_any_and(housekeeping.cpumasks[type], cpu_online_mask); + cpu = cpumask_any_and(housekeeping.cpumasks[type], cpu_online_mask); + if (likely(cpu < nr_cpu_ids)) + return cpu; + /* + * Unless we have another problem this can only happen + * at boot time before start_secondary() brings the 1st + * housekeeping CPU up. + */ + WARN_ON_ONCE(system_state == SYSTEM_RUNNING || + type != HK_TYPE_TIMER); } } return smp_processor_id(); -- cgit v1.2.3 From 257bf89d84121280904800acd25cc2c444c717ae Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Sat, 13 Apr 2024 16:17:46 +0200 Subject: sched/isolation: Fix boot crash when maxcpus < first housekeeping CPU housekeeping_setup() checks cpumask_intersects(present, online) to ensure that the kernel will have at least one housekeeping CPU after smp_init(), but this doesn't work if the maxcpus= kernel parameter limits the number of processors available after bootup. For example, a kernel with "maxcpus=2 nohz_full=0-2" parameters crashes at boot time on a virtual machine with 4 CPUs. Change housekeeping_setup() to use cpumask_first_and() and check that the returned CPU number is valid and less than setup_max_cpus. Another corner case is "nohz_full=0" on a machine with a single CPU or with the maxcpus=1 kernel argument. In this case non_housekeeping_mask is empty and tick_nohz_full_setup() makes no sense. And indeed, the kernel hits the WARN_ON(tick_nohz_full_running) in tick_sched_do_timer(). And how should the kernel interpret the "nohz_full=" parameter? It should be silently ignored, but currently cpulist_parse() happily returns the empty cpumask and this leads to the same problem. Change housekeeping_setup() to check cpumask_empty(non_housekeeping_mask) and do nothing in this case. Signed-off-by: Oleg Nesterov Signed-off-by: Thomas Gleixner Signed-off-by: Ingo Molnar Reviewed-by: Phil Auld Acked-by: Frederic Weisbecker Link: https://lore.kernel.org/r/20240413141746.GA10008@redhat.com --- kernel/sched/isolation.c | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/sched/isolation.c b/kernel/sched/isolation.c index 2a262d3ecb3d..5891e715f00d 100644 --- a/kernel/sched/isolation.c +++ b/kernel/sched/isolation.c @@ -118,6 +118,7 @@ static void __init housekeeping_setup_type(enum hk_type type, static int __init housekeeping_setup(char *str, unsigned long flags) { cpumask_var_t non_housekeeping_mask, housekeeping_staging; + unsigned int first_cpu; int err = 0; if ((flags & HK_FLAG_TICK) && !(housekeeping.flags & HK_FLAG_TICK)) { @@ -138,7 +139,8 @@ static int __init housekeeping_setup(char *str, unsigned long flags) cpumask_andnot(housekeeping_staging, cpu_possible_mask, non_housekeeping_mask); - if (!cpumask_intersects(cpu_present_mask, housekeeping_staging)) { + first_cpu = cpumask_first_and(cpu_present_mask, housekeeping_staging); + if (first_cpu >= nr_cpu_ids || first_cpu >= setup_max_cpus) { __cpumask_set_cpu(smp_processor_id(), housekeeping_staging); __cpumask_clear_cpu(smp_processor_id(), non_housekeeping_mask); if (!housekeeping.flags) { @@ -147,6 +149,9 @@ static int __init housekeeping_setup(char *str, unsigned long flags) } } + if (cpumask_empty(non_housekeeping_mask)) + goto free_housekeeping_staging; + if (!housekeeping.flags) { /* First setup call ("nohz_full=" or "isolcpus=") */ enum hk_type type; -- cgit v1.2.3