From 1e66d6cf888fd206a89b8c476b1b28b63faf7fd6 Mon Sep 17 00:00:00 2001 From: Tony Luck Date: Mon, 7 Oct 2024 09:57:01 -0700 Subject: x86/cpu: Fix #define name for Intel CPU model 0x5A This CPU was mistakenly given the name INTEL_ATOM_AIRMONT_MID. But it uses a Silvermont core, not Airmont. Change the #define name to INTEL_ATOM_SILVERMONT_MID2. Reported-by: Christian Ludloff Signed-off-by: Tony Luck Signed-off-by: Dave Hansen Link: https://lore.kernel.org/all/20241007165701.19693-1-tony.luck%40intel.com --- arch/x86/events/intel/core.c | 2 +- arch/x86/include/asm/intel-family.h | 2 +- arch/x86/kernel/cpu/common.c | 2 +- arch/x86/kernel/tsc_msr.c | 2 +- drivers/powercap/intel_rapl_common.c | 2 +- drivers/staging/media/atomisp/include/linux/atomisp_platform.h | 4 ++-- drivers/thermal/intel/intel_tcc.c | 2 +- tools/power/x86/turbostat/turbostat.c | 2 +- 8 files changed, 9 insertions(+), 9 deletions(-) diff --git a/arch/x86/events/intel/core.c b/arch/x86/events/intel/core.c index 7601196d1d18..89880540ab43 100644 --- a/arch/x86/events/intel/core.c +++ b/arch/x86/events/intel/core.c @@ -6622,7 +6622,7 @@ __init int intel_pmu_init(void) case INTEL_ATOM_SILVERMONT_D: case INTEL_ATOM_SILVERMONT_MID: case INTEL_ATOM_AIRMONT: - case INTEL_ATOM_AIRMONT_MID: + case INTEL_ATOM_SILVERMONT_MID2: memcpy(hw_cache_event_ids, slm_hw_cache_event_ids, sizeof(hw_cache_event_ids)); memcpy(hw_cache_extra_regs, slm_hw_cache_extra_regs, diff --git a/arch/x86/include/asm/intel-family.h b/arch/x86/include/asm/intel-family.h index 6d7b04ffc5fd..8359113e3e58 100644 --- a/arch/x86/include/asm/intel-family.h +++ b/arch/x86/include/asm/intel-family.h @@ -149,9 +149,9 @@ #define INTEL_ATOM_SILVERMONT IFM(6, 0x37) /* Bay Trail, Valleyview */ #define INTEL_ATOM_SILVERMONT_D IFM(6, 0x4D) /* Avaton, Rangely */ #define INTEL_ATOM_SILVERMONT_MID IFM(6, 0x4A) /* Merrifield */ +#define INTEL_ATOM_SILVERMONT_MID2 IFM(6, 0x5A) /* Anniedale */ #define INTEL_ATOM_AIRMONT IFM(6, 0x4C) /* Cherry Trail, Braswell */ -#define INTEL_ATOM_AIRMONT_MID IFM(6, 0x5A) /* Moorefield */ #define INTEL_ATOM_AIRMONT_NP IFM(6, 0x75) /* Lightning Mountain */ #define INTEL_ATOM_GOLDMONT IFM(6, 0x5C) /* Apollo Lake */ diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c index 7cce91b19fb2..76598a93a8fa 100644 --- a/arch/x86/kernel/cpu/common.c +++ b/arch/x86/kernel/cpu/common.c @@ -1164,7 +1164,7 @@ static const __initconst struct x86_cpu_id cpu_vuln_whitelist[] = { VULNWL_INTEL(INTEL_CORE_YONAH, NO_SSB), - VULNWL_INTEL(INTEL_ATOM_AIRMONT_MID, NO_SSB | NO_L1TF | NO_SWAPGS | NO_ITLB_MULTIHIT | MSBDS_ONLY), + VULNWL_INTEL(INTEL_ATOM_SILVERMONT_MID2,NO_SSB | NO_L1TF | NO_SWAPGS | NO_ITLB_MULTIHIT | MSBDS_ONLY), VULNWL_INTEL(INTEL_ATOM_AIRMONT_NP, NO_SSB | NO_L1TF | NO_SWAPGS | NO_ITLB_MULTIHIT), VULNWL_INTEL(INTEL_ATOM_GOLDMONT, NO_MDS | NO_L1TF | NO_SWAPGS | NO_ITLB_MULTIHIT | NO_MMIO), diff --git a/arch/x86/kernel/tsc_msr.c b/arch/x86/kernel/tsc_msr.c index deeb02825670..48e6cc1cb017 100644 --- a/arch/x86/kernel/tsc_msr.c +++ b/arch/x86/kernel/tsc_msr.c @@ -152,7 +152,7 @@ static const struct x86_cpu_id tsc_msr_cpu_ids[] = { X86_MATCH_VFM(INTEL_ATOM_SILVERMONT, &freq_desc_byt), X86_MATCH_VFM(INTEL_ATOM_SILVERMONT_MID, &freq_desc_tng), X86_MATCH_VFM(INTEL_ATOM_AIRMONT, &freq_desc_cht), - X86_MATCH_VFM(INTEL_ATOM_AIRMONT_MID, &freq_desc_ann), + X86_MATCH_VFM(INTEL_ATOM_SILVERMONT_MID2, &freq_desc_ann), X86_MATCH_VFM(INTEL_ATOM_AIRMONT_NP, &freq_desc_lgm), {} }; diff --git a/drivers/powercap/intel_rapl_common.c
b/drivers/powercap/intel_rapl_common.c index 77d75e1f14a9..5ccde3982314 100644 --- a/drivers/powercap/intel_rapl_common.c +++ b/drivers/powercap/intel_rapl_common.c @@ -1274,7 +1274,7 @@ static const struct x86_cpu_id rapl_ids[] __initconst = { X86_MATCH_VFM(INTEL_ATOM_SILVERMONT, &rapl_defaults_byt), X86_MATCH_VFM(INTEL_ATOM_AIRMONT, &rapl_defaults_cht), X86_MATCH_VFM(INTEL_ATOM_SILVERMONT_MID, &rapl_defaults_tng), - X86_MATCH_VFM(INTEL_ATOM_AIRMONT_MID, &rapl_defaults_ann), + X86_MATCH_VFM(INTEL_ATOM_SILVERMONT_MID2,&rapl_defaults_ann), X86_MATCH_VFM(INTEL_ATOM_GOLDMONT, &rapl_defaults_core), X86_MATCH_VFM(INTEL_ATOM_GOLDMONT_PLUS, &rapl_defaults_core), X86_MATCH_VFM(INTEL_ATOM_GOLDMONT_D, &rapl_defaults_core), diff --git a/drivers/staging/media/atomisp/include/linux/atomisp_platform.h b/drivers/staging/media/atomisp/include/linux/atomisp_platform.h index 049246774ced..6146555fe9cf 100644 --- a/drivers/staging/media/atomisp/include/linux/atomisp_platform.h +++ b/drivers/staging/media/atomisp/include/linux/atomisp_platform.h @@ -172,10 +172,10 @@ void atomisp_unregister_subdev(struct v4l2_subdev *subdev); #define IS_BYT __IS_SOC(INTEL_ATOM_SILVERMONT) #define IS_CHT __IS_SOC(INTEL_ATOM_AIRMONT) #define IS_MRFD __IS_SOC(INTEL_ATOM_SILVERMONT_MID) -#define IS_MOFD __IS_SOC(INTEL_ATOM_AIRMONT_MID) +#define IS_MOFD __IS_SOC(INTEL_ATOM_SILVERMONT_MID2) /* Both CHT and MOFD come with ISP2401 */ #define IS_ISP2401 __IS_SOCS(INTEL_ATOM_AIRMONT, \ - INTEL_ATOM_AIRMONT_MID) + INTEL_ATOM_SILVERMONT_MID2) #endif /* ATOMISP_PLATFORM_H_ */ diff --git a/drivers/thermal/intel/intel_tcc.c b/drivers/thermal/intel/intel_tcc.c index 817421508d5c..b2a615aea7c1 100644 --- a/drivers/thermal/intel/intel_tcc.c +++ b/drivers/thermal/intel/intel_tcc.c @@ -106,7 +106,7 @@ static const struct x86_cpu_id intel_tcc_cpu_ids[] __initconst = { X86_MATCH_VFM(INTEL_ATOM_SILVERMONT_D, &temp_broadwell), X86_MATCH_VFM(INTEL_ATOM_SILVERMONT_MID, &temp_broadwell), X86_MATCH_VFM(INTEL_ATOM_AIRMONT, &temp_broadwell), - X86_MATCH_VFM(INTEL_ATOM_AIRMONT_MID, &temp_broadwell), + X86_MATCH_VFM(INTEL_ATOM_SILVERMONT_MID2, &temp_broadwell), X86_MATCH_VFM(INTEL_ATOM_AIRMONT_NP, &temp_broadwell), X86_MATCH_VFM(INTEL_ATOM_GOLDMONT, &temp_goldmont), X86_MATCH_VFM(INTEL_ATOM_GOLDMONT_D, &temp_goldmont), diff --git a/tools/power/x86/turbostat/turbostat.c b/tools/power/x86/turbostat/turbostat.c index 8d5011a0bf60..26057af6b5a1 100644 --- a/tools/power/x86/turbostat/turbostat.c +++ b/tools/power/x86/turbostat/turbostat.c @@ -1056,7 +1056,7 @@ static const struct platform_data turbostat_pdata[] = { * Missing support for * INTEL_ICELAKE * INTEL_ATOM_SILVERMONT_MID - * INTEL_ATOM_AIRMONT_MID + * INTEL_ATOM_SILVERMONT_MID2 * INTEL_ATOM_AIRMONT_NP */ { 0, NULL }, -- cgit v1.2.3 From a7dd183f0b3848c056bbeed78ef5d5c52fe94d83 Mon Sep 17 00:00:00 2001 From: Patryk Wlazlyn Date: Wed, 5 Feb 2025 17:52:08 +0200 Subject: x86/smp: Allow calling mwait_play_dead with an arbitrary hint Introduce a helper function to allow offlined CPUs to enter idle states with a specific MWAIT hint. The new helper will be used in subsequent patches by the acpi_idle and intel_idle drivers. No functional change intended. Signed-off-by: Patryk Wlazlyn Signed-off-by: Artem Bityutskiy Signed-off-by: Dave Hansen Reviewed-by: Gautham R. Shenoy Acked-by: Rafael J. 
Wysocki Link: https://lore.kernel.org/all/20250205155211.329780-2-artem.bityutskiy%40linux.intel.com --- arch/x86/include/asm/smp.h | 3 ++ arch/x86/kernel/smpboot.c | 88 +++++++++++++++++++++++++--------------------- 2 files changed, 50 insertions(+), 41 deletions(-) diff --git a/arch/x86/include/asm/smp.h b/arch/x86/include/asm/smp.h index ca073f40698f..80f8bfd83fc7 100644 --- a/arch/x86/include/asm/smp.h +++ b/arch/x86/include/asm/smp.h @@ -114,6 +114,7 @@ void wbinvd_on_cpu(int cpu); int wbinvd_on_all_cpus(void); void smp_kick_mwait_play_dead(void); +void mwait_play_dead(unsigned int eax_hint); void native_smp_send_reschedule(int cpu); void native_send_call_func_ipi(const struct cpumask *mask); @@ -164,6 +165,8 @@ static inline struct cpumask *cpu_llc_shared_mask(int cpu) { return (struct cpumask *)cpumask_of(0); } + +static inline void mwait_play_dead(unsigned int eax_hint) { } #endif /* CONFIG_SMP */ #ifdef CONFIG_DEBUG_NMI_SELFTEST diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c index c10850ae6f09..8aad14e43f54 100644 --- a/arch/x86/kernel/smpboot.c +++ b/arch/x86/kernel/smpboot.c @@ -1258,47 +1258,9 @@ void play_dead_common(void) local_irq_disable(); } -/* - * We need to flush the caches before going to sleep, lest we have - * dirty data in our caches when we come back up. - */ -static inline void mwait_play_dead(void) +void __noreturn mwait_play_dead(unsigned int eax_hint) { struct mwait_cpu_dead *md = this_cpu_ptr(&mwait_cpu_dead); - unsigned int eax, ebx, ecx, edx; - unsigned int highest_cstate = 0; - unsigned int highest_subcstate = 0; - int i; - - if (boot_cpu_data.x86_vendor == X86_VENDOR_AMD || - boot_cpu_data.x86_vendor == X86_VENDOR_HYGON) - return; - if (!this_cpu_has(X86_FEATURE_MWAIT)) - return; - if (!this_cpu_has(X86_FEATURE_CLFLUSH)) - return; - - eax = CPUID_LEAF_MWAIT; - ecx = 0; - native_cpuid(&eax, &ebx, &ecx, &edx); - - /* - * eax will be 0 if EDX enumeration is not valid. - * Initialized below to cstate, sub_cstate value when EDX is valid. - */ - if (!(ecx & CPUID5_ECX_EXTENSIONS_SUPPORTED)) { - eax = 0; - } else { - edx >>= MWAIT_SUBSTATE_SIZE; - for (i = 0; i < 7 && edx; i++, edx >>= MWAIT_SUBSTATE_SIZE) { - if (edx & MWAIT_SUBSTATE_MASK) { - highest_cstate = i; - highest_subcstate = edx & MWAIT_SUBSTATE_MASK; - } - } - eax = (highest_cstate << MWAIT_SUBSTATE_SIZE) | - (highest_subcstate - 1); - } /* Set up state for the kexec() hack below */ md->status = CPUDEAD_MWAIT_WAIT; @@ -1319,7 +1281,7 @@ static inline void mwait_play_dead(void) mb(); __monitor(md, 0, 0); mb(); - __mwait(eax, 0); + __mwait(eax_hint, 0); if (READ_ONCE(md->control) == CPUDEAD_MWAIT_KEXEC_HLT) { /* @@ -1341,6 +1303,50 @@ static inline void mwait_play_dead(void) } } +/* + * We need to flush the caches before going to sleep, lest we have + * dirty data in our caches when we come back up. + */ +static inline void mwait_play_dead_cpuid_hint(void) +{ + unsigned int eax, ebx, ecx, edx; + unsigned int highest_cstate = 0; + unsigned int highest_subcstate = 0; + int i; + + if (boot_cpu_data.x86_vendor == X86_VENDOR_AMD || + boot_cpu_data.x86_vendor == X86_VENDOR_HYGON) + return; + if (!this_cpu_has(X86_FEATURE_MWAIT)) + return; + if (!this_cpu_has(X86_FEATURE_CLFLUSH)) + return; + + eax = CPUID_LEAF_MWAIT; + ecx = 0; + native_cpuid(&eax, &ebx, &ecx, &edx); + + /* + * eax will be 0 if EDX enumeration is not valid. + * Initialized below to cstate, sub_cstate value when EDX is valid. 
+ */ + if (!(ecx & CPUID5_ECX_EXTENSIONS_SUPPORTED)) { + eax = 0; + } else { + edx >>= MWAIT_SUBSTATE_SIZE; + for (i = 0; i < 7 && edx; i++, edx >>= MWAIT_SUBSTATE_SIZE) { + if (edx & MWAIT_SUBSTATE_MASK) { + highest_cstate = i; + highest_subcstate = edx & MWAIT_SUBSTATE_MASK; + } + } + eax = (highest_cstate << MWAIT_SUBSTATE_SIZE) | + (highest_subcstate - 1); + } + + mwait_play_dead(eax); +} + /* * Kick all "offline" CPUs out of mwait on kexec(). See comment in * mwait_play_dead(). @@ -1391,7 +1397,7 @@ void native_play_dead(void) play_dead_common(); tboot_shutdown(TB_SHUTDOWN_WFS); - mwait_play_dead(); + mwait_play_dead_cpuid_hint(); if (cpuidle_play_dead()) hlt_play_dead(); } -- cgit v1.2.3 From 541ddf31e30022b8e6f44b3a943964e8f0989d15 Mon Sep 17 00:00:00 2001 From: Patryk Wlazlyn Date: Wed, 5 Feb 2025 17:52:09 +0200 Subject: ACPI/processor_idle: Add FFH state handling Recent Intel platforms will depend on the idle driver to pass the correct hint for playing dead via mwait_play_dead_with_hint(). Expand the existing enter_dead interface with handling for FFH states and pass the MWAIT hint to the mwait_play_dead code. Suggested-by: Gautham R. Shenoy Signed-off-by: Patryk Wlazlyn Signed-off-by: Artem Bityutskiy Signed-off-by: Dave Hansen Acked-by: Rafael J. Wysocki Link: https://lore.kernel.org/all/20250205155211.329780-3-artem.bityutskiy%40linux.intel.com --- arch/x86/kernel/acpi/cstate.c | 10 ++++++++++ drivers/acpi/processor_idle.c | 2 ++ include/acpi/processor.h | 5 +++++ 3 files changed, 17 insertions(+) diff --git a/arch/x86/kernel/acpi/cstate.c b/arch/x86/kernel/acpi/cstate.c index 5854f0b8f0f1..5bdb65516969 100644 --- a/arch/x86/kernel/acpi/cstate.c +++ b/arch/x86/kernel/acpi/cstate.c @@ -16,6 +16,7 @@ #include #include #include +#include /* * Initialize bm_flags based on the CPU cache properties @@ -205,6 +206,15 @@ int acpi_processor_ffh_cstate_probe(unsigned int cpu, } EXPORT_SYMBOL_GPL(acpi_processor_ffh_cstate_probe); +void acpi_processor_ffh_play_dead(struct acpi_processor_cx *cx) +{ + unsigned int cpu = smp_processor_id(); + struct cstate_entry *percpu_entry; + + percpu_entry = per_cpu_ptr(cpu_cstate_entry, cpu); + mwait_play_dead(percpu_entry->states[cx->index].eax); +} + void __cpuidle acpi_processor_ffh_cstate_enter(struct acpi_processor_cx *cx) { unsigned int cpu = smp_processor_id(); diff --git a/drivers/acpi/processor_idle.c b/drivers/acpi/processor_idle.c index 698897b29de2..586cc7d1d8aa 100644 --- a/drivers/acpi/processor_idle.c +++ b/drivers/acpi/processor_idle.c @@ -590,6 +590,8 @@ static void acpi_idle_play_dead(struct cpuidle_device *dev, int index) raw_safe_halt(); else if (cx->entry_method == ACPI_CSTATE_SYSTEMIO) { io_idle(cx->address); + } else if (cx->entry_method == ACPI_CSTATE_FFH) { + acpi_processor_ffh_play_dead(cx); } else return; } diff --git a/include/acpi/processor.h b/include/acpi/processor.h index a17e97e634a6..63a37e72b721 100644 --- a/include/acpi/processor.h +++ b/include/acpi/processor.h @@ -280,6 +280,7 @@ int acpi_processor_ffh_cstate_probe(unsigned int cpu, struct acpi_processor_cx *cx, struct acpi_power_register *reg); void acpi_processor_ffh_cstate_enter(struct acpi_processor_cx *cstate); +void acpi_processor_ffh_play_dead(struct acpi_processor_cx *cx); #else static inline void acpi_processor_power_init_bm_check(struct acpi_processor_flags @@ -300,6 +301,10 @@ static inline void acpi_processor_ffh_cstate_enter(struct acpi_processor_cx { return; } +static inline void acpi_processor_ffh_play_dead(struct acpi_processor_cx *cx) +{ + return; +} 
#endif static inline int call_on_cpu(int cpu, long (*fn)(void *), void *arg, -- cgit v1.2.3 From fc4ca9537bc4e3141ba7e058700369ea242703df Mon Sep 17 00:00:00 2001 From: Patryk Wlazlyn Date: Wed, 5 Feb 2025 17:52:10 +0200 Subject: intel_idle: Provide the default enter_dead() handler Recent Intel platforms require the idle driver to provide information about the MWAIT hint used to enter the deepest idle state in the play_dead code. Provide the default enter_dead() handler for all of the platforms and allow overriding it with a custom handler for each platform if needed. Signed-off-by: Patryk Wlazlyn Signed-off-by: Artem Bityutskiy Signed-off-by: Dave Hansen Acked-by: Rafael J. Wysocki Link: https://lore.kernel.org/all/20250205155211.329780-4-artem.bityutskiy%40linux.intel.com --- drivers/idle/intel_idle.c | 14 ++++++++++++++ 1 file changed, 14 insertions(+) diff --git a/drivers/idle/intel_idle.c b/drivers/idle/intel_idle.c index 118fe1d37c22..e59073efb6fa 100644 --- a/drivers/idle/intel_idle.c +++ b/drivers/idle/intel_idle.c @@ -57,6 +57,7 @@ #include #include #include +#include #define INTEL_IDLE_VERSION "0.5.1" @@ -228,6 +229,15 @@ static __cpuidle int intel_idle_s2idle(struct cpuidle_device *dev, return 0; } +static void intel_idle_enter_dead(struct cpuidle_device *dev, int index) +{ + struct cpuidle_driver *drv = cpuidle_get_cpu_driver(dev); + struct cpuidle_state *state = &drv->states[index]; + unsigned long eax = flg2MWAIT(state->flags); + + mwait_play_dead(eax); +} + /* * States are indexed by the cstate number, * which is also the index into the MWAIT hint array. */ @@ -1800,6 +1810,7 @@ static void __init intel_idle_init_cstates_acpi(struct cpuidle_driver *drv) state->flags |= CPUIDLE_FLAG_TIMER_STOP; state->enter = intel_idle; + state->enter_dead = intel_idle_enter_dead; state->enter_s2idle = intel_idle_s2idle; } } @@ -2149,6 +2160,9 @@ static void __init intel_idle_init_cstates_icpu(struct cpuidle_driver *drv) !cpuidle_state_table[cstate].enter_s2idle) break; + if (!cpuidle_state_table[cstate].enter_dead) + cpuidle_state_table[cstate].enter_dead = intel_idle_enter_dead; + /* If marked as unusable, skip this state. */ if (cpuidle_state_table[cstate].flags & CPUIDLE_FLAG_UNUSABLE) { pr_debug("state %s is disabled\n", -- cgit v1.2.3 From 96040f7273e2bc0be1871ad9ed4da7b504da9410 Mon Sep 17 00:00:00 2001 From: Patryk Wlazlyn Date: Wed, 5 Feb 2025 17:52:11 +0200 Subject: x86/smp: Eliminate mwait_play_dead_cpuid_hint() Currently, mwait_play_dead_cpuid_hint() looks up the MWAIT hint of the deepest idle state by inspecting CPUID leaf 0x5 with the assumption that, if the number of sub-states for a given major C-state is nonzero, those sub-states are always represented by consecutive numbers starting from 0. This assumption is not based on the documented platform behavior and in fact it is not met on recent Intel platforms. For example, Intel's Sierra Forest reports two C-states with two sub-states each in CPUID leaf 0x5:

  Name*	target cstate	target subcstate (mwait hint)
  ===========================================================
  C1	0x00		0x00
  C1E	0x00		0x01

  --	0x10		----

  C6S	0x20		0x22
  C6P	0x20		0x23

  --	0x30		----

  /* No more (sub)states all the way down to the end. */
  ===========================================================

* Names of the cstates are not included in the CPUID leaf 0x5; they are taken from the product-specific documentation.
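For reference, an MWAIT hint packs the target C-state into bits 7:4 and the target sub-state into bits 3:0; the "target cstate" column above shows the C-state field already shifted into place. A minimal sketch of the encoding (the mwait_hint() helper is illustrative, not kernel code; the two #defines match the values in arch/x86/include/asm/mwait.h):

	#define MWAIT_SUBSTATE_SIZE	4
	#define MWAIT_SUBSTATE_MASK	0xf

	/* Illustrative helper: build an MWAIT hint from C-state and sub-state. */
	static inline unsigned int mwait_hint(unsigned int cstate, unsigned int substate)
	{
		return (cstate << MWAIT_SUBSTATE_SIZE) | (substate & MWAIT_SUBSTATE_MASK);
	}

	/*
	 * Rows from the table above:
	 *   C1  = mwait_hint(0x0, 0x0) = 0x00
	 *   C1E = mwait_hint(0x0, 0x1) = 0x01
	 *   C6S = mwait_hint(0x2, 0x2) = 0x22
	 *   C6P = mwait_hint(0x2, 0x3) = 0x23
	 */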
Notice that hints 0x20 and 0x21 are not defined for C-state 0x20 (C6), so the existing MWAIT hint lookup in mwait_play_dead_cpuid_hint() based on the CPUID leaf 0x5 contents does not work in this case. Instead of using an MWAIT hint lookup that is not guaranteed to work, make native_play_dead() rely on the idle driver for the given platform to put CPUs going offline into an appropriate idle state and, if that fails, fall back to hlt_play_dead(). Accordingly, drop mwait_play_dead_cpuid_hint() altogether and make native_play_dead() call cpuidle_play_dead() unconditionally instead, with the assumption that it will not return if it succeeds. Still, in case cpuidle_play_dead() fails, call hlt_play_dead() at the end. Signed-off-by: Patryk Wlazlyn Signed-off-by: Artem Bityutskiy Signed-off-by: Dave Hansen Reviewed-by: Gautham R. Shenoy Acked-by: Rafael J. Wysocki Link: https://lore.kernel.org/all/20250205155211.329780-5-artem.bityutskiy%40linux.intel.com --- arch/x86/kernel/smpboot.c | 54 ++++++----------------------------------------- 1 file changed, 7 insertions(+), 47 deletions(-) diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c index 8aad14e43f54..5746084bafe4 100644 --- a/arch/x86/kernel/smpboot.c +++ b/arch/x86/kernel/smpboot.c @@ -1258,6 +1258,10 @@ void play_dead_common(void) local_irq_disable(); } +/* + * We need to flush the caches before going to sleep, lest we have + * dirty data in our caches when we come back up. + */ void __noreturn mwait_play_dead(unsigned int eax_hint) { struct mwait_cpu_dead *md = this_cpu_ptr(&mwait_cpu_dead); @@ -1303,50 +1307,6 @@ void __noreturn mwait_play_dead(unsigned int eax_hint) } } -/* - * We need to flush the caches before going to sleep, lest we have - * dirty data in our caches when we come back up. - */ -static inline void mwait_play_dead_cpuid_hint(void) -{ - unsigned int eax, ebx, ecx, edx; - unsigned int highest_cstate = 0; - unsigned int highest_subcstate = 0; - int i; - - if (boot_cpu_data.x86_vendor == X86_VENDOR_AMD || - boot_cpu_data.x86_vendor == X86_VENDOR_HYGON) - return; - if (!this_cpu_has(X86_FEATURE_MWAIT)) - return; - if (!this_cpu_has(X86_FEATURE_CLFLUSH)) - return; - - eax = CPUID_LEAF_MWAIT; - ecx = 0; - native_cpuid(&eax, &ebx, &ecx, &edx); - - /* - * eax will be 0 if EDX enumeration is not valid. - * Initialized below to cstate, sub_cstate value when EDX is valid. - */ - if (!(ecx & CPUID5_ECX_EXTENSIONS_SUPPORTED)) { - eax = 0; - } else { - edx >>= MWAIT_SUBSTATE_SIZE; - for (i = 0; i < 7 && edx; i++, edx >>= MWAIT_SUBSTATE_SIZE) { - if (edx & MWAIT_SUBSTATE_MASK) { - highest_cstate = i; - highest_subcstate = edx & MWAIT_SUBSTATE_MASK; - } - } - eax = (highest_cstate << MWAIT_SUBSTATE_SIZE) | - (highest_subcstate - 1); - } - - mwait_play_dead(eax); -} - /* * Kick all "offline" CPUs out of mwait on kexec(). See comment in * mwait_play_dead(). @@ -1397,9 +1357,9 @@ void native_play_dead(void) play_dead_common(); tboot_shutdown(TB_SHUTDOWN_WFS); - mwait_play_dead_cpuid_hint(); - if (cpuidle_play_dead()) - hlt_play_dead(); + /* Below returns only on error. */ + cpuidle_play_dead(); + hlt_play_dead(); } #else /* ...
!CONFIG_HOTPLUG_CPU */ -- cgit v1.2.3 From a3e8fe814ad15c16735cdf394454a8bd96eb4d56 Mon Sep 17 00:00:00 2001 From: Brian Gerst Date: Thu, 23 Jan 2025 14:07:33 -0500 Subject: x86/build: Raise the minimum GCC version to 8.1 Stack protector support on 64-bit currently requires that the percpu section is linked at absolute address 0, because older compilers fixed the location of the canary value relative to the GS segment base. GCC 8.1 introduced options to change where the canary value is located, allowing it to be configured as a standard per-CPU variable. This has already been done for 32-bit. Doing the same for 64-bit will enable removing the code needed to support zero-based percpu. Signed-off-by: Brian Gerst Signed-off-by: Ingo Molnar Reviewed-by: Ard Biesheuvel Cc: Linus Torvalds Link: https://lore.kernel.org/r/20250123190747.745588-2-brgerst@gmail.com --- scripts/min-tool-version.sh | 2 ++ 1 file changed, 2 insertions(+) diff --git a/scripts/min-tool-version.sh b/scripts/min-tool-version.sh index 91c91201212c..06c4e410ecab 100755 --- a/scripts/min-tool-version.sh +++ b/scripts/min-tool-version.sh @@ -19,6 +19,8 @@ binutils) gcc) if [ "$ARCH" = parisc64 ]; then echo 12.0.0 + elif [ "$SRCARCH" = x86 ]; then + echo 8.1.0 else echo 5.1.0 fi -- cgit v1.2.3 From 0ee2689b9374d6fd5f43b703713a532278654749 Mon Sep 17 00:00:00 2001 From: Brian Gerst Date: Thu, 23 Jan 2025 14:07:34 -0500 Subject: x86/stackprotector: Remove stack protector test scripts With GCC 8.1 now the minimum supported compiler for x86, these scripts are no longer needed. Signed-off-by: Brian Gerst Signed-off-by: Ingo Molnar Reviewed-by: Ard Biesheuvel Reviewed-by: Uros Bizjak Cc: Linus Torvalds Link: https://lore.kernel.org/r/20250123190747.745588-3-brgerst@gmail.com --- arch/x86/Kconfig | 11 +---------- scripts/gcc-x86_32-has-stack-protector.sh | 8 -------- scripts/gcc-x86_64-has-stack-protector.sh | 4 ---- 3 files changed, 1 insertion(+), 22 deletions(-) delete mode 100755 scripts/gcc-x86_32-has-stack-protector.sh delete mode 100755 scripts/gcc-x86_64-has-stack-protector.sh diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index be2c311f5118..6595b35dd52d 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -285,7 +285,7 @@ config X86 select HAVE_FUNCTION_ARG_ACCESS_API select HAVE_SETUP_PER_CPU_AREA select HAVE_SOFTIRQ_ON_OWN_STACK - select HAVE_STACKPROTECTOR if CC_HAS_SANE_STACKPROTECTOR + select HAVE_STACKPROTECTOR select HAVE_STACK_VALIDATION if HAVE_OBJTOOL select HAVE_STATIC_CALL select HAVE_STATIC_CALL_INLINE if HAVE_OBJTOOL @@ -426,15 +426,6 @@ config PGTABLE_LEVELS default 3 if X86_PAE default 2 -config CC_HAS_SANE_STACKPROTECTOR - bool - default $(success,$(srctree)/scripts/gcc-x86_64-has-stack-protector.sh $(CC) $(CLANG_FLAGS)) if 64BIT - default $(success,$(srctree)/scripts/gcc-x86_32-has-stack-protector.sh $(CC) $(CLANG_FLAGS)) - help - We have to make sure stack protector is unconditionally disabled if - the compiler produces broken code or if it does not let us control - the segment on 32-bit kernels. - menu "Processor type and features" config SMP diff --git a/scripts/gcc-x86_32-has-stack-protector.sh b/scripts/gcc-x86_32-has-stack-protector.sh deleted file mode 100755 index 9459ca4f0f11..000000000000 --- a/scripts/gcc-x86_32-has-stack-protector.sh +++ /dev/null @@ -1,8 +0,0 @@ -#!/bin/sh -# SPDX-License-Identifier: GPL-2.0 - -# This requires GCC 8.1 or better. 
Specifically, we require -# -mstack-protector-guard-reg, added by -# https://gcc.gnu.org/bugzilla/show_bug.cgi?id=81708 - -echo "int foo(void) { char X[200]; return 3; }" | $* -S -x c -m32 -O0 -fstack-protector -mstack-protector-guard-reg=fs -mstack-protector-guard-symbol=__stack_chk_guard - -o - 2> /dev/null | grep -q "%fs" diff --git a/scripts/gcc-x86_64-has-stack-protector.sh b/scripts/gcc-x86_64-has-stack-protector.sh deleted file mode 100755 index f680bb01aeeb..000000000000 --- a/scripts/gcc-x86_64-has-stack-protector.sh +++ /dev/null @@ -1,4 +0,0 @@ -#!/bin/sh -# SPDX-License-Identifier: GPL-2.0 - -echo "int foo(void) { char X[200]; return 3; }" | $* -S -x c -m64 -O0 -mcmodel=kernel -fno-PIE -fstack-protector - -o - 2> /dev/null | grep -q "%gs" -- cgit v1.2.3 From a9a76b38aaf577887103e3ebb41d70e6aa5a4b19 Mon Sep 17 00:00:00 2001 From: Brian Gerst Date: Thu, 23 Jan 2025 14:07:35 -0500 Subject: x86/boot: Disable stack protector for early boot code On 64-bit, this will prevent crashes when the canary access is changed from %gs:40 to %gs:__stack_chk_guard(%rip). RIP-relative addresses from the identity-mapped early boot code will target the wrong address with zero-based percpu. KASLR could then shift that address to an unmapped page causing a crash on boot. This early boot code runs well before user-space is active and does not need stack protector enabled. Signed-off-by: Brian Gerst Signed-off-by: Ingo Molnar Reviewed-by: Ard Biesheuvel Cc: Linus Torvalds Link: https://lore.kernel.org/r/20250123190747.745588-4-brgerst@gmail.com --- arch/x86/kernel/Makefile | 2 ++ 1 file changed, 2 insertions(+) diff --git a/arch/x86/kernel/Makefile b/arch/x86/kernel/Makefile index b43eb7e384eb..84cfa179802c 100644 --- a/arch/x86/kernel/Makefile +++ b/arch/x86/kernel/Makefile @@ -44,6 +44,8 @@ KCOV_INSTRUMENT_unwind_orc.o := n KCOV_INSTRUMENT_unwind_frame.o := n KCOV_INSTRUMENT_unwind_guess.o := n +CFLAGS_head32.o := -fno-stack-protector +CFLAGS_head64.o := -fno-stack-protector CFLAGS_irq.o := -I $(src)/../include/asm/trace obj-y += head_$(BITS).o -- cgit v1.2.3 From f58b63857ae38b4484185b799a2759274b930c92 Mon Sep 17 00:00:00 2001 From: Brian Gerst Date: Thu, 23 Jan 2025 14:07:36 -0500 Subject: x86/pvh: Use fixed_percpu_data for early boot GSBASE Instead of having a private area for the stack canary, use fixed_percpu_data for GSBASE like the native kernel. Signed-off-by: Brian Gerst Signed-off-by: Ingo Molnar Reviewed-by: Ard Biesheuvel Cc: Linus Torvalds Link: https://lore.kernel.org/r/20250123190747.745588-5-brgerst@gmail.com --- arch/x86/platform/pvh/head.S | 15 +++++++++------ 1 file changed, 9 insertions(+), 6 deletions(-) diff --git a/arch/x86/platform/pvh/head.S b/arch/x86/platform/pvh/head.S index 4733a5f467b8..723f181b222a 100644 --- a/arch/x86/platform/pvh/head.S +++ b/arch/x86/platform/pvh/head.S @@ -173,10 +173,15 @@ SYM_CODE_START(pvh_start_xen) 1: UNWIND_HINT_END_OF_STACK - /* Set base address in stack canary descriptor. */ - mov $MSR_GS_BASE,%ecx - leal canary(%rip), %eax - xor %edx, %edx + /* + * Set up GSBASE. + * Note that on SMP the boot CPU uses the init data section until + * the per-CPU areas are set up. 
+ */ + movl $MSR_GS_BASE,%ecx + leaq INIT_PER_CPU_VAR(fixed_percpu_data)(%rip), %rdx + movl %edx, %eax + shrq $32, %rdx wrmsr /* Call xen_prepare_pvh() via the kernel virtual mapping */ @@ -238,8 +243,6 @@ SYM_DATA_START_LOCAL(gdt_start) SYM_DATA_END_LABEL(gdt_start, SYM_L_LOCAL, gdt_end) .balign 16 -SYM_DATA_LOCAL(canary, .fill 48, 1, 0) - SYM_DATA_START_LOCAL(early_stack) .fill BOOT_STACK_SIZE, 1, 0 SYM_DATA_END_LABEL(early_stack, SYM_L_LOCAL, early_stack_end) -- cgit v1.2.3 From cb7927fda002ca49ae62e2782c1692acc7b80c67 Mon Sep 17 00:00:00 2001 From: Brian Gerst Date: Thu, 23 Jan 2025 14:07:37 -0500 Subject: x86/relocs: Handle R_X86_64_REX_GOTPCRELX relocations Clang may produce R_X86_64_REX_GOTPCRELX relocations when redefining the stack protector location. Treat them as another type of PC-relative relocation. Signed-off-by: Brian Gerst Signed-off-by: Ingo Molnar Reviewed-by: Ard Biesheuvel Cc: Linus Torvalds Link: https://lore.kernel.org/r/20250123190747.745588-6-brgerst@gmail.com --- arch/x86/tools/relocs.c | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/arch/x86/tools/relocs.c b/arch/x86/tools/relocs.c index e937be979ec8..92a1e503305e 100644 --- a/arch/x86/tools/relocs.c +++ b/arch/x86/tools/relocs.c @@ -32,6 +32,11 @@ static struct relocs relocs32; static struct relocs relocs32neg; static struct relocs relocs64; # define FMT PRIu64 + +#ifndef R_X86_64_REX_GOTPCRELX +# define R_X86_64_REX_GOTPCRELX 42 +#endif + #else # define FMT PRIu32 #endif @@ -227,6 +232,7 @@ static const char *rel_type(unsigned type) REL_TYPE(R_X86_64_PC16), REL_TYPE(R_X86_64_8), REL_TYPE(R_X86_64_PC8), + REL_TYPE(R_X86_64_REX_GOTPCRELX), #else REL_TYPE(R_386_NONE), REL_TYPE(R_386_32), @@ -861,6 +867,7 @@ static int do_reloc64(struct section *sec, Elf_Rel *rel, ElfW(Sym) *sym, case R_X86_64_PC32: case R_X86_64_PLT32: + case R_X86_64_REX_GOTPCRELX: /* * PC relative relocations don't need to be adjusted unless * referencing a percpu symbol. -- cgit v1.2.3 From 78c4374ef8b842c6abf195d6f963853c7ec464d2 Mon Sep 17 00:00:00 2001 From: Ard Biesheuvel Date: Thu, 23 Jan 2025 14:07:38 -0500 Subject: x86/module: Deal with GOT based stack cookie load on Clang < 17 Clang versions before 17 will not honour -fdirect-access-external-data for the load of the stack cookie emitted into each function's prologue and epilogue. This is not an issue for the core kernel, as the linker will relax these loads into LEA instructions that take the address of __stack_chk_guard directly. For modules, however, we need to work around this, by dealing with R_X86_64_REX_GOTPCRELX relocations that refer to __stack_chk_guard. In this case, given that this is a GOT load, the reference should not refer to __stack_chk_guard directly, but to a memory location that holds its address. So take the address of __stack_chk_guard into a static variable, and fix up the relocations to refer to that. [ mingo: Fix broken R_X86_64_GOTPCRELX definition.
] Signed-off-by: Ard Biesheuvel Signed-off-by: Brian Gerst Signed-off-by: Ingo Molnar Cc: Linus Torvalds Link: https://lore.kernel.org/r/20250123190747.745588-7-brgerst@gmail.com --- arch/x86/include/asm/elf.h | 5 +++-- arch/x86/kernel/module.c | 15 +++++++++++++++ 2 files changed, 18 insertions(+), 2 deletions(-) diff --git a/arch/x86/include/asm/elf.h b/arch/x86/include/asm/elf.h index 1fb83d47711f..128602612eca 100644 --- a/arch/x86/include/asm/elf.h +++ b/arch/x86/include/asm/elf.h @@ -54,8 +54,9 @@ typedef struct user_i387_struct elf_fpregset_t; #define R_X86_64_GLOB_DAT 6 /* Create GOT entry */ #define R_X86_64_JUMP_SLOT 7 /* Create PLT entry */ #define R_X86_64_RELATIVE 8 /* Adjust by program base */ -#define R_X86_64_GOTPCREL 9 /* 32 bit signed pc relative - offset to GOT */ +#define R_X86_64_GOTPCREL 9 /* 32 bit signed pc relative offset to GOT */ +#define R_X86_64_GOTPCRELX 41 +#define R_X86_64_REX_GOTPCRELX 42 #define R_X86_64_32 10 /* Direct 32 bit zero extended */ #define R_X86_64_32S 11 /* Direct 32 bit sign extended */ #define R_X86_64_16 12 /* Direct 16 bit zero extended */ diff --git a/arch/x86/kernel/module.c b/arch/x86/kernel/module.c index 8984abd91c00..a286f32c5503 100644 --- a/arch/x86/kernel/module.c +++ b/arch/x86/kernel/module.c @@ -19,6 +19,7 @@ #include #include #include +#include #include #include @@ -130,6 +131,20 @@ static int __write_relocate_add(Elf64_Shdr *sechdrs, goto overflow; size = 4; break; +#if defined(CONFIG_STACKPROTECTOR) && \ + defined(CONFIG_CC_IS_CLANG) && CONFIG_CLANG_VERSION < 170000 + case R_X86_64_REX_GOTPCRELX: { + static unsigned long __percpu *const addr = &__stack_chk_guard; + + if (sym->st_value != (u64)addr) { + pr_err("%s: Unsupported GOTPCREL relocation\n", me->name); + return -ENOEXEC; + } + + val = (u64)&addr + rel[i].r_addend; + fallthrough; + } +#endif case R_X86_64_PC32: case R_X86_64_PLT32: val -= (u64)loc; -- cgit v1.2.3 From 80d47defddc000271502057ebd7efa4fd6481542 Mon Sep 17 00:00:00 2001 From: Brian Gerst Date: Thu, 23 Jan 2025 14:07:39 -0500 Subject: x86/stackprotector/64: Convert to normal per-CPU variable Older versions of GCC fixed the location of the stack protector canary at %gs:40. This constraint forced the percpu section to be linked at absolute address 0 so that the canary could be the first data object in the percpu section. Supporting the zero-based percpu section requires additional code to handle relocations for RIP-relative references to percpu data, extra complexity to kallsyms, and workarounds for linker bugs due to the use of absolute symbols. GCC 8.1 supports redefining where the canary is located, allowing it to become a normal percpu variable instead of at a fixed location. This removes the constraint that the percpu section must be zero-based. 
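To make the two code generation models concrete, here is a sketch (the sample function mirrors the one used by the test scripts removed earlier in this series; the emitted instruction forms are illustrative of what these GCC options produce, not taken from the patch):

	/* canary-demo.c -- illustrative only, not kernel code */
	int foo(void)
	{
		char X[200];	/* large enough for -fstack-protector to instrument */
		return 3;
	}

Compiled with plain -fstack-protector, GCC 7 and older hard-code the x86-64 canary access as a fixed-offset load of the form mov %gs:40, %rax. With -mstack-protector-guard-reg=gs plus -mstack-protector-guard-symbol=__stack_chk_guard (available since GCC 8.1), the prologue and epilogue instead reference the named per-CPU symbol, e.g. an access of the form mov %gs:__stack_chk_guard, %rax, which is what frees the percpu section from having to be linked at address 0.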
Signed-off-by: Brian Gerst Signed-off-by: Ingo Molnar Reviewed-by: Ard Biesheuvel Reviewed-by: Uros Bizjak Cc: Linus Torvalds Link: https://lore.kernel.org/r/20250123190747.745588-8-brgerst@gmail.com --- arch/x86/Makefile | 20 +++++++++++-------- arch/x86/entry/entry.S | 2 -- arch/x86/entry/entry_64.S | 2 +- arch/x86/include/asm/processor.h | 16 ++-------------- arch/x86/include/asm/stackprotector.h | 36 +++++------------------------------ arch/x86/kernel/asm-offsets_64.c | 6 ------ arch/x86/kernel/cpu/common.c | 5 +---- arch/x86/kernel/head_64.S | 3 +-- arch/x86/xen/xen-head.S | 3 +-- 9 files changed, 23 insertions(+), 70 deletions(-) diff --git a/arch/x86/Makefile b/arch/x86/Makefile index 5b773b34768d..88a1705366f9 100644 --- a/arch/x86/Makefile +++ b/arch/x86/Makefile @@ -140,14 +140,7 @@ ifeq ($(CONFIG_X86_32),y) # temporary until string.h is fixed KBUILD_CFLAGS += -ffreestanding - ifeq ($(CONFIG_STACKPROTECTOR),y) - ifeq ($(CONFIG_SMP),y) - KBUILD_CFLAGS += -mstack-protector-guard-reg=fs \ - -mstack-protector-guard-symbol=__ref_stack_chk_guard - else - KBUILD_CFLAGS += -mstack-protector-guard=global - endif - endif + percpu_seg := fs else BITS := 64 UTS_MACHINE := x86_64 @@ -197,6 +190,17 @@ else KBUILD_CFLAGS += -mcmodel=kernel KBUILD_RUSTFLAGS += -Cno-redzone=y KBUILD_RUSTFLAGS += -Ccode-model=kernel + + percpu_seg := gs +endif + +ifeq ($(CONFIG_STACKPROTECTOR),y) + ifeq ($(CONFIG_SMP),y) + KBUILD_CFLAGS += -mstack-protector-guard-reg=$(percpu_seg) + KBUILD_CFLAGS += -mstack-protector-guard-symbol=__ref_stack_chk_guard + else + KBUILD_CFLAGS += -mstack-protector-guard=global + endif endif # diff --git a/arch/x86/entry/entry.S b/arch/x86/entry/entry.S index b7ea3e8e9ecc..fe5344a249a1 100644 --- a/arch/x86/entry/entry.S +++ b/arch/x86/entry/entry.S @@ -52,7 +52,6 @@ EXPORT_SYMBOL_GPL(mds_verw_sel); THUNK warn_thunk_thunk, __warn_thunk -#ifndef CONFIG_X86_64 /* * Clang's implementation of TLS stack cookies requires the variable in * question to be a TLS variable. If the variable happens to be defined as an @@ -66,4 +65,3 @@ THUNK warn_thunk_thunk, __warn_thunk #ifdef CONFIG_STACKPROTECTOR EXPORT_SYMBOL(__ref_stack_chk_guard); #endif -#endif diff --git a/arch/x86/entry/entry_64.S b/arch/x86/entry/entry_64.S index f52dbe0ad93c..33a955aa01d8 100644 --- a/arch/x86/entry/entry_64.S +++ b/arch/x86/entry/entry_64.S @@ -192,7 +192,7 @@ SYM_FUNC_START(__switch_to_asm) #ifdef CONFIG_STACKPROTECTOR movq TASK_stack_canary(%rsi), %rbx - movq %rbx, PER_CPU_VAR(fixed_percpu_data + FIXED_stack_canary) + movq %rbx, PER_CPU_VAR(__stack_chk_guard) #endif /* diff --git a/arch/x86/include/asm/processor.h b/arch/x86/include/asm/processor.h index c0cd10182e90..a4687122951f 100644 --- a/arch/x86/include/asm/processor.h +++ b/arch/x86/include/asm/processor.h @@ -422,16 +422,8 @@ struct irq_stack { #ifdef CONFIG_X86_64 struct fixed_percpu_data { - /* - * GCC hardcodes the stack canary as %gs:40. Since the - * irq_stack is the object at %gs:0, we reserve the bottom - * 48 bytes of the irq stack for the canary. - * - * Once we are willing to require -mstack-protector-guard-symbol= - * support for x86_64 stackprotector, we can get rid of this. 
- */ char gs_base[40]; - unsigned long stack_canary; + unsigned long reserved; }; DECLARE_PER_CPU_FIRST(struct fixed_percpu_data, fixed_percpu_data) __visible; @@ -446,11 +438,7 @@ extern asmlinkage void entry_SYSCALL32_ignore(void); /* Save actual FS/GS selectors and bases to current->thread */ void current_save_fsgs(void); -#else /* X86_64 */ -#ifdef CONFIG_STACKPROTECTOR -DECLARE_PER_CPU(unsigned long, __stack_chk_guard); -#endif -#endif /* !X86_64 */ +#endif /* X86_64 */ struct perf_event; diff --git a/arch/x86/include/asm/stackprotector.h b/arch/x86/include/asm/stackprotector.h index 00473a650f51..d43fb589fcf6 100644 --- a/arch/x86/include/asm/stackprotector.h +++ b/arch/x86/include/asm/stackprotector.h @@ -2,26 +2,10 @@ /* * GCC stack protector support. * - * Stack protector works by putting predefined pattern at the start of + * Stack protector works by putting a predefined pattern at the start of * the stack frame and verifying that it hasn't been overwritten when - * returning from the function. The pattern is called stack canary - * and unfortunately gcc historically required it to be at a fixed offset - * from the percpu segment base. On x86_64, the offset is 40 bytes. - * - * The same segment is shared by percpu area and stack canary. On - * x86_64, percpu symbols are zero based and %gs (64-bit) points to the - * base of percpu area. The first occupant of the percpu area is always - * fixed_percpu_data which contains stack_canary at the appropriate - * offset. On x86_32, the stack canary is just a regular percpu - * variable. - * - * Putting percpu data in %fs on 32-bit is a minor optimization compared to - * using %gs. Since 32-bit userspace normally has %fs == 0, we are likely - * to load 0 into %fs on exit to usermode, whereas with percpu data in - * %gs, we are likely to load a non-null %gs on return to user mode. - * - * Once we are willing to require GCC 8.1 or better for 64-bit stackprotector - * support, we can remove some of this complexity. + * returning from the function. The pattern is called the stack canary + * and is a unique value for each task. */ #ifndef _ASM_STACKPROTECTOR_H @@ -36,6 +20,8 @@ #include +DECLARE_PER_CPU(unsigned long, __stack_chk_guard); + /* * Initialize the stackprotector canary value. 
* @@ -51,25 +37,13 @@ static __always_inline void boot_init_stack_canary(void) { unsigned long canary = get_random_canary(); -#ifdef CONFIG_X86_64 - BUILD_BUG_ON(offsetof(struct fixed_percpu_data, stack_canary) != 40); -#endif - current->stack_canary = canary; -#ifdef CONFIG_X86_64 - this_cpu_write(fixed_percpu_data.stack_canary, canary); -#else this_cpu_write(__stack_chk_guard, canary); -#endif } static inline void cpu_init_stack_canary(int cpu, struct task_struct *idle) { -#ifdef CONFIG_X86_64 - per_cpu(fixed_percpu_data.stack_canary, cpu) = idle->stack_canary; -#else per_cpu(__stack_chk_guard, cpu) = idle->stack_canary; -#endif } #else /* STACKPROTECTOR */ diff --git a/arch/x86/kernel/asm-offsets_64.c b/arch/x86/kernel/asm-offsets_64.c index bb65371ea9df..590b6cd0eac0 100644 --- a/arch/x86/kernel/asm-offsets_64.c +++ b/arch/x86/kernel/asm-offsets_64.c @@ -54,11 +54,5 @@ int main(void) BLANK(); #undef ENTRY - BLANK(); - -#ifdef CONFIG_STACKPROTECTOR - OFFSET(FIXED_stack_canary, fixed_percpu_data, stack_canary); - BLANK(); -#endif return 0; } diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c index 7cce91b19fb2..b71178f0ed6c 100644 --- a/arch/x86/kernel/cpu/common.c +++ b/arch/x86/kernel/cpu/common.c @@ -2089,8 +2089,7 @@ void syscall_init(void) if (!cpu_feature_enabled(X86_FEATURE_FRED)) idt_syscall_init(); } - -#else /* CONFIG_X86_64 */ +#endif /* CONFIG_X86_64 */ #ifdef CONFIG_STACKPROTECTOR DEFINE_PER_CPU(unsigned long, __stack_chk_guard); @@ -2099,8 +2098,6 @@ EXPORT_PER_CPU_SYMBOL(__stack_chk_guard); #endif #endif -#endif /* CONFIG_X86_64 */ - /* * Clear all 6 debug registers: */ diff --git a/arch/x86/kernel/head_64.S b/arch/x86/kernel/head_64.S index 31345e0ba006..c3d73c04603f 100644 --- a/arch/x86/kernel/head_64.S +++ b/arch/x86/kernel/head_64.S @@ -361,8 +361,7 @@ SYM_INNER_LABEL(common_startup_64, SYM_L_LOCAL) /* Set up %gs. * - * The base of %gs always points to fixed_percpu_data. If the - * stack protector canary is enabled, it is located at %gs:40. + * The base of %gs always points to fixed_percpu_data. * Note that, on SMP, the boot cpu uses init data section until * the per cpu areas are set up. */ diff --git a/arch/x86/xen/xen-head.S b/arch/x86/xen/xen-head.S index 894edf8d6d62..a31b057e641e 100644 --- a/arch/x86/xen/xen-head.S +++ b/arch/x86/xen/xen-head.S @@ -33,8 +33,7 @@ SYM_CODE_START(startup_xen) /* Set up %gs. * - * The base of %gs always points to fixed_percpu_data. If the - * stack protector canary is enabled, it is located at %gs:40. + * The base of %gs always points to fixed_percpu_data. * Note that, on SMP, the boot cpu uses init data section until * the per cpu areas are set up. */ -- cgit v1.2.3 From 9d7de2aa8b41407bc96d89a80dc1fd637d389d42 Mon Sep 17 00:00:00 2001 From: Brian Gerst Date: Thu, 23 Jan 2025 14:07:40 -0500 Subject: x86/percpu/64: Use relative percpu offsets The percpu section is currently linked at absolute address 0, because older compilers hard-coded the stack protector canary value at a fixed offset from the start of the GS segment. Now that the canary is a normal percpu variable, the percpu section does not need to be linked at a specific address. x86-64 will now calculate the percpu offsets as the delta between the initial percpu address and the dynamically allocated memory, like other architectures. Note that GSBASE is limited to the canonical address width (48 or 57 bits, sign-extended). 
As long as the kernel text, modules, and the dynamically allocated percpu memory are all in the negative address space, the delta will not overflow this limit. Signed-off-by: Brian Gerst Signed-off-by: Ingo Molnar Reviewed-by: Ard Biesheuvel Reviewed-by: Uros Bizjak Cc: Linus Torvalds Link: https://lore.kernel.org/r/20250123190747.745588-9-brgerst@gmail.com --- arch/x86/include/asm/processor.h | 6 +++++- arch/x86/kernel/head_64.S | 19 +++++++++---------- arch/x86/kernel/setup_percpu.c | 12 ++---------- arch/x86/kernel/vmlinux.lds.S | 29 +---------------------------- arch/x86/platform/pvh/head.S | 5 ++--- arch/x86/tools/relocs.c | 10 +++------- arch/x86/xen/xen-head.S | 9 ++++----- init/Kconfig | 2 +- 8 files changed, 27 insertions(+), 65 deletions(-) diff --git a/arch/x86/include/asm/processor.h b/arch/x86/include/asm/processor.h index a4687122951f..b8fee88dac3d 100644 --- a/arch/x86/include/asm/processor.h +++ b/arch/x86/include/asm/processor.h @@ -431,7 +431,11 @@ DECLARE_INIT_PER_CPU(fixed_percpu_data); static inline unsigned long cpu_kernelmode_gs_base(int cpu) { - return (unsigned long)per_cpu(fixed_percpu_data.gs_base, cpu); +#ifdef CONFIG_SMP + return per_cpu_offset(cpu); +#else + return 0; +#endif } extern asmlinkage void entry_SYSCALL32_ignore(void); diff --git a/arch/x86/kernel/head_64.S b/arch/x86/kernel/head_64.S index c3d73c04603f..2843b0a56198 100644 --- a/arch/x86/kernel/head_64.S +++ b/arch/x86/kernel/head_64.S @@ -61,11 +61,14 @@ SYM_CODE_START_NOALIGN(startup_64) /* Set up the stack for verify_cpu() */ leaq __top_init_kernel_stack(%rip), %rsp - /* Setup GSBASE to allow stack canary access for C code */ + /* + * Set up GSBASE. + * Note that on SMP the boot CPU uses the init data section until + * the per-CPU areas are set up. + */ movl $MSR_GS_BASE, %ecx - leaq INIT_PER_CPU_VAR(fixed_percpu_data)(%rip), %rdx - movl %edx, %eax - shrq $32, %rdx + xorl %eax, %eax + xorl %edx, %edx wrmsr call startup_64_setup_gdt_idt @@ -359,16 +362,12 @@ SYM_INNER_LABEL(common_startup_64, SYM_L_LOCAL) movl %eax,%fs movl %eax,%gs - /* Set up %gs. - * - * The base of %gs always points to fixed_percpu_data. + /* + * Set up GSBASE. * Note that, on SMP, the boot cpu uses init data section until * the per cpu areas are set up. */ movl $MSR_GS_BASE,%ecx -#ifndef CONFIG_SMP - leaq INIT_PER_CPU_VAR(fixed_percpu_data)(%rip), %rdx -#endif movl %edx, %eax shrq $32, %rdx wrmsr diff --git a/arch/x86/kernel/setup_percpu.c b/arch/x86/kernel/setup_percpu.c index b30d6e180df7..1e7be9409aa2 100644 --- a/arch/x86/kernel/setup_percpu.c +++ b/arch/x86/kernel/setup_percpu.c @@ -23,18 +23,10 @@ #include #include -#ifdef CONFIG_X86_64 -#define BOOT_PERCPU_OFFSET ((unsigned long)__per_cpu_load) -#else -#define BOOT_PERCPU_OFFSET 0 -#endif - -DEFINE_PER_CPU_READ_MOSTLY(unsigned long, this_cpu_off) = BOOT_PERCPU_OFFSET; +DEFINE_PER_CPU_READ_MOSTLY(unsigned long, this_cpu_off); EXPORT_PER_CPU_SYMBOL(this_cpu_off); -unsigned long __per_cpu_offset[NR_CPUS] __ro_after_init = { - [0 ... 
NR_CPUS-1] = BOOT_PERCPU_OFFSET, -}; +unsigned long __per_cpu_offset[NR_CPUS] __ro_after_init; EXPORT_SYMBOL(__per_cpu_offset); /* diff --git a/arch/x86/kernel/vmlinux.lds.S b/arch/x86/kernel/vmlinux.lds.S index 0deb4887d6e9..8a598515239a 100644 --- a/arch/x86/kernel/vmlinux.lds.S +++ b/arch/x86/kernel/vmlinux.lds.S @@ -112,12 +112,6 @@ ASSERT(__relocate_kernel_end - __relocate_kernel_start <= KEXEC_CONTROL_CODE_MAX PHDRS { text PT_LOAD FLAGS(5); /* R_E */ data PT_LOAD FLAGS(6); /* RW_ */ -#ifdef CONFIG_X86_64 -#ifdef CONFIG_SMP - percpu PT_LOAD FLAGS(6); /* RW_ */ -#endif - init PT_LOAD FLAGS(7); /* RWE */ -#endif note PT_NOTE FLAGS(0); /* ___ */ } @@ -216,21 +210,7 @@ SECTIONS __init_begin = .; /* paired with __init_end */ } -#if defined(CONFIG_X86_64) && defined(CONFIG_SMP) - /* - * percpu offsets are zero-based on SMP. PERCPU_VADDR() changes the - * output PHDR, so the next output section - .init.text - should - * start another segment - init. - */ - PERCPU_VADDR(INTERNODE_CACHE_BYTES, 0, :percpu) - ASSERT(SIZEOF(.data..percpu) < CONFIG_PHYSICAL_START, - "per-CPU data too large - increase CONFIG_PHYSICAL_START") -#endif - INIT_TEXT_SECTION(PAGE_SIZE) -#ifdef CONFIG_X86_64 - :init -#endif /* * Section for code used exclusively before alternatives are run. All @@ -347,9 +327,7 @@ SECTIONS EXIT_DATA } -#if !defined(CONFIG_X86_64) || !defined(CONFIG_SMP) PERCPU_SECTION(INTERNODE_CACHE_BYTES) -#endif RUNTIME_CONST_VARIABLES RUNTIME_CONST(ptr, USER_PTR_MAX) @@ -497,16 +475,11 @@ PROVIDE(__ref_stack_chk_guard = __stack_chk_guard); * Per-cpu symbols which need to be offset from __per_cpu_load * for the boot processor. */ -#define INIT_PER_CPU(x) init_per_cpu__##x = ABSOLUTE(x) + __per_cpu_load +#define INIT_PER_CPU(x) init_per_cpu__##x = ABSOLUTE(x) INIT_PER_CPU(gdt_page); INIT_PER_CPU(fixed_percpu_data); INIT_PER_CPU(irq_stack_backing_store); -#ifdef CONFIG_SMP -. = ASSERT((fixed_percpu_data == 0), - "fixed_percpu_data is not at start of per-cpu area"); -#endif - #ifdef CONFIG_MITIGATION_UNRET_ENTRY . = ASSERT((retbleed_return_thunk & 0x3f) == 0, "retbleed_return_thunk not cacheline-aligned"); #endif diff --git a/arch/x86/platform/pvh/head.S b/arch/x86/platform/pvh/head.S index 723f181b222a..cfa18ec7d55f 100644 --- a/arch/x86/platform/pvh/head.S +++ b/arch/x86/platform/pvh/head.S @@ -179,9 +179,8 @@ SYM_CODE_START(pvh_start_xen) * the per-CPU areas are set up. 
*/ movl $MSR_GS_BASE,%ecx - leaq INIT_PER_CPU_VAR(fixed_percpu_data)(%rip), %rdx - movl %edx, %eax - shrq $32, %rdx + xorl %eax, %eax + xorl %edx, %edx wrmsr /* Call xen_prepare_pvh() via the kernel virtual mapping */ diff --git a/arch/x86/tools/relocs.c b/arch/x86/tools/relocs.c index 92a1e503305e..3cb3b30b6706 100644 --- a/arch/x86/tools/relocs.c +++ b/arch/x86/tools/relocs.c @@ -835,12 +835,7 @@ static void percpu_init(void) */ static int is_percpu_sym(ElfW(Sym) *sym, const char *symname) { - int shndx = sym_index(sym); - - return (shndx == per_cpu_shndx) && - strcmp(symname, "__init_begin") && - strcmp(symname, "__per_cpu_load") && - strncmp(symname, "init_per_cpu_", 13); + return 0; } @@ -1062,7 +1057,8 @@ static int cmp_relocs(const void *va, const void *vb) static void sort_relocs(struct relocs *r) { - qsort(r->offset, r->count, sizeof(r->offset[0]), cmp_relocs); + if (r->count) + qsort(r->offset, r->count, sizeof(r->offset[0]), cmp_relocs); } static int write32(uint32_t v, FILE *f) diff --git a/arch/x86/xen/xen-head.S b/arch/x86/xen/xen-head.S index a31b057e641e..5ccb4c54e241 100644 --- a/arch/x86/xen/xen-head.S +++ b/arch/x86/xen/xen-head.S @@ -31,15 +31,14 @@ SYM_CODE_START(startup_xen) leaq __top_init_kernel_stack(%rip), %rsp - /* Set up %gs. - * - * The base of %gs always points to fixed_percpu_data. + /* + * Set up GSBASE. * Note that, on SMP, the boot cpu uses init data section until * the per cpu areas are set up. */ movl $MSR_GS_BASE,%ecx - movq $INIT_PER_CPU_VAR(fixed_percpu_data),%rax - cdq + xorl %eax, %eax + xorl %edx, %edx wrmsr mov %rsi, %rdi diff --git a/init/Kconfig b/init/Kconfig index d0d021b3fa3b..b5d9c0fa69f6 100644 --- a/init/Kconfig +++ b/init/Kconfig @@ -1872,7 +1872,7 @@ config KALLSYMS_ALL config KALLSYMS_ABSOLUTE_PERCPU bool depends on KALLSYMS - default X86_64 && SMP + default n # end of the "standard kernel features (expert users)" menu -- cgit v1.2.3 From b5c4f95351a097a635c1a7fc8d9efa18308491b5 Mon Sep 17 00:00:00 2001 From: Brian Gerst Date: Thu, 23 Jan 2025 14:07:41 -0500 Subject: x86/percpu/64: Remove fixed_percpu_data Now that the stack protector canary value is a normal percpu variable, fixed_percpu_data is unused and can be removed.
Signed-off-by: Brian Gerst Signed-off-by: Ingo Molnar Reviewed-by: Ard Biesheuvel Reviewed-by: Uros Bizjak Cc: Linus Torvalds Link: https://lore.kernel.org/r/20250123190747.745588-10-brgerst@gmail.com --- arch/x86/include/asm/processor.h | 8 -------- arch/x86/kernel/cpu/common.c | 4 ---- arch/x86/kernel/vmlinux.lds.S | 1 - arch/x86/tools/relocs.c | 1 - 4 files changed, 14 deletions(-) diff --git a/arch/x86/include/asm/processor.h b/arch/x86/include/asm/processor.h index b8fee88dac3d..b3d153730f63 100644 --- a/arch/x86/include/asm/processor.h +++ b/arch/x86/include/asm/processor.h @@ -421,14 +421,6 @@ struct irq_stack { } __aligned(IRQ_STACK_SIZE); #ifdef CONFIG_X86_64 -struct fixed_percpu_data { - char gs_base[40]; - unsigned long reserved; -}; - -DECLARE_PER_CPU_FIRST(struct fixed_percpu_data, fixed_percpu_data) __visible; -DECLARE_INIT_PER_CPU(fixed_percpu_data); - static inline unsigned long cpu_kernelmode_gs_base(int cpu) { #ifdef CONFIG_SMP diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c index b71178f0ed6c..8b49b1338f76 100644 --- a/arch/x86/kernel/cpu/common.c +++ b/arch/x86/kernel/cpu/common.c @@ -2023,10 +2023,6 @@ EXPORT_PER_CPU_SYMBOL(pcpu_hot); EXPORT_PER_CPU_SYMBOL(const_pcpu_hot); #ifdef CONFIG_X86_64 -DEFINE_PER_CPU_FIRST(struct fixed_percpu_data, - fixed_percpu_data) __aligned(PAGE_SIZE) __visible; -EXPORT_PER_CPU_SYMBOL_GPL(fixed_percpu_data); - static void wrmsrl_cstar(unsigned long val) { /* diff --git a/arch/x86/kernel/vmlinux.lds.S b/arch/x86/kernel/vmlinux.lds.S index 8a598515239a..93c2fa8a7522 100644 --- a/arch/x86/kernel/vmlinux.lds.S +++ b/arch/x86/kernel/vmlinux.lds.S @@ -477,7 +477,6 @@ PROVIDE(__ref_stack_chk_guard = __stack_chk_guard); */ #define INIT_PER_CPU(x) init_per_cpu__##x = ABSOLUTE(x) INIT_PER_CPU(gdt_page); -INIT_PER_CPU(fixed_percpu_data); INIT_PER_CPU(irq_stack_backing_store); #ifdef CONFIG_MITIGATION_UNRET_ENTRY diff --git a/arch/x86/tools/relocs.c b/arch/x86/tools/relocs.c index 3cb3b30b6706..b5e3695a0615 100644 --- a/arch/x86/tools/relocs.c +++ b/arch/x86/tools/relocs.c @@ -830,7 +830,6 @@ static void percpu_init(void) * __per_cpu_load * * The "gold" linker incorrectly associates: - * init_per_cpu__fixed_percpu_data * init_per_cpu__gdt_page */ static int is_percpu_sym(ElfW(Sym) *sym, const char *symname) -- cgit v1.2.3 From a8327be7b2aa067ff2b11551732d5bd8b49ef7d1 Mon Sep 17 00:00:00 2001 From: Brian Gerst Date: Thu, 23 Jan 2025 14:07:42 -0500 Subject: x86/boot/64: Remove inverse relocations Inverse relocations were needed to offset the effects of relocation for RIP-relative accesses to zero-based percpu data. Now that the percpu section is linked normally as part of the kernel image, they are no longer needed. Signed-off-by: Brian Gerst Signed-off-by: Ingo Molnar Reviewed-by: Ard Biesheuvel Cc: Linus Torvalds Link: https://lore.kernel.org/r/20250123190747.745588-11-brgerst@gmail.com --- arch/x86/boot/compressed/misc.c | 14 +---- arch/x86/tools/relocs.c | 130 +--------------------------------------- 2 files changed, 2 insertions(+), 142 deletions(-) diff --git a/arch/x86/boot/compressed/misc.c b/arch/x86/boot/compressed/misc.c index 0d37420cad02..1cdcd4aaf395 100644 --- a/arch/x86/boot/compressed/misc.c +++ b/arch/x86/boot/compressed/misc.c @@ -235,7 +235,7 @@ static void handle_relocations(void *output, unsigned long output_len, /* * Process relocations: 32 bit relocations first then 64 bit after. 
- * Three sets of binary relocations are added to the end of the kernel + * Two sets of binary relocations are added to the end of the kernel * before compression. Each relocation table entry is the kernel * address of the location which needs to be updated stored as a * 32-bit value which is sign extended to 64 bits. @@ -245,8 +245,6 @@ static void handle_relocations(void *output, unsigned long output_len, * kernel bits... * 0 - zero terminator for 64 bit relocations * 64 bit relocation repeated - * 0 - zero terminator for inverse 32 bit relocations - * 32 bit inverse relocation repeated * 0 - zero terminator for 32 bit relocations * 32 bit relocation repeated * @@ -263,16 +261,6 @@ static void handle_relocations(void *output, unsigned long output_len, *(uint32_t *)ptr += delta; } #ifdef CONFIG_X86_64 - while (*--reloc) { - long extended = *reloc; - extended += map; - - ptr = (unsigned long)extended; - if (ptr < min_addr || ptr > max_addr) - error("inverse 32-bit relocation outside of kernel!\n"); - - *(int32_t *)ptr -= delta; - } for (reloc--; *reloc; reloc--) { long extended = *reloc; extended += map; diff --git a/arch/x86/tools/relocs.c b/arch/x86/tools/relocs.c index b5e3695a0615..ae6962665b35 100644 --- a/arch/x86/tools/relocs.c +++ b/arch/x86/tools/relocs.c @@ -29,7 +29,6 @@ static struct relocs relocs16; static struct relocs relocs32; #if ELF_BITS == 64 -static struct relocs relocs32neg; static struct relocs relocs64; # define FMT PRIu64 @@ -91,7 +90,6 @@ static const char * const sym_regex_kernel[S_NSYMTYPES] = { "__initramfs_start|" "(jiffies|jiffies_64)|" #if ELF_BITS == 64 - "__per_cpu_load|" "init_per_cpu__.*|" "__end_rodata_hpage_align|" #endif @@ -290,34 +288,6 @@ static const char *sym_name(const char *sym_strtab, Elf_Sym *sym) return name; } -static Elf_Sym *sym_lookup(const char *symname) -{ - int i; - - for (i = 0; i < shnum; i++) { - struct section *sec = &secs[i]; - long nsyms; - char *strtab; - Elf_Sym *symtab; - Elf_Sym *sym; - - if (sec->shdr.sh_type != SHT_SYMTAB) - continue; - - nsyms = sec->shdr.sh_size/sizeof(Elf_Sym); - symtab = sec->symtab; - strtab = sec->link->strtab; - - for (sym = symtab; --nsyms >= 0; sym++) { - if (!sym->st_name) - continue; - if (strcmp(symname, strtab + sym->st_name) == 0) - return sym; - } - } - return 0; -} - #if BYTE_ORDER == LITTLE_ENDIAN # define le16_to_cpu(val) (val) # define le32_to_cpu(val) (val) @@ -766,78 +736,8 @@ static void walk_relocs(int (*process)(struct section *sec, Elf_Rel *rel, } } -/* - * The .data..percpu section is a special case for x86_64 SMP kernels. - * It is used to initialize the actual per_cpu areas and to provide - * definitions for the per_cpu variables that correspond to their offsets - * within the percpu area. Since the values of all of the symbols need - * to be offsets from the start of the per_cpu area the virtual address - * (sh_addr) of .data..percpu is 0 in SMP kernels. - * - * This means that: - * - * Relocations that reference symbols in the per_cpu area do not - * need further relocation (since the value is an offset relative - * to the start of the per_cpu area that does not change). - * - * Relocations that apply to the per_cpu area need to have their - * offset adjusted by by the value of __per_cpu_load to make them - * point to the correct place in the loaded image (because the - * virtual address of .data..percpu is 0). - * - * For non SMP kernels .data..percpu is linked as part of the normal - * kernel data and does not require special treatment. 
- * - */ -static int per_cpu_shndx = -1; -static Elf_Addr per_cpu_load_addr; - -static void percpu_init(void) -{ - int i; - - for (i = 0; i < shnum; i++) { - ElfW(Sym) *sym; - - if (strcmp(sec_name(i), ".data..percpu")) - continue; - - if (secs[i].shdr.sh_addr != 0) /* non SMP kernel */ - return; - - sym = sym_lookup("__per_cpu_load"); - if (!sym) - die("can't find __per_cpu_load\n"); - - per_cpu_shndx = i; - per_cpu_load_addr = sym->st_value; - - return; - } -} - #if ELF_BITS == 64 -/* - * Check to see if a symbol lies in the .data..percpu section. - * - * The linker incorrectly associates some symbols with the - * .data..percpu section so we also need to check the symbol - * name to make sure that we classify the symbol correctly. - * - * The GNU linker incorrectly associates: - * __init_begin - * __per_cpu_load - * - * The "gold" linker incorrectly associates: - * init_per_cpu__gdt_page - */ -static int is_percpu_sym(ElfW(Sym) *sym, const char *symname) -{ - return 0; -} - - static int do_reloc64(struct section *sec, Elf_Rel *rel, ElfW(Sym) *sym, const char *symname) { @@ -848,12 +748,6 @@ static int do_reloc64(struct section *sec, Elf_Rel *rel, ElfW(Sym) *sym, if (sym->st_shndx == SHN_UNDEF) return 0; - /* - * Adjust the offset if this reloc applies to the percpu section. - */ - if (sec->shdr.sh_info == per_cpu_shndx) - offset += per_cpu_load_addr; - switch (r_type) { case R_X86_64_NONE: /* NONE can be ignored. */ @@ -863,32 +757,21 @@ static int do_reloc64(struct section *sec, Elf_Rel *rel, ElfW(Sym) *sym, case R_X86_64_PLT32: case R_X86_64_REX_GOTPCRELX: /* - * PC relative relocations don't need to be adjusted unless - * referencing a percpu symbol. + * PC relative relocations don't need to be adjusted. * * NB: R_X86_64_PLT32 can be treated as R_X86_64_PC32. */ - if (is_percpu_sym(sym, symname)) - add_reloc(&relocs32neg, offset); break; case R_X86_64_PC64: /* * Only used by jump labels */ - if (is_percpu_sym(sym, symname)) - die("Invalid R_X86_64_PC64 relocation against per-CPU symbol %s\n", symname); break; case R_X86_64_32: case R_X86_64_32S: case R_X86_64_64: - /* - * References to the percpu area don't need to be adjusted. - */ - if (is_percpu_sym(sym, symname)) - break; - if (shn_abs) { /* * Whitelisted absolute symbols do not require @@ -1101,7 +984,6 @@ static void emit_relocs(int as_text, int use_real_mode) /* Order the relocations for more efficient processing */ sort_relocs(&relocs32); #if ELF_BITS == 64 - sort_relocs(&relocs32neg); sort_relocs(&relocs64); #else sort_relocs(&relocs16); @@ -1133,13 +1015,6 @@ static void emit_relocs(int as_text, int use_real_mode) /* Now print each relocation */ for (i = 0; i < relocs64.count; i++) write_reloc(relocs64.offset[i], stdout); - - /* Print a stop */ - write_reloc(0, stdout); - - /* Now print each inverse 32-bit relocation */ - for (i = 0; i < relocs32neg.count; i++) - write_reloc(relocs32neg.offset[i], stdout); #endif /* Print a stop */ @@ -1192,9 +1067,6 @@ void process(FILE *fp, int use_real_mode, int as_text, read_symtabs(fp); read_relocs(fp); - if (ELF_BITS == 64) - percpu_init(); - if (show_absolute_syms) { print_absolute_symbols(); return; -- cgit v1.2.3 From 38a4968b3190f873a8a60e953287278eddf037f1 Mon Sep 17 00:00:00 2001 From: Brian Gerst Date: Thu, 23 Jan 2025 14:07:43 -0500 Subject: x86/percpu/64: Remove INIT_PER_CPU macros Now that the load and link addresses of percpu variables are the same, these macros are no longer necessary. 
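As a sketch of the arithmetic these macros encoded (the helper functions and the example are invented for illustration; __per_cpu_load is the real section load-address symbol):

	/* Illustrative only: what INIT_PER_CPU() used to compensate for. */
	extern char __per_cpu_load[];

	/*
	 * Old zero-based layout: a percpu symbol's link address was just its
	 * offset into .data..percpu, so a boot-usable address needed fixing up.
	 */
	static unsigned long old_boot_addr(unsigned long sym_offset)
	{
		return (unsigned long)__per_cpu_load + sym_offset;
	}

	/*
	 * New layout: link address == load address, so the symbol is usable
	 * directly, as in the head64.c hunk above that now takes gdt_page.gdt
	 * as-is.
	 */
	static unsigned long new_boot_addr(void *sym)
	{
		return (unsigned long)sym;
	}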
Signed-off-by: Brian Gerst Signed-off-by: Ingo Molnar Reviewed-by: Ard Biesheuvel Reviewed-by: Uros Bizjak Cc: Linus Torvalds Link: https://lore.kernel.org/r/20250123190747.745588-12-brgerst@gmail.com --- arch/x86/include/asm/desc.h | 1 - arch/x86/include/asm/percpu.h | 22 ---------------------- arch/x86/kernel/head64.c | 2 +- arch/x86/kernel/irq_64.c | 1 - arch/x86/kernel/vmlinux.lds.S | 7 ------- arch/x86/tools/relocs.c | 1 - 6 files changed, 1 insertion(+), 33 deletions(-) diff --git a/arch/x86/include/asm/desc.h b/arch/x86/include/asm/desc.h index 62dc9f59ea76..ec95fe44fa3a 100644 --- a/arch/x86/include/asm/desc.h +++ b/arch/x86/include/asm/desc.h @@ -46,7 +46,6 @@ struct gdt_page { } __attribute__((aligned(PAGE_SIZE))); DECLARE_PER_CPU_PAGE_ALIGNED(struct gdt_page, gdt_page); -DECLARE_INIT_PER_CPU(gdt_page); /* Provide the original GDT */ static inline struct desc_struct *get_cpu_gdt_rw(unsigned int cpu) diff --git a/arch/x86/include/asm/percpu.h b/arch/x86/include/asm/percpu.h index e525cd85f999..1a76eb87c5d8 100644 --- a/arch/x86/include/asm/percpu.h +++ b/arch/x86/include/asm/percpu.h @@ -20,12 +20,6 @@ #define PER_CPU_VAR(var) __percpu(var)__percpu_rel -#ifdef CONFIG_X86_64_SMP -# define INIT_PER_CPU_VAR(var) init_per_cpu__##var -#else -# define INIT_PER_CPU_VAR(var) var -#endif - #else /* !__ASSEMBLY__: */ #include @@ -97,22 +91,6 @@ #define __percpu_arg(x) __percpu_prefix "%" #x #define __force_percpu_arg(x) __force_percpu_prefix "%" #x -/* - * Initialized pointers to per-CPU variables needed for the boot - * processor need to use these macros to get the proper address - * offset from __per_cpu_load on SMP. - * - * There also must be an entry in vmlinux_64.lds.S - */ -#define DECLARE_INIT_PER_CPU(var) \ - extern typeof(var) init_per_cpu_var(var) - -#ifdef CONFIG_X86_64_SMP -# define init_per_cpu_var(var) init_per_cpu__##var -#else -# define init_per_cpu_var(var) var -#endif - /* * For arch-specific code, we can use direct single-insn ops (they * don't give an lvalue though). diff --git a/arch/x86/kernel/head64.c b/arch/x86/kernel/head64.c index 22c9ba305ac1..05f8b8acf784 100644 --- a/arch/x86/kernel/head64.c +++ b/arch/x86/kernel/head64.c @@ -567,7 +567,7 @@ void early_setup_idt(void) */ void __head startup_64_setup_gdt_idt(void) { - struct desc_struct *gdt = (void *)(__force unsigned long)init_per_cpu_var(gdt_page.gdt); + struct desc_struct *gdt = (void *)(__force unsigned long)gdt_page.gdt; void *handler = NULL; struct desc_ptr startup_gdt_descr = { diff --git a/arch/x86/kernel/irq_64.c b/arch/x86/kernel/irq_64.c index ade0043ce56e..56bdeecd8ee0 100644 --- a/arch/x86/kernel/irq_64.c +++ b/arch/x86/kernel/irq_64.c @@ -27,7 +27,6 @@ #include DEFINE_PER_CPU_PAGE_ALIGNED(struct irq_stack, irq_stack_backing_store) __visible; -DECLARE_INIT_PER_CPU(irq_stack_backing_store); #ifdef CONFIG_VMAP_STACK /* diff --git a/arch/x86/kernel/vmlinux.lds.S b/arch/x86/kernel/vmlinux.lds.S index 93c2fa8a7522..1769a7126224 100644 --- a/arch/x86/kernel/vmlinux.lds.S +++ b/arch/x86/kernel/vmlinux.lds.S @@ -471,13 +471,6 @@ SECTIONS PROVIDE(__ref_stack_chk_guard = __stack_chk_guard); #ifdef CONFIG_X86_64 -/* - * Per-cpu symbols which need to be offset from __per_cpu_load - * for the boot processor. - */ -#define INIT_PER_CPU(x) init_per_cpu__##x = ABSOLUTE(x) -INIT_PER_CPU(gdt_page); -INIT_PER_CPU(irq_stack_backing_store); #ifdef CONFIG_MITIGATION_UNRET_ENTRY . 
= ASSERT((retbleed_return_thunk & 0x3f) == 0, "retbleed_return_thunk not cacheline-aligned"); diff --git a/arch/x86/tools/relocs.c b/arch/x86/tools/relocs.c index ae6962665b35..5778bc498415 100644 --- a/arch/x86/tools/relocs.c +++ b/arch/x86/tools/relocs.c @@ -90,7 +90,6 @@ static const char * const sym_regex_kernel[S_NSYMTYPES] = { "__initramfs_start|" "(jiffies|jiffies_64)|" #if ELF_BITS == 64 - "init_per_cpu__.*|" "__end_rodata_hpage_align|" #endif "_end)$" -- cgit v1.2.3 From 95b0916118106054e1f3d5d7f8628ef3dc0b3c02 Mon Sep 17 00:00:00 2001 From: Brian Gerst Date: Thu, 23 Jan 2025 14:07:44 -0500 Subject: percpu: Remove PER_CPU_FIRST_SECTION x86-64 was the last user. Signed-off-by: Brian Gerst Signed-off-by: Ingo Molnar Reviewed-by: Ard Biesheuvel Cc: Linus Torvalds Link: https://lore.kernel.org/r/20250123190747.745588-13-brgerst@gmail.com --- include/asm-generic/vmlinux.lds.h | 1 - include/linux/percpu-defs.h | 12 ------------ 2 files changed, 13 deletions(-) diff --git a/include/asm-generic/vmlinux.lds.h b/include/asm-generic/vmlinux.lds.h index 02a4adb4a999..a3c77a106565 100644 --- a/include/asm-generic/vmlinux.lds.h +++ b/include/asm-generic/vmlinux.lds.h @@ -1062,7 +1062,6 @@ defined(CONFIG_AUTOFDO_CLANG) || defined(CONFIG_PROPELLER_CLANG) */ #define PERCPU_INPUT(cacheline) \ __per_cpu_start = .; \ - *(.data..percpu..first) \ . = ALIGN(PAGE_SIZE); \ *(.data..percpu..page_aligned) \ . = ALIGN(cacheline); \ diff --git a/include/linux/percpu-defs.h b/include/linux/percpu-defs.h index 5b520fe86b60..40d34e032d5b 100644 --- a/include/linux/percpu-defs.h +++ b/include/linux/percpu-defs.h @@ -26,13 +26,11 @@ #define PER_CPU_SHARED_ALIGNED_SECTION "..shared_aligned" #define PER_CPU_ALIGNED_SECTION "..shared_aligned" #endif -#define PER_CPU_FIRST_SECTION "..first" #else #define PER_CPU_SHARED_ALIGNED_SECTION "" #define PER_CPU_ALIGNED_SECTION "..shared_aligned" -#define PER_CPU_FIRST_SECTION "" #endif @@ -114,16 +112,6 @@ #define DEFINE_PER_CPU(type, name) \ DEFINE_PER_CPU_SECTION(type, name, "") -/* - * Declaration/definition used for per-CPU variables that must come first in - * the set of variables. - */ -#define DECLARE_PER_CPU_FIRST(type, name) \ - DECLARE_PER_CPU_SECTION(type, name, PER_CPU_FIRST_SECTION) - -#define DEFINE_PER_CPU_FIRST(type, name) \ - DEFINE_PER_CPU_SECTION(type, name, PER_CPU_FIRST_SECTION) - /* * Declaration/definition used for per-CPU variables that must be cacheline * aligned under SMP conditions so that, whilst a particular instance of the -- cgit v1.2.3 From e23cff6861781ac4e15de6c7bf2d2a0b79cb52ef Mon Sep 17 00:00:00 2001 From: Brian Gerst Date: Thu, 23 Jan 2025 14:07:45 -0500 Subject: percpu: Remove PERCPU_VADDR() x86-64 was the last user. 
Signed-off-by: Brian Gerst Signed-off-by: Ingo Molnar Reviewed-by: Ard Biesheuvel Cc: Linus Torvalds Link: https://lore.kernel.org/r/20250123190747.745588-14-brgerst@gmail.com --- include/asm-generic/vmlinux.lds.h | 36 +----------------------------------- 1 file changed, 1 insertion(+), 35 deletions(-) diff --git a/include/asm-generic/vmlinux.lds.h b/include/asm-generic/vmlinux.lds.h index a3c77a106565..e25a8aeee29c 100644 --- a/include/asm-generic/vmlinux.lds.h +++ b/include/asm-generic/vmlinux.lds.h @@ -1073,47 +1073,13 @@ defined(CONFIG_AUTOFDO_CLANG) || defined(CONFIG_PROPELLER_CLANG) __per_cpu_end = .; /** - * PERCPU_VADDR - define output section for percpu area + * PERCPU_SECTION - define output section for percpu area * @cacheline: cacheline size - * @vaddr: explicit base address (optional) - * @phdr: destination PHDR (optional) * * Macro which expands to output section for percpu area. * * @cacheline is used to align subsections to avoid false cacheline * sharing between subsections for different purposes. - * - * If @vaddr is not blank, it specifies explicit base address and all - * percpu symbols will be offset from the given address. If blank, - * @vaddr always equals @laddr + LOAD_OFFSET. - * - * @phdr defines the output PHDR to use if not blank. Be warned that - * output PHDR is sticky. If @phdr is specified, the next output - * section in the linker script will go there too. @phdr should have - * a leading colon. - * - * Note that this macros defines __per_cpu_load as an absolute symbol. - * If there is no need to put the percpu section at a predetermined - * address, use PERCPU_SECTION. - */ -#define PERCPU_VADDR(cacheline, vaddr, phdr) \ - __per_cpu_load = .; \ - .data..percpu vaddr : AT(__per_cpu_load - LOAD_OFFSET) { \ - PERCPU_INPUT(cacheline) \ - } phdr \ - . = __per_cpu_load + SIZEOF(.data..percpu); - -/** - * PERCPU_SECTION - define output section for percpu area, simple version - * @cacheline: cacheline size - * - * Align to PAGE_SIZE and outputs output section for percpu area. This - * macro doesn't manipulate @vaddr or @phdr and __per_cpu_load and - * __per_cpu_start will be identical. - * - * This macro is equivalent to ALIGN(PAGE_SIZE); PERCPU_VADDR(@cacheline,,) - * except that __per_cpu_load is defined as a relative symbol against - * .data..percpu which is required for relocatable x86_32 configuration. */ #define PERCPU_SECTION(cacheline) \ . = ALIGN(PAGE_SIZE); \ -- cgit v1.2.3 From 4b00c1160a13d8bf7297ebf49ec07a84e1f41132 Mon Sep 17 00:00:00 2001 From: Brian Gerst Date: Thu, 23 Jan 2025 14:07:46 -0500 Subject: percpu: Remove __per_cpu_load __per_cpu_load is now always equal to __per_cpu_start. 
Signed-off-by: Brian Gerst Signed-off-by: Ingo Molnar Reviewed-by: Ard Biesheuvel Cc: Linus Torvalds Link: https://lore.kernel.org/r/20250123190747.745588-15-brgerst@gmail.com --- include/asm-generic/sections.h | 2 +- include/asm-generic/vmlinux.lds.h | 1 - mm/percpu.c | 4 ++-- 3 files changed, 3 insertions(+), 4 deletions(-) diff --git a/include/asm-generic/sections.h b/include/asm-generic/sections.h index c768de6f19a9..0755bc39b0d8 100644 --- a/include/asm-generic/sections.h +++ b/include/asm-generic/sections.h @@ -39,7 +39,7 @@ extern char __init_begin[], __init_end[]; extern char _sinittext[], _einittext[]; extern char __start_ro_after_init[], __end_ro_after_init[]; extern char _end[]; -extern char __per_cpu_load[], __per_cpu_start[], __per_cpu_end[]; +extern char __per_cpu_start[], __per_cpu_end[]; extern char __kprobes_text_start[], __kprobes_text_end[]; extern char __entry_text_start[], __entry_text_end[]; extern char __start_rodata[], __end_rodata[]; diff --git a/include/asm-generic/vmlinux.lds.h b/include/asm-generic/vmlinux.lds.h index e25a8aeee29c..92fc06f7da74 100644 --- a/include/asm-generic/vmlinux.lds.h +++ b/include/asm-generic/vmlinux.lds.h @@ -1084,7 +1084,6 @@ defined(CONFIG_AUTOFDO_CLANG) || defined(CONFIG_PROPELLER_CLANG) #define PERCPU_SECTION(cacheline) \ . = ALIGN(PAGE_SIZE); \ .data..percpu : AT(ADDR(.data..percpu) - LOAD_OFFSET) { \ - __per_cpu_load = .; \ PERCPU_INPUT(cacheline) \ } diff --git a/mm/percpu.c b/mm/percpu.c index ac61e3fc5f15..7b5835356d1e 100644 --- a/mm/percpu.c +++ b/mm/percpu.c @@ -3071,7 +3071,7 @@ int __init pcpu_embed_first_chunk(size_t reserved_size, size_t dyn_size, continue; } /* copy and return the unused part */ - memcpy(ptr, __per_cpu_load, ai->static_size); + memcpy(ptr, __per_cpu_start, ai->static_size); pcpu_fc_free(ptr + size_sum, ai->unit_size - size_sum); } } @@ -3240,7 +3240,7 @@ int __init pcpu_page_first_chunk(size_t reserved_size, pcpu_fc_cpu_to_node_fn_t flush_cache_vmap_early(unit_addr, unit_addr + ai->unit_size); /* copy static data */ - memcpy((void *)unit_addr, __per_cpu_load, ai->static_size); + memcpy((void *)unit_addr, __per_cpu_start, ai->static_size); } /* we're ready, commit */ -- cgit v1.2.3 From 01157ddc58dc2fe428ec17dd5a18cc13f134639f Mon Sep 17 00:00:00 2001 From: Brian Gerst Date: Thu, 23 Jan 2025 14:07:47 -0500 Subject: kallsyms: Remove KALLSYMS_ABSOLUTE_PERCPU x86-64 was the only user. Signed-off-by: Brian Gerst Signed-off-by: Ingo Molnar Reviewed-by: Ard Biesheuvel Cc: Linus Torvalds Link: https://lore.kernel.org/r/20250123190747.745588-16-brgerst@gmail.com --- init/Kconfig | 5 ---- kernel/kallsyms.c | 12 ++------- scripts/kallsyms.c | 72 +++++++++---------------------------------------- scripts/link-vmlinux.sh | 4 --- 4 files changed, 14 insertions(+), 79 deletions(-) diff --git a/init/Kconfig b/init/Kconfig index b5d9c0fa69f6..a0ea04c17784 100644 --- a/init/Kconfig +++ b/init/Kconfig @@ -1869,11 +1869,6 @@ config KALLSYMS_ALL Say N unless you really need all symbols, or kernel live patching. 
-config KALLSYMS_ABSOLUTE_PERCPU - bool - depends on KALLSYMS - default n - # end of the "standard kernel features (expert users)" menu config ARCH_HAS_MEMBARRIER_CALLBACKS diff --git a/kernel/kallsyms.c b/kernel/kallsyms.c index a9a0ca605d4a..4198f30aac3c 100644 --- a/kernel/kallsyms.c +++ b/kernel/kallsyms.c @@ -148,16 +148,8 @@ static unsigned int get_symbol_offset(unsigned long pos) unsigned long kallsyms_sym_address(int idx) { - /* values are unsigned offsets if --absolute-percpu is not in effect */ - if (!IS_ENABLED(CONFIG_KALLSYMS_ABSOLUTE_PERCPU)) - return kallsyms_relative_base + (u32)kallsyms_offsets[idx]; - - /* ...otherwise, positive offsets are absolute values */ - if (kallsyms_offsets[idx] >= 0) - return kallsyms_offsets[idx]; - - /* ...and negative offsets are relative to kallsyms_relative_base - 1 */ - return kallsyms_relative_base - 1 - kallsyms_offsets[idx]; + /* values are unsigned offsets */ + return kallsyms_relative_base + (u32)kallsyms_offsets[idx]; } static unsigned int get_symbol_seq(int index) diff --git a/scripts/kallsyms.c b/scripts/kallsyms.c index 03852da3d249..4b0234e4b12f 100644 --- a/scripts/kallsyms.c +++ b/scripts/kallsyms.c @@ -5,7 +5,7 @@ * This software may be used and distributed according to the terms * of the GNU General Public License, incorporated herein by reference. * - * Usage: kallsyms [--all-symbols] [--absolute-percpu] in.map > out.S + * Usage: kallsyms [--all-symbols] in.map > out.S * * Table compression uses all the unused char codes on the symbols and * maps these to the most used substrings (tokens). For instance, it might @@ -37,7 +37,6 @@ struct sym_entry { unsigned long long addr; unsigned int len; unsigned int seq; - bool percpu_absolute; unsigned char sym[]; }; @@ -55,14 +54,9 @@ static struct addr_range text_ranges[] = { #define text_range_text (&text_ranges[0]) #define text_range_inittext (&text_ranges[1]) -static struct addr_range percpu_range = { - "__per_cpu_start", "__per_cpu_end", -1ULL, 0 -}; - static struct sym_entry **table; static unsigned int table_size, table_cnt; static int all_symbols; -static int absolute_percpu; static int token_profit[0x10000]; @@ -73,7 +67,7 @@ static unsigned char best_table_len[256]; static void usage(void) { - fprintf(stderr, "Usage: kallsyms [--all-symbols] [--absolute-percpu] in.map > out.S\n"); + fprintf(stderr, "Usage: kallsyms [--all-symbols] in.map > out.S\n"); exit(1); } @@ -164,7 +158,6 @@ static struct sym_entry *read_symbol(FILE *in, char **buf, size_t *buf_len) return NULL; check_symbol_range(name, addr, text_ranges, ARRAY_SIZE(text_ranges)); - check_symbol_range(name, addr, &percpu_range, 1); /* include the type field in the symbol name, so that it gets * compressed together */ @@ -175,7 +168,6 @@ static struct sym_entry *read_symbol(FILE *in, char **buf, size_t *buf_len) sym->len = len; sym->sym[0] = type; strcpy(sym_name(sym), name); - sym->percpu_absolute = false; return sym; } @@ -319,11 +311,6 @@ static int expand_symbol(const unsigned char *data, int len, char *result) return total; } -static bool symbol_absolute(const struct sym_entry *s) -{ - return s->percpu_absolute; -} - static int compare_names(const void *a, const void *b) { int ret; @@ -455,22 +442,11 @@ static void write_src(void) */ long long offset; - bool overflow; - - if (!absolute_percpu) { - offset = table[i]->addr - relative_base; - overflow = offset < 0 || offset > UINT_MAX; - } else if (symbol_absolute(table[i])) { - offset = table[i]->addr; - overflow = offset < 0 || offset > INT_MAX; - } else { - offset = 
relative_base - table[i]->addr - 1; - overflow = offset < INT_MIN || offset >= 0; - } - if (overflow) { + + offset = table[i]->addr - relative_base; + if (offset < 0 || offset > UINT_MAX) { fprintf(stderr, "kallsyms failure: " - "%s symbol value %#llx out of range in relative mode\n", - symbol_absolute(table[i]) ? "absolute" : "relative", + "relative symbol value %#llx out of range\n", table[i]->addr); exit(EXIT_FAILURE); } @@ -725,36 +701,15 @@ static void sort_symbols(void) qsort(table, table_cnt, sizeof(table[0]), compare_symbols); } -static void make_percpus_absolute(void) -{ - unsigned int i; - - for (i = 0; i < table_cnt; i++) - if (symbol_in_range(table[i], &percpu_range, 1)) { - /* - * Keep the 'A' override for percpu symbols to - * ensure consistent behavior compared to older - * versions of this tool. - */ - table[i]->sym[0] = 'A'; - table[i]->percpu_absolute = true; - } -} - /* find the minimum non-absolute symbol address */ static void record_relative_base(void) { - unsigned int i; - - for (i = 0; i < table_cnt; i++) - if (!symbol_absolute(table[i])) { - /* - * The table is sorted by address. - * Take the first non-absolute symbol value. - */ - relative_base = table[i]->addr; - return; - } + /* + * The table is sorted by address. + * Take the first symbol value. + */ + if (table_cnt) + relative_base = table[0]->addr; } int main(int argc, char **argv) @@ -762,7 +717,6 @@ int main(int argc, char **argv) while (1) { static const struct option long_options[] = { {"all-symbols", no_argument, &all_symbols, 1}, - {"absolute-percpu", no_argument, &absolute_percpu, 1}, {}, }; @@ -779,8 +733,6 @@ int main(int argc, char **argv) read_map(argv[optind]); shrink_table(); - if (absolute_percpu) - make_percpus_absolute(); sort_symbols(); record_relative_base(); optimize_token_table(); diff --git a/scripts/link-vmlinux.sh b/scripts/link-vmlinux.sh index 56a077d204cf..67e66333bd2a 100755 --- a/scripts/link-vmlinux.sh +++ b/scripts/link-vmlinux.sh @@ -144,10 +144,6 @@ kallsyms() kallsymopt="${kallsymopt} --all-symbols" fi - if is_enabled CONFIG_KALLSYMS_ABSOLUTE_PERCPU; then - kallsymopt="${kallsymopt} --absolute-percpu" - fi - info KSYMS "${2}.S" scripts/kallsyms ${kallsymopt} "${1}" > "${2}.S" -- cgit v1.2.3 From 7861640aac52bbbb3dc2cd40fb93dfb3b3d0f43c Mon Sep 17 00:00:00 2001 From: Nathan Chancellor Date: Thu, 20 Feb 2025 13:08:12 -0700 Subject: x86/build: Raise the minimum LLVM version to 15.0.0 In a similar vein as to this pending commit in the x86/asm tree: a3e8fe814ad1 ("x86/build: Raise the minimum GCC version to 8.1") ... 
bump the minimum supported version of LLVM for building x86 kernels to 15.0.0, as that is the first version that has support for '-mstack-protector-guard-symbol', which is used unconditionally after: 80d47defddc0 ("x86/stackprotector/64: Convert to normal per-CPU variable"): Older Clang versions will fail the build with: clang-14: error: unknown argument: '-mstack-protector-guard-symbol=__ref_stack_chk_guard' Fixes: 80d47defddc0 ("x86/stackprotector/64: Convert to normal per-CPU variable") Signed-off-by: Nathan Chancellor Signed-off-by: Ingo Molnar Cc: Linus Torvalds Reviewed-by: Ard Biesheuvel Reviewed-by: Brian Gerst Link: https://lore.kernel.org/r/20250220-x86-bump-min-llvm-for-stackp-v1-1-ecb3c906e790@kernel.org --- scripts/min-tool-version.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scripts/min-tool-version.sh b/scripts/min-tool-version.sh index 06c4e410ecab..787868183b84 100755 --- a/scripts/min-tool-version.sh +++ b/scripts/min-tool-version.sh @@ -26,7 +26,7 @@ gcc) fi ;; llvm) - if [ "$SRCARCH" = s390 ]; then + if [ "$SRCARCH" = s390 -o "$SRCARCH" = x86 ]; then echo 15.0.0 elif [ "$SRCARCH" = loongarch ]; then echo 18.0.0 -- cgit v1.2.3 From a9ebcb88136ca80cb53de27ca5ae77de18bbe368 Mon Sep 17 00:00:00 2001 From: "Kirill A. Shutemov" Date: Mon, 17 Feb 2025 18:38:20 +0200 Subject: mm/memremap: Pass down MEMREMAP_* flags to arch_memremap_wb() x86 version of arch_memremap_wb() needs the flags to decide if the mapping has to be encrypted or decrypted. Pass down the flag to arch_memremap_wb(). All current implementations ignore the argument. Signed-off-by: Kirill A. Shutemov Signed-off-by: Ingo Molnar Cc: Andrew Morton Cc: Dave Hansen Cc: linux-mm@kvack.org Link: https://lore.kernel.org/r/20250217163822.343400-2-kirill.shutemov@linux.intel.com --- arch/arm/include/asm/io.h | 2 +- arch/arm/mm/ioremap.c | 2 +- arch/arm/mm/nommu.c | 2 +- arch/riscv/include/asm/io.h | 2 +- kernel/iomem.c | 5 +++-- 5 files changed, 7 insertions(+), 6 deletions(-) diff --git a/arch/arm/include/asm/io.h b/arch/arm/include/asm/io.h index 1815748f5d2a..bae5edf348ef 100644 --- a/arch/arm/include/asm/io.h +++ b/arch/arm/include/asm/io.h @@ -381,7 +381,7 @@ void __iomem *ioremap_wc(resource_size_t res_cookie, size_t size); void iounmap(volatile void __iomem *io_addr); #define iounmap iounmap -void *arch_memremap_wb(phys_addr_t phys_addr, size_t size); +void *arch_memremap_wb(phys_addr_t phys_addr, size_t size, unsigned long flags); #define arch_memremap_wb arch_memremap_wb /* diff --git a/arch/arm/mm/ioremap.c b/arch/arm/mm/ioremap.c index 89f1c97f3079..748698e91a4b 100644 --- a/arch/arm/mm/ioremap.c +++ b/arch/arm/mm/ioremap.c @@ -436,7 +436,7 @@ void __arm_iomem_set_ro(void __iomem *ptr, size_t size) set_memory_ro((unsigned long)ptr, PAGE_ALIGN(size) / PAGE_SIZE); } -void *arch_memremap_wb(phys_addr_t phys_addr, size_t size) +void *arch_memremap_wb(phys_addr_t phys_addr, size_t size, unsigned long flags) { return (__force void *)arch_ioremap_caller(phys_addr, size, MT_MEMORY_RW, diff --git a/arch/arm/mm/nommu.c b/arch/arm/mm/nommu.c index 1a8f6914ee59..d638cc87807e 100644 --- a/arch/arm/mm/nommu.c +++ b/arch/arm/mm/nommu.c @@ -248,7 +248,7 @@ void __iomem *pci_remap_cfgspace(resource_size_t res_cookie, size_t size) EXPORT_SYMBOL_GPL(pci_remap_cfgspace); #endif -void *arch_memremap_wb(phys_addr_t phys_addr, size_t size) +void *arch_memremap_wb(phys_addr_t phys_addr, size_t size, unsigned long flags) { return (void *)phys_addr; } diff --git a/arch/riscv/include/asm/io.h 
b/arch/riscv/include/asm/io.h index 1c5c641075d2..0257f4aa7ff4 100644 --- a/arch/riscv/include/asm/io.h +++ b/arch/riscv/include/asm/io.h @@ -136,7 +136,7 @@ __io_writes_outs(outs, u64, q, __io_pbr(), __io_paw()) #include #ifdef CONFIG_MMU -#define arch_memremap_wb(addr, size) \ +#define arch_memremap_wb(addr, size, flags) \ ((__force void *)ioremap_prot((addr), (size), _PAGE_KERNEL)) #endif diff --git a/kernel/iomem.c b/kernel/iomem.c index dc2120776e1c..75e61c1c6bc0 100644 --- a/kernel/iomem.c +++ b/kernel/iomem.c @@ -6,7 +6,8 @@ #include #ifndef arch_memremap_wb -static void *arch_memremap_wb(resource_size_t offset, unsigned long size) +static void *arch_memremap_wb(resource_size_t offset, unsigned long size, + unsigned long flags) { #ifdef ioremap_cache return (__force void *)ioremap_cache(offset, size); @@ -91,7 +92,7 @@ void *memremap(resource_size_t offset, size_t size, unsigned long flags) if (is_ram == REGION_INTERSECTS) addr = try_ram_remap(offset, size, flags); if (!addr) - addr = arch_memremap_wb(offset, size); + addr = arch_memremap_wb(offset, size, flags); } /* -- cgit v1.2.3 From 81256a50aa0fddefbf4849db8cad9f70c5167c04 Mon Sep 17 00:00:00 2001 From: "Kirill A. Shutemov" Date: Mon, 17 Feb 2025 18:38:21 +0200 Subject: x86/mm: Make memremap(MEMREMAP_WB) map memory as encrypted by default Currently memremap(MEMREMAP_WB) can produce decrypted/shared mapping: memremap(MEMREMAP_WB) arch_memremap_wb() ioremap_cache() __ioremap_caller(.encrypted = false) In such cases, the IORES_MAP_ENCRYPTED flag on the memory will determine if the resulting mapping is encrypted or decrypted. Creating a decrypted mapping without explicit request from the caller is risky: - It can inadvertently expose the guest's data and compromise the guest. - Accessing private memory via shared/decrypted mapping on TDX will either trigger implicit conversion to shared or #VE (depending on VMM implementation). Implicit conversion is destructive: subsequent access to the same memory via private mapping will trigger a hard-to-debug #VE crash. The kernel already provides a way to request decrypted mapping explicitly via the MEMREMAP_DEC flag. Modify memremap(MEMREMAP_WB) to produce encrypted/private mapping by default unless MEMREMAP_DEC is specified or if the kernel runs on a machine with SME enabled. It fixes the crash due to #VE on kexec in TDX guests if CONFIG_EISA is enabled. Signed-off-by: Kirill A.
Shutemov Signed-off-by: Ingo Molnar Cc: Andrew Morton Cc: Dave Hansen Cc: Linus Torvalds Cc: Peter Zijlstra Cc: linux-mm@kvack.org Link: https://lore.kernel.org/r/20250217163822.343400-3-kirill.shutemov@linux.intel.com --- arch/x86/include/asm/io.h | 3 +++ arch/x86/mm/ioremap.c | 8 ++++++++ 2 files changed, 11 insertions(+) diff --git a/arch/x86/include/asm/io.h b/arch/x86/include/asm/io.h index ed580c7f9d0a..1a0dc2b2bf5b 100644 --- a/arch/x86/include/asm/io.h +++ b/arch/x86/include/asm/io.h @@ -175,6 +175,9 @@ extern void __iomem *ioremap_prot(resource_size_t offset, unsigned long size, un extern void __iomem *ioremap_encrypted(resource_size_t phys_addr, unsigned long size); #define ioremap_encrypted ioremap_encrypted +void *arch_memremap_wb(phys_addr_t phys_addr, size_t size, unsigned long flags); +#define arch_memremap_wb arch_memremap_wb + /** * ioremap - map bus memory into CPU space * @offset: bus address of the memory diff --git a/arch/x86/mm/ioremap.c b/arch/x86/mm/ioremap.c index 38ff7791a9c7..42c90b420773 100644 --- a/arch/x86/mm/ioremap.c +++ b/arch/x86/mm/ioremap.c @@ -503,6 +503,14 @@ void iounmap(volatile void __iomem *addr) } EXPORT_SYMBOL(iounmap); +void *arch_memremap_wb(phys_addr_t phys_addr, size_t size, unsigned long flags) +{ + if ((flags & MEMREMAP_DEC) || cc_platform_has(CC_ATTR_HOST_MEM_ENCRYPT)) + return (void __force *)ioremap_cache(phys_addr, size); + + return (void __force *)ioremap_encrypted(phys_addr, size); +} + /* * Convert a physical pointer to a virtual kernel pointer for /dev/mem * access -- cgit v1.2.3 From 64aad4749d7911f8c5e69d93a929a269605dd3cb Mon Sep 17 00:00:00 2001 From: Artem Bityutskiy Date: Sun, 16 Feb 2025 14:26:14 +0200 Subject: ACPI/processor_idle: Export acpi_processor_ffh_play_dead() The kernel test robot reported the following build error: >> ERROR: modpost: "acpi_processor_ffh_play_dead" [drivers/acpi/processor.ko] undefined! Caused by this recently merged commit: 541ddf31e300 ("ACPI/processor_idle: Add FFH state handling") The build failure is due to an oversight in the 'CONFIG_ACPI_PROCESSOR=m' case, the function export is missing. Add it. Reported-by: kernel test robot Closes: https://lore.kernel.org/oe-kbuild-all/202502151207.FA9UO1iX-lkp@intel.com/ Fixes: 541ddf31e300 ("ACPI/processor_idle: Add FFH state handling") Signed-off-by: Artem Bityutskiy Signed-off-by: Ingo Molnar Cc: Dave Hansen Link: https://lore.kernel.org/r/de5bf4f116779efde315782a15146fdc77a4a044.camel@linux.intel.com --- arch/x86/kernel/acpi/cstate.c | 1 + 1 file changed, 1 insertion(+) diff --git a/arch/x86/kernel/acpi/cstate.c b/arch/x86/kernel/acpi/cstate.c index 5bdb65516969..86c87c01d23d 100644 --- a/arch/x86/kernel/acpi/cstate.c +++ b/arch/x86/kernel/acpi/cstate.c @@ -214,6 +214,7 @@ void acpi_processor_ffh_play_dead(struct acpi_processor_cx *cx) percpu_entry = per_cpu_ptr(cpu_cstate_entry, cpu); mwait_play_dead(percpu_entry->states[cx->index].eax); } +EXPORT_SYMBOL_GPL(acpi_processor_ffh_play_dead); void __cpuidle acpi_processor_ffh_cstate_enter(struct acpi_processor_cx *cx) { -- cgit v1.2.3 From 282f395244df3663dc24e97a86087431c9192513 Mon Sep 17 00:00:00 2001 From: Qasim Ijaz Date: Sat, 15 Feb 2025 12:52:49 +0000 Subject: x86/mm: Replace open-coded gap bounding with clamp() Rather than manually bounding gap between gap_min and gap_max, use the well-known clamp() macro to make the code easier to read. 
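The transformation can be demonstrated in isolation (a standalone userspace sketch; the local clamp() macro below only mimics the semantics of the kernel's clamp() from <linux/minmax.h>, which additionally performs strict type checking of its arguments):

	#include <stdio.h>

	/* Simplified stand-in for the kernel's clamp() macro. */
	#define clamp(val, lo, hi) \
		((val) < (lo) ? (lo) : ((val) > (hi) ? (hi) : (val)))

	int main(void)
	{
		unsigned long gap = 5, gap_min = 10, gap_max = 100;

		/* Open-coded form that the patch removes: */
		if (gap < gap_min)
			gap = gap_min;
		else if (gap > gap_max)
			gap = gap_max;

		/* Equivalent single expression after the patch: */
		unsigned long gap2 = clamp(5UL, gap_min, gap_max);

		printf("%lu %lu\n", gap, gap2);	/* prints "10 10" */
		return 0;
	}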
Signed-off-by: Qasim Ijaz Signed-off-by: Ingo Molnar Link: https://lore.kernel.org/r/20250215125249.10729-1-qasdev00@gmail.com --- arch/x86/mm/mmap.c | 9 +-------- 1 file changed, 1 insertion(+), 8 deletions(-) diff --git a/arch/x86/mm/mmap.c b/arch/x86/mm/mmap.c index b8a6ffffb451..5ed2109211da 100644 --- a/arch/x86/mm/mmap.c +++ b/arch/x86/mm/mmap.c @@ -84,7 +84,6 @@ static unsigned long mmap_base(unsigned long rnd, unsigned long task_size, { unsigned long gap = rlim_stack->rlim_cur; unsigned long pad = stack_maxrandom_size(task_size) + stack_guard_gap; - unsigned long gap_min, gap_max; /* Values close to RLIM_INFINITY can overflow. */ if (gap + pad > gap) @@ -94,13 +93,7 @@ static unsigned long mmap_base(unsigned long rnd, unsigned long task_size, * Top of mmap area (just below the process stack). * Leave an at least ~128 MB hole with possible stack randomization. */ - gap_min = SIZE_128M; - gap_max = (task_size / 6) * 5; - - if (gap < gap_min) - gap = gap_min; - else if (gap > gap_max) - gap = gap_max; + gap = clamp(gap, SIZE_128M, (task_size / 6) * 5); return PAGE_ALIGN(task_size - gap - rnd); } -- cgit v1.2.3 From 4087e16b033140cf2ce509ec23503bddec818a16 Mon Sep 17 00:00:00 2001 From: Uros Bizjak Date: Fri, 14 Feb 2025 16:07:46 +0100 Subject: x86/locking: Use ALT_OUTPUT_SP() for percpu_{,try_}cmpxchg{64,128}_op() percpu_{,try_}cmpxchg{64,128}() macros use CALL instruction inside asm statement in one of their alternatives. Use ALT_OUTPUT_SP() macro to add required dependence on %esp register. ALT_OUTPUT_SP() implements the above dependence by adding ASM_CALL_CONSTRAINT to its arguments. This constraint should be used for any inline asm which has a CALL instruction, otherwise the compiler may schedule the asm before the frame pointer gets set up by the containing function, causing objtool to print a "call without frame pointer save/setup" warning. 
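A minimal sketch of the mechanism (assuming the usual shape of these macros in arch/x86/include/asm/asm.h; the callee name is hypothetical):

	/* A named register variable tied to the stack pointer (64-bit shown). */
	register unsigned long current_stack_pointer asm("rsp");
	#define ASM_CALL_CONSTRAINT "+r" (current_stack_pointer)

	static inline void emu_call_example(void)
	{
		/*
		 * The "+r" (current_stack_pointer) output operand makes the asm
		 * depend on %rsp, so the compiler cannot schedule it before the
		 * containing function sets up its frame pointer -- exactly the
		 * property objtool verifies.
		 */
		asm volatile("call hypothetical_emulation_fn"	/* hypothetical callee */
			     : ASM_CALL_CONSTRAINT
			     :
			     : "memory");
	}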
Signed-off-by: Uros Bizjak Signed-off-by: Ingo Molnar Cc: Linus Torvalds Link: https://lore.kernel.org/r/20250214150929.5780-1-ubizjak@gmail.com --- arch/x86/include/asm/percpu.h | 28 ++++++++++++++-------------- 1 file changed, 14 insertions(+), 14 deletions(-) diff --git a/arch/x86/include/asm/percpu.h b/arch/x86/include/asm/percpu.h index e525cd85f999..0ab991fba7de 100644 --- a/arch/x86/include/asm/percpu.h +++ b/arch/x86/include/asm/percpu.h @@ -350,9 +350,9 @@ do { \ \ asm qual (ALTERNATIVE("call this_cpu_cmpxchg8b_emu", \ "cmpxchg8b " __percpu_arg([var]), X86_FEATURE_CX8) \ - : [var] "+m" (__my_cpu_var(_var)), \ - "+a" (old__.low), \ - "+d" (old__.high) \ + : ALT_OUTPUT_SP([var] "+m" (__my_cpu_var(_var)), \ + "+a" (old__.low), \ + "+d" (old__.high)) \ : "b" (new__.low), \ "c" (new__.high), \ "S" (&(_var)) \ @@ -381,10 +381,10 @@ do { \ asm qual (ALTERNATIVE("call this_cpu_cmpxchg8b_emu", \ "cmpxchg8b " __percpu_arg([var]), X86_FEATURE_CX8) \ CC_SET(z) \ - : CC_OUT(z) (success), \ - [var] "+m" (__my_cpu_var(_var)), \ - "+a" (old__.low), \ - "+d" (old__.high) \ + : ALT_OUTPUT_SP(CC_OUT(z) (success), \ + [var] "+m" (__my_cpu_var(_var)), \ + "+a" (old__.low), \ + "+d" (old__.high)) \ : "b" (new__.low), \ "c" (new__.high), \ "S" (&(_var)) \ @@ -421,9 +421,9 @@ do { \ \ asm qual (ALTERNATIVE("call this_cpu_cmpxchg16b_emu", \ "cmpxchg16b " __percpu_arg([var]), X86_FEATURE_CX16) \ - : [var] "+m" (__my_cpu_var(_var)), \ - "+a" (old__.low), \ - "+d" (old__.high) \ + : ALT_OUTPUT_SP([var] "+m" (__my_cpu_var(_var)), \ + "+a" (old__.low), \ + "+d" (old__.high)) \ : "b" (new__.low), \ "c" (new__.high), \ "S" (&(_var)) \ @@ -452,10 +452,10 @@ do { \ asm qual (ALTERNATIVE("call this_cpu_cmpxchg16b_emu", \ "cmpxchg16b " __percpu_arg([var]), X86_FEATURE_CX16) \ CC_SET(z) \ - : CC_OUT(z) (success), \ - [var] "+m" (__my_cpu_var(_var)), \ - "+a" (old__.low), \ - "+d" (old__.high) \ + : ALT_OUTPUT_SP(CC_OUT(z) (success), \ + [var] "+m" (__my_cpu_var(_var)), \ + "+a" (old__.low), \ + "+d" (old__.high)) \ : "b" (new__.low), \ "c" (new__.high), \ "S" (&(_var)) \ : "memory"); \ -- cgit v1.2.3 From 2d352ec9fcb5d965318e7855b2406a7a14e9ae13 Mon Sep 17 00:00:00 2001 From: Uros Bizjak Date: Fri, 14 Feb 2025 16:07:47 +0100 Subject: x86/locking: Use asm_inline for {,try_}cmpxchg{64,128} emulations According to: https://gcc.gnu.org/onlinedocs/gcc/Size-of-an-asm.html the usage of asm pseudo directives in the asm template can confuse the compiler to wrongly estimate the size of the generated code. The ALTERNATIVE macro expands to several asm pseudo directives, so its usage in {,try_}cmpxchg{64,128} causes the instruction length estimate to fail by an order of magnitude (the specially instrumented compiler reports the estimated length of these asm templates to be more than 20 instructions long). This incorrect estimate further causes suboptimal inlining decisions, suboptimal instruction scheduling and suboptimal code block alignments for functions that use these locking primitives. Use asm_inline instead: https://gcc.gnu.org/pipermail/gcc-patches/2018-December/512349.html which is a feature that makes GCC pretend some inline assembler code is tiny (while it would think it is huge), instead of just asm. For code size estimation, the size of the asm is then taken as the minimum size of one instruction, ignoring how many instructions the compiler thinks it is. The effect of this patch on the x86_64 target is minor, since 128-bit functions are rarely used on this target.
The code size of the resulting defconfig object file stays the same: text data bss dec hex filename 27456612 4638523 814148 32909283 1f627e3 vmlinux-old.o 27456612 4638523 814148 32909283 1f627e3 vmlinux-new.o but the patch has a minor effect on code layout due to the different scheduling decisions in functions containing changed macros. There is no effect on the x86_32 target; the code size of the resulting defconfig object file and the code layout stay the same: text data bss dec hex filename 18883870 2679275 1707916 23271061 1631695 vmlinux-old.o 18883870 2679275 1707916 23271061 1631695 vmlinux-new.o Signed-off-by: Uros Bizjak Signed-off-by: Ingo Molnar Cc: Linus Torvalds Link: https://lore.kernel.org/r/20250214150929.5780-2-ubizjak@gmail.com --- arch/x86/include/asm/cmpxchg_32.h | 32 +++++++++------- arch/x86/include/asm/percpu.h | 77 +++++++++++++++++++-------------------- 2 files changed, 55 insertions(+), 54 deletions(-) diff --git a/arch/x86/include/asm/cmpxchg_32.h b/arch/x86/include/asm/cmpxchg_32.h index fd1282a783dd..95b5f990ca88 100644 --- a/arch/x86/include/asm/cmpxchg_32.h +++ b/arch/x86/include/asm/cmpxchg_32.h @@ -91,12 +91,14 @@ static __always_inline bool __try_cmpxchg64_local(volatile u64 *ptr, u64 *oldp, union __u64_halves o = { .full = (_old), }, \ n = { .full = (_new), }; \ \ - asm volatile(ALTERNATIVE(_lock_loc \ - "call cmpxchg8b_emu", \ - _lock "cmpxchg8b %a[ptr]", X86_FEATURE_CX8) \ - : ALT_OUTPUT_SP("+a" (o.low), "+d" (o.high)) \ - : "b" (n.low), "c" (n.high), [ptr] "S" (_ptr) \ - : "memory"); \ + asm_inline volatile( \ + ALTERNATIVE(_lock_loc \ + "call cmpxchg8b_emu", \ + _lock "cmpxchg8b %a[ptr]", X86_FEATURE_CX8) \ + : ALT_OUTPUT_SP("+a" (o.low), "+d" (o.high)) \ + : "b" (n.low), "c" (n.high), \ + [ptr] "S" (_ptr) \ + : "memory"); \ \ o.full; \ }) @@ -119,14 +121,16 @@ static __always_inline u64 arch_cmpxchg64_local(volatile u64 *ptr, u64 old, u64 n = { .full = (_new), }; \ bool ret; \ \ - asm volatile(ALTERNATIVE(_lock_loc \ - "call cmpxchg8b_emu", \ - _lock "cmpxchg8b %a[ptr]", X86_FEATURE_CX8) \ - CC_SET(e) \ - : ALT_OUTPUT_SP(CC_OUT(e) (ret), \ - "+a" (o.low), "+d" (o.high)) \ - : "b" (n.low), "c" (n.high), [ptr] "S" (_ptr) \ - : "memory"); \ + asm_inline volatile( \ + ALTERNATIVE(_lock_loc \ + "call cmpxchg8b_emu", \ + _lock "cmpxchg8b %a[ptr]", X86_FEATURE_CX8) \ + CC_SET(e) \ + : ALT_OUTPUT_SP(CC_OUT(e) (ret), \ + "+a" (o.low), "+d" (o.high)) \ + : "b" (n.low), "c" (n.high), \ + [ptr] "S" (_ptr) \ + : "memory"); \ \ if (unlikely(!ret)) \ *(_oldp) = o.full; \ diff --git a/arch/x86/include/asm/percpu.h b/arch/x86/include/asm/percpu.h index 0ab991fba7de..08f5f61690b7 100644 --- a/arch/x86/include/asm/percpu.h +++ b/arch/x86/include/asm/percpu.h @@ -348,15 +348,14 @@ do { \ old__.var = _oval; \ new__.var = _nval; \ \ - asm qual (ALTERNATIVE("call this_cpu_cmpxchg8b_emu", \ - "cmpxchg8b " __percpu_arg([var]), X86_FEATURE_CX8) \ - : ALT_OUTPUT_SP([var] "+m" (__my_cpu_var(_var)), \ - "+a" (old__.low), \ - "+d" (old__.high)) \ - : "b" (new__.low), \ - "c" (new__.high), \ - "S" (&(_var)) \ - : "memory"); \ + asm_inline qual ( \ + ALTERNATIVE("call this_cpu_cmpxchg8b_emu", \ + "cmpxchg8b " __percpu_arg([var]), X86_FEATURE_CX8) \ + : ALT_OUTPUT_SP([var] "+m" (__my_cpu_var(_var)), \ + "+a" (old__.low), "+d" (old__.high)) \ + : "b" (new__.low), "c" (new__.high), \ + "S" (&(_var)) \ + : "memory"); \ \ old__.var; \ }) @@ -378,17 +377,16 @@ do { \ old__.var = *_oval; \ new__.var = _nval; \ \ - asm qual (ALTERNATIVE("call this_cpu_cmpxchg8b_emu", \ - "cmpxchg8b " 
__percpu_arg([var]), X86_FEATURE_CX8) \ CC_SET(z) \ - : ALT_OUTPUT_SP(CC_OUT(z) (success), \ - [var] "+m" (__my_cpu_var(_var)), \ - "+a" (old__.low), \ - "+d" (old__.high)) \ - : "b" (new__.low), \ - "c" (new__.high), \ - "S" (&(_var)) \ - : "memory"); \ + asm_inline qual ( \ + ALTERNATIVE("call this_cpu_cmpxchg8b_emu", \ + "cmpxchg8b " __percpu_arg([var]), X86_FEATURE_CX8) \ + CC_SET(z) \ + : ALT_OUTPUT_SP(CC_OUT(z) (success), \ + [var] "+m" (__my_cpu_var(_var)), \ + "+a" (old__.low), "+d" (old__.high)) \ + : "b" (new__.low), "c" (new__.high), \ + "S" (&(_var)) \ + : "memory"); \ if (unlikely(!success)) \ *_oval = old__.var; \ \ @@ -419,15 +417,14 @@ do { \ old__.var = _oval; \ new__.var = _nval; \ \ - asm qual (ALTERNATIVE("call this_cpu_cmpxchg16b_emu", \ - "cmpxchg16b " __percpu_arg([var]), X86_FEATURE_CX16) \ - : ALT_OUTPUT_SP([var] "+m" (__my_cpu_var(_var)), \ - "+a" (old__.low), \ - "+d" (old__.high)) \ - : "b" (new__.low), \ - "c" (new__.high), \ - "S" (&(_var)) \ - : "memory"); \ + asm_inline qual ( \ + ALTERNATIVE("call this_cpu_cmpxchg16b_emu", \ + "cmpxchg16b " __percpu_arg([var]), X86_FEATURE_CX16) \ + : ALT_OUTPUT_SP([var] "+m" (__my_cpu_var(_var)), \ + "+a" (old__.low), "+d" (old__.high)) \ + : "b" (new__.low), "c" (new__.high), \ + "S" (&(_var)) \ + : "memory"); \ \ old__.var; \ }) @@ -449,19 +446,19 @@ do { \ old__.var = *_oval; \ new__.var = _nval; \ \ - asm qual (ALTERNATIVE("call this_cpu_cmpxchg16b_emu", \ - "cmpxchg16b " __percpu_arg([var]), X86_FEATURE_CX16) \ - CC_SET(z) \ - : ALT_OUTPUT_SP(CC_OUT(z) (success), \ - [var] "+m" (__my_cpu_var(_var)), \ - "+a" (old__.low), \ - "+d" (old__.high)) \ - : "b" (new__.low), \ - "c" (new__.high), \ - "S" (&(_var)) \ - : "memory"); \ + asm_inline qual ( \ + ALTERNATIVE("call this_cpu_cmpxchg16b_emu", \ + "cmpxchg16b " __percpu_arg([var]), X86_FEATURE_CX16) \ + CC_SET(z) \ + : ALT_OUTPUT_SP(CC_OUT(z) (success), \ + [var] "+m" (__my_cpu_var(_var)), \ + "+a" (old__.low), "+d" (old__.high)) \ + : "b" (new__.low), "c" (new__.high), \ + "S" (&(_var)) \ + : "memory"); \ if (unlikely(!success)) \ *_oval = old__.var; \ + \ likely(success); \ }) -- cgit v1.2.3 From a37259732a7dc33047fa1e4f9a338088f452e017 Mon Sep 17 00:00:00 2001 From: Rik van Riel Date: Thu, 13 Feb 2025 11:13:52 -0500 Subject: x86/mm: Make MMU_GATHER_RCU_TABLE_FREE unconditional Currently x86 uses CONFIG_MMU_GATHER_RCU_TABLE_FREE when using paravirt, and not when running on bare metal. There is no good reason to do things differently for each setup. Make them all the same. Currently get_user_pages_fast() synchronizes against page table freeing in two different ways: - on bare metal, by blocking IRQs, which block TLB flush IPIs - on paravirt, with MMU_GATHER_RCU_TABLE_FREE This is done because some paravirt TLB flush implementations handle the TLB flush in the hypervisor, and will do the flush even when the target CPU has interrupts disabled. Always handle page table freeing with MMU_GATHER_RCU_TABLE_FREE. Using RCU synchronization between page table freeing and get_user_pages_fast() allows bare metal to also do TLB flushing while interrupts are disabled. Various places in the mm do still block IRQs or disable preemption as an implicit way to block RCU frees. That makes it safe to use INVLPGB on AMD CPUs.
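A conceptual sketch of the resulting synchronization contract (do_pagetable_walk() is a hypothetical placeholder; the real lockless walk lives in mm/gup.c):

	/*
	 * With MMU_GATHER_RCU_TABLE_FREE, a page table unlinked by a remote CPU
	 * is only freed after an RCU grace period. A lockless walker therefore
	 * only needs to keep a grace period from elapsing while it holds
	 * pointers into the tables; blocking IRQs or disabling preemption has
	 * the same effect, which several mm paths rely on implicitly.
	 */
	static int walk_lockless(unsigned long addr)
	{
		int ret;

		rcu_read_lock();			/* pin the page tables */
		ret = do_pagetable_walk(addr);		/* hypothetical walker */
		rcu_read_unlock();

		return ret;
	}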
Suggested-by: Peter Zijlstra Signed-off-by: Rik van Riel Signed-off-by: Ingo Molnar Tested-by: Manali Shukla Tested-by: Brendan Jackman Tested-by: Michael Kelley Link: https://lore.kernel.org/r/20250213161423.449435-2-riel@surriel.com --- arch/x86/Kconfig | 2 +- arch/x86/kernel/paravirt.c | 17 +---------------- arch/x86/mm/pgtable.c | 27 ++++----------------------- 3 files changed, 6 insertions(+), 40 deletions(-) diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index c4175f4635ee..d581634c6a59 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -278,7 +278,7 @@ config X86 select HAVE_PCI select HAVE_PERF_REGS select HAVE_PERF_USER_STACK_DUMP - select MMU_GATHER_RCU_TABLE_FREE if PARAVIRT + select MMU_GATHER_RCU_TABLE_FREE select MMU_GATHER_MERGE_VMAS select HAVE_POSIX_CPU_TIMERS_TASK_WORK select HAVE_REGS_AND_STACK_ACCESS_API diff --git a/arch/x86/kernel/paravirt.c b/arch/x86/kernel/paravirt.c index 1ccaa3397a67..527f5605aa3e 100644 --- a/arch/x86/kernel/paravirt.c +++ b/arch/x86/kernel/paravirt.c @@ -59,21 +59,6 @@ void __init native_pv_lock_init(void) static_branch_enable(&virt_spin_lock_key); } -#ifndef CONFIG_PT_RECLAIM -static void native_tlb_remove_table(struct mmu_gather *tlb, void *table) -{ - struct ptdesc *ptdesc = (struct ptdesc *)table; - - pagetable_dtor(ptdesc); - tlb_remove_page(tlb, ptdesc_page(ptdesc)); -} -#else -static void native_tlb_remove_table(struct mmu_gather *tlb, void *table) -{ - tlb_remove_table(tlb, table); -} -#endif - struct static_key paravirt_steal_enabled; struct static_key paravirt_steal_rq_enabled; @@ -195,7 +180,7 @@ struct paravirt_patch_template pv_ops = { .mmu.flush_tlb_kernel = native_flush_tlb_global, .mmu.flush_tlb_one_user = native_flush_tlb_one_user, .mmu.flush_tlb_multi = native_flush_tlb_multi, - .mmu.tlb_remove_table = native_tlb_remove_table, + .mmu.tlb_remove_table = tlb_remove_table, .mmu.exit_mmap = paravirt_nop, .mmu.notify_page_enc_status_changed = paravirt_nop, diff --git a/arch/x86/mm/pgtable.c b/arch/x86/mm/pgtable.c index 1fef5ad32d5a..b1c1f72c1fd1 100644 --- a/arch/x86/mm/pgtable.c +++ b/arch/x86/mm/pgtable.c @@ -18,25 +18,6 @@ EXPORT_SYMBOL(physical_mask); #define PGTABLE_HIGHMEM 0 #endif -#ifndef CONFIG_PARAVIRT -#ifndef CONFIG_PT_RECLAIM -static inline -void paravirt_tlb_remove_table(struct mmu_gather *tlb, void *table) -{ - struct ptdesc *ptdesc = (struct ptdesc *)table; - - pagetable_dtor(ptdesc); - tlb_remove_page(tlb, ptdesc_page(ptdesc)); -} -#else -static inline -void paravirt_tlb_remove_table(struct mmu_gather *tlb, void *table) -{ - tlb_remove_table(tlb, table); -} -#endif /* !CONFIG_PT_RECLAIM */ -#endif /* !CONFIG_PARAVIRT */ - gfp_t __userpte_alloc_gfp = GFP_PGTABLE_USER | PGTABLE_HIGHMEM; pgtable_t pte_alloc_one(struct mm_struct *mm) @@ -64,7 +45,7 @@ early_param("userpte", setup_userpte); void ___pte_free_tlb(struct mmu_gather *tlb, struct page *pte) { paravirt_release_pte(page_to_pfn(pte)); - paravirt_tlb_remove_table(tlb, page_ptdesc(pte)); + tlb_remove_table(tlb, page_ptdesc(pte)); } #if CONFIG_PGTABLE_LEVELS > 2 @@ -78,21 +59,21 @@ void ___pmd_free_tlb(struct mmu_gather *tlb, pmd_t *pmd) #ifdef CONFIG_X86_PAE tlb->need_flush_all = 1; #endif - paravirt_tlb_remove_table(tlb, virt_to_ptdesc(pmd)); + tlb_remove_table(tlb, virt_to_ptdesc(pmd)); } #if CONFIG_PGTABLE_LEVELS > 3 void ___pud_free_tlb(struct mmu_gather *tlb, pud_t *pud) { paravirt_release_pud(__pa(pud) >> PAGE_SHIFT); - paravirt_tlb_remove_table(tlb, virt_to_ptdesc(pud)); + tlb_remove_table(tlb, virt_to_ptdesc(pud)); } #if CONFIG_PGTABLE_LEVELS > 4 
void ___p4d_free_tlb(struct mmu_gather *tlb, p4d_t *p4d) { paravirt_release_p4d(__pa(p4d) >> PAGE_SHIFT); - paravirt_tlb_remove_table(tlb, virt_to_ptdesc(p4d)); + tlb_remove_table(tlb, virt_to_ptdesc(p4d)); } #endif /* CONFIG_PGTABLE_LEVELS > 4 */ #endif /* CONFIG_PGTABLE_LEVELS > 3 */ -- cgit v1.2.3 From f2c5c21058270167ce23172022da083b62e5ad4c Mon Sep 17 00:00:00 2001 From: Rik van Riel Date: Thu, 13 Feb 2025 11:13:53 -0500 Subject: x86/mm: Remove pv_ops.mmu.tlb_remove_table call Every pv_ops.mmu.tlb_remove_table call ends up calling tlb_remove_table. Get rid of the indirection by simply calling tlb_remove_table directly, and not going through the paravirt function pointers. Suggested-by: Qi Zheng Signed-off-by: Rik van Riel Signed-off-by: Ingo Molnar Tested-by: Manali Shukla Tested-by: Brendan Jackman Tested-by: Michael Kelley Link: https://lore.kernel.org/r/20250213161423.449435-3-riel@surriel.com --- arch/x86/hyperv/mmu.c | 1 - arch/x86/include/asm/paravirt.h | 5 ----- arch/x86/include/asm/paravirt_types.h | 2 -- arch/x86/kernel/kvm.c | 1 - arch/x86/kernel/paravirt.c | 1 - arch/x86/xen/mmu_pv.c | 1 - 6 files changed, 11 deletions(-) diff --git a/arch/x86/hyperv/mmu.c b/arch/x86/hyperv/mmu.c index cc8c3bd0e7c2..1f7c3082a36d 100644 --- a/arch/x86/hyperv/mmu.c +++ b/arch/x86/hyperv/mmu.c @@ -239,5 +239,4 @@ void hyperv_setup_mmu_ops(void) pr_info("Using hypercall for remote TLB flush\n"); pv_ops.mmu.flush_tlb_multi = hyperv_flush_tlb_multi; - pv_ops.mmu.tlb_remove_table = tlb_remove_table; } diff --git a/arch/x86/include/asm/paravirt.h b/arch/x86/include/asm/paravirt.h index 041aff51eb50..38a632a282d4 100644 --- a/arch/x86/include/asm/paravirt.h +++ b/arch/x86/include/asm/paravirt.h @@ -91,11 +91,6 @@ static inline void __flush_tlb_multi(const struct cpumask *cpumask, PVOP_VCALL2(mmu.flush_tlb_multi, cpumask, info); } -static inline void paravirt_tlb_remove_table(struct mmu_gather *tlb, void *table) -{ - PVOP_VCALL2(mmu.tlb_remove_table, tlb, table); -} - static inline void paravirt_arch_exit_mmap(struct mm_struct *mm) { PVOP_VCALL1(mmu.exit_mmap, mm); diff --git a/arch/x86/include/asm/paravirt_types.h b/arch/x86/include/asm/paravirt_types.h index fea56b04f436..e26633c00455 100644 --- a/arch/x86/include/asm/paravirt_types.h +++ b/arch/x86/include/asm/paravirt_types.h @@ -134,8 +134,6 @@ struct pv_mmu_ops { void (*flush_tlb_multi)(const struct cpumask *cpus, const struct flush_tlb_info *info); - void (*tlb_remove_table)(struct mmu_gather *tlb, void *table); - /* Hook for intercepting the destruction of an mm_struct. 
*/ void (*exit_mmap)(struct mm_struct *mm); void (*notify_page_enc_status_changed)(unsigned long pfn, int npages, bool enc); diff --git a/arch/x86/kernel/kvm.c b/arch/x86/kernel/kvm.c index 7a422a6c5983..3be9b3342c67 100644 --- a/arch/x86/kernel/kvm.c +++ b/arch/x86/kernel/kvm.c @@ -838,7 +838,6 @@ static void __init kvm_guest_init(void) #ifdef CONFIG_SMP if (pv_tlb_flush_supported()) { pv_ops.mmu.flush_tlb_multi = kvm_flush_tlb_multi; - pv_ops.mmu.tlb_remove_table = tlb_remove_table; pr_info("KVM setup pv remote TLB flush\n"); } diff --git a/arch/x86/kernel/paravirt.c b/arch/x86/kernel/paravirt.c index 527f5605aa3e..2aa251d0b308 100644 --- a/arch/x86/kernel/paravirt.c +++ b/arch/x86/kernel/paravirt.c @@ -180,7 +180,6 @@ struct paravirt_patch_template pv_ops = { .mmu.flush_tlb_kernel = native_flush_tlb_global, .mmu.flush_tlb_one_user = native_flush_tlb_one_user, .mmu.flush_tlb_multi = native_flush_tlb_multi, - .mmu.tlb_remove_table = tlb_remove_table, .mmu.exit_mmap = paravirt_nop, .mmu.notify_page_enc_status_changed = paravirt_nop, diff --git a/arch/x86/xen/mmu_pv.c b/arch/x86/xen/mmu_pv.c index d078de2c952b..38971c6dcd4b 100644 --- a/arch/x86/xen/mmu_pv.c +++ b/arch/x86/xen/mmu_pv.c @@ -2189,7 +2189,6 @@ static const typeof(pv_ops) xen_mmu_ops __initconst = { .flush_tlb_kernel = xen_flush_tlb, .flush_tlb_one_user = xen_flush_tlb_one_user, .flush_tlb_multi = xen_flush_tlb_multi, - .tlb_remove_table = tlb_remove_table, .pgd_alloc = xen_pgd_alloc, .pgd_free = xen_pgd_free, -- cgit v1.2.3 From 3fcae7771fb724c276e87e80827b264d2c3ad67e Mon Sep 17 00:00:00 2001 From: Andy Shevchenko Date: Tue, 11 Feb 2025 16:57:21 +0200 Subject: x86/pat: Fix W=1 build warning when the within_inclusive() function is unused The within_inclusive() function, in some cases, when CONFIG_X86_64=n, may not be used. This, in particular, prevents kernel builds with Clang, `make W=1` and CONFIG_WERROR=y: arch/x86/mm/pat/set_memory.c:215:1: error: unused function 'within_inclusive' [-Werror,-Wunused-function] Fix this by guarding the definitions with the respective ifdeffery. See also: 6863f5643dd7 ("kbuild: allow Clang to find unused static inline functions for W=1 build") Signed-off-by: Andy Shevchenko Signed-off-by: Ingo Molnar Link: https://lore.kernel.org/r/20250211145721.1620552-1-andriy.shevchenko@linux.intel.com --- arch/x86/mm/pat/set_memory.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/arch/x86/mm/pat/set_memory.c b/arch/x86/mm/pat/set_memory.c index 7bd0f62ba48f..84d0bca3be28 100644 --- a/arch/x86/mm/pat/set_memory.c +++ b/arch/x86/mm/pat/set_memory.c @@ -225,14 +225,14 @@ within(unsigned long addr, unsigned long start, unsigned long end) return addr >= start && addr < end; } +#ifdef CONFIG_X86_64 + static inline int within_inclusive(unsigned long addr, unsigned long start, unsigned long end) { return addr >= start && addr <= end; } -#ifdef CONFIG_X86_64 - /* * The kernel image is mapped into two places in the virtual address space (addresses without KASLR, of course): -- cgit v1.2.3 From 7ffb791423c7c518269a9aad35039ef824a40adb Mon Sep 17 00:00:00 2001 From: Balbir Singh Date: Fri, 7 Feb 2025 10:42:34 +1100 Subject: x86/kaslr: Reduce KASLR entropy on most x86 systems When CONFIG_PCI_P2PDMA=y (which is basically enabled on all large x86 distros), it maps the PFNs via a ZONE_DEVICE mapping using devm_memremap_pages(). The mapped virtual address range corresponds to the pci_resource_start() of the BAR address and size corresponding to the BAR length. 
When KASLR is enabled, the direct map range of the kernel is reduced to the size of physical memory plus additional padding. If the BAR address is beyond this limit, PCI peer-to-peer DMA mappings fail. Fix this by not shrinking the size of the direct map when CONFIG_PCI_P2PDMA=y. This reduces the total available entropy, but it's better than the current workaround of having to disable KASLR completely. [ mingo: Clarified the changelog to point out the broad impact ... ] Signed-off-by: Balbir Singh Signed-off-by: Ingo Molnar Reviewed-by: Kees Cook Acked-by: Bjorn Helgaas # drivers/pci/Kconfig Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Andy Lutomirski Link: https://lore.kernel.org/lkml/20250206023201.1481957-1-balbirs@nvidia.com/ Link: https://lore.kernel.org/r/20250206234234.1912585-1-balbirs@nvidia.com --- arch/x86/mm/kaslr.c | 10 ++++++++-- drivers/pci/Kconfig | 6 ++++++ 2 files changed, 14 insertions(+), 2 deletions(-) diff --git a/arch/x86/mm/kaslr.c b/arch/x86/mm/kaslr.c index 11a93542d198..3c306de52fd4 100644 --- a/arch/x86/mm/kaslr.c +++ b/arch/x86/mm/kaslr.c @@ -113,8 +113,14 @@ void __init kernel_randomize_memory(void) memory_tb = DIV_ROUND_UP(max_pfn << PAGE_SHIFT, 1UL << TB_SHIFT) + CONFIG_RANDOMIZE_MEMORY_PHYSICAL_PADDING; - /* Adapt physical memory region size based on available memory */ - if (memory_tb < kaslr_regions[0].size_tb) + /* + * Adapt physical memory region size based on available memory, + * except when CONFIG_PCI_P2PDMA is enabled. P2PDMA exposes the + * device BAR space assuming the direct map space is large enough + * for creating a ZONE_DEVICE mapping in the direct map corresponding + * to the physical BAR address. + */ + if (!IS_ENABLED(CONFIG_PCI_P2PDMA) && (memory_tb < kaslr_regions[0].size_tb)) kaslr_regions[0].size_tb = memory_tb; /* diff --git a/drivers/pci/Kconfig b/drivers/pci/Kconfig index 2fbd379923fd..5c3054aaec8c 100644 --- a/drivers/pci/Kconfig +++ b/drivers/pci/Kconfig @@ -203,6 +203,12 @@ config PCI_P2PDMA P2P DMA transactions must be between devices behind the same root port. + Enabling this option will reduce the entropy of x86 KASLR memory + regions. For example - on a 46 bit system, the entropy goes down + from 16 bits to 15 bits. The actual reduction in entropy depends + on the physical address bits, on processor features, kernel config + (5 level page table) and physical memory present on the system. + If unsure, say N. config PCI_LABEL -- cgit v1.2.3 From 43bb700cff6bc2f0d337006b864192227fb05dc1 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Mon, 27 Jan 2025 17:22:52 +0100 Subject: x86/cpu: Update Intel Family comments Because who can ever remember all these names. 
Signed-off-by: Peter Zijlstra (Intel) Signed-off-by: Ingo Molnar Link: https://lore.kernel.org/r/20250127162252.GK16742@noisy.programming.kicks-ass.net --- arch/x86/include/asm/intel-family.h | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/arch/x86/include/asm/intel-family.h b/arch/x86/include/asm/intel-family.h index 8359113e3e58..f9f67afeb48a 100644 --- a/arch/x86/include/asm/intel-family.h +++ b/arch/x86/include/asm/intel-family.h @@ -110,9 +110,9 @@ #define INTEL_SAPPHIRERAPIDS_X IFM(6, 0x8F) /* Golden Cove */ -#define INTEL_EMERALDRAPIDS_X IFM(6, 0xCF) +#define INTEL_EMERALDRAPIDS_X IFM(6, 0xCF) /* Raptor Cove */ -#define INTEL_GRANITERAPIDS_X IFM(6, 0xAD) +#define INTEL_GRANITERAPIDS_X IFM(6, 0xAD) /* Redwood Cove */ #define INTEL_GRANITERAPIDS_D IFM(6, 0xAE) /* "Hybrid" Processors (P-Core/E-Core) */ @@ -126,16 +126,16 @@ #define INTEL_RAPTORLAKE_P IFM(6, 0xBA) #define INTEL_RAPTORLAKE_S IFM(6, 0xBF) -#define INTEL_METEORLAKE IFM(6, 0xAC) +#define INTEL_METEORLAKE IFM(6, 0xAC) /* Redwood Cove / Crestmont */ #define INTEL_METEORLAKE_L IFM(6, 0xAA) -#define INTEL_ARROWLAKE_H IFM(6, 0xC5) +#define INTEL_ARROWLAKE_H IFM(6, 0xC5) /* Lion Cove / Skymont */ #define INTEL_ARROWLAKE IFM(6, 0xC6) #define INTEL_ARROWLAKE_U IFM(6, 0xB5) -#define INTEL_LUNARLAKE_M IFM(6, 0xBD) +#define INTEL_LUNARLAKE_M IFM(6, 0xBD) /* Lion Cove / Skymont */ -#define INTEL_PANTHERLAKE_L IFM(6, 0xCC) +#define INTEL_PANTHERLAKE_L IFM(6, 0xCC) /* Cougar Cove / Crestmont */ /* "Small Core" Processors (Atom/E-Core) */ -- cgit v1.2.3 From ec8f5b4659b4044db55e1f7d947703dd4948626c Mon Sep 17 00:00:00 2001 From: Maciej Wieczor-Retman Date: Mon, 27 Jan 2025 16:31:55 +0100 Subject: selftests/lam: Move cpu_has_la57() to use cpuinfo flag In its current form, cpu_has_la57() reports the platform's support for LA57 by reading the output of cpuid. A much more useful piece of information is whether 5-level paging is actually enabled on the running system. Check whether 5-level paging is enabled by trying to map a page in the high linear address space. Signed-off-by: Maciej Wieczor-Retman Signed-off-by: Ingo Molnar Cc: Kirill A. Shutemov Cc: Dave Hansen Cc: Alexander Potapenko Cc: Peter Zijlstra Cc: Shuah Khan Link: https://lore.kernel.org/r/8b1ca51b13e6d94b5a42b6930d81b692cbb0bcbb.1737990375.git.maciej.wieczor-retman@intel.com --- tools/testing/selftests/x86/lam.c | 16 ++++++++++------ 1 file changed, 10 insertions(+), 6 deletions(-) diff --git a/tools/testing/selftests/x86/lam.c b/tools/testing/selftests/x86/lam.c index 4d4a76532dc9..60170a31aa69 100644 --- a/tools/testing/selftests/x86/lam.c +++ b/tools/testing/selftests/x86/lam.c @@ -124,14 +124,18 @@ static inline int cpu_has_lam(void) return (cpuinfo[0] & (1 << 26)); } -/* Check 5-level page table feature in CPUID.(EAX=07H, ECX=00H):ECX.[bit 16] */ -static inline int cpu_has_la57(void) +static inline int la57_enabled(void) { - unsigned int cpuinfo[4]; + int ret; + void *p; + + p = mmap((void *)HIGH_ADDR, PAGE_SIZE, PROT_READ | PROT_WRITE, + MAP_PRIVATE | MAP_ANONYMOUS | MAP_FIXED, -1, 0); - __cpuid_count(0x7, 0, cpuinfo[0], cpuinfo[1], cpuinfo[2], cpuinfo[3]); + ret = p == MAP_FAILED ? 
0 : 1; - return (cpuinfo[2] & (1 << 16)); + munmap(p, PAGE_SIZE); + return ret; } /* @@ -322,7 +326,7 @@ static int handle_mmap(struct testcases *test) flags, -1, 0); if (ptr == MAP_FAILED) { if (test->addr == HIGH_ADDR) - if (!cpu_has_la57()) + if (!la57_enabled()) return 3; /* unsupport LA57 */ return 1; } -- cgit v1.2.3 From 51f909dcd178655b104d979a6870535268724498 Mon Sep 17 00:00:00 2001 From: Maciej Wieczor-Retman Date: Mon, 27 Jan 2025 16:31:56 +0100 Subject: selftests/lam: Skip test if LAM is disabled Until LASS is merged into the kernel: https://lore.kernel.org/all/20241028160917.1380714-1-alexander.shishkin@linux.intel.com/ LAM is left disabled in the config file. Running the LAM selftest with LAM disabled only results in unhelpful output. Use one of the LAM syscalls to determine whether the kernel was compiled with LAM support (CONFIG_ADDRESS_MASKING) or not. Skip running the tests in the latter case. Merge the CPUID checking function with the one mentioned above to achieve a single function that shows LAM's availability from both the CPU and the kernel. Signed-off-by: Maciej Wieczor-Retman Signed-off-by: Ingo Molnar Cc: Kirill A. Shutemov Cc: Dave Hansen Cc: Alexander Potapenko Cc: Peter Zijlstra Cc: Shuah Khan Link: https://lore.kernel.org/r/251d0f45f6a768030115e8d04bc85458910cb0dc.1737990375.git.maciej.wieczor-retman@intel.com --- tools/testing/selftests/x86/lam.c | 23 ++++++++++++++++++----- 1 file changed, 18 insertions(+), 5 deletions(-) diff --git a/tools/testing/selftests/x86/lam.c b/tools/testing/selftests/x86/lam.c index 60170a31aa69..df91a41a5bab 100644 --- a/tools/testing/selftests/x86/lam.c +++ b/tools/testing/selftests/x86/lam.c @@ -115,13 +115,28 @@ static void segv_handler(int sig) siglongjmp(segv_env, 1); } -static inline int cpu_has_lam(void) +static inline int lam_is_available(void) { unsigned int cpuinfo[4]; + unsigned long bits = 0; + int ret; __cpuid_count(0x7, 1, cpuinfo[0], cpuinfo[1], cpuinfo[2], cpuinfo[3]); - return (cpuinfo[0] & (1 << 26)); + /* Check if cpu supports LAM */ + if (!(cpuinfo[0] & (1 << 26))) { + ksft_print_msg("LAM is not supported!\n"); + return 0; + } + + /* Return 0 if CONFIG_ADDRESS_MASKING is not set */ + ret = syscall(SYS_arch_prctl, ARCH_GET_MAX_TAG_BITS, &bits); + if (ret) { + ksft_print_msg("LAM is disabled in the kernel!\n"); + return 0; + } + + return 1; } static inline int la57_enabled(void) @@ -1185,10 +1200,8 @@ int main(int argc, char **argv) tests_cnt = 0; - if (!cpu_has_lam()) { - ksft_print_msg("Unsupported LAM feature!\n"); + if (!lam_is_available()) return KSFT_SKIP; - } while ((c = getopt(argc, argv, "ht:")) != -1) { switch (c) { -- cgit v1.2.3 From 782b819827ee84532f3069e37aa091c1be00fa44 Mon Sep 17 00:00:00 2001 From: Maciej Wieczor-Retman Date: Mon, 27 Jan 2025 16:31:57 +0100 Subject: selftests/lam: Test get_user() LAM pointer handling A recent change in how get_user() handles pointers: https://lore.kernel.org/all/20241024013214.129639-1-torvalds@linux-foundation.org/ has a specific case for LAM. It assigns a different bitmask that's later used to check whether a pointer comes from userland in get_user(). Add a test case to LAM that utilizes an ioctl (FIOASYNC) syscall which uses get_user() in its implementation. Execute the syscall with differently tagged pointers to verify that valid user pointers are passing through and invalid kernel/non-canonical pointers are not. Signed-off-by: Maciej Wieczor-Retman Signed-off-by: Ingo Molnar Cc: Linus Torvalds Cc: Kirill A. 
Shutemov Cc: Dave Hansen Cc: Alexander Potapenko Cc: Peter Zijlstra Cc: Shuah Khan Link: https://lore.kernel.org/r/1624d9d1b9502517053a056652d50dc5d26884ac.1737990375.git.maciej.wieczor-retman@intel.com --- tools/testing/selftests/x86/lam.c | 108 ++++++++++++++++++++++++++++++++++++++ 1 file changed, 108 insertions(+) diff --git a/tools/testing/selftests/x86/lam.c b/tools/testing/selftests/x86/lam.c index df91a41a5bab..b6166a5f37d6 100644 --- a/tools/testing/selftests/x86/lam.c +++ b/tools/testing/selftests/x86/lam.c @@ -4,6 +4,7 @@ #include #include #include +#include #include #include #include @@ -43,7 +44,15 @@ #define FUNC_INHERITE 0x20 #define FUNC_PASID 0x40 +/* get_user() pointer test cases */ +#define GET_USER_USER 0 +#define GET_USER_KERNEL_TOP 1 +#define GET_USER_KERNEL_BOT 2 +#define GET_USER_KERNEL 3 + #define TEST_MASK 0x7f +#define L5_SIGN_EXT_MASK (0xFFUL << 56) +#define L4_SIGN_EXT_MASK (0x1FFFFUL << 47) #define LOW_ADDR (0x1UL << 30) #define HIGH_ADDR (0x3UL << 48) @@ -389,6 +398,78 @@ static int handle_syscall(struct testcases *test) return ret; } +static int get_user_syscall(struct testcases *test) +{ + uint64_t ptr_address, bitmask; + int fd, ret = 0; + void *ptr; + + if (la57_enabled()) { + bitmask = L5_SIGN_EXT_MASK; + ptr_address = HIGH_ADDR; + } else { + bitmask = L4_SIGN_EXT_MASK; + ptr_address = LOW_ADDR; + } + + ptr = mmap((void *)ptr_address, PAGE_SIZE, PROT_READ | PROT_WRITE, + MAP_PRIVATE | MAP_ANONYMOUS | MAP_FIXED, -1, 0); + + if (ptr == MAP_FAILED) { + perror("failed to map byte to pass into get_user"); + return 1; + } + + if (set_lam(test->lam) != 0) { + ret = 2; + goto error; + } + + fd = memfd_create("lam_ioctl", 0); + if (fd == -1) { + munmap(ptr, PAGE_SIZE); + exit(EXIT_FAILURE); + } + + switch (test->later) { + case GET_USER_USER: + /* Control group - properly tagged user pointer */ + ptr = (void *)set_metadata((uint64_t)ptr, test->lam); + break; + case GET_USER_KERNEL_TOP: + /* Kernel address with top bit cleared */ + bitmask &= (bitmask >> 1); + ptr = (void *)((uint64_t)ptr | bitmask); + break; + case GET_USER_KERNEL_BOT: + /* Kernel address with bottom sign-extension bit cleared */ + bitmask &= (bitmask << 1); + ptr = (void *)((uint64_t)ptr | bitmask); + break; + case GET_USER_KERNEL: + /* Try to pass a kernel address */ + ptr = (void *)((uint64_t)ptr | bitmask); + break; + default: + printf("Invalid test case value passed!\n"); + break; + } + + /* + * Use FIOASYNC ioctl because it utilizes get_user() internally and is + * very non-invasive to the system. Pass differently tagged pointers to + * get_user() in order to verify that valid user pointers are going + * through and invalid kernel/non-canonical pointers are not. + */ + if (ioctl(fd, FIOASYNC, ptr) != 0) + ret = 1; + + close(fd); +error: + munmap(ptr, PAGE_SIZE); + return ret; +} + int sys_uring_setup(unsigned int entries, struct io_uring_params *p) { return (int)syscall(__NR_io_uring_setup, entries, p); @@ -902,6 +983,33 @@ static struct testcases syscall_cases[] = { .test_func = handle_syscall, .msg = "SYSCALL:[Negative] Disable LAM. 
Dereferencing pointer with metadata.\n", }, + { + .later = GET_USER_USER, + .lam = LAM_U57_BITS, + .test_func = get_user_syscall, + .msg = "GET_USER: get_user() and pass a properly tagged user pointer.\n", + }, + { + .later = GET_USER_KERNEL_TOP, + .expected = 1, + .lam = LAM_U57_BITS, + .test_func = get_user_syscall, + .msg = "GET_USER:[Negative] get_user() with a kernel pointer and the top bit cleared.\n", + }, + { + .later = GET_USER_KERNEL_BOT, + .expected = 1, + .lam = LAM_U57_BITS, + .test_func = get_user_syscall, + .msg = "GET_USER:[Negative] get_user() with a kernel pointer and the bottom sign-extension bit cleared.\n", + }, + { + .later = GET_USER_KERNEL, + .expected = 1, + .lam = LAM_U57_BITS, + .test_func = get_user_syscall, + .msg = "GET_USER:[Negative] get_user() and pass a kernel pointer.\n", + }, }; static struct testcases mmap_cases[] = { -- cgit v1.2.3 From dc8bd769e70ecae0916bf1b05acad6120c6bd6f0 Mon Sep 17 00:00:00 2001 From: Uros Bizjak Date: Sun, 23 Feb 2025 17:13:38 +0100 Subject: x86/ioperm: Use atomic64_inc_return() in ksys_ioperm() Use atomic64_inc_return(&ref) instead of atomic64_add_return(1, &ref) to use optimized implementation on targets that define atomic_inc_return() and to remove now unneeded initialization of the %eax/%edx register pair before the call to atomic64_inc_return(). On x86_32 the code improves from: 1b0: b9 00 00 00 00 mov $0x0,%ecx 1b1: R_386_32 .bss 1b5: 89 43 0c mov %eax,0xc(%ebx) 1b8: 31 d2 xor %edx,%edx 1ba: b8 01 00 00 00 mov $0x1,%eax 1bf: e8 fc ff ff ff call 1c0 1c0: R_386_PC32 atomic64_add_return_cx8 1c4: 89 03 mov %eax,(%ebx) 1c6: 89 53 04 mov %edx,0x4(%ebx) to: 1b0: be 00 00 00 00 mov $0x0,%esi 1b1: R_386_32 .bss 1b5: 89 43 0c mov %eax,0xc(%ebx) 1b8: e8 fc ff ff ff call 1b9 1b9: R_386_PC32 atomic64_inc_return_cx8 1bd: 89 03 mov %eax,(%ebx) 1bf: 89 53 04 mov %edx,0x4(%ebx) Signed-off-by: Uros Bizjak Signed-off-by: Ingo Molnar Link: https://lore.kernel.org/r/20250223161355.3607-1-ubizjak@gmail.com --- arch/x86/kernel/ioport.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/x86/kernel/ioport.c b/arch/x86/kernel/ioport.c index e2fab3ceb09f..6290dd120f5e 100644 --- a/arch/x86/kernel/ioport.c +++ b/arch/x86/kernel/ioport.c @@ -144,7 +144,7 @@ long ksys_ioperm(unsigned long from, unsigned long num, int turn_on) * Update the sequence number to force a TSS update on return to * user mode. */ - iobm->sequence = atomic64_add_return(1, &io_bitmap_sequence); + iobm->sequence = atomic64_inc_return(&io_bitmap_sequence); return 0; } -- cgit v1.2.3 From d40459cc157f8ed8d28434c761ca7010630351be Mon Sep 17 00:00:00 2001 From: Uros Bizjak Date: Mon, 24 Feb 2025 08:16:34 +0100 Subject: x86/percpu: Unify __pcpu_op{1,2}_N() macros to __pcpu_op_N() Unify __pcpu_op1_N() and __pcpu_op2_N() macros to __pcpu_op_N() by applying the macro only to asm mnemonic, not to the mnemonic plus its arguments. No functional change intended. Signed-off-by: Uros Bizjak Signed-off-by: Ingo Molnar Cc: Brian Gerst Cc: H. 
Peter Anvin Cc: Linus Torvalds Link: https://lore.kernel.org/r/20250224071648.15913-1-ubizjak@gmail.com --- arch/x86/include/asm/percpu.h | 38 ++++++++++++++++++-------------------- 1 file changed, 18 insertions(+), 20 deletions(-) diff --git a/arch/x86/include/asm/percpu.h b/arch/x86/include/asm/percpu.h index 1a76eb87c5d8..c2a9dfce36a5 100644 --- a/arch/x86/include/asm/percpu.h +++ b/arch/x86/include/asm/percpu.h @@ -106,15 +106,10 @@ #define __pcpu_cast_4(val) ((u32)(((unsigned long) val) & 0xffffffff)) #define __pcpu_cast_8(val) ((u64)(val)) -#define __pcpu_op1_1(op, dst) op "b " dst -#define __pcpu_op1_2(op, dst) op "w " dst -#define __pcpu_op1_4(op, dst) op "l " dst -#define __pcpu_op1_8(op, dst) op "q " dst - -#define __pcpu_op2_1(op, src, dst) op "b " src ", " dst -#define __pcpu_op2_2(op, src, dst) op "w " src ", " dst -#define __pcpu_op2_4(op, src, dst) op "l " src ", " dst -#define __pcpu_op2_8(op, src, dst) op "q " src ", " dst +#define __pcpu_op_1(op) op "b " +#define __pcpu_op_2(op) op "w " +#define __pcpu_op_4(op) op "l " +#define __pcpu_op_8(op) op "q " #define __pcpu_reg_1(mod, x) mod "q" (x) #define __pcpu_reg_2(mod, x) mod "r" (x) @@ -146,7 +141,8 @@ do { \ ({ \ __pcpu_type_##size pfo_val__; \ \ - asm qual (__pcpu_op2_##size("mov", __percpu_arg([var]), "%[val]") \ + asm qual (__pcpu_op_##size("mov") \ + __percpu_arg([var]) ", %[val]" \ : [val] __pcpu_reg_##size("=", pfo_val__) \ : [var] "m" (__my_cpu_var(_var))); \ \ @@ -162,7 +158,8 @@ do { \ pto_tmp__ = (_val); \ (void)pto_tmp__; \ } \ - asm qual(__pcpu_op2_##size("mov", "%[val]", __percpu_arg([var])) \ + asm qual (__pcpu_op_##size("mov") "%[val], " \ + __percpu_arg([var]) \ : [var] "=m" (__my_cpu_var(_var)) \ : [val] __pcpu_reg_imm_##size(pto_val__)); \ } while (0) @@ -179,7 +176,8 @@ do { \ ({ \ __pcpu_type_##size pfo_val__; \ \ - asm(__pcpu_op2_##size("mov", __force_percpu_arg(a[var]), "%[val]") \ + asm(__pcpu_op_##size("mov") \ + __force_percpu_arg(a[var]) ", %[val]" \ : [val] __pcpu_reg_##size("=", pfo_val__) \ : [var] "i" (&(_var))); \ \ @@ -188,7 +186,7 @@ do { \ #define percpu_unary_op(size, qual, op, _var) \ ({ \ - asm qual (__pcpu_op1_##size(op, __percpu_arg([var])) \ + asm qual (__pcpu_op_##size(op) __percpu_arg([var]) \ : [var] "+m" (__my_cpu_var(_var))); \ }) @@ -201,7 +199,7 @@ do { \ pto_tmp__ = (_val); \ (void)pto_tmp__; \ } \ - asm qual(__pcpu_op2_##size(op, "%[val]", __percpu_arg([var])) \ + asm qual (__pcpu_op_##size(op) "%[val], " __percpu_arg([var]) \ : [var] "+m" (__my_cpu_var(_var)) \ : [val] __pcpu_reg_imm_##size(pto_val__)); \ } while (0) @@ -237,8 +235,8 @@ do { \ ({ \ __pcpu_type_##size paro_tmp__ = __pcpu_cast_##size(_val); \ \ - asm qual (__pcpu_op2_##size("xadd", "%[tmp]", \ - __percpu_arg([var])) \ + asm qual (__pcpu_op_##size("xadd") "%[tmp], " \ + __percpu_arg([var]) \ : [tmp] __pcpu_reg_##size("+", paro_tmp__), \ [var] "+m" (__my_cpu_var(_var)) \ : : "memory"); \ @@ -281,8 +279,8 @@ do { \ __pcpu_type_##size pco_old__ = __pcpu_cast_##size(_oval); \ __pcpu_type_##size pco_new__ = __pcpu_cast_##size(_nval); \ \ - asm qual (__pcpu_op2_##size("cmpxchg", "%[nval]", \ - __percpu_arg([var])) \ + asm qual (__pcpu_op_##size("cmpxchg") "%[nval], " \ + __percpu_arg([var]) \ : [oval] "+a" (pco_old__), \ [var] "+m" (__my_cpu_var(_var)) \ : [nval] __pcpu_reg_##size(, pco_new__) \ @@ -298,8 +296,8 @@ do { \ __pcpu_type_##size pco_old__ = *pco_oval__; \ __pcpu_type_##size pco_new__ = __pcpu_cast_##size(_nval); \ \ - asm qual (__pcpu_op2_##size("cmpxchg", "%[nval]", \ - __percpu_arg([var])) \ + asm 
qual (__pcpu_op_##size("cmpxchg") "%[nval], " \ + __percpu_arg([var]) \ CC_SET(z) \ : CC_OUT(z) (success), \ [oval] "+a" (pco_old__), \ -- cgit v1.2.3 From c4f23a9d6e7314060ccf5f089eda179cdcc3b36a Mon Sep 17 00:00:00 2001 From: liuye Date: Tue, 14 Jan 2025 16:26:50 +0800 Subject: selftests/x86/lam: Fix minor memory leak in do_uring() The exception branch returns without freeing 'fi'. Signed-off-by: liuye Signed-off-by: Ingo Molnar Link: https://lore.kernel.org/r/20250114082650.113105-1-liuye@kylinos.cn --- tools/testing/selftests/x86/lam.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/tools/testing/selftests/x86/lam.c b/tools/testing/selftests/x86/lam.c index 4d4a76532dc9..309c93e15aba 100644 --- a/tools/testing/selftests/x86/lam.c +++ b/tools/testing/selftests/x86/lam.c @@ -596,8 +596,10 @@ int do_uring(unsigned long lam) fi->file_fd = file_fd; ring = malloc(sizeof(*ring)); - if (!ring) + if (!ring) { + free(fi); return 1; + } memset(ring, 0, sizeof(struct io_ring)); -- cgit v1.2.3 From 8e8f0306497dea58fb4e8e2558949daae5eeac5c Mon Sep 17 00:00:00 2001 From: Thorsten Blum Date: Tue, 25 Feb 2025 14:16:19 +0100 Subject: x86/mtrr: Remove unnecessary strlen() in mtrr_write() The local variable length already holds the string length after calling strncpy_from_user(). Using another local variable linelen and calling strlen() is therefore unnecessary and can be removed. Remove linelen and strlen() and use length instead. No change in functionality intended. Signed-off-by: Thorsten Blum Signed-off-by: Ingo Molnar Link: https://lore.kernel.org/r/20250225131621.329699-2-thorsten.blum@linux.dev --- arch/x86/kernel/cpu/mtrr/if.c | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/arch/x86/kernel/cpu/mtrr/if.c b/arch/x86/kernel/cpu/mtrr/if.c index a5c506f6da7f..4049235b1bfe 100644 --- a/arch/x86/kernel/cpu/mtrr/if.c +++ b/arch/x86/kernel/cpu/mtrr/if.c @@ -99,7 +99,6 @@ mtrr_write(struct file *file, const char __user *buf, size_t len, loff_t * ppos) char *ptr; char line[LINE_SIZE]; int length; - size_t linelen; memset(line, 0, LINE_SIZE); @@ -108,9 +107,8 @@ mtrr_write(struct file *file, const char __user *buf, size_t len, loff_t * ppos) if (length < 0) return length; - linelen = strlen(line); - ptr = line + linelen - 1; - if (linelen && *ptr == '\n') + ptr = line + length - 1; + if (length && *ptr == '\n') *ptr = '\0'; if (!strncmp(line, "disable=", 8)) { -- cgit v1.2.3 From 79165720f31868d9a9f7e5a50a09d5fe510d1822 Mon Sep 17 00:00:00 2001 From: Uros Bizjak Date: Tue, 25 Feb 2025 21:02:20 +0100 Subject: x86/percpu: Construct __percpu_seg_override from __percpu_seg Construct the __percpu_seg_override macro from __percpu_seg by concatenating the latter with the __seg_ prefix to reduce ifdeffery. No functional change intended. Signed-off-by: Uros Bizjak Signed-off-by: Ingo Molnar Cc: "H.
Peter Anvin" Cc: Linus Torvalds Link: https://lore.kernel.org/r/20250225200235.48007-1-ubizjak@gmail.com --- arch/x86/include/asm/percpu.h | 8 ++------ 1 file changed, 2 insertions(+), 6 deletions(-) diff --git a/arch/x86/include/asm/percpu.h b/arch/x86/include/asm/percpu.h index c2a9dfce36a5..7cb4f64b2e60 100644 --- a/arch/x86/include/asm/percpu.h +++ b/arch/x86/include/asm/percpu.h @@ -22,6 +22,7 @@ #else /* !__ASSEMBLY__: */ +#include #include #include #include @@ -35,12 +36,7 @@ # define __seg_fs __attribute__((address_space(__seg_fs))) #endif -#ifdef CONFIG_X86_64 -# define __percpu_seg_override __seg_gs -#else -# define __percpu_seg_override __seg_fs -#endif - +#define __percpu_seg_override CONCATENATE(__seg_, __percpu_seg) #define __percpu_prefix "" #else /* !CONFIG_CC_HAS_NAMED_AS: */ -- cgit v1.2.3 From c1fcf41cf37f7a3fd3bbf6f0c04aba3ea4258888 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Tue, 25 Feb 2025 19:37:32 +0000 Subject: x86/mm: Clear _PAGE_DIRTY for kernel mappings when we clear _PAGE_RW The bit pattern of _PAGE_DIRTY set and _PAGE_RW clear is used to mark shadow stacks. This is currently checked for in mk_pte() but not pfn_pte(). If we add the check to pfn_pte(), it catches vfree() calling set_direct_map_invalid_noflush() which calls __change_page_attr() which loads the old protection bits from the PTE, clears the specified bits and uses pfn_pte() to construct the new PTE. We should, therefore, for kernel mappings, clear the _PAGE_DIRTY bit consistently whenever we clear _PAGE_RW. I opted to do it in the callers in case we want to use __change_page_attr() to create shadow stacks inside the kernel at some point in the future. Arguably, we might also want to clear _PAGE_ACCESSED here. Note that the three functions involved, __set_pages_np(), kernel_map_pages_in_pgd() and kernel_unmap_pages_in_pgd(), only ever manipulate non-swappable kernel mappings, so the DIRTY:1|RW:0 special pattern for shadow stacks and the DIRTY:0 pattern for non-shadow-stack entries can be maintained consistently, and this doesn't result in the unintended clearing of a live dirty bit that could corrupt (destroy) dirty bit information for user mappings.
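To make the bit-pattern argument concrete, here is a minimal user-space sketch of the invariant; only the x86 bit positions (_PAGE_RW is bit 1, _PAGE_DIRTY is bit 6) are taken from the kernel, while the helper name and the program around them are made up for illustration:

	#include <stdbool.h>
	#include <stdint.h>
	#include <stdio.h>

	#define PAGE_RW    (1ULL << 1)	/* mirrors x86 _PAGE_BIT_RW */
	#define PAGE_DIRTY (1ULL << 6)	/* mirrors x86 _PAGE_BIT_DIRTY */

	/* Hardware treats Dirty=1, Write=0 as the shadow-stack encoding */
	static bool looks_like_shadow_stack(uint64_t pte)
	{
		return (pte & PAGE_DIRTY) && !(pte & PAGE_RW);
	}

	int main(void)
	{
		uint64_t pte = PAGE_RW | PAGE_DIRTY;	/* ordinary dirty, writable PTE */

		/* Clearing RW alone leaves DIRTY:1|RW:0, the shadow-stack pattern */
		printf("clear RW only:      %d\n",
		       looks_like_shadow_stack(pte & ~PAGE_RW));

		/* Clearing RW and DIRTY together, as this patch does, avoids it */
		printf("clear RW and DIRTY: %d\n",
		       looks_like_shadow_stack(pte & ~(PAGE_RW | PAGE_DIRTY)));

		return 0;
	}

Compiled and run, this prints 1 for the first case and 0 for the second, which is the distinction the three converted call sites below preserve.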
Reported-by: kernel test robot Signed-off-by: Matthew Wilcox (Oracle) Signed-off-by: Ingo Molnar Acked-by: Linus Torvalds Link: https://lore.kernel.org/r/174051422675.10177.13226545170101706336.tip-bot2@tip-bot2 Closes: https://lore.kernel.org/oe-lkp/202502241646.719f4651-lkp@intel.com --- arch/x86/mm/pat/set_memory.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/arch/x86/mm/pat/set_memory.c b/arch/x86/mm/pat/set_memory.c index 84d0bca3be28..d1740159029e 100644 --- a/arch/x86/mm/pat/set_memory.c +++ b/arch/x86/mm/pat/set_memory.c @@ -2628,7 +2628,7 @@ static int __set_pages_np(struct page *page, int numpages) .pgd = NULL, .numpages = numpages, .mask_set = __pgprot(0), - .mask_clr = __pgprot(_PAGE_PRESENT | _PAGE_RW), + .mask_clr = __pgprot(_PAGE_PRESENT | _PAGE_RW | _PAGE_DIRTY), .flags = CPA_NO_CHECK_ALIAS }; /* @@ -2715,7 +2715,7 @@ int __init kernel_map_pages_in_pgd(pgd_t *pgd, u64 pfn, unsigned long address, .pgd = pgd, .numpages = numpages, .mask_set = __pgprot(0), - .mask_clr = __pgprot(~page_flags & (_PAGE_NX|_PAGE_RW)), + .mask_clr = __pgprot(~page_flags & (_PAGE_NX|_PAGE_RW|_PAGE_DIRTY)), .flags = CPA_NO_CHECK_ALIAS, }; @@ -2758,7 +2758,7 @@ int __init kernel_unmap_pages_in_pgd(pgd_t *pgd, unsigned long address, .pgd = pgd, .numpages = numpages, .mask_set = __pgprot(0), - .mask_clr = __pgprot(_PAGE_PRESENT | _PAGE_RW), + .mask_clr = __pgprot(_PAGE_PRESENT | _PAGE_RW | _PAGE_DIRTY), .flags = CPA_NO_CHECK_ALIAS, }; -- cgit v1.2.3 From 6ac43f2be982ea54b75206dccd33f4cf81bfdc39 Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Wed, 26 Feb 2025 22:37:05 +0100 Subject: x86/Kconfig: Add cmpxchg8b support back to Geode CPUs An older cleanup of mine inadvertently removed geode-gx1 and geode-lx from the list of CPUs that are known to support a working cmpxchg8b. Fixes: 88a2b4edda3d ("x86/Kconfig: Rework CONFIG_X86_PAE dependency") Signed-off-by: Arnd Bergmann Signed-off-by: Ingo Molnar Cc: Linus Torvalds Cc: stable@vger.kernel.org Link: https://lore.kernel.org/r/20250226213714.4040853-2-arnd@kernel.org --- arch/x86/Kconfig.cpu | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/x86/Kconfig.cpu b/arch/x86/Kconfig.cpu index 2a7279d80460..42e6a40876ea 100644 --- a/arch/x86/Kconfig.cpu +++ b/arch/x86/Kconfig.cpu @@ -368,7 +368,7 @@ config X86_HAVE_PAE config X86_CMPXCHG64 def_bool y - depends on X86_HAVE_PAE || M586TSC || M586MMX || MK6 || MK7 + depends on X86_HAVE_PAE || M586TSC || M586MMX || MK6 || MK7 || MGEODEGX1 || MGEODE_LX # this should be set for all -march=.. options where the compiler # generates cmov. -- cgit v1.2.3 From 0abf508675c0dbbca6a387842f90db60756c4af5 Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Wed, 26 Feb 2025 22:37:06 +0100 Subject: x86/smp: Drop 32-bit "bigsmp" machine support The x86-32 kernel used to support multiple platforms with more than eight logical CPUs, from the 1999-2003 timeframe: Sequent NUMA-Q, IBM Summit, Unisys ES7000 and HP F8. Support for all except the latter was dropped back in 2014, leaving only the F8-based DL740 and DL760 G2 machines in this category, with up to eight single-core Socket-603 Xeon-MP processors with hyperthreading. Like the already removed machines, the HP F8 servers at the time cost upwards of $100k in typical configurations, but were quickly obsoleted by their 64-bit Socket-604 cousins and the AMD Opteron. Earlier servers with up to 8 Pentium Pro or Xeon processors remain fully supported as they had no hyperthreading.
Similarly, the more common 4-socket Xeon-MP machines with hyperthreading using Intel or ServerWorks chipsets continue to work without this, and all the multi-core Xeon processors also run 64-bit kernels. While the "bigsmp" support can also be used to run on later 64-bit machines (including VM guests), it seems best to discourage that and get any remaining users to update their kernels to 64-bit builds on these. As a side-effect of this, there is also no more need to support NUMA configurations on 32-bit x86, as all true 32-bit NUMA platforms are already gone. Signed-off-by: Arnd Bergmann Signed-off-by: Ingo Molnar Cc: Linus Torvalds Link: https://lore.kernel.org/r/20250226213714.4040853-3-arnd@kernel.org --- Documentation/admin-guide/kernel-parameters.txt | 4 - arch/x86/Kconfig | 20 +---- arch/x86/kernel/apic/Makefile | 3 - arch/x86/kernel/apic/apic.c | 3 - arch/x86/kernel/apic/bigsmp_32.c | 105 ------------------------ arch/x86/kernel/apic/local.h | 13 --- arch/x86/kernel/apic/probe_32.c | 29 ------- 7 files changed, 4 insertions(+), 173 deletions(-) delete mode 100644 arch/x86/kernel/apic/bigsmp_32.c diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt index fb8752b42ec8..8f923770a566 100644 --- a/Documentation/admin-guide/kernel-parameters.txt +++ b/Documentation/admin-guide/kernel-parameters.txt @@ -416,10 +416,6 @@ Format: { quiet (default) | verbose | debug } Change the amount of debugging information output when initialising the APIC and IO-APIC components. - For X86-32, this can also be used to specify an APIC - driver name. - Format: apic=driver_name - Examples: apic=bigsmp apic_extnmi= [APIC,X86,EARLY] External NMI delivery setting Format: { bsp (default) | all | none } diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index d581634c6a59..887b77bdeb06 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -531,12 +531,6 @@ config X86_FRED ring transitions and exception/interrupt handling if the system supports it. -config X86_BIGSMP - bool "Support for big SMP systems with more than 8 CPUs" - depends on SMP && X86_32 - help - This option is needed for the systems that have more than 8 CPUs. - config X86_EXTENDED_PLATFORM bool "Support for extended (non-PC) x86 platforms" default y @@ -735,8 +729,8 @@ config X86_32_NON_STANDARD depends on X86_32 && SMP depends on X86_EXTENDED_PLATFORM help - This option compiles in the bigsmp and STA2X11 default - subarchitectures. It is intended for a generic binary + This option compiles in the STA2X11 default + subarchitecture. It is intended for a generic binary kernel. If you select them all, kernel will probe it one by one and will fallback to default. @@ -1013,8 +1007,7 @@ config NR_CPUS_RANGE_BEGIN config NR_CPUS_RANGE_END int depends on X86_32 - default 64 if SMP && X86_BIGSMP - default 8 if SMP && !X86_BIGSMP + default 8 if SMP default 1 if !SMP config NR_CPUS_RANGE_END @@ -1027,7 +1020,6 @@ config NR_CPUS_RANGE_END config NR_CPUS_DEFAULT int depends on X86_32 - default 32 if X86_BIGSMP default 8 if SMP default 1 if !SMP @@ -1574,8 +1566,7 @@ config AMD_MEM_ENCRYPT config NUMA bool "NUMA Memory Allocation and Scheduler Support" depends on SMP - depends on X86_64 || (X86_32 && HIGHMEM64G && X86_BIGSMP) - default y if X86_BIGSMP + depends on X86_64 select USE_PERCPU_NUMA_NODE_ID select OF_NUMA if OF help @@ -1588,9 +1579,6 @@ config NUMA For 64-bit this is recommended if the system is Intel Core i7 (or later), AMD Opteron, or EM64T NUMA. 
- For 32-bit this is only needed if you boot a 32-bit - kernel on a 64-bit NUMA platform. - Otherwise, you should say N. config AMD_NUMA diff --git a/arch/x86/kernel/apic/Makefile b/arch/x86/kernel/apic/Makefile index 3bf0487cf3b7..52d1808ee360 100644 --- a/arch/x86/kernel/apic/Makefile +++ b/arch/x86/kernel/apic/Makefile @@ -23,8 +23,5 @@ obj-$(CONFIG_X86_X2APIC) += x2apic_cluster.o obj-y += apic_flat_64.o endif -# APIC probe will depend on the listing order here -obj-$(CONFIG_X86_BIGSMP) += bigsmp_32.o - # For 32bit, probe_32 need to be listed last obj-$(CONFIG_X86_LOCAL_APIC) += probe_$(BITS).o diff --git a/arch/x86/kernel/apic/apic.c b/arch/x86/kernel/apic/apic.c index e893dc6f11c1..ddca8da6d468 100644 --- a/arch/x86/kernel/apic/apic.c +++ b/arch/x86/kernel/apic/apic.c @@ -1371,8 +1371,6 @@ void __init apic_intr_mode_init(void) x86_64_probe_apic(); - x86_32_install_bigsmp(); - if (x86_platform.apic_post_init) x86_platform.apic_post_init(); @@ -1674,7 +1672,6 @@ static __init void apic_read_boot_cpu_id(bool x2apic) boot_cpu_apic_version = GET_APIC_VERSION(apic_read(APIC_LVR)); } topology_register_boot_apic(boot_cpu_physical_apicid); - x86_32_probe_bigsmp_early(); } #ifdef CONFIG_X86_X2APIC diff --git a/arch/x86/kernel/apic/bigsmp_32.c b/arch/x86/kernel/apic/bigsmp_32.c deleted file mode 100644 index 9285d500d5b4..000000000000 --- a/arch/x86/kernel/apic/bigsmp_32.c +++ /dev/null @@ -1,105 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 -/* - * APIC driver for "bigsmp" xAPIC machines with more than 8 virtual CPUs. - * - * Drives the local APIC in "clustered mode". - */ -#include -#include -#include - -#include -#include - -#include "local.h" - -static u32 bigsmp_get_apic_id(u32 x) -{ - return (x >> 24) & 0xFF; -} - -static void bigsmp_send_IPI_allbutself(int vector) -{ - default_send_IPI_mask_allbutself_phys(cpu_online_mask, vector); -} - -static void bigsmp_send_IPI_all(int vector) -{ - default_send_IPI_mask_sequence_phys(cpu_online_mask, vector); -} - -static int dmi_bigsmp; /* can be set by dmi scanners */ - -static int hp_ht_bigsmp(const struct dmi_system_id *d) -{ - printk(KERN_NOTICE "%s detected: force use of apic=bigsmp\n", d->ident); - dmi_bigsmp = 1; - - return 0; -} - - -static const struct dmi_system_id bigsmp_dmi_table[] = { - { hp_ht_bigsmp, "HP ProLiant DL760 G2", - { DMI_MATCH(DMI_BIOS_VENDOR, "HP"), - DMI_MATCH(DMI_BIOS_VERSION, "P44-"), - } - }, - - { hp_ht_bigsmp, "HP ProLiant DL740", - { DMI_MATCH(DMI_BIOS_VENDOR, "HP"), - DMI_MATCH(DMI_BIOS_VERSION, "P47-"), - } - }, - { } /* NULL entry stops DMI scanning */ -}; - -static int probe_bigsmp(void) -{ - return dmi_check_system(bigsmp_dmi_table); -} - -static struct apic apic_bigsmp __ro_after_init = { - - .name = "bigsmp", - .probe = probe_bigsmp, - - .dest_mode_logical = false, - - .disable_esr = 1, - - .cpu_present_to_apicid = default_cpu_present_to_apicid, - - .max_apic_id = 0xFE, - .get_apic_id = bigsmp_get_apic_id, - - .calc_dest_apicid = apic_default_calc_apicid, - - .send_IPI = default_send_IPI_single_phys, - .send_IPI_mask = default_send_IPI_mask_sequence_phys, - .send_IPI_mask_allbutself = NULL, - .send_IPI_allbutself = bigsmp_send_IPI_allbutself, - .send_IPI_all = bigsmp_send_IPI_all, - .send_IPI_self = default_send_IPI_self, - - .read = native_apic_mem_read, - .write = native_apic_mem_write, - .eoi = native_apic_mem_eoi, - .icr_read = native_apic_icr_read, - .icr_write = native_apic_icr_write, - .wait_icr_idle = apic_mem_wait_icr_idle, - .safe_wait_icr_idle = apic_mem_wait_icr_idle_timeout, -}; - -bool __init 
apic_bigsmp_possible(bool cmdline_override) -{ - return apic == &apic_bigsmp || !cmdline_override; -} - -void __init apic_bigsmp_force(void) -{ - if (apic != &apic_bigsmp) - apic_install_driver(&apic_bigsmp); -} - -apic_driver(apic_bigsmp); diff --git a/arch/x86/kernel/apic/local.h b/arch/x86/kernel/apic/local.h index 842fe28496be..bdcf609eb283 100644 --- a/arch/x86/kernel/apic/local.h +++ b/arch/x86/kernel/apic/local.h @@ -65,17 +65,4 @@ void default_send_IPI_self(int vector); void default_send_IPI_mask_sequence_logical(const struct cpumask *mask, int vector); void default_send_IPI_mask_allbutself_logical(const struct cpumask *mask, int vector); void default_send_IPI_mask_logical(const struct cpumask *mask, int vector); -void x86_32_probe_bigsmp_early(void); -void x86_32_install_bigsmp(void); -#else -static inline void x86_32_probe_bigsmp_early(void) { } -static inline void x86_32_install_bigsmp(void) { } -#endif - -#ifdef CONFIG_X86_BIGSMP -bool apic_bigsmp_possible(bool cmdline_selected); -void apic_bigsmp_force(void); -#else -static inline bool apic_bigsmp_possible(bool cmdline_selected) { return false; }; -static inline void apic_bigsmp_force(void) { } #endif diff --git a/arch/x86/kernel/apic/probe_32.c b/arch/x86/kernel/apic/probe_32.c index f75ee345c02d..87bc9e7ca5d6 100644 --- a/arch/x86/kernel/apic/probe_32.c +++ b/arch/x86/kernel/apic/probe_32.c @@ -93,35 +93,6 @@ static int __init parse_apic(char *arg) } early_param("apic", parse_apic); -void __init x86_32_probe_bigsmp_early(void) -{ - if (nr_cpu_ids <= 8 || xen_pv_domain()) - return; - - if (IS_ENABLED(CONFIG_X86_BIGSMP)) { - switch (boot_cpu_data.x86_vendor) { - case X86_VENDOR_INTEL: - if (!APIC_XAPIC(boot_cpu_apic_version)) - break; - /* P4 and above */ - fallthrough; - case X86_VENDOR_HYGON: - case X86_VENDOR_AMD: - if (apic_bigsmp_possible(cmdline_apic)) - return; - break; - } - } - pr_info("Limiting to 8 possible CPUs\n"); - set_nr_cpu_ids(8); -} - -void __init x86_32_install_bigsmp(void) -{ - if (nr_cpu_ids > 8 && !xen_pv_domain()) - apic_bigsmp_force(); -} - void __init x86_32_probe_apic(void) { if (!cmdline_apic) { -- cgit v1.2.3 From fc2d5cbe541032e74a66599ba843803cebbfed0e Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Wed, 26 Feb 2025 22:37:07 +0100 Subject: x86/build: Rework CONFIG_GENERIC_CPU compiler flags An x86-64 kernel built with CONFIG_GENERIC_CPU is documented to run on all CPUs, but the Makefile does not actually pass an -march= argument, instead relying on the default that was used to configure the toolchain. In many cases, gcc will be configured to -march=x86-64 or -march=k8 for maximum compatibility, but in other cases a distribution default may be either raised to a more recent ISA, or set to -march=native to build for the CPU used for compilation. This still works in the case of building a custom kernel for the local machine. The point where it breaks down is building a kernel for another machine that is older than the default target. Changing the default to -march=x86-64 would make it work reliably, but possibly produce worse code on distros that intentionally default to a newer ISA. To allow reliably building a kernel for even the oldest x86-64 CPUs, pass the -march=x86-64 flag to the compiler. This was not possible in early versions of x86-64 gcc, but works on all currently supported versions down to at least gcc-5.
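The effect of the implicit default can be demonstrated outside the kernel build; the probe below is illustrative only (the kernel itself compiles with -mno-sse and related flags) and relies just on GCC's standard __SSE2__ and __SSE4_2__ predefined macros:

	#include <stdio.h>

	/*
	 * Compile once with an explicit baseline ("gcc -march=x86-64 probe.c")
	 * and once with no -march= flag at all. If the toolchain default was
	 * raised to a newer ISA or to -march=native, the second build may
	 * enable instructions that the oldest x86-64 CPUs cannot execute.
	 */
	int main(void)
	{
	#ifdef __SSE2__
		puts("SSE2 enabled (part of the x86-64 baseline)");
	#endif
	#ifdef __SSE4_2__
		puts("SSE4.2 enabled: beyond the baseline, may trap on old CPUs");
	#else
		puts("SSE4.2 not enabled: safe for the oldest x86-64 CPUs");
	#endif
		return 0;
	}

Passing -march=x86-64 explicitly, as the patch does, pins the first, portable outcome regardless of how the toolchain was configured.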
Signed-off-by: Arnd Bergmann Signed-off-by: Ingo Molnar Cc: Linus Torvalds Link: https://lore.kernel.org/r/20250226213714.4040853-4-arnd@kernel.org --- arch/x86/Makefile | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/arch/x86/Makefile b/arch/x86/Makefile index 5b773b34768d..5af3172fd51c 100644 --- a/arch/x86/Makefile +++ b/arch/x86/Makefile @@ -183,14 +183,14 @@ else cflags-$(CONFIG_MPSC) += -march=nocona cflags-$(CONFIG_MCORE2) += -march=core2 cflags-$(CONFIG_MATOM) += -march=atom - cflags-$(CONFIG_GENERIC_CPU) += -mtune=generic + cflags-$(CONFIG_GENERIC_CPU) += -march=x86-64 -mtune=generic KBUILD_CFLAGS += $(cflags-y) rustflags-$(CONFIG_MK8) += -Ctarget-cpu=k8 rustflags-$(CONFIG_MPSC) += -Ctarget-cpu=nocona rustflags-$(CONFIG_MCORE2) += -Ctarget-cpu=core2 rustflags-$(CONFIG_MATOM) += -Ctarget-cpu=atom - rustflags-$(CONFIG_GENERIC_CPU) += -Ztune-cpu=generic + rustflags-$(CONFIG_GENERIC_CPU) += -Ctarget-cpu=x86-64 -Ztune-cpu=generic KBUILD_RUSTFLAGS += $(rustflags-y) KBUILD_CFLAGS += -mno-red-zone -- cgit v1.2.3 From f388f60ca9041a95c9b3f157d316ed7c8f297e44 Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Wed, 26 Feb 2025 22:37:08 +0100 Subject: x86/cpu: Drop configuration options for early 64-bit CPUs The x86 CPU selection menu is confusing for a number of reasons: When configuring 32-bit kernels, it shows a small number of early 64-bit microarchitectures (K8, Core 2) but not the regular generic 64-bit target that is the normal default. There is no longer a reason to run 32-bit kernels on production 64-bit systems, so only actual 32-bit CPUs need to be shown here. When configuring 64-bit kernels, the options are also pointless as there is no way to pick any CPU from the past 15 years, leaving GENERIC_CPU as the only sensible choice. Address both of the above by removing the obsolete options and making all 64-bit kernels run on both Intel and AMD CPUs from any generation. Testing generic 32-bit kernels on 64-bit hardware remains possible, just not building a 32-bit kernel that requires a 64-bit CPU. Signed-off-by: Arnd Bergmann Signed-off-by: Ingo Molnar Cc: Linus Torvalds Link: https://lore.kernel.org/r/20250226213714.4040853-5-arnd@kernel.org --- arch/x86/Kconfig.cpu | 95 ++++++----------------------------- arch/x86/Makefile | 16 +------ arch/x86/Makefile_32.cpu | 5 +-- arch/x86/include/asm/vermagic.h | 4 -- drivers/misc/mei/Kconfig | 2 +- 5 files changed, 18 insertions(+), 104 deletions(-) diff --git a/arch/x86/Kconfig.cpu b/arch/x86/Kconfig.cpu index 42e6a40876ea..8fcb8ccee44b 100644 --- a/arch/x86/Kconfig.cpu +++ b/arch/x86/Kconfig.cpu @@ -1,9 +1,9 @@ # SPDX-License-Identifier: GPL-2.0 # Put here option for CPU selection and depending optimization choice - prompt "Processor family" - default M686 if X86_32 - default GENERIC_CPU if X86_64 + prompt "x86-32 Processor family" + depends on X86_32 + default M686 help This is the processor type of your CPU. This information is used for optimizing purposes. In order to compile a kernel @@ -31,7 +31,6 @@ choice - "Pentium-4" for the Intel Pentium 4 or P4-based Celeron. - "K6" for the AMD K6, K6-II and K6-III (aka K6-3D). - "Athlon" for the AMD K7 family (Athlon/Duron/Thunderbird). - - "Opteron/Athlon64/Hammer/K8" for all K8 and newer AMD CPUs. - "Crusoe" for the Transmeta Crusoe series. - "Efficeon" for the Transmeta Efficeon series. - "Winchip-C6" for original IDT Winchip. @@ -42,13 +41,10 @@ choice - "CyrixIII/VIA C3" for VIA Cyrix III or VIA C3. - "VIA C3-2" for VIA C3-2 "Nehemiah" (model 9 and above).
- "VIA C7" for VIA C7. - - "Intel P4" for the Pentium 4/Netburst microarchitecture. - - "Core 2/newer Xeon" for all core2 and newer Intel CPUs. - "Intel Atom" for the Atom-microarchitecture CPUs. - - "Generic-x86-64" for a kernel which runs on any x86-64 CPU. See each option's help text for additional details. If you don't know - what to do, choose "486". + what to do, choose "Pentium-Pro". config M486SX bool "486SX" @@ -114,11 +110,11 @@ config MPENTIUMIII extensions. config MPENTIUMM - bool "Pentium M" + bool "Pentium M/Pentium Dual Core/Core Solo/Core Duo" depends on X86_32 help Select this for Intel Pentium M (not Pentium-4 M) - notebook chips. + "Merom" Core Solo/Duo notebook chips config MPENTIUM4 bool "Pentium-4/Celeron(P4-based)/Pentium-4 M/older Xeon" @@ -139,22 +135,10 @@ config MPENTIUM4 -Mobile Pentium 4 -Mobile Pentium 4 M -Extreme Edition (Gallatin) - -Prescott - -Prescott 2M - -Cedar Mill - -Presler - -Smithfiled Xeons (Intel Xeon, Xeon MP, Xeon LV, Xeon MV) corename: -Foster -Prestonia -Gallatin - -Nocona - -Irwindale - -Cranford - -Potomac - -Paxville - -Dempsey - config MK6 bool "K6/K6-II/K6-III" @@ -172,13 +156,6 @@ config MK7 some extended instructions, and passes appropriate optimization flags to GCC. -config MK8 - bool "Opteron/Athlon64/Hammer/K8" - help - Select this for an AMD Opteron or Athlon64 Hammer-family processor. - Enables use of some extended instructions, and passes appropriate - optimization flags to GCC. - config MCRUSOE bool "Crusoe" depends on X86_32 @@ -258,42 +235,14 @@ config MVIAC7 Select this for a VIA C7. Selecting this uses the correct cache shift and tells gcc to treat the CPU as a 686. -config MPSC - bool "Intel P4 / older Netburst based Xeon" - depends on X86_64 - help - Optimize for Intel Pentium 4, Pentium D and older Nocona/Dempsey - Xeon CPUs with Intel 64bit which is compatible with x86-64. - Note that the latest Xeons (Xeon 51xx and 53xx) are not based on the - Netburst core and shouldn't use this option. You can distinguish them - using the cpu family field - in /proc/cpuinfo. Family 15 is an older Xeon, Family 6 a newer one. - -config MCORE2 - bool "Core 2/newer Xeon" - help - - Select this for Intel Core 2 and newer Core 2 Xeons (Xeon 51xx and - 53xx) CPUs. You can distinguish newer from older Xeons by the CPU - family in /proc/cpuinfo. Newer ones have 6 and older ones 15 - (not a typo) - config MATOM bool "Intel Atom" help - Select this for the Intel Atom platform. Intel Atom CPUs have an in-order pipelining architecture and thus can benefit from accordingly optimized code. Use a recent GCC with specific Atom support in order to fully benefit from selecting this option. -config GENERIC_CPU - bool "Generic-x86-64" - depends on X86_64 - help - Generic x86-64 CPU. - Run equally well on all x86-64 CPUs. 
- endchoice config X86_GENERIC @@ -317,8 +266,8 @@ config X86_INTERNODE_CACHE_SHIFT config X86_L1_CACHE_SHIFT int - default "7" if MPENTIUM4 || MPSC - default "6" if MK7 || MK8 || MPENTIUMM || MCORE2 || MATOM || MVIAC7 || X86_GENERIC || GENERIC_CPU + default "7" if MPENTIUM4 + default "6" if MK7 || MPENTIUMM || MATOM || MVIAC7 || X86_GENERIC || X86_64 default "4" if MELAN || M486SX || M486 || MGEODEGX1 default "5" if MWINCHIP3D || MWINCHIPC6 || MCRUSOE || MEFFICEON || MCYRIXIII || MK6 || MPENTIUMIII || MPENTIUMII || M686 || M586MMX || M586TSC || M586 || MVIAC3_2 || MGEODE_LX @@ -336,35 +285,19 @@ config X86_ALIGNMENT_16 config X86_INTEL_USERCOPY def_bool y - depends on MPENTIUM4 || MPENTIUMM || MPENTIUMIII || MPENTIUMII || M586MMX || X86_GENERIC || MK8 || MK7 || MEFFICEON || MCORE2 + depends on MPENTIUM4 || MPENTIUMM || MPENTIUMIII || MPENTIUMII || M586MMX || X86_GENERIC || MK7 || MEFFICEON config X86_USE_PPRO_CHECKSUM def_bool y - depends on MWINCHIP3D || MWINCHIPC6 || MCYRIXIII || MK7 || MK6 || MPENTIUM4 || MPENTIUMM || MPENTIUMIII || MPENTIUMII || M686 || MK8 || MVIAC3_2 || MVIAC7 || MEFFICEON || MGEODE_LX || MCORE2 || MATOM - -# -# P6_NOPs are a relatively minor optimization that require a family >= -# 6 processor, except that it is broken on certain VIA chips. -# Furthermore, AMD chips prefer a totally different sequence of NOPs -# (which work on all CPUs). In addition, it looks like Virtual PC -# does not understand them. -# -# As a result, disallow these if we're not compiling for X86_64 (these -# NOPs do work on all x86-64 capable chips); the list of processors in -# the right-hand clause are the cores that benefit from this optimization. -# -config X86_P6_NOP - def_bool y - depends on X86_64 - depends on (MCORE2 || MPENTIUM4 || MPSC) + depends on MWINCHIP3D || MWINCHIPC6 || MCYRIXIII || MK7 || MK6 || MPENTIUM4 || MPENTIUMM || MPENTIUMIII || MPENTIUMII || M686 || MVIAC3_2 || MVIAC7 || MEFFICEON || MGEODE_LX || MATOM config X86_TSC def_bool y - depends on (MWINCHIP3D || MCRUSOE || MEFFICEON || MCYRIXIII || MK7 || MK6 || MPENTIUM4 || MPENTIUMM || MPENTIUMIII || MPENTIUMII || M686 || M586MMX || M586TSC || MK8 || MVIAC3_2 || MVIAC7 || MGEODEGX1 || MGEODE_LX || MCORE2 || MATOM) || X86_64 + depends on (MWINCHIP3D || MCRUSOE || MEFFICEON || MCYRIXIII || MK7 || MK6 || MPENTIUM4 || MPENTIUMM || MPENTIUMIII || MPENTIUMII || M686 || M586MMX || M586TSC || MVIAC3_2 || MVIAC7 || MGEODEGX1 || MGEODE_LX || MATOM) || X86_64 config X86_HAVE_PAE def_bool y - depends on MCRUSOE || MEFFICEON || MCYRIXIII || MPENTIUM4 || MPENTIUMM || MPENTIUMIII || MPENTIUMII || M686 || MK8 || MVIAC7 || MCORE2 || MATOM || X86_64 + depends on MCRUSOE || MEFFICEON || MCYRIXIII || MPENTIUM4 || MPENTIUMM || MPENTIUMIII || MPENTIUMII || M686 || MVIAC7 || MATOM || X86_64 config X86_CMPXCHG64 def_bool y @@ -374,12 +307,12 @@ config X86_CMPXCHG64 # generates cmov. 
config X86_CMOV def_bool y - depends on (MK8 || MK7 || MCORE2 || MPENTIUM4 || MPENTIUMM || MPENTIUMIII || MPENTIUMII || M686 || MVIAC3_2 || MVIAC7 || MCRUSOE || MEFFICEON || X86_64 || MATOM || MGEODE_LX) + depends on (MK7 || MPENTIUM4 || MPENTIUMM || MPENTIUMIII || MPENTIUMII || M686 || MVIAC3_2 || MVIAC7 || MCRUSOE || MEFFICEON || MATOM || MGEODE_LX || X86_64) config X86_MINIMUM_CPU_FAMILY int default "64" if X86_64 - default "6" if X86_32 && (MPENTIUM4 || MPENTIUMM || MPENTIUMIII || MPENTIUMII || M686 || MVIAC3_2 || MVIAC7 || MEFFICEON || MATOM || MCORE2 || MK7 || MK8) + default "6" if X86_32 && (MPENTIUM4 || MPENTIUMM || MPENTIUMIII || MPENTIUMII || M686 || MVIAC3_2 || MVIAC7 || MEFFICEON || MATOM || MK7) default "5" if X86_32 && X86_CMPXCHG64 default "4" diff --git a/arch/x86/Makefile b/arch/x86/Makefile index 5af3172fd51c..8120085b00a4 100644 --- a/arch/x86/Makefile +++ b/arch/x86/Makefile @@ -178,20 +178,8 @@ else # Use -mskip-rax-setup if supported. KBUILD_CFLAGS += $(call cc-option,-mskip-rax-setup) - # FIXME - should be integrated in Makefile.cpu (Makefile_32.cpu) - cflags-$(CONFIG_MK8) += -march=k8 - cflags-$(CONFIG_MPSC) += -march=nocona - cflags-$(CONFIG_MCORE2) += -march=core2 - cflags-$(CONFIG_MATOM) += -march=atom - cflags-$(CONFIG_GENERIC_CPU) += -march=x86-64 -mtune=generic - KBUILD_CFLAGS += $(cflags-y) - - rustflags-$(CONFIG_MK8) += -Ctarget-cpu=k8 - rustflags-$(CONFIG_MPSC) += -Ctarget-cpu=nocona - rustflags-$(CONFIG_MCORE2) += -Ctarget-cpu=core2 - rustflags-$(CONFIG_MATOM) += -Ctarget-cpu=atom - rustflags-$(CONFIG_GENERIC_CPU) += -Ctarget-cpu=x86-64 -Ztune-cpu=generic - KBUILD_RUSTFLAGS += $(rustflags-y) + KBUILD_CFLAGS += -march=x86-64 -mtune=generic + KBUILD_RUSTFLAGS += -Ctarget-cpu=x86-64 -Ztune-cpu=generic KBUILD_CFLAGS += -mno-red-zone KBUILD_CFLAGS += -mcmodel=kernel diff --git a/arch/x86/Makefile_32.cpu b/arch/x86/Makefile_32.cpu index 94834c4b5e5e..af7de9a42752 100644 --- a/arch/x86/Makefile_32.cpu +++ b/arch/x86/Makefile_32.cpu @@ -24,7 +24,6 @@ cflags-$(CONFIG_MK6) += -march=k6 # Please note, that patches that add -march=athlon-xp and friends are pointless. # They make zero difference whatsosever to performance at this time. 
cflags-$(CONFIG_MK7) += -march=athlon -cflags-$(CONFIG_MK8) += $(call cc-option,-march=k8,-march=athlon) cflags-$(CONFIG_MCRUSOE) += -march=i686 $(align) cflags-$(CONFIG_MEFFICEON) += -march=i686 $(call tune,pentium3) $(align) cflags-$(CONFIG_MWINCHIPC6) += $(call cc-option,-march=winchip-c6,-march=i586) @@ -32,9 +31,7 @@ cflags-$(CONFIG_MWINCHIP3D) += $(call cc-option,-march=winchip2,-march=i586) cflags-$(CONFIG_MCYRIXIII) += $(call cc-option,-march=c3,-march=i486) $(align) cflags-$(CONFIG_MVIAC3_2) += $(call cc-option,-march=c3-2,-march=i686) cflags-$(CONFIG_MVIAC7) += -march=i686 -cflags-$(CONFIG_MCORE2) += -march=i686 $(call tune,core2) -cflags-$(CONFIG_MATOM) += $(call cc-option,-march=atom,$(call cc-option,-march=core2,-march=i686)) \ - $(call cc-option,-mtune=atom,$(call cc-option,-mtune=generic)) +cflags-$(CONFIG_MATOM) += -march=atom # AMD Elan support cflags-$(CONFIG_MELAN) += -march=i486 diff --git a/arch/x86/include/asm/vermagic.h b/arch/x86/include/asm/vermagic.h index 75884d2cdec3..5d471253c755 100644 --- a/arch/x86/include/asm/vermagic.h +++ b/arch/x86/include/asm/vermagic.h @@ -15,8 +15,6 @@ #define MODULE_PROC_FAMILY "586TSC " #elif defined CONFIG_M586MMX #define MODULE_PROC_FAMILY "586MMX " -#elif defined CONFIG_MCORE2 -#define MODULE_PROC_FAMILY "CORE2 " #elif defined CONFIG_MATOM #define MODULE_PROC_FAMILY "ATOM " #elif defined CONFIG_M686 @@ -33,8 +31,6 @@ #define MODULE_PROC_FAMILY "K6 " #elif defined CONFIG_MK7 #define MODULE_PROC_FAMILY "K7 " -#elif defined CONFIG_MK8 -#define MODULE_PROC_FAMILY "K8 " #elif defined CONFIG_MELAN #define MODULE_PROC_FAMILY "ELAN " #elif defined CONFIG_MCRUSOE diff --git a/drivers/misc/mei/Kconfig b/drivers/misc/mei/Kconfig index 67d9391f1855..7575fee96cc6 100644 --- a/drivers/misc/mei/Kconfig +++ b/drivers/misc/mei/Kconfig @@ -3,7 +3,7 @@ config INTEL_MEI tristate "Intel Management Engine Interface" depends on X86 && PCI - default GENERIC_CPU || MCORE2 || MATOM || X86_GENERIC + default X86_64 || MATOM help The Intel Management Engine (Intel ME) provides Manageability, Security and Media services for system containing Intel chipsets. -- cgit v1.2.3 From bbeb69ce301323e84f1677484eb8e4cd8fb1f9f8 Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Wed, 26 Feb 2025 22:37:09 +0100 Subject: x86/mm: Remove CONFIG_HIGHMEM64G support HIGHMEM64G support was added in linux-2.3.25 to support (then) high-end Pentium Pro and Pentium III Xeon servers with more than 4GB of addressing, at a time when NUMA and PCI-X slots started appearing. I have found no evidence of this ever being used in regular dual-socket servers or consumer devices; all the users seem obsolete these days, even by i386 standards: - Support for NUMA servers (NUMA-Q, IBM x440, Unisys) was already removed ten years ago. - 4+ socket non-NUMA servers based on Intel 450GX/450NX, HP F8 and ServerWorks ServerSet/GrandChampion could theoretically still work with 8GB, but these were exceptionally rare even 20 years ago and would usually have been equipped with less than the maximum amount of RAM. - Some SKUs of the Celeron D from 2004 had 64-bit mode fused off but could still work in a Socket 775 mainboard designed for the later Core 2 Duo and up to 8GB of RAM. Apparently most BIOSes at the time only allowed 64-bit CPUs. - The rare Xeon LV "Sossaman" came on a few motherboards with registered DDR2 memory support of up to 16GB. - In the early days of x86-64 hardware, there was sometimes the need to run a 32-bit kernel to work around bugs in the hardware drivers, or in the syscall emulation for 32-bit userspace.
This likely still works but there should never be a need for this any more. PAE mode is still required to get access to the 'NX' bit on Atom 'Pentium M' and 'Core Duo' CPUs. Signed-off-by: Arnd Bergmann Signed-off-by: Ingo Molnar Cc: Linus Torvalds Link: https://lore.kernel.org/r/20250226213714.4040853-6-arnd@kernel.org --- Documentation/admin-guide/kdump/kdump.rst | 4 --- Documentation/arch/x86/usb-legacy-support.rst | 11 +------ arch/x86/Kconfig | 46 ++++----------------------- arch/x86/configs/xen.config | 2 -- arch/x86/include/asm/page_32_types.h | 4 +-- arch/x86/mm/init_32.c | 9 ++---- 6 files changed, 11 insertions(+), 65 deletions(-) diff --git a/Documentation/admin-guide/kdump/kdump.rst b/Documentation/admin-guide/kdump/kdump.rst index 5376890adbeb..1f7f14c6e184 100644 --- a/Documentation/admin-guide/kdump/kdump.rst +++ b/Documentation/admin-guide/kdump/kdump.rst @@ -180,10 +180,6 @@ Dump-capture kernel config options (Arch Dependent, i386 and x86_64) 1) On i386, enable high memory support under "Processor type and features":: - CONFIG_HIGHMEM64G=y - - or:: - CONFIG_HIGHMEM4G 2) With CONFIG_SMP=y, usually nr_cpus=1 need specified on the kernel diff --git a/Documentation/arch/x86/usb-legacy-support.rst b/Documentation/arch/x86/usb-legacy-support.rst index e01c08b7c981..b17bf122270a 100644 --- a/Documentation/arch/x86/usb-legacy-support.rst +++ b/Documentation/arch/x86/usb-legacy-support.rst @@ -20,11 +20,7 @@ It has several drawbacks, though: features (wheel, extra buttons, touchpad mode) of the real PS/2 mouse may not be available. -2) If CONFIG_HIGHMEM64G is enabled, the PS/2 mouse emulation can cause - system crashes, because the SMM BIOS is not expecting to be in PAE mode. - The Intel E7505 is a typical machine where this happens. - -3) If AMD64 64-bit mode is enabled, again system crashes often happen, +2) If AMD64 64-bit mode is enabled, again system crashes often happen, because the SMM BIOS isn't expecting the CPU to be in 64-bit mode. The BIOS manufacturers only test with Windows, and Windows doesn't do 64-bit yet. @@ -38,11 +34,6 @@ Problem 1) compiled-in, too. Problem 2) - can currently only be solved by either disabling HIGHMEM64G - in the kernel config or USB Legacy support in the BIOS. A BIOS update - could help, but so far no such update exists. - -Problem 3) is usually fixed by a BIOS update. Check the board manufacturers web site. If an update is not available, disable USB Legacy support in the BIOS. If this alone doesn't help, try also adding diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index 887b77bdeb06..737a0c630527 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -1388,15 +1388,11 @@ config X86_CPUID with major 203 and minors 0 to 31 for /dev/cpu/0/cpuid to /dev/cpu/31/cpuid. -choice - prompt "High Memory Support" - default HIGHMEM4G +config HIGHMEM4G + bool "High Memory Support" depends on X86_32 - -config NOHIGHMEM - bool "off" help - Linux can use up to 64 Gigabytes of physical memory on x86 systems. + Linux can use up to 4 Gigabytes of physical memory on x86 systems. However, the address space of 32-bit x86 processors is only 4 Gigabytes large. That means that, if you have a large amount of physical memory, not all of it can be "permanently mapped" by the @@ -1412,38 +1408,9 @@ config NOHIGHMEM possible. If the machine has between 1 and 4 Gigabytes physical RAM, then - answer "4GB" here. + answer "Y" here. - If more than 4 Gigabytes is used then answer "64GB" here. This - selection turns Intel PAE (Physical Address Extension) mode on. 
- PAE implements 3-level paging on IA32 processors. PAE is fully - supported by Linux, PAE mode is implemented on all recent Intel - processors (Pentium Pro and better). NOTE: If you say "64GB" here, - then the kernel will not boot on CPUs that don't support PAE! - - The actual amount of total physical memory will either be - auto detected or can be forced by using a kernel command line option - such as "mem=256M". (Try "man bootparam" or see the documentation of - your boot loader (lilo or loadlin) about how to pass options to the - kernel at boot time.) - - If unsure, say "off". - -config HIGHMEM4G - bool "4GB" - help - Select this if you have a 32-bit processor and between 1 and 4 - gigabytes of physical RAM. - -config HIGHMEM64G - bool "64GB" - depends on X86_HAVE_PAE - select X86_PAE - help - Select this if you have a 32-bit processor and more than 4 - gigabytes of physical RAM. - -endchoice + If unsure, say N. choice prompt "Memory split" if EXPERT @@ -1489,8 +1456,7 @@ config PAGE_OFFSET depends on X86_32 config HIGHMEM - def_bool y - depends on X86_32 && (HIGHMEM64G || HIGHMEM4G) + def_bool HIGHMEM4G config X86_PAE bool "PAE (Physical Address Extension) Support" diff --git a/arch/x86/configs/xen.config b/arch/x86/configs/xen.config index 581296255b39..d5d091e03bd3 100644 --- a/arch/x86/configs/xen.config +++ b/arch/x86/configs/xen.config @@ -1,6 +1,4 @@ # global x86 required specific stuff -# On 32-bit HIGHMEM4G is not allowed -CONFIG_HIGHMEM64G=y CONFIG_64BIT=y # These enable us to allow some of the diff --git a/arch/x86/include/asm/page_32_types.h b/arch/x86/include/asm/page_32_types.h index faf9cc1c14bb..25c32652f404 100644 --- a/arch/x86/include/asm/page_32_types.h +++ b/arch/x86/include/asm/page_32_types.h @@ -11,8 +11,8 @@ * a virtual address space of one gigabyte, which limits the * amount of physical memory you can use to about 950MB. * - * If you want more physical memory than this then see the CONFIG_HIGHMEM4G - * and CONFIG_HIGHMEM64G options in the kernel configuration. + * If you want more physical memory than this then see the CONFIG_VMSPLIT_2G + * and CONFIG_HIGHMEM4G options in the kernel configuration. */ #define __PAGE_OFFSET_BASE _AC(CONFIG_PAGE_OFFSET, UL) #define __PAGE_OFFSET __PAGE_OFFSET_BASE diff --git a/arch/x86/mm/init_32.c b/arch/x86/mm/init_32.c index ac41b1e0940d..f288aad8dc74 100644 --- a/arch/x86/mm/init_32.c +++ b/arch/x86/mm/init_32.c @@ -582,7 +582,7 @@ static void __init lowmem_pfn_init(void) "only %luMB highmem pages available, ignoring highmem size of %luMB!\n" #define MSG_HIGHMEM_TRIMMED \ - "Warning: only 4GB will be used. Use a HIGHMEM64G enabled kernel!\n" + "Warning: only 4GB will be used. 
Support for for CONFIG_HIGHMEM64G was removed!\n" /* * We have more RAM than fits into lowmem - we try to put it into * highmem, also taking the highmem=x boot parameter into account: @@ -606,18 +606,13 @@ static void __init highmem_pfn_init(void) #ifndef CONFIG_HIGHMEM /* Maximum memory usable is what is directly addressable */ printk(KERN_WARNING "Warning only %ldMB will be used.\n", MAXMEM>>20); - if (max_pfn > MAX_NONPAE_PFN) - printk(KERN_WARNING "Use a HIGHMEM64G enabled kernel.\n"); - else - printk(KERN_WARNING "Use a HIGHMEM enabled kernel.\n"); + printk(KERN_WARNING "Use a HIGHMEM enabled kernel.\n"); max_pfn = MAXMEM_PFN; #else /* !CONFIG_HIGHMEM */ -#ifndef CONFIG_HIGHMEM64G if (max_pfn > MAX_NONPAE_PFN) { max_pfn = MAX_NONPAE_PFN; printk(KERN_WARNING MSG_HIGHMEM_TRIMMED); } -#endif /* !CONFIG_HIGHMEM64G */ #endif /* !CONFIG_HIGHMEM */ } -- cgit v1.2.3 From a8331594036f22dcf037f1a75358bd0985c84cd9 Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Wed, 26 Feb 2025 22:37:10 +0100 Subject: x86/mm: Drop CONFIG_SWIOTLB for PAE Since kernels with and without CONFIG_X86_PAE are now limited to the low 4GB of physical address space, there is no need to use swiotlb any more, so stop selecting this. Signed-off-by: Arnd Bergmann Signed-off-by: Ingo Molnar Cc: Linus Torvalds Link: https://lore.kernel.org/r/20250226213714.4040853-7-arnd@kernel.org --- arch/x86/Kconfig | 1 - 1 file changed, 1 deletion(-) diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index 737a0c630527..0e0ec2c8ef75 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -1462,7 +1462,6 @@ config X86_PAE bool "PAE (Physical Address Extension) Support" depends on X86_32 && X86_HAVE_PAE select PHYS_ADDR_T_64BIT - select SWIOTLB help PAE is required for NX support, and furthermore enables larger swapspace support for non-overcommit purposes. It -- cgit v1.2.3 From 0081fdeccbf610499b79784998b1fd36783209dd Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Wed, 26 Feb 2025 22:37:11 +0100 Subject: x86/mm: Drop support for CONFIG_HIGHPTE With the maximum amount of RAM now 4GB, there is very little point to still have PTE pages in highmem. Drop this for simplification. The only other architecture supporting HIGHPTE is 32-bit arm, and once that feature is removed as well, the highpte logic can be dropped from common code as well. Signed-off-by: Arnd Bergmann Signed-off-by: Ingo Molnar Cc: Linus Torvalds Link: https://lore.kernel.org/r/20250226213714.4040853-8-arnd@kernel.org --- Documentation/admin-guide/kernel-parameters.txt | 7 ------- arch/x86/Kconfig | 9 --------- arch/x86/include/asm/pgalloc.h | 5 ----- arch/x86/mm/pgtable.c | 27 +------------------------ 4 files changed, 1 insertion(+), 47 deletions(-) diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt index 8f923770a566..93177630cefb 100644 --- a/Documentation/admin-guide/kernel-parameters.txt +++ b/Documentation/admin-guide/kernel-parameters.txt @@ -7668,13 +7668,6 @@ 16 - SIGBUS faults Example: user_debug=31 - userpte= - [X86,EARLY] Flags controlling user PTE allocations. - - nohigh = do not allocate PTE pages in - HIGHMEM regardless of setting - of CONFIG_HIGHPTE. - vdso= [X86,SH,SPARC] On X86_32, this is an alias for vdso32=. Otherwise: diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index 0e0ec2c8ef75..73eeaf295b74 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -1628,15 +1628,6 @@ config X86_PMEM_LEGACY Say Y if unsure. 
-config HIGHPTE - bool "Allocate 3rd-level pagetables from highmem" - depends on HIGHMEM - help - The VM uses one page table entry for each page of physical memory. - For systems with a lot of RAM, this can be wasteful of precious - low memory. Setting this option will put user-space page table - entries in high memory. - config X86_CHECK_BIOS_CORRUPTION bool "Check for low memory corruption" help diff --git a/arch/x86/include/asm/pgalloc.h b/arch/x86/include/asm/pgalloc.h index dd4841231bb9..a33147520044 100644 --- a/arch/x86/include/asm/pgalloc.h +++ b/arch/x86/include/asm/pgalloc.h @@ -29,11 +29,6 @@ static inline void paravirt_release_pud(unsigned long pfn) {} static inline void paravirt_release_p4d(unsigned long pfn) {} #endif -/* - * Flags to use when allocating a user page table page. - */ -extern gfp_t __userpte_alloc_gfp; - #ifdef CONFIG_MITIGATION_PAGE_TABLE_ISOLATION /* * Instead of one PGD, we acquire two PGDs. Being order-1, it is diff --git a/arch/x86/mm/pgtable.c b/arch/x86/mm/pgtable.c index b1c1f72c1fd1..cec321fb74f2 100644 --- a/arch/x86/mm/pgtable.c +++ b/arch/x86/mm/pgtable.c @@ -12,35 +12,10 @@ phys_addr_t physical_mask __ro_after_init = (1ULL << __PHYSICAL_MASK_SHIFT) - 1; EXPORT_SYMBOL(physical_mask); #endif -#ifdef CONFIG_HIGHPTE -#define PGTABLE_HIGHMEM __GFP_HIGHMEM -#else -#define PGTABLE_HIGHMEM 0 -#endif - -gfp_t __userpte_alloc_gfp = GFP_PGTABLE_USER | PGTABLE_HIGHMEM; - pgtable_t pte_alloc_one(struct mm_struct *mm) { - return __pte_alloc_one(mm, __userpte_alloc_gfp); -} - -static int __init setup_userpte(char *arg) -{ - if (!arg) - return -EINVAL; - - /* - * "userpte=nohigh" disables allocation of user pagetables in - * high memory. - */ - if (strcmp(arg, "nohigh") == 0) - __userpte_alloc_gfp &= ~__GFP_HIGHMEM; - else - return -EINVAL; - return 0; + return __pte_alloc_one(mm, GFP_PGTABLE_USER); } -early_param("userpte", setup_userpte); void ___pte_free_tlb(struct mmu_gather *tlb, struct page *pte) { -- cgit v1.2.3 From ca5955dd5f08727605723b60767fbf2cc3d54046 Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Wed, 26 Feb 2025 22:37:12 +0100 Subject: x86/cpu: Document CONFIG_X86_INTEL_MID as 64-bit-only The X86_INTEL_MID code was originally introduced for the 32-bit Moorestown/Medfield/Clovertrail platform, later the 64-bit Merrifield/Moorefield variants were added, but the final Morganfield 14nm platform was canceled before it hit the market. To help users understand what the option actually refers to, update the help text, and add a dependency on 64-bit kernels. Ferry confirmed that all the hardware can run 64-bit kernels these days, but is still testing 32-bit kernels on the Intel Edison board, so this remains possible, but is guarded by a CONFIG_EXPERT dependency now, to gently push remaining users towards using CONFIG_64BIT. Signed-off-by: Arnd Bergmann Signed-off-by: Ingo Molnar Acked-by: Andy Shevchenko Cc: Linus Torvalds Link: https://lore.kernel.org/r/20250226213714.4040853-9-arnd@kernel.org --- arch/x86/Kconfig | 50 +++++++++++++++++++++++++++++--------------------- 1 file changed, 29 insertions(+), 21 deletions(-) diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index 73eeaf295b74..acd4d73502d6 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -549,12 +549,12 @@ config X86_EXTENDED_PLATFORM RDC R-321x SoC SGI 320/540 (Visual Workstation) STA2X11-based (e.g. 
Northville) - Moorestown MID devices 64-bit platforms (CONFIG_64BIT=y): Numascale NumaChip ScaleMP vSMP SGI Ultraviolet + Merrifield/Moorefield MID devices If you have one of these systems, or if you want to build a generic distribution kernel, say Y here - otherwise say N. @@ -599,8 +599,31 @@ config X86_UV This option is needed in order to support SGI Ultraviolet systems. If you don't have one of these, you should say N here. -# Following is an alphabetically sorted list of 32 bit extended platforms -# Please maintain the alphabetic order if and when there are additions +config X86_INTEL_MID + bool "Intel Z34xx/Z35xx MID platform support" + depends on X86_EXTENDED_PLATFORM + depends on X86_PLATFORM_DEVICES + depends on PCI + depends on X86_64 || (EXPERT && PCI_GOANY) + depends on X86_IO_APIC + select I2C + select DW_APB_TIMER + select INTEL_SCU_PCI + help + Select to build a kernel capable of supporting 64-bit Intel MID + (Mobile Internet Device) platform systems which do not have + the PCI legacy interfaces. + + The only supported devices are the 22nm Merrified (Z34xx) + and Moorefield (Z35xx) SoC used in the Intel Edison board and + a small number of Android devices such as the Asus Zenfone 2, + Asus FonePad 8 and Dell Venue 7. + + If you are building for a PC class system or non-MID tablet + SoCs like Bay Trail (Z36xx/Z37xx), say N here. + + Intel MID platforms are based on an Intel processor and chipset which + consume less power than most of the x86 derivatives. config X86_GOLDFISH bool "Goldfish (Virtual Platform)" @@ -610,6 +633,9 @@ config X86_GOLDFISH for Android development. Unless you are building for the Android Goldfish emulator say N here. +# Following is an alphabetically sorted list of 32 bit extended platforms +# Please maintain the alphabetic order if and when there are additions + config X86_INTEL_CE bool "CE4100 TV platform" depends on PCI @@ -625,24 +651,6 @@ config X86_INTEL_CE This option compiles in support for the CE4100 SOC for settop boxes and media devices. -config X86_INTEL_MID - bool "Intel MID platform support" - depends on X86_EXTENDED_PLATFORM - depends on X86_PLATFORM_DEVICES - depends on PCI - depends on X86_64 || (PCI_GOANY && X86_32) - depends on X86_IO_APIC - select I2C - select DW_APB_TIMER - select INTEL_SCU_PCI - help - Select to build a kernel capable of supporting Intel MID (Mobile - Internet Device) platform systems which do not have the PCI legacy - interfaces. If you are building for a PC class system say N here. - - Intel MID platforms are based on an Intel processor and chipset which - consume less power than most of the x86 derivatives. - config X86_INTEL_QUARK bool "Intel Quark platform support" depends on X86_32 -- cgit v1.2.3 From dcbb01fbb7aeed0fae4dc1389a36842c77f4f381 Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Wed, 26 Feb 2025 22:37:13 +0100 Subject: x86/pci: Remove old STA2x11 support ST ConneXt STA2x11 was an interface chip for Atom E6xx processors, using a number of components usually found on Arm SoCs. Most of this was merged upstream, but it was never complete enough to actually work and has been abandoned for many years. We already had an agreement on removing it in 2022, but nobody ever submitted the patch to do it. Without STA2x11, CONFIG_X86_32_NON_STANDARD no longer has any use - remove it. 
Suggested-by: Davide Ciminaghi Signed-off-by: Arnd Bergmann Signed-off-by: Ingo Molnar Cc: Bjorn Helgaas Cc: Linus Torvalds Link: https://lore.kernel.org/r/20250226213714.4040853-10-arnd@kernel.org --- arch/x86/Kconfig | 32 +----- arch/x86/include/asm/sta2x11.h | 13 --- arch/x86/pci/Makefile | 2 - arch/x86/pci/sta2x11-fixup.c | 233 ----------------------------------------- 4 files changed, 3 insertions(+), 277 deletions(-) delete mode 100644 arch/x86/include/asm/sta2x11.h delete mode 100644 arch/x86/pci/sta2x11-fixup.c diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index acd4d73502d6..383b145ffe9f 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -548,7 +548,6 @@ config X86_EXTENDED_PLATFORM AMD Elan RDC R-321x SoC SGI 320/540 (Visual Workstation) - STA2X11-based (e.g. Northville) 64-bit platforms (CONFIG_64BIT=y): Numascale NumaChip @@ -732,18 +731,6 @@ config X86_RDC321X as R-8610-(G). If you don't have one of these chips, you should say N here. -config X86_32_NON_STANDARD - bool "Support non-standard 32-bit SMP architectures" - depends on X86_32 && SMP - depends on X86_EXTENDED_PLATFORM - help - This option compiles in the STA2X11 default - subarchitecture. It is intended for a generic binary - kernel. If you select them all, kernel will probe it one by - one and will fallback to default. - -# Alphabetically sorted list of Non standard 32 bit platforms - config X86_SUPPORTS_MEMORY_FAILURE def_bool y # MCE code calls memory_failure(): @@ -753,19 +740,6 @@ config X86_SUPPORTS_MEMORY_FAILURE depends on X86_64 || !SPARSEMEM select ARCH_SUPPORTS_MEMORY_FAILURE -config STA2X11 - bool "STA2X11 Companion Chip Support" - depends on X86_32_NON_STANDARD && PCI - select SWIOTLB - select MFD_STA2X11 - select GPIOLIB - help - This adds support for boards based on the STA2X11 IO-Hub, - a.k.a. "ConneXt". The chip is used in place of the standard - PC chipset, so all "standard" peripherals are missing. If this - option is selected the kernel will still be able to boot on - standard PC machines. - config X86_32_IRIS tristate "Eurobraille/Iris poweroff module" depends on X86_32 @@ -1103,7 +1077,7 @@ config UP_LATE_INIT config X86_UP_APIC bool "Local APIC support on uniprocessors" if !PCI_MSI default PCI_MSI - depends on X86_32 && !SMP && !X86_32_NON_STANDARD + depends on X86_32 && !SMP help A local APIC (Advanced Programmable Interrupt Controller) is an integrated interrupt controller in the CPU. 
If you have a single-CPU @@ -1128,7 +1102,7 @@ config X86_UP_IOAPIC config X86_LOCAL_APIC def_bool y - depends on X86_64 || SMP || X86_32_NON_STANDARD || X86_UP_APIC || PCI_MSI + depends on X86_64 || SMP || X86_UP_APIC || PCI_MSI select IRQ_DOMAIN_HIERARCHY config ACPI_MADT_WAKEUP @@ -1590,7 +1564,7 @@ config ARCH_FLATMEM_ENABLE config ARCH_SPARSEMEM_ENABLE def_bool y - depends on X86_64 || NUMA || X86_32 || X86_32_NON_STANDARD + depends on X86_64 || NUMA || X86_32 select SPARSEMEM_STATIC if X86_32 select SPARSEMEM_VMEMMAP_ENABLE if X86_64 diff --git a/arch/x86/include/asm/sta2x11.h b/arch/x86/include/asm/sta2x11.h deleted file mode 100644 index e0975e9c4f47..000000000000 --- a/arch/x86/include/asm/sta2x11.h +++ /dev/null @@ -1,13 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -/* - * Header file for STMicroelectronics ConneXt (STA2X11) IOHub - */ -#ifndef __ASM_STA2X11_H -#define __ASM_STA2X11_H - -#include - -/* This needs to be called from the MFD to configure its sub-devices */ -struct sta2x11_instance *sta2x11_get_instance(struct pci_dev *pdev); - -#endif /* __ASM_STA2X11_H */ diff --git a/arch/x86/pci/Makefile b/arch/x86/pci/Makefile index 48bcada5cabe..4933fb337983 100644 --- a/arch/x86/pci/Makefile +++ b/arch/x86/pci/Makefile @@ -12,8 +12,6 @@ obj-$(CONFIG_X86_INTEL_CE) += ce4100.o obj-$(CONFIG_ACPI) += acpi.o obj-y += legacy.o irq.o -obj-$(CONFIG_STA2X11) += sta2x11-fixup.o - obj-$(CONFIG_X86_NUMACHIP) += numachip.o obj-$(CONFIG_X86_INTEL_MID) += intel_mid_pci.o diff --git a/arch/x86/pci/sta2x11-fixup.c b/arch/x86/pci/sta2x11-fixup.c deleted file mode 100644 index 8c8ddc4dcc08..000000000000 --- a/arch/x86/pci/sta2x11-fixup.c +++ /dev/null @@ -1,233 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0-only -/* - * DMA translation between STA2x11 AMBA memory mapping and the x86 memory mapping - * - * ST Microelectronics ConneXt (STA2X11/STA2X10) - * - * Copyright (c) 2010-2011 Wind River Systems, Inc. - */ - -#include -#include -#include -#include -#include -#include -#include -#include - -#define STA2X11_SWIOTLB_SIZE (4*1024*1024) - -/* - * We build a list of bus numbers that are under the ConneXt. The - * main bridge hosts 4 busses, which are the 4 endpoints, in order. 
- */ -#define STA2X11_NR_EP 4 /* 0..3 included */ -#define STA2X11_NR_FUNCS 8 /* 0..7 included */ -#define STA2X11_AMBA_SIZE (512 << 20) - -struct sta2x11_ahb_regs { /* saved during suspend */ - u32 base, pexlbase, pexhbase, crw; -}; - -struct sta2x11_mapping { - int is_suspended; - struct sta2x11_ahb_regs regs[STA2X11_NR_FUNCS]; -}; - -struct sta2x11_instance { - struct list_head list; - int bus0; - struct sta2x11_mapping map[STA2X11_NR_EP]; -}; - -static LIST_HEAD(sta2x11_instance_list); - -/* At probe time, record new instances of this bridge (likely one only) */ -static void sta2x11_new_instance(struct pci_dev *pdev) -{ - struct sta2x11_instance *instance; - - instance = kzalloc(sizeof(*instance), GFP_ATOMIC); - if (!instance) - return; - /* This has a subordinate bridge, with 4 more-subordinate ones */ - instance->bus0 = pdev->subordinate->number + 1; - - if (list_empty(&sta2x11_instance_list)) { - int size = STA2X11_SWIOTLB_SIZE; - /* First instance: register your own swiotlb area */ - dev_info(&pdev->dev, "Using SWIOTLB (size %i)\n", size); - if (swiotlb_init_late(size, GFP_DMA, NULL)) - dev_emerg(&pdev->dev, "init swiotlb failed\n"); - } - list_add(&instance->list, &sta2x11_instance_list); -} -DECLARE_PCI_FIXUP_ENABLE(PCI_VENDOR_ID_STMICRO, 0xcc17, sta2x11_new_instance); - -/* - * Utility functions used in this file from below - */ -static struct sta2x11_instance *sta2x11_pdev_to_instance(struct pci_dev *pdev) -{ - struct sta2x11_instance *instance; - int ep; - - list_for_each_entry(instance, &sta2x11_instance_list, list) { - ep = pdev->bus->number - instance->bus0; - if (ep >= 0 && ep < STA2X11_NR_EP) - return instance; - } - return NULL; -} - -static int sta2x11_pdev_to_ep(struct pci_dev *pdev) -{ - struct sta2x11_instance *instance; - - instance = sta2x11_pdev_to_instance(pdev); - if (!instance) - return -1; - - return pdev->bus->number - instance->bus0; -} - -/* This is exported, as some devices need to access the MFD registers */ -struct sta2x11_instance *sta2x11_get_instance(struct pci_dev *pdev) -{ - return sta2x11_pdev_to_instance(pdev); -} -EXPORT_SYMBOL(sta2x11_get_instance); - -/* At setup time, we use our own ops if the device is a ConneXt one */ -static void sta2x11_setup_pdev(struct pci_dev *pdev) -{ - struct sta2x11_instance *instance = sta2x11_pdev_to_instance(pdev); - - if (!instance) /* either a sta2x11 bridge or another ST device */ - return; - - /* We must enable all devices as master, for audio DMA to work */ - pci_set_master(pdev); -} -DECLARE_PCI_FIXUP_ENABLE(PCI_VENDOR_ID_STMICRO, PCI_ANY_ID, sta2x11_setup_pdev); - -/* - * At boot we must set up the mappings for the pcie-to-amba bridge. 
- * It involves device access, and the same happens at suspend/resume time - */ - -#define AHB_MAPB 0xCA4 -#define AHB_CRW(i) (AHB_MAPB + 0 + (i) * 0x10) -#define AHB_CRW_SZMASK 0xfffffc00UL -#define AHB_CRW_ENABLE (1 << 0) -#define AHB_CRW_WTYPE_MEM (2 << 1) -#define AHB_CRW_ROE (1UL << 3) /* Relax Order Ena */ -#define AHB_CRW_NSE (1UL << 4) /* No Snoop Enable */ -#define AHB_BASE(i) (AHB_MAPB + 4 + (i) * 0x10) -#define AHB_PEXLBASE(i) (AHB_MAPB + 8 + (i) * 0x10) -#define AHB_PEXHBASE(i) (AHB_MAPB + 12 + (i) * 0x10) - -/* At probe time, enable mapping for each endpoint, using the pdev */ -static void sta2x11_map_ep(struct pci_dev *pdev) -{ - struct sta2x11_instance *instance = sta2x11_pdev_to_instance(pdev); - struct device *dev = &pdev->dev; - u32 amba_base, max_amba_addr; - int i, ret; - - if (!instance) - return; - - pci_read_config_dword(pdev, AHB_BASE(0), &amba_base); - max_amba_addr = amba_base + STA2X11_AMBA_SIZE - 1; - - ret = dma_direct_set_offset(dev, 0, amba_base, STA2X11_AMBA_SIZE); - if (ret) - dev_err(dev, "sta2x11: could not set DMA offset\n"); - - dev->bus_dma_limit = max_amba_addr; - dma_set_mask_and_coherent(&pdev->dev, max_amba_addr); - - /* Configure AHB mapping */ - pci_write_config_dword(pdev, AHB_PEXLBASE(0), 0); - pci_write_config_dword(pdev, AHB_PEXHBASE(0), 0); - pci_write_config_dword(pdev, AHB_CRW(0), STA2X11_AMBA_SIZE | - AHB_CRW_WTYPE_MEM | AHB_CRW_ENABLE); - - /* Disable all the other windows */ - for (i = 1; i < STA2X11_NR_FUNCS; i++) - pci_write_config_dword(pdev, AHB_CRW(i), 0); - - dev_info(&pdev->dev, - "sta2x11: Map EP %i: AMBA address %#8x-%#8x\n", - sta2x11_pdev_to_ep(pdev), amba_base, max_amba_addr); -} -DECLARE_PCI_FIXUP_ENABLE(PCI_VENDOR_ID_STMICRO, PCI_ANY_ID, sta2x11_map_ep); - -#ifdef CONFIG_PM /* Some register values must be saved and restored */ - -static struct sta2x11_mapping *sta2x11_pdev_to_mapping(struct pci_dev *pdev) -{ - struct sta2x11_instance *instance; - int ep; - - instance = sta2x11_pdev_to_instance(pdev); - if (!instance) - return NULL; - ep = sta2x11_pdev_to_ep(pdev); - return instance->map + ep; -} - -static void suspend_mapping(struct pci_dev *pdev) -{ - struct sta2x11_mapping *map = sta2x11_pdev_to_mapping(pdev); - int i; - - if (!map) - return; - - if (map->is_suspended) - return; - map->is_suspended = 1; - - /* Save all window configs */ - for (i = 0; i < STA2X11_NR_FUNCS; i++) { - struct sta2x11_ahb_regs *regs = map->regs + i; - - pci_read_config_dword(pdev, AHB_BASE(i), ®s->base); - pci_read_config_dword(pdev, AHB_PEXLBASE(i), ®s->pexlbase); - pci_read_config_dword(pdev, AHB_PEXHBASE(i), ®s->pexhbase); - pci_read_config_dword(pdev, AHB_CRW(i), ®s->crw); - } -} -DECLARE_PCI_FIXUP_SUSPEND(PCI_VENDOR_ID_STMICRO, PCI_ANY_ID, suspend_mapping); - -static void resume_mapping(struct pci_dev *pdev) -{ - struct sta2x11_mapping *map = sta2x11_pdev_to_mapping(pdev); - int i; - - if (!map) - return; - - - if (!map->is_suspended) - goto out; - map->is_suspended = 0; - - /* Restore all window configs */ - for (i = 0; i < STA2X11_NR_FUNCS; i++) { - struct sta2x11_ahb_regs *regs = map->regs + i; - - pci_write_config_dword(pdev, AHB_BASE(i), regs->base); - pci_write_config_dword(pdev, AHB_PEXLBASE(i), regs->pexlbase); - pci_write_config_dword(pdev, AHB_PEXHBASE(i), regs->pexhbase); - pci_write_config_dword(pdev, AHB_CRW(i), regs->crw); - } -out: - pci_set_master(pdev); /* Like at boot, enable master on all devices */ -} -DECLARE_PCI_FIXUP_RESUME(PCI_VENDOR_ID_STMICRO, PCI_ANY_ID, resume_mapping); - -#endif /* CONFIG_PM */ -- cgit v1.2.3 
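For background, the deleted sta2x11-fixup.c hooked into the PCI core through the standard quirk mechanism; a minimal sketch of that pattern (the callback body is illustrative only, not part of the patch):

	#include <linux/pci.h>

	/* The hook runs for every PCI device matching the vendor/device
	 * pair at the named stage (enable, suspend, resume, ...). */
	static void example_enable_fixup(struct pci_dev *pdev)
	{
		/* Illustrative action, mirroring the removed code */
		pci_set_master(pdev);
	}
	DECLARE_PCI_FIXUP_ENABLE(PCI_VENDOR_ID_STMICRO, PCI_ANY_ID,
				 example_enable_fixup);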
From 976ba8da2f3c2f1e997f4f620da83ae65c0e3728 Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Wed, 26 Feb 2025 22:37:14 +0100 Subject: x86/platform: Only allow CONFIG_EISA for 32-bit The CONFIG_EISA menu was cleaned up in 2018, but this inadvertently brought the option back on 64-bit machines: ISA remains guarded by a CONFIG_X86_32 check, but EISA no longer depends on ISA. The last Intel machines with EISA support used an 82375EB PCI/EISA bridge from 1993 that could be paired with the 440FX chipset on early Pentium-II CPUs, long before the first x86-64 products. Fixes: 6630a8e50105 ("eisa: consolidate EISA Kconfig entry in drivers/eisa") Signed-off-by: Arnd Bergmann Signed-off-by: Ingo Molnar Cc: Linus Torvalds Link: https://lore.kernel.org/r/20250226213714.4040853-11-arnd@kernel.org --- arch/x86/Kconfig | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index 383b145ffe9f..aa90f0355be1 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -233,7 +233,7 @@ config X86 select HAVE_SAMPLE_FTRACE_DIRECT_MULTI if X86_64 select HAVE_EBPF_JIT select HAVE_EFFICIENT_UNALIGNED_ACCESS - select HAVE_EISA + select HAVE_EISA if X86_32 select HAVE_EXIT_THREAD select HAVE_GUP_FAST select HAVE_FENTRY if X86_64 || DYNAMIC_FTRACE -- cgit v1.2.3 From 4a412c70af674198749fd16be695d53e1c41b5f9 Mon Sep 17 00:00:00 2001 From: Pawan Gupta Date: Wed, 11 Dec 2024 22:57:24 -0800 Subject: x86/cpu: Prefix hexadecimal values with 0x in cpu_debug_show() The hex values in the CPU debug interface are not prefixed with 0x. This may cause misinterpretation of values. Fix it. [ mingo: Restore previous vertical alignment of the output. ] Signed-off-by: Pawan Gupta Signed-off-by: Ingo Molnar Acked-by: Dave Hansen Link: https://lore.kernel.org/r/20241211-add-cpu-type-v5-1-2ae010f50370@linux.intel.com --- arch/x86/kernel/cpu/debugfs.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/arch/x86/kernel/cpu/debugfs.c b/arch/x86/kernel/cpu/debugfs.c index cacfd3f6abef..1976fef2dfe5 100644 --- a/arch/x86/kernel/cpu/debugfs.c +++ b/arch/x86/kernel/cpu/debugfs.c @@ -16,8 +16,8 @@ static int cpu_debug_show(struct seq_file *m, void *p) if (!c->initialized) return 0; - seq_printf(m, "initial_apicid: %x\n", c->topo.initial_apicid); - seq_printf(m, "apicid: %x\n", c->topo.apicid); + seq_printf(m, "initial_apicid: 0x%x\n", c->topo.initial_apicid); + seq_printf(m, "apicid: 0x%x\n", c->topo.apicid); seq_printf(m, "pkg_id: %u\n", c->topo.pkg_id); seq_printf(m, "die_id: %u\n", c->topo.die_id); seq_printf(m, "cu_id: %u\n", c->topo.cu_id); -- cgit v1.2.3 From b52aaeeadfac54c91005e044b72b62616a5864a9 Mon Sep 17 00:00:00 2001 From: Pawan Gupta Date: Wed, 11 Dec 2024 22:57:30 -0800 Subject: cpufreq: intel_pstate: Avoid SMP calls to get cpu-type The intel_pstate driver relies on SMP calls to get the cpu-type of a given CPU. Remove the SMP calls and instead use the cached value of cpu-type, which is more efficient. [ mingo: Forward ported it. ] Suggested-by: Dave Hansen Signed-off-by: Pawan Gupta Signed-off-by: Ingo Molnar Acked-by: Srinivas Pandruvada Acked-by: Rafael J.
Wysocki Acked-by: Dave Hansen Link: https://lore.kernel.org/r/20241211-add-cpu-type-v5-2-2ae010f50370@linux.intel.com --- drivers/cpufreq/intel_pstate.c | 16 ++++------------ 1 file changed, 4 insertions(+), 12 deletions(-) diff --git a/drivers/cpufreq/intel_pstate.c b/drivers/cpufreq/intel_pstate.c index 9c4cc01fd51a..f06b9bc99945 100644 --- a/drivers/cpufreq/intel_pstate.c +++ b/drivers/cpufreq/intel_pstate.c @@ -2200,28 +2200,20 @@ static int knl_get_turbo_pstate(int cpu) return ret; } -static void hybrid_get_type(void *data) -{ - u8 *cpu_type = data; - - *cpu_type = get_this_hybrid_cpu_type(); -} - static int hwp_get_cpu_scaling(int cpu) { if (hybrid_scaling_factor) { - u8 cpu_type = 0; - - smp_call_function_single(cpu, hybrid_get_type, &cpu_type, 1); + struct cpuinfo_x86 *c = &cpu_data(smp_processor_id()); + u8 cpu_type = c->topo.intel_type; /* * Return the hybrid scaling factor for P-cores and use the * default core scaling for E-cores. */ - if (cpu_type == 0x40) + if (cpu_type == INTEL_CPU_TYPE_CORE) return hybrid_scaling_factor; - if (cpu_type == 0x20) + if (cpu_type == INTEL_CPU_TYPE_ATOM) return core_get_scaling(); } -- cgit v1.2.3 From c4a8b7116b9927f7b00bd68140e285662a03068e Mon Sep 17 00:00:00 2001 From: Pawan Gupta Date: Wed, 11 Dec 2024 22:57:36 -0800 Subject: perf/x86/intel: Use cache cpu-type for hybrid PMU selection get_this_hybrid_cpu_type() misses a case when cpu-type is populated regardless of X86_FEATURE_HYBRID_CPU. This is particularly true for hybrid variants that have P or E cores fused off. Instead use the cpu-type cached in struct x86_topology, as it does not rely on hybrid feature to enumerate cpu-type. This can also help avoid the model-specific fixup get_hybrid_cpu_type(). Also replace the get_this_hybrid_cpu_native_id() with its cached value in struct x86_topology. While at it, remove enum hybrid_cpu_type as it serves no purpose when we have the exact cpu-types defined in enum intel_cpu_type. Also rename atom_native_id to intel_native_id and move it to intel-family.h where intel_cpu_type lives. Suggested-by: Dave Hansen Signed-off-by: Pawan Gupta Signed-off-by: Ingo Molnar Acked-by: Dave Hansen Link: https://lore.kernel.org/r/20241211-add-cpu-type-v5-3-2ae010f50370@linux.intel.com --- arch/x86/events/intel/core.c | 19 ++++++++++--------- arch/x86/events/perf_event.h | 19 +------------------ arch/x86/include/asm/intel-family.h | 15 ++++++++++++++- 3 files changed, 25 insertions(+), 28 deletions(-) diff --git a/arch/x86/events/intel/core.c b/arch/x86/events/intel/core.c index 3cf65e93a03f..397c545b8610 100644 --- a/arch/x86/events/intel/core.c +++ b/arch/x86/events/intel/core.c @@ -4606,9 +4606,9 @@ static int adl_hw_config(struct perf_event *event) return -EOPNOTSUPP; } -static enum hybrid_cpu_type adl_get_hybrid_cpu_type(void) +static enum intel_cpu_type adl_get_hybrid_cpu_type(void) { - return HYBRID_INTEL_CORE; + return INTEL_CPU_TYPE_CORE; } static inline bool erratum_hsw11(struct perf_event *event) @@ -4953,7 +4953,8 @@ static void intel_pmu_check_hybrid_pmus(struct x86_hybrid_pmu *pmu) static struct x86_hybrid_pmu *find_hybrid_pmu_for_cpu(void) { - u8 cpu_type = get_this_hybrid_cpu_type(); + struct cpuinfo_x86 *c = &cpu_data(smp_processor_id()); + enum intel_cpu_type cpu_type = c->topo.intel_type; int i; /* @@ -4962,7 +4963,7 @@ static struct x86_hybrid_pmu *find_hybrid_pmu_for_cpu(void) * on it. There should be a fixup function provided for these * troublesome CPUs (->get_hybrid_cpu_type). 
*/ - if (cpu_type == HYBRID_INTEL_NONE) { + if (cpu_type == INTEL_CPU_TYPE_UNKNOWN) { if (x86_pmu.get_hybrid_cpu_type) cpu_type = x86_pmu.get_hybrid_cpu_type(); else @@ -4979,16 +4980,16 @@ static struct x86_hybrid_pmu *find_hybrid_pmu_for_cpu(void) enum hybrid_pmu_type pmu_type = x86_pmu.hybrid_pmu[i].pmu_type; u32 native_id; - if (cpu_type == HYBRID_INTEL_CORE && pmu_type == hybrid_big) + if (cpu_type == INTEL_CPU_TYPE_CORE && pmu_type == hybrid_big) return &x86_pmu.hybrid_pmu[i]; - if (cpu_type == HYBRID_INTEL_ATOM) { + if (cpu_type == INTEL_CPU_TYPE_ATOM) { if (x86_pmu.num_hybrid_pmus == 2 && pmu_type == hybrid_small) return &x86_pmu.hybrid_pmu[i]; - native_id = get_this_hybrid_cpu_native_id(); - if (native_id == skt_native_id && pmu_type == hybrid_small) + native_id = c->topo.intel_native_model_id; + if (native_id == INTEL_ATOM_SKT_NATIVE_ID && pmu_type == hybrid_small) return &x86_pmu.hybrid_pmu[i]; - if (native_id == cmt_native_id && pmu_type == hybrid_tiny) + if (native_id == INTEL_ATOM_CMT_NATIVE_ID && pmu_type == hybrid_tiny) return &x86_pmu.hybrid_pmu[i]; } } diff --git a/arch/x86/events/perf_event.h b/arch/x86/events/perf_event.h index 31c2771545a6..7b18754084a6 100644 --- a/arch/x86/events/perf_event.h +++ b/arch/x86/events/perf_event.h @@ -669,18 +669,6 @@ enum { #define PERF_PEBS_DATA_SOURCE_GRT_MAX 0x10 #define PERF_PEBS_DATA_SOURCE_GRT_MASK (PERF_PEBS_DATA_SOURCE_GRT_MAX - 1) -/* - * CPUID.1AH.EAX[31:0] uniquely identifies the microarchitecture - * of the core. Bits 31-24 indicates its core type (Core or Atom) - * and Bits [23:0] indicates the native model ID of the core. - * Core type and native model ID are defined in below enumerations. - */ -enum hybrid_cpu_type { - HYBRID_INTEL_NONE, - HYBRID_INTEL_ATOM = 0x20, - HYBRID_INTEL_CORE = 0x40, -}; - #define X86_HYBRID_PMU_ATOM_IDX 0 #define X86_HYBRID_PMU_CORE_IDX 1 #define X86_HYBRID_PMU_TINY_IDX 2 @@ -697,11 +685,6 @@ enum hybrid_pmu_type { hybrid_big_small_tiny = hybrid_big | hybrid_small_tiny, }; -enum atom_native_id { - cmt_native_id = 0x2, /* Crestmont */ - skt_native_id = 0x3, /* Skymont */ -}; - struct x86_hybrid_pmu { struct pmu pmu; const char *name; @@ -994,7 +977,7 @@ struct x86_pmu { */ int num_hybrid_pmus; struct x86_hybrid_pmu *hybrid_pmu; - enum hybrid_cpu_type (*get_hybrid_cpu_type) (void); + enum intel_cpu_type (*get_hybrid_cpu_type) (void); }; struct x86_perf_task_context_opt { diff --git a/arch/x86/include/asm/intel-family.h b/arch/x86/include/asm/intel-family.h index f9f67afeb48a..b657d78071c6 100644 --- a/arch/x86/include/asm/intel-family.h +++ b/arch/x86/include/asm/intel-family.h @@ -182,10 +182,23 @@ /* Family 19 */ #define INTEL_PANTHERCOVE_X IFM(19, 0x01) /* Diamond Rapids */ -/* CPU core types */ +/* + * Intel CPU core types + * + * CPUID.1AH.EAX[31:0] uniquely identifies the microarchitecture + * of the core. Bits 31-24 indicates its core type (Core or Atom) + * and Bits [23:0] indicates the native model ID of the core. + * Core type and native model ID are defined in below enumerations. 
+ */ enum intel_cpu_type { + INTEL_CPU_TYPE_UNKNOWN, INTEL_CPU_TYPE_ATOM = 0x20, INTEL_CPU_TYPE_CORE = 0x40, }; +enum intel_native_id { + INTEL_ATOM_CMT_NATIVE_ID = 0x2, /* Crestmont */ + INTEL_ATOM_SKT_NATIVE_ID = 0x3, /* Skymont */ +}; + #endif /* _ASM_X86_INTEL_FAMILY_H */ -- cgit v1.2.3 From db5157df149709c02e6a08c0b3498553bdd2a76c Mon Sep 17 00:00:00 2001 From: Pawan Gupta Date: Wed, 11 Dec 2024 22:57:41 -0800 Subject: x86/cpu: Remove get_this_hybrid_cpu_*() Calls to get_this_hybrid_cpu_type() and get_this_hybrid_cpu_native_id() are no longer required: cpu-type and native-model-id are now cached at boot in the per-cpu struct cpuinfo_topology. Signed-off-by: Pawan Gupta Signed-off-by: Ingo Molnar Acked-by: Dave Hansen Link: https://lore.kernel.org/r/20241211-add-cpu-type-v5-4-2ae010f50370@linux.intel.com --- arch/x86/include/asm/cpu.h | 14 -------------- arch/x86/kernel/cpu/intel.c | 31 ------------------------------- 2 files changed, 45 deletions(-) diff --git a/arch/x86/include/asm/cpu.h b/arch/x86/include/asm/cpu.h index 98eced5084ca..0c8ec62789a1 100644 --- a/arch/x86/include/asm/cpu.h +++ b/arch/x86/include/asm/cpu.h @@ -50,20 +50,6 @@ static inline void split_lock_init(void) {} static inline void bus_lock_init(void) {} #endif -#ifdef CONFIG_CPU_SUP_INTEL -u8 get_this_hybrid_cpu_type(void); -u32 get_this_hybrid_cpu_native_id(void); -#else -static inline u8 get_this_hybrid_cpu_type(void) -{ - return 0; -} - -static inline u32 get_this_hybrid_cpu_native_id(void) -{ - return 0; -} -#endif #ifdef CONFIG_IA32_FEAT_CTL void init_ia32_feat_ctl(struct cpuinfo_x86 *c); #else diff --git a/arch/x86/kernel/cpu/intel.c b/arch/x86/kernel/cpu/intel.c index 3dce22f00dc3..045b439c653a 100644 --- a/arch/x86/kernel/cpu/intel.c +++ b/arch/x86/kernel/cpu/intel.c @@ -873,34 +873,3 @@ static const struct cpu_dev intel_cpu_dev = { }; cpu_dev_register(intel_cpu_dev); - -#define X86_HYBRID_CPU_TYPE_ID_SHIFT 24 - -/** - * get_this_hybrid_cpu_type() - Get the type of this hybrid CPU - * - * Returns the CPU type [31:24] (i.e., Atom or Core) of a CPU in - * a hybrid processor. If the processor is not hybrid, returns 0. - */ -u8 get_this_hybrid_cpu_type(void) -{ - if (!cpu_feature_enabled(X86_FEATURE_HYBRID_CPU)) - return 0; - - return cpuid_eax(0x0000001a) >> X86_HYBRID_CPU_TYPE_ID_SHIFT; -} - -/** - * get_this_hybrid_cpu_native_id() - Get the native id of this hybrid CPU - * - * Returns the uarch native ID [23:0] of a CPU in a hybrid processor. - * If the processor is not hybrid, returns 0. - */ -u32 get_this_hybrid_cpu_native_id(void) -{ - if (!cpu_feature_enabled(X86_FEATURE_HYBRID_CPU)) - return 0; - - return cpuid_eax(0x0000001a) & - (BIT_ULL(X86_HYBRID_CPU_TYPE_ID_SHIFT) - 1); -} -- cgit v1.2.3 From b6762467a09ba8838c499e4f36561e82fc608ed1 Mon Sep 17 00:00:00 2001 From: Uros Bizjak Date: Thu, 27 Feb 2025 15:06:58 +0100 Subject: x86/percpu: Disable named address spaces for UBSAN_BOOL with KASAN for GCC < 14.2 GCC < 14.2 does not correctly propagate address space qualifiers with -fsanitize=bool,enum. Combined with the address sanitizer, this causes such loads to be sanitized when they should not be. Disable named address spaces for GCC < 14.2 when both UBSAN_BOOL and KASAN are enabled.
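For context, GCC named address spaces let per-CPU accesses compile to plain %gs-relative loads; a minimal sketch of the construct the CC_HAS_NAMED_AS probe in the diff below tests for (illustrative only, GCC-specific):

	/* A __seg_gs object is addressed relative to the GS segment base. */
	int __seg_gs counter;

	int read_counter(void)
	{
		return counter;	/* compiles to a %gs:-prefixed load */
	}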
Reported-by: Matt Fleming Signed-off-by: Uros Bizjak Signed-off-by: Ingo Molnar Link: https://lore.kernel.org/r/20250227140715.2276353-1-ubizjak@gmail.com Closes: https://lore.kernel.org/lkml/20241213190119.3449103-1-matt@readmodwrite.com/ --- arch/x86/Kconfig | 20 +++++++++++--------- 1 file changed, 11 insertions(+), 9 deletions(-) diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index 6595b35dd52d..867ec8a4a9d2 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -2441,18 +2441,20 @@ config CC_HAS_NAMED_AS def_bool $(success,echo 'int __seg_fs fs; int __seg_gs gs;' | $(CC) -x c - -S -o /dev/null) depends on CC_IS_GCC +# +# -fsanitize=kernel-address (KASAN) and -fsanitize=thread (KCSAN) +# are incompatible with named address spaces with GCC < 13.3 +# (see GCC PR sanitizer/111736 and also PR sanitizer/115172). +# + config CC_HAS_NAMED_AS_FIXED_SANITIZERS - def_bool CC_IS_GCC && GCC_VERSION >= 130300 + def_bool y + depends on !(KASAN || KCSAN) || GCC_VERSION >= 130300 + depends on !(UBSAN_BOOL && KASAN) || GCC_VERSION >= 140200 config USE_X86_SEG_SUPPORT - def_bool y - depends on CC_HAS_NAMED_AS - # - # -fsanitize=kernel-address (KASAN) and -fsanitize=thread - # (KCSAN) are incompatible with named address spaces with - # GCC < 13.3 - see GCC PR sanitizer/111736. - # - depends on !(KASAN || KCSAN) || CC_HAS_NAMED_AS_FIXED_SANITIZERS + def_bool CC_HAS_NAMED_AS + depends on CC_HAS_NAMED_AS_FIXED_SANITIZERS config CC_HAS_SLS def_bool $(cc-option,-mharden-sls=all) -- cgit v1.2.3 From 18cdd90aba794333f4c6dce39f5c3fe642af5575 Mon Sep 17 00:00:00 2001 From: Brian Gerst Date: Thu, 27 Feb 2025 14:53:02 -0500 Subject: x86/bpf: Fix BPF percpu accesses Due to this recent commit in the x86 tree: 9d7de2aa8b41 ("Use relative percpu offsets") percpu addresses went from positive offsets from the GSBASE to negative kernel virtual addresses. The BPF verifier has an optimization for x86-64 that loads the address of cpu_number into a register, but was only doing a 32-bit load which truncates negative addresses. Change it to a 64-bit load so that the address is properly sign-extended. Fixes: 9d7de2aa8b41 ("Use relative percpu offsets") Signed-off-by: Brian Gerst Signed-off-by: Ingo Molnar Cc: Alexei Starovoitov Cc: Daniel Borkmann Cc: Uros Bizjak Cc: Linus Torvalds Link: https://lore.kernel.org/r/20250227195302.1667654-1-brgerst@gmail.com --- kernel/bpf/verifier.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 9971c03adfd5..f74263b206e4 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -21692,7 +21692,7 @@ patch_map_ops_generic: * way, it's fine to back out this inlining logic */ #ifdef CONFIG_SMP - insn_buf[0] = BPF_MOV32_IMM(BPF_REG_0, (u32)(unsigned long)&pcpu_hot.cpu_number); + insn_buf[0] = BPF_MOV64_IMM(BPF_REG_0, (u32)(unsigned long)&pcpu_hot.cpu_number); insn_buf[1] = BPF_MOV64_PERCPU_REG(BPF_REG_0, BPF_REG_0); insn_buf[2] = BPF_LDX_MEM(BPF_W, BPF_REG_0, BPF_REG_0, 0); cnt = 3; -- cgit v1.2.3 From ad546940b5991d3e141238cd80a6d1894b767184 Mon Sep 17 00:00:00 2001 From: "Xin Li (Intel)" Date: Tue, 26 Nov 2024 10:45:28 -0800 Subject: x86/ia32: Leave NULL selector values 0~3 unchanged The first GDT descriptor is reserved as 'NULL descriptor'. As bits 0 and 1 of a segment selector, i.e., the RPL bits, are NOT used to index GDT, selector values 0~3 all point to the NULL descriptor, thus values 0, 1, 2 and 3 are all valid NULL selector values. 
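To make the selector arithmetic concrete, a small userspace sketch (illustrative only, not kernel code):

	#include <stdio.h>
	#include <stdint.h>

	/* x86 selector layout: | index:13 | TI:1 | RPL:2 |. The two RPL
	 * bits do not take part in GDT indexing, so selectors 0..3 all
	 * name GDT entry 0, the NULL descriptor. */
	static void decode(uint16_t sel)
	{
		printf("sel=%#06x index=%u ti=%u rpl=%u\n",
		       sel, sel >> 3, (sel >> 2) & 1, sel & 3);
	}

	int main(void)
	{
		for (uint16_t sel = 0; sel <= 3; sel++)
			decode(sel);	/* all index 0: valid NULL selectors */
		decode(0x2b);		/* index 5, RPL 3 */
		return 0;
	}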
When a NULL selector value is to be loaded into a segment register, reload_segments() sets its RPL bits. Later IRET zeros ES, FS, GS, and DS segment registers if any of them is found to have any nonzero NULL selector value. The two operations offset each other to actually effect a nop. Besides, zeroing of RPL in NULL selector values is an information leak in pre-FRED systems as userspace can spot any interrupt/exception by loading a nonzero NULL selector, and waiting for it to become zero. But there is nothing software can do to prevent it before FRED. ERETU, the only legit instruction to return to userspace from kernel under FRED, by design does NOT zero any segment register to avoid this problem behavior. As such, leave NULL selector values 0~3 unchanged and close the leak. Do the same on 32-bit kernel as well. Signed-off-by: Xin Li (Intel) Signed-off-by: Ingo Molnar Reviewed-by: Andrew Cooper Cc: Linus Torvalds Cc: Andy Lutomirski Cc: Brian Gerst Cc: Peter Zijlstra Link: https://lore.kernel.org/r/20241126184529.1607334-1-xin@zytor.com --- arch/x86/kernel/signal_32.c | 62 +++++++++++++++++++++++++++++++-------------- 1 file changed, 43 insertions(+), 19 deletions(-) diff --git a/arch/x86/kernel/signal_32.c b/arch/x86/kernel/signal_32.c index ef654530bf5a..98123ff10506 100644 --- a/arch/x86/kernel/signal_32.c +++ b/arch/x86/kernel/signal_32.c @@ -33,25 +33,55 @@ #include #include +/* + * The first GDT descriptor is reserved as 'NULL descriptor'. As bits 0 + * and 1 of a segment selector, i.e., the RPL bits, are NOT used to index + * GDT, selector values 0~3 all point to the NULL descriptor, thus values + * 0, 1, 2 and 3 are all valid NULL selector values. + * + * However IRET zeros ES, FS, GS, and DS segment registers if any of them + * is found to have any nonzero NULL selector value, which can be used by + * userspace in pre-FRED systems to spot any interrupt/exception by loading + * a nonzero NULL selector and waiting for it to become zero. Before FRED + * there was nothing software could do to prevent such an information leak. + * + * ERETU, the only legit instruction to return to userspace from kernel + * under FRED, by design does NOT zero any segment register to avoid this + * problem behavior. + * + * As such, leave NULL selector values 0~3 unchanged. + */ +static inline u16 fixup_rpl(u16 sel) +{ + return sel <= 3 ? sel : sel | 3; +} + #ifdef CONFIG_IA32_EMULATION #include static inline void reload_segments(struct sigcontext_32 *sc) { - unsigned int cur; + u16 cur; + /* + * Reload fs and gs if they have changed in the signal + * handler. This does not handle long fs/gs base changes in + * the handler, but does not clobber them at least in the + * normal case. 
+ */ savesegment(gs, cur); - if ((sc->gs | 0x03) != cur) - load_gs_index(sc->gs | 0x03); + if (fixup_rpl(sc->gs) != cur) + load_gs_index(fixup_rpl(sc->gs)); savesegment(fs, cur); - if ((sc->fs | 0x03) != cur) - loadsegment(fs, sc->fs | 0x03); + if (fixup_rpl(sc->fs) != cur) + loadsegment(fs, fixup_rpl(sc->fs)); + savesegment(ds, cur); - if ((sc->ds | 0x03) != cur) - loadsegment(ds, sc->ds | 0x03); + if (fixup_rpl(sc->ds) != cur) + loadsegment(ds, fixup_rpl(sc->ds)); savesegment(es, cur); - if ((sc->es | 0x03) != cur) - loadsegment(es, sc->es | 0x03); + if (fixup_rpl(sc->es) != cur) + loadsegment(es, fixup_rpl(sc->es)); } #define sigset32_t compat_sigset_t @@ -105,18 +135,12 @@ static bool ia32_restore_sigcontext(struct pt_regs *regs, regs->orig_ax = -1; #ifdef CONFIG_IA32_EMULATION - /* - * Reload fs and gs if they have changed in the signal - * handler. This does not handle long fs/gs base changes in - * the handler, but does not clobber them at least in the - * normal case. - */ reload_segments(&sc); #else - loadsegment(gs, sc.gs); - regs->fs = sc.fs; - regs->es = sc.es; - regs->ds = sc.ds; + loadsegment(gs, fixup_rpl(sc.gs)); + regs->fs = fixup_rpl(sc.fs); + regs->es = fixup_rpl(sc.es); + regs->ds = fixup_rpl(sc.ds); #endif return fpu__restore_sig(compat_ptr(sc.fpstate), 1); -- cgit v1.2.3 From a4248ee16f411ac1ea7dfab228a6659b111e3d65 Mon Sep 17 00:00:00 2001 From: Max Grobecker Date: Thu, 27 Feb 2025 21:45:05 +0100 Subject: x86/cpu: Don't clear X86_FEATURE_LAHF_LM flag in init_amd_k8() on AMD when running in a virtual machine MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit When running in a virtual machine, we might see the original hardware CPU vendor string (i.e. "AuthenticAMD"), but a model and family ID set by the hypervisor. In case we run on AMD hardware and the hypervisor sets a model ID < 0x14, the LAHF CPU feature is eliminated from the list of present CPU capabilities to circumvent a bug with some BIOSes in conjunction with AMD K8 processors. Parsing the flags list from /proc/cpuinfo seems to be happening mostly in bash scripts and prebuilt Docker containers, as it does not require additional tools to be present, even though more reliable ways, like using "kcpuid", which calls the CPUID instruction instead of parsing a list, should be preferred. Scripts that use /proc/cpuinfo to determine whether the current CPU is "compliant" with defined microarchitecture levels like x86-64-v2 will falsely claim the CPU is incapable of modern CPU instructions when "lahf_lm" is missing from that flags list. This can prevent some Docker containers from starting or cause build scripts to create unoptimized binaries. Admittedly, this is more of a small inconvenience than a severe bug in the kernel, and the shoddy scripts that rely on parsing /proc/cpuinfo should be fixed instead. This patch adds an additional check to see if we're running inside a virtual machine (X86_FEATURE_HYPERVISOR is present), which, to my understanding, can't be present on a real K8 processor as it was introduced only with the later/other Athlon64 models.
Example output with the "lahf_lm" flag missing in the flags list (should be shown between "hypervisor" and "abm"): $ cat /proc/cpuinfo processor : 0 vendor_id : AuthenticAMD cpu family : 15 model : 6 model name : Common KVM processor stepping : 1 microcode : 0x1000065 cpu MHz : 2599.998 cache size : 512 KB physical id : 0 siblings : 1 core id : 0 cpu cores : 1 apicid : 0 initial apicid : 0 fpu : yes fpu_exception : yes cpuid level : 13 wp : yes flags : fpu vme de pse tsc msr pae mce cx8 apic sep mtrr pge mca cmov pat pse36 clflush mmx fxsr sse sse2 syscall nx rdtscp lm rep_good nopl cpuid extd_apicid tsc_known_freq pni pclmulqdq ssse3 fma cx16 sse4_1 sse4_2 x2apic movbe popcnt tsc_deadline_timer aes xsave avx f16c hypervisor abm 3dnowprefetch vmmcall bmi1 avx2 bmi2 xsaveopt ... while kcpuid shows the feature to be present in the CPU: # kcpuid -d | grep lahf lahf_lm - LAHF/SAHF available in 64-bit mode [ mingo: Updated the comment a bit, incorporated Boris's review feedback. ] Signed-off-by: Max Grobecker Signed-off-by: Ingo Molnar Cc: linux-kernel@vger.kernel.org Cc: Borislav Petkov --- arch/x86/kernel/cpu/amd.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/x86/kernel/cpu/amd.c b/arch/x86/kernel/cpu/amd.c index 54194f5995de..d747515ad013 100644 --- a/arch/x86/kernel/cpu/amd.c +++ b/arch/x86/kernel/cpu/amd.c @@ -632,7 +632,7 @@ static void init_amd_k8(struct cpuinfo_x86 *c) * (model = 0x14) and later actually support it. * (AMD Erratum #110, docId: 25759). */ - if (c->x86_model < 0x14 && cpu_has(c, X86_FEATURE_LAHF_LM)) { + if (c->x86_model < 0x14 && cpu_has(c, X86_FEATURE_LAHF_LM) && !cpu_has(c, X86_FEATURE_HYPERVISOR)) { clear_cpu_cap(c, X86_FEATURE_LAHF_LM); if (!rdmsrl_amd_safe(0xc001100d, &value)) { value &= ~BIT_64(32); -- cgit v1.2.3 From f034937f5af32188cd1c07865c885b2f171e17bf Mon Sep 17 00:00:00 2001 From: Brendan Jackman Date: Fri, 20 Dec 2024 15:18:31 +0000 Subject: x86/cpu: Create helper function to parse the 'clearcpuid=' boot parameter This is in preparation for a later commit that will reuse this code, to make review convenient. Factor out a helper function which does the full handling for this arg including printing info to the console. No functional change intended. Signed-off-by: Brendan Jackman Signed-off-by: Ingo Molnar Link: https://lore.kernel.org/r/20241220-force-cpu-bug-v2-1-7dc71bce742a@google.com --- arch/x86/kernel/cpu/common.c | 96 ++++++++++++++++++++++++-------------------- 1 file changed, 52 insertions(+), 44 deletions(-) diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c index 76598a93a8fa..137d3e00a5be 100644 --- a/arch/x86/kernel/cpu/common.c +++ b/arch/x86/kernel/cpu/common.c @@ -1479,56 +1479,18 @@ static void detect_nopl(void) #endif } -/* - * We parse cpu parameters early because fpu__init_system() is executed - * before parse_early_param(). 
- */ -static void __init cpu_parse_early_param(void) +static inline void parse_clearcpuid(char *arg) { - char arg[128]; - char *argptr = arg, *opt; - int arglen, taint = 0; - -#ifdef CONFIG_X86_32 - if (cmdline_find_option_bool(boot_command_line, "no387")) -#ifdef CONFIG_MATH_EMULATION - setup_clear_cpu_cap(X86_FEATURE_FPU); -#else - pr_err("Option 'no387' required CONFIG_MATH_EMULATION enabled.\n"); -#endif - - if (cmdline_find_option_bool(boot_command_line, "nofxsr")) - setup_clear_cpu_cap(X86_FEATURE_FXSR); -#endif - - if (cmdline_find_option_bool(boot_command_line, "noxsave")) - setup_clear_cpu_cap(X86_FEATURE_XSAVE); - - if (cmdline_find_option_bool(boot_command_line, "noxsaveopt")) - setup_clear_cpu_cap(X86_FEATURE_XSAVEOPT); - - if (cmdline_find_option_bool(boot_command_line, "noxsaves")) - setup_clear_cpu_cap(X86_FEATURE_XSAVES); - - if (cmdline_find_option_bool(boot_command_line, "nousershstk")) - setup_clear_cpu_cap(X86_FEATURE_USER_SHSTK); - - /* Minimize the gap between FRED is available and available but disabled. */ - arglen = cmdline_find_option(boot_command_line, "fred", arg, sizeof(arg)); - if (arglen != 2 || strncmp(arg, "on", 2)) - setup_clear_cpu_cap(X86_FEATURE_FRED); - - arglen = cmdline_find_option(boot_command_line, "clearcpuid", arg, sizeof(arg)); - if (arglen <= 0) - return; + char *opt; + int taint = 0; pr_info("Clearing CPUID bits:"); - while (argptr) { + while (arg) { bool found __maybe_unused = false; unsigned int bit; - opt = strsep(&argptr, ","); + opt = strsep(&arg, ","); /* * Handle naked numbers first for feature flags which don't @@ -1570,10 +1532,56 @@ static void __init cpu_parse_early_param(void) if (!found) pr_cont(" (unknown: %s)", opt); } - pr_cont("\n"); if (taint) add_taint(TAINT_CPU_OUT_OF_SPEC, LOCKDEP_STILL_OK); + + pr_cont("\n"); +} + + +/* + * We parse cpu parameters early because fpu__init_system() is executed + * before parse_early_param(). + */ +static void __init cpu_parse_early_param(void) +{ + char arg[128]; + int arglen; + +#ifdef CONFIG_X86_32 + if (cmdline_find_option_bool(boot_command_line, "no387")) +#ifdef CONFIG_MATH_EMULATION + setup_clear_cpu_cap(X86_FEATURE_FPU); +#else + pr_err("Option 'no387' required CONFIG_MATH_EMULATION enabled.\n"); +#endif + + if (cmdline_find_option_bool(boot_command_line, "nofxsr")) + setup_clear_cpu_cap(X86_FEATURE_FXSR); +#endif + + if (cmdline_find_option_bool(boot_command_line, "noxsave")) + setup_clear_cpu_cap(X86_FEATURE_XSAVE); + + if (cmdline_find_option_bool(boot_command_line, "noxsaveopt")) + setup_clear_cpu_cap(X86_FEATURE_XSAVEOPT); + + if (cmdline_find_option_bool(boot_command_line, "noxsaves")) + setup_clear_cpu_cap(X86_FEATURE_XSAVES); + + if (cmdline_find_option_bool(boot_command_line, "nousershstk")) + setup_clear_cpu_cap(X86_FEATURE_USER_SHSTK); + + /* Minimize the gap between FRED is available and available but disabled. */ + arglen = cmdline_find_option(boot_command_line, "fred", arg, sizeof(arg)); + if (arglen != 2 || strncmp(arg, "on", 2)) + setup_clear_cpu_cap(X86_FEATURE_FRED); + + arglen = cmdline_find_option(boot_command_line, "clearcpuid", arg, sizeof(arg)); + if (arglen <= 0) + return; + parse_clearcpuid(arg); } /* -- cgit v1.2.3 From 814165e9fd1f62332b5444d730b8d6e432328463 Mon Sep 17 00:00:00 2001 From: Brendan Jackman Date: Fri, 20 Dec 2024 15:18:32 +0000 Subject: x86/cpu: Add the 'setcpuid=' boot parameter In preparation for adding support to inject fake CPU bugs at boot-time, add a general facility to force enablement of CPU flags. 
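The facility reuses the strsep() walk factored out in the previous patch; as a standalone userspace illustration of that comma-separated parsing (the flag names here are made up):

	#define _DEFAULT_SOURCE
	#include <stdio.h>
	#include <string.h>

	/* Mirrors the loop structure of the factored-out helper: walk a
	 * comma-separated value list, handling one option per iteration. */
	static void parse_list(char *arg)
	{
		char *opt;

		while ((opt = strsep(&arg, ",")) != NULL)
			printf("flag: %s\n", opt);
	}

	int main(void)
	{
		char buf[] = "xsave,smap,123";

		parse_list(buf);
		return 0;
	}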
The flag taints the kernel and the documentation attempts to be clear that this is highly unsuitable for uses outside of kernel development and platform experimentation. The new arg is parsed just like clearcpuid, but instead of leading to setup_clear_cpu_cap() it leads to setup_force_cpu_cap(). I've tested this by booting a nested QEMU guest on an Intel host, which with setcpuid=svm will claim that it supports AMD virtualization. Signed-off-by: Brendan Jackman Signed-off-by: Ingo Molnar Link: https://lore.kernel.org/r/20241220-force-cpu-bug-v2-2-7dc71bce742a@google.com --- arch/x86/kernel/cpu/common.c | 35 ++++++++++++++++++++++++++--------- 1 file changed, 26 insertions(+), 9 deletions(-) diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c index 137d3e00a5be..ff483c9a56c3 100644 --- a/arch/x86/kernel/cpu/common.c +++ b/arch/x86/kernel/cpu/common.c @@ -1479,12 +1479,12 @@ static void detect_nopl(void) #endif } -static inline void parse_clearcpuid(char *arg) +static inline void parse_set_clear_cpuid(char *arg, bool set) { char *opt; int taint = 0; - pr_info("Clearing CPUID bits:"); + pr_info("%s CPUID bits:", set ? "Force-enabling" : "Clearing"); while (arg) { bool found __maybe_unused = false; @@ -1505,7 +1505,10 @@ static inline void parse_clearcpuid(char *arg) else pr_cont(" " X86_CAP_FMT, x86_cap_flag(bit)); - setup_clear_cpu_cap(bit); + if (set) + setup_force_cpu_cap(bit); + else + setup_clear_cpu_cap(bit); taint++; } /* @@ -1523,7 +1526,10 @@ static inline void parse_clearcpuid(char *arg) continue; pr_cont(" %s", opt); - setup_clear_cpu_cap(bit); + if (set) + setup_force_cpu_cap(bit); + else + setup_clear_cpu_cap(bit); taint++; found = true; break; @@ -1579,9 +1585,12 @@ static void __init cpu_parse_early_param(void) setup_clear_cpu_cap(X86_FEATURE_FRED); arglen = cmdline_find_option(boot_command_line, "clearcpuid", arg, sizeof(arg)); - if (arglen <= 0) - return; - parse_clearcpuid(arg); + if (arglen > 0) + parse_set_clear_cpuid(arg, false); + + arglen = cmdline_find_option(boot_command_line, "setcpuid", arg, sizeof(arg)); + if (arglen > 0) + parse_set_clear_cpuid(arg, true); } /* @@ -2013,15 +2022,23 @@ void print_cpu_info(struct cpuinfo_x86 *c) } /* - * clearcpuid= was already parsed in cpu_parse_early_param(). This dummy - * function prevents it from becoming an environment variable for init. + * clearcpuid= and setcpuid= were already parsed in cpu_parse_early_param(). + * These dummy functions prevent them from becoming environment variables for + * init. */ + static __init int setup_clearcpuid(char *arg) { return 1; } __setup("clearcpuid=", setup_clearcpuid); +static __init int setup_setcpuid(char *arg) +{ + return 1; +} +__setup("setcpuid=", setup_setcpuid); + DEFINE_PER_CPU_ALIGNED(struct pcpu_hot, pcpu_hot) = { .current_task = &init_task, .preempt_count = INIT_PREEMPT_COUNT, -- cgit v1.2.3 From ab68d2e36532806b8f86ff2f60861dbb8443f0be Mon Sep 17 00:00:00 2001 From: Brendan Jackman Date: Fri, 20 Dec 2024 15:18:33 +0000 Subject: x86/cpu: Enable modifying CPU bug flags with '{clear,set}cpuid=' Sometimes it can be very useful to run CPU vulnerability mitigations on systems where they aren't known to mitigate any real-world vulnerabilities.
This can be handy for mundane reasons like debugging HW-agnostic logic on whatever machine is to hand, but also for research reasons: while some mitigations are focused on individual vulns and uarches, others are fairly general, and it's strategically useful to have an idea how they'd perform on systems where they aren't currently needed. As evidence for this being useful, a flag specifically for Retbleed was added in: 5c9a92dec323 ("x86/bugs: Add retbleed=force"). Since CPU bugs are tracked using the same basic mechanism as features, and there are already parameters for manipulating them by hand, extend that mechanism to support bug as well as capabilities. With this patch and setcpuid=srso, a QEMU guest running on an Intel host will boot with Safe-RET enabled. Signed-off-by: Brendan Jackman Signed-off-by: Ingo Molnar Link: https://lore.kernel.org/r/20241220-force-cpu-bug-v2-3-7dc71bce742a@google.com --- arch/x86/include/asm/cpufeature.h | 1 + arch/x86/kernel/cpu/common.c | 16 ++++++++++++---- 2 files changed, 13 insertions(+), 4 deletions(-) diff --git a/arch/x86/include/asm/cpufeature.h b/arch/x86/include/asm/cpufeature.h index de1ad09fe8d7..e5fc0038c8f6 100644 --- a/arch/x86/include/asm/cpufeature.h +++ b/arch/x86/include/asm/cpufeature.h @@ -50,6 +50,7 @@ extern const char * const x86_power_flags[32]; * X86_BUG_ - NCAPINTS*32. */ extern const char * const x86_bug_flags[NBUGINTS*32]; +#define x86_bug_flag(flag) x86_bug_flags[flag] #define test_cpu_cap(c, bit) \ arch_test_bit(bit, (unsigned long *)((c)->x86_capability)) diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c index ff483c9a56c3..0f32b6ffbf04 100644 --- a/arch/x86/kernel/cpu/common.c +++ b/arch/x86/kernel/cpu/common.c @@ -1494,7 +1494,8 @@ static inline void parse_set_clear_cpuid(char *arg, bool set) /* * Handle naked numbers first for feature flags which don't - * have names. + * have names. It doesn't make sense for a bug not to have a + * name so don't handle bug flags here. */ if (!kstrtouint(opt, 10, &bit)) { if (bit < NCAPINTS * 32) { @@ -1518,11 +1519,18 @@ static inline void parse_set_clear_cpuid(char *arg, bool set) continue; } - for (bit = 0; bit < 32 * NCAPINTS; bit++) { - if (!x86_cap_flag(bit)) + for (bit = 0; bit < 32 * (NCAPINTS + NBUGINTS); bit++) { + const char *flag; + + if (bit < 32 * NCAPINTS) + flag = x86_cap_flag(bit); + else + flag = x86_bug_flag(bit - (32 * NCAPINTS)); + + if (!flag) continue; - if (strcmp(x86_cap_flag(bit), opt)) + if (strcmp(flag, opt)) continue; pr_cont(" %s", opt); -- cgit v1.2.3 From 909639aa58fe4789644104c1fd89264c57da0979 Mon Sep 17 00:00:00 2001 From: "H. Peter Anvin (Intel)" Date: Fri, 28 Feb 2025 00:23:34 -0800 Subject: x86/cpufeatures: Rename X86_CMPXCHG64 to X86_CX8 Replace X86_CMPXCHG64 with X86_CX8, as CX8 is the name of the CPUID flag, thus to make it consistent with X86_FEATURE_CX8 defined in . No functional change intended. Signed-off-by: H. 
Peter Anvin (Intel) Signed-off-by: Xin Li (Intel) Signed-off-by: Borislav Petkov (AMD) Reviewed-by: Ingo Molnar Link: https://lore.kernel.org/r/20250228082338.73859-2-xin@zytor.com --- arch/x86/Kconfig | 2 +- arch/x86/Kconfig.cpu | 4 ++-- arch/x86/include/asm/asm-prototypes.h | 2 +- arch/x86/include/asm/atomic64_32.h | 2 +- arch/x86/include/asm/cmpxchg_32.h | 2 +- arch/x86/include/asm/required-features.h | 2 +- arch/x86/lib/Makefile | 2 +- arch/x86/lib/cmpxchg8b_emu.S | 2 +- lib/atomic64_test.c | 2 +- tools/arch/x86/include/asm/required-features.h | 2 +- 10 files changed, 11 insertions(+), 11 deletions(-) diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index aa90f0355be1..017035f461cf 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -133,7 +133,7 @@ config X86 select ARCH_SUPPORTS_AUTOFDO_CLANG select ARCH_SUPPORTS_PROPELLER_CLANG if X86_64 select ARCH_USE_BUILTIN_BSWAP - select ARCH_USE_CMPXCHG_LOCKREF if X86_CMPXCHG64 + select ARCH_USE_CMPXCHG_LOCKREF if X86_CX8 select ARCH_USE_MEMTEST select ARCH_USE_QUEUED_RWLOCKS select ARCH_USE_QUEUED_SPINLOCKS diff --git a/arch/x86/Kconfig.cpu b/arch/x86/Kconfig.cpu index 8fcb8ccee44b..f8b3296fe2e1 100644 --- a/arch/x86/Kconfig.cpu +++ b/arch/x86/Kconfig.cpu @@ -299,7 +299,7 @@ config X86_HAVE_PAE def_bool y depends on MCRUSOE || MEFFICEON || MCYRIXIII || MPENTIUM4 || MPENTIUMM || MPENTIUMIII || MPENTIUMII || M686 || MVIAC7 || MATOM || X86_64 -config X86_CMPXCHG64 +config X86_CX8 def_bool y depends on X86_HAVE_PAE || M586TSC || M586MMX || MK6 || MK7 || MGEODEGX1 || MGEODE_LX @@ -313,7 +313,7 @@ config X86_MINIMUM_CPU_FAMILY int default "64" if X86_64 default "6" if X86_32 && (MPENTIUM4 || MPENTIUMM || MPENTIUMIII || MPENTIUMII || M686 || MVIAC3_2 || MVIAC7 || MEFFICEON || MATOM || MK7) - default "5" if X86_32 && X86_CMPXCHG64 + default "5" if X86_32 && X86_CX8 default "4" config X86_DEBUGCTLMSR diff --git a/arch/x86/include/asm/asm-prototypes.h b/arch/x86/include/asm/asm-prototypes.h index 3674006e3974..8d9e62725202 100644 --- a/arch/x86/include/asm/asm-prototypes.h +++ b/arch/x86/include/asm/asm-prototypes.h @@ -16,7 +16,7 @@ #include #include -#ifndef CONFIG_X86_CMPXCHG64 +#ifndef CONFIG_X86_CX8 extern void cmpxchg8b_emu(void); #endif diff --git a/arch/x86/include/asm/atomic64_32.h b/arch/x86/include/asm/atomic64_32.h index 6c6e9b9f98a4..797085ecaaa4 100644 --- a/arch/x86/include/asm/atomic64_32.h +++ b/arch/x86/include/asm/atomic64_32.h @@ -48,7 +48,7 @@ static __always_inline s64 arch_atomic64_read_nonatomic(const atomic64_t *v) ATOMIC64_EXPORT(atomic64_##sym) #endif -#ifdef CONFIG_X86_CMPXCHG64 +#ifdef CONFIG_X86_CX8 #define __alternative_atomic64(f, g, out, in...) 
\ asm volatile("call %c[func]" \ : ALT_OUTPUT_SP(out) \ diff --git a/arch/x86/include/asm/cmpxchg_32.h b/arch/x86/include/asm/cmpxchg_32.h index fd1282a783dd..c38d4ed94cb3 100644 --- a/arch/x86/include/asm/cmpxchg_32.h +++ b/arch/x86/include/asm/cmpxchg_32.h @@ -69,7 +69,7 @@ static __always_inline bool __try_cmpxchg64_local(volatile u64 *ptr, u64 *oldp, return __arch_try_cmpxchg64(ptr, oldp, new,); } -#ifdef CONFIG_X86_CMPXCHG64 +#ifdef CONFIG_X86_CX8 #define arch_cmpxchg64 __cmpxchg64 diff --git a/arch/x86/include/asm/required-features.h b/arch/x86/include/asm/required-features.h index e9187ddd3d1f..0068133cb622 100644 --- a/arch/x86/include/asm/required-features.h +++ b/arch/x86/include/asm/required-features.h @@ -23,7 +23,7 @@ # define NEED_PAE 0 #endif -#ifdef CONFIG_X86_CMPXCHG64 +#ifdef CONFIG_X86_CX8 # define NEED_CX8 (1<<(X86_FEATURE_CX8 & 31)) #else # define NEED_CX8 0 diff --git a/arch/x86/lib/Makefile b/arch/x86/lib/Makefile index 8a59c61624c2..9bbe2819881f 100644 --- a/arch/x86/lib/Makefile +++ b/arch/x86/lib/Makefile @@ -56,7 +56,7 @@ ifeq ($(CONFIG_X86_32),y) lib-y += string_32.o lib-y += memmove_32.o lib-y += cmpxchg8b_emu.o -ifneq ($(CONFIG_X86_CMPXCHG64),y) +ifneq ($(CONFIG_X86_CX8),y) lib-y += atomic64_386_32.o endif else diff --git a/arch/x86/lib/cmpxchg8b_emu.S b/arch/x86/lib/cmpxchg8b_emu.S index 1c96be769adc..d4bb24347ff8 100644 --- a/arch/x86/lib/cmpxchg8b_emu.S +++ b/arch/x86/lib/cmpxchg8b_emu.S @@ -7,7 +7,7 @@ .text -#ifndef CONFIG_X86_CMPXCHG64 +#ifndef CONFIG_X86_CX8 /* * Emulate 'cmpxchg8b (%esi)' on UP diff --git a/lib/atomic64_test.c b/lib/atomic64_test.c index 759ea1783cc5..d726068358c7 100644 --- a/lib/atomic64_test.c +++ b/lib/atomic64_test.c @@ -254,7 +254,7 @@ static __init int test_atomics_init(void) pr_info("passed for %s platform %s CX8 and %s SSE\n", #ifdef CONFIG_X86_64 "x86-64", -#elif defined(CONFIG_X86_CMPXCHG64) +#elif defined(CONFIG_X86_CX8) "i586+", #else "i386+", diff --git a/tools/arch/x86/include/asm/required-features.h b/tools/arch/x86/include/asm/required-features.h index e9187ddd3d1f..0068133cb622 100644 --- a/tools/arch/x86/include/asm/required-features.h +++ b/tools/arch/x86/include/asm/required-features.h @@ -23,7 +23,7 @@ # define NEED_PAE 0 #endif -#ifdef CONFIG_X86_CMPXCHG64 +#ifdef CONFIG_X86_CX8 # define NEED_CX8 (1<<(X86_FEATURE_CX8 & 31)) #else # define NEED_CX8 0 -- cgit v1.2.3 From 010c4a461c1dbf3fa75ddea8df018a6128b700c6 Mon Sep 17 00:00:00 2001 From: Pawan Gupta Date: Fri, 28 Feb 2025 18:35:43 -0800 Subject: x86/speculation: Simplify and make CALL_NOSPEC consistent CALL_NOSPEC macro is used to generate Spectre-v2 mitigation friendly indirect branches. At compile time the macro defaults to indirect branch, and at runtime those can be patched to thunk based mitigations. This approach is opposite of what is done for the rest of the kernel, where the compile time default is to replace indirect calls with retpoline thunk calls. Make CALL_NOSPEC consistent with the rest of the kernel, default to retpoline thunk at compile time when CONFIG_MITIGATION_RETPOLINE is enabled. 
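For reference, a sketch of the call-site shape, modeled on existing CALL_NOSPEC users; the dispatch() wrapper and its clobber list are illustrative, not part of this patch:

	/* Indirect call routed through CALL_NOSPEC; thunk_target and
	 * THUNK_TARGET() are the existing interface. The clobbers list the
	 * caller-saved registers a C function call may touch. */
	static int dispatch(int (*fn)(void))
	{
		int ret;

		asm volatile(CALL_NOSPEC
			     : "=a" (ret)
			     : THUNK_TARGET(fn)
			     : "rcx", "rdx", "rsi", "rdi",
			       "r8", "r9", "r10", "r11", "cc", "memory");
		return ret;
	}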
Signed-off-by: Pawan Gupta Signed-off-by: Ingo Molnar Cc: Andrew Cooper Cc: Peter Zijlstra Link: https://lore.kernel.org/r/20250228-call-nospec-v3-1-96599fed0f33@linux.intel.com --- arch/x86/include/asm/nospec-branch.h | 15 +++++---------- 1 file changed, 5 insertions(+), 10 deletions(-) diff --git a/arch/x86/include/asm/nospec-branch.h b/arch/x86/include/asm/nospec-branch.h index 7e8bf78c03d5..1e6b915ce956 100644 --- a/arch/x86/include/asm/nospec-branch.h +++ b/arch/x86/include/asm/nospec-branch.h @@ -424,16 +424,11 @@ static inline void call_depth_return_thunk(void) {} * Inline asm uses the %V modifier which is only in newer GCC * which is ensured when CONFIG_MITIGATION_RETPOLINE is defined. */ -# define CALL_NOSPEC \ - ALTERNATIVE_2( \ - ANNOTATE_RETPOLINE_SAFE \ - "call *%[thunk_target]\n", \ - "call __x86_indirect_thunk_%V[thunk_target]\n", \ - X86_FEATURE_RETPOLINE, \ - "lfence;\n" \ - ANNOTATE_RETPOLINE_SAFE \ - "call *%[thunk_target]\n", \ - X86_FEATURE_RETPOLINE_LFENCE) +#ifdef CONFIG_MITIGATION_RETPOLINE +#define CALL_NOSPEC "call __x86_indirect_thunk_%V[thunk_target]\n" +#else +#define CALL_NOSPEC "call *%[thunk_target]\n" +#endif # define THUNK_TARGET(addr) [thunk_target] "r" (addr) -- cgit v1.2.3 From 9af9ad85ac44cb754e526d468c3006b48db5dfd8 Mon Sep 17 00:00:00 2001 From: Pawan Gupta Date: Fri, 28 Feb 2025 18:35:58 -0800 Subject: x86/speculation: Add a conditional CS prefix to CALL_NOSPEC Retpoline mitigation for spectre-v2 uses thunks for indirect branches. To support this mitigation compilers add a CS prefix with -mindirect-branch-cs-prefix. For an indirect branch in asm, this needs to be added manually. CS prefix is already being added to indirect branches in asm files, but not in inline asm. Add CS prefix to CALL_NOSPEC for inline asm as well. There is no JMP_NOSPEC for inline asm. Reported-by: Josh Poimboeuf Signed-off-by: Pawan Gupta Signed-off-by: Ingo Molnar Cc: Andrew Cooper Cc: Peter Zijlstra Link: https://lore.kernel.org/r/20250228-call-nospec-v3-2-96599fed0f33@linux.intel.com --- arch/x86/include/asm/nospec-branch.h | 19 +++++++++++++++---- 1 file changed, 15 insertions(+), 4 deletions(-) diff --git a/arch/x86/include/asm/nospec-branch.h b/arch/x86/include/asm/nospec-branch.h index 1e6b915ce956..aee26bb8230f 100644 --- a/arch/x86/include/asm/nospec-branch.h +++ b/arch/x86/include/asm/nospec-branch.h @@ -198,9 +198,8 @@ .endm /* - * Equivalent to -mindirect-branch-cs-prefix; emit the 5 byte jmp/call - * to the retpoline thunk with a CS prefix when the register requires - * a RAX prefix byte to encode. Also see apply_retpolines(). + * Emits a conditional CS prefix that is compatible with + * -mindirect-branch-cs-prefix. */ .macro __CS_PREFIX reg:req .irp rs,r8,r9,r10,r11,r12,r13,r14,r15 @@ -420,12 +419,24 @@ static inline void call_depth_return_thunk(void) {} #ifdef CONFIG_X86_64 +/* + * Emits a conditional CS prefix that is compatible with + * -mindirect-branch-cs-prefix. + */ +#define __CS_PREFIX(reg) \ + ".irp rs,r8,r9,r10,r11,r12,r13,r14,r15\n" \ + ".ifc \\rs," reg "\n" \ + ".byte 0x2e\n" \ + ".endif\n" \ + ".endr\n" + /* * Inline asm uses the %V modifier which is only in newer GCC * which is ensured when CONFIG_MITIGATION_RETPOLINE is defined. 
*/ #ifdef CONFIG_MITIGATION_RETPOLINE -#define CALL_NOSPEC "call __x86_indirect_thunk_%V[thunk_target]\n" +#define CALL_NOSPEC __CS_PREFIX("%V[thunk_target]") \ + "call __x86_indirect_thunk_%V[thunk_target]\n" #else #define CALL_NOSPEC "call *%[thunk_target]\n" #endif -- cgit v1.2.3 From 8177c6bedb7013cf736137da586cf783922309dd Mon Sep 17 00:00:00 2001 From: "Ahmed S. Darwish" Date: Tue, 4 Mar 2025 09:51:12 +0100 Subject: x86/cacheinfo: Validate CPUID leaf 0x2 EDX output CPUID leaf 0x2 emits one-byte descriptors in its four output registers EAX, EBX, ECX, and EDX. For these descriptors to be valid, the most significant bit (MSB) of each register must be clear. The historical Git commit: 019361a20f016 ("- pre6: Intel: start to add Pentium IV specific stuff (128-byte cacheline etc)...") introduced leaf 0x2 output parsing. It only validated the MSBs of EAX, EBX, and ECX, but left EDX unchecked. Validate EDX's most-significant bit. Signed-off-by: Ahmed S. Darwish Signed-off-by: Ingo Molnar Cc: stable@vger.kernel.org Cc: "H. Peter Anvin" Cc: Linus Torvalds Link: https://lore.kernel.org/r/20250304085152.51092-2-darwi@linutronix.de --- arch/x86/kernel/cpu/cacheinfo.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/x86/kernel/cpu/cacheinfo.c b/arch/x86/kernel/cpu/cacheinfo.c index e6fa03ed9172..a6c6bccfa8b8 100644 --- a/arch/x86/kernel/cpu/cacheinfo.c +++ b/arch/x86/kernel/cpu/cacheinfo.c @@ -808,7 +808,7 @@ void init_intel_cacheinfo(struct cpuinfo_x86 *c) cpuid(2, ®s[0], ®s[1], ®s[2], ®s[3]); /* If bit 31 is set, this is an unknown format */ - for (j = 0 ; j < 3 ; j++) + for (j = 0 ; j < 4 ; j++) if (regs[j] & (1 << 31)) regs[j] = 0; -- cgit v1.2.3 From 1881148215c67151b146450fb89ec22fd92337a7 Mon Sep 17 00:00:00 2001 From: "Ahmed S. Darwish" Date: Tue, 4 Mar 2025 09:51:13 +0100 Subject: x86/cpu: Validate CPUID leaf 0x2 EDX output CPUID leaf 0x2 emits one-byte descriptors in its four output registers EAX, EBX, ECX, and EDX. For these descriptors to be valid, the most significant bit (MSB) of each register must be clear. Leaf 0x2 parsing at intel.c only validated the MSBs of EAX, EBX, and ECX, but left EDX unchecked. Validate EDX's most-significant bit as well. Fixes: e0ba94f14f74 ("x86/tlb_info: get last level TLB entry number of CPU") Signed-off-by: Ahmed S. Darwish Signed-off-by: Ingo Molnar Cc: stable@kernel.org Cc: "H. Peter Anvin" Cc: Linus Torvalds Link: https://lore.kernel.org/r/20250304085152.51092-3-darwi@linutronix.de --- arch/x86/kernel/cpu/intel.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/x86/kernel/cpu/intel.c b/arch/x86/kernel/cpu/intel.c index 3dce22f00dc3..2a3716afee63 100644 --- a/arch/x86/kernel/cpu/intel.c +++ b/arch/x86/kernel/cpu/intel.c @@ -799,7 +799,7 @@ static void intel_detect_tlb(struct cpuinfo_x86 *c) cpuid(2, ®s[0], ®s[1], ®s[2], ®s[3]); /* If bit 31 is set, this is an unknown format */ - for (j = 0 ; j < 3 ; j++) + for (j = 0 ; j < 4 ; j++) if (regs[j] & (1 << 31)) regs[j] = 0; -- cgit v1.2.3 From f6bdaab79ee4228a143ee1b4cb80416d6ffc0c63 Mon Sep 17 00:00:00 2001 From: "Ahmed S. Darwish" Date: Tue, 4 Mar 2025 09:51:14 +0100 Subject: x86/cpu: Properly parse CPUID leaf 0x2 TLB descriptor 0x63 CPUID leaf 0x2's one-byte TLB descriptors report the number of entries for specific TLB types, among other properties. Typically, each emitted descriptor implies the same number of entries for its respective TLB type(s). 
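As an illustration of how these descriptors are enumerated, a minimal userspace sketch that dumps leaf 0x2's descriptor bytes while honoring the bit-31 validity rule (illustrative only, not kernel code):

	#include <stdio.h>
	#include <cpuid.h>

	int main(void)
	{
		unsigned int regs[4];

		__cpuid(2, regs[0], regs[1], regs[2], regs[3]);
		regs[0] &= ~0xffU;	/* AL is an iteration count, not a descriptor */

		for (int i = 0; i < 4; i++) {
			if (regs[i] & (1U << 31))	/* MSB set: no valid descriptors */
				continue;
			for (int b = 0; b < 4; b++) {
				unsigned char d = (regs[i] >> (8 * b)) & 0xff;

				if (d)
					printf("descriptor 0x%02x\n", d);
			}
		}
		return 0;
	}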
An emitted 0x63 descriptor is an exception: it implies 4 data TLB entries for 1GB pages and 32 data TLB entries for 2MB or 4MB pages. For the TLB descriptor parsing code, the entry count for 1GB pages is encoded in the intel_tlb_table[] mapping, but the 2MB/4MB entry count is totally ignored. Update leaf 0x2's parsing logic to account for 32 data TLB entries for 2MB/4MB pages implied by the 0x63 descriptor. Fixes: e0ba94f14f74 ("x86/tlb_info: get last level TLB entry number of CPU") Signed-off-by: Ahmed S. Darwish Signed-off-by: Ingo Molnar Cc: stable@kernel.org Cc: "H. Peter Anvin" Cc: Linus Torvalds Link: https://lore.kernel.org/r/20250304085152.51092-4-darwi@linutronix.de --- arch/x86/kernel/cpu/intel.c | 50 ++++++++++++++++++++++++++++++--------------- 1 file changed, 34 insertions(+), 16 deletions(-) diff --git a/arch/x86/kernel/cpu/intel.c b/arch/x86/kernel/cpu/intel.c index 2a3716afee63..134368a3f4b1 100644 --- a/arch/x86/kernel/cpu/intel.c +++ b/arch/x86/kernel/cpu/intel.c @@ -635,26 +635,37 @@ static unsigned int intel_size_cache(struct cpuinfo_x86 *c, unsigned int size) } #endif -#define TLB_INST_4K 0x01 -#define TLB_INST_4M 0x02 -#define TLB_INST_2M_4M 0x03 +#define TLB_INST_4K 0x01 +#define TLB_INST_4M 0x02 +#define TLB_INST_2M_4M 0x03 -#define TLB_INST_ALL 0x05 -#define TLB_INST_1G 0x06 +#define TLB_INST_ALL 0x05 +#define TLB_INST_1G 0x06 -#define TLB_DATA_4K 0x11 -#define TLB_DATA_4M 0x12 -#define TLB_DATA_2M_4M 0x13 -#define TLB_DATA_4K_4M 0x14 +#define TLB_DATA_4K 0x11 +#define TLB_DATA_4M 0x12 +#define TLB_DATA_2M_4M 0x13 +#define TLB_DATA_4K_4M 0x14 -#define TLB_DATA_1G 0x16 +#define TLB_DATA_1G 0x16 +#define TLB_DATA_1G_2M_4M 0x17 -#define TLB_DATA0_4K 0x21 -#define TLB_DATA0_4M 0x22 -#define TLB_DATA0_2M_4M 0x23 +#define TLB_DATA0_4K 0x21 +#define TLB_DATA0_4M 0x22 +#define TLB_DATA0_2M_4M 0x23 -#define STLB_4K 0x41 -#define STLB_4K_2M 0x42 +#define STLB_4K 0x41 +#define STLB_4K_2M 0x42 + +/* + * All of leaf 0x2's one-byte TLB descriptors imply the same number of + * entries for their respective TLB types. The 0x63 descriptor is an + * exception: it implies 4 dTLB entries for 1GB pages and 32 dTLB entries + * for 2MB or 4MB pages. Encode descriptor 0x63 dTLB entry count for + * 2MB/4MB pages here, as its count for dTLB 1GB pages is already in the + * intel_tlb_table[] mapping.
+ */ +#define TLB_0x63_2M_4M_ENTRIES 32 static const struct _tlb_table intel_tlb_table[] = { { 0x01, TLB_INST_4K, 32, " TLB_INST 4 KByte pages, 4-way set associative" }, @@ -676,7 +687,8 @@ static const struct _tlb_table intel_tlb_table[] = { { 0x5c, TLB_DATA_4K_4M, 128, " TLB_DATA 4 KByte and 4 MByte pages" }, { 0x5d, TLB_DATA_4K_4M, 256, " TLB_DATA 4 KByte and 4 MByte pages" }, { 0x61, TLB_INST_4K, 48, " TLB_INST 4 KByte pages, full associative" }, - { 0x63, TLB_DATA_1G, 4, " TLB_DATA 1 GByte pages, 4-way set associative" }, + { 0x63, TLB_DATA_1G_2M_4M, 4, " TLB_DATA 1 GByte pages, 4-way set associative" + " (plus 32 entries TLB_DATA 2 MByte or 4 MByte pages, not encoded here)" }, { 0x6b, TLB_DATA_4K, 256, " TLB_DATA 4 KByte pages, 8-way associative" }, { 0x6c, TLB_DATA_2M_4M, 128, " TLB_DATA 2 MByte or 4 MByte pages, 8-way associative" }, { 0x6d, TLB_DATA_1G, 16, " TLB_DATA 1 GByte pages, fully associative" }, @@ -776,6 +788,12 @@ static void intel_tlb_lookup(const unsigned char desc) if (tlb_lld_4m[ENTRIES] < intel_tlb_table[k].entries) tlb_lld_4m[ENTRIES] = intel_tlb_table[k].entries; break; + case TLB_DATA_1G_2M_4M: + if (tlb_lld_2m[ENTRIES] < TLB_0x63_2M_4M_ENTRIES) + tlb_lld_2m[ENTRIES] = TLB_0x63_2M_4M_ENTRIES; + if (tlb_lld_4m[ENTRIES] < TLB_0x63_2M_4M_ENTRIES) + tlb_lld_4m[ENTRIES] = TLB_0x63_2M_4M_ENTRIES; + fallthrough; case TLB_DATA_1G: if (tlb_lld_1g[ENTRIES] < intel_tlb_table[k].entries) tlb_lld_1g[ENTRIES] = intel_tlb_table[k].entries; -- cgit v1.2.3 From 091b768604a8df7822aade75dd5bfc5c788154ee Mon Sep 17 00:00:00 2001 From: Lukas Bulwahn Date: Mon, 3 Mar 2025 10:37:59 +0100 Subject: xen: Kconfig: Drop reference to obsolete configs MCORE2 and MK8 Commit f388f60ca904 ("x86/cpu: Drop configuration options for early 64-bit CPUs") removes the config symbols MCORE2 and MK8. With that, the references to those two config symbols in xen's x86 Kconfig are obsolete. Drop them. Fixes: f388f60ca904 ("x86/cpu: Drop configuration options for early 64-bit CPUs") Signed-off-by: Lukas Bulwahn Signed-off-by: Ingo Molnar Reviewed-by: Juergen Gross Link: https://lore.kernel.org/r/20250303093759.371445-1-lukas.bulwahn@redhat.com --- arch/x86/xen/Kconfig | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/x86/xen/Kconfig b/arch/x86/xen/Kconfig index 77e788e928cd..98d8a50d2aed 100644 --- a/arch/x86/xen/Kconfig +++ b/arch/x86/xen/Kconfig @@ -9,7 +9,7 @@ config XEN select PARAVIRT_CLOCK select X86_HV_CALLBACK_VECTOR depends on X86_64 || (X86_32 && X86_PAE) - depends on X86_64 || (X86_GENERIC || MPENTIUM4 || MCORE2 || MATOM || MK8) + depends on X86_64 || (X86_GENERIC || MPENTIUM4 || MATOM) depends on X86_LOCAL_APIC && X86_TSC help This is the Linux Xen port. Enabling this will allow the -- cgit v1.2.3 From 4e32645cd8f97a308300623f81c902747df6b97b Mon Sep 17 00:00:00 2001 From: Josh Poimboeuf Date: Sun, 2 Mar 2025 16:48:51 -0800 Subject: x86/smp: Fix mwait_play_dead() and acpi_processor_ffh_play_dead() noreturn behavior Fix some related issues (done in a single patch to avoid introducing intermediate bisect warnings): 1) The SMP version of mwait_play_dead() doesn't return, but its !SMP counterpart does. Make its calling behavior consistent by resolving the !SMP version to a BUG(). It should never be called anyway, this just enforces that at runtime and enables its callers to be marked as __noreturn. 2) While the SMP definition of mwait_play_dead() is annotated as __noreturn, the declaration isn't. Nor is it listed in tools/objtool/noreturns.h. Fix that. 
3) Similar to #1, the SMP version of acpi_processor_ffh_play_dead() doesn't return but its !SMP counterpart does. Make the !SMP version a BUG(). It should never be called. 4) acpi_processor_ffh_play_dead() doesn't return, but is lacking any __noreturn annotations. Fix that. This fixes the following objtool warnings: vmlinux.o: warning: objtool: acpi_processor_ffh_play_dead+0x67: mwait_play_dead() is missing a __noreturn annotation vmlinux.o: warning: objtool: acpi_idle_play_dead+0x3c: acpi_processor_ffh_play_dead() is missing a __noreturn annotation Fixes: a7dd183f0b38 ("x86/smp: Allow calling mwait_play_dead with an arbitrary hint") Fixes: 541ddf31e300 ("ACPI/processor_idle: Add FFH state handling") Reported-by: Paul E. McKenney Signed-off-by: Josh Poimboeuf Signed-off-by: Ingo Molnar Tested-by: Paul E. McKenney Link: https://lore.kernel.org/r/e885c6fa9e96a61471b33e48c2162d28b15b14c5.1740962711.git.jpoimboe@kernel.org --- arch/x86/include/asm/smp.h | 4 ++-- arch/x86/kernel/acpi/cstate.c | 2 +- include/acpi/processor.h | 6 +++--- tools/objtool/noreturns.h | 2 ++ 4 files changed, 8 insertions(+), 6 deletions(-) diff --git a/arch/x86/include/asm/smp.h b/arch/x86/include/asm/smp.h index 80f8bfd83fc7..1d3b11eba084 100644 --- a/arch/x86/include/asm/smp.h +++ b/arch/x86/include/asm/smp.h @@ -114,7 +114,7 @@ void wbinvd_on_cpu(int cpu); int wbinvd_on_all_cpus(void); void smp_kick_mwait_play_dead(void); -void mwait_play_dead(unsigned int eax_hint); +void __noreturn mwait_play_dead(unsigned int eax_hint); void native_smp_send_reschedule(int cpu); void native_send_call_func_ipi(const struct cpumask *mask); @@ -166,7 +166,7 @@ static inline struct cpumask *cpu_llc_shared_mask(int cpu) return (struct cpumask *)cpumask_of(0); } -static inline void mwait_play_dead(unsigned int eax_hint) { } +static inline void __noreturn mwait_play_dead(unsigned int eax_hint) { BUG(); } #endif /* CONFIG_SMP */ #ifdef CONFIG_DEBUG_NMI_SELFTEST diff --git a/arch/x86/kernel/acpi/cstate.c b/arch/x86/kernel/acpi/cstate.c index 86c87c01d23d..d25584255ab8 100644 --- a/arch/x86/kernel/acpi/cstate.c +++ b/arch/x86/kernel/acpi/cstate.c @@ -206,7 +206,7 @@ int acpi_processor_ffh_cstate_probe(unsigned int cpu, } EXPORT_SYMBOL_GPL(acpi_processor_ffh_cstate_probe); -void acpi_processor_ffh_play_dead(struct acpi_processor_cx *cx) +void __noreturn acpi_processor_ffh_play_dead(struct acpi_processor_cx *cx) { unsigned int cpu = smp_processor_id(); struct cstate_entry *percpu_entry; diff --git a/include/acpi/processor.h b/include/acpi/processor.h index 63a37e72b721..d0eccbd920e5 100644 --- a/include/acpi/processor.h +++ b/include/acpi/processor.h @@ -280,7 +280,7 @@ int acpi_processor_ffh_cstate_probe(unsigned int cpu, struct acpi_processor_cx *cx, struct acpi_power_register *reg); void acpi_processor_ffh_cstate_enter(struct acpi_processor_cx *cstate); -void acpi_processor_ffh_play_dead(struct acpi_processor_cx *cx); +void __noreturn acpi_processor_ffh_play_dead(struct acpi_processor_cx *cx); #else static inline void acpi_processor_power_init_bm_check(struct acpi_processor_flags @@ -301,9 +301,9 @@ static inline void acpi_processor_ffh_cstate_enter(struct acpi_processor_cx { return; } -static inline void acpi_processor_ffh_play_dead(struct acpi_processor_cx *cx) +static inline void __noreturn acpi_processor_ffh_play_dead(struct acpi_processor_cx *cx) { - return; + BUG(); } #endif diff --git a/tools/objtool/noreturns.h b/tools/objtool/noreturns.h index b2174894f9f7..5a4aec4c4639 100644 --- a/tools/objtool/noreturns.h +++ 
b/tools/objtool/noreturns.h @@ -16,6 +16,7 @@ NORETURN(__tdx_hypercall_failed) NORETURN(__ubsan_handle_builtin_unreachable) NORETURN(__x64_sys_exit) NORETURN(__x64_sys_exit_group) +NORETURN(acpi_processor_ffh_play_dead) NORETURN(arch_cpu_idle_dead) NORETURN(bch2_trans_in_restart_error) NORETURN(bch2_trans_restart_error) @@ -34,6 +35,7 @@ NORETURN(kunit_try_catch_throw) NORETURN(machine_real_restart) NORETURN(make_task_dead) NORETURN(mpt_halt_firmware) +NORETURN(mwait_play_dead) NORETURN(nmi_panic_self_stop) NORETURN(panic) NORETURN(panic_smp_self_stop) -- cgit v1.2.3 From cfceff8526a426948b53445c02bcb98453c7330d Mon Sep 17 00:00:00 2001 From: Pawan Gupta Date: Fri, 28 Feb 2025 18:35:43 -0800 Subject: x86/speculation: Simplify and make CALL_NOSPEC consistent CALL_NOSPEC macro is used to generate Spectre-v2 mitigation friendly indirect branches. At compile time the macro defaults to indirect branch, and at runtime those can be patched to thunk based mitigations. This approach is opposite of what is done for the rest of the kernel, where the compile time default is to replace indirect calls with retpoline thunk calls. Make CALL_NOSPEC consistent with the rest of the kernel, default to retpoline thunk at compile time when CONFIG_MITIGATION_RETPOLINE is enabled. Signed-off-by: Pawan Gupta Signed-off-by: Ingo Molnar Cc: Andrew Cooper Cc: Peter Zijlstra Link: https://lore.kernel.org/r/20250228-call-nospec-v3-1-96599fed0f33@linux.intel.com --- arch/x86/include/asm/nospec-branch.h | 15 +++++---------- 1 file changed, 5 insertions(+), 10 deletions(-) diff --git a/arch/x86/include/asm/nospec-branch.h b/arch/x86/include/asm/nospec-branch.h index 7e8bf78c03d5..1e6b915ce956 100644 --- a/arch/x86/include/asm/nospec-branch.h +++ b/arch/x86/include/asm/nospec-branch.h @@ -424,16 +424,11 @@ static inline void call_depth_return_thunk(void) {} * Inline asm uses the %V modifier which is only in newer GCC * which is ensured when CONFIG_MITIGATION_RETPOLINE is defined. */ -# define CALL_NOSPEC \ - ALTERNATIVE_2( \ - ANNOTATE_RETPOLINE_SAFE \ - "call *%[thunk_target]\n", \ - "call __x86_indirect_thunk_%V[thunk_target]\n", \ - X86_FEATURE_RETPOLINE, \ - "lfence;\n" \ - ANNOTATE_RETPOLINE_SAFE \ - "call *%[thunk_target]\n", \ - X86_FEATURE_RETPOLINE_LFENCE) +#ifdef CONFIG_MITIGATION_RETPOLINE +#define CALL_NOSPEC "call __x86_indirect_thunk_%V[thunk_target]\n" +#else +#define CALL_NOSPEC "call *%[thunk_target]\n" +#endif # define THUNK_TARGET(addr) [thunk_target] "r" (addr) -- cgit v1.2.3 From 052040e34c08428a5a388b85787e8531970c0c67 Mon Sep 17 00:00:00 2001 From: Pawan Gupta Date: Fri, 28 Feb 2025 18:35:58 -0800 Subject: x86/speculation: Add a conditional CS prefix to CALL_NOSPEC Retpoline mitigation for spectre-v2 uses thunks for indirect branches. To support this mitigation compilers add a CS prefix with -mindirect-branch-cs-prefix. For an indirect branch in asm, this needs to be added manually. CS prefix is already being added to indirect branches in asm files, but not in inline asm. Add CS prefix to CALL_NOSPEC for inline asm as well. There is no JMP_NOSPEC for inline asm. 
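As a standalone illustration of the .irp/.ifc trick used by __CS_PREFIX (a minimal user-space sketch, assuming GNU as on x86-64; the labels prefix_rax and prefix_r11 are made up for this example, and only the conditional 0x2e emission is modeled, not the thunk call itself):

#include <stdio.h>

/* Emit the conditional CS prefix into two byte arrays: once for a
 * register that needs no REX prefix byte (rax), once for one that
 * does (r11). A trailing 0x90 filler marks the end of each array. */
asm(
"	.data\n"
"	.globl prefix_rax, prefix_r11\n"
"prefix_rax:\n"
"	.irp rs,r8,r9,r10,r11,r12,r13,r14,r15\n"
"	.ifc \\rs,rax\n"
"	.byte 0x2e\n"
"	.endif\n"
"	.endr\n"
"	.byte 0x90\n"
"prefix_r11:\n"
"	.irp rs,r8,r9,r10,r11,r12,r13,r14,r15\n"
"	.ifc \\rs,r11\n"
"	.byte 0x2e\n"
"	.endif\n"
"	.endr\n"
"	.byte 0x90\n"
"	.text\n"
);

extern const unsigned char prefix_rax[], prefix_r11[];

int main(void)
{
	printf("rax: 0x%02x\n", prefix_rax[0]);	/* 0x90: no prefix emitted */
	printf("r11: 0x%02x\n", prefix_r11[0]);	/* 0x2e: CS prefix emitted */
	return 0;
}

This is the same conditional byte the macro above prepends to the "call __x86_indirect_thunk_%V[thunk_target]" instruction whenever the target register is one of r8..r15.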
Reported-by: Josh Poimboeuf Signed-off-by: Pawan Gupta Signed-off-by: Ingo Molnar Cc: Andrew Cooper Cc: Peter Zijlstra Link: https://lore.kernel.org/r/20250228-call-nospec-v3-2-96599fed0f33@linux.intel.com --- arch/x86/include/asm/nospec-branch.h | 19 +++++++++++++++---- 1 file changed, 15 insertions(+), 4 deletions(-) diff --git a/arch/x86/include/asm/nospec-branch.h b/arch/x86/include/asm/nospec-branch.h index 1e6b915ce956..aee26bb8230f 100644 --- a/arch/x86/include/asm/nospec-branch.h +++ b/arch/x86/include/asm/nospec-branch.h @@ -198,9 +198,8 @@ .endm /* - * Equivalent to -mindirect-branch-cs-prefix; emit the 5 byte jmp/call - * to the retpoline thunk with a CS prefix when the register requires - * a RAX prefix byte to encode. Also see apply_retpolines(). + * Emits a conditional CS prefix that is compatible with + * -mindirect-branch-cs-prefix. */ .macro __CS_PREFIX reg:req .irp rs,r8,r9,r10,r11,r12,r13,r14,r15 @@ -420,12 +419,24 @@ static inline void call_depth_return_thunk(void) {} #ifdef CONFIG_X86_64 +/* + * Emits a conditional CS prefix that is compatible with + * -mindirect-branch-cs-prefix. + */ +#define __CS_PREFIX(reg) \ + ".irp rs,r8,r9,r10,r11,r12,r13,r14,r15\n" \ + ".ifc \\rs," reg "\n" \ + ".byte 0x2e\n" \ + ".endif\n" \ + ".endr\n" + /* * Inline asm uses the %V modifier which is only in newer GCC * which is ensured when CONFIG_MITIGATION_RETPOLINE is defined. */ #ifdef CONFIG_MITIGATION_RETPOLINE -#define CALL_NOSPEC "call __x86_indirect_thunk_%V[thunk_target]\n" +#define CALL_NOSPEC __CS_PREFIX("%V[thunk_target]") \ + "call __x86_indirect_thunk_%V[thunk_target]\n" #else #define CALL_NOSPEC "call *%[thunk_target]\n" #endif -- cgit v1.2.3 From 27c3b452c1a554483ac692702639c826602d1089 Mon Sep 17 00:00:00 2001 From: Brendan Jackman Date: Mon, 3 Mar 2025 15:45:37 +0000 Subject: x86/cpu: Remove unnecessary macro indirection related to CPU feature names These macros used to abstract over CONFIG_X86_FEATURE_NAMES, but that was removed in: 7583e8fbdc49 ("x86/cpu: Remove X86_FEATURE_NAMES") Now they are just an unnecessary indirection, remove them. 
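The removed numeric helpers simply split a feature-bit number into a 32-bit word index and a bit position; a minimal user-space sketch of the "%d:%d" split that this patch open-codes (the bit value chosen is illustrative):

#include <stdio.h>

int main(void)
{
	/* A hypothetical feature bit: word 4, bit 25 of the capability bitmap. */
	unsigned int bit = 4 * 32 + 25;

	/* The split the kernel now open-codes: (bit >> 5) selects the
	 * 32-bit capability word, (bit & 31) the bit inside that word. */
	printf("flag %u prints as %u:%u\n", bit, bit >> 5, bit & 31);
	return 0;
}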
Signed-off-by: Brendan Jackman Signed-off-by: Ingo Molnar Acked-by: Borislav Petkov (AMD) Link: https://lore.kernel.org/r/20250303-setcpuid-taint-louder-v1-1-8d255032cb4c@google.com --- arch/x86/include/asm/cpufeature.h | 5 ----- arch/x86/kernel/cpu/common.c | 12 ++++++------ 2 files changed, 6 insertions(+), 11 deletions(-) diff --git a/arch/x86/include/asm/cpufeature.h b/arch/x86/include/asm/cpufeature.h index e5fc0038c8f6..e955da397e0e 100644 --- a/arch/x86/include/asm/cpufeature.h +++ b/arch/x86/include/asm/cpufeature.h @@ -37,13 +37,8 @@ enum cpuid_leafs NR_CPUID_WORDS, }; -#define X86_CAP_FMT_NUM "%d:%d" -#define x86_cap_flag_num(flag) ((flag) >> 5), ((flag) & 31) - extern const char * const x86_cap_flags[NCAPINTS*32]; extern const char * const x86_power_flags[32]; -#define X86_CAP_FMT "%s" -#define x86_cap_flag(flag) x86_cap_flags[flag] /* * In order to save room, we index into this array by doing diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c index 0f32b6ffbf04..b5fdaa6fd4c4 100644 --- a/arch/x86/kernel/cpu/common.c +++ b/arch/x86/kernel/cpu/common.c @@ -667,8 +667,8 @@ static void filter_cpuid_features(struct cpuinfo_x86 *c, bool warn) if (!warn) continue; - pr_warn("CPU: CPU feature " X86_CAP_FMT " disabled, no CPUID level 0x%x\n", - x86_cap_flag(df->feature), df->level); + pr_warn("CPU: CPU feature %s disabled, no CPUID level 0x%x\n", + x86_cap_flags[df->feature], df->level); } } @@ -1502,9 +1502,9 @@ static inline void parse_set_clear_cpuid(char *arg, bool set) /* empty-string, i.e., ""-defined feature flags */ if (!x86_cap_flags[bit]) - pr_cont(" " X86_CAP_FMT_NUM, x86_cap_flag_num(bit)); + pr_cont(" %d:%d", bit >> 5, bit & 31); else - pr_cont(" " X86_CAP_FMT, x86_cap_flag(bit)); + pr_cont(" %s", x86_cap_flags[bit]); if (set) setup_force_cpu_cap(bit); @@ -1523,9 +1523,9 @@ static inline void parse_set_clear_cpuid(char *arg, bool set) const char *flag; if (bit < 32 * NCAPINTS) - flag = x86_cap_flag(bit); + flag = x86_cap_flags[bit]; else - flag = x86_bug_flag(bit - (32 * NCAPINTS)); + flag = x86_bug_flags[bit - (32 * NCAPINTS)]; if (!flag) continue; -- cgit v1.2.3 From 681955761bf6845241c6d33e2fb222f5e92c8b89 Mon Sep 17 00:00:00 2001 From: Brendan Jackman Date: Mon, 3 Mar 2025 15:45:38 +0000 Subject: x86/cpu: Warn louder about the {set,clear}cpuid boot parameters Commit 814165e9fd1f6 ("x86/cpu: Add the 'setcpuid=' boot parameter") recently expanded the user's ability to break their system horribly by overriding effective CPU flags. This was reflected with updates to the documentation to try and make people aware that this is dangerous. To further reduce the risk of users mistaking this for a "real feature", and try to help them figure out why their kernel is tainted if they do use it: - Upgrade the existing printk to pr_warn, to help ensure kernel logs reflect what changes are in effect. - Print an extra warning that tries to be as dramatic as possible, while also highlighting the fact that it tainted the kernel. 
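The resulting control flow can be sketched in isolation (a hedged stand-in: parse_flags() and the messages below are illustrative, not the kernel's actual helpers):

#include <stdbool.h>
#include <stdio.h>

/* Stand-in for parse_set_clear_cpuid(): returns true if it changed,
 * and therefore tainted, anything. */
static bool parse_flags(const char *arg, bool set)
{
	int taint = 0;

	if (arg && *arg) {
		printf("%s: forcing CPU flag: %s\n",
		       set ? "setcpuid" : "clearcpuid", arg);
		taint++;
	}
	return taint;
}

int main(void)
{
	bool cpuid_taint = false;

	cpuid_taint |= parse_flags("mds", false);
	cpuid_taint |= parse_flags(NULL, true);	/* nothing given: no taint */

	/* One loud warning at the end, no matter how many flags tainted. */
	if (cpuid_taint)
		printf("!!! TESTING ONLY, may break things horribly. Tainting kernel.\n");
	return 0;
}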
Suggested-by: Ingo Molnar Signed-off-by: Brendan Jackman Signed-off-by: Ingo Molnar Acked-by: Borislav Petkov (AMD) Link: https://lore.kernel.org/r/20250303-setcpuid-taint-louder-v1-2-8d255032cb4c@google.com --- arch/x86/kernel/cpu/common.c | 19 ++++++++++++------- 1 file changed, 12 insertions(+), 7 deletions(-) diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c index b5fdaa6fd4c4..c1ced31f976d 100644 --- a/arch/x86/kernel/cpu/common.c +++ b/arch/x86/kernel/cpu/common.c @@ -1479,12 +1479,12 @@ static void detect_nopl(void) #endif } -static inline void parse_set_clear_cpuid(char *arg, bool set) +static inline bool parse_set_clear_cpuid(char *arg, bool set) { char *opt; int taint = 0; - pr_info("%s CPUID bits:", set ? "Force-enabling" : "Clearing"); + pr_warn("%s CPUID bits:", set ? "Force-enabling" : "Clearing"); while (arg) { bool found __maybe_unused = false; @@ -1547,10 +1547,9 @@ static inline void parse_set_clear_cpuid(char *arg, bool set) pr_cont(" (unknown: %s)", opt); } - if (taint) - add_taint(TAINT_CPU_OUT_OF_SPEC, LOCKDEP_STILL_OK); - pr_cont("\n"); + + return taint; } @@ -1560,6 +1559,7 @@ static inline void parse_set_clear_cpuid(char *arg, bool set) */ static void __init cpu_parse_early_param(void) { + bool cpuid_taint = false; char arg[128]; int arglen; @@ -1594,11 +1594,16 @@ static void __init cpu_parse_early_param(void) arglen = cmdline_find_option(boot_command_line, "clearcpuid", arg, sizeof(arg)); if (arglen > 0) - parse_set_clear_cpuid(arg, false); + cpuid_taint |= parse_set_clear_cpuid(arg, false); arglen = cmdline_find_option(boot_command_line, "setcpuid", arg, sizeof(arg)); if (arglen > 0) - parse_set_clear_cpuid(arg, true); + cpuid_taint |= parse_set_clear_cpuid(arg, true); + + if (cpuid_taint) { + pr_warn("!!! setcpuid=/clearcpuid= in use, this is for TESTING ONLY, may break things horribly. Tainting kernel.\n"); + add_taint(TAINT_CPU_OUT_OF_SPEC, LOCKDEP_STILL_OK); + } } /* -- cgit v1.2.3 From d0ba9bcf001c7907e4755b0e498f5ff9d1a228ef Mon Sep 17 00:00:00 2001 From: Brendan Jackman Date: Mon, 3 Mar 2025 15:45:39 +0000 Subject: x86/cpu: Log CPU flag cmdline hacks more verbosely Since using these options is very dangerous, make details as visible as possible: - Instead of a single message for each of the cmdline options, print a separate pr_warn() for each individual flag. - Say explicitly whether the flag is a "feature" or a "bug". Suggested-by: Peter Zijlstra Signed-off-by: Brendan Jackman Signed-off-by: Ingo Molnar Acked-by: Borislav Petkov (AMD) Link: https://lore.kernel.org/r/20250303-setcpuid-taint-louder-v1-3-8d255032cb4c@google.com --- arch/x86/kernel/cpu/common.c | 39 +++++++++++++++++++++++---------------- 1 file changed, 23 insertions(+), 16 deletions(-) diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c index c1ced31f976d..8eba9ca9c216 100644 --- a/arch/x86/kernel/cpu/common.c +++ b/arch/x86/kernel/cpu/common.c @@ -1484,8 +1484,6 @@ static inline bool parse_set_clear_cpuid(char *arg, bool set) char *opt; int taint = 0; - pr_warn("%s CPUID bits:", set ? 
"Force-enabling" : "Clearing"); - while (arg) { bool found __maybe_unused = false; unsigned int bit; @@ -1500,16 +1498,19 @@ static inline bool parse_set_clear_cpuid(char *arg, bool set) if (!kstrtouint(opt, 10, &bit)) { if (bit < NCAPINTS * 32) { + if (set) { + pr_warn("setcpuid: force-enabling CPU feature flag:"); + setup_force_cpu_cap(bit); + } else { + pr_warn("clearcpuid: force-disabling CPU feature flag:"); + setup_clear_cpu_cap(bit); + } /* empty-string, i.e., ""-defined feature flags */ if (!x86_cap_flags[bit]) - pr_cont(" %d:%d", bit >> 5, bit & 31); + pr_cont(" %d:%d\n", bit >> 5, bit & 31); else - pr_cont(" %s", x86_cap_flags[bit]); + pr_cont(" %s\n", x86_cap_flags[bit]); - if (set) - setup_force_cpu_cap(bit); - else - setup_clear_cpu_cap(bit); taint++; } /* @@ -1521,11 +1522,15 @@ static inline bool parse_set_clear_cpuid(char *arg, bool set) for (bit = 0; bit < 32 * (NCAPINTS + NBUGINTS); bit++) { const char *flag; + const char *kind; - if (bit < 32 * NCAPINTS) + if (bit < 32 * NCAPINTS) { flag = x86_cap_flags[bit]; - else + kind = "feature"; + } else { + kind = "bug"; flag = x86_bug_flags[bit - (32 * NCAPINTS)]; + } if (!flag) continue; @@ -1533,22 +1538,24 @@ static inline bool parse_set_clear_cpuid(char *arg, bool set) if (strcmp(flag, opt)) continue; - pr_cont(" %s", opt); - if (set) + if (set) { + pr_warn("setcpuid: force-enabling CPU %s flag: %s\n", + kind, flag); setup_force_cpu_cap(bit); - else + } else { + pr_warn("clearcpuid: force-disabling CPU %s flag: %s\n", + kind, flag); setup_clear_cpu_cap(bit); + } taint++; found = true; break; } if (!found) - pr_cont(" (unknown: %s)", opt); + pr_warn("%s: unknown CPU flag: %s", set ? "setcpuid" : "clearcpuid", opt); } - pr_cont("\n"); - return taint; } -- cgit v1.2.3 From 97c7d5723537de08e076892e07d6089ae9777965 Mon Sep 17 00:00:00 2001 From: "Ahmed S. Darwish" Date: Tue, 4 Mar 2025 09:51:15 +0100 Subject: x86/cpuid: Include in uses static_assert() at multiple locations but it does not include the CPP macro's definition at linux/build_bug.h. Include the needed header to make self-sufficient. This gets triggered when cpuid.h is included in new C files, which is to be done in further commits. Fixes: 43d86e3cd9a7 ("x86/cpu: Provide cpuid_read() et al.") Signed-off-by: Ahmed S. Darwish Signed-off-by: Ingo Molnar Link: https://lore.kernel.org/r/20250304085152.51092-5-darwi@linutronix.de --- arch/x86/include/asm/cpuid.h | 1 + 1 file changed, 1 insertion(+) diff --git a/arch/x86/include/asm/cpuid.h b/arch/x86/include/asm/cpuid.h index b2b9b4ef3dae..a92e4b08820a 100644 --- a/arch/x86/include/asm/cpuid.h +++ b/arch/x86/include/asm/cpuid.h @@ -6,6 +6,7 @@ #ifndef _ASM_X86_CPUID_H #define _ASM_X86_CPUID_H +#include #include #include -- cgit v1.2.3 From dec7fdc0b79c2ae0a537343b17f5ba1c6c47e1ca Mon Sep 17 00:00:00 2001 From: "Ahmed S. Darwish" Date: Tue, 4 Mar 2025 09:51:16 +0100 Subject: x86/cpu: Remove unnecessary headers and reorder the rest Remove the headers at intel.c that are no longer required. Alphabetically reorder what remains since more headers will be included in further commits. Suggested-by: Thomas Gleixner Signed-off-by: Ahmed S. 
Darwish Signed-off-by: Ingo Molnar Link: https://lore.kernel.org/r/20250304085152.51092-6-darwi@linutronix.de --- arch/x86/kernel/cpu/intel.c | 36 +++++++++++++----------------------- 1 file changed, 13 insertions(+), 23 deletions(-) diff --git a/arch/x86/kernel/cpu/intel.c b/arch/x86/kernel/cpu/intel.c index c5d833f5bffb..60b58b1a0c69 100644 --- a/arch/x86/kernel/cpu/intel.c +++ b/arch/x86/kernel/cpu/intel.c @@ -1,40 +1,30 @@ // SPDX-License-Identifier: GPL-2.0 -#include -#include -#include #include -#include -#include -#include -#include #include -#include +#include +#include +#include + +#ifdef CONFIG_X86_64 +#include +#endif -#include -#include #include +#include +#include #include +#include #include #include -#include -#include -#include -#include +#include #include +#include #include - -#ifdef CONFIG_X86_64 -#include -#endif +#include #include "cpu.h" -#ifdef CONFIG_X86_LOCAL_APIC -#include -#include -#endif - /* * Processors which have self-snooping capability can handle conflicting * memory type across CPUs by snooping its own cache. However, there exists -- cgit v1.2.3 From cb5f4c76b2a9314c35e00c67c98ccd03542c2634 Mon Sep 17 00:00:00 2001 From: "Ahmed S. Darwish" Date: Tue, 4 Mar 2025 09:51:17 +0100 Subject: x86/cpu: Use max() for CPUID leaf 0x2 TLB descriptors parsing The conditional statement "if (x < y) { x = y; }" appears 22 times at the Intel leaf 0x2 descriptors parsing logic. Replace each of such instances with a max() expression to simplify the code. Signed-off-by: Ahmed S. Darwish Signed-off-by: Ingo Molnar Link: https://lore.kernel.org/r/20250304085152.51092-7-darwi@linutronix.de --- arch/x86/kernel/cpu/intel.c | 76 +++++++++++++++++---------------------------- 1 file changed, 28 insertions(+), 48 deletions(-) diff --git a/arch/x86/kernel/cpu/intel.c b/arch/x86/kernel/cpu/intel.c index 60b58b1a0c69..42a57b85f93b 100644 --- a/arch/x86/kernel/cpu/intel.c +++ b/arch/x86/kernel/cpu/intel.c @@ -3,6 +3,7 @@ #include #include #include +#include #include #include @@ -700,7 +701,9 @@ static const struct _tlb_table intel_tlb_table[] = { static void intel_tlb_lookup(const unsigned char desc) { + unsigned int entries; unsigned char k; + if (desc == 0) return; @@ -712,81 +715,58 @@ static void intel_tlb_lookup(const unsigned char desc) if (intel_tlb_table[k].tlb_type == 0) return; + entries = intel_tlb_table[k].entries; switch (intel_tlb_table[k].tlb_type) { case STLB_4K: - if (tlb_lli_4k[ENTRIES] < intel_tlb_table[k].entries) - tlb_lli_4k[ENTRIES] = intel_tlb_table[k].entries; - if (tlb_lld_4k[ENTRIES] < intel_tlb_table[k].entries) - tlb_lld_4k[ENTRIES] = intel_tlb_table[k].entries; + tlb_lli_4k[ENTRIES] = max(tlb_lli_4k[ENTRIES], entries); + tlb_lld_4k[ENTRIES] = max(tlb_lld_4k[ENTRIES], entries); break; case STLB_4K_2M: - if (tlb_lli_4k[ENTRIES] < intel_tlb_table[k].entries) - tlb_lli_4k[ENTRIES] = intel_tlb_table[k].entries; - if (tlb_lld_4k[ENTRIES] < intel_tlb_table[k].entries) - tlb_lld_4k[ENTRIES] = intel_tlb_table[k].entries; - if (tlb_lli_2m[ENTRIES] < intel_tlb_table[k].entries) - tlb_lli_2m[ENTRIES] = intel_tlb_table[k].entries; - if (tlb_lld_2m[ENTRIES] < intel_tlb_table[k].entries) - tlb_lld_2m[ENTRIES] = intel_tlb_table[k].entries; - if (tlb_lli_4m[ENTRIES] < intel_tlb_table[k].entries) - tlb_lli_4m[ENTRIES] = intel_tlb_table[k].entries; - if (tlb_lld_4m[ENTRIES] < intel_tlb_table[k].entries) - tlb_lld_4m[ENTRIES] = intel_tlb_table[k].entries; + tlb_lli_4k[ENTRIES] = max(tlb_lli_4k[ENTRIES], entries); + tlb_lld_4k[ENTRIES] = max(tlb_lld_4k[ENTRIES], entries); + 
tlb_lli_2m[ENTRIES] = max(tlb_lli_2m[ENTRIES], entries); + tlb_lld_2m[ENTRIES] = max(tlb_lld_2m[ENTRIES], entries); + tlb_lli_4m[ENTRIES] = max(tlb_lli_4m[ENTRIES], entries); + tlb_lld_4m[ENTRIES] = max(tlb_lld_4m[ENTRIES], entries); break; case TLB_INST_ALL: - if (tlb_lli_4k[ENTRIES] < intel_tlb_table[k].entries) - tlb_lli_4k[ENTRIES] = intel_tlb_table[k].entries; - if (tlb_lli_2m[ENTRIES] < intel_tlb_table[k].entries) - tlb_lli_2m[ENTRIES] = intel_tlb_table[k].entries; - if (tlb_lli_4m[ENTRIES] < intel_tlb_table[k].entries) - tlb_lli_4m[ENTRIES] = intel_tlb_table[k].entries; + tlb_lli_4k[ENTRIES] = max(tlb_lli_4k[ENTRIES], entries); + tlb_lli_2m[ENTRIES] = max(tlb_lli_2m[ENTRIES], entries); + tlb_lli_4m[ENTRIES] = max(tlb_lli_4m[ENTRIES], entries); break; case TLB_INST_4K: - if (tlb_lli_4k[ENTRIES] < intel_tlb_table[k].entries) - tlb_lli_4k[ENTRIES] = intel_tlb_table[k].entries; + tlb_lli_4k[ENTRIES] = max(tlb_lli_4k[ENTRIES], entries); break; case TLB_INST_4M: - if (tlb_lli_4m[ENTRIES] < intel_tlb_table[k].entries) - tlb_lli_4m[ENTRIES] = intel_tlb_table[k].entries; + tlb_lli_4m[ENTRIES] = max(tlb_lli_4m[ENTRIES], entries); break; case TLB_INST_2M_4M: - if (tlb_lli_2m[ENTRIES] < intel_tlb_table[k].entries) - tlb_lli_2m[ENTRIES] = intel_tlb_table[k].entries; - if (tlb_lli_4m[ENTRIES] < intel_tlb_table[k].entries) - tlb_lli_4m[ENTRIES] = intel_tlb_table[k].entries; + tlb_lli_2m[ENTRIES] = max(tlb_lli_2m[ENTRIES], entries); + tlb_lli_4m[ENTRIES] = max(tlb_lli_4m[ENTRIES], entries); break; case TLB_DATA_4K: case TLB_DATA0_4K: - if (tlb_lld_4k[ENTRIES] < intel_tlb_table[k].entries) - tlb_lld_4k[ENTRIES] = intel_tlb_table[k].entries; + tlb_lld_4k[ENTRIES] = max(tlb_lld_4k[ENTRIES], entries); break; case TLB_DATA_4M: case TLB_DATA0_4M: - if (tlb_lld_4m[ENTRIES] < intel_tlb_table[k].entries) - tlb_lld_4m[ENTRIES] = intel_tlb_table[k].entries; + tlb_lld_4m[ENTRIES] = max(tlb_lld_4m[ENTRIES], entries); break; case TLB_DATA_2M_4M: case TLB_DATA0_2M_4M: - if (tlb_lld_2m[ENTRIES] < intel_tlb_table[k].entries) - tlb_lld_2m[ENTRIES] = intel_tlb_table[k].entries; - if (tlb_lld_4m[ENTRIES] < intel_tlb_table[k].entries) - tlb_lld_4m[ENTRIES] = intel_tlb_table[k].entries; + tlb_lld_2m[ENTRIES] = max(tlb_lld_2m[ENTRIES], entries); + tlb_lld_4m[ENTRIES] = max(tlb_lld_4m[ENTRIES], entries); break; case TLB_DATA_4K_4M: - if (tlb_lld_4k[ENTRIES] < intel_tlb_table[k].entries) - tlb_lld_4k[ENTRIES] = intel_tlb_table[k].entries; - if (tlb_lld_4m[ENTRIES] < intel_tlb_table[k].entries) - tlb_lld_4m[ENTRIES] = intel_tlb_table[k].entries; + tlb_lld_4k[ENTRIES] = max(tlb_lld_4k[ENTRIES], entries); + tlb_lld_4m[ENTRIES] = max(tlb_lld_4m[ENTRIES], entries); break; case TLB_DATA_1G_2M_4M: - if (tlb_lld_2m[ENTRIES] < TLB_0x63_2M_4M_ENTRIES) - tlb_lld_2m[ENTRIES] = TLB_0x63_2M_4M_ENTRIES; - if (tlb_lld_4m[ENTRIES] < TLB_0x63_2M_4M_ENTRIES) - tlb_lld_4m[ENTRIES] = TLB_0x63_2M_4M_ENTRIES; + tlb_lld_2m[ENTRIES] = max(tlb_lld_2m[ENTRIES], TLB_0x63_2M_4M_ENTRIES); + tlb_lld_4m[ENTRIES] = max(tlb_lld_4m[ENTRIES], TLB_0x63_2M_4M_ENTRIES); fallthrough; case TLB_DATA_1G: - if (tlb_lld_1g[ENTRIES] < intel_tlb_table[k].entries) - tlb_lld_1g[ENTRIES] = intel_tlb_table[k].entries; + tlb_lld_1g[ENTRIES] = max(tlb_lld_1g[ENTRIES], entries); break; } } -- cgit v1.2.3 From 8b7e54b542103753619a37cbb3216849a934872f Mon Sep 17 00:00:00 2001 From: "Ahmed S. 
Darwish" Date: Tue, 4 Mar 2025 09:51:18 +0100 Subject: x86/cpu: Simplify TLB entry count storage Commit: e0ba94f14f74 ("x86/tlb_info: get last level TLB entry number of CPU") introduced u16 "info" arrays for each TLB type. Since 2012 and each array stores just one type of information: the number of TLB entries for its respective TLB type. Replace such arrays with simple variables. Signed-off-by: Ahmed S. Darwish Signed-off-by: Ingo Molnar Link: https://lore.kernel.org/r/20250304085152.51092-8-darwi@linutronix.de --- arch/x86/include/asm/processor.h | 19 ++++++---------- arch/x86/kernel/cpu/amd.c | 18 +++++++-------- arch/x86/kernel/cpu/common.c | 20 ++++++++--------- arch/x86/kernel/cpu/hygon.c | 16 +++++++------- arch/x86/kernel/cpu/intel.c | 48 ++++++++++++++++++++-------------------- 5 files changed, 57 insertions(+), 64 deletions(-) diff --git a/arch/x86/include/asm/processor.h b/arch/x86/include/asm/processor.h index c0cd10182e90..0ea227fa027c 100644 --- a/arch/x86/include/asm/processor.h +++ b/arch/x86/include/asm/processor.h @@ -60,18 +60,13 @@ struct vm86; # define ARCH_MIN_MMSTRUCT_ALIGN 0 #endif -enum tlb_infos { - ENTRIES, - NR_INFO -}; - -extern u16 __read_mostly tlb_lli_4k[NR_INFO]; -extern u16 __read_mostly tlb_lli_2m[NR_INFO]; -extern u16 __read_mostly tlb_lli_4m[NR_INFO]; -extern u16 __read_mostly tlb_lld_4k[NR_INFO]; -extern u16 __read_mostly tlb_lld_2m[NR_INFO]; -extern u16 __read_mostly tlb_lld_4m[NR_INFO]; -extern u16 __read_mostly tlb_lld_1g[NR_INFO]; +extern u16 __read_mostly tlb_lli_4k; +extern u16 __read_mostly tlb_lli_2m; +extern u16 __read_mostly tlb_lli_4m; +extern u16 __read_mostly tlb_lld_4k; +extern u16 __read_mostly tlb_lld_2m; +extern u16 __read_mostly tlb_lld_4m; +extern u16 __read_mostly tlb_lld_1g; /* * CPU type and hardware bug flags. Kept separately for each CPU. 
diff --git a/arch/x86/kernel/cpu/amd.c b/arch/x86/kernel/cpu/amd.c index d747515ad013..315766440201 100644 --- a/arch/x86/kernel/cpu/amd.c +++ b/arch/x86/kernel/cpu/amd.c @@ -1105,8 +1105,8 @@ static void cpu_detect_tlb_amd(struct cpuinfo_x86 *c) cpuid(0x80000006, &eax, &ebx, &ecx, &edx); - tlb_lld_4k[ENTRIES] = (ebx >> 16) & mask; - tlb_lli_4k[ENTRIES] = ebx & mask; + tlb_lld_4k = (ebx >> 16) & mask; + tlb_lli_4k = ebx & mask; /* * K8 doesn't have 2M/4M entries in the L2 TLB so read out the L1 TLB @@ -1119,26 +1119,26 @@ static void cpu_detect_tlb_amd(struct cpuinfo_x86 *c) /* Handle DTLB 2M and 4M sizes, fall back to L1 if L2 is disabled */ if (!((eax >> 16) & mask)) - tlb_lld_2m[ENTRIES] = (cpuid_eax(0x80000005) >> 16) & 0xff; + tlb_lld_2m = (cpuid_eax(0x80000005) >> 16) & 0xff; else - tlb_lld_2m[ENTRIES] = (eax >> 16) & mask; + tlb_lld_2m = (eax >> 16) & mask; /* a 4M entry uses two 2M entries */ - tlb_lld_4m[ENTRIES] = tlb_lld_2m[ENTRIES] >> 1; + tlb_lld_4m = tlb_lld_2m >> 1; /* Handle ITLB 2M and 4M sizes, fall back to L1 if L2 is disabled */ if (!(eax & mask)) { /* Erratum 658 */ if (c->x86 == 0x15 && c->x86_model <= 0x1f) { - tlb_lli_2m[ENTRIES] = 1024; + tlb_lli_2m = 1024; } else { cpuid(0x80000005, &eax, &ebx, &ecx, &edx); - tlb_lli_2m[ENTRIES] = eax & 0xff; + tlb_lli_2m = eax & 0xff; } } else - tlb_lli_2m[ENTRIES] = eax & mask; + tlb_lli_2m = eax & mask; - tlb_lli_4m[ENTRIES] = tlb_lli_2m[ENTRIES] >> 1; + tlb_lli_4m = tlb_lli_2m >> 1; } static const struct cpu_dev amd_cpu_dev = { diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c index 8eba9ca9c216..3a1a957e0c60 100644 --- a/arch/x86/kernel/cpu/common.c +++ b/arch/x86/kernel/cpu/common.c @@ -846,13 +846,13 @@ void cpu_detect_cache_sizes(struct cpuinfo_x86 *c) c->x86_cache_size = l2size; } -u16 __read_mostly tlb_lli_4k[NR_INFO]; -u16 __read_mostly tlb_lli_2m[NR_INFO]; -u16 __read_mostly tlb_lli_4m[NR_INFO]; -u16 __read_mostly tlb_lld_4k[NR_INFO]; -u16 __read_mostly tlb_lld_2m[NR_INFO]; -u16 __read_mostly tlb_lld_4m[NR_INFO]; -u16 __read_mostly tlb_lld_1g[NR_INFO]; +u16 __read_mostly tlb_lli_4k; +u16 __read_mostly tlb_lli_2m; +u16 __read_mostly tlb_lli_4m; +u16 __read_mostly tlb_lld_4k; +u16 __read_mostly tlb_lld_2m; +u16 __read_mostly tlb_lld_4m; +u16 __read_mostly tlb_lld_1g; static void cpu_detect_tlb(struct cpuinfo_x86 *c) { @@ -860,12 +860,10 @@ static void cpu_detect_tlb(struct cpuinfo_x86 *c) this_cpu->c_detect_tlb(c); pr_info("Last level iTLB entries: 4KB %d, 2MB %d, 4MB %d\n", - tlb_lli_4k[ENTRIES], tlb_lli_2m[ENTRIES], - tlb_lli_4m[ENTRIES]); + tlb_lli_4k, tlb_lli_2m, tlb_lli_4m); pr_info("Last level dTLB entries: 4KB %d, 2MB %d, 4MB %d, 1GB %d\n", - tlb_lld_4k[ENTRIES], tlb_lld_2m[ENTRIES], - tlb_lld_4m[ENTRIES], tlb_lld_1g[ENTRIES]); + tlb_lld_4k, tlb_lld_2m, tlb_lld_4m, tlb_lld_1g); } void get_cpu_vendor(struct cpuinfo_x86 *c) diff --git a/arch/x86/kernel/cpu/hygon.c b/arch/x86/kernel/cpu/hygon.c index c5191b06f9f2..6af4a4a90a52 100644 --- a/arch/x86/kernel/cpu/hygon.c +++ b/arch/x86/kernel/cpu/hygon.c @@ -240,26 +240,26 @@ static void cpu_detect_tlb_hygon(struct cpuinfo_x86 *c) cpuid(0x80000006, &eax, &ebx, &ecx, &edx); - tlb_lld_4k[ENTRIES] = (ebx >> 16) & mask; - tlb_lli_4k[ENTRIES] = ebx & mask; + tlb_lld_4k = (ebx >> 16) & mask; + tlb_lli_4k = ebx & mask; /* Handle DTLB 2M and 4M sizes, fall back to L1 if L2 is disabled */ if (!((eax >> 16) & mask)) - tlb_lld_2m[ENTRIES] = (cpuid_eax(0x80000005) >> 16) & 0xff; + tlb_lld_2m = (cpuid_eax(0x80000005) >> 16) & 0xff; else - tlb_lld_2m[ENTRIES] = 
(eax >> 16) & mask; + tlb_lld_2m = (eax >> 16) & mask; /* a 4M entry uses two 2M entries */ - tlb_lld_4m[ENTRIES] = tlb_lld_2m[ENTRIES] >> 1; + tlb_lld_4m = tlb_lld_2m >> 1; /* Handle ITLB 2M and 4M sizes, fall back to L1 if L2 is disabled */ if (!(eax & mask)) { cpuid(0x80000005, &eax, &ebx, &ecx, &edx); - tlb_lli_2m[ENTRIES] = eax & 0xff; + tlb_lli_2m = eax & 0xff; } else - tlb_lli_2m[ENTRIES] = eax & mask; + tlb_lli_2m = eax & mask; - tlb_lli_4m[ENTRIES] = tlb_lli_2m[ENTRIES] >> 1; + tlb_lli_4m = tlb_lli_2m >> 1; } static const struct cpu_dev hygon_cpu_dev = { diff --git a/arch/x86/kernel/cpu/intel.c b/arch/x86/kernel/cpu/intel.c index 42a57b85f93b..61d3fd31baee 100644 --- a/arch/x86/kernel/cpu/intel.c +++ b/arch/x86/kernel/cpu/intel.c @@ -718,55 +718,55 @@ static void intel_tlb_lookup(const unsigned char desc) entries = intel_tlb_table[k].entries; switch (intel_tlb_table[k].tlb_type) { case STLB_4K: - tlb_lli_4k[ENTRIES] = max(tlb_lli_4k[ENTRIES], entries); - tlb_lld_4k[ENTRIES] = max(tlb_lld_4k[ENTRIES], entries); + tlb_lli_4k = max(tlb_lli_4k, entries); + tlb_lld_4k = max(tlb_lld_4k, entries); break; case STLB_4K_2M: - tlb_lli_4k[ENTRIES] = max(tlb_lli_4k[ENTRIES], entries); - tlb_lld_4k[ENTRIES] = max(tlb_lld_4k[ENTRIES], entries); - tlb_lli_2m[ENTRIES] = max(tlb_lli_2m[ENTRIES], entries); - tlb_lld_2m[ENTRIES] = max(tlb_lld_2m[ENTRIES], entries); - tlb_lli_4m[ENTRIES] = max(tlb_lli_4m[ENTRIES], entries); - tlb_lld_4m[ENTRIES] = max(tlb_lld_4m[ENTRIES], entries); + tlb_lli_4k = max(tlb_lli_4k, entries); + tlb_lld_4k = max(tlb_lld_4k, entries); + tlb_lli_2m = max(tlb_lli_2m, entries); + tlb_lld_2m = max(tlb_lld_2m, entries); + tlb_lli_4m = max(tlb_lli_4m, entries); + tlb_lld_4m = max(tlb_lld_4m, entries); break; case TLB_INST_ALL: - tlb_lli_4k[ENTRIES] = max(tlb_lli_4k[ENTRIES], entries); - tlb_lli_2m[ENTRIES] = max(tlb_lli_2m[ENTRIES], entries); - tlb_lli_4m[ENTRIES] = max(tlb_lli_4m[ENTRIES], entries); + tlb_lli_4k = max(tlb_lli_4k, entries); + tlb_lli_2m = max(tlb_lli_2m, entries); + tlb_lli_4m = max(tlb_lli_4m, entries); break; case TLB_INST_4K: - tlb_lli_4k[ENTRIES] = max(tlb_lli_4k[ENTRIES], entries); + tlb_lli_4k = max(tlb_lli_4k, entries); break; case TLB_INST_4M: - tlb_lli_4m[ENTRIES] = max(tlb_lli_4m[ENTRIES], entries); + tlb_lli_4m = max(tlb_lli_4m, entries); break; case TLB_INST_2M_4M: - tlb_lli_2m[ENTRIES] = max(tlb_lli_2m[ENTRIES], entries); - tlb_lli_4m[ENTRIES] = max(tlb_lli_4m[ENTRIES], entries); + tlb_lli_2m = max(tlb_lli_2m, entries); + tlb_lli_4m = max(tlb_lli_4m, entries); break; case TLB_DATA_4K: case TLB_DATA0_4K: - tlb_lld_4k[ENTRIES] = max(tlb_lld_4k[ENTRIES], entries); + tlb_lld_4k = max(tlb_lld_4k, entries); break; case TLB_DATA_4M: case TLB_DATA0_4M: - tlb_lld_4m[ENTRIES] = max(tlb_lld_4m[ENTRIES], entries); + tlb_lld_4m = max(tlb_lld_4m, entries); break; case TLB_DATA_2M_4M: case TLB_DATA0_2M_4M: - tlb_lld_2m[ENTRIES] = max(tlb_lld_2m[ENTRIES], entries); - tlb_lld_4m[ENTRIES] = max(tlb_lld_4m[ENTRIES], entries); + tlb_lld_2m = max(tlb_lld_2m, entries); + tlb_lld_4m = max(tlb_lld_4m, entries); break; case TLB_DATA_4K_4M: - tlb_lld_4k[ENTRIES] = max(tlb_lld_4k[ENTRIES], entries); - tlb_lld_4m[ENTRIES] = max(tlb_lld_4m[ENTRIES], entries); + tlb_lld_4k = max(tlb_lld_4k, entries); + tlb_lld_4m = max(tlb_lld_4m, entries); break; case TLB_DATA_1G_2M_4M: - tlb_lld_2m[ENTRIES] = max(tlb_lld_2m[ENTRIES], TLB_0x63_2M_4M_ENTRIES); - tlb_lld_4m[ENTRIES] = max(tlb_lld_4m[ENTRIES], TLB_0x63_2M_4M_ENTRIES); + tlb_lld_2m = max(tlb_lld_2m, TLB_0x63_2M_4M_ENTRIES); + 
tlb_lld_4m = max(tlb_lld_4m, TLB_0x63_2M_4M_ENTRIES); fallthrough; case TLB_DATA_1G: - tlb_lld_1g[ENTRIES] = max(tlb_lld_1g[ENTRIES], entries); + tlb_lld_1g = max(tlb_lld_1g, entries); break; } } -- cgit v1.2.3 From 535d9a82702ee75b0da6e4547f367beeeef184a3 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Tue, 4 Mar 2025 09:51:19 +0100 Subject: x86/cpu: Get rid of the smp_store_cpu_info() indirection smp_store_cpu_info() is just a wrapper around identify_secondary_cpu() without further value. Move the extra bits from smp_store_cpu_info() into identify_secondary_cpu() and remove the wrapper. [ darwi: Make it compile and fix up the xen/smp_pv.c instance ] Signed-off-by: Thomas Gleixner Signed-off-by: Ahmed S. Darwish Signed-off-by: Ingo Molnar Link: https://lore.kernel.org/r/20250304085152.51092-9-darwi@linutronix.de --- arch/x86/include/asm/processor.h | 2 +- arch/x86/include/asm/smp.h | 2 -- arch/x86/kernel/cpu/common.c | 11 +++++++++-- arch/x86/kernel/smpboot.c | 24 ++---------------------- arch/x86/xen/smp_pv.c | 2 +- 5 files changed, 13 insertions(+), 28 deletions(-) diff --git a/arch/x86/include/asm/processor.h b/arch/x86/include/asm/processor.h index 0ea227fa027c..d5d9a071cddc 100644 --- a/arch/x86/include/asm/processor.h +++ b/arch/x86/include/asm/processor.h @@ -229,7 +229,7 @@ static inline unsigned long long l1tf_pfn_limit(void) void init_cpu_devs(void); void get_cpu_vendor(struct cpuinfo_x86 *c); extern void early_cpu_init(void); -extern void identify_secondary_cpu(struct cpuinfo_x86 *); +extern void identify_secondary_cpu(unsigned int cpu); extern void print_cpu_info(struct cpuinfo_x86 *); void print_cpu_msr(struct cpuinfo_x86 *); diff --git a/arch/x86/include/asm/smp.h b/arch/x86/include/asm/smp.h index 1d3b11eba084..128e06a18e51 100644 --- a/arch/x86/include/asm/smp.h +++ b/arch/x86/include/asm/smp.h @@ -120,8 +120,6 @@ void native_smp_send_reschedule(int cpu); void native_send_call_func_ipi(const struct cpumask *mask); void native_send_call_func_single_ipi(int cpu); -void smp_store_cpu_info(int id); - asmlinkage __visible void smp_reboot_interrupt(void); __visible void smp_reschedule_interrupt(struct pt_regs *regs); __visible void smp_call_function_interrupt(struct pt_regs *regs); diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c index 3a1a957e0c60..5f81c553e733 100644 --- a/arch/x86/kernel/cpu/common.c +++ b/arch/x86/kernel/cpu/common.c @@ -1997,9 +1997,15 @@ static __init void identify_boot_cpu(void) lkgs_init(); } -void identify_secondary_cpu(struct cpuinfo_x86 *c) +void identify_secondary_cpu(unsigned int cpu) { - BUG_ON(c == &boot_cpu_data); + struct cpuinfo_x86 *c = &cpu_data(cpu); + + /* Copy boot_cpu_data only on the first bringup */ + if (!c->initialized) + *c = boot_cpu_data; + c->cpu_index = cpu; + identify_cpu(c); #ifdef CONFIG_X86_32 enable_sep_cpu(); @@ -2010,6 +2016,7 @@ void identify_secondary_cpu(struct cpuinfo_x86 *c) update_gds_msr(); tsx_ap_init(); + c->initialized = true; } void print_cpu_info(struct cpuinfo_x86 *c) diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c index 5746084bafe4..8ecf1bf57103 100644 --- a/arch/x86/kernel/smpboot.c +++ b/arch/x86/kernel/smpboot.c @@ -190,7 +190,7 @@ static void ap_starting(void) apic_ap_setup(); /* Save the processor parameters. */ - smp_store_cpu_info(cpuid); + identify_secondary_cpu(cpuid); /* * The topology information must be up to date before @@ -215,7 +215,7 @@ static void ap_calibrate_delay(void) { /* * Calibrate the delay loop and update loops_per_jiffy in cpu_data. 
- * smp_store_cpu_info() stored a value that is close but not as + * identify_secondary_cpu() stored a value that is close but not as * accurate as the value just calculated. * * As this is invoked after the TSC synchronization check, @@ -315,26 +315,6 @@ static void notrace start_secondary(void *unused) cpu_startup_entry(CPUHP_AP_ONLINE_IDLE); } -/* - * The bootstrap kernel entry code has set these up. Save them for - * a given CPU - */ -void smp_store_cpu_info(int id) -{ - struct cpuinfo_x86 *c = &cpu_data(id); - - /* Copy boot_cpu_data only on the first bringup */ - if (!c->initialized) - *c = boot_cpu_data; - c->cpu_index = id; - /* - * During boot time, CPU0 has this setup already. Save the info when - * bringing up an AP. - */ - identify_secondary_cpu(c); - c->initialized = true; -} - static bool topology_same_node(struct cpuinfo_x86 *c, struct cpuinfo_x86 *o) { diff --git a/arch/x86/xen/smp_pv.c b/arch/x86/xen/smp_pv.c index 6863d3da7dec..688ff59318ae 100644 --- a/arch/x86/xen/smp_pv.c +++ b/arch/x86/xen/smp_pv.c @@ -70,7 +70,7 @@ static void cpu_bringup(void) xen_enable_syscall(); } cpu = smp_processor_id(); - smp_store_cpu_info(cpu); + identify_secondary_cpu(cpu); set_cpu_sibling_map(cpu); speculative_store_bypass_ht_init(); -- cgit v1.2.3 From 1f61dfdf16cd3bab383741c2eb43e7f69e9f592f Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Tue, 4 Mar 2025 09:51:20 +0100 Subject: x86/cpu: Remove unused TLB strings Commit: e0ba94f14f74 ("x86/tlb_info: get last level TLB entry number of CPU") added the TLB table for parsing CPUID(0x2), including strings describing them. The string entry in the table was never used. Convert them to comments. Signed-off-by: Thomas Gleixner Signed-off-by: Ahmed S. Darwish Signed-off-by: Ingo Molnar Link: https://lore.kernel.org/r/20250304085152.51092-10-darwi@linutronix.de --- arch/x86/kernel/cpu/cpu.h | 8 ----- arch/x86/kernel/cpu/intel.c | 80 ++++++++++++++++++++++++--------------------- 2 files changed, 43 insertions(+), 45 deletions(-) diff --git a/arch/x86/kernel/cpu/cpu.h b/arch/x86/kernel/cpu/cpu.h index 1beccefbaff9..51deb60a9d26 100644 --- a/arch/x86/kernel/cpu/cpu.h +++ b/arch/x86/kernel/cpu/cpu.h @@ -33,14 +33,6 @@ struct cpu_dev { #endif }; -struct _tlb_table { - unsigned char descriptor; - char tlb_type; - unsigned int entries; - /* unsigned int ways; */ - char info[128]; -}; - #define cpu_dev_register(cpu_devX) \ static const struct cpu_dev *const __cpu_dev_##cpu_devX __used \ __section(".x86_cpu_dev.init") = \ diff --git a/arch/x86/kernel/cpu/intel.c b/arch/x86/kernel/cpu/intel.c index 61d3fd31baee..291c82816797 100644 --- a/arch/x86/kernel/cpu/intel.c +++ b/arch/x86/kernel/cpu/intel.c @@ -658,44 +658,50 @@ static unsigned int intel_size_cache(struct cpuinfo_x86 *c, unsigned int size) */ #define TLB_0x63_2M_4M_ENTRIES 32 +struct _tlb_table { + unsigned char descriptor; + char tlb_type; + unsigned int entries; +}; + static const struct _tlb_table intel_tlb_table[] = { - { 0x01, TLB_INST_4K, 32, " TLB_INST 4 KByte pages, 4-way set associative" }, - { 0x02, TLB_INST_4M, 2, " TLB_INST 4 MByte pages, full associative" }, - { 0x03, TLB_DATA_4K, 64, " TLB_DATA 4 KByte pages, 4-way set associative" }, - { 0x04, TLB_DATA_4M, 8, " TLB_DATA 4 MByte pages, 4-way set associative" }, - { 0x05, TLB_DATA_4M, 32, " TLB_DATA 4 MByte pages, 4-way set associative" }, - { 0x0b, TLB_INST_4M, 4, " TLB_INST 4 MByte pages, 4-way set associative" }, - { 0x4f, TLB_INST_4K, 32, " TLB_INST 4 KByte pages" }, - { 0x50, TLB_INST_ALL, 64, " TLB_INST 4 KByte and 2-MByte or
4-MByte pages" }, - { 0x51, TLB_INST_ALL, 128, " TLB_INST 4 KByte and 2-MByte or 4-MByte pages" }, - { 0x52, TLB_INST_ALL, 256, " TLB_INST 4 KByte and 2-MByte or 4-MByte pages" }, - { 0x55, TLB_INST_2M_4M, 7, " TLB_INST 2-MByte or 4-MByte pages, fully associative" }, - { 0x56, TLB_DATA0_4M, 16, " TLB_DATA0 4 MByte pages, 4-way set associative" }, - { 0x57, TLB_DATA0_4K, 16, " TLB_DATA0 4 KByte pages, 4-way associative" }, - { 0x59, TLB_DATA0_4K, 16, " TLB_DATA0 4 KByte pages, fully associative" }, - { 0x5a, TLB_DATA0_2M_4M, 32, " TLB_DATA0 2-MByte or 4 MByte pages, 4-way set associative" }, - { 0x5b, TLB_DATA_4K_4M, 64, " TLB_DATA 4 KByte and 4 MByte pages" }, - { 0x5c, TLB_DATA_4K_4M, 128, " TLB_DATA 4 KByte and 4 MByte pages" }, - { 0x5d, TLB_DATA_4K_4M, 256, " TLB_DATA 4 KByte and 4 MByte pages" }, - { 0x61, TLB_INST_4K, 48, " TLB_INST 4 KByte pages, full associative" }, - { 0x63, TLB_DATA_1G_2M_4M, 4, " TLB_DATA 1 GByte pages, 4-way set associative" - " (plus 32 entries TLB_DATA 2 MByte or 4 MByte pages, not encoded here)" }, - { 0x6b, TLB_DATA_4K, 256, " TLB_DATA 4 KByte pages, 8-way associative" }, - { 0x6c, TLB_DATA_2M_4M, 128, " TLB_DATA 2 MByte or 4 MByte pages, 8-way associative" }, - { 0x6d, TLB_DATA_1G, 16, " TLB_DATA 1 GByte pages, fully associative" }, - { 0x76, TLB_INST_2M_4M, 8, " TLB_INST 2-MByte or 4-MByte pages, fully associative" }, - { 0xb0, TLB_INST_4K, 128, " TLB_INST 4 KByte pages, 4-way set associative" }, - { 0xb1, TLB_INST_2M_4M, 4, " TLB_INST 2M pages, 4-way, 8 entries or 4M pages, 4-way entries" }, - { 0xb2, TLB_INST_4K, 64, " TLB_INST 4KByte pages, 4-way set associative" }, - { 0xb3, TLB_DATA_4K, 128, " TLB_DATA 4 KByte pages, 4-way set associative" }, - { 0xb4, TLB_DATA_4K, 256, " TLB_DATA 4 KByte pages, 4-way associative" }, - { 0xb5, TLB_INST_4K, 64, " TLB_INST 4 KByte pages, 8-way set associative" }, - { 0xb6, TLB_INST_4K, 128, " TLB_INST 4 KByte pages, 8-way set associative" }, - { 0xba, TLB_DATA_4K, 64, " TLB_DATA 4 KByte pages, 4-way associative" }, - { 0xc0, TLB_DATA_4K_4M, 8, " TLB_DATA 4 KByte and 4 MByte pages, 4-way associative" }, - { 0xc1, STLB_4K_2M, 1024, " STLB 4 KByte and 2 MByte pages, 8-way associative" }, - { 0xc2, TLB_DATA_2M_4M, 16, " TLB_DATA 2 MByte/4MByte pages, 4-way associative" }, - { 0xca, STLB_4K, 512, " STLB 4 KByte pages, 4-way associative" }, + { 0x01, TLB_INST_4K, 32}, /* TLB_INST 4 KByte pages, 4-way set associative */ + { 0x02, TLB_INST_4M, 2}, /* TLB_INST 4 MByte pages, full associative */ + { 0x03, TLB_DATA_4K, 64}, /* TLB_DATA 4 KByte pages, 4-way set associative */ + { 0x04, TLB_DATA_4M, 8}, /* TLB_DATA 4 MByte pages, 4-way set associative */ + { 0x05, TLB_DATA_4M, 32}, /* TLB_DATA 4 MByte pages, 4-way set associative */ + { 0x0b, TLB_INST_4M, 4}, /* TLB_INST 4 MByte pages, 4-way set associative */ + { 0x4f, TLB_INST_4K, 32}, /* TLB_INST 4 KByte pages */ + { 0x50, TLB_INST_ALL, 64}, /* TLB_INST 4 KByte and 2-MByte or 4-MByte pages */ + { 0x51, TLB_INST_ALL, 128}, /* TLB_INST 4 KByte and 2-MByte or 4-MByte pages */ + { 0x52, TLB_INST_ALL, 256}, /* TLB_INST 4 KByte and 2-MByte or 4-MByte pages */ + { 0x55, TLB_INST_2M_4M, 7}, /* TLB_INST 2-MByte or 4-MByte pages, fully associative */ + { 0x56, TLB_DATA0_4M, 16}, /* TLB_DATA0 4 MByte pages, 4-way set associative */ + { 0x57, TLB_DATA0_4K, 16}, /* TLB_DATA0 4 KByte pages, 4-way associative */ + { 0x59, TLB_DATA0_4K, 16}, /* TLB_DATA0 4 KByte pages, fully associative */ + { 0x5a, TLB_DATA0_2M_4M, 32}, /* TLB_DATA0 2-MByte or 4 MByte pages, 4-way set associative */ + { 0x5b, 
TLB_DATA_4K_4M, 64}, /* TLB_DATA 4 KByte and 4 MByte pages */ + { 0x5c, TLB_DATA_4K_4M, 128}, /* TLB_DATA 4 KByte and 4 MByte pages */ + { 0x5d, TLB_DATA_4K_4M, 256}, /* TLB_DATA 4 KByte and 4 MByte pages */ + { 0x61, TLB_INST_4K, 48}, /* TLB_INST 4 KByte pages, full associative */ + { 0x63, TLB_DATA_1G_2M_4M, 4}, /* TLB_DATA 1 GByte pages, 4-way set associative + * (plus 32 entries TLB_DATA 2 MByte or 4 MByte pages, not encoded here) */ + { 0x6b, TLB_DATA_4K, 256}, /* TLB_DATA 4 KByte pages, 8-way associative */ + { 0x6c, TLB_DATA_2M_4M, 128}, /* TLB_DATA 2 MByte or 4 MByte pages, 8-way associative */ + { 0x6d, TLB_DATA_1G, 16}, /* TLB_DATA 1 GByte pages, fully associative */ + { 0x76, TLB_INST_2M_4M, 8}, /* TLB_INST 2-MByte or 4-MByte pages, fully associative */ + { 0xb0, TLB_INST_4K, 128}, /* TLB_INST 4 KByte pages, 4-way set associative */ + { 0xb1, TLB_INST_2M_4M, 4}, /* TLB_INST 2M pages, 4-way, 8 entries or 4M pages, 4-way entries */ + { 0xb2, TLB_INST_4K, 64}, /* TLB_INST 4KByte pages, 4-way set associative */ + { 0xb3, TLB_DATA_4K, 128}, /* TLB_DATA 4 KByte pages, 4-way set associative */ + { 0xb4, TLB_DATA_4K, 256}, /* TLB_DATA 4 KByte pages, 4-way associative */ + { 0xb5, TLB_INST_4K, 64}, /* TLB_INST 4 KByte pages, 8-way set associative */ + { 0xb6, TLB_INST_4K, 128}, /* TLB_INST 4 KByte pages, 8-way set associative */ + { 0xba, TLB_DATA_4K, 64}, /* TLB_DATA 4 KByte pages, 4-way associative */ + { 0xc0, TLB_DATA_4K_4M, 8}, /* TLB_DATA 4 KByte and 4 MByte pages, 4-way associative */ + { 0xc1, STLB_4K_2M, 1024}, /* STLB 4 KByte and 2 MByte pages, 8-way associative */ + { 0xc2, TLB_DATA_2M_4M, 16}, /* TLB_DATA 2 MByte/4MByte pages, 4-way associative */ + { 0xca, STLB_4K, 512}, /* STLB 4 KByte pages, 4-way associative */ { 0x00, 0, 0 } }; -- cgit v1.2.3 From b3a756bd72ec8d1ba43334b17115e0ece1144a88 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Tue, 4 Mar 2025 09:51:22 +0100 Subject: x86/cacheinfo: Remove the P4 trace leftovers for real Commit 851026a2bf54 ("x86/cacheinfo: Remove unused trace variable") removed the switch case for LVL_TRACE but did not get rid of the surrounding gunk. Signed-off-by: Thomas Gleixner Signed-off-by: Ahmed S. 
Darwish Signed-off-by: Ingo Molnar Link: https://lore.kernel.org/r/20250304085152.51092-12-darwi@linutronix.de --- arch/x86/kernel/cpu/cacheinfo.c | 19 +++---------------- 1 file changed, 3 insertions(+), 16 deletions(-) diff --git a/arch/x86/kernel/cpu/cacheinfo.c b/arch/x86/kernel/cpu/cacheinfo.c index a6c6bccfa8b8..eccffe2ea06c 100644 --- a/arch/x86/kernel/cpu/cacheinfo.c +++ b/arch/x86/kernel/cpu/cacheinfo.c @@ -31,7 +31,6 @@ #define LVL_1_DATA 2 #define LVL_2 3 #define LVL_3 4 -#define LVL_TRACE 5 /* Shared last level cache maps */ DEFINE_PER_CPU_READ_MOSTLY(cpumask_var_t, cpu_llc_shared_map); @@ -96,10 +95,6 @@ static const struct _cache_table cache_table[] = { 0x66, LVL_1_DATA, 8 }, /* 4-way set assoc, sectored cache, 64 byte line size */ { 0x67, LVL_1_DATA, 16 }, /* 4-way set assoc, sectored cache, 64 byte line size */ { 0x68, LVL_1_DATA, 32 }, /* 4-way set assoc, sectored cache, 64 byte line size */ - { 0x70, LVL_TRACE, 12 }, /* 8-way set assoc */ - { 0x71, LVL_TRACE, 16 }, /* 8-way set assoc */ - { 0x72, LVL_TRACE, 32 }, /* 8-way set assoc */ - { 0x73, LVL_TRACE, 64 }, /* 8-way set assoc */ { 0x78, LVL_2, MB(1) }, /* 4-way set assoc, 64 byte line size */ { 0x79, LVL_2, 128 }, /* 8-way set assoc, sectored cache, 64 byte line size */ { 0x7a, LVL_2, 256 }, /* 8-way set assoc, sectored cache, 64 byte line size */ @@ -787,19 +782,13 @@ void init_intel_cacheinfo(struct cpuinfo_x86 *c) } } } - /* - * Don't use cpuid2 if cpuid4 is supported. For P4, we use cpuid2 for - * trace cache - */ - if ((!ci->num_leaves || c->x86 == 15) && c->cpuid_level > 1) { + + /* Don't use CPUID(2) if CPUID(4) is supported. */ + if (!ci->num_leaves && c->cpuid_level > 1) { /* supports eax=2 call */ int j, n; unsigned int regs[4]; unsigned char *dp = (unsigned char *)regs; - int only_trace = 0; - - if (ci->num_leaves && c->x86 == 15) - only_trace = 1; /* Number of times to iterate */ n = cpuid_eax(2) & 0xFF; @@ -820,8 +809,6 @@ void init_intel_cacheinfo(struct cpuinfo_x86 *c) /* look up this descriptor in the table */ while (cache_table[k].descriptor != 0) { if (cache_table[k].descriptor == des) { - if (only_trace && cache_table[k].cache_type != LVL_TRACE) - break; switch (cache_table[k].cache_type) { case LVL_1_INST: l1i += cache_table[k].size; -- cgit v1.2.3 From 6309ff98f00bad118812f7f250fbbee4867e88d3 Mon Sep 17 00:00:00 2001 From: "Ahmed S. Darwish" Date: Tue, 4 Mar 2025 09:51:23 +0100 Subject: x86/cacheinfo: Remove unnecessary headers and reorder the rest Remove the headers at cacheinfo.c that are no longer required. Alphabetically reorder what remains since more headers will be included in further commits. Signed-off-by: Ahmed S. Darwish Signed-off-by: Ingo Molnar Link: https://lore.kernel.org/r/20250304085152.51092-13-darwi@linutronix.de --- arch/x86/kernel/cpu/cacheinfo.c | 12 +++++------- 1 file changed, 5 insertions(+), 7 deletions(-) diff --git a/arch/x86/kernel/cpu/cacheinfo.c b/arch/x86/kernel/cpu/cacheinfo.c index eccffe2ea06c..b3a520959b51 100644 --- a/arch/x86/kernel/cpu/cacheinfo.c +++ b/arch/x86/kernel/cpu/cacheinfo.c @@ -8,21 +8,19 @@ * Andi Kleen / Andreas Herrmann : CPUID4 emulation on AMD. */ -#include #include +#include #include #include -#include -#include -#include #include #include +#include -#include -#include #include -#include +#include +#include #include +#include #include #include "cpu.h" -- cgit v1.2.3 From 4f2a0b765c9731d2fa94e209ee9ae0e96b280f17 Mon Sep 17 00:00:00 2001 From: "Ahmed S. 
Darwish" Date: Tue, 4 Mar 2025 09:51:41 +0100 Subject: : Cover all possible x86 CPU cache sizes Add size macros for 24/192/384 Kilobytes and 3/6/12/18/24 Megabytes. With that, the x86 subsystem can avoid locally defining its own macros for CPU cache sizes. Signed-off-by: Ahmed S. Darwish Signed-off-by: Ingo Molnar Cc: Linus Torvalds Cc: "H. Peter Anvin" Link: https://lore.kernel.org/r/20250304085152.51092-31-darwi@linutronix.de --- include/linux/sizes.h | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/include/linux/sizes.h b/include/linux/sizes.h index c3a00b967d18..49039494076f 100644 --- a/include/linux/sizes.h +++ b/include/linux/sizes.h @@ -23,17 +23,25 @@ #define SZ_4K 0x00001000 #define SZ_8K 0x00002000 #define SZ_16K 0x00004000 +#define SZ_24K 0x00006000 #define SZ_32K 0x00008000 #define SZ_64K 0x00010000 #define SZ_128K 0x00020000 +#define SZ_192K 0x00030000 #define SZ_256K 0x00040000 +#define SZ_384K 0x00060000 #define SZ_512K 0x00080000 #define SZ_1M 0x00100000 #define SZ_2M 0x00200000 +#define SZ_3M 0x00300000 #define SZ_4M 0x00400000 +#define SZ_6M 0x00600000 #define SZ_8M 0x00800000 +#define SZ_12M 0x00c00000 #define SZ_16M 0x01000000 +#define SZ_18M 0x01200000 +#define SZ_24M 0x01800000 #define SZ_32M 0x02000000 #define SZ_64M 0x04000000 #define SZ_128M 0x08000000 -- cgit v1.2.3 From e1c49eaee52384ec9e3734138b3929a35b64e0c3 Mon Sep 17 00:00:00 2001 From: Josh Poimboeuf Date: Sun, 2 Mar 2025 17:20:59 -0800 Subject: KVM: VMX: Use named operands in inline asm Convert the non-asm-goto version of the inline asm in __vmcs_readl() to use named operands, similar to its asm-goto version. Do this in preparation of changing the ASM_CALL_CONSTRAINT primitive. Signed-off-by: Josh Poimboeuf Signed-off-by: Ingo Molnar Acked-by: Peter Zijlstra (Intel) Cc: Sean Christopherson Cc: linux-kernel@vger.kernel.org --- arch/x86/kvm/vmx/vmx_ops.h | 18 ++++++++++-------- 1 file changed, 10 insertions(+), 8 deletions(-) diff --git a/arch/x86/kvm/vmx/vmx_ops.h b/arch/x86/kvm/vmx/vmx_ops.h index 633c87e2fd92..96677576c836 100644 --- a/arch/x86/kvm/vmx/vmx_ops.h +++ b/arch/x86/kvm/vmx/vmx_ops.h @@ -118,7 +118,7 @@ do_exception: #else /* !CONFIG_CC_HAS_ASM_GOTO_OUTPUT */ - asm volatile("1: vmread %2, %1\n\t" + asm volatile("1: vmread %[field], %[output]\n\t" ".byte 0x3e\n\t" /* branch taken hint */ "ja 3f\n\t" @@ -127,24 +127,26 @@ do_exception: * @field, and bounce through the trampoline to preserve * volatile registers. */ - "xorl %k1, %k1\n\t" + "xorl %k[output], %k[output]\n\t" "2:\n\t" - "push %1\n\t" - "push %2\n\t" + "push %[output]\n\t" + "push %[field]\n\t" "call vmread_error_trampoline\n\t" /* * Unwind the stack. Note, the trampoline zeros out the * memory for @fault so that the result is '0' on error. */ - "pop %2\n\t" - "pop %1\n\t" + "pop %[field]\n\t" + "pop %[output]\n\t" "3:\n\t" /* VMREAD faulted. As above, except push '1' for @fault. */ - _ASM_EXTABLE_TYPE_REG(1b, 2b, EX_TYPE_ONE_REG, %1) + _ASM_EXTABLE_TYPE_REG(1b, 2b, EX_TYPE_ONE_REG, %[output]) - : ASM_CALL_CONSTRAINT, "=&r"(value) : "r"(field) : "cc"); + : ASM_CALL_CONSTRAINT, [output] "=&r" (value) + : [field] "r" (field) + : "cc"); return value; #endif /* CONFIG_CC_HAS_ASM_GOTO_OUTPUT */ -- cgit v1.2.3 From 9064a8e556fa70ccfd9c414350406ed4887d3059 Mon Sep 17 00:00:00 2001 From: Josh Poimboeuf Date: Sun, 2 Mar 2025 17:21:00 -0800 Subject: x86/hyperv: Use named operands in inline asm Use named operands in inline asm to make it easier to change the constraint order. 
Do this in preparation of changing the ASM_CALL_CONSTRAINT primitive. Signed-off-by: Josh Poimboeuf Signed-off-by: Ingo Molnar Acked-by: Peter Zijlstra (Intel) Cc: "K. Y. Srinivasan" Cc: Haiyang Zhang Cc: Wei Liu Cc: Dexuan Cui Cc: linux-kernel@vger.kernel.org --- arch/x86/include/asm/mshyperv.h | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/arch/x86/include/asm/mshyperv.h b/arch/x86/include/asm/mshyperv.h index f91ab1e75f9f..5e6193dbc97e 100644 --- a/arch/x86/include/asm/mshyperv.h +++ b/arch/x86/include/asm/mshyperv.h @@ -77,11 +77,11 @@ static inline u64 hv_do_hypercall(u64 control, void *input, void *output) return hv_tdx_hypercall(control, input_address, output_address); if (hv_isolation_type_snp() && !hyperv_paravisor_present) { - __asm__ __volatile__("mov %4, %%r8\n" + __asm__ __volatile__("mov %[output_address], %%r8\n" "vmmcall" : "=a" (hv_status), ASM_CALL_CONSTRAINT, "+c" (control), "+d" (input_address) - : "r" (output_address) + : [output_address] "r" (output_address) : "cc", "memory", "r8", "r9", "r10", "r11"); return hv_status; } @@ -89,12 +89,12 @@ static inline u64 hv_do_hypercall(u64 control, void *input, void *output) if (!hv_hypercall_pg) return U64_MAX; - __asm__ __volatile__("mov %4, %%r8\n" + __asm__ __volatile__("mov %[output_address], %%r8\n" CALL_NOSPEC : "=a" (hv_status), ASM_CALL_CONSTRAINT, "+c" (control), "+d" (input_address) - : "r" (output_address), - THUNK_TARGET(hv_hypercall_pg) + : [output_address] "r" (output_address), + THUNK_TARGET(hv_hypercall_pg) : "cc", "memory", "r8", "r9", "r10", "r11"); #else u32 input_address_hi = upper_32_bits(input_address); @@ -187,18 +187,18 @@ static inline u64 _hv_do_fast_hypercall16(u64 control, u64 input1, u64 input2) return hv_tdx_hypercall(control, input1, input2); if (hv_isolation_type_snp() && !hyperv_paravisor_present) { - __asm__ __volatile__("mov %4, %%r8\n" + __asm__ __volatile__("mov %[input2], %%r8\n" "vmmcall" : "=a" (hv_status), ASM_CALL_CONSTRAINT, "+c" (control), "+d" (input1) - : "r" (input2) + : [input2] "r" (input2) : "cc", "r8", "r9", "r10", "r11"); } else { - __asm__ __volatile__("mov %4, %%r8\n" + __asm__ __volatile__("mov %[input2], %%r8\n" CALL_NOSPEC : "=a" (hv_status), ASM_CALL_CONSTRAINT, "+c" (control), "+d" (input1) - : "r" (input2), + : [input2] "r" (input2), THUNK_TARGET(hv_hypercall_pg) : "cc", "r8", "r9", "r10", "r11"); } -- cgit v1.2.3 From 224788b63a2e426b6b82c76456a068a2ab87610f Mon Sep 17 00:00:00 2001 From: Josh Poimboeuf Date: Sun, 2 Mar 2025 17:21:01 -0800 Subject: x86/alternatives: Simplify alternative_call() interface Separate the input from the clobbers in preparation for appending the input. Do this in preparation of changing the ASM_CALL_CONSTRAINT primitive. Signed-off-by: Josh Poimboeuf Signed-off-by: Ingo Molnar Acked-by: Peter Zijlstra (Intel) Cc: linux-kernel@vger.kernel.org --- arch/x86/include/asm/alternative.h | 24 ++++------ arch/x86/include/asm/apic.h | 4 +- arch/x86/include/asm/asm.h | 11 +++++ arch/x86/include/asm/atomic64_32.h | 96 +++++++++++++++++++++++--------------- arch/x86/include/asm/page_64.h | 4 +- 5 files changed, 82 insertions(+), 57 deletions(-) diff --git a/arch/x86/include/asm/alternative.h b/arch/x86/include/asm/alternative.h index a2141665239b..52626a7251e6 100644 --- a/arch/x86/include/asm/alternative.h +++ b/arch/x86/include/asm/alternative.h @@ -237,10 +237,12 @@ static inline int alternatives_text_reserved(void *start, void *end) * references: i.e., if used for a function, it would add the PLT * suffix. 
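 *
 * (Editor's sketch, not part of this patch: with the reworked interface
 * a call site passes the outputs, the inputs and the clobber list as
 * three separate macro arguments, e.g.
 *
 *	alternative_call(old_fn, new_fn, X86_FEATURE_XYZ,
 *			 ASM_OUTPUT("=a" (ret)),
 *			 ASM_INPUT("D" (arg)),
 *			 "memory");
 *
 * old_fn, new_fn and X86_FEATURE_XYZ are hypothetical names; the
 * ASM_OUTPUT()/ASM_INPUT() wrappers, added to asm.h later in this
 * patch, merely bundle multiple comma-separated constraints into a
 * single macro argument.)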
*/ -#define alternative_call(oldfunc, newfunc, ft_flags, output, input...) \ +#define alternative_call(oldfunc, newfunc, ft_flags, output, input, clobbers...) \ asm_inline volatile(ALTERNATIVE("call %c[old]", "call %c[new]", ft_flags) \ : ALT_OUTPUT_SP(output) \ - : [old] "i" (oldfunc), [new] "i" (newfunc), ## input) + : [old] "i" (oldfunc), [new] "i" (newfunc) \ + COMMA(input) \ + : clobbers) /* * Like alternative_call, but there are two features and respective functions. @@ -249,24 +251,14 @@ static inline int alternatives_text_reserved(void *start, void *end) * Otherwise, old function is used. */ #define alternative_call_2(oldfunc, newfunc1, ft_flags1, newfunc2, ft_flags2, \ - output, input...) \ + output, input, clobbers...) \ asm_inline volatile(ALTERNATIVE_2("call %c[old]", "call %c[new1]", ft_flags1, \ "call %c[new2]", ft_flags2) \ : ALT_OUTPUT_SP(output) \ : [old] "i" (oldfunc), [new1] "i" (newfunc1), \ - [new2] "i" (newfunc2), ## input) - -/* - * use this macro(s) if you need more than one output parameter - * in alternative_io - */ -#define ASM_OUTPUT2(a...) a - -/* - * use this macro if you need clobbers but no inputs in - * alternative_{input,io,call}() - */ -#define ASM_NO_INPUT_CLOBBER(clbr...) "i" (0) : clbr + [new2] "i" (newfunc2) \ + COMMA(input) \ + : clobbers) #define ALT_OUTPUT_SP(...) ASM_CALL_CONSTRAINT, ## __VA_ARGS__ diff --git a/arch/x86/include/asm/apic.h b/arch/x86/include/asm/apic.h index f21ff1932699..c903d358405d 100644 --- a/arch/x86/include/asm/apic.h +++ b/arch/x86/include/asm/apic.h @@ -99,8 +99,8 @@ static inline void native_apic_mem_write(u32 reg, u32 v) volatile u32 *addr = (volatile u32 *)(APIC_BASE + reg); alternative_io("movl %0, %1", "xchgl %0, %1", X86_BUG_11AP, - ASM_OUTPUT2("=r" (v), "=m" (*addr)), - ASM_OUTPUT2("0" (v), "m" (*addr))); + ASM_OUTPUT("=r" (v), "=m" (*addr)), + ASM_INPUT("0" (v), "m" (*addr))); } static inline u32 native_apic_mem_read(u32 reg) diff --git a/arch/x86/include/asm/asm.h b/arch/x86/include/asm/asm.h index 2bec0c89a95c..975ae7a9397e 100644 --- a/arch/x86/include/asm/asm.h +++ b/arch/x86/include/asm/asm.h @@ -213,6 +213,17 @@ static __always_inline __pure void *rip_rel_ptr(void *p) /* For C file, we already have NOKPROBE_SYMBOL macro */ +/* Insert a comma if args are non-empty */ +#define COMMA(x...) __COMMA(x) +#define __COMMA(...) , ##__VA_ARGS__ + +/* + * Combine multiple asm inline constraint args into a single arg for passing to + * another macro. + */ +#define ASM_OUTPUT(x...) x +#define ASM_INPUT(x...) x + /* * This output constraint should be used for any inline asm which has a "call" * instruction. Otherwise the asm may be inserted before the frame pointer diff --git a/arch/x86/include/asm/atomic64_32.h b/arch/x86/include/asm/atomic64_32.h index 797085ecaaa4..ab838205c1c6 100644 --- a/arch/x86/include/asm/atomic64_32.h +++ b/arch/x86/include/asm/atomic64_32.h @@ -49,16 +49,19 @@ static __always_inline s64 arch_atomic64_read_nonatomic(const atomic64_t *v) #endif #ifdef CONFIG_X86_CX8 -#define __alternative_atomic64(f, g, out, in...) \ - asm volatile("call %c[func]" \ +#define __alternative_atomic64(f, g, out, in, clobbers...) \ + asm volatile("call %c[func]" \ : ALT_OUTPUT_SP(out) \ - : [func] "i" (atomic64_##g##_cx8), ## in) + : [func] "i" (atomic64_##g##_cx8) \ + COMMA(in) \ + : clobbers) #define ATOMIC64_DECL(sym) ATOMIC64_DECL_ONE(sym##_cx8) #else -#define __alternative_atomic64(f, g, out, in...) 
\ - alternative_call(atomic64_##f##_386, atomic64_##g##_cx8, \ - X86_FEATURE_CX8, ASM_OUTPUT2(out), ## in) +#define __alternative_atomic64(f, g, out, in, clobbers...) \ + alternative_call(atomic64_##f##_386, atomic64_##g##_cx8, \ + X86_FEATURE_CX8, ASM_OUTPUT(out), \ + ASM_INPUT(in), clobbers) #define ATOMIC64_DECL(sym) ATOMIC64_DECL_ONE(sym##_cx8); \ ATOMIC64_DECL_ONE(sym##_386) @@ -69,8 +72,8 @@ ATOMIC64_DECL_ONE(inc_386); ATOMIC64_DECL_ONE(dec_386); #endif -#define alternative_atomic64(f, out, in...) \ - __alternative_atomic64(f, f, ASM_OUTPUT2(out), ## in) +#define alternative_atomic64(f, out, in, clobbers...) \ + __alternative_atomic64(f, f, ASM_OUTPUT(out), ASM_INPUT(in), clobbers) ATOMIC64_DECL(read); ATOMIC64_DECL(set); @@ -105,9 +108,10 @@ static __always_inline s64 arch_atomic64_xchg(atomic64_t *v, s64 n) s64 o; unsigned high = (unsigned)(n >> 32); unsigned low = (unsigned)n; - alternative_atomic64(xchg, "=&A" (o), - "S" (v), "b" (low), "c" (high) - : "memory"); + alternative_atomic64(xchg, + "=&A" (o), + ASM_INPUT("S" (v), "b" (low), "c" (high)), + "memory"); return o; } #define arch_atomic64_xchg arch_atomic64_xchg @@ -116,23 +120,25 @@ static __always_inline void arch_atomic64_set(atomic64_t *v, s64 i) { unsigned high = (unsigned)(i >> 32); unsigned low = (unsigned)i; - alternative_atomic64(set, /* no output */, - "S" (v), "b" (low), "c" (high) - : "eax", "edx", "memory"); + alternative_atomic64(set, + /* no output */, + ASM_INPUT("S" (v), "b" (low), "c" (high)), + "eax", "edx", "memory"); } static __always_inline s64 arch_atomic64_read(const atomic64_t *v) { s64 r; - alternative_atomic64(read, "=&A" (r), "c" (v) : "memory"); + alternative_atomic64(read, "=&A" (r), "c" (v), "memory"); return r; } static __always_inline s64 arch_atomic64_add_return(s64 i, atomic64_t *v) { alternative_atomic64(add_return, - ASM_OUTPUT2("+A" (i), "+c" (v)), - ASM_NO_INPUT_CLOBBER("memory")); + ASM_OUTPUT("+A" (i), "+c" (v)), + /* no input */, + "memory"); return i; } #define arch_atomic64_add_return arch_atomic64_add_return @@ -140,8 +146,9 @@ static __always_inline s64 arch_atomic64_add_return(s64 i, atomic64_t *v) static __always_inline s64 arch_atomic64_sub_return(s64 i, atomic64_t *v) { alternative_atomic64(sub_return, - ASM_OUTPUT2("+A" (i), "+c" (v)), - ASM_NO_INPUT_CLOBBER("memory")); + ASM_OUTPUT("+A" (i), "+c" (v)), + /* no input */, + "memory"); return i; } #define arch_atomic64_sub_return arch_atomic64_sub_return @@ -149,8 +156,10 @@ static __always_inline s64 arch_atomic64_sub_return(s64 i, atomic64_t *v) static __always_inline s64 arch_atomic64_inc_return(atomic64_t *v) { s64 a; - alternative_atomic64(inc_return, "=&A" (a), - "S" (v) : "memory", "ecx"); + alternative_atomic64(inc_return, + "=&A" (a), + "S" (v), + "memory", "ecx"); return a; } #define arch_atomic64_inc_return arch_atomic64_inc_return @@ -158,8 +167,10 @@ static __always_inline s64 arch_atomic64_inc_return(atomic64_t *v) static __always_inline s64 arch_atomic64_dec_return(atomic64_t *v) { s64 a; - alternative_atomic64(dec_return, "=&A" (a), - "S" (v) : "memory", "ecx"); + alternative_atomic64(dec_return, + "=&A" (a), + "S" (v), + "memory", "ecx"); return a; } #define arch_atomic64_dec_return arch_atomic64_dec_return @@ -167,28 +178,34 @@ static __always_inline s64 arch_atomic64_dec_return(atomic64_t *v) static __always_inline void arch_atomic64_add(s64 i, atomic64_t *v) { __alternative_atomic64(add, add_return, - ASM_OUTPUT2("+A" (i), "+c" (v)), - ASM_NO_INPUT_CLOBBER("memory")); + ASM_OUTPUT("+A" (i), "+c" (v)), + /* 
no input */, + "memory"); } static __always_inline void arch_atomic64_sub(s64 i, atomic64_t *v) { __alternative_atomic64(sub, sub_return, - ASM_OUTPUT2("+A" (i), "+c" (v)), - ASM_NO_INPUT_CLOBBER("memory")); + ASM_OUTPUT("+A" (i), "+c" (v)), + /* no input */, + "memory"); } static __always_inline void arch_atomic64_inc(atomic64_t *v) { - __alternative_atomic64(inc, inc_return, /* no output */, - "S" (v) : "memory", "eax", "ecx", "edx"); + __alternative_atomic64(inc, inc_return, + /* no output */, + "S" (v), + "memory", "eax", "ecx", "edx"); } #define arch_atomic64_inc arch_atomic64_inc static __always_inline void arch_atomic64_dec(atomic64_t *v) { - __alternative_atomic64(dec, dec_return, /* no output */, - "S" (v) : "memory", "eax", "ecx", "edx"); + __alternative_atomic64(dec, dec_return, + /* no output */, + "S" (v), + "memory", "eax", "ecx", "edx"); } #define arch_atomic64_dec arch_atomic64_dec @@ -197,8 +214,9 @@ static __always_inline int arch_atomic64_add_unless(atomic64_t *v, s64 a, s64 u) unsigned low = (unsigned)u; unsigned high = (unsigned)(u >> 32); alternative_atomic64(add_unless, - ASM_OUTPUT2("+A" (a), "+c" (low), "+D" (high)), - "S" (v) : "memory"); + ASM_OUTPUT("+A" (a), "+c" (low), "+D" (high)), + "S" (v), + "memory"); return (int)a; } #define arch_atomic64_add_unless arch_atomic64_add_unless @@ -206,8 +224,10 @@ static __always_inline int arch_atomic64_add_unless(atomic64_t *v, s64 a, s64 u) static __always_inline int arch_atomic64_inc_not_zero(atomic64_t *v) { int r; - alternative_atomic64(inc_not_zero, "=&a" (r), - "S" (v) : "ecx", "edx", "memory"); + alternative_atomic64(inc_not_zero, + "=&a" (r), + "S" (v), + "ecx", "edx", "memory"); return r; } #define arch_atomic64_inc_not_zero arch_atomic64_inc_not_zero @@ -215,8 +235,10 @@ static __always_inline int arch_atomic64_inc_not_zero(atomic64_t *v) static __always_inline s64 arch_atomic64_dec_if_positive(atomic64_t *v) { s64 r; - alternative_atomic64(dec_if_positive, "=&A" (r), - "S" (v) : "ecx", "memory"); + alternative_atomic64(dec_if_positive, + "=&A" (r), + "S" (v), + "ecx", "memory"); return r; } #define arch_atomic64_dec_if_positive arch_atomic64_dec_if_positive diff --git a/arch/x86/include/asm/page_64.h b/arch/x86/include/asm/page_64.h index d63576608ce7..d081e8000f34 100644 --- a/arch/x86/include/asm/page_64.h +++ b/arch/x86/include/asm/page_64.h @@ -55,8 +55,8 @@ static inline void clear_page(void *page) clear_page_rep, X86_FEATURE_REP_GOOD, clear_page_erms, X86_FEATURE_ERMS, "=D" (page), - "D" (page) - : "cc", "memory", "rax", "rcx"); + "D" (page), + "cc", "memory", "rax", "rcx"); } void copy_page(void *to, void *from); -- cgit v1.2.3 From 0ec914707c3ed052ed26eb88f9300109030a7fb2 Mon Sep 17 00:00:00 2001 From: Uros Bizjak Date: Mon, 3 Mar 2025 16:54:21 +0100 Subject: x86/irq/32: Use named operands in inline asm Also use inout "+" constraint modifier where appropriate. Signed-off-by: Uros Bizjak Signed-off-by: Ingo Molnar Cc: Andy Lutomirski Cc: Brian Gerst Cc: H. 
Peter Anvin Cc: Linus Torvalds Cc: Peter Zijlstra Link: https://lore.kernel.org/r/20250303155446.112769-1-ubizjak@gmail.com --- arch/x86/kernel/irq_32.c | 20 +++++++++----------- 1 file changed, 9 insertions(+), 11 deletions(-) diff --git a/arch/x86/kernel/irq_32.c b/arch/x86/kernel/irq_32.c index dc1049c01f9b..c4719c40252f 100644 --- a/arch/x86/kernel/irq_32.c +++ b/arch/x86/kernel/irq_32.c @@ -54,12 +54,11 @@ static inline void print_stack_overflow(void) { } static void call_on_stack(void *func, void *stack) { - asm volatile("xchgl %%ebx,%%esp \n" + asm volatile("xchgl %[sp], %%esp\n" CALL_NOSPEC - "movl %%ebx,%%esp \n" - : "=b" (stack) - : "0" (stack), - [thunk_target] "D"(func) + "movl %[sp], %%esp" + : [sp] "+b" (stack) + : [thunk_target] "D" (func) : "memory", "cc", "edx", "ecx", "eax"); } @@ -71,7 +70,7 @@ static inline void *current_stack(void) static inline int execute_on_irq_stack(int overflow, struct irq_desc *desc) { struct irq_stack *curstk, *irqstk; - u32 *isp, *prev_esp, arg1; + u32 *isp, *prev_esp; curstk = (struct irq_stack *) current_stack(); irqstk = __this_cpu_read(pcpu_hot.hardirq_stack_ptr); @@ -94,12 +93,11 @@ static inline int execute_on_irq_stack(int overflow, struct irq_desc *desc) if (unlikely(overflow)) call_on_stack(print_stack_overflow, isp); - asm volatile("xchgl %%ebx,%%esp \n" + asm volatile("xchgl %[sp], %%esp\n" CALL_NOSPEC - "movl %%ebx,%%esp \n" - : "=a" (arg1), "=b" (isp) - : "0" (desc), "1" (isp), - [thunk_target] "D" (desc->handle_irq) + "movl %[sp], %%esp" + : "+a" (desc), [sp] "+b" (isp) + : [thunk_target] "D" (desc->handle_irq) : "memory", "cc", "ecx"); return 1; } -- cgit v1.2.3 From 76f71137811a6dfa52b3e22a86a772e5753021d3 Mon Sep 17 00:00:00 2001 From: Uros Bizjak Date: Mon, 3 Mar 2025 16:54:22 +0100 Subject: x86/irq/32: Add missing clobber to inline asm i386 ABI declares %edx as a call-clobbered register. Signed-off-by: Uros Bizjak Signed-off-by: Ingo Molnar Cc: Andy Lutomirski Cc: Brian Gerst Cc: H. Peter Anvin Cc: Linus Torvalds Cc: Peter Zijlstra Link: https://lore.kernel.org/r/20250303155446.112769-2-ubizjak@gmail.com --- arch/x86/kernel/irq_32.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/x86/kernel/irq_32.c b/arch/x86/kernel/irq_32.c index c4719c40252f..eab458009f97 100644 --- a/arch/x86/kernel/irq_32.c +++ b/arch/x86/kernel/irq_32.c @@ -98,7 +98,7 @@ static inline int execute_on_irq_stack(int overflow, struct irq_desc *desc) "movl %[sp], %%esp" : "+a" (desc), [sp] "+b" (isp) : [thunk_target] "D" (desc->handle_irq) - : "memory", "cc", "ecx"); + : "memory", "cc", "edx", "ecx"); return 1; } -- cgit v1.2.3 From d4432fb5b8798a7663974bed277a8a6e330a50d8 Mon Sep 17 00:00:00 2001 From: Uros Bizjak Date: Mon, 3 Mar 2025 16:54:24 +0100 Subject: x86/irq/32: Use current_stack_pointer to avoid asm() in check_stack_overflow() Make code more readable by using the 'current_stack_pointer' global variable. Signed-off-by: Uros Bizjak Signed-off-by: Ingo Molnar Cc: Andy Lutomirski Cc: Brian Gerst Cc: H. Peter Anvin Cc: Linus Torvalds Cc: Peter Zijlstra Link: https://lore.kernel.org/r/20250303155446.112769-4-ubizjak@gmail.com --- arch/x86/kernel/irq_32.c | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/arch/x86/kernel/irq_32.c b/arch/x86/kernel/irq_32.c index eab458009f97..8c7babbcf6b7 100644 --- a/arch/x86/kernel/irq_32.c +++ b/arch/x86/kernel/irq_32.c @@ -31,10 +31,7 @@ int sysctl_panic_on_stackoverflow __read_mostly; /* Debugging check for stack overflow: is there less than 1KB free? 
*/ static int check_stack_overflow(void) { - long sp; - - __asm__ __volatile__("andl %%esp,%0" : - "=r" (sp) : "0" (THREAD_SIZE - 1)); + unsigned long sp = current_stack_pointer & (THREAD_SIZE - 1); return sp < (sizeof(struct thread_info) + STACK_WARN); } -- cgit v1.2.3 From c8b584fe82d0f1e478a598f954943b095a4a8f5c Mon Sep 17 00:00:00 2001 From: Uros Bizjak Date: Mon, 3 Mar 2025 16:54:25 +0100 Subject: x86/irq/32: Change some static functions to bool The return values of these functions is 0/1, but they use an int type instead of bool: check_stack_overflow() execute_on_irq_stack() Change the type of these function to bool and adjust their return values and affected helper variables. [ mingo: Rewrote the changelog ] Signed-off-by: Uros Bizjak Signed-off-by: Ingo Molnar Cc: Andy Lutomirski Cc: Brian Gerst Cc: H. Peter Anvin Cc: Linus Torvalds Cc: Peter Zijlstra Link: https://lore.kernel.org/r/20250303155446.112769-5-ubizjak@gmail.com --- arch/x86/kernel/irq_32.c | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/arch/x86/kernel/irq_32.c b/arch/x86/kernel/irq_32.c index 8c7babbcf6b7..d301208d35d0 100644 --- a/arch/x86/kernel/irq_32.c +++ b/arch/x86/kernel/irq_32.c @@ -29,7 +29,7 @@ int sysctl_panic_on_stackoverflow __read_mostly; /* Debugging check for stack overflow: is there less than 1KB free? */ -static int check_stack_overflow(void) +static bool check_stack_overflow(void) { unsigned long sp = current_stack_pointer & (THREAD_SIZE - 1); @@ -45,7 +45,7 @@ static void print_stack_overflow(void) } #else -static inline int check_stack_overflow(void) { return 0; } +static inline bool check_stack_overflow(void) { return false; } static inline void print_stack_overflow(void) { } #endif @@ -64,7 +64,7 @@ static inline void *current_stack(void) return (void *)(current_stack_pointer & ~(THREAD_SIZE - 1)); } -static inline int execute_on_irq_stack(int overflow, struct irq_desc *desc) +static inline bool execute_on_irq_stack(bool overflow, struct irq_desc *desc) { struct irq_stack *curstk, *irqstk; u32 *isp, *prev_esp; @@ -79,7 +79,7 @@ static inline int execute_on_irq_stack(int overflow, struct irq_desc *desc) * current stack (which is the irq stack already after all) */ if (unlikely(curstk == irqstk)) - return 0; + return false; isp = (u32 *) ((char *)irqstk + sizeof(*irqstk)); @@ -96,7 +96,7 @@ static inline int execute_on_irq_stack(int overflow, struct irq_desc *desc) : "+a" (desc), [sp] "+b" (isp) : [thunk_target] "D" (desc->handle_irq) : "memory", "cc", "edx", "ecx"); - return 1; + return true; } /* @@ -145,7 +145,7 @@ void do_softirq_own_stack(void) void __handle_irq(struct irq_desc *desc, struct pt_regs *regs) { - int overflow = check_stack_overflow(); + bool overflow = check_stack_overflow(); if (user_mode(regs) || !execute_on_irq_stack(overflow, desc)) { if (unlikely(overflow)) -- cgit v1.2.3
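(A closing editorial note with a hedged userspace sketch; THREAD_SIZE, STACK_WARN and the helper below are illustrative stand-ins, not the kernel's definitions. The check_stack_overflow() rewrite above works because kernel stacks are THREAD_SIZE-aligned and THREAD_SIZE is a power of two: masking the stack pointer with (THREAD_SIZE - 1) yields its offset from the base of the current stack, and since the stack grows down, a small offset means almost no headroom is left.)

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

#define THREAD_SIZE	8192UL	/* must be a power of two */
#define STACK_WARN	1024UL	/* warn below 1KB of headroom */

/* Same shape as check_stack_overflow(): mask, then compare the offset. */
static bool stack_overflow(uintptr_t sp)
{
	return (sp & (THREAD_SIZE - 1)) < STACK_WARN;
}

int main(void)
{
	uintptr_t base = 0x200000;	/* a THREAD_SIZE-aligned stack base */

	printf("%d\n", stack_overflow(base + THREAD_SIZE - 64));	/* 0: plenty left */
	printf("%d\n", stack_overflow(base + 512));			/* 1: nearly full */
	return 0;
}

(The kernel version additionally reserves sizeof(struct thread_info) at the stack base before warning; the sketch folds that allowance into STACK_WARN for brevity.)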