From e2b43fb25243d502ad36b07bab9de09f4b76fff9 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Mon, 1 Dec 2025 17:50:48 -0800 Subject: KVM: x86: Apply runtime updates to current CPUID during KVM_SET_CPUID{,2} When handling KVM_SET_CPUID{,2}, do runtime CPUID updates on the vCPU's current CPUID (and caps) prior to swapping in the incoming CPUID state so that KVM doesn't lose pending updates if the incoming CPUID is rejected, and to prevent a false failure on the equality check. Note, runtime updates are unconditionally performed on the incoming/new CPUID (and associated caps), i.e. clearing the dirty flag won't negatively affect the new CPUID. Fixes: 93da6af3ae56 ("KVM: x86: Defer runtime updates of dynamic CPUID bits until CPUID emulation") Reported-by: Igor Mammedov Closes: https://lore.kernel.org/all/20251128123202.68424a95@imammedo Cc: stable@vger.kernel.org Acked-by: Igor Mammedov Tested-by: Igor Mammedov Link: https://patch.msgid.link/20251202015049.1167490-2-seanjc@google.com Signed-off-by: Sean Christopherson --- arch/x86/kvm/cpuid.c | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) (limited to 'arch') diff --git a/arch/x86/kvm/cpuid.c b/arch/x86/kvm/cpuid.c index 52524e0ca97f..913ffb995279 100644 --- a/arch/x86/kvm/cpuid.c +++ b/arch/x86/kvm/cpuid.c @@ -509,11 +509,18 @@ static int kvm_set_cpuid(struct kvm_vcpu *vcpu, struct kvm_cpuid_entry2 *e2, u32 vcpu_caps[NR_KVM_CPU_CAPS]; int r; + /* + * Apply pending runtime CPUID updates to the current CPUID entries to + * avoid false positives due to mismatches on KVM-owned feature flags. + */ + if (vcpu->arch.cpuid_dynamic_bits_dirty) + kvm_update_cpuid_runtime(vcpu); + /* * Swap the existing (old) entries with the incoming (new) entries in * order to massage the new entries, e.g. to account for dynamic bits - * that KVM controls, without clobbering the current guest CPUID, which - * KVM needs to preserve in order to unwind on failure. 
+ * that KVM controls, without losing the current guest CPUID, which KVM + * needs to preserve in order to unwind on failure. * * Similarly, save the vCPU's current cpu_caps so that the capabilities * can be updated alongside the CPUID entries when performing runtime -- cgit v1.2.3 From da01f64e7470988f8607776aa7afa924208863fb Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Thu, 13 Nov 2025 14:56:13 -0800 Subject: KVM: nSVM: Clear exit_code_hi in VMCB when synthesizing nested VM-Exits Explicitly clear exit_code_hi in the VMCB when synthesizing "normal" nested VM-Exits, as the full exit code is a 64-bit value (spoiler alert), and all exit codes for non-failing VMRUN use only bits 31:0. Cc: Jim Mattson Cc: Yosry Ahmed Cc: stable@vger.kernel.org Reviewed-by: Yosry Ahmed Link: https://patch.msgid.link/20251113225621.1688428-2-seanjc@google.com Signed-off-by: Sean Christopherson --- arch/x86/kvm/svm/svm.c | 2 ++ arch/x86/kvm/svm/svm.h | 7 ++++--- 2 files changed, 6 insertions(+), 3 deletions(-) (limited to 'arch') diff --git a/arch/x86/kvm/svm/svm.c b/arch/x86/kvm/svm/svm.c index 9d29b2e7e855..eeeb4ae4c617 100644 --- a/arch/x86/kvm/svm/svm.c +++ b/arch/x86/kvm/svm/svm.c @@ -2435,6 +2435,7 @@ static bool check_selective_cr0_intercepted(struct kvm_vcpu *vcpu, if (cr0 ^ val) { svm->vmcb->control.exit_code = SVM_EXIT_CR0_SEL_WRITE; + svm->vmcb->control.exit_code_hi = 0; ret = (nested_svm_exit_handled(svm) == NESTED_EXIT_DONE); } @@ -4611,6 +4612,7 @@ static int svm_check_intercept(struct kvm_vcpu *vcpu, if (static_cpu_has(X86_FEATURE_NRIPS)) vmcb->control.next_rip = info->next_rip; vmcb->control.exit_code = icpt_info.exit_code; + vmcb->control.exit_code_hi = 0; vmexit = nested_svm_exit_handled(svm); ret = (vmexit == NESTED_EXIT_DONE) ? 
X86EMUL_INTERCEPTED diff --git a/arch/x86/kvm/svm/svm.h b/arch/x86/kvm/svm/svm.h index dd78e6402345..e66a16e59b1a 100644 --- a/arch/x86/kvm/svm/svm.h +++ b/arch/x86/kvm/svm/svm.h @@ -764,9 +764,10 @@ int nested_svm_vmexit(struct vcpu_svm *svm); static inline int nested_svm_simple_vmexit(struct vcpu_svm *svm, u32 exit_code) { - svm->vmcb->control.exit_code = exit_code; - svm->vmcb->control.exit_info_1 = 0; - svm->vmcb->control.exit_info_2 = 0; + svm->vmcb->control.exit_code = exit_code; + svm->vmcb->control.exit_code_hi = 0; + svm->vmcb->control.exit_info_1 = 0; + svm->vmcb->control.exit_info_2 = 0; return nested_svm_vmexit(svm); } -- cgit v1.2.3 From f402ecd7a8b6446547076f4bd24bd5d4dcc94481 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Thu, 13 Nov 2025 14:56:14 -0800 Subject: KVM: nSVM: Set exit_code_hi to -1 when synthesizing SVM_EXIT_ERR (failed VMRUN) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Set exit_code_hi to -1u as a temporary band-aid to fix a long-standing (effectively since KVM's inception) bug where KVM treats the exit code as a 32-bit value, when in reality it's a 64-bit value. Per the APM, offset 0x70 is a single 64-bit value: 070h 63:0 EXITCODE And a sane reading of the error values defined in "Table C-1. SVM Intercept Codes" is that negative values use the full 64 bits: –1 VMEXIT_INVALID Invalid guest state in VMCB. –2 VMEXIT_BUSY BUSY bit was set in the VMSA –3 VMEXIT_IDLE_REQUIRED The sibling thread is not in an idle state -4 VMEXIT_INVALID_PMC Invalid PMC state And that interpretation is confirmed by testing on Milan and Turin (by setting bits in CR0[63:32] to generate VMEXIT_INVALID on VMRUN). Furthermore, Xen has treated exitcode as a 64-bit value since HVM support was added in 2006 (see Xen commit d1bd157fbc ("Big merge the HVM full-virtualisation abstractions.")).
Cc: Jim Mattson Cc: Yosry Ahmed Cc: stable@vger.kernel.org Reviewed-by: Yosry Ahmed Link: https://patch.msgid.link/20251113225621.1688428-3-seanjc@google.com Signed-off-by: Sean Christopherson --- arch/x86/kvm/svm/nested.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'arch') diff --git a/arch/x86/kvm/svm/nested.c b/arch/x86/kvm/svm/nested.c index da6e80b3ac35..143a0ef02b03 100644 --- a/arch/x86/kvm/svm/nested.c +++ b/arch/x86/kvm/svm/nested.c @@ -983,7 +983,7 @@ int nested_svm_vmrun(struct kvm_vcpu *vcpu) if (!nested_vmcb_check_save(vcpu) || !nested_vmcb_check_controls(vcpu)) { vmcb12->control.exit_code = SVM_EXIT_ERR; - vmcb12->control.exit_code_hi = 0; + vmcb12->control.exit_code_hi = -1u; vmcb12->control.exit_info_1 = 0; vmcb12->control.exit_info_2 = 0; goto out; @@ -1016,7 +1016,7 @@ out_exit_err: svm->soft_int_injected = false; svm->vmcb->control.exit_code = SVM_EXIT_ERR; - svm->vmcb->control.exit_code_hi = 0; + svm->vmcb->control.exit_code_hi = -1u; svm->vmcb->control.exit_info_1 = 0; svm->vmcb->control.exit_info_2 = 0; -- cgit v1.2.3 From b2849bec936be642b5420801f902337f2507648e Mon Sep 17 00:00:00 2001 From: Dongli Zhang Date: Fri, 5 Dec 2025 15:19:04 -0800 Subject: KVM: VMX: Update SVI during runtime APICv activation The APICv (apic->apicv_active) can be activated or deactivated at runtime, for instance, because of APICv inhibit reasons. Intel VMX employs different mechanisms to virtualize LAPIC based on whether APICv is active. When APICv is activated at runtime, GUEST_INTR_STATUS is used to configure and report the current pending IRR and ISR states. Unless a specific vector is explicitly included in EOI_EXIT_BITMAP, its EOI will not be trapped to KVM. Intel VMX automatically clears the corresponding ISR bit based on the GUEST_INTR_STATUS.SVI field. When APICv is deactivated at runtime, the VM_ENTRY_INTR_INFO_FIELD is used to specify the next interrupt vector to invoke upon VM-entry. 
The VMX IDT_VECTORING_INFO_FIELD is used to report un-invoked vectors on VM-exit. EOIs are always trapped to KVM, so the software can manually clear pending ISR bits. There are scenarios where, with APICv activated at runtime, a guest-issued EOI may not be able to clear the pending ISR bit. Taking vector 236 as an example, here is one scenario. 1. Suppose APICv is inactive. Vector 236 is pending in the IRR. 2. To handle KVM_REQ_EVENT, KVM moves vector 236 from the IRR to the ISR, and configures the VM_ENTRY_INTR_INFO_FIELD via vmx_inject_irq(). 3. After VM-entry, vector 236 is invoked through the guest IDT. At this point, the data in VM_ENTRY_INTR_INFO_FIELD is no longer valid. The guest interrupt handler for vector 236 is invoked. 4. Suppose a VM exit occurs very early in the guest interrupt handler, before the EOI is issued. 5. Nothing is reported through the IDT_VECTORING_INFO_FIELD because vector 236 has already been invoked in the guest. 6. Now, suppose APICv is activated. Before the next VM-entry, KVM calls kvm_vcpu_update_apicv() to activate APICv. 7. Unfortunately, GUEST_INTR_STATUS.SVI is not configured, although vector 236 is still pending in the ISR. 8. After VM-entry, the guest finally issues the EOI for vector 236. However, because SVI is not configured, vector 236 is not cleared. 9. ISR is stalled forever on vector 236. Here is another scenario. 1. Suppose APICv is inactive. Vector 236 is pending in the IRR. 2. To handle KVM_REQ_EVENT, KVM moves vector 236 from the IRR to the ISR, and configures the VM_ENTRY_INTR_INFO_FIELD via vmx_inject_irq(). 3. VM-exit occurs immediately after the next VM-entry. The vector 236 is not invoked through the guest IDT. Instead, it is saved to the IDT_VECTORING_INFO_FIELD during the VM-exit. 4. KVM calls kvm_queue_interrupt() to re-queue the un-invoked vector 236 into vcpu->arch.interrupt. A KVM_REQ_EVENT is requested. 5. Now, suppose APICv is activated. 
Before the next VM-entry, KVM calls kvm_vcpu_update_apicv() to activate APICv. 6. Although APICv is now active, KVM still uses the legacy VM_ENTRY_INTR_INFO_FIELD to re-inject vector 236. GUEST_INTR_STATUS.SVI is not configured. 7. After the next VM-entry, vector 236 is invoked through the guest IDT. Finally, an EOI occurs. However, due to the lack of GUEST_INTR_STATUS.SVI configuration, vector 236 is not cleared from the ISR. 8. ISR is stalled forever on vector 236. Using QEMU as an example, vector 236 is stuck in ISR forever. (qemu) info lapic 1 dumping local APIC state for CPU 1 LVT0 0x00010700 active-hi edge masked ExtINT (vec 0) LVT1 0x00010400 active-hi edge masked NMI LVTPC 0x00000400 active-hi edge NMI LVTERR 0x000000fe active-hi edge Fixed (vec 254) LVTTHMR 0x00010000 active-hi edge masked Fixed (vec 0) LVTT 0x000400ec active-hi edge tsc-deadline Fixed (vec 236) Timer DCR=0x0 (divide by 2) initial_count = 0 current_count = 0 SPIV 0x000001ff APIC enabled, focus=off, spurious vec 255 ICR 0x000000fd physical edge de-assert no-shorthand ICR2 0x00000000 cpu 0 (X2APIC ID) ESR 0x00000000 ISR 236 IRR 37(level) 236 The issue isn't applicable to AMD SVM as KVM simply writes vmcb01 directly irrespective of whether L1 (vmcs01) or L2 (vmcb02) is active (unlike VMX, there is no need/cost to switch between VMCBs). In addition, APICV_INHIBIT_REASON_IRQWIN ensures AMD SVM AVIC is not activated until the last interrupt is EOI'd. Fix the bug by configuring Intel VMX GUEST_INTR_STATUS.SVI if APICv is activated at runtime. 
Signed-off-by: Dongli Zhang Reviewed-by: Chao Gao Link: https://patch.msgid.link/20251110063212.34902-1-dongli.zhang@oracle.com [sean: call out that SVM writes vmcb01 directly, tweak comment] Link: https://patch.msgid.link/20251205231913.441872-2-seanjc@google.com Signed-off-by: Sean Christopherson --- arch/x86/kvm/vmx/vmx.c | 9 --------- arch/x86/kvm/x86.c | 7 +++++++ 2 files changed, 7 insertions(+), 9 deletions(-) (limited to 'arch') diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c index 91b6f2f3edc2..c3b9eb72b6f3 100644 --- a/arch/x86/kvm/vmx/vmx.c +++ b/arch/x86/kvm/vmx/vmx.c @@ -6886,15 +6886,6 @@ void vmx_hwapic_isr_update(struct kvm_vcpu *vcpu, int max_isr) * VM-Exit, otherwise L1 with run with a stale SVI. */ if (is_guest_mode(vcpu)) { - /* - * KVM is supposed to forward intercepted L2 EOIs to L1 if VID - * is enabled in vmcs12; as above, the EOIs affect L2's vAPIC. - * Note, userspace can stuff state while L2 is active; assert - * that VID is disabled if and only if the vCPU is in KVM_RUN - * to avoid false positives if userspace is setting APIC state. - */ - WARN_ON_ONCE(vcpu->wants_to_run && - nested_cpu_has_vid(get_vmcs12(vcpu))); to_vmx(vcpu)->nested.update_vmcs01_hwapic_isr = true; return; } diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index c9c2aa6f4705..82036205945f 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -10877,9 +10877,16 @@ void __kvm_vcpu_update_apicv(struct kvm_vcpu *vcpu) * pending. At the same time, KVM_REQ_EVENT may not be set as APICv was * still active when the interrupt got accepted. Make sure * kvm_check_and_inject_events() is called to check for that. + * + * Update SVI when APICv gets enabled, otherwise SVI won't reflect the + * highest bit in vISR and the next accelerated EOI in the guest won't + * be virtualized correctly (the CPU uses SVI to determine which vISR + * vector to clear). 
*/ if (!apic->apicv_active) kvm_make_request(KVM_REQ_EVENT, vcpu); + else + kvm_apic_update_hwapic_isr(vcpu); out: preempt_enable(); -- cgit v1.2.3 From 29763138830916f46daaa50e83e7f4f907a3236b Mon Sep 17 00:00:00 2001 From: Dongli Zhang Date: Fri, 5 Dec 2025 15:19:05 -0800 Subject: KVM: nVMX: Immediately refresh APICv controls as needed on nested VM-Exit If an APICv status update was pended while L2 was active, immediately refresh vmcs01's controls instead of pending KVM_REQ_APICV_UPDATE as kvm_vcpu_update_apicv() only calls into vendor code if a change is necessary. E.g. if APICv is inhibited, and then activated while L2 is running: kvm_vcpu_update_apicv() | -> __kvm_vcpu_update_apicv() | -> apic->apicv_active = true | -> vmx_refresh_apicv_exec_ctrl() | -> vmx->nested.update_vmcs01_apicv_status = true | -> return Then L2 exits to L1: __nested_vmx_vmexit() | -> kvm_make_request(KVM_REQ_APICV_UPDATE) vcpu_enter_guest(): KVM_REQ_APICV_UPDATE -> kvm_vcpu_update_apicv() | -> __kvm_vcpu_update_apicv() | -> return // because if (apic->apicv_active == activate) Reported-by: Chao Gao Closes: https://lore.kernel.org/all/aQ2jmnN8wUYVEawF@intel.com Fixes: 7c69661e225c ("KVM: nVMX: Defer APICv updates while L2 is active until L1 is active") Cc: stable@vger.kernel.org Signed-off-by: Dongli Zhang [sean: write changelog] Link: https://patch.msgid.link/20251205231913.441872-3-seanjc@google.com Signed-off-by: Sean Christopherson --- arch/x86/kvm/vmx/nested.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'arch') diff --git a/arch/x86/kvm/vmx/nested.c b/arch/x86/kvm/vmx/nested.c index bcea087b642f..1725c6a94f99 100644 --- a/arch/x86/kvm/vmx/nested.c +++ b/arch/x86/kvm/vmx/nested.c @@ -19,6 +19,7 @@ #include "trace.h" #include "vmx.h" #include "smm.h" +#include "x86_ops.h" static bool __read_mostly enable_shadow_vmcs = 1; module_param_named(enable_shadow_vmcs, enable_shadow_vmcs, bool, S_IRUGO); @@ -5216,7 +5217,7 @@ void __nested_vmx_vmexit(struct kvm_vcpu
*vcpu, u32 vm_exit_reason, if (vmx->nested.update_vmcs01_apicv_status) { vmx->nested.update_vmcs01_apicv_status = false; - kvm_make_request(KVM_REQ_APICV_UPDATE, vcpu); + vmx_refresh_apicv_exec_ctrl(vcpu); } if (vmx->nested.update_vmcs01_hwapic_isr) { -- cgit v1.2.3 From 189e5deb944a6f9c7992355d60bffd8ec2e54a9c Mon Sep 17 00:00:00 2001 From: Ondrej Mosnacek Date: Thu, 4 Dec 2025 13:59:16 +0100 Subject: bpf, arm64: Do not audit capability check in do_jit() Analogously to the x86 commit 881a9c9cb785 ("bpf: Do not audit capability check in do_jit()"), change the capable() call to ns_capable_noaudit() in order to avoid spurious SELinux denials in audit log. The commit log from that commit applies here as well: """ The failure of this check only results in a security mitigation being applied, slightly affecting performance of the compiled BPF program. It doesn't result in a failed syscall, and thus auditing a failed LSM permission check for it is unwanted. For example with SELinux, it causes a denial to be reported for confined processes running as root, which tends to be flagged as a problem to be fixed in the policy. Yet dontauditing or allowing CAP_SYS_ADMIN to the domain may not be desirable, as it would allow/silence also other checks - either going against the principle of least privilege or making debugging potentially harder. Fix it by changing it from capable() to ns_capable_noaudit(), which instructs the LSMs to not audit the resulting denials.
""" Fixes: f300769ead03 ("arm64: bpf: Only mitigate cBPF programs loaded by unprivileged users") Signed-off-by: Ondrej Mosnacek Link: https://lore.kernel.org/r/20251204125916.441021-1-omosnace@redhat.com Signed-off-by: Alexei Starovoitov --- arch/arm64/net/bpf_jit_comp.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch') diff --git a/arch/arm64/net/bpf_jit_comp.c b/arch/arm64/net/bpf_jit_comp.c index 74dd29816f36..b6eb7a465ad2 100644 --- a/arch/arm64/net/bpf_jit_comp.c +++ b/arch/arm64/net/bpf_jit_comp.c @@ -1004,7 +1004,7 @@ static void __maybe_unused build_bhb_mitigation(struct jit_ctx *ctx) arm64_get_spectre_v2_state() == SPECTRE_VULNERABLE) return; - if (capable(CAP_SYS_ADMIN)) + if (ns_capable_noaudit(&init_user_ns, CAP_SYS_ADMIN)) return; if (supports_clearbhb(SCOPE_SYSTEM)) { -- cgit v1.2.3 From ca45c84afb8c91a8d688b0012657099c24f59266 Mon Sep 17 00:00:00 2001 From: Josh Poimboeuf Date: Wed, 3 Dec 2025 19:32:15 -0800 Subject: bpf: Add bpf_has_frame_pointer() Introduce a bpf_has_frame_pointer() helper that unwinders can call to determine whether a given instruction pointer is within the valid frame pointer region of a BPF JIT program or trampoline (i.e., after the prologue, before the epilogue). This will enable livepatch (with the ORC unwinder) to reliably unwind through BPF JIT frames. 
Acked-by: Song Liu Acked-and-tested-by: Andrey Grodzovsky Signed-off-by: Josh Poimboeuf Link: https://lore.kernel.org/r/fd2bc5b4e261a680774b28f6100509fd5ebad2f0.1764818927.git.jpoimboe@kernel.org Signed-off-by: Alexei Starovoitov Reviewed-by: Jiri Olsa --- arch/x86/net/bpf_jit_comp.c | 12 ++++++++++++ 1 file changed, 12 insertions(+) (limited to 'arch') diff --git a/arch/x86/net/bpf_jit_comp.c b/arch/x86/net/bpf_jit_comp.c index b69dc7194e2c..b0bac2a66eff 100644 --- a/arch/x86/net/bpf_jit_comp.c +++ b/arch/x86/net/bpf_jit_comp.c @@ -1678,6 +1678,9 @@ static int do_jit(struct bpf_prog *bpf_prog, int *addrs, u8 *image, u8 *rw_image emit_prologue(&prog, image, stack_depth, bpf_prog_was_classic(bpf_prog), tail_call_reachable, bpf_is_subprog(bpf_prog), bpf_prog->aux->exception_cb); + + bpf_prog->aux->ksym.fp_start = prog - temp; + /* Exception callback will clobber callee regs for its own use, and * restore the original callee regs from main prog's stack frame. */ @@ -2736,6 +2739,8 @@ emit_jmp: pop_r12(&prog); } EMIT1(0xC9); /* leave */ + bpf_prog->aux->ksym.fp_end = prog - temp; + emit_return(&prog, image + addrs[i - 1] + (prog - temp)); break; @@ -3325,6 +3330,9 @@ static int __arch_prepare_bpf_trampoline(struct bpf_tramp_image *im, void *rw_im } EMIT1(0x55); /* push rbp */ EMIT3(0x48, 0x89, 0xE5); /* mov rbp, rsp */ + if (im) + im->ksym.fp_start = prog - (u8 *)rw_image; + if (!is_imm8(stack_size)) { /* sub rsp, stack_size */ EMIT3_off32(0x48, 0x81, 0xEC, stack_size); @@ -3462,7 +3470,11 @@ static int __arch_prepare_bpf_trampoline(struct bpf_tramp_image *im, void *rw_im emit_ldx(&prog, BPF_DW, BPF_REG_0, BPF_REG_FP, -8); emit_ldx(&prog, BPF_DW, BPF_REG_6, BPF_REG_FP, -rbx_off); + EMIT1(0xC9); /* leave */ + if (im) + im->ksym.fp_end = prog - (u8 *)rw_image; + if (flags & BPF_TRAMP_F_SKIP_FRAME) { /* skip our return address and return to parent */ EMIT4(0x48, 0x83, 0xC4, 8); /* add rsp, 8 */ -- cgit v1.2.3 From 01bc3b6db18d6e0a2e93c37885996bf339bfe337 Mon Sep 17 
00:00:00 2001 From: Josh Poimboeuf Date: Wed, 3 Dec 2025 19:32:16 -0800 Subject: x86/unwind/orc: Support reliable unwinding through BPF stack frames BPF JIT programs and trampolines use a frame pointer, so the current ORC unwinder strategy of falling back to frame pointers (when an ORC entry is missing) usually works in practice when unwinding through BPF JIT stack frames. However, that frame pointer fallback is just a guess, so the unwind gets marked unreliable for live patching, which can cause livepatch transition stalls. Make the common case reliable by calling the bpf_has_frame_pointer() helper to detect the valid frame pointer region of BPF JIT programs and trampolines. Fixes: ee9f8fce9964 ("x86/unwind: Add the ORC unwinder") Reported-by: Andrey Grodzovsky Closes: https://lore.kernel.org/0e555733-c670-4e84-b2e6-abb8b84ade38@crowdstrike.com Acked-by: Song Liu Acked-and-tested-by: Andrey Grodzovsky Signed-off-by: Josh Poimboeuf Link: https://lore.kernel.org/r/a18505975662328c8ffb1090dded890c6f8c1004.1764818927.git.jpoimboe@kernel.org Signed-off-by: Alexei Starovoitov Reviewed-by: Jiri Olsa --- arch/x86/kernel/unwind_orc.c | 39 +++++++++++++++++++++++++++------------ 1 file changed, 27 insertions(+), 12 deletions(-) (limited to 'arch') diff --git a/arch/x86/kernel/unwind_orc.c b/arch/x86/kernel/unwind_orc.c index 977ee75e047c..f610fde2d5c4 100644 --- a/arch/x86/kernel/unwind_orc.c +++ b/arch/x86/kernel/unwind_orc.c @@ -2,6 +2,7 @@ #include #include #include +#include #include #include #include @@ -172,6 +173,25 @@ static struct orc_entry *orc_ftrace_find(unsigned long ip) } #endif +/* Fake frame pointer entry -- used as a fallback for generated code */ +static struct orc_entry orc_fp_entry = { + .type = ORC_TYPE_CALL, + .sp_reg = ORC_REG_BP, + .sp_offset = 16, + .bp_reg = ORC_REG_PREV_SP, + .bp_offset = -16, +}; + +static struct orc_entry *orc_bpf_find(unsigned long ip) +{ +#ifdef CONFIG_BPF_JIT + if (bpf_has_frame_pointer(ip)) + return &orc_fp_entry; +#endif + 
+ return NULL; +} + /* * If we crash with IP==0, the last successfully executed instruction * was probably an indirect function call with a NULL function pointer, @@ -186,15 +206,6 @@ static struct orc_entry null_orc_entry = { .type = ORC_TYPE_CALL }; -/* Fake frame pointer entry -- used as a fallback for generated code */ -static struct orc_entry orc_fp_entry = { - .type = ORC_TYPE_CALL, - .sp_reg = ORC_REG_BP, - .sp_offset = 16, - .bp_reg = ORC_REG_PREV_SP, - .bp_offset = -16, -}; - static struct orc_entry *orc_find(unsigned long ip) { static struct orc_entry *orc; @@ -238,6 +249,11 @@ static struct orc_entry *orc_find(unsigned long ip) if (orc) return orc; + /* BPF lookup: */ + orc = orc_bpf_find(ip); + if (orc) + return orc; + return orc_ftrace_find(ip); } @@ -495,9 +511,8 @@ bool unwind_next_frame(struct unwind_state *state) if (!orc) { /* * As a fallback, try to assume this code uses a frame pointer. - * This is useful for generated code, like BPF, which ORC - * doesn't know about. This is just a guess, so the rest of - * the unwind is no longer considered reliable. + * This is just a guess, so the rest of the unwind is no longer + * considered reliable. */ orc = &orc_fp_entry; state->error = true; -- cgit v1.2.3 From c8161e5304abb26e6c0bec6efc947992500fa6c5 Mon Sep 17 00:00:00 2001 From: Yongxin Liu Date: Wed, 10 Dec 2025 08:02:20 +0800 Subject: x86/fpu: Fix FPU state core dump truncation on CPUs with no extended xfeatures Zero can be a valid value of num_records. For example, on Intel Atom x6425RE, only x87 and SSE are supported (features 0, 1), and fpu_user_cfg.max_features is 3. The for_each_extended_xfeature() loop only iterates feature 2, which is not enabled, so num_records = 0. This is valid and should not cause core dump failure. The issue is that dump_xsave_layout_desc() returns 0 for both genuine errors (dump_emit() failure) and valid cases (no extended features). Use negative return values for errors and only abort on genuine failures. 
Fixes: ba386777a30b ("x86/elf: Add a new FPU buffer layout info to x86 core files") Signed-off-by: Yongxin Liu Signed-off-by: Ingo Molnar Link: https://patch.msgid.link/20251210000219.4094353-2-yongxin.liu@windriver.com --- arch/x86/kernel/fpu/xstate.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'arch') diff --git a/arch/x86/kernel/fpu/xstate.c b/arch/x86/kernel/fpu/xstate.c index 48113c5193aa..76153dfb58c9 100644 --- a/arch/x86/kernel/fpu/xstate.c +++ b/arch/x86/kernel/fpu/xstate.c @@ -1946,7 +1946,7 @@ static int dump_xsave_layout_desc(struct coredump_params *cprm) }; if (!dump_emit(cprm, &xc, sizeof(xc))) - return 0; + return -1; num_records++; } @@ -1984,7 +1984,7 @@ int elf_coredump_extra_notes_write(struct coredump_params *cprm) return 1; num_records = dump_xsave_layout_desc(cprm); - if (!num_records) + if (num_records < 0) return 1; /* Total size should be equal to the number of records */ -- cgit v1.2.3 From 043507144ae13d3b882d40495d101bb4c4990d98 Mon Sep 17 00:00:00 2001 From: Thorsten Blum Date: Wed, 10 Dec 2025 13:56:28 +0100 Subject: x86/sgx: Remove unmatched quote in __sgx_encl_extend function comment There is no opening quote. Remove the unmatched closing quote. Signed-off-by: Thorsten Blum Signed-off-by: Ingo Molnar Reviewed-by: Kai Huang Link: https://patch.msgid.link/20251210125628.544916-1-thorsten.blum@linux.dev --- arch/x86/kernel/cpu/sgx/ioctl.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch') diff --git a/arch/x86/kernel/cpu/sgx/ioctl.c b/arch/x86/kernel/cpu/sgx/ioctl.c index 66f1efa16fbb..9322a9287dc7 100644 --- a/arch/x86/kernel/cpu/sgx/ioctl.c +++ b/arch/x86/kernel/cpu/sgx/ioctl.c @@ -242,7 +242,7 @@ static int __sgx_encl_add_page(struct sgx_encl *encl, /* * If the caller requires measurement of the page as a proof for the content, * use EEXTEND to add a measurement for 256 bytes of the page. Repeat this - * operation until the entire page is measured." 
+ * operation until the entire page is measured. */ static int __sgx_encl_extend(struct sgx_encl *encl, struct sgx_epc_page *epc_page) -- cgit v1.2.3 From 21433d3e3ca14d20f9b0c2237b3d3a1355af7907 Mon Sep 17 00:00:00 2001 From: Kyle Meyer Date: Fri, 12 Dec 2025 12:53:36 -0600 Subject: x86/platform/uv: Fix UBSAN array-index-out-of-bounds When UBSAN is enabled, multiple array-index-out-of-bounds messages are printed: [ 0.000000] [ T0] UBSAN: array-index-out-of-bounds in arch/x86/kernel/apic/x2apic_uv_x.c:276:23 [ 0.000000] [ T0] index 1 is out of range for type ' [1]' ... [ 0.000000] [ T0] UBSAN: array-index-out-of-bounds in arch/x86/kernel/apic/x2apic_uv_x.c:277:32 [ 0.000000] [ T0] index 1 is out of range for type ' [1]' ... [ 0.000000] [ T0] UBSAN: array-index-out-of-bounds in arch/x86/kernel/apic/x2apic_uv_x.c:282:16 [ 0.000000] [ T0] index 1 is out of range for type ' [1]' ... [ 0.515850] [ T1] UBSAN: array-index-out-of-bounds in arch/x86/kernel/apic/x2apic_uv_x.c:1344:23 [ 0.519851] [ T1] index 1 is out of range for type ' [1]' ... [ 0.603850] [ T1] UBSAN: array-index-out-of-bounds in arch/x86/kernel/apic/x2apic_uv_x.c:1345:32 [ 0.607850] [ T1] index 1 is out of range for type ' [1]' ... [ 0.691850] [ T1] UBSAN: array-index-out-of-bounds in arch/x86/kernel/apic/x2apic_uv_x.c:1353:20 [ 0.695850] [ T1] index 1 is out of range for type ' [1]' One-element arrays have been deprecated: https://docs.kernel.org/process/deprecated.html#zero-length-and-one-element-arrays Switch entry in struct uv_systab to a flexible array member to fix UBSAN array-index-out-of-bounds messages. sizeof(struct uv_systab) is passed to early_memremap() and ioremap(). The flexible array member is not accessed until the UV system table size is used to remap the entire UV system table, so changes to sizeof(struct uv_systab) have no impact. 
Signed-off-by: Kyle Meyer Signed-off-by: Ingo Molnar Link: https://patch.msgid.link/aTxksN-3otY41WvQ@hpe.com --- arch/x86/include/asm/uv/bios.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch') diff --git a/arch/x86/include/asm/uv/bios.h b/arch/x86/include/asm/uv/bios.h index 6989b824fd32..d0b62e255290 100644 --- a/arch/x86/include/asm/uv/bios.h +++ b/arch/x86/include/asm/uv/bios.h @@ -122,7 +122,7 @@ struct uv_systab { struct { u32 type:8; /* type of entry */ u32 offset:24; /* byte offset from struct start to entry */ - } entry[1]; /* additional entries follow */ + } entry[]; /* additional entries follow */ }; extern struct uv_systab *uv_systab; -- cgit v1.2.3 From b1aa01d31249bd116b18c7f512d3e46b4b4ad83b Mon Sep 17 00:00:00 2001 From: Sven Schnelle Date: Fri, 5 Dec 2025 10:58:57 +0100 Subject: s390/ipl: Clear SBP flag when bootprog is set With z16 a new flag 'search boot program' was introduced for list-directed IPL (SCSI, NVMe, ECKD DASD). If this flag is set, e.g. via selecting the "Automatic" value for the "Boot program selector" control on an HMC load panel, it is copied to the reipl structure from the initial ipl structure. When a user now sets a boot prog via sysfs, the flag is not cleared and the bootloader will again automatically select the boot program, ignoring user configuration. To avoid that, clear the SBP flag when a bootprog sysfs file is written. 
Cc: stable@vger.kernel.org Reviewed-by: Peter Oberparleiter Reviewed-by: Heiko Carstens Signed-off-by: Sven Schnelle Signed-off-by: Heiko Carstens --- arch/s390/include/uapi/asm/ipl.h | 1 + arch/s390/kernel/ipl.c | 48 ++++++++++++++++++++++++++++++---------- 2 files changed, 37 insertions(+), 12 deletions(-) (limited to 'arch') diff --git a/arch/s390/include/uapi/asm/ipl.h b/arch/s390/include/uapi/asm/ipl.h index 2cd28af50dd4..3d64a2251699 100644 --- a/arch/s390/include/uapi/asm/ipl.h +++ b/arch/s390/include/uapi/asm/ipl.h @@ -15,6 +15,7 @@ struct ipl_pl_hdr { #define IPL_PL_FLAG_IPLPS 0x80 #define IPL_PL_FLAG_SIPL 0x40 #define IPL_PL_FLAG_IPLSR 0x20 +#define IPL_PL_FLAG_SBP 0x10 /* IPL Parameter Block header */ struct ipl_pb_hdr { diff --git a/arch/s390/kernel/ipl.c b/arch/s390/kernel/ipl.c index 961a3d60a4dd..dcdc7e274848 100644 --- a/arch/s390/kernel/ipl.c +++ b/arch/s390/kernel/ipl.c @@ -262,6 +262,24 @@ static struct kobj_attribute sys_##_prefix##_##_name##_attr = \ sys_##_prefix##_##_name##_show, \ sys_##_prefix##_##_name##_store) +#define DEFINE_IPL_ATTR_BOOTPROG_RW(_prefix, _name, _fmt_out, _fmt_in, _hdr, _value) \ + IPL_ATTR_SHOW_FN(_prefix, _name, _fmt_out, (unsigned long long) _value) \ +static ssize_t sys_##_prefix##_##_name##_store(struct kobject *kobj, \ + struct kobj_attribute *attr, \ + const char *buf, size_t len) \ +{ \ + unsigned long long value; \ + if (sscanf(buf, _fmt_in, &value) != 1) \ + return -EINVAL; \ + (_value) = value; \ + (_hdr).flags &= ~IPL_PL_FLAG_SBP; \ + return len; \ +} \ +static struct kobj_attribute sys_##_prefix##_##_name##_attr = \ + __ATTR(_name, 0644, \ + sys_##_prefix##_##_name##_show, \ + sys_##_prefix##_##_name##_store) + #define DEFINE_IPL_ATTR_STR_RW(_prefix, _name, _fmt_out, _fmt_in, _value)\ IPL_ATTR_SHOW_FN(_prefix, _name, _fmt_out, _value) \ static ssize_t sys_##_prefix##_##_name##_store(struct kobject *kobj, \ @@ -818,12 +836,13 @@ DEFINE_IPL_ATTR_RW(reipl_fcp, wwpn, "0x%016llx\n", "%llx\n", 
reipl_block_fcp->fcp.wwpn); DEFINE_IPL_ATTR_RW(reipl_fcp, lun, "0x%016llx\n", "%llx\n", reipl_block_fcp->fcp.lun); -DEFINE_IPL_ATTR_RW(reipl_fcp, bootprog, "%lld\n", "%lld\n", - reipl_block_fcp->fcp.bootprog); DEFINE_IPL_ATTR_RW(reipl_fcp, br_lba, "%lld\n", "%lld\n", reipl_block_fcp->fcp.br_lba); DEFINE_IPL_ATTR_RW(reipl_fcp, device, "0.0.%04llx\n", "0.0.%llx\n", reipl_block_fcp->fcp.devno); +DEFINE_IPL_ATTR_BOOTPROG_RW(reipl_fcp, bootprog, "%lld\n", "%lld\n", + reipl_block_fcp->hdr, + reipl_block_fcp->fcp.bootprog); static void reipl_get_ascii_loadparm(char *loadparm, struct ipl_parameter_block *ibp) @@ -942,10 +961,11 @@ DEFINE_IPL_ATTR_RW(reipl_nvme, fid, "0x%08llx\n", "%llx\n", reipl_block_nvme->nvme.fid); DEFINE_IPL_ATTR_RW(reipl_nvme, nsid, "0x%08llx\n", "%llx\n", reipl_block_nvme->nvme.nsid); -DEFINE_IPL_ATTR_RW(reipl_nvme, bootprog, "%lld\n", "%lld\n", - reipl_block_nvme->nvme.bootprog); DEFINE_IPL_ATTR_RW(reipl_nvme, br_lba, "%lld\n", "%lld\n", reipl_block_nvme->nvme.br_lba); +DEFINE_IPL_ATTR_BOOTPROG_RW(reipl_nvme, bootprog, "%lld\n", "%lld\n", + reipl_block_nvme->hdr, + reipl_block_nvme->nvme.bootprog); static struct attribute *reipl_nvme_attrs[] = { &sys_reipl_nvme_fid_attr.attr, @@ -1038,8 +1058,9 @@ static const struct bin_attribute *const reipl_eckd_bin_attrs[] = { }; DEFINE_IPL_CCW_ATTR_RW(reipl_eckd, device, reipl_block_eckd->eckd); -DEFINE_IPL_ATTR_RW(reipl_eckd, bootprog, "%lld\n", "%lld\n", - reipl_block_eckd->eckd.bootprog); +DEFINE_IPL_ATTR_BOOTPROG_RW(reipl_eckd, bootprog, "%lld\n", "%lld\n", + reipl_block_eckd->hdr, + reipl_block_eckd->eckd.bootprog); static struct attribute *reipl_eckd_attrs[] = { &sys_reipl_eckd_device_attr.attr, @@ -1567,12 +1588,13 @@ DEFINE_IPL_ATTR_RW(dump_fcp, wwpn, "0x%016llx\n", "%llx\n", dump_block_fcp->fcp.wwpn); DEFINE_IPL_ATTR_RW(dump_fcp, lun, "0x%016llx\n", "%llx\n", dump_block_fcp->fcp.lun); -DEFINE_IPL_ATTR_RW(dump_fcp, bootprog, "%lld\n", "%lld\n", - dump_block_fcp->fcp.bootprog); 
DEFINE_IPL_ATTR_RW(dump_fcp, br_lba, "%lld\n", "%lld\n", dump_block_fcp->fcp.br_lba); DEFINE_IPL_ATTR_RW(dump_fcp, device, "0.0.%04llx\n", "0.0.%llx\n", dump_block_fcp->fcp.devno); +DEFINE_IPL_ATTR_BOOTPROG_RW(dump_fcp, bootprog, "%lld\n", "%lld\n", + dump_block_fcp->hdr, + dump_block_fcp->fcp.bootprog); DEFINE_IPL_ATTR_SCP_DATA_RW(dump_fcp, dump_block_fcp->hdr, dump_block_fcp->fcp, @@ -1604,10 +1626,11 @@ DEFINE_IPL_ATTR_RW(dump_nvme, fid, "0x%08llx\n", "%llx\n", dump_block_nvme->nvme.fid); DEFINE_IPL_ATTR_RW(dump_nvme, nsid, "0x%08llx\n", "%llx\n", dump_block_nvme->nvme.nsid); -DEFINE_IPL_ATTR_RW(dump_nvme, bootprog, "%lld\n", "%llx\n", - dump_block_nvme->nvme.bootprog); DEFINE_IPL_ATTR_RW(dump_nvme, br_lba, "%lld\n", "%llx\n", dump_block_nvme->nvme.br_lba); +DEFINE_IPL_ATTR_BOOTPROG_RW(dump_nvme, bootprog, "%lld\n", "%llx\n", + dump_block_nvme->hdr, + dump_block_nvme->nvme.bootprog); DEFINE_IPL_ATTR_SCP_DATA_RW(dump_nvme, dump_block_nvme->hdr, dump_block_nvme->nvme, @@ -1635,8 +1658,9 @@ static const struct attribute_group dump_nvme_attr_group = { /* ECKD dump device attributes */ DEFINE_IPL_CCW_ATTR_RW(dump_eckd, device, dump_block_eckd->eckd); -DEFINE_IPL_ATTR_RW(dump_eckd, bootprog, "%lld\n", "%llx\n", - dump_block_eckd->eckd.bootprog); +DEFINE_IPL_ATTR_BOOTPROG_RW(dump_eckd, bootprog, "%lld\n", "%llx\n", + dump_block_eckd->hdr, + dump_block_eckd->eckd.bootprog); IPL_ATTR_BR_CHR_SHOW_FN(dump, dump_block_eckd->eckd); IPL_ATTR_BR_CHR_STORE_FN(dump, dump_block_eckd->eckd); -- cgit v1.2.3 From 4cb92fa763823d813d22b45b7f18fcf6e85a72ad Mon Sep 17 00:00:00 2001 From: Benjamin Block Date: Fri, 5 Dec 2025 16:47:17 +0100 Subject: s390/pci: Fix cyclic dead-lock in zpci_zdev_put() and zpci_scan_devices() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit When triggering PCI device recovery by writing into the SysFS attribute `recover` of a Physical Function with existing child SR-IOV Virtual Functions, lockdep is reporting a 
possible deadlock between three threads: Thread (A) Thread (B) Thread (C) | | | recover_store() zpci_scan_devices() zpci_scan_devices() lock(pci_rescan_remove_lock) | | | | | | | zpci_bus_scan_busses() | | lock(zbus_list_lock) | zpci_add_device() | | lock(zpci_add_remove_lock) | | | ┴ | | zpci_bus_scan_bus() | | lock(pci_rescan_remove_lock) ┴ | zpci_zdev_put() | lock(zpci_add_remove_lock) | ┴ zpci_bus_get() lock(zbus_list_lock) In zpci_bus_scan_busses() the `zbus_list_lock` is taken for the whole duration of the function, which also includes taking `pci_rescan_remove_lock`, among other things. But `zbus_list_lock` only really needs to protect the modification of the global registration `zbus_list`, it can be dropped while the functions within the list iteration run; this way we break the cycle above. Break up zpci_bus_scan_busses() into an "iterator" zpci_bus_get_next() that iterates over `zbus_list` element by element, and acquires and releases `zbus_list_lock` as necessary, but never keeps holding it. References to `zpci_bus` objects are also acquired and released. The reference counting on `zpci_bus` objects is also changed so that all put() and get() operations are done under the protection of `zbus_list_lock`, and if the operation results in a modification of `zbus_list`, this modification is done in the same critical section (apart from the very first initialization). This way objects are never seen on the list that are about to be released and/or half-initialized. 
Fixes: 14c87ba8123a ("s390/pci: separate zbus registration from scanning") Suggested-by: Niklas Schnelle Signed-off-by: Benjamin Block Reviewed-by: Niklas Schnelle Reviewed-by: Gerd Bayer Signed-off-by: Heiko Carstens --- arch/s390/pci/pci.c | 6 ++- arch/s390/pci/pci_bus.c | 98 +++++++++++++++++++++++++++++++++++-------------- arch/s390/pci/pci_bus.h | 15 +++++++- 3 files changed, 90 insertions(+), 29 deletions(-) (limited to 'arch') diff --git a/arch/s390/pci/pci.c b/arch/s390/pci/pci.c index 5a6ace9d875a..8fd14d043008 100644 --- a/arch/s390/pci/pci.c +++ b/arch/s390/pci/pci.c @@ -1148,6 +1148,7 @@ static void zpci_add_devices(struct list_head *scan_list) int zpci_scan_devices(void) { + struct zpci_bus *zbus; LIST_HEAD(scan_list); int rc; @@ -1156,7 +1157,10 @@ int zpci_scan_devices(void) return rc; zpci_add_devices(&scan_list); - zpci_bus_scan_busses(); + zpci_bus_for_each(zbus) { + zpci_bus_scan_bus(zbus); + cond_resched(); + } return 0; } diff --git a/arch/s390/pci/pci_bus.c b/arch/s390/pci/pci_bus.c index 66c4bd888b29..42a13e451f64 100644 --- a/arch/s390/pci/pci_bus.c +++ b/arch/s390/pci/pci_bus.c @@ -153,23 +153,6 @@ int zpci_bus_scan_bus(struct zpci_bus *zbus) return ret; } -/* zpci_bus_scan_busses - Scan all registered busses - * - * Scan all available zbusses - * - */ -void zpci_bus_scan_busses(void) -{ - struct zpci_bus *zbus = NULL; - - mutex_lock(&zbus_list_lock); - list_for_each_entry(zbus, &zbus_list, bus_next) { - zpci_bus_scan_bus(zbus); - cond_resched(); - } - mutex_unlock(&zbus_list_lock); -} - static bool zpci_bus_is_multifunction_root(struct zpci_dev *zdev) { return !s390_pci_no_rid && zdev->rid_available && @@ -222,10 +205,29 @@ out_free_domain: return -ENOMEM; } -static void zpci_bus_release(struct kref *kref) +/** + * zpci_bus_release - Un-initialize resources associated with the zbus and + * free memory + * @kref: refcount * that is part of struct zpci_bus + * + * MUST be called with `zbus_list_lock` held, but the lock is released during + * 
run of the function. + */ +static inline void zpci_bus_release(struct kref *kref) + __releases(&zbus_list_lock) { struct zpci_bus *zbus = container_of(kref, struct zpci_bus, kref); + lockdep_assert_held(&zbus_list_lock); + + list_del(&zbus->bus_next); + mutex_unlock(&zbus_list_lock); + + /* + * At this point no-one should see this object, or be able to get a new + * reference to it. + */ + if (zbus->bus) { pci_lock_rescan_remove(); pci_stop_root_bus(zbus->bus); @@ -237,16 +239,19 @@ static void zpci_bus_release(struct kref *kref) pci_unlock_rescan_remove(); } - mutex_lock(&zbus_list_lock); - list_del(&zbus->bus_next); - mutex_unlock(&zbus_list_lock); zpci_remove_parent_msi_domain(zbus); kfree(zbus); } -static void zpci_bus_put(struct zpci_bus *zbus) +static inline void __zpci_bus_get(struct zpci_bus *zbus) +{ + lockdep_assert_held(&zbus_list_lock); + kref_get(&zbus->kref); +} + +static inline void zpci_bus_put(struct zpci_bus *zbus) { - kref_put(&zbus->kref, zpci_bus_release); + kref_put_mutex(&zbus->kref, zpci_bus_release, &zbus_list_lock); } static struct zpci_bus *zpci_bus_get(int topo, bool topo_is_tid) @@ -258,7 +263,7 @@ static struct zpci_bus *zpci_bus_get(int topo, bool topo_is_tid) if (!zbus->multifunction) continue; if (topo_is_tid == zbus->topo_is_tid && topo == zbus->topo) { - kref_get(&zbus->kref); + __zpci_bus_get(zbus); goto out_unlock; } } @@ -268,6 +273,44 @@ out_unlock: return zbus; } +/** + * zpci_bus_get_next - get the next zbus object from given position in the list + * @pos: current position/cursor in the global zbus list + * + * Acquires and releases references as the cursor iterates (might also free/ + * release the cursor). Is tolerant of concurrent operations on the list. + * + * To begin the iteration, set *@pos to %NULL before calling the function. + * + * *@pos is set to %NULL in cases where either the list is empty, or *@pos is + * the last element in the list. + * + * Context: Process context. May sleep. 
+ */ +void zpci_bus_get_next(struct zpci_bus **pos) +{ + struct zpci_bus *curp = *pos, *next = NULL; + + mutex_lock(&zbus_list_lock); + if (curp) + next = list_next_entry(curp, bus_next); + else + next = list_first_entry(&zbus_list, typeof(*curp), bus_next); + + if (list_entry_is_head(next, &zbus_list, bus_next)) + next = NULL; + + if (next) + __zpci_bus_get(next); + + *pos = next; + mutex_unlock(&zbus_list_lock); + + /* zpci_bus_put() might drop refcount to 0 and locks zbus_list_lock */ + if (curp) + zpci_bus_put(curp); +} + static struct zpci_bus *zpci_bus_alloc(int topo, bool topo_is_tid) { struct zpci_bus *zbus; @@ -279,9 +322,6 @@ static struct zpci_bus *zpci_bus_alloc(int topo, bool topo_is_tid) zbus->topo = topo; zbus->topo_is_tid = topo_is_tid; INIT_LIST_HEAD(&zbus->bus_next); - mutex_lock(&zbus_list_lock); - list_add_tail(&zbus->bus_next, &zbus_list); - mutex_unlock(&zbus_list_lock); kref_init(&zbus->kref); INIT_LIST_HEAD(&zbus->resources); @@ -291,6 +331,10 @@ static struct zpci_bus *zpci_bus_alloc(int topo, bool topo_is_tid) zbus->bus_resource.flags = IORESOURCE_BUS; pci_add_resource(&zbus->resources, &zbus->bus_resource); + mutex_lock(&zbus_list_lock); + list_add_tail(&zbus->bus_next, &zbus_list); + mutex_unlock(&zbus_list_lock); + return zbus; } diff --git a/arch/s390/pci/pci_bus.h b/arch/s390/pci/pci_bus.h index ae3d7a9159bd..e440742e3145 100644 --- a/arch/s390/pci/pci_bus.h +++ b/arch/s390/pci/pci_bus.h @@ -15,7 +15,20 @@ int zpci_bus_device_register(struct zpci_dev *zdev, struct pci_ops *ops); void zpci_bus_device_unregister(struct zpci_dev *zdev); int zpci_bus_scan_bus(struct zpci_bus *zbus); -void zpci_bus_scan_busses(void); +void zpci_bus_get_next(struct zpci_bus **pos); + +/** + * zpci_bus_for_each - iterate over all the registered zbus objects + * @pos: a struct zpci_bus * as cursor + * + * Acquires and releases references as the cursor iterates over the registered + * objects. Is tolerant against concurrent removals of objects. 
+ * + * Context: Process context. May sleep. + */ +#define zpci_bus_for_each(pos) \ + for ((pos) = NULL, zpci_bus_get_next(&(pos)); (pos) != NULL; \ + zpci_bus_get_next(&(pos))) int zpci_bus_scan_device(struct zpci_dev *zdev); void zpci_bus_remove_device(struct zpci_dev *zdev, bool set_error); -- cgit v1.2.3 From af241e6bfc11125e6669dabf0800fce6809dd3cf Mon Sep 17 00:00:00 2001 From: Benjamin Block Date: Fri, 5 Dec 2025 16:47:18 +0100 Subject: s390/pci: Annotate lock context imbalance in zpci_release_device() When checking `arch/s390/pci/pci.c` with `sparse` during build, the following complaint is reported: arch/s390/pci/pci.c: note: in included file (through include/linux/smp.h, include/linux/lockdep.h, include/linux/spinlock.h, include/linux/mmzone.h, include/linux/gfp.h, include/linux/slab.h): ./include/linux/list.h:237:25: warning: context imbalance in 'zpci_release_device' - unexpected unlock But this is expected, as zpci_release_device() is expected to be called with `zpci_list_lock` held, as part of `kref_put_lock()` or similar. Reflect this by annotating the function with the appropriate __releases(). 
Signed-off-by: Benjamin Block Reviewed-by: Farhan Ali Reviewed-by: Niklas Schnelle Reviewed-by: Gerd Bayer Signed-off-by: Heiko Carstens --- arch/s390/pci/pci.c | 1 + 1 file changed, 1 insertion(+) (limited to 'arch') diff --git a/arch/s390/pci/pci.c b/arch/s390/pci/pci.c index 8fd14d043008..57f3980b98a9 100644 --- a/arch/s390/pci/pci.c +++ b/arch/s390/pci/pci.c @@ -961,6 +961,7 @@ void zpci_device_reserved(struct zpci_dev *zdev) } void zpci_release_device(struct kref *kref) + __releases(&zpci_list_lock) { struct zpci_dev *zdev = container_of(kref, struct zpci_dev, kref); -- cgit v1.2.3 From 489e96651dfe59794195c6b2ddb78835edd9f2ed Mon Sep 17 00:00:00 2001 From: Jens Remus Date: Thu, 11 Dec 2025 12:24:50 +0100 Subject: s390/stacktrace: Do not fallback to RA register The logic to fall back to the return address (RA) register value in the topmost frame when stack tracing using back chain is broken in multiple ways: When assuming the RA register 14 has not been saved yet one must assume that a new user stack frame has not been allocated either. Therefore the back chain would not contain the stack pointer (SP) at entry, but the caller's SP at its entry instead. Therefore when falling back to the RA register 14 value it would also be necessary to fall back to the SP register 15 value. Otherwise an invalid combination of RA register 14 and caller's SP at its entry (from the back chain) is used. In the topmost frame the back chain contains either the caller's SP at its entry (before having allocated a new stack frame in the prologue), the SP at entry (after having allocated a new stack frame), or an uninitialized value (during static/dynamic stack allocation). In both cases where the back chain is valid either the caller or prologue must have saved its respective RA to the respective frame. 
Therefore, if the RA obtained from the frame pointed to by the back chain is invalid, this does not indicate that the IP in the topmost frame is still early in the prologue and the RA has not been saved. Reviewed-by: Heiko Carstens Signed-off-by: Jens Remus Signed-off-by: Heiko Carstens --- arch/s390/kernel/stacktrace.c | 18 ++---------------- 1 file changed, 2 insertions(+), 16 deletions(-) (limited to 'arch') diff --git a/arch/s390/kernel/stacktrace.c b/arch/s390/kernel/stacktrace.c index 3aae7f70e6ab..18520d333058 100644 --- a/arch/s390/kernel/stacktrace.c +++ b/arch/s390/kernel/stacktrace.c @@ -104,7 +104,6 @@ void arch_stack_walk_user_common(stack_trace_consume_fn consume_entry, void *coo struct stack_frame_vdso_wrapper __user *sf_vdso; struct stack_frame_user __user *sf; unsigned long ip, sp; - bool first = true; if (!current->mm) return; @@ -133,24 +132,11 @@ void arch_stack_walk_user_common(stack_trace_consume_fn consume_entry, void *coo if (__get_user(ip, &sf->gprs[8])) break; } - /* Sanity check: ABI requires SP to be 8 byte aligned. */ - if (sp & 0x7) + /* Validate SP and RA (ABI requires SP to be 8 byte aligned). */ + if (sp & 0x7 || ip_invalid(ip)) break; - if (ip_invalid(ip)) { - /* - * If the instruction address is invalid, and this - * is the first stack frame, assume r14 has not - * been written to the stack yet. Otherwise exit. - */ - if (!first) - break; - ip = regs->gprs[14]; - if (ip_invalid(ip)) - break; - } if (!store_ip(consume_entry, cookie, entry, perf, ip)) break; - first = false; } pagefault_enable(); } -- cgit v1.2.3 From c4b502d60a71cf0c0c938f133dc4c0e2adc17b44 Mon Sep 17 00:00:00 2001 From: Ard Biesheuvel Date: Tue, 9 Dec 2025 06:48:49 +0100 Subject: arm64/simd: Avoid pointless clearing of FP/SIMD buffer The buffer provided to kernel_neon_begin() is only used if the task is scheduled out while the FP/SIMD is in use by the kernel, or when such a section is interrupted by a softirq that also uses the FP/SIMD. 
IOW, this happens rarely, and even if it happened often, there is still no reason for this buffer to be cleared beforehand, which happens unconditionally, due to the use of a compound literal expression. So define that buffer variable explicitly, and mark it as __uninitialized so that it will not get cleared, even when -ftrivial-auto-var-init is in effect. This requires some preprocessor gymnastics, due to the fact that the variable must be defined throughout the entire guarded scope, and the expression ({ struct user_fpsimd_state __uninitialized st; &st; }) is problematic in that regard, even though the compilers seem to permit it. So instead, repeat the 'for ()' trick that is also used in the implementation of the guarded scope helpers. Cc: Will Deacon Cc: Catalin Marinas Cc: Kees Cook Cc: Eric Biggers Signed-off-by: Ard Biesheuvel Fixes: 4fa617cc6851 ("arm64/fpsimd: Allocate kernel mode FP/SIMD buffers on the stack") Link: https://lore.kernel.org/r/20251209054848.998878-2-ardb@kernel.org Signed-off-by: Eric Biggers --- arch/arm64/include/asm/simd.h | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) (limited to 'arch') diff --git a/arch/arm64/include/asm/simd.h b/arch/arm64/include/asm/simd.h index 0941f6f58a14..69ecbd69ca8c 100644 --- a/arch/arm64/include/asm/simd.h +++ b/arch/arm64/include/asm/simd.h @@ -48,6 +48,13 @@ DEFINE_LOCK_GUARD_1(ksimd, kernel_neon_begin(_T->lock), kernel_neon_end(_T->lock)) -#define scoped_ksimd() scoped_guard(ksimd, &(struct user_fpsimd_state){}) +#define __scoped_ksimd(_label) \ + for (struct user_fpsimd_state __uninitialized __st; \ + true; ({ goto _label; })) \ + if (0) { \ +_label: break; \ + } else scoped_guard(ksimd, &__st) + +#define scoped_ksimd() __scoped_ksimd(__UNIQUE_ID(label)) #endif -- cgit v1.2.3 From b7737c38e7cb611c2fbd87af3b09afeb92c96fe7 Mon Sep 17 00:00:00 2001 From: Kevin Brodsky Date: Wed, 19 Nov 2025 13:00:16 +0000 Subject: arm64: mm: Simplify check in arch_kfence_init_pool() TL;DR: checking 
force_pte_mapping() in arch_kfence_init_pool() is sufficient Commit ce2b3a50ad92 ("arm64: mm: Don't sleep in split_kernel_leaf_mapping() when in atomic context") recently added an arm64 implementation of arch_kfence_init_pool() to ensure that the KFENCE pool is PTE-mapped. Assuming that the pool was not initialised early, block splitting is necessary if the linear mapping is not fully PTE-mapped, in other words if force_pte_mapping() is false. arch_kfence_init_pool() currently makes another check: whether BBML2-noabort is supported, i.e. whether we are *able* to split block mappings. This check is however unnecessary, because force_pte_mapping() is always true if KFENCE is enabled and BBML2-noabort is not supported. This must be the case by design, since KFENCE requires PTE-mapped pages in all cases. We can therefore remove that check. The situation is different in split_kernel_leaf_mapping(), as that function is called unconditionally regardless of the configuration. If BBML2-noabort is not supported, it cannot do anything and bails out. If force_pte_mapping() is true, there is nothing to do and it also bails out, but these are independent checks. Commit 53357f14f924 ("arm64: mm: Tidy up force_pte_mapping()") grouped these checks into a helper, split_leaf_mapping_possible(). This isn't so helpful as only split_kernel_leaf_mapping() should check both. Revert the parts of that commit that introduced the helper, reintroducing the more accurate comments in split_kernel_leaf_mapping(). 
Signed-off-by: Kevin Brodsky Reviewed-by: Ryan Roberts Signed-off-by: Catalin Marinas --- arch/arm64/mm/mmu.c | 33 ++++++++++++++++----------------- 1 file changed, 16 insertions(+), 17 deletions(-) (limited to 'arch') diff --git a/arch/arm64/mm/mmu.c b/arch/arm64/mm/mmu.c index 9ae7ce00a7ef..8e1d80a7033e 100644 --- a/arch/arm64/mm/mmu.c +++ b/arch/arm64/mm/mmu.c @@ -767,18 +767,6 @@ static inline bool force_pte_mapping(void) return rodata_full || arm64_kfence_can_set_direct_map() || is_realm_world(); } -static inline bool split_leaf_mapping_possible(void) -{ - /* - * !BBML2_NOABORT systems should never run into scenarios where we would - * have to split. So exit early and let calling code detect it and raise - * a warning. - */ - if (!system_supports_bbml2_noabort()) - return false; - return !force_pte_mapping(); -} - static DEFINE_MUTEX(pgtable_split_lock); int split_kernel_leaf_mapping(unsigned long start, unsigned long end) @@ -786,11 +774,22 @@ int split_kernel_leaf_mapping(unsigned long start, unsigned long end) int ret; /* - * Exit early if the region is within a pte-mapped area or if we can't - * split. For the latter case, the permission change code will raise a - * warning if not already pte-mapped. + * !BBML2_NOABORT systems should not be trying to change permissions on + * anything that is not pte-mapped in the first place. Just return early + * and let the permission change code raise a warning if not already + * pte-mapped. */ - if (!split_leaf_mapping_possible() || is_kfence_address((void *)start)) + if (!system_supports_bbml2_noabort()) + return 0; + + /* + * If the region is within a pte-mapped area, there is no need to try to + * split. Additionally, CONFIG_DEBUG_PAGEALLOC and CONFIG_KFENCE may + * change permissions from atomic context so for those cases (which are + * always pte-mapped), we must not go any further because taking the + * mutex below may sleep. 
+ */ if (force_pte_mapping() || is_kfence_address((void *)start)) return 0; /* @@ -1089,7 +1088,7 @@ bool arch_kfence_init_pool(void) int ret; /* Exit early if we know the linear map is already pte-mapped. */ - if (!split_leaf_mapping_possible()) + if (force_pte_mapping()) return true; /* Kfence pool is already pte-mapped for the early init case. */ -- cgit v1.2.3 From 63de2b3859ba1def9f43ed0a9c25a68810208e5c Mon Sep 17 00:00:00 2001 From: Ard Biesheuvel Date: Sat, 6 Dec 2025 20:01:17 +0100 Subject: arm64/efi: Remove unneeded SVE/SME fallback preserve/store handling Since commit 7137a203b251 ("arm64/fpsimd: Permit kernel mode NEON with IRQs off"), the only condition under which the fallback path is taken for FP/SIMD preserve/restore across an EFI runtime call is when it is called from hardirq or NMI context. In practice, this only happens when the EFI pstore driver is called to dump the kernel log buffer into an EFI variable under a panic, oops or emergency_restart() condition, and none of these can be expected to result in a return to user space for the task in question. This means that the existing EFI-specific logic for preserving and restoring SVE/SME state is pointless, and can be removed. Instead, kill the task, so that an exceedingly unlikely inadvertent return to user space does not proceed with a corrupted FP/SIMD state. Also, retain the preserve and restore of the base FP/SIMD state, as that might belong to kernel mode use of FP/SIMD. 
(Note that EFI runtime calls are never invoked reentrantly, even in this case, and so any interrupted kernel mode FP/SIMD usage will be unrelated to EFI) Signed-off-by: Ard Biesheuvel Signed-off-by: Catalin Marinas --- arch/arm64/kernel/fpsimd.c | 130 +++++++-------------------------------------- 1 file changed, 20 insertions(+), 110 deletions(-) (limited to 'arch') diff --git a/arch/arm64/kernel/fpsimd.c b/arch/arm64/kernel/fpsimd.c index c154f72634e0..9de1d8a604cb 100644 --- a/arch/arm64/kernel/fpsimd.c +++ b/arch/arm64/kernel/fpsimd.c @@ -180,13 +180,6 @@ static inline void set_sve_default_vl(int val) set_default_vl(ARM64_VEC_SVE, val); } -static u8 *efi_sve_state; - -#else /* ! CONFIG_ARM64_SVE */ - -/* Dummy declaration for code that will be optimised out: */ -extern u8 *efi_sve_state; - #endif /* ! CONFIG_ARM64_SVE */ #ifdef CONFIG_ARM64_SME @@ -1095,36 +1088,6 @@ int vec_verify_vq_map(enum vec_type type) return 0; } -static void __init sve_efi_setup(void) -{ - int max_vl = 0; - int i; - - if (!IS_ENABLED(CONFIG_EFI)) - return; - - for (i = 0; i < ARRAY_SIZE(vl_info); i++) - max_vl = max(vl_info[i].max_vl, max_vl); - - /* - * alloc_percpu() warns and prints a backtrace if this goes wrong. - * This is evidence of a crippled system and we are returning void, - * so no attempt is made to handle this situation here. 
- */ - if (!sve_vl_valid(max_vl)) - goto fail; - - efi_sve_state = kmalloc(SVE_SIG_REGS_SIZE(sve_vq_from_vl(max_vl)), - GFP_KERNEL); - if (!efi_sve_state) - goto fail; - - return; - -fail: - panic("Cannot allocate memory for EFI SVE save/restore"); -} - void cpu_enable_sve(const struct arm64_cpu_capabilities *__always_unused p) { write_sysreg(read_sysreg(CPACR_EL1) | CPACR_EL1_ZEN_EL1EN, CPACR_EL1); @@ -1185,8 +1148,6 @@ void __init sve_setup(void) if (sve_max_virtualisable_vl() < sve_max_vl()) pr_warn("%s: unvirtualisable vector lengths present\n", info->name); - - sve_efi_setup(); } /* @@ -1947,9 +1908,6 @@ EXPORT_SYMBOL_GPL(kernel_neon_end); #ifdef CONFIG_EFI static struct user_fpsimd_state efi_fpsimd_state; -static bool efi_fpsimd_state_used; -static bool efi_sve_state_used; -static bool efi_sm_state; /* * EFI runtime services support functions @@ -1976,43 +1934,26 @@ void __efi_fpsimd_begin(void) if (may_use_simd()) { kernel_neon_begin(&efi_fpsimd_state); } else { - WARN_ON(preemptible()); - /* - * If !efi_sve_state, SVE can't be in use yet and doesn't need - * preserving: + * We are running in hardirq or NMI context, and the only + * legitimate case where this might happen is when EFI pstore + * is attempting to record the system's dying gasps into EFI + * variables. This could be due to an oops, a panic or a call + * to emergency_restart(), and in none of those cases, we can + * expect the current task to ever return to user space again, + * or for the kernel to resume any normal execution, for that + * matter (an oops in hardirq context triggers a panic too). + * + * Therefore, there is no point in attempting to preserve any + * SVE/SME state here. On the off chance that we might have + * ended up here for a different reason inadvertently, kill the + * task and preserve/restore the base FP/SIMD state, which + * might belong to kernel mode FP/SIMD. 
*/ - if (system_supports_sve() && efi_sve_state != NULL) { - bool ffr = true; - u64 svcr; - - efi_sve_state_used = true; - - if (system_supports_sme()) { - svcr = read_sysreg_s(SYS_SVCR); - - efi_sm_state = svcr & SVCR_SM_MASK; - - /* - * Unless we have FA64 FFR does not - * exist in streaming mode. - */ - if (!system_supports_fa64()) - ffr = !(svcr & SVCR_SM_MASK); - } - - sve_save_state(efi_sve_state + sve_ffr_offset(sve_max_vl()), - &efi_fpsimd_state.fpsr, ffr); - - if (system_supports_sme()) - sysreg_clear_set_s(SYS_SVCR, - SVCR_SM_MASK, 0); - - } else { - fpsimd_save_state(&efi_fpsimd_state); - } - - efi_fpsimd_state_used = true; + pr_warn_ratelimited("Calling EFI runtime from %s context\n", + in_nmi() ? "NMI" : "hardirq"); + force_signal_inject(SIGKILL, SI_KERNEL, 0, 0); + fpsimd_save_state(&efi_fpsimd_state); } } @@ -2024,41 +1965,10 @@ void __efi_fpsimd_end(void) if (!system_supports_fpsimd()) return; - if (!efi_fpsimd_state_used) { + if (may_use_simd()) { kernel_neon_end(&efi_fpsimd_state); } else { - if (system_supports_sve() && efi_sve_state_used) { - bool ffr = true; - - /* - * Restore streaming mode; EFI calls are - * normal function calls so should not return in - * streaming mode. - */ - if (system_supports_sme()) { - if (efi_sm_state) { - sysreg_clear_set_s(SYS_SVCR, - 0, - SVCR_SM_MASK); - - /* - * Unless we have FA64 FFR does not - * exist in streaming mode. - */ - if (!system_supports_fa64()) - ffr = false; - } - } - - sve_load_state(efi_sve_state + sve_ffr_offset(sve_max_vl()), - &efi_fpsimd_state.fpsr, ffr); - - efi_sve_state_used = false; - } else { - fpsimd_load_state(&efi_fpsimd_state); - } - - efi_fpsimd_state_used = false; + fpsimd_load_state(&efi_fpsimd_state); } } -- cgit v1.2.3 From 98a97bf41528ef738b06eb07ec2b2eb1cfde6ce6 Mon Sep 17 00:00:00 2001 From: Mark Brown Date: Sat, 29 Nov 2025 00:48:45 +0000 Subject: arm64/gcs: Flush the GCS locking state on exec When we exec a new task we forget to flush the set of locked GCS mode bits. 
Since we do flush the rest of the state this means that if GCS is locked the new task will be unable to enable GCS, it will be locked as being disabled. Add the expected flush. Fixes: fc84bc5378a8 ("arm64/gcs: Context switch GCS state for EL0") Cc: # 6.13.x Reported-by: Yury Khrustalev Signed-off-by: Mark Brown Tested-by: Yury Khrustalev Signed-off-by: Catalin Marinas --- arch/arm64/kernel/process.c | 1 + 1 file changed, 1 insertion(+) (limited to 'arch') diff --git a/arch/arm64/kernel/process.c b/arch/arm64/kernel/process.c index fba7ca102a8c..489554931231 100644 --- a/arch/arm64/kernel/process.c +++ b/arch/arm64/kernel/process.c @@ -292,6 +292,7 @@ static void flush_gcs(void) current->thread.gcs_base = 0; current->thread.gcs_size = 0; current->thread.gcs_el0_mode = 0; + current->thread.gcs_el0_locked = 0; write_sysreg_s(GCSCRE0_EL1_nTR, SYS_GCSCRE0_EL1); write_sysreg_s(0, SYS_GCSPR_EL0); } -- cgit v1.2.3 From bd94fbe8b55f38c24a63cca2854ff74b62780d77 Mon Sep 17 00:00:00 2001 From: Thierry Reding Date: Wed, 29 Oct 2025 16:03:16 +0100 Subject: MIPS: Alchemy: Remove bogus static/inline specifiers The recent io_remap_pfn_range() rework applied the static and inline specifiers to the implementation of io_remap_pfn_range_pfn() on MIPS Alchemy, mirroring the same change on other platforms. However, this function is defined in a source file and that definition causes a conflict with its declaration. Fix this by dropping the specifiers. 
Fixes: c707a68f9468 ("mm: abstract io_remap_pfn_range() based on PFN") Signed-off-by: Thierry Reding Acked-by: Thomas Bogendoerfer Tested-by: Florian Fainelli Reviewed-by: Florian Fainelli Signed-off-by: Thomas Bogendoerfer --- arch/mips/alchemy/common/setup.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'arch') diff --git a/arch/mips/alchemy/common/setup.c b/arch/mips/alchemy/common/setup.c index c35b4f809d51..992134a8c23a 100644 --- a/arch/mips/alchemy/common/setup.c +++ b/arch/mips/alchemy/common/setup.c @@ -94,8 +94,7 @@ phys_addr_t fixup_bigphys_addr(phys_addr_t phys_addr, phys_addr_t size) return phys_addr; } -static inline unsigned long io_remap_pfn_range_pfn(unsigned long pfn, - unsigned long size) +unsigned long io_remap_pfn_range_pfn(unsigned long pfn, unsigned long size) { phys_addr_t phys_addr = fixup_bigphys_addr(pfn << PAGE_SHIFT, size); -- cgit v1.2.3 From 680ad315caaa2860df411cb378bf3614d96c7648 Mon Sep 17 00:00:00 2001 From: Haoxiang Li Date: Thu, 4 Dec 2025 18:36:18 +0800 Subject: MIPS: Fix a reference leak bug in ip22_check_gio() If gio_device_register fails, gio_dev_put() is required to drop the gio_dev device reference. 
Fixes: e84de0c61905 ("MIPS: GIO bus support for SGI IP22/28") Signed-off-by: Haoxiang Li Signed-off-by: Thomas Bogendoerfer --- arch/mips/sgi-ip22/ip22-gio.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'arch') diff --git a/arch/mips/sgi-ip22/ip22-gio.c b/arch/mips/sgi-ip22/ip22-gio.c index 5893ea4e382c..19b70928d6dc 100644 --- a/arch/mips/sgi-ip22/ip22-gio.c +++ b/arch/mips/sgi-ip22/ip22-gio.c @@ -372,7 +372,8 @@ static void ip22_check_gio(int slotno, unsigned long addr, int irq) gio_dev->resource.flags = IORESOURCE_MEM; gio_dev->irq = irq; dev_set_name(&gio_dev->dev, "%d", slotno); - gio_device_register(gio_dev); + if (gio_device_register(gio_dev)) + gio_dev_put(gio_dev); } else printk(KERN_INFO "GIO: slot %d : Empty\n", slotno); } -- cgit v1.2.3 From e5aff444e3a7bdeef5ea796a2099fc3c60a070fa Mon Sep 17 00:00:00 2001 From: Juergen Gross Date: Mon, 15 Dec 2025 12:51:12 +0100 Subject: x86/xen: Fix sparse warning in enlighten_pv.c The sparse tool issues a warning for arch/x86/xen/enlighten_pv.c: arch/x86/xen/enlighten_pv.c:120:9: sparse: sparse: incorrect type in initializer (different address spaces) expected void const [noderef] __percpu *__vpp_verify got bool * This is due to the percpu variable xen_in_preemptible_hcall being exported via EXPORT_SYMBOL_GPL() instead of EXPORT_PER_CPU_SYMBOL_GPL(). Reported-by: kernel test robot Closes: https://lore.kernel.org/oe-kbuild-all/202512140856.Ic6FetG6-lkp@intel.com/ Fixes: fdfd811ddde3 ("x86/xen: allow privcmd hypercalls to be preempted") Reviewed-by: Boris Ostrovsky Signed-off-by: Juergen Gross Message-ID: <20251215115112.15072-1-jgross@suse.com> --- arch/x86/xen/enlighten_pv.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch') diff --git a/arch/x86/xen/enlighten_pv.c b/arch/x86/xen/enlighten_pv.c index 4806cc28d7ca..b74ff8bc7f2a 100644 --- a/arch/x86/xen/enlighten_pv.c +++ b/arch/x86/xen/enlighten_pv.c @@ -108,7 +108,7 @@ static int xen_cpu_dead_pv(unsigned int cpu); * calls. 
*/ DEFINE_PER_CPU(bool, xen_in_preemptible_hcall); -EXPORT_SYMBOL_GPL(xen_in_preemptible_hcall); +EXPORT_PER_CPU_SYMBOL_GPL(xen_in_preemptible_hcall); /* * In case of scheduling the flag must be cleared and restored after -- cgit v1.2.3 From 0edc78b82bea85e1b2165d8e870a5c3535919695 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Tue, 25 Nov 2025 22:50:45 +0100 Subject: x86/msi: Make irq_retrigger() functional for posted MSI Luigi reported that retriggering a posted MSI interrupt does not work correctly. The reason is that the retrigger happens at the vector domain by sending an IPI to the actual vector on the target CPU. That works correctly exactly once because the posted MSI interrupt chip does not issue an EOI as that's only required for the posted MSI notification vector itself. As a consequence the vector becomes stale in the ISR, which not only affects this vector but also any lower priority vector in the affected APIC because the ISR bit is not cleared. Luigi proposed to set the vector in the remap PIR bitmap and raise the posted MSI notification vector. That works, but that still does not cure a related problem: If there is ever a stray interrupt on such a vector, then the related APIC ISR bit becomes stale due to the lack of EOI as described above. Unlikely to happen, but if it happens it's not debuggable at all. So instead of playing games with the PIR, this can be actually solved for both cases by: 1) Keeping track of the posted interrupt vector handler state 2) Implementing a posted MSI specific irq_ack() callback which checks that state. If the posted vector handler is inactive it issues an EOI, otherwise it delegates that to the posted handler. This is correct versus affinity changes and concurrent events on the posted vector as the actual handler invocation is serialized through the interrupt descriptor lock. 
Fixes: ed1e48ea4370 ("iommu/vt-d: Enable posted mode for device MSIs") Reported-by: Luigi Rizzo Signed-off-by: Thomas Gleixner Tested-by: Luigi Rizzo Cc: stable@vger.kernel.org Link: https://patch.msgid.link/20251125214631.044440658@linutronix.de Closes: https://lore.kernel.org/lkml/20251124104836.3685533-1-lrizzo@google.com --- arch/x86/include/asm/irq_remapping.h | 7 +++++++ arch/x86/kernel/irq.c | 23 +++++++++++++++++++++++ 2 files changed, 30 insertions(+) (limited to 'arch') diff --git a/arch/x86/include/asm/irq_remapping.h b/arch/x86/include/asm/irq_remapping.h index 5a0d42464d44..4e55d1755846 100644 --- a/arch/x86/include/asm/irq_remapping.h +++ b/arch/x86/include/asm/irq_remapping.h @@ -87,4 +87,11 @@ static inline void panic_if_irq_remap(const char *msg) } #endif /* CONFIG_IRQ_REMAP */ + +#ifdef CONFIG_X86_POSTED_MSI +void intel_ack_posted_msi_irq(struct irq_data *irqd); +#else +#define intel_ack_posted_msi_irq NULL +#endif + #endif /* __X86_IRQ_REMAPPING_H */ diff --git a/arch/x86/kernel/irq.c b/arch/x86/kernel/irq.c index 86f4e574de02..b2fe6181960c 100644 --- a/arch/x86/kernel/irq.c +++ b/arch/x86/kernel/irq.c @@ -397,6 +397,7 @@ DEFINE_IDTENTRY_SYSVEC_SIMPLE(sysvec_kvm_posted_intr_nested_ipi) /* Posted Interrupt Descriptors for coalesced MSIs to be posted */ DEFINE_PER_CPU_ALIGNED(struct pi_desc, posted_msi_pi_desc); +static DEFINE_PER_CPU_CACHE_HOT(bool, posted_msi_handler_active); void intel_posted_msi_init(void) { @@ -414,6 +415,25 @@ void intel_posted_msi_init(void) this_cpu_write(posted_msi_pi_desc.ndst, destination); } +void intel_ack_posted_msi_irq(struct irq_data *irqd) +{ + irq_move_irq(irqd); + + /* + * Handle the rare case that irq_retrigger() raised the actual + * assigned vector on the target CPU, which means that it was not + * invoked via the posted MSI handler below. In that case APIC EOI + * is required as otherwise the ISR entry becomes stale and lower + * priority interrupts are never going to be delivered after that. 
+ * + * If the posted handler invoked the device interrupt handler then + * the EOI would be premature because it would acknowledge the + * posted vector. + */ + if (unlikely(!__this_cpu_read(posted_msi_handler_active))) + apic_eoi(); +} + static __always_inline bool handle_pending_pir(unsigned long *pir, struct pt_regs *regs) { unsigned long pir_copy[NR_PIR_WORDS]; @@ -446,6 +466,8 @@ DEFINE_IDTENTRY_SYSVEC(sysvec_posted_msi_notification) pid = this_cpu_ptr(&posted_msi_pi_desc); + /* Mark the handler active for intel_ack_posted_msi_irq() */ + __this_cpu_write(posted_msi_handler_active, true); inc_irq_stat(posted_msi_notification_count); irq_enter(); @@ -474,6 +496,7 @@ DEFINE_IDTENTRY_SYSVEC(sysvec_posted_msi_notification) apic_eoi(); irq_exit(); + __this_cpu_write(posted_msi_handler_active, false); set_irq_regs(old_regs); } #endif /* X86_POSTED_MSI */ -- cgit v1.2.3 From c56a12c71ad38f381105f6e5036dede64ad2dfee Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Thu, 18 Dec 2025 11:47:38 +0100 Subject: x86/bug: Fix old GCC compile fails For some mysterious reasons the GCC 8 and 9 preprocessor manages to sporadically fumble _ASM_BYTES(0x0f, 0x0b): $ grep ".byte[ ]*0x0f" defconfig-build/drivers/net/wireless/realtek/rtlwifi/base.s 1: .byte0x0f,0x0b ; 1: .byte 0x0f,0x0b ; which makes the assembler upset and all that. While there are more _ASM_BYTES() users (notably the NOP instructions), those don't seem affected. Therefore replace the offending ASM_UD2 with one using the ud2 mnemonic. 
Reported-by: Jean Delvare Suggested-by: Uros Bizjak Fixes: 85a2d4a890dc ("x86,ibt: Use UDB instead of 0xEA") Cc: stable@kernel.org Signed-off-by: Peter Zijlstra (Intel) Link: https://patch.msgid.link/20251218104659.GT3911114@noisy.programming.kicks-ass.net --- arch/x86/include/asm/bug.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch') diff --git a/arch/x86/include/asm/bug.h b/arch/x86/include/asm/bug.h index ee23b98353d7..40de5796adb5 100644 --- a/arch/x86/include/asm/bug.h +++ b/arch/x86/include/asm/bug.h @@ -15,7 +15,7 @@ extern void __WARN_trap(struct bug_entry *bug, ...); /* * Despite that some emulators terminate on UD2, we use it for WARN(). */ -#define ASM_UD2 _ASM_BYTES(0x0f, 0x0b) +#define ASM_UD2 __ASM_FORM(ud2) #define INSN_UD2 0x0b0f #define LEN_UD2 2 -- cgit v1.2.3 From 818d78ba1b3f88d2bfee249f25020211488a26c3 Mon Sep 17 00:00:00 2001 From: Andy Chiu Date: Wed, 12 Nov 2025 16:43:14 -0800 Subject: riscv: signal: abstract header saving for setup_sigcontext The function save_v_state() served two purposes. First, it saved extension context into the signal stack. Then, it constructed the extension header if there was no fault. The second part is independent of the extension itself. As a result, we can pull that part out, so future extensions may reuse it. This patch adds arch_ext_list and makes setup_sigcontext() go through all possible extensions' save() callback. The callback returns a positive value indicating the size of the successfully saved extension. Then the kernel proceeds to construct the header for that extension. The kernel skips an extension if it does not exist, or if the saving fails for some reason. The error code is propagated out in the latter case. This patch does not introduce any functional changes.
Signed-off-by: Andy Chiu Link: https://patch.msgid.link/20251112-v5_user_cfi_series-v23-16-b55691eacf4f@rivosinc.com Signed-off-by: Paul Walmsley --- arch/riscv/include/asm/vector.h | 3 ++ arch/riscv/kernel/signal.c | 62 +++++++++++++++++++++++++++-------------- 2 files changed, 44 insertions(+), 21 deletions(-) (limited to 'arch') diff --git a/arch/riscv/include/asm/vector.h b/arch/riscv/include/asm/vector.h index e7aa449368ad..00cb9c0982b1 100644 --- a/arch/riscv/include/asm/vector.h +++ b/arch/riscv/include/asm/vector.h @@ -424,6 +424,9 @@ static inline bool riscv_v_vstate_ctrl_user_allowed(void) { return false; } #define riscv_v_thread_free(tsk) do {} while (0) #define riscv_v_setup_ctx_cache() do {} while (0) #define riscv_v_thread_alloc(tsk) do {} while (0) +#define get_cpu_vector_context() do {} while (0) +#define put_cpu_vector_context() do {} while (0) +#define riscv_v_vstate_set_restore(task, regs) do {} while (0) #endif /* CONFIG_RISCV_ISA_V */ diff --git a/arch/riscv/kernel/signal.c b/arch/riscv/kernel/signal.c index 08378fea3a11..5a956108b1ea 100644 --- a/arch/riscv/kernel/signal.c +++ b/arch/riscv/kernel/signal.c @@ -68,18 +68,19 @@ static long save_fp_state(struct pt_regs *regs, #define restore_fp_state(task, regs) (0) #endif -#ifdef CONFIG_RISCV_ISA_V - -static long save_v_state(struct pt_regs *regs, void __user **sc_vec) +static long save_v_state(struct pt_regs *regs, void __user *sc_vec) { - struct __riscv_ctx_hdr __user *hdr; struct __sc_riscv_v_state __user *state; void __user *datap; long err; - hdr = *sc_vec; - /* Place state to the user's signal context space after the hdr */ - state = (struct __sc_riscv_v_state __user *)(hdr + 1); + if (!IS_ENABLED(CONFIG_RISCV_ISA_V) || + !((has_vector() || has_xtheadvector()) && + riscv_v_vstate_query(regs))) + return 0; + + /* Place state to the user's signal context space */ + state = (struct __sc_riscv_v_state __user *)sc_vec; /* Point datap right after the end of __sc_riscv_v_state */ datap = state + 
1; @@ -97,15 +98,11 @@ static long save_v_state(struct pt_regs *regs, void __user **sc_vec) err |= __put_user((__force void *)datap, &state->v_state.datap); /* Copy the whole vector content to user space datap. */ err |= __copy_to_user(datap, current->thread.vstate.datap, riscv_v_vsize); - /* Copy magic to the user space after saving all vector conetext */ - err |= __put_user(RISCV_V_MAGIC, &hdr->magic); - err |= __put_user(riscv_v_sc_size, &hdr->size); if (unlikely(err)) - return err; + return -EFAULT; - /* Only progress the sv_vec if everything has done successfully */ - *sc_vec += riscv_v_sc_size; - return 0; + /* Only return the size if everything has done successfully */ + return riscv_v_sc_size; } /* @@ -142,10 +139,20 @@ static long __restore_v_state(struct pt_regs *regs, void __user *sc_vec) */ return copy_from_user(current->thread.vstate.datap, datap, riscv_v_vsize); } -#else -#define save_v_state(task, regs) (0) -#define __restore_v_state(task, regs) (0) -#endif + +struct arch_ext_priv { + __u32 magic; + long (*save)(struct pt_regs *regs, void __user *sc_vec); +}; + +struct arch_ext_priv arch_ext_list[] = { + { + .magic = RISCV_V_MAGIC, + .save = &save_v_state, + }, +}; + +const size_t nr_arch_exts = ARRAY_SIZE(arch_ext_list); static long restore_sigcontext(struct pt_regs *regs, struct sigcontext __user *sc) @@ -270,7 +277,8 @@ static long setup_sigcontext(struct rt_sigframe __user *frame, { struct sigcontext __user *sc = &frame->uc.uc_mcontext; struct __riscv_ctx_hdr __user *sc_ext_ptr = &sc->sc_extdesc.hdr; - long err; + struct arch_ext_priv *arch_ext; + long err, i, ext_size; /* sc_regs is structured the same as the start of pt_regs */ err = __copy_to_user(&sc->sc_regs, regs, sizeof(sc->sc_regs)); @@ -278,8 +286,20 @@ static long setup_sigcontext(struct rt_sigframe __user *frame, if (has_fpu()) err |= save_fp_state(regs, &sc->sc_fpregs); /* Save the vector state. 
*/ - if ((has_vector() || has_xtheadvector()) && riscv_v_vstate_query(regs)) - err |= save_v_state(regs, (void __user **)&sc_ext_ptr); + for (i = 0; i < nr_arch_exts; i++) { + arch_ext = &arch_ext_list[i]; + if (!arch_ext->save) + continue; + + ext_size = arch_ext->save(regs, sc_ext_ptr + 1); + if (ext_size <= 0) { + err |= ext_size; + } else { + err |= __put_user(arch_ext->magic, &sc_ext_ptr->magic); + err |= __put_user(ext_size, &sc_ext_ptr->size); + sc_ext_ptr = (void *)sc_ext_ptr + ext_size; + } + } /* Write zero to fp-reserved space and check it on restore_sigcontext */ err |= __put_user(0, &sc->sc_extdesc.reserved); /* And put END __riscv_ctx_hdr at the end. */ -- cgit v1.2.3 From 1e6084d5c433b142b18d57694a6ab555ca6bb8cc Mon Sep 17 00:00:00 2001 From: Paul Walmsley Date: Mon, 17 Nov 2025 21:19:27 -0700 Subject: riscv: mm: pmdp_huge_get_and_clear(): avoid atomic ops when !CONFIG_SMP When !CONFIG_SMP, there's no need for atomic operations in pmdp_huge_get_and_clear(), so, similar to what x86 does, let's not use atomics in this case. See also commit 546e42c8c6d94 ("riscv: Use an atomic xchg in pudp_huge_get_and_clear()"). 
Cc: Alexandre Ghiti Signed-off-by: Paul Walmsley --- arch/riscv/include/asm/pgtable.h | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'arch') diff --git a/arch/riscv/include/asm/pgtable.h b/arch/riscv/include/asm/pgtable.h index 8bd36ac842eb..1df8a6adb407 100644 --- a/arch/riscv/include/asm/pgtable.h +++ b/arch/riscv/include/asm/pgtable.h @@ -997,7 +997,13 @@ static inline int pmdp_test_and_clear_young(struct vm_area_struct *vma, static inline pmd_t pmdp_huge_get_and_clear(struct mm_struct *mm, unsigned long address, pmd_t *pmdp) { +#ifdef CONFIG_SMP pmd_t pmd = __pmd(atomic_long_xchg((atomic_long_t *)pmdp, 0)); +#else + pmd_t pmd = *pmdp; + + pmd_clear(pmdp); +#endif page_table_check_pmd_clear(mm, pmd); -- cgit v1.2.3 From 425cc087fbaf267be7683b95481b46a058d63e49 Mon Sep 17 00:00:00 2001 From: Paul Walmsley Date: Mon, 17 Nov 2025 21:19:27 -0700 Subject: riscv: mm: ptep_get_and_clear(): avoid atomic ops when !CONFIG_SMP When !CONFIG_SMP, there's no need for atomic operations in ptep_get_and_clear(), so, similar to x86, let's not use atomics in this case. 
Cc: Alexandre Ghiti Signed-off-by: Paul Walmsley --- arch/riscv/include/asm/pgtable.h | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'arch') diff --git a/arch/riscv/include/asm/pgtable.h b/arch/riscv/include/asm/pgtable.h index 1df8a6adb407..ebab8ecd78b2 100644 --- a/arch/riscv/include/asm/pgtable.h +++ b/arch/riscv/include/asm/pgtable.h @@ -660,7 +660,13 @@ extern int ptep_test_and_clear_young(struct vm_area_struct *vma, unsigned long a static inline pte_t ptep_get_and_clear(struct mm_struct *mm, unsigned long address, pte_t *ptep) { +#ifdef CONFIG_SMP pte_t pte = __pte(atomic_long_xchg((atomic_long_t *)ptep, 0)); +#else + pte_t pte = *ptep; + + set_pte(ptep, __pte(0)); +#endif page_table_check_pte_clear(mm, pte); -- cgit v1.2.3 From e0e51a0de02cf0e5008d0e167288ad1598005b9e Mon Sep 17 00:00:00 2001 From: Paul Walmsley Date: Mon, 17 Nov 2025 21:19:28 -0700 Subject: riscv: mm: use xchg() on non-atomic_long_t variables, not atomic_long_xchg() Let's not call atomic_long_xchg() on something that's not an atomic_long_t, and just use xchg() instead. 
Continues the cleanup from commit 546e42c8c6d94 ("riscv: Use an atomic xchg in pudp_huge_get_and_clear()"). Cc: Alexandre Ghiti Signed-off-by: Paul Walmsley --- arch/riscv/include/asm/pgtable.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'arch') diff --git a/arch/riscv/include/asm/pgtable.h b/arch/riscv/include/asm/pgtable.h index ebab8ecd78b2..6bb1f5bdc5d2 100644 --- a/arch/riscv/include/asm/pgtable.h +++ b/arch/riscv/include/asm/pgtable.h @@ -661,7 +661,7 @@ static inline pte_t ptep_get_and_clear(struct mm_struct *mm, unsigned long address, pte_t *ptep) { #ifdef CONFIG_SMP - pte_t pte = __pte(atomic_long_xchg((atomic_long_t *)ptep, 0)); + pte_t pte = __pte(xchg(&ptep->pte, 0)); #else pte_t pte = *ptep; @@ -1004,7 +1004,7 @@ static inline pmd_t pmdp_huge_get_and_clear(struct mm_struct *mm, unsigned long address, pmd_t *pmdp) { #ifdef CONFIG_SMP - pmd_t pmd = __pmd(atomic_long_xchg((atomic_long_t *)pmdp, 0)); + pmd_t pmd = __pmd(xchg(&pmdp->pmd, 0)); #else pmd_t pmd = *pmdp; -- cgit v1.2.3 From 3f0cbfb8a107a9f0a6e2184425b70ddc6d51f991 Mon Sep 17 00:00:00 2001 From: Pincheng Wang Date: Wed, 27 Aug 2025 00:29:36 +0800 Subject: riscv: add ISA extension parsing for Zilsd and Zclsd Add parsing for Zilsd and Zclsd ISA extensions which were ratified in commit f88abf1 ("Integrating load/store pair for RV32 with the main manual") of the riscv-isa-manual.
Signed-off-by: Pincheng Wang Reviewed-by: Nutty Liu Link: https://patch.msgid.link/20250826162939.1494021-3-pincheng.plct@isrc.iscas.ac.cn [pjw@kernel.org: cleaned up checkpatch issues, whitespace; updated to apply] Signed-off-by: Paul Walmsley --- arch/riscv/include/asm/hwcap.h | 2 ++ arch/riscv/kernel/cpufeature.c | 24 ++++++++++++++++++++++++ 2 files changed, 26 insertions(+) (limited to 'arch') diff --git a/arch/riscv/include/asm/hwcap.h b/arch/riscv/include/asm/hwcap.h index dfe57b215e6c..4369a2338541 100644 --- a/arch/riscv/include/asm/hwcap.h +++ b/arch/riscv/include/asm/hwcap.h @@ -108,6 +108,8 @@ #define RISCV_ISA_EXT_ZICBOP 99 #define RISCV_ISA_EXT_SVRSW60T59B 100 #define RISCV_ISA_EXT_ZALASR 101 +#define RISCV_ISA_EXT_ZILSD 102 +#define RISCV_ISA_EXT_ZCLSD 103 #define RISCV_ISA_EXT_XLINUXENVCFG 127 diff --git a/arch/riscv/kernel/cpufeature.c b/arch/riscv/kernel/cpufeature.c index b057362f8fb5..c05b11596c19 100644 --- a/arch/riscv/kernel/cpufeature.c +++ b/arch/riscv/kernel/cpufeature.c @@ -242,6 +242,28 @@ static int riscv_ext_zcf_validate(const struct riscv_isa_ext_data *data, return -EPROBE_DEFER; } +static int riscv_ext_zilsd_validate(const struct riscv_isa_ext_data *data, + const unsigned long *isa_bitmap) +{ + if (IS_ENABLED(CONFIG_64BIT)) + return -EINVAL; + + return 0; +} + +static int riscv_ext_zclsd_validate(const struct riscv_isa_ext_data *data, + const unsigned long *isa_bitmap) +{ + if (IS_ENABLED(CONFIG_64BIT)) + return -EINVAL; + + if (__riscv_isa_extension_available(isa_bitmap, RISCV_ISA_EXT_ZILSD) && + __riscv_isa_extension_available(isa_bitmap, RISCV_ISA_EXT_ZCA)) + return 0; + + return -EPROBE_DEFER; +} + static int riscv_vector_f_validate(const struct riscv_isa_ext_data *data, const unsigned long *isa_bitmap) { @@ -484,6 +506,8 @@ const struct riscv_isa_ext_data riscv_isa_ext[] = { __RISCV_ISA_EXT_DATA_VALIDATE(zcd, RISCV_ISA_EXT_ZCD, riscv_ext_zcd_validate), __RISCV_ISA_EXT_DATA_VALIDATE(zcf, RISCV_ISA_EXT_ZCF, 
riscv_ext_zcf_validate), __RISCV_ISA_EXT_DATA_VALIDATE(zcmop, RISCV_ISA_EXT_ZCMOP, riscv_ext_zca_depends), + __RISCV_ISA_EXT_DATA_VALIDATE(zclsd, RISCV_ISA_EXT_ZCLSD, riscv_ext_zclsd_validate), + __RISCV_ISA_EXT_DATA_VALIDATE(zilsd, RISCV_ISA_EXT_ZILSD, riscv_ext_zilsd_validate), __RISCV_ISA_EXT_DATA(zba, RISCV_ISA_EXT_ZBA), __RISCV_ISA_EXT_DATA(zbb, RISCV_ISA_EXT_ZBB), __RISCV_ISA_EXT_DATA(zbc, RISCV_ISA_EXT_ZBC), -- cgit v1.2.3 From 6118ebed3bdf896038f58d0d1804f551f33e8643 Mon Sep 17 00:00:00 2001 From: Pincheng Wang Date: Wed, 27 Aug 2025 00:29:37 +0800 Subject: riscv: hwprobe: export Zilsd and Zclsd ISA extensions Export Zilsd and Zclsd ISA extensions through hwprobe. Signed-off-by: Pincheng Wang Reviewed-by: Nutty Liu Link: https://patch.msgid.link/20250826162939.1494021-4-pincheng.plct@isrc.iscas.ac.cn [pjw@kernel.org: fixed whitespace; updated to apply] Signed-off-by: Paul Walmsley --- arch/riscv/include/uapi/asm/hwprobe.h | 3 +++ arch/riscv/kernel/sys_hwprobe.c | 2 ++ 2 files changed, 5 insertions(+) (limited to 'arch') diff --git a/arch/riscv/include/uapi/asm/hwprobe.h b/arch/riscv/include/uapi/asm/hwprobe.h index 1edea2331b8b..cd3c126730c3 100644 --- a/arch/riscv/include/uapi/asm/hwprobe.h +++ b/arch/riscv/include/uapi/asm/hwprobe.h @@ -84,6 +84,9 @@ struct riscv_hwprobe { #define RISCV_HWPROBE_EXT_ZABHA (1ULL << 58) #define RISCV_HWPROBE_EXT_ZALASR (1ULL << 59) #define RISCV_HWPROBE_EXT_ZICBOP (1ULL << 60) +#define RISCV_HWPROBE_EXT_ZILSD (1ULL << 61) +#define RISCV_HWPROBE_EXT_ZCLSD (1ULL << 62) + #define RISCV_HWPROBE_KEY_CPUPERF_0 5 #define RISCV_HWPROBE_MISALIGNED_UNKNOWN (0 << 0) #define RISCV_HWPROBE_MISALIGNED_EMULATED (1 << 0) diff --git a/arch/riscv/kernel/sys_hwprobe.c b/arch/riscv/kernel/sys_hwprobe.c index 0f701ace3bb9..e6787ba7f2fc 100644 --- a/arch/riscv/kernel/sys_hwprobe.c +++ b/arch/riscv/kernel/sys_hwprobe.c @@ -121,6 +121,7 @@ static void hwprobe_isa_ext0(struct riscv_hwprobe *pair, EXT_KEY(ZBS); EXT_KEY(ZCA); EXT_KEY(ZCB); + 
EXT_KEY(ZCLSD); EXT_KEY(ZCMOP); EXT_KEY(ZICBOM); EXT_KEY(ZICBOP); @@ -130,6 +131,7 @@ static void hwprobe_isa_ext0(struct riscv_hwprobe *pair, EXT_KEY(ZIHINTNTL); EXT_KEY(ZIHINTPAUSE); EXT_KEY(ZIHPM); + EXT_KEY(ZILSD); EXT_KEY(ZIMOP); EXT_KEY(ZKND); EXT_KEY(ZKNE); -- cgit v1.2.3 From f02dd254727665cc292669194b9171bb70413346 Mon Sep 17 00:00:00 2001 From: Zongmin Zhou Date: Thu, 20 Nov 2025 17:58:31 +0800 Subject: riscv/atomic.h: use RISCV_FULL_BARRIER in _arch_atomic* function. Replace the same code with the pre-defined macro RISCV_FULL_BARRIER to simplify the code. Signed-off-by: Zongmin Zhou Link: https://patch.msgid.link/20251120095831.64211-1-min_halo@163.com Signed-off-by: Paul Walmsley --- arch/riscv/include/asm/atomic.h | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'arch') diff --git a/arch/riscv/include/asm/atomic.h b/arch/riscv/include/asm/atomic.h index 5b96c2f61adb..3f33dc54f94b 100644 --- a/arch/riscv/include/asm/atomic.h +++ b/arch/riscv/include/asm/atomic.h @@ -203,7 +203,7 @@ ATOMIC_OPS(xor, xor, i) " add %[rc], %[p], %[a]\n" \ " sc." sfx ".rl %[rc], %[rc], %[c]\n" \ " bnez %[rc], 0b\n" \ - " fence rw, rw\n" \ + RISCV_FULL_BARRIER \ "1:\n" \ : [p]"=&r" (_prev), [rc]"=&r" (_rc), [c]"+A" (counter) \ : [a]"r" (_a), [u]"r" (_u) \ @@ -242,7 +242,7 @@ static __always_inline s64 arch_atomic64_fetch_add_unless(atomic64_t *v, s64 a, " addi %[rc], %[p], 1\n" \ " sc." sfx ".rl %[rc], %[rc], %[c]\n" \ " bnez %[rc], 0b\n" \ - " fence rw, rw\n" \ + RISCV_FULL_BARRIER \ "1:\n" \ : [p]"=&r" (_prev), [rc]"=&r" (_rc), [c]"+A" (counter) \ : \ @@ -268,7 +268,7 @@ static __always_inline bool arch_atomic_inc_unless_negative(atomic_t *v) " addi %[rc], %[p], -1\n" \ " sc." 
sfx ".rl %[rc], %[rc], %[c]\n" \ " bnez %[rc], 0b\n" \ - " fence rw, rw\n" \ + RISCV_FULL_BARRIER \ "1:\n" \ : [p]"=&r" (_prev), [rc]"=&r" (_rc), [c]"+A" (counter) \ : \ @@ -294,7 +294,7 @@ static __always_inline bool arch_atomic_dec_unless_positive(atomic_t *v) " bltz %[rc], 1f\n" \ " sc." sfx ".rl %[rc], %[rc], %[c]\n" \ " bnez %[rc], 0b\n" \ - " fence rw, rw\n" \ + RISCV_FULL_BARRIER \ "1:\n" \ : [p]"=&r" (_prev), [rc]"=&r" (_rc), [c]"+A" (counter) \ : \ -- cgit v1.2.3 From 5efaf92da4365cb8d1ae6dd7a2d1245c69e09ff5 Mon Sep 17 00:00:00 2001 From: Himanshu Chauhan Date: Thu, 10 Jul 2025 18:22:30 +0530 Subject: riscv: Add SBI debug trigger extension and function ids Debug trigger extension is an SBI extension to support native debugging in S-mode and VS-mode. This patch adds the extension and the function IDs defined by the extension. Signed-off-by: Himanshu Chauhan Link: https://patch.msgid.link/20250710125231.653967-2-hchauhan@ventanamicro.com [pjw@kernel.org: updated to apply] Signed-off-by: Paul Walmsley --- arch/riscv/include/asm/sbi.h | 29 +++++++++++++++++++++++++++++ 1 file changed, 29 insertions(+) (limited to 'arch') diff --git a/arch/riscv/include/asm/sbi.h b/arch/riscv/include/asm/sbi.h index ccc77a89b1e2..5725e0ca4dda 100644 --- a/arch/riscv/include/asm/sbi.h +++ b/arch/riscv/include/asm/sbi.h @@ -37,6 +37,7 @@ enum sbi_ext_id { SBI_EXT_NACL = 0x4E41434C, SBI_EXT_FWFT = 0x46574654, SBI_EXT_MPXY = 0x4D505859, + SBI_EXT_DBTR = 0x44425452, /* Experimentals extensions must lie within this range */ SBI_EXT_EXPERIMENTAL_START = 0x08000000, @@ -505,6 +506,34 @@ enum sbi_mpxy_rpmi_attribute_id { #define SBI_MPXY_CHAN_CAP_SEND_WITHOUT_RESP BIT(4) #define SBI_MPXY_CHAN_CAP_GET_NOTIFICATIONS BIT(5) +/* SBI debug triggers function IDs */ +enum sbi_ext_dbtr_fid { + SBI_EXT_DBTR_NUM_TRIGGERS = 0, + SBI_EXT_DBTR_SETUP_SHMEM, + SBI_EXT_DBTR_TRIG_READ, + SBI_EXT_DBTR_TRIG_INSTALL, + SBI_EXT_DBTR_TRIG_UPDATE, + SBI_EXT_DBTR_TRIG_UNINSTALL, + SBI_EXT_DBTR_TRIG_ENABLE, + 
SBI_EXT_DBTR_TRIG_DISABLE, +}; + +struct sbi_dbtr_data_msg { + unsigned long tstate; + unsigned long tdata1; + unsigned long tdata2; + unsigned long tdata3; +}; + +struct sbi_dbtr_id_msg { + unsigned long idx; +}; + +union sbi_dbtr_shmem_entry { + struct sbi_dbtr_data_msg data; + struct sbi_dbtr_id_msg id; +}; + /* SBI spec version fields */ #define SBI_SPEC_VERSION_DEFAULT 0x1 #define SBI_SPEC_VERSION_MAJOR_SHIFT 24 -- cgit v1.2.3 From 987697749def9c5e10d9a2d992f012db61ae1967 Mon Sep 17 00:00:00 2001 From: Frank Wunderlich Date: Wed, 19 Nov 2025 18:51:22 +0100 Subject: arm64: dts: mediatek: mt7986: add dtbs with applied overlays for bpi-r3 Build devicetree binaries for testing overlays and providing users full dtb without using overlays. Suggested-by: Rob Herring Signed-off-by: Frank Wunderlich Fixes: a58c36806741 ("arm64: dts: mediatek: mt7988a-bpi-r4pro: Add mmc overlays") Fixes: dec929e61a42 ("arm64: dts: mediatek: mt7988a-bpi-r4-pro: Add PCIe overlays") Fixes: 714a80ced07a ("arm64: dts: mediatek: mt7988a-bpi-r4: Add dt overlays for sd + emmc") Fixes: 312189ebb802 ("arm64: dts: mt7986: add overlay for SATA power socket on BPI-R3") Fixes: 8e01fb15b815 ("arm64: dts: mt7986: add Bananapi R3") Acked-by: AngeloGioacchino Del Regno Acked-by: Rob Herring (Arm) Link: https://patch.msgid.link/20251119175124.48947-2-linux@fw-web.de Signed-off-by: Rob Herring (Arm) --- arch/arm64/boot/dts/mediatek/Makefile | 21 +++++++++++++++++++++ 1 file changed, 21 insertions(+) (limited to 'arch') diff --git a/arch/arm64/boot/dts/mediatek/Makefile b/arch/arm64/boot/dts/mediatek/Makefile index c5fd6191a925..77d76730d61b 100644 --- a/arch/arm64/boot/dts/mediatek/Makefile +++ b/arch/arm64/boot/dts/mediatek/Makefile @@ -19,6 +19,27 @@ dtb-$(CONFIG_ARCH_MEDIATEK) += mt7986a-bananapi-bpi-r3-nand.dtbo dtb-$(CONFIG_ARCH_MEDIATEK) += mt7986a-bananapi-bpi-r3-nor.dtbo dtb-$(CONFIG_ARCH_MEDIATEK) += mt7986a-bananapi-bpi-r3-sata.dtbo dtb-$(CONFIG_ARCH_MEDIATEK) += mt7986a-bananapi-bpi-r3-sd.dtbo 
+mt7986a-bananapi-bpi-r3-emmc-nand-dtbs := \ + mt7986a-bananapi-bpi-r3.dtb \ + mt7986a-bananapi-bpi-r3-emmc.dtbo \ + mt7986a-bananapi-bpi-r3-nand.dtbo +dtb-$(CONFIG_ARCH_MEDIATEK) += mt7986a-bananapi-bpi-r3-emmc-nand.dtb +mt7986a-bananapi-bpi-r3-emmc-nor-dtbs := \ + mt7986a-bananapi-bpi-r3.dtb \ + mt7986a-bananapi-bpi-r3-emmc.dtbo \ + mt7986a-bananapi-bpi-r3-nor.dtbo +dtb-$(CONFIG_ARCH_MEDIATEK) += mt7986a-bananapi-bpi-r3-emmc-nor.dtb +mt7986a-bananapi-bpi-r3-sd-nand-dtbs := \ + mt7986a-bananapi-bpi-r3.dtb \ + mt7986a-bananapi-bpi-r3-sd.dtbo \ + mt7986a-bananapi-bpi-r3-nand.dtbo \ + mt7986a-bananapi-bpi-r3-sata.dtbo +dtb-$(CONFIG_ARCH_MEDIATEK) += mt7986a-bananapi-bpi-r3-sd-nand.dtb +mt7986a-bananapi-bpi-r3-sd-nor-dtbs := \ + mt7986a-bananapi-bpi-r3.dtb \ + mt7986a-bananapi-bpi-r3-sd.dtbo \ + mt7986a-bananapi-bpi-r3-nor.dtbo +dtb-$(CONFIG_ARCH_MEDIATEK) += mt7986a-bananapi-bpi-r3-sd-nor.dtb dtb-$(CONFIG_ARCH_MEDIATEK) += mt7986a-rfb.dtb dtb-$(CONFIG_ARCH_MEDIATEK) += mt7986b-rfb.dtb dtb-$(CONFIG_ARCH_MEDIATEK) += mt7988a-bananapi-bpi-r4.dtb -- cgit v1.2.3 From 0773bc6ab7ec0b707632c991fe29edf28f03a641 Mon Sep 17 00:00:00 2001 From: Frank Wunderlich Date: Wed, 19 Nov 2025 18:51:23 +0100 Subject: arm64: dts: mediatek: mt7988: add dtbs with applied overlays for bpi-r4 (pro) Build devicetree binaries for testing overlays and providing users full dtb without using overlays for Bananapi R4 (pro) variants. 
Signed-off-by: Frank Wunderlich Link: https://patch.msgid.link/20251119175124.48947-3-linux@fw-web.de Signed-off-by: Rob Herring (Arm) --- arch/arm64/boot/dts/mediatek/Makefile | 32 ++++++++++++++++++++++++++++++++ 1 file changed, 32 insertions(+) (limited to 'arch') diff --git a/arch/arm64/boot/dts/mediatek/Makefile b/arch/arm64/boot/dts/mediatek/Makefile index 77d76730d61b..cac8f4c6d76f 100644 --- a/arch/arm64/boot/dts/mediatek/Makefile +++ b/arch/arm64/boot/dts/mediatek/Makefile @@ -52,6 +52,38 @@ dtb-$(CONFIG_ARCH_MEDIATEK) += mt7988a-bananapi-bpi-r4-pro-cn18.dtbo dtb-$(CONFIG_ARCH_MEDIATEK) += mt7988a-bananapi-bpi-r4-pro-emmc.dtbo dtb-$(CONFIG_ARCH_MEDIATEK) += mt7988a-bananapi-bpi-r4-pro-sd.dtbo dtb-$(CONFIG_ARCH_MEDIATEK) += mt7988a-bananapi-bpi-r4-sd.dtbo +mt7988a-bananapi-bpi-r4-emmc-dtbs := \ + mt7988a-bananapi-bpi-r4.dtb \ + mt7988a-bananapi-bpi-r4-emmc.dtbo +dtb-$(CONFIG_ARCH_MEDIATEK) += mt7988a-bananapi-bpi-r4-emmc.dtb +mt7988a-bananapi-bpi-r4-sd-dtbs := \ + mt7988a-bananapi-bpi-r4.dtb \ + mt7988a-bananapi-bpi-r4-sd.dtbo +dtb-$(CONFIG_ARCH_MEDIATEK) += mt7988a-bananapi-bpi-r4-sd.dtb +mt7988a-bananapi-bpi-r4-2g5-emmc-dtbs := \ + mt7988a-bananapi-bpi-r4-2g5.dtb \ + mt7988a-bananapi-bpi-r4-emmc.dtbo +dtb-$(CONFIG_ARCH_MEDIATEK) += mt7988a-bananapi-bpi-r4-2g5-emmc.dtb +mt7988a-bananapi-bpi-r4-2g5-sd-dtbs := \ + mt7988a-bananapi-bpi-r4-2g5.dtb \ + mt7988a-bananapi-bpi-r4-sd.dtbo +dtb-$(CONFIG_ARCH_MEDIATEK) += mt7988a-bananapi-bpi-r4-2g5-sd.dtb +mt7988a-bananapi-bpi-r4-pro-8x-emmc-dtbs := \ + mt7988a-bananapi-bpi-r4-pro-8x.dtb \ + mt7988a-bananapi-bpi-r4-pro-emmc.dtbo +dtb-$(CONFIG_ARCH_MEDIATEK) += mt7988a-bananapi-bpi-r4-pro-8x-emmc.dtb +mt7988a-bananapi-bpi-r4-pro-8x-sd-dtbs := \ + mt7988a-bananapi-bpi-r4-pro-8x.dtb \ + mt7988a-bananapi-bpi-r4-pro-sd.dtbo +dtb-$(CONFIG_ARCH_MEDIATEK) += mt7988a-bananapi-bpi-r4-pro-8x-sd.dtb +mt7988a-bananapi-bpi-r4-pro-8x-sd-cn15-dtbs := \ + mt7988a-bananapi-bpi-r4-pro-8x-sd.dtb \ + mt7988a-bananapi-bpi-r4-pro-cn15.dtbo 
+dtb-$(CONFIG_ARCH_MEDIATEK) += mt7988a-bananapi-bpi-r4-pro-8x-sd-cn15.dtb +mt7988a-bananapi-bpi-r4-pro-8x-sd-cn18-dtbs := \ + mt7988a-bananapi-bpi-r4-pro-8x-sd.dtb \ + mt7988a-bananapi-bpi-r4-pro-cn18.dtbo +dtb-$(CONFIG_ARCH_MEDIATEK) += mt7988a-bananapi-bpi-r4-pro-8x-sd-cn18.dtb dtb-$(CONFIG_ARCH_MEDIATEK) += mt8167-pumpkin.dtb dtb-$(CONFIG_ARCH_MEDIATEK) += mt8173-elm.dtb dtb-$(CONFIG_ARCH_MEDIATEK) += mt8173-elm-hana.dtb -- cgit v1.2.3 From ce7b1d58609abc2941a1f38094147f439fb74233 Mon Sep 17 00:00:00 2001 From: "Rob Herring (Arm)" Date: Fri, 5 Dec 2025 22:59:38 +0100 Subject: arm64: dts: mediatek: Apply mt8395-radxa DT overlay at build time It's a requirement that DT overlays be applied at build time in order to validate them as overlays are not validated on their own. Add missing target for mt8395-radxa hd panel overlay. Fixes: 4c8ff61199a7 ("arm64: dts: mediatek: mt8395-radxa-nio-12l: Add Radxa 8 HD panel") Signed-off-by: Frank Wunderlich Acked-by: AngeloGioacchino Del Regno Link: https://patch.msgid.link/20251205215940.19287-1-linux@fw-web.de Signed-off-by: Rob Herring (Arm) --- arch/arm64/boot/dts/mediatek/Makefile | 2 ++ 1 file changed, 2 insertions(+) (limited to 'arch') diff --git a/arch/arm64/boot/dts/mediatek/Makefile b/arch/arm64/boot/dts/mediatek/Makefile index cac8f4c6d76f..3f76d9ce9879 100644 --- a/arch/arm64/boot/dts/mediatek/Makefile +++ b/arch/arm64/boot/dts/mediatek/Makefile @@ -166,6 +166,8 @@ dtb-$(CONFIG_ARCH_MEDIATEK) += mt8390-grinn-genio-700-sbc.dtb dtb-$(CONFIG_ARCH_MEDIATEK) += mt8395-kontron-3-5-sbc-i1200.dtb dtb-$(CONFIG_ARCH_MEDIATEK) += mt8395-radxa-nio-12l.dtb dtb-$(CONFIG_ARCH_MEDIATEK) += mt8395-radxa-nio-12l-8-hd-panel.dtbo +mt8395-radxa-nio-12l-8-hd-panel-dtbs := mt8395-radxa-nio-12l.dtb mt8395-radxa-nio-12l-8-hd-panel.dtbo +dtb-$(CONFIG_ARCH_MEDIATEK) += mt8395-radxa-nio-12l-8-hd-panel.dtb dtb-$(CONFIG_ARCH_MEDIATEK) += mt8516-pumpkin.dtb # Device tree overlays support -- cgit v1.2.3 From 
91ff28ae6d050e0ca01ac13eb8ba31d744cf672f Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Fri, 19 Dec 2025 11:20:07 +0000 Subject: x86/irqflags: Use ASM_OUTPUT_RM in native_save_fl() clang is generating very inefficient code for native_save_fl() which is used for local_irq_save() in critical spots. Allowing the "pop %0" to use memory: 1) forces the compiler to add annoying stack canaries when CONFIG_STACKPROTECTOR_STRONG=y in many places. 2) Almost always is followed by an immediate "move memory,register" One good example is _raw_spin_lock_irqsave, with 8 extra instructions ffffffff82067a30 <_raw_spin_lock_irqsave>: ffffffff82067a30: ... ffffffff82067a39: 53 push %rbx // Three instructions to adjust the stack, read the per-cpu canary // and copy it to 8(%rsp) ffffffff82067a3a: 48 83 ec 10 sub $0x10,%rsp ffffffff82067a3e: 65 48 8b 05 da 15 45 02 mov %gs:0x24515da(%rip),%rax # <__stack_chk_guard> ffffffff82067a46: 48 89 44 24 08 mov %rax,0x8(%rsp) ffffffff82067a4b: 9c pushf // instead of pop %rbx, compiler uses 2 instructions. ffffffff82067a4c: 8f 04 24 pop (%rsp) ffffffff82067a4f: 48 8b 1c 24 mov (%rsp),%rbx ffffffff82067a53: fa cli ffffffff82067a54: b9 01 00 00 00 mov $0x1,%ecx ffffffff82067a59: 31 c0 xor %eax,%eax ffffffff82067a5b: f0 0f b1 0f lock cmpxchg %ecx,(%rdi) ffffffff82067a5f: 75 1d jne ffffffff82067a7e <_raw_spin_lock_irqsave+0x4e> // three instructions to check the stack canary ffffffff82067a61: 65 48 8b 05 b7 15 45 02 mov %gs:0x24515b7(%rip),%rax # <__stack_chk_guard> ffffffff82067a69: 48 3b 44 24 08 cmp 0x8(%rsp),%rax ffffffff82067a6e: 75 17 jne ffffffff82067a87 ... // One extra instruction to adjust the stack. ffffffff82067a73: 48 83 c4 10 add $0x10,%rsp ... // One more instruction in case the stack was mangled. ffffffff82067a87: e8 a4 35 ff ff call ffffffff8205b030 <__stack_chk_fail> This patch changes nothing for gcc, but for clang saves ~20000 bytes of text even though more functions are inlined.
$ size vmlinux.gcc.before vmlinux.gcc.after vmlinux.clang.before vmlinux.clang.after text data bss dec hex filename 45565821 25005462 4704800 75276083 47c9f33 vmlinux.gcc.before 45565821 25005462 4704800 75276083 47c9f33 vmlinux.gcc.after 45121072 24638617 5533040 75292729 47ce039 vmlinux.clang.before 45093887 24638633 5536808 75269328 47c84d0 vmlinux.clang.after $ scripts/bloat-o-meter -t vmlinux.clang.before vmlinux.clang.after add/remove: 1/2 grow/shrink: 21/533 up/down: 2250/-22112 (-19862) Signed-off-by: Eric Dumazet Cc: Uros Bizjak Signed-off-by: Linus Torvalds --- arch/x86/include/asm/irqflags.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch') diff --git a/arch/x86/include/asm/irqflags.h b/arch/x86/include/asm/irqflags.h index b30e5474c18e..a1193e9d65f2 100644 --- a/arch/x86/include/asm/irqflags.h +++ b/arch/x86/include/asm/irqflags.h @@ -25,7 +25,7 @@ extern __always_inline unsigned long native_save_fl(void) */ asm volatile("# __raw_save_flags\n\t" "pushf ; pop %0" - : "=rm" (flags) + : ASM_OUTPUT_RM (flags) : /* no input */ : "memory"); -- cgit v1.2.3 From c2296a1e42418556efbeb5636c4fa6aa6106713a Mon Sep 17 00:00:00 2001 From: "Nysal Jan K.A." Date: Tue, 28 Oct 2025 16:25:12 +0530 Subject: powerpc/kexec: Enable SMT before waking offline CPUs If SMT is disabled or a partial SMT state is enabled, when a new kernel image is loaded for kexec, on reboot the following warning is observed: kexec: Waking offline cpu 228. 
WARNING: CPU: 0 PID: 9062 at arch/powerpc/kexec/core_64.c:223 kexec_prepare_cpus+0x1b0/0x1bc [snip] NIP kexec_prepare_cpus+0x1b0/0x1bc LR kexec_prepare_cpus+0x1a0/0x1bc Call Trace: kexec_prepare_cpus+0x1a0/0x1bc (unreliable) default_machine_kexec+0x160/0x19c machine_kexec+0x80/0x88 kernel_kexec+0xd0/0x118 __do_sys_reboot+0x210/0x2c4 system_call_exception+0x124/0x320 system_call_vectored_common+0x15c/0x2ec This occurs as add_cpu() fails due to cpu_bootable() returning false for CPUs that fail the cpu_smt_thread_allowed() check or non primary threads if SMT is disabled. Fix the issue by enabling SMT and resetting the number of SMT threads to the number of threads per core, before attempting to wake up all present CPUs. Fixes: 38253464bc82 ("cpu/SMT: Create topology_smt_thread_allowed()") Reported-by: Sachin P Bappalige Cc: stable@vger.kernel.org # v6.6+ Reviewed-by: Srikar Dronamraju Signed-off-by: Nysal Jan K.A. Tested-by: Samir M Reviewed-by: Sourabh Jain Signed-off-by: Madhavan Srinivasan Link: https://patch.msgid.link/20251028105516.26258-1-nysal@linux.ibm.com --- arch/powerpc/kexec/core_64.c | 19 +++++++++++++++++++ 1 file changed, 19 insertions(+) (limited to 'arch') diff --git a/arch/powerpc/kexec/core_64.c b/arch/powerpc/kexec/core_64.c index 222aa326dace..825ab8a88f18 100644 --- a/arch/powerpc/kexec/core_64.c +++ b/arch/powerpc/kexec/core_64.c @@ -202,6 +202,23 @@ static void kexec_prepare_cpus_wait(int wait_state) mb(); } + +/* + * The add_cpu() call in wake_offline_cpus() can fail as cpu_bootable() + * returns false for CPUs that fail the cpu_smt_thread_allowed() check + * or non primary threads if SMT is disabled. Re-enable SMT and set the + * number of SMT threads to threads per core. 
+ */ +static void kexec_smt_reenable(void) +{ +#if defined(CONFIG_SMP) && defined(CONFIG_HOTPLUG_SMT) + lock_device_hotplug(); + cpu_smt_num_threads = threads_per_core; + cpu_smt_control = CPU_SMT_ENABLED; + unlock_device_hotplug(); +#endif +} + /* * We need to make sure each present CPU is online. The next kernel will scan * the device tree and assume primary threads are online and query secondary @@ -216,6 +233,8 @@ static void wake_offline_cpus(void) { int cpu = 0; + kexec_smt_reenable(); + for_each_present_cpu(cpu) { if (!cpu_online(cpu)) { printk(KERN_INFO "kexec: Waking offline cpu %d.\n", -- cgit v1.2.3 From f1164534ad62f0cc247d99650b07bd59ad2a49fd Mon Sep 17 00:00:00 2001 From: Jan Stancek Date: Tue, 23 Sep 2025 17:32:16 +0200 Subject: powerpc/tools: drop `-o pipefail` in gcc check scripts Fixes: 0f71dcfb4aef ("powerpc/ftrace: Add support for -fpatchable-function-entry") Fixes: b71c9ffb1405 ("powerpc: Add arch/powerpc/tools directory") Reported-by: Joe Lawrence Acked-by: Joe Lawrence Signed-off-by: Jan Stancek Fixes: 8c50b72a3b4f ("powerpc/ftrace: Add Kconfig & Make glue for mprofile-kernel") Fixes: abba759796f9 ("powerpc/kbuild: move -mprofile-kernel check to Kconfig") Tested-by: Justin M. 
Forbes Reviewed-by: Naveen N Rao (AMD) Reviewed-by: Josh Poimboeuf Signed-off-by: Madhavan Srinivasan Link: https://patch.msgid.link/cc6cdd116c3ad9d990df21f13c6d8e8a83815bbd.1758641374.git.jstancek@redhat.com --- arch/powerpc/tools/gcc-check-fpatchable-function-entry.sh | 1 - arch/powerpc/tools/gcc-check-mprofile-kernel.sh | 1 - 2 files changed, 2 deletions(-) (limited to 'arch') diff --git a/arch/powerpc/tools/gcc-check-fpatchable-function-entry.sh b/arch/powerpc/tools/gcc-check-fpatchable-function-entry.sh index 06706903503b..baed467a016b 100755 --- a/arch/powerpc/tools/gcc-check-fpatchable-function-entry.sh +++ b/arch/powerpc/tools/gcc-check-fpatchable-function-entry.sh @@ -2,7 +2,6 @@ # SPDX-License-Identifier: GPL-2.0 set -e -set -o pipefail # To debug, uncomment the following line # set -x diff --git a/arch/powerpc/tools/gcc-check-mprofile-kernel.sh b/arch/powerpc/tools/gcc-check-mprofile-kernel.sh index 73e331e7660e..6193b0ed0c77 100755 --- a/arch/powerpc/tools/gcc-check-mprofile-kernel.sh +++ b/arch/powerpc/tools/gcc-check-mprofile-kernel.sh @@ -2,7 +2,6 @@ # SPDX-License-Identifier: GPL-2.0 set -e -set -o pipefail # To debug, uncomment the following line # set -x -- cgit v1.2.3 From b94b73567561642323617155bf4ee24ef0d258fe Mon Sep 17 00:00:00 2001 From: Finn Thain Date: Mon, 10 Nov 2025 10:30:22 +1100 Subject: powerpc: Add reloc_offset() to font bitmap pointer used for bootx_printf() Since Linux v6.7, booting using BootX on an Old World PowerMac produces an early crash. Stan Johnson writes, "the symptoms are that the screen goes blank and the backlight stays on, and the system freezes (Linux doesn't boot)." Further testing revealed that the failure can be avoided by disabling CONFIG_BOOTX_TEXT. Bisection revealed that the regression was caused by a change to the font bitmap pointer that's used when btext_init() begins painting characters on the display, early in the boot process. 
Christophe Leroy explains, "before kernel text is relocated to its final location ... data is addressed with an offset which is added to the Global Offset Table (GOT) entries at the start of bootx_init() by function reloc_got2(). But the pointers that are located inside a structure are not referenced in the GOT and are therefore not updated by reloc_got2(). It is therefore needed to apply the offset manually by using PTRRELOC() macro." Cc: stable@vger.kernel.org Link: https://lists.debian.org/debian-powerpc/2025/10/msg00111.html Link: https://lore.kernel.org/linuxppc-dev/d81ddca8-c5ee-d583-d579-02b19ed95301@yahoo.com/ Reported-by: Cedar Maxwell Closes: https://lists.debian.org/debian-powerpc/2025/09/msg00031.html Bisected-by: Stan Johnson Tested-by: Stan Johnson Fixes: 0ebc7feae79a ("powerpc: Use shared font data") Suggested-by: Christophe Leroy Signed-off-by: Finn Thain Reviewed-by: Christophe Leroy Signed-off-by: Madhavan Srinivasan Link: https://patch.msgid.link/22b3b247425a052b079ab84da926706b3702c2c7.1762731022.git.fthain@linux-m68k.org --- arch/powerpc/kernel/btext.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'arch') diff --git a/arch/powerpc/kernel/btext.c b/arch/powerpc/kernel/btext.c index 7f63f1cdc6c3..ca00c4824e31 100644 --- a/arch/powerpc/kernel/btext.c +++ b/arch/powerpc/kernel/btext.c @@ -20,6 +20,7 @@ #include #include #include +#include #define NO_SCROLL @@ -463,7 +464,7 @@ static noinline void draw_byte(unsigned char c, long locX, long locY) { unsigned char *base = calc_base(locX << 3, locY << 4); unsigned int font_index = c * 16; - const unsigned char *font = font_sun_8x16.data + font_index; + const unsigned char *font = PTRRELOC(font_sun_8x16.data) + font_index; int rb = dispDeviceRowBytes; rmci_maybe_on(); -- cgit v1.2.3 From fbe409d138b1d8a8b91cdad19cf95495e8ebe1ee Mon Sep 17 00:00:00 2001 From: Aboorva Devarajan Date: Mon, 8 Sep 2025 14:21:23 +0530 Subject: powerpc/powernv: Enable cpuidle state detection for POWER11 
Extend cpuidle state detection to POWER11 by updating the PVR check. This ensures POWER11 correctly recognizes supported stop states, similar to POWER9 and POWER10. Without Patch: (Power11 - PowerNV systems) CPUidle driver: powernv_idle CPUidle governor: menu analyzing CPU 927: Number of idle states: 1 Available idle states: snooze snooze: Flags/Description: snooze Latency: 0 Usage: 251631 Duration: 207497715900 -- With Patch: (Power11 - PowerNV systems) CPUidle driver: powernv_idle CPUidle governor: menu analyzing CPU 959: Number of idle states: 4 Available idle states: snooze stop0_lite stop0 stop3 snooze: Flags/Description: snooze Latency: 0 Usage: 2 Duration: 33 stop0_lite: Flags/Description: stop0_lite Latency: 1 Usage: 1 Duration: 52 stop0: Flags/Description: stop0 Latency: 10 Usage: 13 Duration: 1920 stop3: Flags/Description: stop3 Latency: 45 Usage: 381 Duration: 21638478 Signed-off-by: Aboorva Devarajan Tested-by: Madadi Vineeth Reddy Reviewed-by: Madadi Vineeth Reddy Signed-off-by: Madhavan Srinivasan Link: https://patch.msgid.link/20250908085123.216780-1-aboorvad@linux.ibm.com --- arch/powerpc/platforms/powernv/idle.c | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) (limited to 'arch') diff --git a/arch/powerpc/platforms/powernv/idle.c b/arch/powerpc/platforms/powernv/idle.c index d98b933e4984..e4f4e907f6e3 100644 --- a/arch/powerpc/platforms/powernv/idle.c +++ b/arch/powerpc/platforms/powernv/idle.c @@ -1171,8 +1171,9 @@ static void __init pnv_arch300_idle_init(void) u64 max_residency_ns = 0; int i; - /* stop is not really architected, we only have p9,p10 drivers */ - if (!pvr_version_is(PVR_POWER10) && !pvr_version_is(PVR_POWER9)) + /* stop is not really architected, we only have p9,p10 and p11 drivers */ + if (!pvr_version_is(PVR_POWER9) && !pvr_version_is(PVR_POWER10) && + !pvr_version_is(PVR_POWER11)) return; /* @@ -1189,8 +1190,8 @@ static void __init pnv_arch300_idle_init(void) struct pnv_idle_states_t *state = &pnv_idle_states[i]; 
u64 psscr_rl = state->psscr_val & PSSCR_RL_MASK; - /* No deep loss driver implemented for POWER10 yet */ - if (pvr_version_is(PVR_POWER10) && + /* No deep loss driver implemented for POWER10 and POWER11 yet */ + if ((pvr_version_is(PVR_POWER10) || pvr_version_is(PVR_POWER11)) && state->flags & (OPAL_PM_TIMEBASE_STOP|OPAL_PM_LOSE_FULL_CONTEXT)) continue; -- cgit v1.2.3 From 608328ba5b0619cbc28b409296b5e3840bcb97b6 Mon Sep 17 00:00:00 2001 From: "Christophe Leroy (CS GROUP)" Date: Fri, 19 Dec 2025 13:23:52 +0100 Subject: powerpc/32: Restore disabling of interrupts at interrupt/syscall exit Commit 2997876c4a1a ("powerpc/32: Restore clearing of MSR[RI] at interrupt/syscall exit") delayed clearing of MSR[RI], but missed that both MSR[RI] and MSR[EE] are cleared at the same time, so the commit also delayed the disabling of interrupts, leading to unexpected behaviour. To fix that, mostly revert the blamed commit and restore the clearing of MSR[RI] in interrupt_exit_kernel_prepare() instead. For 8xx it implies adding a synchronising instruction after the mtspr in order to make sure no instruction counter interrupt (used for perf events) will fire just after clearing MSR[RI]. 
Reported-by: Christian Zigotzky Closes: https://lore.kernel.org/all/4d0bd05d-6158-1323-3509-744d3fbe8fc7@xenosoft.de/ Reported-by: Guenter Roeck Closes: https://lore.kernel.org/all/6b05eb1c-fdef-44e0-91a7-8286825e68f1@roeck-us.net/ Fixes: 2997876c4a1a ("powerpc/32: Restore clearing of MSR[RI] at interrupt/syscall exit") Signed-off-by: Christophe Leroy (CS GROUP) Signed-off-by: Madhavan Srinivasan Link: https://patch.msgid.link/585ea521b2be99d293b539bbfae148366cfb3687.1766146895.git.chleroy@kernel.org --- arch/powerpc/include/asm/hw_irq.h | 2 +- arch/powerpc/include/asm/reg.h | 1 + arch/powerpc/kernel/entry_32.S | 15 --------------- arch/powerpc/kernel/interrupt.c | 5 ++++- 4 files changed, 6 insertions(+), 17 deletions(-) (limited to 'arch') diff --git a/arch/powerpc/include/asm/hw_irq.h b/arch/powerpc/include/asm/hw_irq.h index 1078ba88efaf..9cd945f2acaf 100644 --- a/arch/powerpc/include/asm/hw_irq.h +++ b/arch/powerpc/include/asm/hw_irq.h @@ -90,7 +90,7 @@ static inline void __hard_EE_RI_disable(void) if (IS_ENABLED(CONFIG_BOOKE)) wrtee(0); else if (IS_ENABLED(CONFIG_PPC_8xx)) - wrtspr(SPRN_NRI); + wrtspr_sync(SPRN_NRI); else if (IS_ENABLED(CONFIG_PPC_BOOK3S_64)) __mtmsrd(0, 1); else diff --git a/arch/powerpc/include/asm/reg.h b/arch/powerpc/include/asm/reg.h index 3fe186635432..3449dd2b577d 100644 --- a/arch/powerpc/include/asm/reg.h +++ b/arch/powerpc/include/asm/reg.h @@ -1400,6 +1400,7 @@ static inline void mtmsr_isync(unsigned long val) : "r" ((unsigned long)(v)) \ : "memory") #define wrtspr(rn) asm volatile("mtspr " __stringify(rn) ",2" : : : "memory") +#define wrtspr_sync(rn) asm volatile("mtspr " __stringify(rn) ",2; sync" : : : "memory") static inline void wrtee(unsigned long val) { diff --git a/arch/powerpc/kernel/entry_32.S b/arch/powerpc/kernel/entry_32.S index 16f8ee6cb2cd..d8426251b1cd 100644 --- a/arch/powerpc/kernel/entry_32.S +++ b/arch/powerpc/kernel/entry_32.S @@ -101,17 +101,6 @@ SYM_FUNC_END(__kuep_unlock) .endm #endif -.macro clr_ri trash 
-#ifndef CONFIG_BOOKE -#ifdef CONFIG_PPC_8xx - mtspr SPRN_NRI, \trash -#else - li \trash, MSR_KERNEL & ~MSR_RI - mtmsr \trash -#endif -#endif -.endm - .globl transfer_to_syscall transfer_to_syscall: stw r3, ORIG_GPR3(r1) @@ -160,7 +149,6 @@ ret_from_syscall: cmpwi r3,0 REST_GPR(3, r1) syscall_exit_finish: - clr_ri r4 mtspr SPRN_SRR0,r7 mtspr SPRN_SRR1,r8 @@ -237,7 +225,6 @@ fast_exception_return: /* Clear the exception marker on the stack to avoid confusing stacktrace */ li r10, 0 stw r10, 8(r11) - clr_ri r10 mtspr SPRN_SRR1,r9 mtspr SPRN_SRR0,r12 REST_GPR(9, r11) @@ -270,7 +257,6 @@ interrupt_return: .Lfast_user_interrupt_return: lwz r11,_NIP(r1) lwz r12,_MSR(r1) - clr_ri r4 mtspr SPRN_SRR0,r11 mtspr SPRN_SRR1,r12 @@ -313,7 +299,6 @@ END_FTR_SECTION_IFSET(CPU_FTR_NEED_PAIRED_STWCX) cmpwi cr1,r3,0 lwz r11,_NIP(r1) lwz r12,_MSR(r1) - clr_ri r4 mtspr SPRN_SRR0,r11 mtspr SPRN_SRR1,r12 diff --git a/arch/powerpc/kernel/interrupt.c b/arch/powerpc/kernel/interrupt.c index aea6f7e8e9c6..e63bfde13e03 100644 --- a/arch/powerpc/kernel/interrupt.c +++ b/arch/powerpc/kernel/interrupt.c @@ -38,7 +38,7 @@ static inline bool exit_must_hard_disable(void) #else static inline bool exit_must_hard_disable(void) { - return false; + return true; } #endif @@ -443,6 +443,9 @@ again: if (unlikely(stack_store)) __hard_EE_RI_disable(); +#else + } else { + __hard_EE_RI_disable(); #endif /* CONFIG_PPC64 */ } -- cgit v1.2.3