diff options
Diffstat (limited to 'arch')
78 files changed, 850 insertions, 760 deletions
diff --git a/arch/alpha/include/asm/hwrpb.h b/arch/alpha/include/asm/hwrpb.h index fc76f36265ad..db831cf8de10 100644 --- a/arch/alpha/include/asm/hwrpb.h +++ b/arch/alpha/include/asm/hwrpb.h @@ -135,7 +135,7 @@ struct crb_struct { /* virtual->physical map */ unsigned long map_entries; unsigned long map_pages; - struct vf_map_struct map[1]; + struct vf_map_struct map[]; }; struct memclust_struct { diff --git a/arch/alpha/include/uapi/asm/ptrace.h b/arch/alpha/include/uapi/asm/ptrace.h index 5ca45934fcbb..72ed913a910f 100644 --- a/arch/alpha/include/uapi/asm/ptrace.h +++ b/arch/alpha/include/uapi/asm/ptrace.h @@ -42,6 +42,8 @@ struct pt_regs { unsigned long trap_a0; unsigned long trap_a1; unsigned long trap_a2; +/* This makes the stack 16-byte aligned as GCC expects */ + unsigned long __pad0; /* These are saved by PAL-code: */ unsigned long ps; unsigned long pc; diff --git a/arch/alpha/kernel/asm-offsets.c b/arch/alpha/kernel/asm-offsets.c index 4cfeae42c79a..e9dad60b147f 100644 --- a/arch/alpha/kernel/asm-offsets.c +++ b/arch/alpha/kernel/asm-offsets.c @@ -19,9 +19,13 @@ static void __used foo(void) DEFINE(TI_STATUS, offsetof(struct thread_info, status)); BLANK(); + DEFINE(SP_OFF, offsetof(struct pt_regs, ps)); DEFINE(SIZEOF_PT_REGS, sizeof(struct pt_regs)); BLANK(); + DEFINE(SWITCH_STACK_SIZE, sizeof(struct switch_stack)); + BLANK(); + DEFINE(HAE_CACHE, offsetof(struct alpha_machine_vector, hae_cache)); DEFINE(HAE_REG, offsetof(struct alpha_machine_vector, hae_register)); } diff --git a/arch/alpha/kernel/entry.S b/arch/alpha/kernel/entry.S index dd26062d75b3..f4d41b4538c2 100644 --- a/arch/alpha/kernel/entry.S +++ b/arch/alpha/kernel/entry.S @@ -15,10 +15,6 @@ .set noat .cfi_sections .debug_frame -/* Stack offsets. */ -#define SP_OFF 184 -#define SWITCH_STACK_SIZE 64 - .macro CFI_START_OSF_FRAME func .align 4 .globl \func @@ -198,8 +194,8 @@ CFI_END_OSF_FRAME entArith CFI_START_OSF_FRAME entMM SAVE_ALL /* save $9 - $15 so the inline exception code can manipulate them. */ - subq $sp, 56, $sp - .cfi_adjust_cfa_offset 56 + subq $sp, 64, $sp + .cfi_adjust_cfa_offset 64 stq $9, 0($sp) stq $10, 8($sp) stq $11, 16($sp) @@ -214,7 +210,7 @@ CFI_START_OSF_FRAME entMM .cfi_rel_offset $13, 32 .cfi_rel_offset $14, 40 .cfi_rel_offset $15, 48 - addq $sp, 56, $19 + addq $sp, 64, $19 /* handle the fault */ lda $8, 0x3fff bic $sp, $8, $8 @@ -227,7 +223,7 @@ CFI_START_OSF_FRAME entMM ldq $13, 32($sp) ldq $14, 40($sp) ldq $15, 48($sp) - addq $sp, 56, $sp + addq $sp, 64, $sp .cfi_restore $9 .cfi_restore $10 .cfi_restore $11 @@ -235,7 +231,7 @@ CFI_START_OSF_FRAME entMM .cfi_restore $13 .cfi_restore $14 .cfi_restore $15 - .cfi_adjust_cfa_offset -56 + .cfi_adjust_cfa_offset -64 /* finish up the syscall as normal. */ br ret_from_sys_call CFI_END_OSF_FRAME entMM @@ -382,8 +378,8 @@ entUnaUser: .cfi_restore $0 .cfi_adjust_cfa_offset -256 SAVE_ALL /* setup normal kernel stack */ - lda $sp, -56($sp) - .cfi_adjust_cfa_offset 56 + lda $sp, -64($sp) + .cfi_adjust_cfa_offset 64 stq $9, 0($sp) stq $10, 8($sp) stq $11, 16($sp) @@ -399,7 +395,7 @@ entUnaUser: .cfi_rel_offset $14, 40 .cfi_rel_offset $15, 48 lda $8, 0x3fff - addq $sp, 56, $19 + addq $sp, 64, $19 bic $sp, $8, $8 jsr $26, do_entUnaUser ldq $9, 0($sp) @@ -409,7 +405,7 @@ entUnaUser: ldq $13, 32($sp) ldq $14, 40($sp) ldq $15, 48($sp) - lda $sp, 56($sp) + lda $sp, 64($sp) .cfi_restore $9 .cfi_restore $10 .cfi_restore $11 @@ -417,7 +413,7 @@ entUnaUser: .cfi_restore $13 .cfi_restore $14 .cfi_restore $15 - .cfi_adjust_cfa_offset -56 + .cfi_adjust_cfa_offset -64 br ret_from_sys_call CFI_END_OSF_FRAME entUna diff --git a/arch/alpha/kernel/pci_iommu.c b/arch/alpha/kernel/pci_iommu.c index 681f56089d9c..dc91de50f906 100644 --- a/arch/alpha/kernel/pci_iommu.c +++ b/arch/alpha/kernel/pci_iommu.c @@ -13,6 +13,7 @@ #include <linux/log2.h> #include <linux/dma-map-ops.h> #include <linux/iommu-helper.h> +#include <linux/string_choices.h> #include <asm/io.h> #include <asm/hwrpb.h> @@ -212,7 +213,7 @@ static int pci_dac_dma_supported(struct pci_dev *dev, u64 mask) /* If both conditions above are met, we are fine. */ DBGA("pci_dac_dma_supported %s from %ps\n", - ok ? "yes" : "no", __builtin_return_address(0)); + str_yes_no(ok), __builtin_return_address(0)); return ok; } diff --git a/arch/alpha/kernel/traps.c b/arch/alpha/kernel/traps.c index a9a38c80c4a7..7004397937cf 100644 --- a/arch/alpha/kernel/traps.c +++ b/arch/alpha/kernel/traps.c @@ -649,7 +649,7 @@ s_reg_to_mem (unsigned long s_reg) static int unauser_reg_offsets[32] = { R(r0), R(r1), R(r2), R(r3), R(r4), R(r5), R(r6), R(r7), R(r8), /* r9 ... r15 are stored in front of regs. */ - -56, -48, -40, -32, -24, -16, -8, + -64, -56, -48, -40, -32, -24, -16, /* padding at -8 */ R(r16), R(r17), R(r18), R(r19), R(r20), R(r21), R(r22), R(r23), R(r24), R(r25), R(r26), R(r27), R(r28), R(gp), diff --git a/arch/alpha/mm/fault.c b/arch/alpha/mm/fault.c index 8c9850437e67..a9816bbc9f34 100644 --- a/arch/alpha/mm/fault.c +++ b/arch/alpha/mm/fault.c @@ -78,8 +78,8 @@ __load_new_mm_context(struct mm_struct *next_mm) /* Macro for exception fixup code to access integer registers. */ #define dpf_reg(r) \ - (((unsigned long *)regs)[(r) <= 8 ? (r) : (r) <= 15 ? (r)-16 : \ - (r) <= 18 ? (r)+10 : (r)-10]) + (((unsigned long *)regs)[(r) <= 8 ? (r) : (r) <= 15 ? (r)-17 : \ + (r) <= 18 ? (r)+11 : (r)-10]) asmlinkage void do_page_fault(unsigned long address, unsigned long mmcsr, diff --git a/arch/arm64/Kconfig b/arch/arm64/Kconfig index fcdd0ed3eca8..940343beb3d4 100644 --- a/arch/arm64/Kconfig +++ b/arch/arm64/Kconfig @@ -225,7 +225,6 @@ config ARM64 select HAVE_FUNCTION_ERROR_INJECTION select HAVE_FUNCTION_GRAPH_FREGS select HAVE_FUNCTION_GRAPH_TRACER - select HAVE_FUNCTION_GRAPH_RETVAL select HAVE_GCC_PLUGINS select HAVE_HARDLOCKUP_DETECTOR_PERF if PERF_EVENTS && \ HW_PERF_EVENTS && HAVE_PERF_EVENTS_NMI diff --git a/arch/arm64/Makefile b/arch/arm64/Makefile index 358c68565bfd..2b25d671365f 100644 --- a/arch/arm64/Makefile +++ b/arch/arm64/Makefile @@ -48,7 +48,11 @@ KBUILD_CFLAGS += $(CC_FLAGS_NO_FPU) \ KBUILD_CFLAGS += $(call cc-disable-warning, psabi) KBUILD_AFLAGS += $(compat_vdso) +ifeq ($(call test-ge, $(CONFIG_RUSTC_VERSION), 108500),y) +KBUILD_RUSTFLAGS += --target=aarch64-unknown-none-softfloat +else KBUILD_RUSTFLAGS += --target=aarch64-unknown-none -Ctarget-feature="-neon" +endif KBUILD_CFLAGS += $(call cc-option,-mabi=lp64) KBUILD_AFLAGS += $(call cc-option,-mabi=lp64) diff --git a/arch/arm64/include/asm/kvm_emulate.h b/arch/arm64/include/asm/kvm_emulate.h index 47f2cf408eed..78ec1ef2cfe8 100644 --- a/arch/arm64/include/asm/kvm_emulate.h +++ b/arch/arm64/include/asm/kvm_emulate.h @@ -605,48 +605,6 @@ static __always_inline void kvm_incr_pc(struct kvm_vcpu *vcpu) __cpacr_to_cptr_set(clr, set));\ } while (0) -static __always_inline void kvm_write_cptr_el2(u64 val) -{ - if (has_vhe() || has_hvhe()) - write_sysreg(val, cpacr_el1); - else - write_sysreg(val, cptr_el2); -} - -/* Resets the value of cptr_el2 when returning to the host. */ -static __always_inline void __kvm_reset_cptr_el2(struct kvm *kvm) -{ - u64 val; - - if (has_vhe()) { - val = (CPACR_EL1_FPEN | CPACR_EL1_ZEN_EL1EN); - if (cpus_have_final_cap(ARM64_SME)) - val |= CPACR_EL1_SMEN_EL1EN; - } else if (has_hvhe()) { - val = CPACR_EL1_FPEN; - - if (!kvm_has_sve(kvm) || !guest_owns_fp_regs()) - val |= CPACR_EL1_ZEN; - if (cpus_have_final_cap(ARM64_SME)) - val |= CPACR_EL1_SMEN; - } else { - val = CPTR_NVHE_EL2_RES1; - - if (kvm_has_sve(kvm) && guest_owns_fp_regs()) - val |= CPTR_EL2_TZ; - if (!cpus_have_final_cap(ARM64_SME)) - val |= CPTR_EL2_TSM; - } - - kvm_write_cptr_el2(val); -} - -#ifdef __KVM_NVHE_HYPERVISOR__ -#define kvm_reset_cptr_el2(v) __kvm_reset_cptr_el2(kern_hyp_va((v)->kvm)) -#else -#define kvm_reset_cptr_el2(v) __kvm_reset_cptr_el2((v)->kvm) -#endif - /* * Returns a 'sanitised' view of CPTR_EL2, translating from nVHE to the VHE * format if E2H isn't set. diff --git a/arch/arm64/include/asm/kvm_host.h b/arch/arm64/include/asm/kvm_host.h index 7cfa024de4e3..3a7ec98ef123 100644 --- a/arch/arm64/include/asm/kvm_host.h +++ b/arch/arm64/include/asm/kvm_host.h @@ -100,7 +100,7 @@ static inline void push_hyp_memcache(struct kvm_hyp_memcache *mc, static inline void *pop_hyp_memcache(struct kvm_hyp_memcache *mc, void *(*to_va)(phys_addr_t phys)) { - phys_addr_t *p = to_va(mc->head); + phys_addr_t *p = to_va(mc->head & PAGE_MASK); if (!mc->nr_pages) return NULL; @@ -615,8 +615,6 @@ struct cpu_sve_state { struct kvm_host_data { #define KVM_HOST_DATA_FLAG_HAS_SPE 0 #define KVM_HOST_DATA_FLAG_HAS_TRBE 1 -#define KVM_HOST_DATA_FLAG_HOST_SVE_ENABLED 2 -#define KVM_HOST_DATA_FLAG_HOST_SME_ENABLED 3 #define KVM_HOST_DATA_FLAG_TRBE_ENABLED 4 #define KVM_HOST_DATA_FLAG_EL1_TRACING_CONFIGURED 5 unsigned long flags; @@ -624,23 +622,13 @@ struct kvm_host_data { struct kvm_cpu_context host_ctxt; /* - * All pointers in this union are hyp VA. + * Hyp VA. * sve_state is only used in pKVM and if system_supports_sve(). */ - union { - struct user_fpsimd_state *fpsimd_state; - struct cpu_sve_state *sve_state; - }; - - union { - /* HYP VA pointer to the host storage for FPMR */ - u64 *fpmr_ptr; - /* - * Used by pKVM only, as it needs to provide storage - * for the host - */ - u64 fpmr; - }; + struct cpu_sve_state *sve_state; + + /* Used by pKVM only. */ + u64 fpmr; /* Ownership of the FP regs */ enum { diff --git a/arch/arm64/kernel/cacheinfo.c b/arch/arm64/kernel/cacheinfo.c index d9c9218fa1fd..309942b06c5b 100644 --- a/arch/arm64/kernel/cacheinfo.c +++ b/arch/arm64/kernel/cacheinfo.c @@ -101,16 +101,18 @@ int populate_cache_leaves(unsigned int cpu) unsigned int level, idx; enum cache_type type; struct cpu_cacheinfo *this_cpu_ci = get_cpu_cacheinfo(cpu); - struct cacheinfo *this_leaf = this_cpu_ci->info_list; + struct cacheinfo *infos = this_cpu_ci->info_list; for (idx = 0, level = 1; level <= this_cpu_ci->num_levels && - idx < this_cpu_ci->num_leaves; idx++, level++) { + idx < this_cpu_ci->num_leaves; level++) { type = get_cache_type(level); if (type == CACHE_TYPE_SEPARATE) { - ci_leaf_init(this_leaf++, CACHE_TYPE_DATA, level); - ci_leaf_init(this_leaf++, CACHE_TYPE_INST, level); + if (idx + 1 >= this_cpu_ci->num_leaves) + break; + ci_leaf_init(&infos[idx++], CACHE_TYPE_DATA, level); + ci_leaf_init(&infos[idx++], CACHE_TYPE_INST, level); } else { - ci_leaf_init(this_leaf++, type, level); + ci_leaf_init(&infos[idx++], type, level); } } return 0; diff --git a/arch/arm64/kernel/cpufeature.c b/arch/arm64/kernel/cpufeature.c index 4eb7c6698ae4..d561cf3b8ac7 100644 --- a/arch/arm64/kernel/cpufeature.c +++ b/arch/arm64/kernel/cpufeature.c @@ -3091,6 +3091,7 @@ static const struct arm64_cpu_capabilities arm64_elf_hwcaps[] = { HWCAP_CAP(ID_AA64ISAR0_EL1, TS, FLAGM, CAP_HWCAP, KERNEL_HWCAP_FLAGM), HWCAP_CAP(ID_AA64ISAR0_EL1, TS, FLAGM2, CAP_HWCAP, KERNEL_HWCAP_FLAGM2), HWCAP_CAP(ID_AA64ISAR0_EL1, RNDR, IMP, CAP_HWCAP, KERNEL_HWCAP_RNG), + HWCAP_CAP(ID_AA64ISAR3_EL1, FPRCVT, IMP, CAP_HWCAP, KERNEL_HWCAP_FPRCVT), HWCAP_CAP(ID_AA64PFR0_EL1, FP, IMP, CAP_HWCAP, KERNEL_HWCAP_FP), HWCAP_CAP(ID_AA64PFR0_EL1, FP, FP16, CAP_HWCAP, KERNEL_HWCAP_FPHP), HWCAP_CAP(ID_AA64PFR0_EL1, AdvSIMD, IMP, CAP_HWCAP, KERNEL_HWCAP_ASIMD), @@ -3180,8 +3181,6 @@ static const struct arm64_cpu_capabilities arm64_elf_hwcaps[] = { HWCAP_CAP(ID_AA64SMFR0_EL1, SF8FMA, IMP, CAP_HWCAP, KERNEL_HWCAP_SME_SF8FMA), HWCAP_CAP(ID_AA64SMFR0_EL1, SF8DP4, IMP, CAP_HWCAP, KERNEL_HWCAP_SME_SF8DP4), HWCAP_CAP(ID_AA64SMFR0_EL1, SF8DP2, IMP, CAP_HWCAP, KERNEL_HWCAP_SME_SF8DP2), - HWCAP_CAP(ID_AA64SMFR0_EL1, SF8MM8, IMP, CAP_HWCAP, KERNEL_HWCAP_SME_SF8MM8), - HWCAP_CAP(ID_AA64SMFR0_EL1, SF8MM4, IMP, CAP_HWCAP, KERNEL_HWCAP_SME_SF8MM4), HWCAP_CAP(ID_AA64SMFR0_EL1, SBitPerm, IMP, CAP_HWCAP, KERNEL_HWCAP_SME_SBITPERM), HWCAP_CAP(ID_AA64SMFR0_EL1, AES, IMP, CAP_HWCAP, KERNEL_HWCAP_SME_AES), HWCAP_CAP(ID_AA64SMFR0_EL1, SFEXPA, IMP, CAP_HWCAP, KERNEL_HWCAP_SME_SFEXPA), @@ -3192,6 +3191,8 @@ static const struct arm64_cpu_capabilities arm64_elf_hwcaps[] = { HWCAP_CAP(ID_AA64FPFR0_EL1, F8FMA, IMP, CAP_HWCAP, KERNEL_HWCAP_F8FMA), HWCAP_CAP(ID_AA64FPFR0_EL1, F8DP4, IMP, CAP_HWCAP, KERNEL_HWCAP_F8DP4), HWCAP_CAP(ID_AA64FPFR0_EL1, F8DP2, IMP, CAP_HWCAP, KERNEL_HWCAP_F8DP2), + HWCAP_CAP(ID_AA64FPFR0_EL1, F8MM8, IMP, CAP_HWCAP, KERNEL_HWCAP_F8MM8), + HWCAP_CAP(ID_AA64FPFR0_EL1, F8MM4, IMP, CAP_HWCAP, KERNEL_HWCAP_F8MM4), HWCAP_CAP(ID_AA64FPFR0_EL1, F8E4M3, IMP, CAP_HWCAP, KERNEL_HWCAP_F8E4M3), HWCAP_CAP(ID_AA64FPFR0_EL1, F8E5M2, IMP, CAP_HWCAP, KERNEL_HWCAP_F8E5M2), #ifdef CONFIG_ARM64_POE diff --git a/arch/arm64/kernel/fpsimd.c b/arch/arm64/kernel/fpsimd.c index 2b601d88762d..8370d55f0353 100644 --- a/arch/arm64/kernel/fpsimd.c +++ b/arch/arm64/kernel/fpsimd.c @@ -1695,31 +1695,6 @@ void fpsimd_signal_preserve_current_state(void) } /* - * Called by KVM when entering the guest. - */ -void fpsimd_kvm_prepare(void) -{ - if (!system_supports_sve()) - return; - - /* - * KVM does not save host SVE state since we can only enter - * the guest from a syscall so the ABI means that only the - * non-saved SVE state needs to be saved. If we have left - * SVE enabled for performance reasons then update the task - * state to be FPSIMD only. - */ - get_cpu_fpsimd_context(); - - if (test_and_clear_thread_flag(TIF_SVE)) { - sve_to_fpsimd(current); - current->thread.fp_type = FP_STATE_FPSIMD; - } - - put_cpu_fpsimd_context(); -} - -/* * Associate current's FPSIMD context with this cpu * The caller must have ownership of the cpu FPSIMD context before calling * this function. diff --git a/arch/arm64/kernel/topology.c b/arch/arm64/kernel/topology.c index 1a2c72f3e7f8..cb180684d10d 100644 --- a/arch/arm64/kernel/topology.c +++ b/arch/arm64/kernel/topology.c @@ -194,12 +194,19 @@ static void amu_fie_setup(const struct cpumask *cpus) int cpu; /* We are already set since the last insmod of cpufreq driver */ - if (unlikely(cpumask_subset(cpus, amu_fie_cpus))) + if (cpumask_available(amu_fie_cpus) && + unlikely(cpumask_subset(cpus, amu_fie_cpus))) return; - for_each_cpu(cpu, cpus) { + for_each_cpu(cpu, cpus) if (!freq_counters_valid(cpu)) return; + + if (!cpumask_available(amu_fie_cpus) && + !zalloc_cpumask_var(&amu_fie_cpus, GFP_KERNEL)) { + WARN_ONCE(1, "Failed to allocate FIE cpumask for CPUs[%*pbl]\n", + cpumask_pr_args(cpus)); + return; } cpumask_or(amu_fie_cpus, amu_fie_cpus, cpus); @@ -237,17 +244,8 @@ static struct notifier_block init_amu_fie_notifier = { static int __init init_amu_fie(void) { - int ret; - - if (!zalloc_cpumask_var(&amu_fie_cpus, GFP_KERNEL)) - return -ENOMEM; - - ret = cpufreq_register_notifier(&init_amu_fie_notifier, + return cpufreq_register_notifier(&init_amu_fie_notifier, CPUFREQ_POLICY_NOTIFIER); - if (ret) - free_cpumask_var(amu_fie_cpus); - - return ret; } core_initcall(init_amu_fie); diff --git a/arch/arm64/kernel/vdso/vdso.lds.S b/arch/arm64/kernel/vdso/vdso.lds.S index 4ec32e86a8da..47ad6944f9f0 100644 --- a/arch/arm64/kernel/vdso/vdso.lds.S +++ b/arch/arm64/kernel/vdso/vdso.lds.S @@ -41,6 +41,7 @@ SECTIONS */ /DISCARD/ : { *(.note.GNU-stack .note.gnu.property) + *(.ARM.attributes) } .note : { *(.note.*) } :text :note diff --git a/arch/arm64/kernel/vmlinux.lds.S b/arch/arm64/kernel/vmlinux.lds.S index f84c71f04d9e..e73326bd3ff7 100644 --- a/arch/arm64/kernel/vmlinux.lds.S +++ b/arch/arm64/kernel/vmlinux.lds.S @@ -162,6 +162,7 @@ SECTIONS /DISCARD/ : { *(.interp .dynamic) *(.dynsym .dynstr .hash .gnu.hash) + *(.ARM.attributes) } . = KIMAGE_VADDR; diff --git a/arch/arm64/kvm/arch_timer.c b/arch/arm64/kvm/arch_timer.c index 231c0cd9c7b4..70802e4c91cf 100644 --- a/arch/arm64/kvm/arch_timer.c +++ b/arch/arm64/kvm/arch_timer.c @@ -447,21 +447,19 @@ static void kvm_timer_update_status(struct arch_timer_context *ctx, bool level) static void kvm_timer_update_irq(struct kvm_vcpu *vcpu, bool new_level, struct arch_timer_context *timer_ctx) { - int ret; - kvm_timer_update_status(timer_ctx, new_level); timer_ctx->irq.level = new_level; trace_kvm_timer_update_irq(vcpu->vcpu_id, timer_irq(timer_ctx), timer_ctx->irq.level); - if (!userspace_irqchip(vcpu->kvm)) { - ret = kvm_vgic_inject_irq(vcpu->kvm, vcpu, - timer_irq(timer_ctx), - timer_ctx->irq.level, - timer_ctx); - WARN_ON(ret); - } + if (userspace_irqchip(vcpu->kvm)) + return; + + kvm_vgic_inject_irq(vcpu->kvm, vcpu, + timer_irq(timer_ctx), + timer_ctx->irq.level, + timer_ctx); } /* Only called for a fully emulated timer */ diff --git a/arch/arm64/kvm/arm.c b/arch/arm64/kvm/arm.c index 071a7d75be68..b8e55a441282 100644 --- a/arch/arm64/kvm/arm.c +++ b/arch/arm64/kvm/arm.c @@ -2481,14 +2481,6 @@ static void finalize_init_hyp_mode(void) per_cpu_ptr_nvhe_sym(kvm_host_data, cpu)->sve_state = kern_hyp_va(sve_state); } - } else { - for_each_possible_cpu(cpu) { - struct user_fpsimd_state *fpsimd_state; - - fpsimd_state = &per_cpu_ptr_nvhe_sym(kvm_host_data, cpu)->host_ctxt.fp_regs; - per_cpu_ptr_nvhe_sym(kvm_host_data, cpu)->fpsimd_state = - kern_hyp_va(fpsimd_state); - } } } diff --git a/arch/arm64/kvm/fpsimd.c b/arch/arm64/kvm/fpsimd.c index 4d3d1a2eb157..7f6e43d25691 100644 --- a/arch/arm64/kvm/fpsimd.c +++ b/arch/arm64/kvm/fpsimd.c @@ -54,50 +54,18 @@ void kvm_arch_vcpu_load_fp(struct kvm_vcpu *vcpu) if (!system_supports_fpsimd()) return; - fpsimd_kvm_prepare(); - /* - * We will check TIF_FOREIGN_FPSTATE just before entering the - * guest in kvm_arch_vcpu_ctxflush_fp() and override this to - * FP_STATE_FREE if the flag set. + * Ensure that any host FPSIMD/SVE/SME state is saved and unbound such + * that the host kernel is responsible for restoring this state upon + * return to userspace, and the hyp code doesn't need to save anything. + * + * When the host may use SME, fpsimd_save_and_flush_cpu_state() ensures + * that PSTATE.{SM,ZA} == {0,0}. */ - *host_data_ptr(fp_owner) = FP_STATE_HOST_OWNED; - *host_data_ptr(fpsimd_state) = kern_hyp_va(¤t->thread.uw.fpsimd_state); - *host_data_ptr(fpmr_ptr) = kern_hyp_va(¤t->thread.uw.fpmr); - - host_data_clear_flag(HOST_SVE_ENABLED); - if (read_sysreg(cpacr_el1) & CPACR_EL1_ZEN_EL0EN) - host_data_set_flag(HOST_SVE_ENABLED); - - if (system_supports_sme()) { - host_data_clear_flag(HOST_SME_ENABLED); - if (read_sysreg(cpacr_el1) & CPACR_EL1_SMEN_EL0EN) - host_data_set_flag(HOST_SME_ENABLED); - - /* - * If PSTATE.SM is enabled then save any pending FP - * state and disable PSTATE.SM. If we leave PSTATE.SM - * enabled and the guest does not enable SME via - * CPACR_EL1.SMEN then operations that should be valid - * may generate SME traps from EL1 to EL1 which we - * can't intercept and which would confuse the guest. - * - * Do the same for PSTATE.ZA in the case where there - * is state in the registers which has not already - * been saved, this is very unlikely to happen. - */ - if (read_sysreg_s(SYS_SVCR) & (SVCR_SM_MASK | SVCR_ZA_MASK)) { - *host_data_ptr(fp_owner) = FP_STATE_FREE; - fpsimd_save_and_flush_cpu_state(); - } - } + fpsimd_save_and_flush_cpu_state(); + *host_data_ptr(fp_owner) = FP_STATE_FREE; - /* - * If normal guests gain SME support, maintain this behavior for pKVM - * guests, which don't support SME. - */ - WARN_ON(is_protected_kvm_enabled() && system_supports_sme() && - read_sysreg_s(SYS_SVCR)); + WARN_ON_ONCE(system_supports_sme() && read_sysreg_s(SYS_SVCR)); } /* @@ -162,52 +130,7 @@ void kvm_arch_vcpu_put_fp(struct kvm_vcpu *vcpu) local_irq_save(flags); - /* - * If we have VHE then the Hyp code will reset CPACR_EL1 to - * the default value and we need to reenable SME. - */ - if (has_vhe() && system_supports_sme()) { - /* Also restore EL0 state seen on entry */ - if (host_data_test_flag(HOST_SME_ENABLED)) - sysreg_clear_set(CPACR_EL1, 0, CPACR_EL1_SMEN); - else - sysreg_clear_set(CPACR_EL1, - CPACR_EL1_SMEN_EL0EN, - CPACR_EL1_SMEN_EL1EN); - isb(); - } - if (guest_owns_fp_regs()) { - if (vcpu_has_sve(vcpu)) { - u64 zcr = read_sysreg_el1(SYS_ZCR); - - /* - * If the vCPU is in the hyp context then ZCR_EL1 is - * loaded with its vEL2 counterpart. - */ - __vcpu_sys_reg(vcpu, vcpu_sve_zcr_elx(vcpu)) = zcr; - - /* - * Restore the VL that was saved when bound to the CPU, - * which is the maximum VL for the guest. Because the - * layout of the data when saving the sve state depends - * on the VL, we need to use a consistent (i.e., the - * maximum) VL. - * Note that this means that at guest exit ZCR_EL1 is - * not necessarily the same as on guest entry. - * - * ZCR_EL2 holds the guest hypervisor's VL when running - * a nested guest, which could be smaller than the - * max for the vCPU. Similar to above, we first need to - * switch to a VL consistent with the layout of the - * vCPU's SVE state. KVM support for NV implies VHE, so - * using the ZCR_EL1 alias is safe. - */ - if (!has_vhe() || (vcpu_has_nv(vcpu) && !is_hyp_ctxt(vcpu))) - sve_cond_update_zcr_vq(vcpu_sve_max_vq(vcpu) - 1, - SYS_ZCR_EL1); - } - /* * Flush (save and invalidate) the fpsimd/sve state so that if * the host tries to use fpsimd/sve, it's not using stale data @@ -219,18 +142,6 @@ void kvm_arch_vcpu_put_fp(struct kvm_vcpu *vcpu) * when needed. */ fpsimd_save_and_flush_cpu_state(); - } else if (has_vhe() && system_supports_sve()) { - /* - * The FPSIMD/SVE state in the CPU has not been touched, and we - * have SVE (and VHE): CPACR_EL1 (alias CPTR_EL2) has been - * reset by kvm_reset_cptr_el2() in the Hyp code, disabling SVE - * for EL0. To avoid spurious traps, restore the trap state - * seen by kvm_arch_vcpu_load_fp(): - */ - if (host_data_test_flag(HOST_SVE_ENABLED)) - sysreg_clear_set(CPACR_EL1, 0, CPACR_EL1_ZEN_EL0EN); - else - sysreg_clear_set(CPACR_EL1, CPACR_EL1_ZEN_EL0EN, 0); } local_irq_restore(flags); diff --git a/arch/arm64/kvm/hyp/entry.S b/arch/arm64/kvm/hyp/entry.S index 4433a234aa9b..9f4e8d68ab50 100644 --- a/arch/arm64/kvm/hyp/entry.S +++ b/arch/arm64/kvm/hyp/entry.S @@ -44,6 +44,11 @@ alternative_if ARM64_HAS_RAS_EXTN alternative_else_nop_endif mrs x1, isr_el1 cbz x1, 1f + + // Ensure that __guest_enter() always provides a context + // synchronization event so that callers don't need ISBs for anything + // that would usually be synchonized by the ERET. + isb mov x0, #ARM_EXCEPTION_IRQ ret diff --git a/arch/arm64/kvm/hyp/include/hyp/switch.h b/arch/arm64/kvm/hyp/include/hyp/switch.h index f838a45665f2..23bbe28eaaf9 100644 --- a/arch/arm64/kvm/hyp/include/hyp/switch.h +++ b/arch/arm64/kvm/hyp/include/hyp/switch.h @@ -326,7 +326,7 @@ static inline bool __populate_fault_info(struct kvm_vcpu *vcpu) return __get_fault_info(vcpu->arch.fault.esr_el2, &vcpu->arch.fault); } -static bool kvm_hyp_handle_mops(struct kvm_vcpu *vcpu, u64 *exit_code) +static inline bool kvm_hyp_handle_mops(struct kvm_vcpu *vcpu, u64 *exit_code) { *vcpu_pc(vcpu) = read_sysreg_el2(SYS_ELR); arm64_mops_reset_regs(vcpu_gp_regs(vcpu), vcpu->arch.fault.esr_el2); @@ -375,7 +375,87 @@ static inline void __hyp_sve_save_host(void) true); } -static void kvm_hyp_save_fpsimd_host(struct kvm_vcpu *vcpu); +static inline void fpsimd_lazy_switch_to_guest(struct kvm_vcpu *vcpu) +{ + u64 zcr_el1, zcr_el2; + + if (!guest_owns_fp_regs()) + return; + + if (vcpu_has_sve(vcpu)) { + /* A guest hypervisor may restrict the effective max VL. */ + if (vcpu_has_nv(vcpu) && !is_hyp_ctxt(vcpu)) + zcr_el2 = __vcpu_sys_reg(vcpu, ZCR_EL2); + else + zcr_el2 = vcpu_sve_max_vq(vcpu) - 1; + + write_sysreg_el2(zcr_el2, SYS_ZCR); + + zcr_el1 = __vcpu_sys_reg(vcpu, vcpu_sve_zcr_elx(vcpu)); + write_sysreg_el1(zcr_el1, SYS_ZCR); + } +} + +static inline void fpsimd_lazy_switch_to_host(struct kvm_vcpu *vcpu) +{ + u64 zcr_el1, zcr_el2; + + if (!guest_owns_fp_regs()) + return; + + /* + * When the guest owns the FP regs, we know that guest+hyp traps for + * any FPSIMD/SVE/SME features exposed to the guest have been disabled + * by either fpsimd_lazy_switch_to_guest() or kvm_hyp_handle_fpsimd() + * prior to __guest_entry(). As __guest_entry() guarantees a context + * synchronization event, we don't need an ISB here to avoid taking + * traps for anything that was exposed to the guest. + */ + if (vcpu_has_sve(vcpu)) { + zcr_el1 = read_sysreg_el1(SYS_ZCR); + __vcpu_sys_reg(vcpu, vcpu_sve_zcr_elx(vcpu)) = zcr_el1; + + /* + * The guest's state is always saved using the guest's max VL. + * Ensure that the host has the guest's max VL active such that + * the host can save the guest's state lazily, but don't + * artificially restrict the host to the guest's max VL. + */ + if (has_vhe()) { + zcr_el2 = vcpu_sve_max_vq(vcpu) - 1; + write_sysreg_el2(zcr_el2, SYS_ZCR); + } else { + zcr_el2 = sve_vq_from_vl(kvm_host_sve_max_vl) - 1; + write_sysreg_el2(zcr_el2, SYS_ZCR); + + zcr_el1 = vcpu_sve_max_vq(vcpu) - 1; + write_sysreg_el1(zcr_el1, SYS_ZCR); + } + } +} + +static void kvm_hyp_save_fpsimd_host(struct kvm_vcpu *vcpu) +{ + /* + * Non-protected kvm relies on the host restoring its sve state. + * Protected kvm restores the host's sve state as not to reveal that + * fpsimd was used by a guest nor leak upper sve bits. + */ + if (system_supports_sve()) { + __hyp_sve_save_host(); + + /* Re-enable SVE traps if not supported for the guest vcpu. */ + if (!vcpu_has_sve(vcpu)) + cpacr_clear_set(CPACR_EL1_ZEN, 0); + + } else { + __fpsimd_save_state(host_data_ptr(host_ctxt.fp_regs)); + } + + if (kvm_has_fpmr(kern_hyp_va(vcpu->kvm))) + *host_data_ptr(fpmr) = read_sysreg_s(SYS_FPMR); +} + /* * We trap the first access to the FP/SIMD to save the host context and @@ -383,7 +463,7 @@ static void kvm_hyp_save_fpsimd_host(struct kvm_vcpu *vcpu); * If FP/SIMD is not implemented, handle the trap and inject an undefined * instruction exception to the guest. Similarly for trapped SVE accesses. */ -static bool kvm_hyp_handle_fpsimd(struct kvm_vcpu *vcpu, u64 *exit_code) +static inline bool kvm_hyp_handle_fpsimd(struct kvm_vcpu *vcpu, u64 *exit_code) { bool sve_guest; u8 esr_ec; @@ -425,7 +505,7 @@ static bool kvm_hyp_handle_fpsimd(struct kvm_vcpu *vcpu, u64 *exit_code) isb(); /* Write out the host state if it's in the registers */ - if (host_owns_fp_regs()) + if (is_protected_kvm_enabled() && host_owns_fp_regs()) kvm_hyp_save_fpsimd_host(vcpu); /* Restore the guest state */ @@ -501,9 +581,22 @@ static inline bool handle_tx2_tvm(struct kvm_vcpu *vcpu) return true; } +/* Open-coded version of timer_get_offset() to allow for kern_hyp_va() */ +static inline u64 hyp_timer_get_offset(struct arch_timer_context *ctxt) +{ + u64 offset = 0; + + if (ctxt->offset.vm_offset) + offset += *kern_hyp_va(ctxt->offset.vm_offset); + if (ctxt->offset.vcpu_offset) + offset += *kern_hyp_va(ctxt->offset.vcpu_offset); + + return offset; +} + static inline u64 compute_counter_value(struct arch_timer_context *ctxt) { - return arch_timer_read_cntpct_el0() - timer_get_offset(ctxt); + return arch_timer_read_cntpct_el0() - hyp_timer_get_offset(ctxt); } static bool kvm_handle_cntxct(struct kvm_vcpu *vcpu) @@ -587,7 +680,7 @@ static bool handle_ampere1_tcr(struct kvm_vcpu *vcpu) return true; } -static bool kvm_hyp_handle_sysreg(struct kvm_vcpu *vcpu, u64 *exit_code) +static inline bool kvm_hyp_handle_sysreg(struct kvm_vcpu *vcpu, u64 *exit_code) { if (cpus_have_final_cap(ARM64_WORKAROUND_CAVIUM_TX2_219_TVM) && handle_tx2_tvm(vcpu)) @@ -607,7 +700,7 @@ static bool kvm_hyp_handle_sysreg(struct kvm_vcpu *vcpu, u64 *exit_code) return false; } -static bool kvm_hyp_handle_cp15_32(struct kvm_vcpu *vcpu, u64 *exit_code) +static inline bool kvm_hyp_handle_cp15_32(struct kvm_vcpu *vcpu, u64 *exit_code) { if (static_branch_unlikely(&vgic_v3_cpuif_trap) && __vgic_v3_perform_cpuif_access(vcpu) == 1) @@ -616,19 +709,18 @@ static bool kvm_hyp_handle_cp15_32(struct kvm_vcpu *vcpu, u64 *exit_code) return false; } -static bool kvm_hyp_handle_memory_fault(struct kvm_vcpu *vcpu, u64 *exit_code) +static inline bool kvm_hyp_handle_memory_fault(struct kvm_vcpu *vcpu, + u64 *exit_code) { if (!__populate_fault_info(vcpu)) return true; return false; } -static bool kvm_hyp_handle_iabt_low(struct kvm_vcpu *vcpu, u64 *exit_code) - __alias(kvm_hyp_handle_memory_fault); -static bool kvm_hyp_handle_watchpt_low(struct kvm_vcpu *vcpu, u64 *exit_code) - __alias(kvm_hyp_handle_memory_fault); +#define kvm_hyp_handle_iabt_low kvm_hyp_handle_memory_fault +#define kvm_hyp_handle_watchpt_low kvm_hyp_handle_memory_fault -static bool kvm_hyp_handle_dabt_low(struct kvm_vcpu *vcpu, u64 *exit_code) +static inline bool kvm_hyp_handle_dabt_low(struct kvm_vcpu *vcpu, u64 *exit_code) { if (kvm_hyp_handle_memory_fault(vcpu, exit_code)) return true; @@ -658,23 +750,16 @@ static bool kvm_hyp_handle_dabt_low(struct kvm_vcpu *vcpu, u64 *exit_code) typedef bool (*exit_handler_fn)(struct kvm_vcpu *, u64 *); -static const exit_handler_fn *kvm_get_exit_handler_array(struct kvm_vcpu *vcpu); - -static void early_exit_filter(struct kvm_vcpu *vcpu, u64 *exit_code); - /* * Allow the hypervisor to handle the exit with an exit handler if it has one. * * Returns true if the hypervisor handled the exit, and control should go back * to the guest, or false if it hasn't. */ -static inline bool kvm_hyp_handle_exit(struct kvm_vcpu *vcpu, u64 *exit_code) +static inline bool kvm_hyp_handle_exit(struct kvm_vcpu *vcpu, u64 *exit_code, + const exit_handler_fn *handlers) { - const exit_handler_fn *handlers = kvm_get_exit_handler_array(vcpu); - exit_handler_fn fn; - - fn = handlers[kvm_vcpu_trap_get_class(vcpu)]; - + exit_handler_fn fn = handlers[kvm_vcpu_trap_get_class(vcpu)]; if (fn) return fn(vcpu, exit_code); @@ -704,20 +789,9 @@ static inline void synchronize_vcpu_pstate(struct kvm_vcpu *vcpu, u64 *exit_code * the guest, false when we should restore the host state and return to the * main run loop. */ -static inline bool fixup_guest_exit(struct kvm_vcpu *vcpu, u64 *exit_code) +static inline bool __fixup_guest_exit(struct kvm_vcpu *vcpu, u64 *exit_code, + const exit_handler_fn *handlers) { - /* - * Save PSTATE early so that we can evaluate the vcpu mode - * early on. - */ - synchronize_vcpu_pstate(vcpu, exit_code); - - /* - * Check whether we want to repaint the state one way or - * another. - */ - early_exit_filter(vcpu, exit_code); - if (ARM_EXCEPTION_CODE(*exit_code) != ARM_EXCEPTION_IRQ) vcpu->arch.fault.esr_el2 = read_sysreg_el2(SYS_ESR); @@ -747,7 +821,7 @@ static inline bool fixup_guest_exit(struct kvm_vcpu *vcpu, u64 *exit_code) goto exit; /* Check if there's an exit handler and allow it to handle the exit. */ - if (kvm_hyp_handle_exit(vcpu, exit_code)) + if (kvm_hyp_handle_exit(vcpu, exit_code, handlers)) goto guest; exit: /* Return to the host kernel and handle the exit */ diff --git a/arch/arm64/kvm/hyp/nvhe/hyp-main.c b/arch/arm64/kvm/hyp/nvhe/hyp-main.c index 6e12c070832f..2c37680d954c 100644 --- a/arch/arm64/kvm/hyp/nvhe/hyp-main.c +++ b/arch/arm64/kvm/hyp/nvhe/hyp-main.c @@ -5,6 +5,7 @@ */ #include <hyp/adjust_pc.h> +#include <hyp/switch.h> #include <asm/pgtable-types.h> #include <asm/kvm_asm.h> @@ -83,7 +84,7 @@ static void fpsimd_sve_sync(struct kvm_vcpu *vcpu) if (system_supports_sve()) __hyp_sve_restore_host(); else - __fpsimd_restore_state(*host_data_ptr(fpsimd_state)); + __fpsimd_restore_state(host_data_ptr(host_ctxt.fp_regs)); if (has_fpmr) write_sysreg_s(*host_data_ptr(fpmr), SYS_FPMR); @@ -224,8 +225,12 @@ static void handle___kvm_vcpu_run(struct kvm_cpu_context *host_ctxt) sync_hyp_vcpu(hyp_vcpu); } else { + struct kvm_vcpu *vcpu = kern_hyp_va(host_vcpu); + /* The host is fully trusted, run its vCPU directly. */ - ret = __kvm_vcpu_run(kern_hyp_va(host_vcpu)); + fpsimd_lazy_switch_to_guest(vcpu); + ret = __kvm_vcpu_run(vcpu); + fpsimd_lazy_switch_to_host(vcpu); } out: cpu_reg(host_ctxt, 1) = ret; @@ -675,12 +680,6 @@ void handle_trap(struct kvm_cpu_context *host_ctxt) case ESR_ELx_EC_SMC64: handle_host_smc(host_ctxt); break; - case ESR_ELx_EC_SVE: - cpacr_clear_set(0, CPACR_EL1_ZEN); - isb(); - sve_cond_update_zcr_vq(sve_vq_from_vl(kvm_host_sve_max_vl) - 1, - SYS_ZCR_EL2); - break; case ESR_ELx_EC_IABT_LOW: case ESR_ELx_EC_DABT_LOW: handle_host_mem_abort(host_ctxt); diff --git a/arch/arm64/kvm/hyp/nvhe/mem_protect.c b/arch/arm64/kvm/hyp/nvhe/mem_protect.c index 7ad7b133b81a..19c3c631708c 100644 --- a/arch/arm64/kvm/hyp/nvhe/mem_protect.c +++ b/arch/arm64/kvm/hyp/nvhe/mem_protect.c @@ -943,10 +943,10 @@ static int __check_host_shared_guest(struct pkvm_hyp_vm *vm, u64 *__phys, u64 ip ret = kvm_pgtable_get_leaf(&vm->pgt, ipa, &pte, &level); if (ret) return ret; - if (level != KVM_PGTABLE_LAST_LEVEL) - return -E2BIG; if (!kvm_pte_valid(pte)) return -ENOENT; + if (level != KVM_PGTABLE_LAST_LEVEL) + return -E2BIG; state = guest_get_page_state(pte, ipa); if (state != PKVM_PAGE_SHARED_BORROWED) @@ -998,63 +998,73 @@ unlock: return ret; } -int __pkvm_host_relax_perms_guest(u64 gfn, struct pkvm_hyp_vcpu *vcpu, enum kvm_pgtable_prot prot) +static void assert_host_shared_guest(struct pkvm_hyp_vm *vm, u64 ipa) { - struct pkvm_hyp_vm *vm = pkvm_hyp_vcpu_to_hyp_vm(vcpu); - u64 ipa = hyp_pfn_to_phys(gfn); u64 phys; int ret; - if (prot & ~KVM_PGTABLE_PROT_RWX) - return -EINVAL; + if (!IS_ENABLED(CONFIG_NVHE_EL2_DEBUG)) + return; host_lock_component(); guest_lock_component(vm); ret = __check_host_shared_guest(vm, &phys, ipa); - if (!ret) - ret = kvm_pgtable_stage2_relax_perms(&vm->pgt, ipa, prot, 0); guest_unlock_component(vm); host_unlock_component(); - return ret; + WARN_ON(ret && ret != -ENOENT); } -int __pkvm_host_wrprotect_guest(u64 gfn, struct pkvm_hyp_vm *vm) +int __pkvm_host_relax_perms_guest(u64 gfn, struct pkvm_hyp_vcpu *vcpu, enum kvm_pgtable_prot prot) { + struct pkvm_hyp_vm *vm = pkvm_hyp_vcpu_to_hyp_vm(vcpu); u64 ipa = hyp_pfn_to_phys(gfn); - u64 phys; int ret; - host_lock_component(); - guest_lock_component(vm); + if (pkvm_hyp_vm_is_protected(vm)) + return -EPERM; - ret = __check_host_shared_guest(vm, &phys, ipa); - if (!ret) - ret = kvm_pgtable_stage2_wrprotect(&vm->pgt, ipa, PAGE_SIZE); + if (prot & ~KVM_PGTABLE_PROT_RWX) + return -EINVAL; + assert_host_shared_guest(vm, ipa); + guest_lock_component(vm); + ret = kvm_pgtable_stage2_relax_perms(&vm->pgt, ipa, prot, 0); guest_unlock_component(vm); - host_unlock_component(); return ret; } -int __pkvm_host_test_clear_young_guest(u64 gfn, bool mkold, struct pkvm_hyp_vm *vm) +int __pkvm_host_wrprotect_guest(u64 gfn, struct pkvm_hyp_vm *vm) { u64 ipa = hyp_pfn_to_phys(gfn); - u64 phys; int ret; - host_lock_component(); + if (pkvm_hyp_vm_is_protected(vm)) + return -EPERM; + + assert_host_shared_guest(vm, ipa); guest_lock_component(vm); + ret = kvm_pgtable_stage2_wrprotect(&vm->pgt, ipa, PAGE_SIZE); + guest_unlock_component(vm); - ret = __check_host_shared_guest(vm, &phys, ipa); - if (!ret) - ret = kvm_pgtable_stage2_test_clear_young(&vm->pgt, ipa, PAGE_SIZE, mkold); + return ret; +} + +int __pkvm_host_test_clear_young_guest(u64 gfn, bool mkold, struct pkvm_hyp_vm *vm) +{ + u64 ipa = hyp_pfn_to_phys(gfn); + int ret; + + if (pkvm_hyp_vm_is_protected(vm)) + return -EPERM; + assert_host_shared_guest(vm, ipa); + guest_lock_component(vm); + ret = kvm_pgtable_stage2_test_clear_young(&vm->pgt, ipa, PAGE_SIZE, mkold); guest_unlock_component(vm); - host_unlock_component(); return ret; } @@ -1063,18 +1073,14 @@ int __pkvm_host_mkyoung_guest(u64 gfn, struct pkvm_hyp_vcpu *vcpu) { struct pkvm_hyp_vm *vm = pkvm_hyp_vcpu_to_hyp_vm(vcpu); u64 ipa = hyp_pfn_to_phys(gfn); - u64 phys; - int ret; - - host_lock_component(); - guest_lock_component(vm); - ret = __check_host_shared_guest(vm, &phys, ipa); - if (!ret) - kvm_pgtable_stage2_mkyoung(&vm->pgt, ipa, 0); + if (pkvm_hyp_vm_is_protected(vm)) + return -EPERM; + assert_host_shared_guest(vm, ipa); + guest_lock_component(vm); + kvm_pgtable_stage2_mkyoung(&vm->pgt, ipa, 0); guest_unlock_component(vm); - host_unlock_component(); - return ret; + return 0; } diff --git a/arch/arm64/kvm/hyp/nvhe/switch.c b/arch/arm64/kvm/hyp/nvhe/switch.c index 6c846d033d24..7d2ba6ef0261 100644 --- a/arch/arm64/kvm/hyp/nvhe/switch.c +++ b/arch/arm64/kvm/hyp/nvhe/switch.c @@ -39,6 +39,9 @@ static void __activate_cptr_traps(struct kvm_vcpu *vcpu) { u64 val = CPTR_EL2_TAM; /* Same bit irrespective of E2H */ + if (!guest_owns_fp_regs()) + __activate_traps_fpsimd32(vcpu); + if (has_hvhe()) { val |= CPACR_EL1_TTA; @@ -47,6 +50,8 @@ static void __activate_cptr_traps(struct kvm_vcpu *vcpu) if (vcpu_has_sve(vcpu)) val |= CPACR_EL1_ZEN; } + + write_sysreg(val, cpacr_el1); } else { val |= CPTR_EL2_TTA | CPTR_NVHE_EL2_RES1; @@ -61,12 +66,32 @@ static void __activate_cptr_traps(struct kvm_vcpu *vcpu) if (!guest_owns_fp_regs()) val |= CPTR_EL2_TFP; + + write_sysreg(val, cptr_el2); } +} - if (!guest_owns_fp_regs()) - __activate_traps_fpsimd32(vcpu); +static void __deactivate_cptr_traps(struct kvm_vcpu *vcpu) +{ + if (has_hvhe()) { + u64 val = CPACR_EL1_FPEN; + + if (cpus_have_final_cap(ARM64_SVE)) + val |= CPACR_EL1_ZEN; + if (cpus_have_final_cap(ARM64_SME)) + val |= CPACR_EL1_SMEN; - kvm_write_cptr_el2(val); + write_sysreg(val, cpacr_el1); + } else { + u64 val = CPTR_NVHE_EL2_RES1; + + if (!cpus_have_final_cap(ARM64_SVE)) + val |= CPTR_EL2_TZ; + if (!cpus_have_final_cap(ARM64_SME)) + val |= CPTR_EL2_TSM; + + write_sysreg(val, cptr_el2); + } } static void __activate_traps(struct kvm_vcpu *vcpu) @@ -119,7 +144,7 @@ static void __deactivate_traps(struct kvm_vcpu *vcpu) write_sysreg(this_cpu_ptr(&kvm_init_params)->hcr_el2, hcr_el2); - kvm_reset_cptr_el2(vcpu); + __deactivate_cptr_traps(vcpu); write_sysreg(__kvm_hyp_host_vector, vbar_el2); } @@ -192,34 +217,6 @@ static bool kvm_handle_pvm_sys64(struct kvm_vcpu *vcpu, u64 *exit_code) kvm_handle_pvm_sysreg(vcpu, exit_code)); } -static void kvm_hyp_save_fpsimd_host(struct kvm_vcpu *vcpu) -{ - /* - * Non-protected kvm relies on the host restoring its sve state. - * Protected kvm restores the host's sve state as not to reveal that - * fpsimd was used by a guest nor leak upper sve bits. - */ - if (unlikely(is_protected_kvm_enabled() && system_supports_sve())) { - __hyp_sve_save_host(); - - /* Re-enable SVE traps if not supported for the guest vcpu. */ - if (!vcpu_has_sve(vcpu)) - cpacr_clear_set(CPACR_EL1_ZEN, 0); - - } else { - __fpsimd_save_state(*host_data_ptr(fpsimd_state)); - } - - if (kvm_has_fpmr(kern_hyp_va(vcpu->kvm))) { - u64 val = read_sysreg_s(SYS_FPMR); - - if (unlikely(is_protected_kvm_enabled())) - *host_data_ptr(fpmr) = val; - else - **host_data_ptr(fpmr_ptr) = val; - } -} - static const exit_handler_fn hyp_exit_handlers[] = { [0 ... ESR_ELx_EC_MAX] = NULL, [ESR_ELx_EC_CP15_32] = kvm_hyp_handle_cp15_32, @@ -251,19 +248,21 @@ static const exit_handler_fn *kvm_get_exit_handler_array(struct kvm_vcpu *vcpu) return hyp_exit_handlers; } -/* - * Some guests (e.g., protected VMs) are not be allowed to run in AArch32. - * The ARMv8 architecture does not give the hypervisor a mechanism to prevent a - * guest from dropping to AArch32 EL0 if implemented by the CPU. If the - * hypervisor spots a guest in such a state ensure it is handled, and don't - * trust the host to spot or fix it. The check below is based on the one in - * kvm_arch_vcpu_ioctl_run(). - * - * Returns false if the guest ran in AArch32 when it shouldn't have, and - * thus should exit to the host, or true if a the guest run loop can continue. - */ -static void early_exit_filter(struct kvm_vcpu *vcpu, u64 *exit_code) +static inline bool fixup_guest_exit(struct kvm_vcpu *vcpu, u64 *exit_code) { + const exit_handler_fn *handlers = kvm_get_exit_handler_array(vcpu); + + synchronize_vcpu_pstate(vcpu, exit_code); + + /* + * Some guests (e.g., protected VMs) are not be allowed to run in + * AArch32. The ARMv8 architecture does not give the hypervisor a + * mechanism to prevent a guest from dropping to AArch32 EL0 if + * implemented by the CPU. If the hypervisor spots a guest in such a + * state ensure it is handled, and don't trust the host to spot or fix + * it. The check below is based on the one in + * kvm_arch_vcpu_ioctl_run(). + */ if (unlikely(vcpu_is_protected(vcpu) && vcpu_mode_is_32bit(vcpu))) { /* * As we have caught the guest red-handed, decide that it isn't @@ -276,6 +275,8 @@ static void early_exit_filter(struct kvm_vcpu *vcpu, u64 *exit_code) *exit_code &= BIT(ARM_EXIT_WITH_SERROR_BIT); *exit_code |= ARM_EXCEPTION_IL; } + + return __fixup_guest_exit(vcpu, exit_code, handlers); } /* Switch to the guest for legacy non-VHE systems */ diff --git a/arch/arm64/kvm/hyp/vhe/switch.c b/arch/arm64/kvm/hyp/vhe/switch.c index b5b9dbaf1fdd..647737d6e8d0 100644 --- a/arch/arm64/kvm/hyp/vhe/switch.c +++ b/arch/arm64/kvm/hyp/vhe/switch.c @@ -136,6 +136,16 @@ write: write_sysreg(val, cpacr_el1); } +static void __deactivate_cptr_traps(struct kvm_vcpu *vcpu) +{ + u64 val = CPACR_EL1_FPEN | CPACR_EL1_ZEN_EL1EN; + + if (cpus_have_final_cap(ARM64_SME)) + val |= CPACR_EL1_SMEN_EL1EN; + + write_sysreg(val, cpacr_el1); +} + static void __activate_traps(struct kvm_vcpu *vcpu) { u64 val; @@ -207,7 +217,7 @@ static void __deactivate_traps(struct kvm_vcpu *vcpu) */ asm(ALTERNATIVE("nop", "isb", ARM64_WORKAROUND_SPECULATIVE_AT)); - kvm_reset_cptr_el2(vcpu); + __deactivate_cptr_traps(vcpu); if (!arm64_kernel_unmapped_at_el0()) host_vectors = __this_cpu_read(this_cpu_vector); @@ -413,14 +423,6 @@ static bool kvm_hyp_handle_eret(struct kvm_vcpu *vcpu, u64 *exit_code) return true; } -static void kvm_hyp_save_fpsimd_host(struct kvm_vcpu *vcpu) -{ - __fpsimd_save_state(*host_data_ptr(fpsimd_state)); - - if (kvm_has_fpmr(vcpu->kvm)) - **host_data_ptr(fpmr_ptr) = read_sysreg_s(SYS_FPMR); -} - static bool kvm_hyp_handle_tlbi_el2(struct kvm_vcpu *vcpu, u64 *exit_code) { int ret = -EINVAL; @@ -538,13 +540,10 @@ static const exit_handler_fn hyp_exit_handlers[] = { [ESR_ELx_EC_MOPS] = kvm_hyp_handle_mops, }; -static const exit_handler_fn *kvm_get_exit_handler_array(struct kvm_vcpu *vcpu) +static inline bool fixup_guest_exit(struct kvm_vcpu *vcpu, u64 *exit_code) { - return hyp_exit_handlers; -} + synchronize_vcpu_pstate(vcpu, exit_code); -static void early_exit_filter(struct kvm_vcpu *vcpu, u64 *exit_code) -{ /* * If we were in HYP context on entry, adjust the PSTATE view * so that the usual helpers work correctly. @@ -564,6 +563,8 @@ static void early_exit_filter(struct kvm_vcpu *vcpu, u64 *exit_code) *vcpu_cpsr(vcpu) &= ~(PSR_MODE_MASK | PSR_MODE32_BIT); *vcpu_cpsr(vcpu) |= mode; } + + return __fixup_guest_exit(vcpu, exit_code, hyp_exit_handlers); } /* Switch to the guest for VHE systems running in EL2 */ @@ -578,6 +579,8 @@ static int __kvm_vcpu_run_vhe(struct kvm_vcpu *vcpu) sysreg_save_host_state_vhe(host_ctxt); + fpsimd_lazy_switch_to_guest(vcpu); + /* * Note that ARM erratum 1165522 requires us to configure both stage 1 * and stage 2 translation for the guest context before we clear @@ -602,6 +605,8 @@ static int __kvm_vcpu_run_vhe(struct kvm_vcpu *vcpu) __deactivate_traps(vcpu); + fpsimd_lazy_switch_to_host(vcpu); + sysreg_restore_host_state_vhe(host_ctxt); if (guest_owns_fp_regs()) diff --git a/arch/arm64/kvm/vgic/vgic-init.c b/arch/arm64/kvm/vgic/vgic-init.c index bc7e22ab5d81..775461cf2d2d 100644 --- a/arch/arm64/kvm/vgic/vgic-init.c +++ b/arch/arm64/kvm/vgic/vgic-init.c @@ -34,9 +34,9 @@ * * CPU Interface: * - * - kvm_vgic_vcpu_init(): initialization of static data that - * doesn't depend on any sizing information or emulation type. No - * allocation is allowed there. + * - kvm_vgic_vcpu_init(): initialization of static data that doesn't depend + * on any sizing information. Private interrupts are allocated if not + * already allocated at vgic-creation time. */ /* EARLY INIT */ @@ -58,6 +58,8 @@ void kvm_vgic_early_init(struct kvm *kvm) /* CREATION */ +static int vgic_allocate_private_irqs_locked(struct kvm_vcpu *vcpu, u32 type); + /** * kvm_vgic_create: triggered by the instantiation of the VGIC device by * user space, either through the legacy KVM_CREATE_IRQCHIP ioctl (v2 only) @@ -112,6 +114,22 @@ int kvm_vgic_create(struct kvm *kvm, u32 type) goto out_unlock; } + kvm_for_each_vcpu(i, vcpu, kvm) { + ret = vgic_allocate_private_irqs_locked(vcpu, type); + if (ret) + break; + } + + if (ret) { + kvm_for_each_vcpu(i, vcpu, kvm) { + struct vgic_cpu *vgic_cpu = &vcpu->arch.vgic_cpu; + kfree(vgic_cpu->private_irqs); + vgic_cpu->private_irqs = NULL; + } + + goto out_unlock; + } + kvm->arch.vgic.in_kernel = true; kvm->arch.vgic.vgic_model = type; @@ -180,7 +198,7 @@ static int kvm_vgic_dist_init(struct kvm *kvm, unsigned int nr_spis) return 0; } -static int vgic_allocate_private_irqs_locked(struct kvm_vcpu *vcpu) +static int vgic_allocate_private_irqs_locked(struct kvm_vcpu *vcpu, u32 type) { struct vgic_cpu *vgic_cpu = &vcpu->arch.vgic_cpu; int i; @@ -218,17 +236,28 @@ static int vgic_allocate_private_irqs_locked(struct kvm_vcpu *vcpu) /* PPIs */ irq->config = VGIC_CONFIG_LEVEL; } + + switch (type) { + case KVM_DEV_TYPE_ARM_VGIC_V3: + irq->group = 1; + irq->mpidr = kvm_vcpu_get_mpidr_aff(vcpu); + break; + case KVM_DEV_TYPE_ARM_VGIC_V2: + irq->group = 0; + irq->targets = BIT(vcpu->vcpu_id); + break; + } } return 0; } -static int vgic_allocate_private_irqs(struct kvm_vcpu *vcpu) +static int vgic_allocate_private_irqs(struct kvm_vcpu *vcpu, u32 type) { int ret; mutex_lock(&vcpu->kvm->arch.config_lock); - ret = vgic_allocate_private_irqs_locked(vcpu); + ret = vgic_allocate_private_irqs_locked(vcpu, type); mutex_unlock(&vcpu->kvm->arch.config_lock); return ret; @@ -258,7 +287,7 @@ int kvm_vgic_vcpu_init(struct kvm_vcpu *vcpu) if (!irqchip_in_kernel(vcpu->kvm)) return 0; - ret = vgic_allocate_private_irqs(vcpu); + ret = vgic_allocate_private_irqs(vcpu, dist->vgic_model); if (ret) return ret; @@ -295,7 +324,7 @@ int vgic_init(struct kvm *kvm) { struct vgic_dist *dist = &kvm->arch.vgic; struct kvm_vcpu *vcpu; - int ret = 0, i; + int ret = 0; unsigned long idx; lockdep_assert_held(&kvm->arch.config_lock); @@ -315,35 +344,6 @@ int vgic_init(struct kvm *kvm) if (ret) goto out; - /* Initialize groups on CPUs created before the VGIC type was known */ - kvm_for_each_vcpu(idx, vcpu, kvm) { - ret = vgic_allocate_private_irqs_locked(vcpu); - if (ret) - goto out; - - for (i = 0; i < VGIC_NR_PRIVATE_IRQS; i++) { - struct vgic_irq *irq = vgic_get_vcpu_irq(vcpu, i); - - switch (dist->vgic_model) { - case KVM_DEV_TYPE_ARM_VGIC_V3: - irq->group = 1; - irq->mpidr = kvm_vcpu_get_mpidr_aff(vcpu); - break; - case KVM_DEV_TYPE_ARM_VGIC_V2: - irq->group = 0; - irq->targets = 1U << idx; - break; - default: - ret = -EINVAL; - } - - vgic_put_irq(kvm, irq); - - if (ret) - goto out; - } - } - /* * If we have GICv4.1 enabled, unconditionally request enable the * v4 support so that we get HW-accelerated vSGIs. Otherwise, only diff --git a/arch/arm64/mm/trans_pgd.c b/arch/arm64/mm/trans_pgd.c index 19c67ed1a21f..18543b603c77 100644 --- a/arch/arm64/mm/trans_pgd.c +++ b/arch/arm64/mm/trans_pgd.c @@ -162,6 +162,13 @@ static int copy_p4d(struct trans_pgd_info *info, pgd_t *dst_pgdp, unsigned long next; unsigned long addr = start; + if (pgd_none(READ_ONCE(*dst_pgdp))) { + dst_p4dp = trans_alloc(info); + if (!dst_p4dp) + return -ENOMEM; + pgd_populate(NULL, dst_pgdp, dst_p4dp); + } + dst_p4dp = p4d_offset(dst_pgdp, start); src_p4dp = p4d_offset(src_pgdp, start); do { diff --git a/arch/loongarch/include/asm/cpu-info.h b/arch/loongarch/include/asm/cpu-info.h index 35e0a230a484..7f5bc0ad9d50 100644 --- a/arch/loongarch/include/asm/cpu-info.h +++ b/arch/loongarch/include/asm/cpu-info.h @@ -76,27 +76,6 @@ extern const char *__cpu_full_name[]; #define cpu_family_string() __cpu_family[raw_smp_processor_id()] #define cpu_full_name_string() __cpu_full_name[raw_smp_processor_id()] -struct seq_file; -struct notifier_block; - -extern int register_proc_cpuinfo_notifier(struct notifier_block *nb); -extern int proc_cpuinfo_notifier_call_chain(unsigned long val, void *v); - -#define proc_cpuinfo_notifier(fn, pri) \ -({ \ - static struct notifier_block fn##_nb = { \ - .notifier_call = fn, \ - .priority = pri \ - }; \ - \ - register_proc_cpuinfo_notifier(&fn##_nb); \ -}) - -struct proc_cpuinfo_notifier_args { - struct seq_file *m; - unsigned long n; -}; - static inline bool cpus_are_siblings(int cpua, int cpub) { struct cpuinfo_loongarch *infoa = &cpu_data[cpua]; diff --git a/arch/loongarch/include/asm/smp.h b/arch/loongarch/include/asm/smp.h index 3383c9d24e94..b87d1d5e5890 100644 --- a/arch/loongarch/include/asm/smp.h +++ b/arch/loongarch/include/asm/smp.h @@ -77,6 +77,8 @@ extern int __cpu_logical_map[NR_CPUS]; #define SMP_IRQ_WORK BIT(ACTION_IRQ_WORK) #define SMP_CLEAR_VECTOR BIT(ACTION_CLEAR_VECTOR) +struct seq_file; + struct secondary_data { unsigned long stack; unsigned long thread_info; diff --git a/arch/loongarch/kernel/genex.S b/arch/loongarch/kernel/genex.S index 86d5d90ebefe..4f0912141781 100644 --- a/arch/loongarch/kernel/genex.S +++ b/arch/loongarch/kernel/genex.S @@ -18,16 +18,19 @@ .align 5 SYM_FUNC_START(__arch_cpu_idle) - /* start of rollback region */ - LONG_L t0, tp, TI_FLAGS - nop - andi t0, t0, _TIF_NEED_RESCHED - bnez t0, 1f - nop - nop - nop + /* start of idle interrupt region */ + ori t0, zero, CSR_CRMD_IE + /* idle instruction needs irq enabled */ + csrxchg t0, t0, LOONGARCH_CSR_CRMD + /* + * If an interrupt lands here; between enabling interrupts above and + * going idle on the next instruction, we must *NOT* go idle since the + * interrupt could have set TIF_NEED_RESCHED or caused an timer to need + * reprogramming. Fall through -- see handle_vint() below -- and have + * the idle loop take care of things. + */ idle 0 - /* end of rollback region */ + /* end of idle interrupt region */ 1: jr ra SYM_FUNC_END(__arch_cpu_idle) @@ -35,11 +38,10 @@ SYM_CODE_START(handle_vint) UNWIND_HINT_UNDEFINED BACKUP_T0T1 SAVE_ALL - la_abs t1, __arch_cpu_idle + la_abs t1, 1b LONG_L t0, sp, PT_ERA - /* 32 byte rollback region */ - ori t0, t0, 0x1f - xori t0, t0, 0x1f + /* 3 instructions idle interrupt region */ + ori t0, t0, 0b1100 bne t0, t1, 1f LONG_S t0, sp, PT_ERA 1: move a0, sp diff --git a/arch/loongarch/kernel/idle.c b/arch/loongarch/kernel/idle.c index 0b5dd2faeb90..54b247d8cdb6 100644 --- a/arch/loongarch/kernel/idle.c +++ b/arch/loongarch/kernel/idle.c @@ -11,7 +11,6 @@ void __cpuidle arch_cpu_idle(void) { - raw_local_irq_enable(); - __arch_cpu_idle(); /* idle instruction needs irq enabled */ + __arch_cpu_idle(); raw_local_irq_disable(); } diff --git a/arch/loongarch/kernel/proc.c b/arch/loongarch/kernel/proc.c index 6ce46d92f1f1..cea30768ae92 100644 --- a/arch/loongarch/kernel/proc.c +++ b/arch/loongarch/kernel/proc.c @@ -13,28 +13,12 @@ #include <asm/processor.h> #include <asm/time.h> -/* - * No lock; only written during early bootup by CPU 0. - */ -static RAW_NOTIFIER_HEAD(proc_cpuinfo_chain); - -int __ref register_proc_cpuinfo_notifier(struct notifier_block *nb) -{ - return raw_notifier_chain_register(&proc_cpuinfo_chain, nb); -} - -int proc_cpuinfo_notifier_call_chain(unsigned long val, void *v) -{ - return raw_notifier_call_chain(&proc_cpuinfo_chain, val, v); -} - static int show_cpuinfo(struct seq_file *m, void *v) { unsigned long n = (unsigned long) v - 1; unsigned int isa = cpu_data[n].isa_level; unsigned int version = cpu_data[n].processor_id & 0xff; unsigned int fp_version = cpu_data[n].fpu_vers; - struct proc_cpuinfo_notifier_args proc_cpuinfo_notifier_args; #ifdef CONFIG_SMP if (!cpu_online(n)) @@ -91,20 +75,13 @@ static int show_cpuinfo(struct seq_file *m, void *v) if (cpu_has_lbt_mips) seq_printf(m, " lbt_mips"); seq_printf(m, "\n"); - seq_printf(m, "Hardware Watchpoint\t: %s", - cpu_has_watch ? "yes, " : "no\n"); + seq_printf(m, "Hardware Watchpoint\t: %s", str_yes_no(cpu_has_watch)); if (cpu_has_watch) { - seq_printf(m, "iwatch count: %d, dwatch count: %d\n", + seq_printf(m, ", iwatch count: %d, dwatch count: %d", cpu_data[n].watch_ireg_count, cpu_data[n].watch_dreg_count); } - proc_cpuinfo_notifier_args.m = m; - proc_cpuinfo_notifier_args.n = n; - - raw_notifier_call_chain(&proc_cpuinfo_chain, 0, - &proc_cpuinfo_notifier_args); - - seq_printf(m, "\n"); + seq_printf(m, "\n\n"); return 0; } diff --git a/arch/loongarch/kernel/reset.c b/arch/loongarch/kernel/reset.c index 1ef8c6383535..de8fa5a8a825 100644 --- a/arch/loongarch/kernel/reset.c +++ b/arch/loongarch/kernel/reset.c @@ -33,7 +33,7 @@ void machine_halt(void) console_flush_on_panic(CONSOLE_FLUSH_PENDING); while (true) { - __arch_cpu_idle(); + __asm__ __volatile__("idle 0" : : : "memory"); } } @@ -53,7 +53,7 @@ void machine_power_off(void) #endif while (true) { - __arch_cpu_idle(); + __asm__ __volatile__("idle 0" : : : "memory"); } } @@ -74,6 +74,6 @@ void machine_restart(char *command) acpi_reboot(); while (true) { - __arch_cpu_idle(); + __asm__ __volatile__("idle 0" : : : "memory"); } } diff --git a/arch/loongarch/kvm/main.c b/arch/loongarch/kvm/main.c index bf9268bf26d5..f6d3242b9234 100644 --- a/arch/loongarch/kvm/main.c +++ b/arch/loongarch/kvm/main.c @@ -303,9 +303,9 @@ int kvm_arch_enable_virtualization_cpu(void) * TOE=0: Trap on Exception. * TIT=0: Trap on Timer. */ - if (env & CSR_GCFG_GCIP_ALL) + if (env & CSR_GCFG_GCIP_SECURE) gcfg |= CSR_GCFG_GCI_SECURE; - if (env & CSR_GCFG_MATC_ROOT) + if (env & CSR_GCFG_MATP_ROOT) gcfg |= CSR_GCFG_MATC_ROOT; write_csr_gcfg(gcfg); diff --git a/arch/loongarch/kvm/switch.S b/arch/loongarch/kvm/switch.S index 0c292f818492..1be185e94807 100644 --- a/arch/loongarch/kvm/switch.S +++ b/arch/loongarch/kvm/switch.S @@ -85,7 +85,7 @@ * Guest CRMD comes from separate GCSR_CRMD register */ ori t0, zero, CSR_PRMD_PIE - csrxchg t0, t0, LOONGARCH_CSR_PRMD + csrwr t0, LOONGARCH_CSR_PRMD /* Set PVM bit to setup ertn to guest context */ ori t0, zero, CSR_GSTAT_PVM diff --git a/arch/loongarch/kvm/vcpu.c b/arch/loongarch/kvm/vcpu.c index fb72095c8077..20f941af3e9e 100644 --- a/arch/loongarch/kvm/vcpu.c +++ b/arch/loongarch/kvm/vcpu.c @@ -1548,9 +1548,6 @@ static int _kvm_vcpu_load(struct kvm_vcpu *vcpu, int cpu) /* Restore timer state regardless */ kvm_restore_timer(vcpu); - - /* Control guest page CCA attribute */ - change_csr_gcfg(CSR_GCFG_MATC_MASK, CSR_GCFG_MATC_ROOT); kvm_make_request(KVM_REQ_STEAL_UPDATE, vcpu); /* Restore hardware PMU CSRs */ diff --git a/arch/loongarch/lib/csum.c b/arch/loongarch/lib/csum.c index a5e84b403c3b..df309ae4045d 100644 --- a/arch/loongarch/lib/csum.c +++ b/arch/loongarch/lib/csum.c @@ -25,7 +25,7 @@ unsigned int __no_sanitize_address do_csum(const unsigned char *buff, int len) const u64 *ptr; u64 data, sum64 = 0; - if (unlikely(len == 0)) + if (unlikely(len <= 0)) return 0; offset = (unsigned long)buff & 7; diff --git a/arch/loongarch/mm/pageattr.c b/arch/loongarch/mm/pageattr.c index bf8678248444..99165903908a 100644 --- a/arch/loongarch/mm/pageattr.c +++ b/arch/loongarch/mm/pageattr.c @@ -3,6 +3,7 @@ * Copyright (C) 2024 Loongson Technology Corporation Limited */ +#include <linux/memblock.h> #include <linux/pagewalk.h> #include <linux/pgtable.h> #include <asm/set_memory.h> @@ -167,7 +168,7 @@ bool kernel_page_present(struct page *page) unsigned long addr = (unsigned long)page_address(page); if (addr < vm_map_base) - return true; + return memblock_is_memory(__pa(addr)); pgd = pgd_offset_k(addr); if (pgd_none(pgdp_get(pgd))) diff --git a/arch/mips/include/asm/ptrace.h b/arch/mips/include/asm/ptrace.h index 4a2b40ce39e0..85fa9962266a 100644 --- a/arch/mips/include/asm/ptrace.h +++ b/arch/mips/include/asm/ptrace.h @@ -27,8 +27,8 @@ */ struct pt_regs { #ifdef CONFIG_32BIT - /* Pad bytes for argument save space on the stack. */ - unsigned long pad0[8]; + /* Saved syscall stack arguments; entries 0-3 unused. */ + unsigned long args[8]; #endif /* Saved main processor registers. */ diff --git a/arch/mips/include/asm/syscall.h b/arch/mips/include/asm/syscall.h index ebdf4d910af2..056aa1b713e2 100644 --- a/arch/mips/include/asm/syscall.h +++ b/arch/mips/include/asm/syscall.h @@ -57,37 +57,21 @@ static inline void mips_syscall_update_nr(struct task_struct *task, static inline void mips_get_syscall_arg(unsigned long *arg, struct task_struct *task, struct pt_regs *regs, unsigned int n) { - unsigned long usp __maybe_unused = regs->regs[29]; - +#ifdef CONFIG_32BIT switch (n) { case 0: case 1: case 2: case 3: *arg = regs->regs[4 + n]; - - return; - -#ifdef CONFIG_32BIT - case 4: case 5: case 6: case 7: - get_user(*arg, (int *)usp + n); return; -#endif - -#ifdef CONFIG_64BIT case 4: case 5: case 6: case 7: -#ifdef CONFIG_MIPS32_O32 - if (test_tsk_thread_flag(task, TIF_32BIT_REGS)) - get_user(*arg, (int *)usp + n); - else -#endif - *arg = regs->regs[4 + n]; - + *arg = regs->args[n]; return; -#endif - - default: - BUG(); } - - unreachable(); +#else + *arg = regs->regs[4 + n]; + if ((IS_ENABLED(CONFIG_MIPS32_O32) && + test_tsk_thread_flag(task, TIF_32BIT_REGS))) + *arg = (unsigned int)*arg; +#endif } static inline long syscall_get_error(struct task_struct *task, diff --git a/arch/mips/kernel/asm-offsets.c b/arch/mips/kernel/asm-offsets.c index cb1045ebab06..b910ec54a3a1 100644 --- a/arch/mips/kernel/asm-offsets.c +++ b/arch/mips/kernel/asm-offsets.c @@ -27,6 +27,12 @@ void output_ptreg_defines(void); void output_ptreg_defines(void) { COMMENT("MIPS pt_regs offsets."); +#ifdef CONFIG_32BIT + OFFSET(PT_ARG4, pt_regs, args[4]); + OFFSET(PT_ARG5, pt_regs, args[5]); + OFFSET(PT_ARG6, pt_regs, args[6]); + OFFSET(PT_ARG7, pt_regs, args[7]); +#endif OFFSET(PT_R0, pt_regs, regs[0]); OFFSET(PT_R1, pt_regs, regs[1]); OFFSET(PT_R2, pt_regs, regs[2]); diff --git a/arch/mips/kernel/scall32-o32.S b/arch/mips/kernel/scall32-o32.S index 2c604717e630..4947a4f39e37 100644 --- a/arch/mips/kernel/scall32-o32.S +++ b/arch/mips/kernel/scall32-o32.S @@ -64,10 +64,10 @@ load_a6: user_lw(t7, 24(t0)) # argument #7 from usp load_a7: user_lw(t8, 28(t0)) # argument #8 from usp loads_done: - sw t5, 16(sp) # argument #5 to ksp - sw t6, 20(sp) # argument #6 to ksp - sw t7, 24(sp) # argument #7 to ksp - sw t8, 28(sp) # argument #8 to ksp + sw t5, PT_ARG4(sp) # argument #5 to ksp + sw t6, PT_ARG5(sp) # argument #6 to ksp + sw t7, PT_ARG6(sp) # argument #7 to ksp + sw t8, PT_ARG7(sp) # argument #8 to ksp .set pop .section __ex_table,"a" diff --git a/arch/s390/configs/debug_defconfig b/arch/s390/configs/debug_defconfig index d6beec5292a0..44f01a4bc810 100644 --- a/arch/s390/configs/debug_defconfig +++ b/arch/s390/configs/debug_defconfig @@ -740,7 +740,6 @@ CONFIG_IMA=y CONFIG_IMA_DEFAULT_HASH_SHA256=y CONFIG_IMA_WRITE_POLICY=y CONFIG_IMA_APPRAISE=y -CONFIG_LSM="yama,loadpin,safesetid,integrity,selinux,smack,tomoyo,apparmor" CONFIG_BUG_ON_DATA_CORRUPTION=y CONFIG_CRYPTO_USER=m # CONFIG_CRYPTO_MANAGER_DISABLE_TESTS is not set diff --git a/arch/s390/configs/defconfig b/arch/s390/configs/defconfig index 8cfbfb10bba8..8bcd37edd3c9 100644 --- a/arch/s390/configs/defconfig +++ b/arch/s390/configs/defconfig @@ -725,7 +725,6 @@ CONFIG_IMA=y CONFIG_IMA_DEFAULT_HASH_SHA256=y CONFIG_IMA_WRITE_POLICY=y CONFIG_IMA_APPRAISE=y -CONFIG_LSM="yama,loadpin,safesetid,integrity,selinux,smack,tomoyo,apparmor" CONFIG_BUG_ON_DATA_CORRUPTION=y CONFIG_CRYPTO_FIPS=y CONFIG_CRYPTO_USER=m diff --git a/arch/s390/configs/zfcpdump_defconfig b/arch/s390/configs/zfcpdump_defconfig index bcbaa069de96..853b2326a171 100644 --- a/arch/s390/configs/zfcpdump_defconfig +++ b/arch/s390/configs/zfcpdump_defconfig @@ -62,7 +62,6 @@ CONFIG_ZFCP=y # CONFIG_INOTIFY_USER is not set # CONFIG_MISC_FILESYSTEMS is not set # CONFIG_NETWORK_FILESYSTEMS is not set -CONFIG_LSM="yama,loadpin,safesetid,integrity" # CONFIG_ZLIB_DFLTCC is not set CONFIG_XZ_DEC_MICROLZMA=y CONFIG_PRINTK_TIME=y diff --git a/arch/s390/include/asm/bitops.h b/arch/s390/include/asm/bitops.h index d5125296ade2..a5ca0a947691 100644 --- a/arch/s390/include/asm/bitops.h +++ b/arch/s390/include/asm/bitops.h @@ -53,7 +53,11 @@ static __always_inline bool arch_test_bit(unsigned long nr, const volatile unsig unsigned long mask; int cc; - if (__builtin_constant_p(nr)) { + /* + * With CONFIG_PROFILE_ALL_BRANCHES enabled gcc fails to + * handle __builtin_constant_p() in some cases. + */ + if (!IS_ENABLED(CONFIG_PROFILE_ALL_BRANCHES) && __builtin_constant_p(nr)) { addr = (const volatile unsigned char *)ptr; addr += (nr ^ (BITS_PER_LONG - BITS_PER_BYTE)) / BITS_PER_BYTE; mask = 1UL << (nr & (BITS_PER_BYTE - 1)); diff --git a/arch/s390/pci/pci_bus.c b/arch/s390/pci/pci_bus.c index 857afbc4828f..39a481ec4a40 100644 --- a/arch/s390/pci/pci_bus.c +++ b/arch/s390/pci/pci_bus.c @@ -331,6 +331,17 @@ error: return rc; } +static bool zpci_bus_is_isolated_vf(struct zpci_bus *zbus, struct zpci_dev *zdev) +{ + struct pci_dev *pdev; + + pdev = zpci_iov_find_parent_pf(zbus, zdev); + if (!pdev) + return true; + pci_dev_put(pdev); + return false; +} + int zpci_bus_device_register(struct zpci_dev *zdev, struct pci_ops *ops) { bool topo_is_tid = zdev->tid_avail; @@ -345,6 +356,15 @@ int zpci_bus_device_register(struct zpci_dev *zdev, struct pci_ops *ops) topo = topo_is_tid ? zdev->tid : zdev->pchid; zbus = zpci_bus_get(topo, topo_is_tid); + /* + * An isolated VF gets its own domain/bus even if there exists + * a matching domain/bus already + */ + if (zbus && zpci_bus_is_isolated_vf(zbus, zdev)) { + zpci_bus_put(zbus); + zbus = NULL; + } + if (!zbus) { zbus = zpci_bus_alloc(topo, topo_is_tid); if (!zbus) diff --git a/arch/s390/pci/pci_iov.c b/arch/s390/pci/pci_iov.c index ead062bf2b41..191e56a623f6 100644 --- a/arch/s390/pci/pci_iov.c +++ b/arch/s390/pci/pci_iov.c @@ -60,18 +60,35 @@ static int zpci_iov_link_virtfn(struct pci_dev *pdev, struct pci_dev *virtfn, in return 0; } -int zpci_iov_setup_virtfn(struct zpci_bus *zbus, struct pci_dev *virtfn, int vfn) +/** + * zpci_iov_find_parent_pf - Find the parent PF, if any, of the given function + * @zbus: The bus that the PCI function is on, or would be added on + * @zdev: The PCI function + * + * Finds the parent PF, if it exists and is configured, of the given PCI function + * and increments its refcount. Th PF is searched for on the provided bus so the + * caller has to ensure that this is the correct bus to search. This function may + * be used before adding the PCI function to a zbus. + * + * Return: Pointer to the struct pci_dev of the parent PF or NULL if it not + * found. If the function is not a VF or has no RequesterID information, + * NULL is returned as well. + */ +struct pci_dev *zpci_iov_find_parent_pf(struct zpci_bus *zbus, struct zpci_dev *zdev) { - int i, cand_devfn; - struct zpci_dev *zdev; + int i, vfid, devfn, cand_devfn; struct pci_dev *pdev; - int vfid = vfn - 1; /* Linux' vfid's start at 0 vfn at 1*/ - int rc = 0; if (!zbus->multifunction) - return 0; - - /* If the parent PF for the given VF is also configured in the + return NULL; + /* Non-VFs and VFs without RID available don't have a parent */ + if (!zdev->vfn || !zdev->rid_available) + return NULL; + /* Linux vfid starts at 0 vfn at 1 */ + vfid = zdev->vfn - 1; + devfn = zdev->rid & ZPCI_RID_MASK_DEVFN; + /* + * If the parent PF for the given VF is also configured in the * instance, it must be on the same zbus. * We can then identify the parent PF by checking what * devfn the VF would have if it belonged to that PF using the PF's @@ -85,15 +102,26 @@ int zpci_iov_setup_virtfn(struct zpci_bus *zbus, struct pci_dev *virtfn, int vfn if (!pdev) continue; cand_devfn = pci_iov_virtfn_devfn(pdev, vfid); - if (cand_devfn == virtfn->devfn) { - rc = zpci_iov_link_virtfn(pdev, virtfn, vfid); - /* balance pci_get_slot() */ - pci_dev_put(pdev); - break; - } + if (cand_devfn == devfn) + return pdev; /* balance pci_get_slot() */ pci_dev_put(pdev); } } + return NULL; +} + +int zpci_iov_setup_virtfn(struct zpci_bus *zbus, struct pci_dev *virtfn, int vfn) +{ + struct zpci_dev *zdev = to_zpci(virtfn); + struct pci_dev *pdev_pf; + int rc = 0; + + pdev_pf = zpci_iov_find_parent_pf(zbus, zdev); + if (pdev_pf) { + /* Linux' vfids start at 0 while zdev->vfn starts at 1 */ + rc = zpci_iov_link_virtfn(pdev_pf, virtfn, zdev->vfn - 1); + pci_dev_put(pdev_pf); + } return rc; } diff --git a/arch/s390/pci/pci_iov.h b/arch/s390/pci/pci_iov.h index e3fa4e77fc86..d2c2793eb0f3 100644 --- a/arch/s390/pci/pci_iov.h +++ b/arch/s390/pci/pci_iov.h @@ -19,6 +19,8 @@ void zpci_iov_map_resources(struct pci_dev *pdev); int zpci_iov_setup_virtfn(struct zpci_bus *zbus, struct pci_dev *virtfn, int vfn); +struct pci_dev *zpci_iov_find_parent_pf(struct zpci_bus *zbus, struct zpci_dev *zdev); + #else /* CONFIG_PCI_IOV */ static inline void zpci_iov_remove_virtfn(struct pci_dev *pdev, int vfn) {} @@ -28,5 +30,10 @@ static inline int zpci_iov_setup_virtfn(struct zpci_bus *zbus, struct pci_dev *v { return 0; } + +static inline struct pci_dev *zpci_iov_find_parent_pf(struct zpci_bus *zbus, struct zpci_dev *zdev) +{ + return NULL; +} #endif /* CONFIG_PCI_IOV */ #endif /* __S390_PCI_IOV_h */ diff --git a/arch/um/drivers/virt-pci.c b/arch/um/drivers/virt-pci.c index 744e7f31e8ef..dd5580f975cc 100644 --- a/arch/um/drivers/virt-pci.c +++ b/arch/um/drivers/virt-pci.c @@ -25,8 +25,10 @@ #define MAX_IRQ_MSG_SIZE (sizeof(struct virtio_pcidev_msg) + sizeof(u32)) #define NUM_IRQ_MSGS 10 -#define HANDLE_NO_FREE(ptr) ((void *)((unsigned long)(ptr) | 1)) -#define HANDLE_IS_NO_FREE(ptr) ((unsigned long)(ptr) & 1) +struct um_pci_message_buffer { + struct virtio_pcidev_msg hdr; + u8 data[8]; +}; struct um_pci_device { struct virtio_device *vdev; @@ -36,6 +38,11 @@ struct um_pci_device { struct virtqueue *cmd_vq, *irq_vq; +#define UM_PCI_WRITE_BUFS 20 + struct um_pci_message_buffer bufs[UM_PCI_WRITE_BUFS + 1]; + void *extra_ptrs[UM_PCI_WRITE_BUFS + 1]; + DECLARE_BITMAP(used_bufs, UM_PCI_WRITE_BUFS); + #define UM_PCI_STAT_WAITING 0 unsigned long status; @@ -61,12 +68,40 @@ static unsigned long um_pci_msi_used[BITS_TO_LONGS(MAX_MSI_VECTORS)]; static unsigned int um_pci_max_delay_us = 40000; module_param_named(max_delay_us, um_pci_max_delay_us, uint, 0644); -struct um_pci_message_buffer { - struct virtio_pcidev_msg hdr; - u8 data[8]; -}; +static int um_pci_get_buf(struct um_pci_device *dev, bool *posted) +{ + int i; + + for (i = 0; i < UM_PCI_WRITE_BUFS; i++) { + if (!test_and_set_bit(i, dev->used_bufs)) + return i; + } -static struct um_pci_message_buffer __percpu *um_pci_msg_bufs; + *posted = false; + return UM_PCI_WRITE_BUFS; +} + +static void um_pci_free_buf(struct um_pci_device *dev, void *buf) +{ + int i; + + if (buf == &dev->bufs[UM_PCI_WRITE_BUFS]) { + kfree(dev->extra_ptrs[UM_PCI_WRITE_BUFS]); + dev->extra_ptrs[UM_PCI_WRITE_BUFS] = NULL; + return; + } + + for (i = 0; i < UM_PCI_WRITE_BUFS; i++) { + if (buf == &dev->bufs[i]) { + kfree(dev->extra_ptrs[i]); + dev->extra_ptrs[i] = NULL; + WARN_ON(!test_and_clear_bit(i, dev->used_bufs)); + return; + } + } + + WARN_ON(1); +} static int um_pci_send_cmd(struct um_pci_device *dev, struct virtio_pcidev_msg *cmd, @@ -82,7 +117,9 @@ static int um_pci_send_cmd(struct um_pci_device *dev, }; struct um_pci_message_buffer *buf; int delay_count = 0; + bool bounce_out; int ret, len; + int buf_idx; bool posted; if (WARN_ON(cmd_size < sizeof(*cmd) || cmd_size > sizeof(*buf))) @@ -101,26 +138,28 @@ static int um_pci_send_cmd(struct um_pci_device *dev, break; } - buf = get_cpu_var(um_pci_msg_bufs); - if (buf) - memcpy(buf, cmd, cmd_size); + bounce_out = !posted && cmd_size <= sizeof(*cmd) && + out && out_size <= sizeof(buf->data); - if (posted) { - u8 *ncmd = kmalloc(cmd_size + extra_size, GFP_ATOMIC); - - if (ncmd) { - memcpy(ncmd, cmd, cmd_size); - if (extra) - memcpy(ncmd + cmd_size, extra, extra_size); - cmd = (void *)ncmd; - cmd_size += extra_size; - extra = NULL; - extra_size = 0; - } else { - /* try without allocating memory */ - posted = false; - cmd = (void *)buf; + buf_idx = um_pci_get_buf(dev, &posted); + buf = &dev->bufs[buf_idx]; + memcpy(buf, cmd, cmd_size); + + if (posted && extra && extra_size > sizeof(buf) - cmd_size) { + dev->extra_ptrs[buf_idx] = kmemdup(extra, extra_size, + GFP_ATOMIC); + + if (!dev->extra_ptrs[buf_idx]) { + um_pci_free_buf(dev, buf); + return -ENOMEM; } + extra = dev->extra_ptrs[buf_idx]; + } else if (extra && extra_size <= sizeof(buf) - cmd_size) { + memcpy((u8 *)buf + cmd_size, extra, extra_size); + cmd_size += extra_size; + extra_size = 0; + extra = NULL; + cmd = (void *)buf; } else { cmd = (void *)buf; } @@ -128,39 +167,40 @@ static int um_pci_send_cmd(struct um_pci_device *dev, sg_init_one(&out_sg, cmd, cmd_size); if (extra) sg_init_one(&extra_sg, extra, extra_size); - if (out) + /* allow stack for small buffers */ + if (bounce_out) + sg_init_one(&in_sg, buf->data, out_size); + else if (out) sg_init_one(&in_sg, out, out_size); /* add to internal virtio queue */ ret = virtqueue_add_sgs(dev->cmd_vq, sgs_list, extra ? 2 : 1, out ? 1 : 0, - posted ? cmd : HANDLE_NO_FREE(cmd), - GFP_ATOMIC); + cmd, GFP_ATOMIC); if (ret) { - if (posted) - kfree(cmd); - goto out; + um_pci_free_buf(dev, buf); + return ret; } if (posted) { virtqueue_kick(dev->cmd_vq); - ret = 0; - goto out; + return 0; } /* kick and poll for getting a response on the queue */ set_bit(UM_PCI_STAT_WAITING, &dev->status); virtqueue_kick(dev->cmd_vq); + ret = 0; while (1) { void *completed = virtqueue_get_buf(dev->cmd_vq, &len); - if (completed == HANDLE_NO_FREE(cmd)) + if (completed == buf) break; - if (completed && !HANDLE_IS_NO_FREE(completed)) - kfree(completed); + if (completed) + um_pci_free_buf(dev, completed); if (WARN_ONCE(virtqueue_is_broken(dev->cmd_vq) || ++delay_count > um_pci_max_delay_us, @@ -172,8 +212,11 @@ static int um_pci_send_cmd(struct um_pci_device *dev, } clear_bit(UM_PCI_STAT_WAITING, &dev->status); -out: - put_cpu_var(um_pci_msg_bufs); + if (bounce_out) + memcpy(out, buf->data, out_size); + + um_pci_free_buf(dev, buf); + return ret; } @@ -187,20 +230,13 @@ static unsigned long um_pci_cfgspace_read(void *priv, unsigned int offset, .size = size, .addr = offset, }; - /* buf->data is maximum size - we may only use parts of it */ - struct um_pci_message_buffer *buf; - u8 *data; - unsigned long ret = ULONG_MAX; - size_t bytes = sizeof(buf->data); + /* max 8, we might not use it all */ + u8 data[8]; if (!dev) return ULONG_MAX; - buf = get_cpu_var(um_pci_msg_bufs); - data = buf->data; - - if (buf) - memset(data, 0xff, bytes); + memset(data, 0xff, sizeof(data)); switch (size) { case 1: @@ -212,34 +248,26 @@ static unsigned long um_pci_cfgspace_read(void *priv, unsigned int offset, break; default: WARN(1, "invalid config space read size %d\n", size); - goto out; + return ULONG_MAX; } - if (um_pci_send_cmd(dev, &hdr, sizeof(hdr), NULL, 0, data, bytes)) - goto out; + if (um_pci_send_cmd(dev, &hdr, sizeof(hdr), NULL, 0, data, size)) + return ULONG_MAX; switch (size) { case 1: - ret = data[0]; - break; + return data[0]; case 2: - ret = le16_to_cpup((void *)data); - break; + return le16_to_cpup((void *)data); case 4: - ret = le32_to_cpup((void *)data); - break; + return le32_to_cpup((void *)data); #ifdef CONFIG_64BIT case 8: - ret = le64_to_cpup((void *)data); - break; + return le64_to_cpup((void *)data); #endif default: - break; + return ULONG_MAX; } - -out: - put_cpu_var(um_pci_msg_bufs); - return ret; } static void um_pci_cfgspace_write(void *priv, unsigned int offset, int size, @@ -312,13 +340,8 @@ static void um_pci_bar_copy_from(void *priv, void *buffer, static unsigned long um_pci_bar_read(void *priv, unsigned int offset, int size) { - /* buf->data is maximum size - we may only use parts of it */ - struct um_pci_message_buffer *buf; - u8 *data; - unsigned long ret = ULONG_MAX; - - buf = get_cpu_var(um_pci_msg_bufs); - data = buf->data; + /* 8 is maximum size - we may only use parts of it */ + u8 data[8]; switch (size) { case 1: @@ -330,33 +353,25 @@ static unsigned long um_pci_bar_read(void *priv, unsigned int offset, break; default: WARN(1, "invalid config space read size %d\n", size); - goto out; + return ULONG_MAX; } um_pci_bar_copy_from(priv, data, offset, size); switch (size) { case 1: - ret = data[0]; - break; + return data[0]; case 2: - ret = le16_to_cpup((void *)data); - break; + return le16_to_cpup((void *)data); case 4: - ret = le32_to_cpup((void *)data); - break; + return le32_to_cpup((void *)data); #ifdef CONFIG_64BIT case 8: - ret = le64_to_cpup((void *)data); - break; + return le64_to_cpup((void *)data); #endif default: - break; + return ULONG_MAX; } - -out: - put_cpu_var(um_pci_msg_bufs); - return ret; } static void um_pci_bar_copy_to(void *priv, unsigned int offset, @@ -523,11 +538,8 @@ static void um_pci_cmd_vq_cb(struct virtqueue *vq) if (test_bit(UM_PCI_STAT_WAITING, &dev->status)) return; - while ((cmd = virtqueue_get_buf(vq, &len))) { - if (WARN_ON(HANDLE_IS_NO_FREE(cmd))) - continue; - kfree(cmd); - } + while ((cmd = virtqueue_get_buf(vq, &len))) + um_pci_free_buf(dev, cmd); } static void um_pci_irq_vq_cb(struct virtqueue *vq) @@ -1006,10 +1018,6 @@ static int __init um_pci_init(void) "No virtio device ID configured for PCI - no PCI support\n")) return 0; - um_pci_msg_bufs = alloc_percpu(struct um_pci_message_buffer); - if (!um_pci_msg_bufs) - return -ENOMEM; - bridge = pci_alloc_host_bridge(0); if (!bridge) { err = -ENOMEM; @@ -1070,7 +1078,6 @@ free: pci_free_resource_list(&bridge->windows); pci_free_host_bridge(bridge); } - free_percpu(um_pci_msg_bufs); return err; } module_init(um_pci_init); @@ -1082,6 +1089,5 @@ static void __exit um_pci_exit(void) irq_domain_remove(um_pci_inner_domain); pci_free_resource_list(&bridge->windows); pci_free_host_bridge(bridge); - free_percpu(um_pci_msg_bufs); } module_exit(um_pci_exit); diff --git a/arch/um/drivers/virtio_uml.c b/arch/um/drivers/virtio_uml.c index 65df43fa9be5..ad8d78fb1d9a 100644 --- a/arch/um/drivers/virtio_uml.c +++ b/arch/um/drivers/virtio_uml.c @@ -52,7 +52,7 @@ struct virtio_uml_device { struct platform_device *pdev; struct virtio_uml_platform_data *pdata; - spinlock_t sock_lock; + raw_spinlock_t sock_lock; int sock, req_fd, irq; u64 features; u64 protocol_features; @@ -246,7 +246,7 @@ static int vhost_user_send(struct virtio_uml_device *vu_dev, if (request_ack) msg->header.flags |= VHOST_USER_FLAG_NEED_REPLY; - spin_lock_irqsave(&vu_dev->sock_lock, flags); + raw_spin_lock_irqsave(&vu_dev->sock_lock, flags); rc = full_sendmsg_fds(vu_dev->sock, msg, size, fds, num_fds); if (rc < 0) goto out; @@ -266,7 +266,7 @@ static int vhost_user_send(struct virtio_uml_device *vu_dev, } out: - spin_unlock_irqrestore(&vu_dev->sock_lock, flags); + raw_spin_unlock_irqrestore(&vu_dev->sock_lock, flags); return rc; } @@ -1239,7 +1239,7 @@ static int virtio_uml_probe(struct platform_device *pdev) goto error_free; vu_dev->sock = rc; - spin_lock_init(&vu_dev->sock_lock); + raw_spin_lock_init(&vu_dev->sock_lock); rc = vhost_user_init(vu_dev); if (rc) diff --git a/arch/um/kernel/irq.c b/arch/um/kernel/irq.c index 338450741aac..a4991746f5ea 100644 --- a/arch/um/kernel/irq.c +++ b/arch/um/kernel/irq.c @@ -52,7 +52,7 @@ struct irq_entry { bool sigio_workaround; }; -static DEFINE_SPINLOCK(irq_lock); +static DEFINE_RAW_SPINLOCK(irq_lock); static LIST_HEAD(active_fds); static DECLARE_BITMAP(irqs_allocated, UM_LAST_SIGNAL_IRQ); static bool irqs_suspended; @@ -257,7 +257,7 @@ static struct irq_entry *get_irq_entry_by_fd(int fd) return NULL; } -static void free_irq_entry(struct irq_entry *to_free, bool remove) +static void remove_irq_entry(struct irq_entry *to_free, bool remove) { if (!to_free) return; @@ -265,7 +265,6 @@ static void free_irq_entry(struct irq_entry *to_free, bool remove) if (remove) os_del_epoll_fd(to_free->fd); list_del(&to_free->list); - kfree(to_free); } static bool update_irq_entry(struct irq_entry *entry) @@ -286,17 +285,19 @@ static bool update_irq_entry(struct irq_entry *entry) return false; } -static void update_or_free_irq_entry(struct irq_entry *entry) +static struct irq_entry *update_or_remove_irq_entry(struct irq_entry *entry) { - if (!update_irq_entry(entry)) - free_irq_entry(entry, false); + if (update_irq_entry(entry)) + return NULL; + remove_irq_entry(entry, false); + return entry; } static int activate_fd(int irq, int fd, enum um_irq_type type, void *dev_id, void (*timetravel_handler)(int, int, void *, struct time_travel_event *)) { - struct irq_entry *irq_entry; + struct irq_entry *irq_entry, *to_free = NULL; int err, events = os_event_mask(type); unsigned long flags; @@ -304,9 +305,10 @@ static int activate_fd(int irq, int fd, enum um_irq_type type, void *dev_id, if (err < 0) goto out; - spin_lock_irqsave(&irq_lock, flags); + raw_spin_lock_irqsave(&irq_lock, flags); irq_entry = get_irq_entry_by_fd(fd); if (irq_entry) { +already: /* cannot register the same FD twice with the same type */ if (WARN_ON(irq_entry->reg[type].events)) { err = -EALREADY; @@ -316,11 +318,22 @@ static int activate_fd(int irq, int fd, enum um_irq_type type, void *dev_id, /* temporarily disable to avoid IRQ-side locking */ os_del_epoll_fd(fd); } else { - irq_entry = kzalloc(sizeof(*irq_entry), GFP_ATOMIC); - if (!irq_entry) { - err = -ENOMEM; - goto out_unlock; + struct irq_entry *new; + + /* don't restore interrupts */ + raw_spin_unlock(&irq_lock); + new = kzalloc(sizeof(*irq_entry), GFP_ATOMIC); + if (!new) { + local_irq_restore(flags); + return -ENOMEM; } + raw_spin_lock(&irq_lock); + irq_entry = get_irq_entry_by_fd(fd); + if (irq_entry) { + to_free = new; + goto already; + } + irq_entry = new; irq_entry->fd = fd; list_add_tail(&irq_entry->list, &active_fds); maybe_sigio_broken(fd); @@ -339,12 +352,11 @@ static int activate_fd(int irq, int fd, enum um_irq_type type, void *dev_id, #endif WARN_ON(!update_irq_entry(irq_entry)); - spin_unlock_irqrestore(&irq_lock, flags); - - return 0; + err = 0; out_unlock: - spin_unlock_irqrestore(&irq_lock, flags); + raw_spin_unlock_irqrestore(&irq_lock, flags); out: + kfree(to_free); return err; } @@ -358,19 +370,20 @@ void free_irq_by_fd(int fd) struct irq_entry *to_free; unsigned long flags; - spin_lock_irqsave(&irq_lock, flags); + raw_spin_lock_irqsave(&irq_lock, flags); to_free = get_irq_entry_by_fd(fd); - free_irq_entry(to_free, true); - spin_unlock_irqrestore(&irq_lock, flags); + remove_irq_entry(to_free, true); + raw_spin_unlock_irqrestore(&irq_lock, flags); + kfree(to_free); } EXPORT_SYMBOL(free_irq_by_fd); static void free_irq_by_irq_and_dev(unsigned int irq, void *dev) { - struct irq_entry *entry; + struct irq_entry *entry, *to_free = NULL; unsigned long flags; - spin_lock_irqsave(&irq_lock, flags); + raw_spin_lock_irqsave(&irq_lock, flags); list_for_each_entry(entry, &active_fds, list) { enum um_irq_type i; @@ -386,12 +399,13 @@ static void free_irq_by_irq_and_dev(unsigned int irq, void *dev) os_del_epoll_fd(entry->fd); reg->events = 0; - update_or_free_irq_entry(entry); + to_free = update_or_remove_irq_entry(entry); goto out; } } out: - spin_unlock_irqrestore(&irq_lock, flags); + raw_spin_unlock_irqrestore(&irq_lock, flags); + kfree(to_free); } void deactivate_fd(int fd, int irqnum) @@ -402,7 +416,7 @@ void deactivate_fd(int fd, int irqnum) os_del_epoll_fd(fd); - spin_lock_irqsave(&irq_lock, flags); + raw_spin_lock_irqsave(&irq_lock, flags); entry = get_irq_entry_by_fd(fd); if (!entry) goto out; @@ -414,9 +428,10 @@ void deactivate_fd(int fd, int irqnum) entry->reg[i].events = 0; } - update_or_free_irq_entry(entry); + entry = update_or_remove_irq_entry(entry); out: - spin_unlock_irqrestore(&irq_lock, flags); + raw_spin_unlock_irqrestore(&irq_lock, flags); + kfree(entry); ignore_sigio_fd(fd); } @@ -546,7 +561,7 @@ void um_irqs_suspend(void) irqs_suspended = true; - spin_lock_irqsave(&irq_lock, flags); + raw_spin_lock_irqsave(&irq_lock, flags); list_for_each_entry(entry, &active_fds, list) { enum um_irq_type t; bool clear = true; @@ -579,7 +594,7 @@ void um_irqs_suspend(void) !__ignore_sigio_fd(entry->fd); } } - spin_unlock_irqrestore(&irq_lock, flags); + raw_spin_unlock_irqrestore(&irq_lock, flags); } void um_irqs_resume(void) @@ -588,7 +603,7 @@ void um_irqs_resume(void) unsigned long flags; - spin_lock_irqsave(&irq_lock, flags); + raw_spin_lock_irqsave(&irq_lock, flags); list_for_each_entry(entry, &active_fds, list) { if (entry->suspended) { int err = os_set_fd_async(entry->fd); @@ -602,7 +617,7 @@ void um_irqs_resume(void) } } } - spin_unlock_irqrestore(&irq_lock, flags); + raw_spin_unlock_irqrestore(&irq_lock, flags); irqs_suspended = false; send_sigio_to_self(); @@ -613,7 +628,7 @@ static int normal_irq_set_wake(struct irq_data *d, unsigned int on) struct irq_entry *entry; unsigned long flags; - spin_lock_irqsave(&irq_lock, flags); + raw_spin_lock_irqsave(&irq_lock, flags); list_for_each_entry(entry, &active_fds, list) { enum um_irq_type t; @@ -628,7 +643,7 @@ static int normal_irq_set_wake(struct irq_data *d, unsigned int on) } } unlock: - spin_unlock_irqrestore(&irq_lock, flags); + raw_spin_unlock_irqrestore(&irq_lock, flags); return 0; } #else diff --git a/arch/um/kernel/process.c b/arch/um/kernel/process.c index e5a2d4d897e0..0cd6fad3d908 100644 --- a/arch/um/kernel/process.c +++ b/arch/um/kernel/process.c @@ -191,7 +191,15 @@ void initial_thread_cb(void (*proc)(void *), void *arg) int arch_dup_task_struct(struct task_struct *dst, struct task_struct *src) { - memcpy(dst, src, arch_task_struct_size); + /* init_task is not dynamically sized (missing FPU state) */ + if (unlikely(src == &init_task)) { + memcpy(dst, src, sizeof(init_task)); + memset((void *)dst + sizeof(init_task), 0, + arch_task_struct_size - sizeof(init_task)); + } else { + memcpy(dst, src, arch_task_struct_size); + } + return 0; } diff --git a/arch/um/os-Linux/skas/process.c b/arch/um/os-Linux/skas/process.c index f683cfc9e51a..e2f8f156402f 100644 --- a/arch/um/os-Linux/skas/process.c +++ b/arch/um/os-Linux/skas/process.c @@ -181,6 +181,10 @@ extern char __syscall_stub_start[]; static int stub_exe_fd; +#ifndef CLOSE_RANGE_CLOEXEC +#define CLOSE_RANGE_CLOEXEC (1U << 2) +#endif + static int userspace_tramp(void *stack) { char *const argv[] = { "uml-userspace", NULL }; @@ -202,8 +206,12 @@ static int userspace_tramp(void *stack) init_data.stub_data_fd = phys_mapping(uml_to_phys(stack), &offset); init_data.stub_data_offset = MMAP_OFFSET(offset); - /* Set CLOEXEC on all FDs and then unset on all memory related FDs */ - close_range(0, ~0U, CLOSE_RANGE_CLOEXEC); + /* + * Avoid leaking unneeded FDs to the stub by setting CLOEXEC on all FDs + * and then unsetting it on all memory related FDs. + * This is not strictly necessary from a safety perspective. + */ + syscall(__NR_close_range, 0, ~0U, CLOSE_RANGE_CLOEXEC); fcntl(init_data.stub_data_fd, F_SETFD, 0); for (iomem = iomem_regions; iomem; iomem = iomem->next) @@ -224,7 +232,9 @@ static int userspace_tramp(void *stack) if (ret != sizeof(init_data)) exit(4); - execveat(stub_exe_fd, "", argv, NULL, AT_EMPTY_PATH); + /* Raw execveat for compatibility with older libc versions */ + syscall(__NR_execveat, stub_exe_fd, (unsigned long)"", + (unsigned long)argv, NULL, AT_EMPTY_PATH); exit(5); } diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index 87198d957e2f..be2c311f5118 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -2599,7 +2599,8 @@ config MITIGATION_IBPB_ENTRY depends on CPU_SUP_AMD && X86_64 default y help - Compile the kernel with support for the retbleed=ibpb mitigation. + Compile the kernel with support for the retbleed=ibpb and + spec_rstack_overflow={ibpb,ibpb-vmexit} mitigations. config MITIGATION_IBRS_ENTRY bool "Enable IBRS on kernel entry" diff --git a/arch/x86/events/intel/core.c b/arch/x86/events/intel/core.c index 7601196d1d18..e86333eee266 100644 --- a/arch/x86/events/intel/core.c +++ b/arch/x86/events/intel/core.c @@ -4905,20 +4905,22 @@ static inline bool intel_pmu_broken_perf_cap(void) static void update_pmu_cap(struct x86_hybrid_pmu *pmu) { - unsigned int sub_bitmaps, eax, ebx, ecx, edx; + unsigned int cntr, fixed_cntr, ecx, edx; + union cpuid35_eax eax; + union cpuid35_ebx ebx; - cpuid(ARCH_PERFMON_EXT_LEAF, &sub_bitmaps, &ebx, &ecx, &edx); + cpuid(ARCH_PERFMON_EXT_LEAF, &eax.full, &ebx.full, &ecx, &edx); - if (ebx & ARCH_PERFMON_EXT_UMASK2) + if (ebx.split.umask2) pmu->config_mask |= ARCH_PERFMON_EVENTSEL_UMASK2; - if (ebx & ARCH_PERFMON_EXT_EQ) + if (ebx.split.eq) pmu->config_mask |= ARCH_PERFMON_EVENTSEL_EQ; - if (sub_bitmaps & ARCH_PERFMON_NUM_COUNTER_LEAF_BIT) { + if (eax.split.cntr_subleaf) { cpuid_count(ARCH_PERFMON_EXT_LEAF, ARCH_PERFMON_NUM_COUNTER_LEAF, - &eax, &ebx, &ecx, &edx); - pmu->cntr_mask64 = eax; - pmu->fixed_cntr_mask64 = ebx; + &cntr, &fixed_cntr, &ecx, &edx); + pmu->cntr_mask64 = cntr; + pmu->fixed_cntr_mask64 = fixed_cntr; } if (!intel_pmu_broken_perf_cap()) { @@ -4941,11 +4943,6 @@ static void intel_pmu_check_hybrid_pmus(struct x86_hybrid_pmu *pmu) else pmu->intel_ctrl &= ~(1ULL << GLOBAL_CTRL_EN_PERF_METRICS); - if (pmu->intel_cap.pebs_output_pt_available) - pmu->pmu.capabilities |= PERF_PMU_CAP_AUX_OUTPUT; - else - pmu->pmu.capabilities &= ~PERF_PMU_CAP_AUX_OUTPUT; - intel_pmu_check_event_constraints(pmu->event_constraints, pmu->cntr_mask64, pmu->fixed_cntr_mask64, @@ -5023,9 +5020,6 @@ static bool init_hybrid_pmu(int cpu) pr_info("%s PMU driver: ", pmu->name); - if (pmu->intel_cap.pebs_output_pt_available) - pr_cont("PEBS-via-PT "); - pr_cont("\n"); x86_pmu_show_pmu_cap(&pmu->pmu); @@ -5048,8 +5042,11 @@ static void intel_pmu_cpu_starting(int cpu) init_debug_store_on_cpu(cpu); /* - * Deal with CPUs that don't clear their LBRs on power-up. + * Deal with CPUs that don't clear their LBRs on power-up, and that may + * even boot with LBRs enabled. */ + if (!static_cpu_has(X86_FEATURE_ARCH_LBR) && x86_pmu.lbr_nr) + msr_clear_bit(MSR_IA32_DEBUGCTLMSR, DEBUGCTLMSR_LBR_BIT); intel_pmu_lbr_reset(); cpuc->lbr_sel = NULL; @@ -6370,11 +6367,9 @@ static __always_inline int intel_pmu_init_hybrid(enum hybrid_pmu_type pmus) pmu->intel_cap.capabilities = x86_pmu.intel_cap.capabilities; if (pmu->pmu_type & hybrid_small_tiny) { pmu->intel_cap.perf_metrics = 0; - pmu->intel_cap.pebs_output_pt_available = 1; pmu->mid_ack = true; } else if (pmu->pmu_type & hybrid_big) { pmu->intel_cap.perf_metrics = 1; - pmu->intel_cap.pebs_output_pt_available = 0; pmu->late_ack = true; } } diff --git a/arch/x86/events/intel/ds.c b/arch/x86/events/intel/ds.c index ba74e1198328..c2e2eae7309c 100644 --- a/arch/x86/events/intel/ds.c +++ b/arch/x86/events/intel/ds.c @@ -2578,7 +2578,15 @@ void __init intel_ds_init(void) } pr_cont("PEBS fmt4%c%s, ", pebs_type, pebs_qual); - if (!is_hybrid() && x86_pmu.intel_cap.pebs_output_pt_available) { + /* + * The PEBS-via-PT is not supported on hybrid platforms, + * because not all CPUs of a hybrid machine support it. + * The global x86_pmu.intel_cap, which only contains the + * common capabilities, is used to check the availability + * of the feature. The per-PMU pebs_output_pt_available + * in a hybrid machine should be ignored. + */ + if (x86_pmu.intel_cap.pebs_output_pt_available) { pr_cont("PEBS-via-PT, "); x86_get_pmu(smp_processor_id())->capabilities |= PERF_PMU_CAP_AUX_OUTPUT; } diff --git a/arch/x86/events/rapl.c b/arch/x86/events/rapl.c index d3bb3865c1b1..4952faf03e82 100644 --- a/arch/x86/events/rapl.c +++ b/arch/x86/events/rapl.c @@ -370,6 +370,10 @@ static int rapl_pmu_event_init(struct perf_event *event) unsigned int rapl_pmu_idx; struct rapl_pmus *rapl_pmus; + /* only look at RAPL events */ + if (event->attr.type != event->pmu->type) + return -ENOENT; + /* unsupported modes and filters */ if (event->attr.sample_period) /* no sampling */ return -EINVAL; @@ -387,10 +391,6 @@ static int rapl_pmu_event_init(struct perf_event *event) rapl_pmus_scope = rapl_pmus->pmu.scope; if (rapl_pmus_scope == PERF_PMU_SCOPE_PKG || rapl_pmus_scope == PERF_PMU_SCOPE_DIE) { - /* only look at RAPL package events */ - if (event->attr.type != rapl_pmus_pkg->pmu.type) - return -ENOENT; - cfg = array_index_nospec((long)cfg, NR_RAPL_PKG_DOMAINS + 1); if (!cfg || cfg >= NR_RAPL_PKG_DOMAINS + 1) return -EINVAL; @@ -398,10 +398,6 @@ static int rapl_pmu_event_init(struct perf_event *event) bit = cfg - 1; event->hw.event_base = rapl_model->rapl_pkg_msrs[bit].msr; } else if (rapl_pmus_scope == PERF_PMU_SCOPE_CORE) { - /* only look at RAPL core events */ - if (event->attr.type != rapl_pmus_core->pmu.type) - return -ENOENT; - cfg = array_index_nospec((long)cfg, NR_RAPL_CORE_DOMAINS + 1); if (!cfg || cfg >= NR_RAPL_PKG_DOMAINS + 1) return -EINVAL; diff --git a/arch/x86/include/asm/kvm-x86-ops.h b/arch/x86/include/asm/kvm-x86-ops.h index c35550581da0..823c0434bbad 100644 --- a/arch/x86/include/asm/kvm-x86-ops.h +++ b/arch/x86/include/asm/kvm-x86-ops.h @@ -48,6 +48,7 @@ KVM_X86_OP(set_idt) KVM_X86_OP(get_gdt) KVM_X86_OP(set_gdt) KVM_X86_OP(sync_dirty_debug_regs) +KVM_X86_OP(set_dr6) KVM_X86_OP(set_dr7) KVM_X86_OP(cache_reg) KVM_X86_OP(get_rflags) diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index b15cde0a9b5c..0b7af5902ff7 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -1696,6 +1696,7 @@ struct kvm_x86_ops { void (*get_gdt)(struct kvm_vcpu *vcpu, struct desc_ptr *dt); void (*set_gdt)(struct kvm_vcpu *vcpu, struct desc_ptr *dt); void (*sync_dirty_debug_regs)(struct kvm_vcpu *vcpu); + void (*set_dr6)(struct kvm_vcpu *vcpu, unsigned long value); void (*set_dr7)(struct kvm_vcpu *vcpu, unsigned long value); void (*cache_reg)(struct kvm_vcpu *vcpu, enum kvm_reg reg); unsigned long (*get_rflags)(struct kvm_vcpu *vcpu); diff --git a/arch/x86/include/asm/msr-index.h b/arch/x86/include/asm/msr-index.h index 9a71880eec07..72765b2fe0d8 100644 --- a/arch/x86/include/asm/msr-index.h +++ b/arch/x86/include/asm/msr-index.h @@ -395,7 +395,8 @@ #define MSR_IA32_PASID_VALID BIT_ULL(31) /* DEBUGCTLMSR bits (others vary by model): */ -#define DEBUGCTLMSR_LBR (1UL << 0) /* last branch recording */ +#define DEBUGCTLMSR_LBR_BIT 0 /* last branch recording */ +#define DEBUGCTLMSR_LBR (1UL << DEBUGCTLMSR_LBR_BIT) #define DEBUGCTLMSR_BTF_SHIFT 1 #define DEBUGCTLMSR_BTF (1UL << 1) /* single-step on branches */ #define DEBUGCTLMSR_BUS_LOCK_DETECT (1UL << 2) diff --git a/arch/x86/include/asm/perf_event.h b/arch/x86/include/asm/perf_event.h index 1ac79f361645..0ba8d20f2d1d 100644 --- a/arch/x86/include/asm/perf_event.h +++ b/arch/x86/include/asm/perf_event.h @@ -188,11 +188,33 @@ union cpuid10_edx { * detection/enumeration details: */ #define ARCH_PERFMON_EXT_LEAF 0x00000023 -#define ARCH_PERFMON_EXT_UMASK2 0x1 -#define ARCH_PERFMON_EXT_EQ 0x2 -#define ARCH_PERFMON_NUM_COUNTER_LEAF_BIT 0x1 #define ARCH_PERFMON_NUM_COUNTER_LEAF 0x1 +union cpuid35_eax { + struct { + unsigned int leaf0:1; + /* Counters Sub-Leaf */ + unsigned int cntr_subleaf:1; + /* Auto Counter Reload Sub-Leaf */ + unsigned int acr_subleaf:1; + /* Events Sub-Leaf */ + unsigned int events_subleaf:1; + unsigned int reserved:28; + } split; + unsigned int full; +}; + +union cpuid35_ebx { + struct { + /* UnitMask2 Supported */ + unsigned int umask2:1; + /* EQ-bit Supported */ + unsigned int eq:1; + unsigned int reserved:30; + } split; + unsigned int full; +}; + /* * Intel Architectural LBR CPUID detection/enumeration details: */ diff --git a/arch/x86/include/asm/sev.h b/arch/x86/include/asm/sev.h index 5d9685f92e5c..1581246491b5 100644 --- a/arch/x86/include/asm/sev.h +++ b/arch/x86/include/asm/sev.h @@ -531,6 +531,7 @@ static inline void __init snp_secure_tsc_init(void) { } #ifdef CONFIG_KVM_AMD_SEV bool snp_probe_rmptable_info(void); +int snp_rmptable_init(void); int snp_lookup_rmpentry(u64 pfn, bool *assigned, int *level); void snp_dump_hva_rmpentry(unsigned long address); int psmash(u64 pfn); @@ -541,6 +542,7 @@ void kdump_sev_callback(void); void snp_fixup_e820_tables(void); #else static inline bool snp_probe_rmptable_info(void) { return false; } +static inline int snp_rmptable_init(void) { return -ENOSYS; } static inline int snp_lookup_rmpentry(u64 pfn, bool *assigned, int *level) { return -ENODEV; } static inline void snp_dump_hva_rmpentry(unsigned long address) {} static inline int psmash(u64 pfn) { return -ENODEV; } diff --git a/arch/x86/kernel/cpu/bugs.c b/arch/x86/kernel/cpu/bugs.c index 5a505aa65489..a5d0998d7604 100644 --- a/arch/x86/kernel/cpu/bugs.c +++ b/arch/x86/kernel/cpu/bugs.c @@ -1115,6 +1115,8 @@ do_cmd_auto: case RETBLEED_MITIGATION_IBPB: setup_force_cpu_cap(X86_FEATURE_ENTRY_IBPB); + setup_force_cpu_cap(X86_FEATURE_IBPB_ON_VMEXIT); + mitigate_smt = true; /* * IBPB on entry already obviates the need for @@ -1124,9 +1126,6 @@ do_cmd_auto: setup_clear_cpu_cap(X86_FEATURE_UNRET); setup_clear_cpu_cap(X86_FEATURE_RETHUNK); - setup_force_cpu_cap(X86_FEATURE_IBPB_ON_VMEXIT); - mitigate_smt = true; - /* * There is no need for RSB filling: entry_ibpb() ensures * all predictions, including the RSB, are invalidated, @@ -2646,6 +2645,7 @@ static void __init srso_select_mitigation(void) if (IS_ENABLED(CONFIG_MITIGATION_IBPB_ENTRY)) { if (has_microcode) { setup_force_cpu_cap(X86_FEATURE_ENTRY_IBPB); + setup_force_cpu_cap(X86_FEATURE_IBPB_ON_VMEXIT); srso_mitigation = SRSO_MITIGATION_IBPB; /* @@ -2655,6 +2655,13 @@ static void __init srso_select_mitigation(void) */ setup_clear_cpu_cap(X86_FEATURE_UNRET); setup_clear_cpu_cap(X86_FEATURE_RETHUNK); + + /* + * There is no need for RSB filling: entry_ibpb() ensures + * all predictions, including the RSB, are invalidated, + * regardless of IBPB implementation. + */ + setup_clear_cpu_cap(X86_FEATURE_RSB_VMEXIT); } } else { pr_err("WARNING: kernel not compiled with MITIGATION_IBPB_ENTRY.\n"); @@ -2663,8 +2670,8 @@ static void __init srso_select_mitigation(void) ibpb_on_vmexit: case SRSO_CMD_IBPB_ON_VMEXIT: - if (IS_ENABLED(CONFIG_MITIGATION_SRSO)) { - if (!boot_cpu_has(X86_FEATURE_ENTRY_IBPB) && has_microcode) { + if (IS_ENABLED(CONFIG_MITIGATION_IBPB_ENTRY)) { + if (has_microcode) { setup_force_cpu_cap(X86_FEATURE_IBPB_ON_VMEXIT); srso_mitigation = SRSO_MITIGATION_IBPB_ON_VMEXIT; @@ -2676,8 +2683,8 @@ ibpb_on_vmexit: setup_clear_cpu_cap(X86_FEATURE_RSB_VMEXIT); } } else { - pr_err("WARNING: kernel not compiled with MITIGATION_SRSO.\n"); - } + pr_err("WARNING: kernel not compiled with MITIGATION_IBPB_ENTRY.\n"); + } break; default: break; diff --git a/arch/x86/kvm/hyperv.c b/arch/x86/kvm/hyperv.c index 6a6dd5a84f22..6ebeb6cea6c0 100644 --- a/arch/x86/kvm/hyperv.c +++ b/arch/x86/kvm/hyperv.c @@ -2226,6 +2226,9 @@ static u64 kvm_hv_send_ipi(struct kvm_vcpu *vcpu, struct kvm_hv_hcall *hc) u32 vector; bool all_cpus; + if (!lapic_in_kernel(vcpu)) + return HV_STATUS_INVALID_HYPERCALL_INPUT; + if (hc->code == HVCALL_SEND_IPI) { if (!hc->fast) { if (unlikely(kvm_read_guest(kvm, hc->ingpa, &send_ipi, @@ -2852,7 +2855,8 @@ int kvm_get_hv_cpuid(struct kvm_vcpu *vcpu, struct kvm_cpuid2 *cpuid, ent->eax |= HV_X64_REMOTE_TLB_FLUSH_RECOMMENDED; ent->eax |= HV_X64_APIC_ACCESS_RECOMMENDED; ent->eax |= HV_X64_RELAXED_TIMING_RECOMMENDED; - ent->eax |= HV_X64_CLUSTER_IPI_RECOMMENDED; + if (!vcpu || lapic_in_kernel(vcpu)) + ent->eax |= HV_X64_CLUSTER_IPI_RECOMMENDED; ent->eax |= HV_X64_EX_PROCESSOR_MASKS_RECOMMENDED; if (evmcs_ver) ent->eax |= HV_X64_ENLIGHTENED_VMCS_RECOMMENDED; diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c index 74c20dbb92da..d4ac4a1f8b81 100644 --- a/arch/x86/kvm/mmu/mmu.c +++ b/arch/x86/kvm/mmu/mmu.c @@ -5540,7 +5540,7 @@ void kvm_init_shadow_npt_mmu(struct kvm_vcpu *vcpu, unsigned long cr0, union kvm_mmu_page_role root_role; /* NPT requires CR0.PG=1. */ - WARN_ON_ONCE(cpu_role.base.direct); + WARN_ON_ONCE(cpu_role.base.direct || !cpu_role.base.guest_mode); root_role = cpu_role.base; root_role.level = kvm_mmu_get_tdp_level(vcpu); diff --git a/arch/x86/kvm/svm/nested.c b/arch/x86/kvm/svm/nested.c index d77b094d9a4d..04c375bf1ac2 100644 --- a/arch/x86/kvm/svm/nested.c +++ b/arch/x86/kvm/svm/nested.c @@ -646,6 +646,11 @@ static void nested_vmcb02_prepare_control(struct vcpu_svm *svm, u32 pause_count12; u32 pause_thresh12; + nested_svm_transition_tlb_flush(vcpu); + + /* Enter Guest-Mode */ + enter_guest_mode(vcpu); + /* * Filled at exit: exit_code, exit_code_hi, exit_info_1, exit_info_2, * exit_int_info, exit_int_info_err, next_rip, insn_len, insn_bytes. @@ -762,11 +767,6 @@ static void nested_vmcb02_prepare_control(struct vcpu_svm *svm, } } - nested_svm_transition_tlb_flush(vcpu); - - /* Enter Guest-Mode */ - enter_guest_mode(vcpu); - /* * Merge guest and host intercepts - must be called with vcpu in * guest-mode to take effect. diff --git a/arch/x86/kvm/svm/sev.c b/arch/x86/kvm/svm/sev.c index a2a794c32050..0dbb25442ec1 100644 --- a/arch/x86/kvm/svm/sev.c +++ b/arch/x86/kvm/svm/sev.c @@ -2972,6 +2972,16 @@ void __init sev_hardware_setup(void) WARN_ON_ONCE(!boot_cpu_has(X86_FEATURE_FLUSHBYASID))) goto out; + /* + * The kernel's initcall infrastructure lacks the ability to express + * dependencies between initcalls, whereas the modules infrastructure + * automatically handles dependencies via symbol loading. Ensure the + * PSP SEV driver is initialized before proceeding if KVM is built-in, + * as the dependency isn't handled by the initcall infrastructure. + */ + if (IS_BUILTIN(CONFIG_KVM_AMD) && sev_module_init()) + goto out; + /* Retrieve SEV CPUID information */ cpuid(0x8000001f, &eax, &ebx, &ecx, &edx); diff --git a/arch/x86/kvm/svm/svm.c b/arch/x86/kvm/svm/svm.c index 7640a84e554a..a713c803a3a3 100644 --- a/arch/x86/kvm/svm/svm.c +++ b/arch/x86/kvm/svm/svm.c @@ -1991,11 +1991,11 @@ static void new_asid(struct vcpu_svm *svm, struct svm_cpu_data *sd) svm->asid = sd->next_asid++; } -static void svm_set_dr6(struct vcpu_svm *svm, unsigned long value) +static void svm_set_dr6(struct kvm_vcpu *vcpu, unsigned long value) { - struct vmcb *vmcb = svm->vmcb; + struct vmcb *vmcb = to_svm(vcpu)->vmcb; - if (svm->vcpu.arch.guest_state_protected) + if (vcpu->arch.guest_state_protected) return; if (unlikely(value != vmcb->save.dr6)) { @@ -4247,10 +4247,8 @@ static __no_kcsan fastpath_t svm_vcpu_run(struct kvm_vcpu *vcpu, * Run with all-zero DR6 unless needed, so that we can get the exact cause * of a #DB. */ - if (unlikely(vcpu->arch.switch_db_regs & KVM_DEBUGREG_WONT_EXIT)) - svm_set_dr6(svm, vcpu->arch.dr6); - else - svm_set_dr6(svm, DR6_ACTIVE_LOW); + if (likely(!(vcpu->arch.switch_db_regs & KVM_DEBUGREG_WONT_EXIT))) + svm_set_dr6(vcpu, DR6_ACTIVE_LOW); clgi(); kvm_load_guest_xsave_state(vcpu); @@ -5043,6 +5041,7 @@ static struct kvm_x86_ops svm_x86_ops __initdata = { .set_idt = svm_set_idt, .get_gdt = svm_get_gdt, .set_gdt = svm_set_gdt, + .set_dr6 = svm_set_dr6, .set_dr7 = svm_set_dr7, .sync_dirty_debug_regs = svm_sync_dirty_debug_regs, .cache_reg = svm_cache_reg, diff --git a/arch/x86/kvm/vmx/main.c b/arch/x86/kvm/vmx/main.c index 2427f918e763..43ee9ed11291 100644 --- a/arch/x86/kvm/vmx/main.c +++ b/arch/x86/kvm/vmx/main.c @@ -61,6 +61,7 @@ struct kvm_x86_ops vt_x86_ops __initdata = { .set_idt = vmx_set_idt, .get_gdt = vmx_get_gdt, .set_gdt = vmx_set_gdt, + .set_dr6 = vmx_set_dr6, .set_dr7 = vmx_set_dr7, .sync_dirty_debug_regs = vmx_sync_dirty_debug_regs, .cache_reg = vmx_cache_reg, diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c index f72835e85b6d..6c56d5235f0f 100644 --- a/arch/x86/kvm/vmx/vmx.c +++ b/arch/x86/kvm/vmx/vmx.c @@ -5648,6 +5648,12 @@ void vmx_sync_dirty_debug_regs(struct kvm_vcpu *vcpu) set_debugreg(DR6_RESERVED, 6); } +void vmx_set_dr6(struct kvm_vcpu *vcpu, unsigned long val) +{ + lockdep_assert_irqs_disabled(); + set_debugreg(vcpu->arch.dr6, 6); +} + void vmx_set_dr7(struct kvm_vcpu *vcpu, unsigned long val) { vmcs_writel(GUEST_DR7, val); @@ -7417,10 +7423,6 @@ fastpath_t vmx_vcpu_run(struct kvm_vcpu *vcpu, bool force_immediate_exit) vmx->loaded_vmcs->host_state.cr4 = cr4; } - /* When KVM_DEBUGREG_WONT_EXIT, dr6 is accessible in guest. */ - if (unlikely(vcpu->arch.switch_db_regs & KVM_DEBUGREG_WONT_EXIT)) - set_debugreg(vcpu->arch.dr6, 6); - /* When single-stepping over STI and MOV SS, we must clear the * corresponding interruptibility bits in the guest state. Otherwise * vmentry fails as it then expects bit 14 (BS) in pending debug diff --git a/arch/x86/kvm/vmx/x86_ops.h b/arch/x86/kvm/vmx/x86_ops.h index ce3295a67c04..430773a5ef8e 100644 --- a/arch/x86/kvm/vmx/x86_ops.h +++ b/arch/x86/kvm/vmx/x86_ops.h @@ -73,6 +73,7 @@ void vmx_get_idt(struct kvm_vcpu *vcpu, struct desc_ptr *dt); void vmx_set_idt(struct kvm_vcpu *vcpu, struct desc_ptr *dt); void vmx_get_gdt(struct kvm_vcpu *vcpu, struct desc_ptr *dt); void vmx_set_gdt(struct kvm_vcpu *vcpu, struct desc_ptr *dt); +void vmx_set_dr6(struct kvm_vcpu *vcpu, unsigned long val); void vmx_set_dr7(struct kvm_vcpu *vcpu, unsigned long val); void vmx_sync_dirty_debug_regs(struct kvm_vcpu *vcpu); void vmx_cache_reg(struct kvm_vcpu *vcpu, enum kvm_reg reg); diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 8e77e61d4fbd..02159c967d29 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -10961,6 +10961,9 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu) set_debugreg(vcpu->arch.eff_db[1], 1); set_debugreg(vcpu->arch.eff_db[2], 2); set_debugreg(vcpu->arch.eff_db[3], 3); + /* When KVM_DEBUGREG_WONT_EXIT, dr6 is accessible in guest. */ + if (unlikely(vcpu->arch.switch_db_regs & KVM_DEBUGREG_WONT_EXIT)) + kvm_x86_call(set_dr6)(vcpu, vcpu->arch.dr6); } else if (unlikely(hw_breakpoint_active())) { set_debugreg(0, 7); } diff --git a/arch/x86/um/os-Linux/registers.c b/arch/x86/um/os-Linux/registers.c index 76eaeb93928c..eb1cdadc8a61 100644 --- a/arch/x86/um/os-Linux/registers.c +++ b/arch/x86/um/os-Linux/registers.c @@ -18,6 +18,7 @@ #include <registers.h> #include <sys/mman.h> +static unsigned long ptrace_regset; unsigned long host_fp_size; int get_fp_registers(int pid, unsigned long *regs) @@ -27,7 +28,7 @@ int get_fp_registers(int pid, unsigned long *regs) .iov_len = host_fp_size, }; - if (ptrace(PTRACE_GETREGSET, pid, NT_X86_XSTATE, &iov) < 0) + if (ptrace(PTRACE_GETREGSET, pid, ptrace_regset, &iov) < 0) return -errno; return 0; } @@ -39,7 +40,7 @@ int put_fp_registers(int pid, unsigned long *regs) .iov_len = host_fp_size, }; - if (ptrace(PTRACE_SETREGSET, pid, NT_X86_XSTATE, &iov) < 0) + if (ptrace(PTRACE_SETREGSET, pid, ptrace_regset, &iov) < 0) return -errno; return 0; } @@ -58,9 +59,23 @@ int arch_init_registers(int pid) return -ENOMEM; /* GDB has x86_xsave_length, which uses x86_cpuid_count */ - ret = ptrace(PTRACE_GETREGSET, pid, NT_X86_XSTATE, &iov); + ptrace_regset = NT_X86_XSTATE; + ret = ptrace(PTRACE_GETREGSET, pid, ptrace_regset, &iov); if (ret) ret = -errno; + + if (ret == -ENODEV) { +#ifdef CONFIG_X86_32 + ptrace_regset = NT_PRXFPREG; +#else + ptrace_regset = NT_PRFPREG; +#endif + iov.iov_len = 2 * 1024 * 1024; + ret = ptrace(PTRACE_GETREGSET, pid, ptrace_regset, &iov); + if (ret) + ret = -errno; + } + munmap(iov.iov_base, 2 * 1024 * 1024); host_fp_size = iov.iov_len; diff --git a/arch/x86/um/signal.c b/arch/x86/um/signal.c index 75087e85b6fd..2934e170b0fe 100644 --- a/arch/x86/um/signal.c +++ b/arch/x86/um/signal.c @@ -187,7 +187,12 @@ static int copy_sc_to_user(struct sigcontext __user *to, * Put magic/size values for userspace. We do not bother to verify them * later on, however, userspace needs them should it try to read the * XSTATE data. And ptrace does not fill in these parts. + * + * Skip this if we do not have an XSTATE frame. */ + if (host_fp_size <= sizeof(to_fp64->fpstate)) + return 0; + BUILD_BUG_ON(sizeof(int) != FP_XSTATE_MAGIC2_SIZE); #ifdef CONFIG_X86_32 __put_user(offsetof(struct _fpstate_32, _fxsr_env) + @@ -367,11 +372,13 @@ int setup_signal_stack_si(unsigned long stack_top, struct ksignal *ksig, int err = 0, sig = ksig->sig; unsigned long fp_to; - frame = (struct rt_sigframe __user *) - round_down(stack_top - sizeof(struct rt_sigframe), 16); + frame = (void __user *)stack_top - sizeof(struct rt_sigframe); /* Add required space for math frame */ - frame = (struct rt_sigframe __user *)((unsigned long)frame - math_size); + frame = (void __user *)((unsigned long)frame - math_size); + + /* ABI requires 16 byte boundary alignment */ + frame = (void __user *)round_down((unsigned long)frame, 16); /* Subtract 128 for a red zone and 8 for proper alignment */ frame = (struct rt_sigframe __user *) ((unsigned long) frame - 128 - 8); diff --git a/arch/x86/virt/svm/sev.c b/arch/x86/virt/svm/sev.c index 1dcc027ec77e..42e74a5a7d78 100644 --- a/arch/x86/virt/svm/sev.c +++ b/arch/x86/virt/svm/sev.c @@ -505,19 +505,19 @@ static bool __init setup_rmptable(void) * described in the SNP_INIT_EX firmware command description in the SNP * firmware ABI spec. */ -static int __init snp_rmptable_init(void) +int __init snp_rmptable_init(void) { unsigned int i; u64 val; - if (!cc_platform_has(CC_ATTR_HOST_SEV_SNP)) - return 0; + if (WARN_ON_ONCE(!cc_platform_has(CC_ATTR_HOST_SEV_SNP))) + return -ENOSYS; - if (!amd_iommu_snp_en) - goto nosnp; + if (WARN_ON_ONCE(!amd_iommu_snp_en)) + return -ENOSYS; if (!setup_rmptable()) - goto nosnp; + return -ENOSYS; /* * Check if SEV-SNP is already enabled, this can happen in case of @@ -530,7 +530,7 @@ static int __init snp_rmptable_init(void) /* Zero out the RMP bookkeeping area */ if (!clear_rmptable_bookkeeping()) { free_rmp_segment_table(); - goto nosnp; + return -ENOSYS; } /* Zero out the RMP entries */ @@ -562,17 +562,8 @@ skip_enable: crash_kexec_post_notifiers = true; return 0; - -nosnp: - cc_platform_clear(CC_ATTR_HOST_SEV_SNP); - return -ENOSYS; } -/* - * This must be called after the IOMMU has been initialized. - */ -device_initcall(snp_rmptable_init); - static void set_rmp_segment_info(unsigned int segment_shift) { rmp_segment_shift = segment_shift; diff --git a/arch/x86/xen/mmu_pv.c b/arch/x86/xen/mmu_pv.c index 2c70cd35e72c..d078de2c952b 100644 --- a/arch/x86/xen/mmu_pv.c +++ b/arch/x86/xen/mmu_pv.c @@ -111,6 +111,51 @@ static pud_t level3_user_vsyscall[PTRS_PER_PUD] __page_aligned_bss; */ static DEFINE_SPINLOCK(xen_reservation_lock); +/* Protected by xen_reservation_lock. */ +#define MIN_CONTIG_ORDER 9 /* 2MB */ +static unsigned int discontig_frames_order = MIN_CONTIG_ORDER; +static unsigned long discontig_frames_early[1UL << MIN_CONTIG_ORDER] __initdata; +static unsigned long *discontig_frames __refdata = discontig_frames_early; +static bool discontig_frames_dyn; + +static int alloc_discontig_frames(unsigned int order) +{ + unsigned long *new_array, *old_array; + unsigned int old_order; + unsigned long flags; + + BUG_ON(order < MIN_CONTIG_ORDER); + BUILD_BUG_ON(sizeof(discontig_frames_early) != PAGE_SIZE); + + new_array = (unsigned long *)__get_free_pages(GFP_KERNEL, + order - MIN_CONTIG_ORDER); + if (!new_array) + return -ENOMEM; + + spin_lock_irqsave(&xen_reservation_lock, flags); + + old_order = discontig_frames_order; + + if (order > discontig_frames_order || !discontig_frames_dyn) { + if (!discontig_frames_dyn) + old_array = NULL; + else + old_array = discontig_frames; + + discontig_frames = new_array; + discontig_frames_order = order; + discontig_frames_dyn = true; + } else { + old_array = new_array; + } + + spin_unlock_irqrestore(&xen_reservation_lock, flags); + + free_pages((unsigned long)old_array, old_order - MIN_CONTIG_ORDER); + + return 0; +} + /* * Note about cr3 (pagetable base) values: * @@ -814,6 +859,9 @@ static void __init xen_after_bootmem(void) SetPagePinned(virt_to_page(level3_user_vsyscall)); #endif xen_pgd_walk(&init_mm, xen_mark_pinned, FIXADDR_TOP); + + if (alloc_discontig_frames(MIN_CONTIG_ORDER)) + BUG(); } static void xen_unpin_page(struct mm_struct *mm, struct page *page, @@ -2203,10 +2251,6 @@ void __init xen_init_mmu_ops(void) memset(dummy_mapping, 0xff, PAGE_SIZE); } -/* Protected by xen_reservation_lock. */ -#define MAX_CONTIG_ORDER 9 /* 2MB */ -static unsigned long discontig_frames[1<<MAX_CONTIG_ORDER]; - #define VOID_PTE (mfn_pte(0, __pgprot(0))) static void xen_zap_pfn_range(unsigned long vaddr, unsigned int order, unsigned long *in_frames, @@ -2323,18 +2367,25 @@ int xen_create_contiguous_region(phys_addr_t pstart, unsigned int order, unsigned int address_bits, dma_addr_t *dma_handle) { - unsigned long *in_frames = discontig_frames, out_frame; + unsigned long *in_frames, out_frame; unsigned long flags; int success; unsigned long vstart = (unsigned long)phys_to_virt(pstart); - if (unlikely(order > MAX_CONTIG_ORDER)) - return -ENOMEM; + if (unlikely(order > discontig_frames_order)) { + if (!discontig_frames_dyn) + return -ENOMEM; + + if (alloc_discontig_frames(order)) + return -ENOMEM; + } memset((void *) vstart, 0, PAGE_SIZE << order); spin_lock_irqsave(&xen_reservation_lock, flags); + in_frames = discontig_frames; + /* 1. Zap current PTEs, remembering MFNs. */ xen_zap_pfn_range(vstart, order, in_frames, NULL); @@ -2358,12 +2409,12 @@ int xen_create_contiguous_region(phys_addr_t pstart, unsigned int order, void xen_destroy_contiguous_region(phys_addr_t pstart, unsigned int order) { - unsigned long *out_frames = discontig_frames, in_frame; + unsigned long *out_frames, in_frame; unsigned long flags; int success; unsigned long vstart; - if (unlikely(order > MAX_CONTIG_ORDER)) + if (unlikely(order > discontig_frames_order)) return; vstart = (unsigned long)phys_to_virt(pstart); @@ -2371,6 +2422,8 @@ void xen_destroy_contiguous_region(phys_addr_t pstart, unsigned int order) spin_lock_irqsave(&xen_reservation_lock, flags); + out_frames = discontig_frames; + /* 1. Find start MFN of contiguous extent. */ in_frame = virt_to_mfn((void *)vstart); |