diff options
| author | Paolo Bonzini <pbonzini@redhat.com> | 2026-04-13 13:04:48 +0200 |
|---|---|---|
| committer | Paolo Bonzini <pbonzini@redhat.com> | 2026-04-13 13:04:48 +0200 |
| commit | 4a530993dafec27085321424aeab303eb0e7869e (patch) | |
| tree | 554aa71aa3efaa15a309586672d50023fd6fd96d | |
| parent | ea8bc95fbb75da215b7533c7c46f63423e84ff5e (diff) | |
| parent | e30aa03d032df0f3ee5efb1995a7a2fe662177be (diff) | |
Merge tag 'kvm-x86-vmxon-7.1' of https://github.com/kvm-x86/linux into HEAD
KVM x86 VMXON and EFER.SVME extraction for 7.1
Move _only_ VMXON+VMXOFF and EFER.SVME toggling (versus all of VMX
and SVM enabling) out of KVM and into the core kernel so that non-KVM TDX
enabling, e.g. for trusted I/O, can make SEAMCALLs without needing to ensure
KVM is fully loaded.
TIO isn't a hypervisor, and isn't trying to be a hypervisor. Specifically, TIO
should _never_ have its own VMCSes (that are visible to the host; the
TDX-Module has its own VMCSes to do SEAMCALL/SEAMRET), and so there is simply
no reason to move that functionality out of KVM.
With that out of the way, dealing with VMXON/VMXOFF and EFER.SVME is a fairly
simple refcounting game.
27 files changed, 717 insertions, 657 deletions
diff --git a/Documentation/arch/x86/tdx.rst b/Documentation/arch/x86/tdx.rst index 61670e7df2f7..ff6b110291bc 100644 --- a/Documentation/arch/x86/tdx.rst +++ b/Documentation/arch/x86/tdx.rst @@ -60,44 +60,18 @@ Besides initializing the TDX module, a per-cpu initialization SEAMCALL must be done on one cpu before any other SEAMCALLs can be made on that cpu. -The kernel provides two functions, tdx_enable() and tdx_cpu_enable() to -allow the user of TDX to enable the TDX module and enable TDX on local -cpu respectively. - -Making SEAMCALL requires VMXON has been done on that CPU. Currently only -KVM implements VMXON. For now both tdx_enable() and tdx_cpu_enable() -don't do VMXON internally (not trivial), but depends on the caller to -guarantee that. - -To enable TDX, the caller of TDX should: 1) temporarily disable CPU -hotplug; 2) do VMXON and tdx_enable_cpu() on all online cpus; 3) call -tdx_enable(). For example:: - - cpus_read_lock(); - on_each_cpu(vmxon_and_tdx_cpu_enable()); - ret = tdx_enable(); - cpus_read_unlock(); - if (ret) - goto no_tdx; - // TDX is ready to use - -And the caller of TDX must guarantee the tdx_cpu_enable() has been -successfully done on any cpu before it wants to run any other SEAMCALL. -A typical usage is do both VMXON and tdx_cpu_enable() in CPU hotplug -online callback, and refuse to online if tdx_cpu_enable() fails. - User can consult dmesg to see whether the TDX module has been initialized. If the TDX module is initialized successfully, dmesg shows something like below:: [..] virt/tdx: 262668 KBs allocated for PAMT - [..] virt/tdx: module initialized + [..] virt/tdx: TDX-Module initialized If the TDX module failed to initialize, dmesg also shows it failed to initialize:: - [..] virt/tdx: module initialization failed ... + [..] virt/tdx: TDX-Module initialization failed ... 
TDX Interaction to Other Kernel Components ------------------------------------------ @@ -129,9 +103,9 @@ CPU Hotplug ~~~~~~~~~~~ TDX module requires the per-cpu initialization SEAMCALL must be done on -one cpu before any other SEAMCALLs can be made on that cpu. The kernel -provides tdx_cpu_enable() to let the user of TDX to do it when the user -wants to use a new cpu for TDX task. +one cpu before any other SEAMCALLs can be made on that cpu. The kernel, +via the CPU hotplug framework, performs the necessary initialization when +a CPU is first brought online. TDX doesn't support physical (ACPI) CPU hotplug. During machine boot, TDX verifies all boot-time present logical CPUs are TDX compatible before diff --git a/arch/x86/events/intel/pt.c b/arch/x86/events/intel/pt.c index 44524a387c58..b5726b50e77d 100644 --- a/arch/x86/events/intel/pt.c +++ b/arch/x86/events/intel/pt.c @@ -1591,7 +1591,6 @@ void intel_pt_handle_vmx(int on) local_irq_restore(flags); } -EXPORT_SYMBOL_FOR_KVM(intel_pt_handle_vmx); /* * PMU callbacks diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index 7cf9b9899f86..809a8a1c2c80 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -40,7 +40,8 @@ #include <asm/irq_remapping.h> #include <asm/kvm_page_track.h> #include <asm/kvm_vcpu_regs.h> -#include <asm/reboot.h> +#include <asm/virt.h> + #include <hyperv/hvhdk.h> #define __KVM_HAVE_ARCH_VCPU_DEBUGFS diff --git a/arch/x86/include/asm/reboot.h b/arch/x86/include/asm/reboot.h index ecd58ea9a837..a671a1145906 100644 --- a/arch/x86/include/asm/reboot.h +++ b/arch/x86/include/asm/reboot.h @@ -25,17 +25,6 @@ void __noreturn machine_real_restart(unsigned int type); #define MRR_BIOS 0 #define MRR_APM 1 -typedef void (cpu_emergency_virt_cb)(void); -#if IS_ENABLED(CONFIG_KVM_X86) -void cpu_emergency_register_virt_callback(cpu_emergency_virt_cb *callback); -void cpu_emergency_unregister_virt_callback(cpu_emergency_virt_cb *callback); -void 
cpu_emergency_disable_virtualization(void); -#else -static inline void cpu_emergency_register_virt_callback(cpu_emergency_virt_cb *callback) {} -static inline void cpu_emergency_unregister_virt_callback(cpu_emergency_virt_cb *callback) {} -static inline void cpu_emergency_disable_virtualization(void) {} -#endif /* CONFIG_KVM_X86 */ - typedef void (*nmi_shootdown_cb)(int, struct pt_regs*); void nmi_shootdown_cpus(nmi_shootdown_cb callback); void run_crash_ipi_callback(struct pt_regs *regs); diff --git a/arch/x86/include/asm/tdx.h b/arch/x86/include/asm/tdx.h index 6b338d7f01b7..a149740b24e8 100644 --- a/arch/x86/include/asm/tdx.h +++ b/arch/x86/include/asm/tdx.h @@ -145,8 +145,6 @@ static __always_inline u64 sc_retry(sc_func_t func, u64 fn, #define seamcall(_fn, _args) sc_retry(__seamcall, (_fn), (_args)) #define seamcall_ret(_fn, _args) sc_retry(__seamcall_ret, (_fn), (_args)) #define seamcall_saved_ret(_fn, _args) sc_retry(__seamcall_saved_ret, (_fn), (_args)) -int tdx_cpu_enable(void); -int tdx_enable(void); const char *tdx_dump_mce_info(struct mce *m); const struct tdx_sys_info *tdx_get_sysinfo(void); @@ -223,8 +221,6 @@ u64 tdh_phymem_page_wbinvd_tdr(struct tdx_td *td); u64 tdh_phymem_page_wbinvd_hkid(u64 hkid, struct page *page); #else static inline void tdx_init(void) { } -static inline int tdx_cpu_enable(void) { return -ENODEV; } -static inline int tdx_enable(void) { return -ENODEV; } static inline u32 tdx_get_nr_guest_keyids(void) { return 0; } static inline const char *tdx_dump_mce_info(struct mce *m) { return NULL; } static inline const struct tdx_sys_info *tdx_get_sysinfo(void) { return NULL; } diff --git a/arch/x86/include/asm/virt.h b/arch/x86/include/asm/virt.h new file mode 100644 index 000000000000..1558a0673d06 --- /dev/null +++ b/arch/x86/include/asm/virt.h @@ -0,0 +1,26 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ +#ifndef _ASM_X86_VIRT_H +#define _ASM_X86_VIRT_H + +#include <asm/reboot.h> + +typedef void (cpu_emergency_virt_cb)(void); + +#if 
IS_ENABLED(CONFIG_KVM_X86) +extern bool virt_rebooting; + +void __init x86_virt_init(void); + +int x86_virt_get_ref(int feat); +void x86_virt_put_ref(int feat); + +int x86_virt_emergency_disable_virtualization_cpu(void); + +void x86_virt_register_emergency_callback(cpu_emergency_virt_cb *callback); +void x86_virt_unregister_emergency_callback(cpu_emergency_virt_cb *callback); +#else +static __always_inline void x86_virt_init(void) {} +static inline int x86_virt_emergency_disable_virtualization_cpu(void) { return -ENOENT; } +#endif + +#endif /* _ASM_X86_VIRT_H */ diff --git a/arch/x86/include/asm/vmx.h b/arch/x86/include/asm/vmx.h index b92ff87e3560..37080382df54 100644 --- a/arch/x86/include/asm/vmx.h +++ b/arch/x86/include/asm/vmx.h @@ -20,6 +20,17 @@ #include <asm/trapnr.h> #include <asm/vmxfeatures.h> +struct vmcs_hdr { + u32 revision_id:31; + u32 shadow_vmcs:1; +}; + +struct vmcs { + struct vmcs_hdr hdr; + u32 abort; + char data[]; +}; + #define VMCS_CONTROL_BIT(x) BIT(VMX_FEATURE_##x & 0x1f) /* diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c index ec0670114efa..e0b19c54a7ef 100644 --- a/arch/x86/kernel/cpu/common.c +++ b/arch/x86/kernel/cpu/common.c @@ -71,6 +71,7 @@ #include <asm/traps.h> #include <asm/sev.h> #include <asm/tdx.h> +#include <asm/virt.h> #include <asm/posted_intr.h> #include <asm/runtime-const.h> @@ -2161,6 +2162,7 @@ static __init void identify_boot_cpu(void) cpu_detect_tlb(&boot_cpu_data); setup_cr_pinning(); + x86_virt_init(); tsx_init(); tdx_init(); lkgs_init(); diff --git a/arch/x86/kernel/crash.c b/arch/x86/kernel/crash.c index 335fd2ee9766..cd796818d94d 100644 --- a/arch/x86/kernel/crash.c +++ b/arch/x86/kernel/crash.c @@ -42,6 +42,7 @@ #include <asm/crash.h> #include <asm/cmdline.h> #include <asm/sev.h> +#include <asm/virt.h> /* Used while preparing memory map entries for second kernel */ struct crash_memmap_data { @@ -111,7 +112,7 @@ void native_machine_crash_shutdown(struct pt_regs *regs) 
crash_smp_send_stop(); - cpu_emergency_disable_virtualization(); + x86_virt_emergency_disable_virtualization_cpu(); /* * Disable Intel PT to stop its logging diff --git a/arch/x86/kernel/reboot.c b/arch/x86/kernel/reboot.c index 6032fa9ec753..0bab8863375a 100644 --- a/arch/x86/kernel/reboot.c +++ b/arch/x86/kernel/reboot.c @@ -27,6 +27,7 @@ #include <asm/cpu.h> #include <asm/nmi.h> #include <asm/smp.h> +#include <asm/virt.h> #include <linux/ctype.h> #include <linux/mc146818rtc.h> @@ -532,51 +533,6 @@ static inline void kb_wait(void) static inline void nmi_shootdown_cpus_on_restart(void); #if IS_ENABLED(CONFIG_KVM_X86) -/* RCU-protected callback to disable virtualization prior to reboot. */ -static cpu_emergency_virt_cb __rcu *cpu_emergency_virt_callback; - -void cpu_emergency_register_virt_callback(cpu_emergency_virt_cb *callback) -{ - if (WARN_ON_ONCE(rcu_access_pointer(cpu_emergency_virt_callback))) - return; - - rcu_assign_pointer(cpu_emergency_virt_callback, callback); -} -EXPORT_SYMBOL_FOR_KVM(cpu_emergency_register_virt_callback); - -void cpu_emergency_unregister_virt_callback(cpu_emergency_virt_cb *callback) -{ - if (WARN_ON_ONCE(rcu_access_pointer(cpu_emergency_virt_callback) != callback)) - return; - - rcu_assign_pointer(cpu_emergency_virt_callback, NULL); - synchronize_rcu(); -} -EXPORT_SYMBOL_FOR_KVM(cpu_emergency_unregister_virt_callback); - -/* - * Disable virtualization, i.e. VMX or SVM, to ensure INIT is recognized during - * reboot. VMX blocks INIT if the CPU is post-VMXON, and SVM blocks INIT if - * GIF=0, i.e. if the crash occurred between CLGI and STGI. - */ -void cpu_emergency_disable_virtualization(void) -{ - cpu_emergency_virt_cb *callback; - - /* - * IRQs must be disabled as KVM enables virtualization in hardware via - * function call IPIs, i.e. IRQs need to be disabled to guarantee - * virtualization stays disabled. 
- */ - lockdep_assert_irqs_disabled(); - - rcu_read_lock(); - callback = rcu_dereference(cpu_emergency_virt_callback); - if (callback) - callback(); - rcu_read_unlock(); -} - static void emergency_reboot_disable_virtualization(void) { local_irq_disable(); @@ -588,16 +544,11 @@ static void emergency_reboot_disable_virtualization(void) * We can't take any locks and we may be on an inconsistent state, so * use NMIs as IPIs to tell the other CPUs to disable VMX/SVM and halt. * - * Do the NMI shootdown even if virtualization is off on _this_ CPU, as - * other CPUs may have virtualization enabled. + * Safely force _this_ CPU out of VMX/SVM operation, and if necessary, + * blast NMIs to force other CPUs out of VMX/SVM as well.k */ - if (rcu_access_pointer(cpu_emergency_virt_callback)) { - /* Safely force _this_ CPU out of VMX/SVM operation. */ - cpu_emergency_disable_virtualization(); - - /* Disable VMX/SVM and halt on other CPUs. */ + if (!x86_virt_emergency_disable_virtualization_cpu()) nmi_shootdown_cpus_on_restart(); - } } #else static void emergency_reboot_disable_virtualization(void) { } @@ -875,10 +826,10 @@ static int crash_nmi_callback(unsigned int val, struct pt_regs *regs) shootdown_callback(cpu, regs); /* - * Prepare the CPU for reboot _after_ invoking the callback so that the - * callback can safely use virtualization instructions, e.g. VMCLEAR. + * Disable virtualization, as both VMX and SVM can block INIT and thus + * prevent AP bringup, e.g. in a kdump kernel or in firmware. 
*/ - cpu_emergency_disable_virtualization(); + x86_virt_emergency_disable_virtualization_cpu(); atomic_dec(&waiting_for_crash_ipi); diff --git a/arch/x86/kernel/smp.c b/arch/x86/kernel/smp.c index b014e6d229f9..cbf95fe2b207 100644 --- a/arch/x86/kernel/smp.c +++ b/arch/x86/kernel/smp.c @@ -35,6 +35,7 @@ #include <asm/trace/irq_vectors.h> #include <asm/kexec.h> #include <asm/reboot.h> +#include <asm/virt.h> /* * Some notes on x86 processor bugs affecting SMP operation: @@ -124,7 +125,7 @@ static int smp_stop_nmi_callback(unsigned int val, struct pt_regs *regs) if (raw_smp_processor_id() == atomic_read(&stopping_cpu)) return NMI_HANDLED; - cpu_emergency_disable_virtualization(); + x86_virt_emergency_disable_virtualization_cpu(); stop_this_cpu(NULL); return NMI_HANDLED; @@ -136,7 +137,7 @@ static int smp_stop_nmi_callback(unsigned int val, struct pt_regs *regs) DEFINE_IDTENTRY_SYSVEC(sysvec_reboot) { apic_eoi(); - cpu_emergency_disable_virtualization(); + x86_virt_emergency_disable_virtualization_cpu(); stop_this_cpu(NULL); } diff --git a/arch/x86/kvm/svm/svm.c b/arch/x86/kvm/svm/svm.c index 07ed964dacf5..9a9e081e9554 100644 --- a/arch/x86/kvm/svm/svm.c +++ b/arch/x86/kvm/svm/svm.c @@ -44,6 +44,7 @@ #include <asm/traps.h> #include <asm/reboot.h> #include <asm/fpu/api.h> +#include <asm/virt.h> #include <trace/events/ipi.h> @@ -493,27 +494,9 @@ static __always_inline struct sev_es_save_area *sev_es_host_save_area(struct svm return &sd->save_area->host_sev_es_save; } -static inline void kvm_cpu_svm_disable(void) -{ - uint64_t efer; - - wrmsrq(MSR_VM_HSAVE_PA, 0); - rdmsrq(MSR_EFER, efer); - if (efer & EFER_SVME) { - /* - * Force GIF=1 prior to disabling SVM, e.g. to ensure INIT and - * NMI aren't blocked. 
- */ - stgi(); - wrmsrq(MSR_EFER, efer & ~EFER_SVME); - } -} - static void svm_emergency_disable_virtualization_cpu(void) { - kvm_rebooting = true; - - kvm_cpu_svm_disable(); + wrmsrq(MSR_VM_HSAVE_PA, 0); } static void svm_disable_virtualization_cpu(void) @@ -522,7 +505,8 @@ static void svm_disable_virtualization_cpu(void) if (tsc_scaling) __svm_write_tsc_multiplier(SVM_TSC_RATIO_DEFAULT); - kvm_cpu_svm_disable(); + x86_virt_put_ref(X86_FEATURE_SVM); + wrmsrq(MSR_VM_HSAVE_PA, 0); amd_pmu_disable_virt(); } @@ -531,12 +515,12 @@ static int svm_enable_virtualization_cpu(void) { struct svm_cpu_data *sd; - uint64_t efer; int me = raw_smp_processor_id(); + int r; - rdmsrq(MSR_EFER, efer); - if (efer & EFER_SVME) - return -EBUSY; + r = x86_virt_get_ref(X86_FEATURE_SVM); + if (r) + return r; sd = per_cpu_ptr(&svm_data, me); sd->asid_generation = 1; @@ -544,8 +528,6 @@ static int svm_enable_virtualization_cpu(void) sd->next_asid = sd->max_asid + 1; sd->min_asid = max_sev_asid + 1; - wrmsrq(MSR_EFER, efer | EFER_SVME); - wrmsrq(MSR_VM_HSAVE_PA, sd->save_area_pa); if (static_cpu_has(X86_FEATURE_TSCRATEMSR)) { @@ -556,7 +538,6 @@ static int svm_enable_virtualization_cpu(void) __svm_write_tsc_multiplier(SVM_TSC_RATIO_DEFAULT); } - /* * Get OSVW bits. 
* diff --git a/arch/x86/kvm/svm/vmenter.S b/arch/x86/kvm/svm/vmenter.S index 3392bcadfb89..d47c5c93c991 100644 --- a/arch/x86/kvm/svm/vmenter.S +++ b/arch/x86/kvm/svm/vmenter.S @@ -298,16 +298,16 @@ SYM_FUNC_START(__svm_vcpu_run) RESTORE_GUEST_SPEC_CTRL_BODY RESTORE_HOST_SPEC_CTRL_BODY (%_ASM_SP) -10: cmpb $0, _ASM_RIP(kvm_rebooting) +10: cmpb $0, _ASM_RIP(virt_rebooting) jne 2b ud2 -30: cmpb $0, _ASM_RIP(kvm_rebooting) +30: cmpb $0, _ASM_RIP(virt_rebooting) jne 4b ud2 -50: cmpb $0, _ASM_RIP(kvm_rebooting) +50: cmpb $0, _ASM_RIP(virt_rebooting) jne 6b ud2 -70: cmpb $0, _ASM_RIP(kvm_rebooting) +70: cmpb $0, _ASM_RIP(virt_rebooting) jne 8b ud2 @@ -394,7 +394,7 @@ SYM_FUNC_START(__svm_sev_es_vcpu_run) RESTORE_GUEST_SPEC_CTRL_BODY RESTORE_HOST_SPEC_CTRL_BODY %sil -3: cmpb $0, kvm_rebooting(%rip) +3: cmpb $0, virt_rebooting(%rip) jne 2b ud2 diff --git a/arch/x86/kvm/vmx/main.c b/arch/x86/kvm/vmx/main.c index a46ccd670785..dbebddf648be 100644 --- a/arch/x86/kvm/vmx/main.c +++ b/arch/x86/kvm/vmx/main.c @@ -29,10 +29,15 @@ static __init int vt_hardware_setup(void) if (ret) return ret; + return enable_tdx ? 
tdx_hardware_setup() : 0; +} + +static void vt_hardware_unsetup(void) +{ if (enable_tdx) - tdx_hardware_setup(); + tdx_hardware_unsetup(); - return 0; + vmx_hardware_unsetup(); } static int vt_vm_init(struct kvm *kvm) @@ -869,7 +874,7 @@ struct kvm_x86_ops vt_x86_ops __initdata = { .check_processor_compatibility = vmx_check_processor_compat, - .hardware_unsetup = vmx_hardware_unsetup, + .hardware_unsetup = vt_op(hardware_unsetup), .enable_virtualization_cpu = vmx_enable_virtualization_cpu, .disable_virtualization_cpu = vt_op(disable_virtualization_cpu), @@ -1029,7 +1034,6 @@ struct kvm_x86_init_ops vt_init_ops __initdata = { static void __exit vt_exit(void) { kvm_exit(); - tdx_cleanup(); vmx_exit(); } module_exit(vt_exit); @@ -1043,11 +1047,6 @@ static int __init vt_init(void) if (r) return r; - /* tdx_init() has been taken */ - r = tdx_bringup(); - if (r) - goto err_tdx_bringup; - /* * TDX and VMX have different vCPU structures. Calculate the * maximum size/align so that kvm_init() can use the larger @@ -1074,8 +1073,6 @@ static int __init vt_init(void) return 0; err_kvm_init: - tdx_cleanup(); -err_tdx_bringup: vmx_exit(); return r; } diff --git a/arch/x86/kvm/vmx/tdx.c b/arch/x86/kvm/vmx/tdx.c index 5e9b0c4d9af6..1e47c194af53 100644 --- a/arch/x86/kvm/vmx/tdx.c +++ b/arch/x86/kvm/vmx/tdx.c @@ -6,6 +6,7 @@ #include <linux/misc_cgroup.h> #include <linux/mmu_context.h> #include <asm/tdx.h> +#include <asm/virt.h> #include "capabilities.h" #include "mmu.h" #include "x86_ops.h" @@ -58,8 +59,6 @@ module_param_named(tdx, enable_tdx, bool, 0444); #define TDX_SHARED_BIT_PWL_5 gpa_to_gfn(BIT_ULL(51)) #define TDX_SHARED_BIT_PWL_4 gpa_to_gfn(BIT_ULL(47)) -static enum cpuhp_state tdx_cpuhp_state; - static const struct tdx_sys_info *tdx_sysinfo; void tdh_vp_rd_failed(struct vcpu_tdx *tdx, char *uclass, u32 field, u64 err) @@ -218,8 +217,6 @@ static int init_kvm_tdx_caps(const struct tdx_sys_info_td_conf *td_conf, */ static DEFINE_MUTEX(tdx_lock); -static atomic_t 
nr_configured_hkid; - static bool tdx_operand_busy(u64 err) { return (err & TDX_SEAMCALL_STATUS_MASK) == TDX_OPERAND_BUSY; @@ -267,7 +264,6 @@ static inline void tdx_hkid_free(struct kvm_tdx *kvm_tdx) { tdx_guest_keyid_free(kvm_tdx->hkid); kvm_tdx->hkid = -1; - atomic_dec(&nr_configured_hkid); misc_cg_uncharge(MISC_CG_RES_TDX, kvm_tdx->misc_cg, 1); put_misc_cg(kvm_tdx->misc_cg); kvm_tdx->misc_cg = NULL; @@ -1988,7 +1984,7 @@ int tdx_handle_exit(struct kvm_vcpu *vcpu, fastpath_t fastpath) * TDX_SEAMCALL_VMFAILINVALID. */ if (unlikely((vp_enter_ret & TDX_SW_ERROR) == TDX_SW_ERROR)) { - KVM_BUG_ON(!kvm_rebooting, vcpu->kvm); + KVM_BUG_ON(!virt_rebooting, vcpu->kvm); goto unhandled_exit; } @@ -2391,8 +2387,6 @@ static int __tdx_td_init(struct kvm *kvm, struct td_params *td_params, ret = -ENOMEM; - atomic_inc(&nr_configured_hkid); - tdr_page = alloc_page(GFP_KERNEL); if (!tdr_page) goto free_hkid; @@ -3284,106 +3278,15 @@ int tdx_gmem_max_mapping_level(struct kvm *kvm, kvm_pfn_t pfn, bool is_private) return PG_LEVEL_4K; } -static int tdx_online_cpu(unsigned int cpu) -{ - unsigned long flags; - int r; - - /* Sanity check CPU is already in post-VMXON */ - WARN_ON_ONCE(!(cr4_read_shadow() & X86_CR4_VMXE)); - - local_irq_save(flags); - r = tdx_cpu_enable(); - local_irq_restore(flags); - - return r; -} - -static int tdx_offline_cpu(unsigned int cpu) -{ - int i; - - /* No TD is running. Allow any cpu to be offline. */ - if (!atomic_read(&nr_configured_hkid)) - return 0; - - /* - * In order to reclaim TDX HKID, (i.e. when deleting guest TD), need to - * call TDH.PHYMEM.PAGE.WBINVD on all packages to program all memory - * controller with pconfig. If we have active TDX HKID, refuse to - * offline the last online cpu. - */ - for_each_online_cpu(i) { - /* - * Found another online cpu on the same package. - * Allow to offline. 
- */ - if (i != cpu && topology_physical_package_id(i) == - topology_physical_package_id(cpu)) - return 0; - } - - /* - * This is the last cpu of this package. Don't offline it. - * - * Because it's hard for human operator to understand the - * reason, warn it. - */ -#define MSG_ALLPKG_ONLINE \ - "TDX requires all packages to have an online CPU. Delete all TDs in order to offline all CPUs of a package.\n" - pr_warn_ratelimited(MSG_ALLPKG_ONLINE); - return -EBUSY; -} - -static void __do_tdx_cleanup(void) -{ - /* - * Once TDX module is initialized, it cannot be disabled and - * re-initialized again w/o runtime update (which isn't - * supported by kernel). Only need to remove the cpuhp here. - * The TDX host core code tracks TDX status and can handle - * 'multiple enabling' scenario. - */ - WARN_ON_ONCE(!tdx_cpuhp_state); - cpuhp_remove_state_nocalls_cpuslocked(tdx_cpuhp_state); - tdx_cpuhp_state = 0; -} - -static void __tdx_cleanup(void) -{ - cpus_read_lock(); - __do_tdx_cleanup(); - cpus_read_unlock(); -} - -static int __init __do_tdx_bringup(void) +void tdx_hardware_unsetup(void) { - int r; - - /* - * TDX-specific cpuhp callback to call tdx_cpu_enable() on all - * online CPUs before calling tdx_enable(), and on any new - * going-online CPU to make sure it is ready for TDX guest. - */ - r = cpuhp_setup_state_cpuslocked(CPUHP_AP_ONLINE_DYN, - "kvm/cpu/tdx:online", - tdx_online_cpu, tdx_offline_cpu); - if (r < 0) - return r; - - tdx_cpuhp_state = r; - - r = tdx_enable(); - if (r) - __do_tdx_cleanup(); - - return r; + misc_cg_set_capacity(MISC_CG_RES_TDX, 0); } -static int __init __tdx_bringup(void) +static int __init __tdx_hardware_setup(void) { const struct tdx_sys_info_td_conf *td_conf; - int r, i; + int i; for (i = 0; i < ARRAY_SIZE(tdx_uret_msrs); i++) { /* @@ -3399,34 +3302,18 @@ static int __init __tdx_bringup(void) } } - /* - * Enabling TDX requires enabling hardware virtualization first, - * as making SEAMCALLs requires CPU being in post-VMXON state. 
- */ - r = kvm_enable_virtualization(); - if (r) - return r; - - cpus_read_lock(); - r = __do_tdx_bringup(); - cpus_read_unlock(); - - if (r) - goto tdx_bringup_err; - - r = -EINVAL; /* Get TDX global information for later use */ tdx_sysinfo = tdx_get_sysinfo(); - if (WARN_ON_ONCE(!tdx_sysinfo)) - goto get_sysinfo_err; + if (!tdx_sysinfo) + return -ENODEV; /* Check TDX module and KVM capabilities */ if (!tdx_get_supported_attrs(&tdx_sysinfo->td_conf) || !tdx_get_supported_xfam(&tdx_sysinfo->td_conf)) - goto get_sysinfo_err; + return -EINVAL; if (!(tdx_sysinfo->features.tdx_features0 & MD_FIELD_ID_FEATURES0_TOPOLOGY_ENUM)) - goto get_sysinfo_err; + return -EINVAL; /* * TDX has its own limit of maximum vCPUs it can support for all @@ -3461,35 +3348,16 @@ static int __init __tdx_bringup(void) if (td_conf->max_vcpus_per_td < num_present_cpus()) { pr_err("Disable TDX: MAX_VCPU_PER_TD (%u) smaller than number of logical CPUs (%u).\n", td_conf->max_vcpus_per_td, num_present_cpus()); - goto get_sysinfo_err; + return -EINVAL; } if (misc_cg_set_capacity(MISC_CG_RES_TDX, tdx_get_nr_guest_keyids())) - goto get_sysinfo_err; + return -EINVAL; - /* - * Leave hardware virtualization enabled after TDX is enabled - * successfully. TDX CPU hotplug depends on this. 
- */ return 0; - -get_sysinfo_err: - __tdx_cleanup(); -tdx_bringup_err: - kvm_disable_virtualization(); - return r; } -void tdx_cleanup(void) -{ - if (enable_tdx) { - misc_cg_set_capacity(MISC_CG_RES_TDX, 0); - __tdx_cleanup(); - kvm_disable_virtualization(); - } -} - -int __init tdx_bringup(void) +int __init tdx_hardware_setup(void) { int r, i; @@ -3520,40 +3388,12 @@ int __init tdx_bringup(void) goto success_disable_tdx; } - if (!cpu_feature_enabled(X86_FEATURE_MOVDIR64B)) { - pr_err("tdx: MOVDIR64B is required for TDX\n"); - goto success_disable_tdx; - } - - if (!cpu_feature_enabled(X86_FEATURE_SELFSNOOP)) { - pr_err("Self-snoop is required for TDX\n"); - goto success_disable_tdx; - } - if (!cpu_feature_enabled(X86_FEATURE_TDX_HOST_PLATFORM)) { - pr_err("tdx: no TDX private KeyIDs available\n"); + pr_err("TDX not supported by the host platform\n"); goto success_disable_tdx; } - if (!enable_virt_at_load) { - pr_err("tdx: tdx requires kvm.enable_virt_at_load=1\n"); - goto success_disable_tdx; - } - - /* - * Ideally KVM should probe whether TDX module has been loaded - * first and then try to bring it up. But TDX needs to use SEAMCALL - * to probe whether the module is loaded (there is no CPUID or MSR - * for that), and making SEAMCALL requires enabling virtualization - * first, just like the rest steps of bringing up TDX module. - * - * So, for simplicity do everything in __tdx_bringup(); the first - * SEAMCALL will return -ENODEV when the module is not loaded. The - * only complication is having to make sure that initialization - * SEAMCALLs don't return TDX_SEAMCALL_VMFAILINVALID in other - * cases. 
- */ - r = __tdx_bringup(); + r = __tdx_hardware_setup(); if (r) { /* * Disable TDX only but don't fail to load module if the TDX @@ -3568,24 +3408,11 @@ int __init tdx_bringup(void) if (r == -ENODEV) goto success_disable_tdx; - enable_tdx = 0; + return r; } - return r; - -success_disable_tdx: - enable_tdx = 0; - return 0; -} - -void __init tdx_hardware_setup(void) -{ KVM_SANITY_CHECK_VM_STRUCT_SIZE(kvm_tdx); - /* - * Note, if the TDX module can't be loaded, KVM TDX support will be - * disabled but KVM will continue loading (see tdx_bringup()). - */ vt_x86_ops.vm_size = max_t(unsigned int, vt_x86_ops.vm_size, sizeof(struct kvm_tdx)); vt_x86_ops.link_external_spt = tdx_sept_link_private_spt; @@ -3593,4 +3420,9 @@ void __init tdx_hardware_setup(void) vt_x86_ops.free_external_spt = tdx_sept_free_private_spt; vt_x86_ops.remove_external_spte = tdx_sept_remove_private_spte; vt_x86_ops.protected_apic_has_interrupt = tdx_protected_apic_has_interrupt; + return 0; + +success_disable_tdx: + enable_tdx = 0; + return 0; } diff --git a/arch/x86/kvm/vmx/tdx.h b/arch/x86/kvm/vmx/tdx.h index 45b5183ccb36..b5cd2ffb303e 100644 --- a/arch/x86/kvm/vmx/tdx.h +++ b/arch/x86/kvm/vmx/tdx.h @@ -8,9 +8,8 @@ #ifdef CONFIG_KVM_INTEL_TDX #include "common.h" -void tdx_hardware_setup(void); -int tdx_bringup(void); -void tdx_cleanup(void); +int tdx_hardware_setup(void); +void tdx_hardware_unsetup(void); extern bool enable_tdx; @@ -187,9 +186,6 @@ TDX_BUILD_TDVPS_ACCESSORS(8, MANAGEMENT, management); TDX_BUILD_TDVPS_ACCESSORS(64, STATE_NON_ARCH, state_non_arch); #else -static inline int tdx_bringup(void) { return 0; } -static inline void tdx_cleanup(void) {} - #define enable_tdx 0 struct kvm_tdx { diff --git a/arch/x86/kvm/vmx/vmcs.h b/arch/x86/kvm/vmx/vmcs.h index 66d747e265b1..1f16ddeae9cb 100644 --- a/arch/x86/kvm/vmx/vmcs.h +++ b/arch/x86/kvm/vmx/vmcs.h @@ -22,17 +22,6 @@ #define VMCS12_IDX_TO_ENC(idx) ROL16(idx, 10) #define ENC_TO_VMCS12_IDX(enc) ROL16(enc, 6) -struct vmcs_hdr { - u32 
revision_id:31; - u32 shadow_vmcs:1; -}; - -struct vmcs { - struct vmcs_hdr hdr; - u32 abort; - char data[]; -}; - DECLARE_PER_CPU(struct vmcs *, current_vmcs); /* diff --git a/arch/x86/kvm/vmx/vmenter.S b/arch/x86/kvm/vmx/vmenter.S index 4426d34811fc..8a481dae9cae 100644 --- a/arch/x86/kvm/vmx/vmenter.S +++ b/arch/x86/kvm/vmx/vmenter.S @@ -310,7 +310,7 @@ SYM_INNER_LABEL_ALIGN(vmx_vmexit, SYM_L_GLOBAL) RET .Lfixup: - cmpb $0, _ASM_RIP(kvm_rebooting) + cmpb $0, _ASM_RIP(virt_rebooting) jne .Lvmfail ud2 .Lvmfail: diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c index d76a21c38506..a29896a9ef14 100644 --- a/arch/x86/kvm/vmx/vmx.c +++ b/arch/x86/kvm/vmx/vmx.c @@ -48,6 +48,7 @@ #include <asm/msr.h> #include <asm/mwait.h> #include <asm/spec-ctrl.h> +#include <asm/virt.h> #include <asm/vmx.h> #include <trace/events/ipi.h> @@ -579,7 +580,6 @@ noinline void invept_error(unsigned long ext, u64 eptp) vmx_insn_failed("invept failed: ext=0x%lx eptp=%llx\n", ext, eptp); } -static DEFINE_PER_CPU(struct vmcs *, vmxarea); DEFINE_PER_CPU(struct vmcs *, current_vmcs); /* * We maintain a per-CPU linked-list of VMCS loaded on that CPU. This is needed @@ -786,53 +786,17 @@ static int vmx_set_guest_uret_msr(struct vcpu_vmx *vmx, return ret; } -/* - * Disable VMX and clear CR4.VMXE (even if VMXOFF faults) - * - * Note, VMXOFF causes a #UD if the CPU is !post-VMXON, but it's impossible to - * atomically track post-VMXON state, e.g. this may be called in NMI context. - * Eat all faults as all other faults on VMXOFF faults are mode related, i.e. - * faults are guaranteed to be due to the !post-VMXON check unless the CPU is - * magically in RM, VM86, compat mode, or at CPL>0. 
- */ -static int kvm_cpu_vmxoff(void) -{ - asm goto("1: vmxoff\n\t" - _ASM_EXTABLE(1b, %l[fault]) - ::: "cc", "memory" : fault); - - cr4_clear_bits(X86_CR4_VMXE); - return 0; - -fault: - cr4_clear_bits(X86_CR4_VMXE); - return -EIO; -} - void vmx_emergency_disable_virtualization_cpu(void) { int cpu = raw_smp_processor_id(); struct loaded_vmcs *v; - kvm_rebooting = true; - - /* - * Note, CR4.VMXE can be _cleared_ in NMI context, but it can only be - * set in task context. If this races with VMX is disabled by an NMI, - * VMCLEAR and VMXOFF may #UD, but KVM will eat those faults due to - * kvm_rebooting set. - */ - if (!(__read_cr4() & X86_CR4_VMXE)) - return; - list_for_each_entry(v, &per_cpu(loaded_vmcss_on_cpu, cpu), loaded_vmcss_on_cpu_link) { vmcs_clear(v->vmcs); if (v->shadow_vmcs) vmcs_clear(v->shadow_vmcs); } - - kvm_cpu_vmxoff(); } static void __loaded_vmcs_clear(void *arg) @@ -2927,12 +2891,16 @@ static bool __kvm_is_vmx_supported(void) return false; } - if (!this_cpu_has(X86_FEATURE_MSR_IA32_FEAT_CTL) || - !this_cpu_has(X86_FEATURE_VMX)) { + if (!this_cpu_has(X86_FEATURE_MSR_IA32_FEAT_CTL)) { pr_err("VMX not enabled (by BIOS) in MSR_IA32_FEAT_CTL on CPU %d\n", cpu); return false; } + if (!this_cpu_has(X86_FEATURE_VMX)) { + pr_err("VMX not fully enabled on CPU %d. Check kernel logs and/or BIOS\n", cpu); + return false; + } + return true; } @@ -2984,34 +2952,9 @@ int vmx_check_processor_compat(void) return 0; } -static int kvm_cpu_vmxon(u64 vmxon_pointer) -{ - u64 msr; - - cr4_set_bits(X86_CR4_VMXE); - - asm goto("1: vmxon %[vmxon_pointer]\n\t" - _ASM_EXTABLE(1b, %l[fault]) - : : [vmxon_pointer] "m"(vmxon_pointer) - : : fault); - return 0; - -fault: - WARN_ONCE(1, "VMXON faulted, MSR_IA32_FEAT_CTL (0x3a) = 0x%llx\n", - rdmsrq_safe(MSR_IA32_FEAT_CTL, &msr) ? 
0xdeadbeef : msr); - cr4_clear_bits(X86_CR4_VMXE); - - return -EFAULT; -} - int vmx_enable_virtualization_cpu(void) { int cpu = raw_smp_processor_id(); - u64 phys_addr = __pa(per_cpu(vmxarea, cpu)); - int r; - - if (cr4_read_shadow() & X86_CR4_VMXE) - return -EBUSY; /* * This can happen if we hot-added a CPU but failed to allocate @@ -3020,15 +2963,7 @@ int vmx_enable_virtualization_cpu(void) if (kvm_is_using_evmcs() && !hv_get_vp_assist_page(cpu)) return -EFAULT; - intel_pt_handle_vmx(1); - - r = kvm_cpu_vmxon(phys_addr); - if (r) { - intel_pt_handle_vmx(0); - return r; - } - - return 0; + return x86_virt_get_ref(X86_FEATURE_VMX); } static void vmclear_local_loaded_vmcss(void) @@ -3045,12 +2980,9 @@ void vmx_disable_virtualization_cpu(void) { vmclear_local_loaded_vmcss(); - if (kvm_cpu_vmxoff()) - kvm_spurious_fault(); + x86_virt_put_ref(X86_FEATURE_VMX); hv_reset_evmcs(); - - intel_pt_handle_vmx(0); } struct vmcs *alloc_vmcs_cpu(bool shadow, int cpu, gfp_t flags) @@ -3128,47 +3060,6 @@ out_vmcs: return -ENOMEM; } -static void free_kvm_area(void) -{ - int cpu; - - for_each_possible_cpu(cpu) { - free_vmcs(per_cpu(vmxarea, cpu)); - per_cpu(vmxarea, cpu) = NULL; - } -} - -static __init int alloc_kvm_area(void) -{ - int cpu; - - for_each_possible_cpu(cpu) { - struct vmcs *vmcs; - - vmcs = alloc_vmcs_cpu(false, cpu, GFP_KERNEL); - if (!vmcs) { - free_kvm_area(); - return -ENOMEM; - } - - /* - * When eVMCS is enabled, alloc_vmcs_cpu() sets - * vmcs->revision_id to KVM_EVMCS_VERSION instead of - * revision_id reported by MSR_IA32_VMX_BASIC. - * - * However, even though not explicitly documented by - * TLFS, VMXArea passed as VMXON argument should - * still be marked with revision_id reported by - * physical CPU. 
- */ - if (kvm_is_using_evmcs()) - vmcs->hdr.revision_id = vmx_basic_vmcs_revision_id(vmcs_config.basic); - - per_cpu(vmxarea, cpu) = vmcs; - } - return 0; -} - static void fix_pmode_seg(struct kvm_vcpu *vcpu, int seg, struct kvm_segment *save) { @@ -8569,8 +8460,6 @@ void vmx_hardware_unsetup(void) if (nested) nested_vmx_hardware_unsetup(); - - free_kvm_area(); } void vmx_vm_destroy(struct kvm *kvm) @@ -8869,10 +8758,6 @@ __init int vmx_hardware_setup(void) return r; } - r = alloc_kvm_area(); - if (r) - goto err_kvm_area; - kvm_set_posted_intr_wakeup_handler(pi_wakeup_handler); /* @@ -8899,11 +8784,6 @@ __init int vmx_hardware_setup(void) kvm_caps.inapplicable_quirks &= ~KVM_X86_QUIRK_IGNORE_GUEST_PAT; return 0; - -err_kvm_area: - if (nested) - nested_vmx_hardware_unsetup(); - return r; } void vmx_exit(void) diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index d65edcf8f30d..68691163627c 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -83,6 +83,8 @@ #include <asm/intel_pt.h> #include <asm/emulate_prefix.h> #include <asm/sgx.h> +#include <asm/virt.h> + #include <clocksource/hyperv_timer.h> #define CREATE_TRACE_POINTS @@ -713,7 +715,7 @@ static void drop_user_return_notifiers(void) noinstr void kvm_spurious_fault(void) { /* Fault while not rebooting. We want the trace. 
*/ - BUG_ON(!kvm_rebooting); + BUG_ON(!virt_rebooting); } EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_spurious_fault); @@ -13125,12 +13127,12 @@ EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_vcpu_deliver_sipi_vector); void kvm_arch_enable_virtualization(void) { - cpu_emergency_register_virt_callback(kvm_x86_ops.emergency_disable_virtualization_cpu); + x86_virt_register_emergency_callback(kvm_x86_ops.emergency_disable_virtualization_cpu); } void kvm_arch_disable_virtualization(void) { - cpu_emergency_unregister_virt_callback(kvm_x86_ops.emergency_disable_virtualization_cpu); + x86_virt_unregister_emergency_callback(kvm_x86_ops.emergency_disable_virtualization_cpu); } int kvm_arch_enable_virtualization_cpu(void) @@ -13229,6 +13231,25 @@ int kvm_arch_enable_virtualization_cpu(void) return 0; } +void kvm_arch_shutdown(void) +{ + /* + * Set virt_rebooting to indicate that KVM has asynchronously disabled + * hardware virtualization, i.e. that errors and/or exceptions on SVM + * and VMX instructions are expected and should be ignored. + */ + virt_rebooting = true; + + /* + * Ensure virt_rebooting is visible before IPIs are sent to other CPUs + * to disable virtualization. Effectively pairs with the reception of + * the IPI (virt_rebooting is read in task/exception context, but only + * _needs_ to be read as %true after the IPI function callback disables + * virtualization). + */ + smp_wmb(); +} + void kvm_arch_disable_virtualization_cpu(void) { kvm_x86_call(disable_virtualization_cpu)(); @@ -13243,7 +13264,7 @@ void kvm_arch_disable_virtualization_cpu(void) * disable virtualization arrives. Handle the extreme edge case here * instead of trying to account for it in the normal flows. 
*/ - if (in_task() || WARN_ON_ONCE(!kvm_rebooting)) + if (in_task() || WARN_ON_ONCE(!virt_rebooting)) drop_user_return_notifiers(); else __module_get(THIS_MODULE); diff --git a/arch/x86/virt/Makefile b/arch/x86/virt/Makefile index ea343fc392dc..6e485751650c 100644 --- a/arch/x86/virt/Makefile +++ b/arch/x86/virt/Makefile @@ -1,2 +1,4 @@ # SPDX-License-Identifier: GPL-2.0-only obj-y += svm/ vmx/ + +obj-$(subst m,y,$(CONFIG_KVM_X86)) += hw.o
\ No newline at end of file diff --git a/arch/x86/virt/hw.c b/arch/x86/virt/hw.c new file mode 100644 index 000000000000..f647557d38ac --- /dev/null +++ b/arch/x86/virt/hw.c @@ -0,0 +1,360 @@ +// SPDX-License-Identifier: GPL-2.0-only +#include <linux/cpu.h> +#include <linux/cpumask.h> +#include <linux/errno.h> +#include <linux/kvm_types.h> +#include <linux/list.h> +#include <linux/percpu.h> + +#include <asm/perf_event.h> +#include <asm/processor.h> +#include <asm/virt.h> +#include <asm/vmx.h> + +struct x86_virt_ops { + int feature; + int (*enable_virtualization_cpu)(void); + int (*disable_virtualization_cpu)(void); + void (*emergency_disable_virtualization_cpu)(void); +}; +static struct x86_virt_ops virt_ops __ro_after_init; + +__visible bool virt_rebooting; +EXPORT_SYMBOL_FOR_KVM(virt_rebooting); + +static DEFINE_PER_CPU(int, virtualization_nr_users); + +static cpu_emergency_virt_cb __rcu *kvm_emergency_callback; + +void x86_virt_register_emergency_callback(cpu_emergency_virt_cb *callback) +{ + if (WARN_ON_ONCE(rcu_access_pointer(kvm_emergency_callback))) + return; + + rcu_assign_pointer(kvm_emergency_callback, callback); +} +EXPORT_SYMBOL_FOR_KVM(x86_virt_register_emergency_callback); + +void x86_virt_unregister_emergency_callback(cpu_emergency_virt_cb *callback) +{ + if (WARN_ON_ONCE(rcu_access_pointer(kvm_emergency_callback) != callback)) + return; + + rcu_assign_pointer(kvm_emergency_callback, NULL); + synchronize_rcu(); +} +EXPORT_SYMBOL_FOR_KVM(x86_virt_unregister_emergency_callback); + +static void x86_virt_invoke_kvm_emergency_callback(void) +{ + cpu_emergency_virt_cb *kvm_callback; + + kvm_callback = rcu_dereference(kvm_emergency_callback); + if (kvm_callback) + kvm_callback(); +} + +#if IS_ENABLED(CONFIG_KVM_INTEL) +static DEFINE_PER_CPU(struct vmcs *, root_vmcs); + +static int x86_virt_cpu_vmxon(void) +{ + u64 vmxon_pointer = __pa(per_cpu(root_vmcs, raw_smp_processor_id())); + u64 msr; + + cr4_set_bits(X86_CR4_VMXE); + + asm goto("1: vmxon 
%[vmxon_pointer]\n\t" + _ASM_EXTABLE(1b, %l[fault]) + : : [vmxon_pointer] "m"(vmxon_pointer) + : : fault); + return 0; + +fault: + WARN_ONCE(1, "VMXON faulted, MSR_IA32_FEAT_CTL (0x3a) = 0x%llx\n", + rdmsrq_safe(MSR_IA32_FEAT_CTL, &msr) ? 0xdeadbeef : msr); + cr4_clear_bits(X86_CR4_VMXE); + + return -EFAULT; +} + +static int x86_vmx_enable_virtualization_cpu(void) +{ + int r; + + if (cr4_read_shadow() & X86_CR4_VMXE) + return -EBUSY; + + intel_pt_handle_vmx(1); + + r = x86_virt_cpu_vmxon(); + if (r) { + intel_pt_handle_vmx(0); + return r; + } + + return 0; +} + +/* + * Disable VMX and clear CR4.VMXE (even if VMXOFF faults) + * + * Note, VMXOFF causes a #UD if the CPU is !post-VMXON, but it's impossible to + * atomically track post-VMXON state, e.g. this may be called in NMI context. + * Eat all faults, as all other faults on VMXOFF are mode related, i.e. + * faults are guaranteed to be due to the !post-VMXON check unless the CPU is + * magically in RM, VM86, compat mode, or at CPL>0. + */ +static int x86_vmx_disable_virtualization_cpu(void) +{ + int r = -EIO; + + asm goto("1: vmxoff\n\t" + _ASM_EXTABLE(1b, %l[fault]) + ::: "cc", "memory" : fault); + r = 0; + +fault: + cr4_clear_bits(X86_CR4_VMXE); + intel_pt_handle_vmx(0); + return r; +} + +static void x86_vmx_emergency_disable_virtualization_cpu(void) +{ + virt_rebooting = true; + + /* + * Note, CR4.VMXE can be _cleared_ in NMI context, but it can only be + * set in task context. If this races with _another_ emergency call + * from NMI context, VMCLEAR (in KVM) and VMXOFF may #UD, but KVM and + * the kernel will eat those faults due to virt_rebooting being set by + * the interrupting NMI callback. 
+ */ + if (!(__read_cr4() & X86_CR4_VMXE)) + return; + + x86_virt_invoke_kvm_emergency_callback(); + + x86_vmx_disable_virtualization_cpu(); +} + +static __init void x86_vmx_exit(void) +{ + int cpu; + + for_each_possible_cpu(cpu) { + free_page((unsigned long)per_cpu(root_vmcs, cpu)); + per_cpu(root_vmcs, cpu) = NULL; + } +} + +static __init int __x86_vmx_init(void) +{ + const struct x86_virt_ops vmx_ops = { + .feature = X86_FEATURE_VMX, + .enable_virtualization_cpu = x86_vmx_enable_virtualization_cpu, + .disable_virtualization_cpu = x86_vmx_disable_virtualization_cpu, + .emergency_disable_virtualization_cpu = x86_vmx_emergency_disable_virtualization_cpu, + }; + + u64 basic_msr; + u32 rev_id; + int cpu; + + if (!cpu_feature_enabled(X86_FEATURE_VMX)) + return -EOPNOTSUPP; + + rdmsrq(MSR_IA32_VMX_BASIC, basic_msr); + + /* IA-32 SDM Vol 3B: VMCS size is never greater than 4kB. */ + if (WARN_ON_ONCE(vmx_basic_vmcs_size(basic_msr) > PAGE_SIZE)) + return -EIO; + + /* + * Even if eVMCS is enabled (or will be enabled?), and even though not + * explicitly documented by TLFS, the root VMCS passed to VMXON should + * still be marked with the revision_id reported by the physical CPU. 
+ */ + rev_id = vmx_basic_vmcs_revision_id(basic_msr); + + for_each_possible_cpu(cpu) { + int node = cpu_to_node(cpu); + struct page *page; + struct vmcs *vmcs; + + page = __alloc_pages_node(node, GFP_KERNEL | __GFP_ZERO, 0); + if (WARN_ON_ONCE(!page)) { + x86_vmx_exit(); + return -ENOMEM; + } + + vmcs = page_address(page); + vmcs->hdr.revision_id = rev_id; + per_cpu(root_vmcs, cpu) = vmcs; + } + + memcpy(&virt_ops, &vmx_ops, sizeof(virt_ops)); + return 0; +} + +static __init int x86_vmx_init(void) +{ + int r; + + r = __x86_vmx_init(); + if (r) + setup_clear_cpu_cap(X86_FEATURE_VMX); + return r; +} +#else +static __init int x86_vmx_init(void) { return -EOPNOTSUPP; } +static __init void x86_vmx_exit(void) { } +#endif + +#if IS_ENABLED(CONFIG_KVM_AMD) +static int x86_svm_enable_virtualization_cpu(void) +{ + u64 efer; + + rdmsrq(MSR_EFER, efer); + if (efer & EFER_SVME) + return -EBUSY; + + wrmsrq(MSR_EFER, efer | EFER_SVME); + return 0; +} + +static int x86_svm_disable_virtualization_cpu(void) +{ + int r = -EIO; + u64 efer; + + /* + * Force GIF=1 prior to disabling SVM, e.g. to ensure INIT and + * NMI aren't blocked. 
+ */ + asm goto("1: stgi\n\t" + _ASM_EXTABLE(1b, %l[fault]) + ::: "memory" : fault); + r = 0; + +fault: + rdmsrq(MSR_EFER, efer); + wrmsrq(MSR_EFER, efer & ~EFER_SVME); + return r; +} + +static void x86_svm_emergency_disable_virtualization_cpu(void) +{ + u64 efer; + + virt_rebooting = true; + + rdmsrq(MSR_EFER, efer); + if (!(efer & EFER_SVME)) + return; + + x86_virt_invoke_kvm_emergency_callback(); + + x86_svm_disable_virtualization_cpu(); +} + +static __init int x86_svm_init(void) +{ + const struct x86_virt_ops svm_ops = { + .feature = X86_FEATURE_SVM, + .enable_virtualization_cpu = x86_svm_enable_virtualization_cpu, + .disable_virtualization_cpu = x86_svm_disable_virtualization_cpu, + .emergency_disable_virtualization_cpu = x86_svm_emergency_disable_virtualization_cpu, + }; + + if (!cpu_feature_enabled(X86_FEATURE_SVM) || + cc_platform_has(CC_ATTR_GUEST_MEM_ENCRYPT)) + return -EOPNOTSUPP; + + memcpy(&virt_ops, &svm_ops, sizeof(virt_ops)); + return 0; +} +#else +static __init int x86_svm_init(void) { return -EOPNOTSUPP; } +#endif + +int x86_virt_get_ref(int feat) +{ + int r; + + /* Ensure the !feature check can't get false positives. */ + BUILD_BUG_ON(!X86_FEATURE_SVM || !X86_FEATURE_VMX); + + if (!virt_ops.feature || virt_ops.feature != feat) + return -EOPNOTSUPP; + + guard(preempt)(); + + if (this_cpu_inc_return(virtualization_nr_users) > 1) + return 0; + + r = virt_ops.enable_virtualization_cpu(); + if (r) + WARN_ON_ONCE(this_cpu_dec_return(virtualization_nr_users)); + + return r; +} +EXPORT_SYMBOL_FOR_KVM(x86_virt_get_ref); + +void x86_virt_put_ref(int feat) +{ + guard(preempt)(); + + if (WARN_ON_ONCE(!this_cpu_read(virtualization_nr_users)) || + this_cpu_dec_return(virtualization_nr_users)) + return; + + BUG_ON(virt_ops.disable_virtualization_cpu() && !virt_rebooting); +} +EXPORT_SYMBOL_FOR_KVM(x86_virt_put_ref); + +/* + * Disable virtualization, i.e. VMX or SVM, to ensure INIT is recognized during + * reboot. 
VMX blocks INIT if the CPU is post-VMXON, and SVM blocks INIT if + * GIF=0, i.e. if the crash occurred between CLGI and STGI. + */ +int x86_virt_emergency_disable_virtualization_cpu(void) +{ + if (!virt_ops.feature) + return -EOPNOTSUPP; + + /* + * IRQs must be disabled as virtualization is enabled in hardware via + * function call IPIs, i.e. IRQs need to be disabled to guarantee + * virtualization stays disabled. + */ + lockdep_assert_irqs_disabled(); + + /* + * Do the NMI shootdown even if virtualization is off on _this_ CPU, as + * other CPUs may have virtualization enabled. + * + * TODO: Track whether or not virtualization might be enabled on other + * CPUs? May not be worth avoiding the NMI shootdown... + */ + virt_ops.emergency_disable_virtualization_cpu(); + return 0; +} + +void __init x86_virt_init(void) +{ + /* + * Attempt to initialize both SVM and VMX, and simply use whichever one + * is present. Refuse to enable/use SVM or VMX if both are somehow + * supported. No known CPU supports both SVM and VMX. 
+ */ + bool has_vmx = !x86_vmx_init(); + bool has_svm = !x86_svm_init(); + + if (WARN_ON_ONCE(has_vmx && has_svm)) { + x86_vmx_exit(); + memset(&virt_ops, 0, sizeof(virt_ops)); + } +} diff --git a/arch/x86/virt/vmx/tdx/tdx.c b/arch/x86/virt/vmx/tdx/tdx.c index 8b8e165a2001..cb9b3210ab71 100644 --- a/arch/x86/virt/vmx/tdx/tdx.c +++ b/arch/x86/virt/vmx/tdx/tdx.c @@ -28,6 +28,7 @@ #include <linux/log2.h> #include <linux/acpi.h> #include <linux/suspend.h> +#include <linux/syscore_ops.h> #include <linux/idr.h> #include <linux/kvm_types.h> #include <asm/page.h> @@ -39,6 +40,7 @@ #include <asm/cpu_device_id.h> #include <asm/processor.h> #include <asm/mce.h> +#include <asm/virt.h> #include "tdx.h" static u32 tdx_global_keyid __ro_after_init; @@ -51,13 +53,11 @@ static DEFINE_PER_CPU(bool, tdx_lp_initialized); static struct tdmr_info_list tdx_tdmr_list; -static enum tdx_module_status_t tdx_module_status; -static DEFINE_MUTEX(tdx_module_lock); - /* All TDX-usable memory regions. Protected by mem_hotplug_lock. */ static LIST_HEAD(tdx_memlist); -static struct tdx_sys_info tdx_sysinfo; +static struct tdx_sys_info tdx_sysinfo __ro_after_init; +static bool tdx_module_initialized __ro_after_init; typedef void (*sc_err_func_t)(u64 fn, u64 err, struct tdx_module_args *args); @@ -106,8 +106,7 @@ static __always_inline int sc_retry_prerr(sc_func_t func, /* * Do the module global initialization once and return its result. - * It can be done on any cpu. It's always called with interrupts - * disabled. + * It can be done on any cpu, and from task or IRQ context. 
*/ static int try_init_module_global(void) { @@ -116,8 +115,6 @@ static int try_init_module_global(void) static bool sysinit_done; static int sysinit_ret; - lockdep_assert_irqs_disabled(); - raw_spin_lock(&sysinit_lock); if (sysinit_done) @@ -142,26 +139,15 @@ out: } /** - * tdx_cpu_enable - Enable TDX on local cpu - * - * Do one-time TDX module per-cpu initialization SEAMCALL (and TDX module - * global initialization SEAMCALL if not done) on local cpu to make this - * cpu be ready to run any other SEAMCALLs. - * - * Always call this function via IPI function calls. - * - * Return 0 on success, otherwise errors. + * Enable VMXON and then do one-time TDX module per-cpu initialization SEAMCALL + * (and TDX module global initialization SEAMCALL if not done) on local cpu to + * make this cpu be ready to run any other SEAMCALLs. */ -int tdx_cpu_enable(void) +static int tdx_cpu_enable(void) { struct tdx_module_args args = {}; int ret; - if (!boot_cpu_has(X86_FEATURE_TDX_HOST_PLATFORM)) - return -ENODEV; - - lockdep_assert_irqs_disabled(); - if (__this_cpu_read(tdx_lp_initialized)) return 0; @@ -182,15 +168,101 @@ int tdx_cpu_enable(void) return 0; } -EXPORT_SYMBOL_FOR_KVM(tdx_cpu_enable); + +static int tdx_online_cpu(unsigned int cpu) +{ + int ret; + + ret = x86_virt_get_ref(X86_FEATURE_VMX); + if (ret) + return ret; + + ret = tdx_cpu_enable(); + if (ret) + x86_virt_put_ref(X86_FEATURE_VMX); + + return ret; +} + +static int tdx_offline_cpu(unsigned int cpu) +{ + int i; + + /* No TD is running. Allow any cpu to be offline. */ + if (ida_is_empty(&tdx_guest_keyid_pool)) + goto done; + + /* + * In order to reclaim TDX HKID, (i.e. when deleting guest TD), need to + * call TDH.PHYMEM.PAGE.WBINVD on all packages to program all memory + * controller with pconfig. If we have active TDX HKID, refuse to + * offline the last online cpu. + */ + for_each_online_cpu(i) { + /* + * Found another online cpu on the same package. + * Allow to offline. 
+ */ + if (i != cpu && topology_physical_package_id(i) == + topology_physical_package_id(cpu)) + goto done; + } + + /* + * This is the last cpu of this package. Don't offline it. + * + * Because it's hard for a human operator to understand the + * reason, print a warning. + */ +#define MSG_ALLPKG_ONLINE \ + "TDX requires all packages to have an online CPU. Delete all TDs in order to offline all CPUs of a package.\n" + pr_warn_ratelimited(MSG_ALLPKG_ONLINE); + return -EBUSY; + +done: + x86_virt_put_ref(X86_FEATURE_VMX); + return 0; +} + +static void tdx_shutdown_cpu(void *ign) +{ + x86_virt_put_ref(X86_FEATURE_VMX); +} + +static void tdx_shutdown(void *ign) +{ + on_each_cpu(tdx_shutdown_cpu, NULL, 1); +} + +static int tdx_suspend(void *ign) +{ + x86_virt_put_ref(X86_FEATURE_VMX); + return 0; +} + +static void tdx_resume(void *ign) +{ + WARN_ON_ONCE(x86_virt_get_ref(X86_FEATURE_VMX)); +} + +static const struct syscore_ops tdx_syscore_ops = { + .suspend = tdx_suspend, + .resume = tdx_resume, + .shutdown = tdx_shutdown, +}; + +static struct syscore tdx_syscore = { + .ops = &tdx_syscore_ops, +}; /* * Add a memory region as a TDX memory block. The caller must make sure * all memory regions are added in address ascending order and don't * overlap. 
*/ -static int add_tdx_memblock(struct list_head *tmb_list, unsigned long start_pfn, - unsigned long end_pfn, int nid) +static __init int add_tdx_memblock(struct list_head *tmb_list, + unsigned long start_pfn, + unsigned long end_pfn, int nid) { struct tdx_memblock *tmb; @@ -208,7 +280,7 @@ static int add_tdx_memblock(struct list_head *tmb_list, unsigned long start_pfn, return 0; } -static void free_tdx_memlist(struct list_head *tmb_list) +static __init void free_tdx_memlist(struct list_head *tmb_list) { /* @tmb_list is protected by mem_hotplug_lock */ while (!list_empty(tmb_list)) { @@ -226,7 +298,7 @@ static void free_tdx_memlist(struct list_head *tmb_list) * ranges off in a secondary structure because memblock is modified * in memory hotplug while TDX memory regions are fixed. */ -static int build_tdx_memlist(struct list_head *tmb_list) +static __init int build_tdx_memlist(struct list_head *tmb_list) { unsigned long start_pfn, end_pfn; int i, nid, ret; @@ -258,7 +330,7 @@ err: return ret; } -static int read_sys_metadata_field(u64 field_id, u64 *data) +static __init int read_sys_metadata_field(u64 field_id, u64 *data) { struct tdx_module_args args = {}; int ret; @@ -280,7 +352,7 @@ static int read_sys_metadata_field(u64 field_id, u64 *data) #include "tdx_global_metadata.c" -static int check_features(struct tdx_sys_info *sysinfo) +static __init int check_features(struct tdx_sys_info *sysinfo) { u64 tdx_features0 = sysinfo->features.tdx_features0; @@ -293,7 +365,7 @@ static int check_features(struct tdx_sys_info *sysinfo) } /* Calculate the actual TDMR size */ -static int tdmr_size_single(u16 max_reserved_per_tdmr) +static __init int tdmr_size_single(u16 max_reserved_per_tdmr) { int tdmr_sz; @@ -307,8 +379,8 @@ static int tdmr_size_single(u16 max_reserved_per_tdmr) return ALIGN(tdmr_sz, TDMR_INFO_ALIGNMENT); } -static int alloc_tdmr_list(struct tdmr_info_list *tdmr_list, - struct tdx_sys_info_tdmr *sysinfo_tdmr) +static __init int alloc_tdmr_list(struct 
tdmr_info_list *tdmr_list, + struct tdx_sys_info_tdmr *sysinfo_tdmr) { size_t tdmr_sz, tdmr_array_sz; void *tdmr_array; @@ -339,7 +411,7 @@ static int alloc_tdmr_list(struct tdmr_info_list *tdmr_list, return 0; } -static void free_tdmr_list(struct tdmr_info_list *tdmr_list) +static __init void free_tdmr_list(struct tdmr_info_list *tdmr_list) { free_pages_exact(tdmr_list->tdmrs, tdmr_list->max_tdmrs * tdmr_list->tdmr_sz); @@ -368,8 +440,8 @@ static inline u64 tdmr_end(struct tdmr_info *tdmr) * preallocated @tdmr_list, following all the special alignment * and size rules for TDMR. */ -static int fill_out_tdmrs(struct list_head *tmb_list, - struct tdmr_info_list *tdmr_list) +static __init int fill_out_tdmrs(struct list_head *tmb_list, + struct tdmr_info_list *tdmr_list) { struct tdx_memblock *tmb; int tdmr_idx = 0; @@ -445,8 +517,8 @@ static int fill_out_tdmrs(struct list_head *tmb_list, * Calculate PAMT size given a TDMR and a page size. The returned * PAMT size is always aligned up to 4K page boundary. */ -static unsigned long tdmr_get_pamt_sz(struct tdmr_info *tdmr, int pgsz, - u16 pamt_entry_size) +static __init unsigned long tdmr_get_pamt_sz(struct tdmr_info *tdmr, int pgsz, + u16 pamt_entry_size) { unsigned long pamt_sz, nr_pamt_entries; @@ -477,7 +549,7 @@ static unsigned long tdmr_get_pamt_sz(struct tdmr_info *tdmr, int pgsz, * PAMT. This node will have some memory covered by the TDMR. The * relative amount of memory covered is not considered. */ -static int tdmr_get_nid(struct tdmr_info *tdmr, struct list_head *tmb_list) +static __init int tdmr_get_nid(struct tdmr_info *tdmr, struct list_head *tmb_list) { struct tdx_memblock *tmb; @@ -506,9 +578,9 @@ static int tdmr_get_nid(struct tdmr_info *tdmr, struct list_head *tmb_list) * Allocate PAMTs from the local NUMA node of some memory in @tmb_list * within @tdmr, and set up PAMTs for @tdmr. 
*/ -static int tdmr_set_up_pamt(struct tdmr_info *tdmr, - struct list_head *tmb_list, - u16 pamt_entry_size[]) +static __init int tdmr_set_up_pamt(struct tdmr_info *tdmr, + struct list_head *tmb_list, + u16 pamt_entry_size[]) { unsigned long pamt_base[TDX_PS_NR]; unsigned long pamt_size[TDX_PS_NR]; @@ -578,7 +650,7 @@ static void tdmr_get_pamt(struct tdmr_info *tdmr, unsigned long *pamt_base, *pamt_size = pamt_sz; } -static void tdmr_do_pamt_func(struct tdmr_info *tdmr, +static __init void tdmr_do_pamt_func(struct tdmr_info *tdmr, void (*pamt_func)(unsigned long base, unsigned long size)) { unsigned long pamt_base, pamt_size; @@ -595,17 +667,17 @@ static void tdmr_do_pamt_func(struct tdmr_info *tdmr, pamt_func(pamt_base, pamt_size); } -static void free_pamt(unsigned long pamt_base, unsigned long pamt_size) +static __init void free_pamt(unsigned long pamt_base, unsigned long pamt_size) { free_contig_range(pamt_base >> PAGE_SHIFT, pamt_size >> PAGE_SHIFT); } -static void tdmr_free_pamt(struct tdmr_info *tdmr) +static __init void tdmr_free_pamt(struct tdmr_info *tdmr) { tdmr_do_pamt_func(tdmr, free_pamt); } -static void tdmrs_free_pamt_all(struct tdmr_info_list *tdmr_list) +static __init void tdmrs_free_pamt_all(struct tdmr_info_list *tdmr_list) { int i; @@ -614,9 +686,9 @@ static void tdmrs_free_pamt_all(struct tdmr_info_list *tdmr_list) } /* Allocate and set up PAMTs for all TDMRs */ -static int tdmrs_set_up_pamt_all(struct tdmr_info_list *tdmr_list, - struct list_head *tmb_list, - u16 pamt_entry_size[]) +static __init int tdmrs_set_up_pamt_all(struct tdmr_info_list *tdmr_list, + struct list_head *tmb_list, + u16 pamt_entry_size[]) { int i, ret = 0; @@ -665,12 +737,13 @@ void tdx_quirk_reset_page(struct page *page) } EXPORT_SYMBOL_FOR_KVM(tdx_quirk_reset_page); -static void tdmr_quirk_reset_pamt(struct tdmr_info *tdmr) +static __init void tdmr_quirk_reset_pamt(struct tdmr_info *tdmr) + { tdmr_do_pamt_func(tdmr, tdx_quirk_reset_paddr); } -static void 
tdmrs_quirk_reset_pamt_all(struct tdmr_info_list *tdmr_list) +static __init void tdmrs_quirk_reset_pamt_all(struct tdmr_info_list *tdmr_list) { int i; @@ -678,7 +751,7 @@ static void tdmrs_quirk_reset_pamt_all(struct tdmr_info_list *tdmr_list) tdmr_quirk_reset_pamt(tdmr_entry(tdmr_list, i)); } -static unsigned long tdmrs_count_pamt_kb(struct tdmr_info_list *tdmr_list) +static __init unsigned long tdmrs_count_pamt_kb(struct tdmr_info_list *tdmr_list) { unsigned long pamt_size = 0; int i; @@ -693,8 +766,8 @@ static unsigned long tdmrs_count_pamt_kb(struct tdmr_info_list *tdmr_list) return pamt_size / 1024; } -static int tdmr_add_rsvd_area(struct tdmr_info *tdmr, int *p_idx, u64 addr, - u64 size, u16 max_reserved_per_tdmr) +static __init int tdmr_add_rsvd_area(struct tdmr_info *tdmr, int *p_idx, + u64 addr, u64 size, u16 max_reserved_per_tdmr) { struct tdmr_reserved_area *rsvd_areas = tdmr->reserved_areas; int idx = *p_idx; @@ -727,10 +800,10 @@ static int tdmr_add_rsvd_area(struct tdmr_info *tdmr, int *p_idx, u64 addr, * those holes fall within @tdmr, set up a TDMR reserved area to cover * the hole. */ -static int tdmr_populate_rsvd_holes(struct list_head *tmb_list, - struct tdmr_info *tdmr, - int *rsvd_idx, - u16 max_reserved_per_tdmr) +static __init int tdmr_populate_rsvd_holes(struct list_head *tmb_list, + struct tdmr_info *tdmr, + int *rsvd_idx, + u16 max_reserved_per_tdmr) { struct tdx_memblock *tmb; u64 prev_end; @@ -791,10 +864,10 @@ static int tdmr_populate_rsvd_holes(struct list_head *tmb_list, * overlaps with @tdmr, set up a TDMR reserved area to cover the * overlapping part. 
*/ -static int tdmr_populate_rsvd_pamts(struct tdmr_info_list *tdmr_list, - struct tdmr_info *tdmr, - int *rsvd_idx, - u16 max_reserved_per_tdmr) +static __init int tdmr_populate_rsvd_pamts(struct tdmr_info_list *tdmr_list, + struct tdmr_info *tdmr, + int *rsvd_idx, + u16 max_reserved_per_tdmr) { int i, ret; @@ -829,7 +902,7 @@ static int tdmr_populate_rsvd_pamts(struct tdmr_info_list *tdmr_list, } /* Compare function called by sort() for TDMR reserved areas */ -static int rsvd_area_cmp_func(const void *a, const void *b) +static __init int rsvd_area_cmp_func(const void *a, const void *b) { struct tdmr_reserved_area *r1 = (struct tdmr_reserved_area *)a; struct tdmr_reserved_area *r2 = (struct tdmr_reserved_area *)b; @@ -848,10 +921,10 @@ static int rsvd_area_cmp_func(const void *a, const void *b) * Populate reserved areas for the given @tdmr, including memory holes * (via @tmb_list) and PAMTs (via @tdmr_list). */ -static int tdmr_populate_rsvd_areas(struct tdmr_info *tdmr, - struct list_head *tmb_list, - struct tdmr_info_list *tdmr_list, - u16 max_reserved_per_tdmr) +static __init int tdmr_populate_rsvd_areas(struct tdmr_info *tdmr, + struct list_head *tmb_list, + struct tdmr_info_list *tdmr_list, + u16 max_reserved_per_tdmr) { int ret, rsvd_idx = 0; @@ -876,9 +949,9 @@ static int tdmr_populate_rsvd_areas(struct tdmr_info *tdmr, * Populate reserved areas for all TDMRs in @tdmr_list, including memory * holes (via @tmb_list) and PAMTs. */ -static int tdmrs_populate_rsvd_areas_all(struct tdmr_info_list *tdmr_list, - struct list_head *tmb_list, - u16 max_reserved_per_tdmr) +static __init int tdmrs_populate_rsvd_areas_all(struct tdmr_info_list *tdmr_list, + struct list_head *tmb_list, + u16 max_reserved_per_tdmr) { int i; @@ -899,9 +972,9 @@ static int tdmrs_populate_rsvd_areas_all(struct tdmr_info_list *tdmr_list, * to cover all TDX memory regions in @tmb_list based on the TDX module * TDMR global information in @sysinfo_tdmr. 
*/ -static int construct_tdmrs(struct list_head *tmb_list, - struct tdmr_info_list *tdmr_list, - struct tdx_sys_info_tdmr *sysinfo_tdmr) +static __init int construct_tdmrs(struct list_head *tmb_list, + struct tdmr_info_list *tdmr_list, + struct tdx_sys_info_tdmr *sysinfo_tdmr) { u16 pamt_entry_size[TDX_PS_NR] = { sysinfo_tdmr->pamt_4k_entry_size, @@ -933,7 +1006,8 @@ static int construct_tdmrs(struct list_head *tmb_list, return ret; } -static int config_tdx_module(struct tdmr_info_list *tdmr_list, u64 global_keyid) +static __init int config_tdx_module(struct tdmr_info_list *tdmr_list, + u64 global_keyid) { struct tdx_module_args args = {}; u64 *tdmr_pa_array; @@ -968,7 +1042,7 @@ static int config_tdx_module(struct tdmr_info_list *tdmr_list, u64 global_keyid) return ret; } -static int do_global_key_config(void *unused) +static __init int do_global_key_config(void *unused) { struct tdx_module_args args = {}; @@ -986,7 +1060,7 @@ static int do_global_key_config(void *unused) * KVM) can ensure success by ensuring sufficient CPUs are online and * can run SEAMCALLs. 
*/ -static int config_global_keyid(void) +static __init int config_global_keyid(void) { cpumask_var_t packages; int cpu, ret = -EINVAL; @@ -1026,7 +1100,7 @@ static int config_global_keyid(void) return ret; } -static int init_tdmr(struct tdmr_info *tdmr) +static __init int init_tdmr(struct tdmr_info *tdmr) { u64 next; @@ -1057,7 +1131,7 @@ static int init_tdmr(struct tdmr_info *tdmr) return 0; } -static int init_tdmrs(struct tdmr_info_list *tdmr_list) +static __init int init_tdmrs(struct tdmr_info_list *tdmr_list) { int i; @@ -1076,7 +1150,7 @@ static int init_tdmrs(struct tdmr_info_list *tdmr_list) return 0; } -static int init_tdx_module(void) +static __init int init_tdx_module(void) { int ret; @@ -1157,67 +1231,50 @@ err_free_tdxmem: goto out_put_tdxmem; } -static int __tdx_enable(void) +static __init int tdx_enable(void) { + enum cpuhp_state state; int ret; - ret = init_tdx_module(); - if (ret) { - pr_err("module initialization failed (%d)\n", ret); - tdx_module_status = TDX_MODULE_ERROR; - return ret; + if (!cpu_feature_enabled(X86_FEATURE_TDX_HOST_PLATFORM)) { + pr_err("TDX not supported by the host platform\n"); + return -ENODEV; } - pr_info("module initialized\n"); - tdx_module_status = TDX_MODULE_INITIALIZED; - - return 0; -} + if (!cpu_feature_enabled(X86_FEATURE_XSAVE)) { + pr_err("XSAVE is required for TDX\n"); + return -EINVAL; + } -/** - * tdx_enable - Enable TDX module to make it ready to run TDX guests - * - * This function assumes the caller has: 1) held read lock of CPU hotplug - * lock to prevent any new cpu from becoming online; 2) done both VMXON - * and tdx_cpu_enable() on all online cpus. - * - * This function requires there's at least one online cpu for each CPU - * package to succeed. - * - * This function can be called in parallel by multiple callers. - * - * Return 0 if TDX is enabled successfully, otherwise error. 
- */ -int tdx_enable(void) -{ - int ret; + if (!cpu_feature_enabled(X86_FEATURE_MOVDIR64B)) { + pr_err("MOVDIR64B is required for TDX\n"); + return -EINVAL; + } - if (!boot_cpu_has(X86_FEATURE_TDX_HOST_PLATFORM)) + if (!cpu_feature_enabled(X86_FEATURE_SELFSNOOP)) { + pr_err("Self-snoop is required for TDX\n"); return -ENODEV; + } - lockdep_assert_cpus_held(); - - mutex_lock(&tdx_module_lock); + state = cpuhp_setup_state(CPUHP_AP_ONLINE_DYN, "virt/tdx:online", + tdx_online_cpu, tdx_offline_cpu); + if (state < 0) + return state; - switch (tdx_module_status) { - case TDX_MODULE_UNINITIALIZED: - ret = __tdx_enable(); - break; - case TDX_MODULE_INITIALIZED: - /* Already initialized, great, tell the caller. */ - ret = 0; - break; - default: - /* Failed to initialize in the previous attempts */ - ret = -EINVAL; - break; + ret = init_tdx_module(); + if (ret) { + pr_err("TDX-Module initialization failed (%d)\n", ret); + cpuhp_remove_state(state); + return ret; } - mutex_unlock(&tdx_module_lock); + register_syscore(&tdx_syscore); - return ret; + tdx_module_initialized = true; + pr_info("TDX-Module initialized\n"); + return 0; } -EXPORT_SYMBOL_FOR_KVM(tdx_enable); +subsys_initcall(tdx_enable); static bool is_pamt_page(unsigned long phys) { @@ -1468,15 +1525,10 @@ void __init tdx_init(void) const struct tdx_sys_info *tdx_get_sysinfo(void) { - const struct tdx_sys_info *p = NULL; - - /* Make sure all fields in @tdx_sysinfo have been populated */ - mutex_lock(&tdx_module_lock); - if (tdx_module_status == TDX_MODULE_INITIALIZED) - p = (const struct tdx_sys_info *)&tdx_sysinfo; - mutex_unlock(&tdx_module_lock); + if (!tdx_module_initialized) + return NULL; - return p; + return (const struct tdx_sys_info *)&tdx_sysinfo; } EXPORT_SYMBOL_FOR_KVM(tdx_get_sysinfo); diff --git a/arch/x86/virt/vmx/tdx/tdx.h b/arch/x86/virt/vmx/tdx/tdx.h index 82bb82be8567..dde219c823b4 100644 --- a/arch/x86/virt/vmx/tdx/tdx.h +++ b/arch/x86/virt/vmx/tdx/tdx.h @@ -91,14 +91,6 @@ struct tdmr_info { * Do 
not put any hardware-defined TDX structure representations below * this comment! */ - -/* Kernel defined TDX module status during module initialization. */ -enum tdx_module_status_t { - TDX_MODULE_UNINITIALIZED, - TDX_MODULE_INITIALIZED, - TDX_MODULE_ERROR -}; - struct tdx_memblock { struct list_head list; unsigned long start_pfn; diff --git a/arch/x86/virt/vmx/tdx/tdx_global_metadata.c b/arch/x86/virt/vmx/tdx/tdx_global_metadata.c index 13ad2663488b..360963bc9328 100644 --- a/arch/x86/virt/vmx/tdx/tdx_global_metadata.c +++ b/arch/x86/virt/vmx/tdx/tdx_global_metadata.c @@ -7,7 +7,7 @@ * Include this file to other C file instead. */ -static int get_tdx_sys_info_features(struct tdx_sys_info_features *sysinfo_features) +static __init int get_tdx_sys_info_features(struct tdx_sys_info_features *sysinfo_features) { int ret = 0; u64 val; @@ -18,7 +18,7 @@ static int get_tdx_sys_info_features(struct tdx_sys_info_features *sysinfo_featu return ret; } -static int get_tdx_sys_info_tdmr(struct tdx_sys_info_tdmr *sysinfo_tdmr) +static __init int get_tdx_sys_info_tdmr(struct tdx_sys_info_tdmr *sysinfo_tdmr) { int ret = 0; u64 val; @@ -37,7 +37,7 @@ static int get_tdx_sys_info_tdmr(struct tdx_sys_info_tdmr *sysinfo_tdmr) return ret; } -static int get_tdx_sys_info_td_ctrl(struct tdx_sys_info_td_ctrl *sysinfo_td_ctrl) +static __init int get_tdx_sys_info_td_ctrl(struct tdx_sys_info_td_ctrl *sysinfo_td_ctrl) { int ret = 0; u64 val; @@ -52,7 +52,7 @@ static int get_tdx_sys_info_td_ctrl(struct tdx_sys_info_td_ctrl *sysinfo_td_ctrl return ret; } -static int get_tdx_sys_info_td_conf(struct tdx_sys_info_td_conf *sysinfo_td_conf) +static __init int get_tdx_sys_info_td_conf(struct tdx_sys_info_td_conf *sysinfo_td_conf) { int ret = 0; u64 val; @@ -85,7 +85,7 @@ static int get_tdx_sys_info_td_conf(struct tdx_sys_info_td_conf *sysinfo_td_conf return ret; } -static int get_tdx_sys_info(struct tdx_sys_info *sysinfo) +static __init int get_tdx_sys_info(struct tdx_sys_info *sysinfo) { int ret = 0; 
diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h index 34b373bb5bd0..d2ac8777c766 100644 --- a/include/linux/kvm_host.h +++ b/include/linux/kvm_host.h @@ -1629,6 +1629,13 @@ static inline void kvm_create_vcpu_debugfs(struct kvm_vcpu *vcpu) {} #ifdef CONFIG_KVM_GENERIC_HARDWARE_ENABLING /* + * kvm_arch_shutdown() is invoked immediately prior to forcefully disabling + * hardware virtualization on all CPUs via IPI function calls (in preparation + * for shutdown or reboot), e.g. to allow arch code to prepare for disabling + * virtualization while KVM may be actively running vCPUs. + */ +void kvm_arch_shutdown(void); +/* * kvm_arch_{enable,disable}_virtualization() are called on one CPU, under * kvm_usage_lock, immediately after/before 0=>1 and 1=>0 transitions of * kvm_usage_count, i.e. at the beginning of the generic hardware enabling @@ -2301,7 +2308,6 @@ static inline bool kvm_check_request(int req, struct kvm_vcpu *vcpu) #ifdef CONFIG_KVM_GENERIC_HARDWARE_ENABLING extern bool enable_virt_at_load; -extern bool kvm_rebooting; #endif extern unsigned int halt_poll_ns; @@ -2596,12 +2602,4 @@ long kvm_arch_vcpu_pre_fault_memory(struct kvm_vcpu *vcpu, struct kvm_pre_fault_memory *range); #endif -#ifdef CONFIG_KVM_GENERIC_HARDWARE_ENABLING -int kvm_enable_virtualization(void); -void kvm_disable_virtualization(void); -#else -static inline int kvm_enable_virtualization(void) { return 0; } -static inline void kvm_disable_virtualization(void) { } -#endif - #endif diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c index 46d79fdde6f5..9faf70ccae7a 100644 --- a/virt/kvm/kvm_main.c +++ b/virt/kvm/kvm_main.c @@ -1102,6 +1102,9 @@ static inline struct kvm_io_bus *kvm_get_bus_for_destruction(struct kvm *kvm, !refcount_read(&kvm->users_count)); } +static int kvm_enable_virtualization(void); +static void kvm_disable_virtualization(void); + static struct kvm *kvm_create_vm(unsigned long type, const char *fdname) { struct kvm *kvm = kvm_arch_alloc_vm(); @@ -5578,13 
+5581,15 @@ bool __ro_after_init enable_virt_at_load = true; module_param(enable_virt_at_load, bool, 0444); EXPORT_SYMBOL_FOR_KVM_INTERNAL(enable_virt_at_load); -__visible bool kvm_rebooting; -EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_rebooting); - static DEFINE_PER_CPU(bool, virtualization_enabled); static DEFINE_MUTEX(kvm_usage_lock); static int kvm_usage_count; +__weak void kvm_arch_shutdown(void) +{ + +} + __weak void kvm_arch_enable_virtualization(void) { @@ -5638,10 +5643,9 @@ static int kvm_offline_cpu(unsigned int cpu) static void kvm_shutdown(void *data) { + kvm_arch_shutdown(); + /* - * Disable hardware virtualization and set kvm_rebooting to indicate - * that KVM has asynchronously disabled hardware virtualization, i.e. - * that relevant errors and exceptions aren't entirely unexpected. * Some flavors of hardware virtualization need to be disabled before * transferring control to firmware (to perform shutdown/reboot), e.g. * on x86, virtualization can block INIT interrupts, which are used by @@ -5650,7 +5654,6 @@ static void kvm_shutdown(void *data) * 100% comprehensive. 
*/ pr_info("kvm: exiting hardware virtualization\n"); - kvm_rebooting = true; on_each_cpu(kvm_disable_virtualization_cpu, NULL, 1); } @@ -5689,7 +5692,7 @@ static struct syscore kvm_syscore = { .ops = &kvm_syscore_ops, }; -int kvm_enable_virtualization(void) +static int kvm_enable_virtualization(void) { int r; @@ -5734,9 +5737,8 @@ err_cpuhp: --kvm_usage_count; return r; } -EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_enable_virtualization); -void kvm_disable_virtualization(void) +static void kvm_disable_virtualization(void) { guard(mutex)(&kvm_usage_lock); @@ -5747,7 +5749,6 @@ void kvm_disable_virtualization(void) cpuhp_remove_state(CPUHP_AP_KVM_ONLINE); kvm_arch_disable_virtualization(); } -EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_disable_virtualization); static int kvm_init_virtualization(void) { @@ -5763,6 +5764,14 @@ static void kvm_uninit_virtualization(void) kvm_disable_virtualization(); } #else /* CONFIG_KVM_GENERIC_HARDWARE_ENABLING */ +static int kvm_enable_virtualization(void) +{ + return 0; +} +static void kvm_disable_virtualization(void) +{ + +} static int kvm_init_virtualization(void) { return 0; |
