From 6fef518594bcb7e374f809717281bd02894929f8 Mon Sep 17 00:00:00 2001 From: Isaku Yamahata Date: Thu, 25 Apr 2024 15:07:01 -0700 Subject: KVM: x86: Add a capability to configure bus frequency for APIC timer MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Add KVM_CAP_X86_APIC_BUS_CYCLES_NS capability to configure the APIC bus clock frequency for APIC timer emulation. Allow KVM_ENABLE_CAPABILITY(KVM_CAP_X86_APIC_BUS_CYCLES_NS) to set the frequency in nanoseconds. When using this capability, the user space VMM should configure CPUID leaf 0x15 to advertise the frequency. Vishal reported that the TDX guest kernel expects a 25MHz APIC bus frequency but ends up getting interrupts at a significantly higher rate. The TDX architecture hard-codes the core crystal clock frequency to 25MHz and mandates exposing it via CPUID leaf 0x15. The TDX architecture does not allow the VMM to override the value. In addition, per Intel SDM: "The APIC timer frequency will be the processor’s bus clock or core crystal clock frequency (when TSC/core crystal clock ratio is enumerated in CPUID leaf 0x15) divided by the value specified in the divide configuration register." The resulting 25MHz APIC bus frequency conflicts with the KVM hardcoded APIC bus frequency of 1GHz. The KVM doesn't enumerate CPUID leaf 0x15 to the guest unless the user space VMM sets it using KVM_SET_CPUID. If the CPUID leaf 0x15 is enumerated, the guest kernel uses it as the APIC bus frequency. If not, the guest kernel measures the frequency based on other known timers like the ACPI timer or the legacy PIT. As reported by Vishal the TDX guest kernel expects a 25MHz timer frequency but gets timer interrupt more frequently due to the 1GHz frequency used by KVM. To ensure that the guest doesn't have a conflicting view of the APIC bus frequency, allow the userspace to tell KVM to use the same frequency that TDX mandates instead of the default 1Ghz. Reported-by: Vishal Annapurve Closes: https://lore.kernel.org/lkml/20231006011255.4163884-1-vannapurve@google.com Signed-off-by: Isaku Yamahata Reviewed-by: Rick Edgecombe Co-developed-by: Reinette Chatre Signed-off-by: Reinette Chatre Reviewed-by: Xiaoyao Li Reviewed-by: Yuan Yao Link: https://lore.kernel.org/r/6748a4c12269e756f0c48680da8ccc5367c31ce7.1714081726.git.reinette.chatre@intel.com Signed-off-by: Sean Christopherson --- include/uapi/linux/kvm.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/kvm.h b/include/uapi/linux/kvm.h index d03842abae57..ec998e6b6555 100644 --- a/include/uapi/linux/kvm.h +++ b/include/uapi/linux/kvm.h @@ -917,6 +917,7 @@ struct kvm_enable_cap { #define KVM_CAP_MEMORY_ATTRIBUTES 233 #define KVM_CAP_GUEST_MEMFD 234 #define KVM_CAP_VM_TYPES 235 +#define KVM_CAP_X86_APIC_BUS_CYCLES_NS 236 struct kvm_irq_routing_irqchip { __u32 irqchip; -- cgit v1.2.3 From 85542adb65ecd7cc0e442e8befef74f2ed07f5f6 Mon Sep 17 00:00:00 2001 From: Thomas Prescher Date: Wed, 8 May 2024 15:25:01 +0200 Subject: KVM: x86: Add KVM_RUN_X86_GUEST_MODE kvm_run flag When a vCPU is interrupted by a signal while running a nested guest, KVM will exit to userspace with L2 state. However, userspace has no way to know whether it sees L1 or L2 state (besides calling KVM_GET_STATS_FD, which does not have a stable ABI). This causes multiple problems: The simplest one is L2 state corruption when userspace marks the sregs as dirty. See this mailing list thread [1] for a complete discussion. Another problem is that if userspace decides to continue by emulating instructions, it will unknowingly emulate with L2 state as if L1 doesn't exist, which can be considered a weird guest escape. Introduce a new flag, KVM_RUN_X86_GUEST_MODE, in the kvm_run data structure, which is set when the vCPU exited while running a nested guest. Also introduce a new capability, KVM_CAP_X86_GUEST_MODE, to advertise the functionality to userspace. [1] https://lore.kernel.org/kvm/20240416123558.212040-1-julian.stecklina@cyberus-technology.de/T/#m280aadcb2e10ae02c191a7dc4ed4b711a74b1f55 Signed-off-by: Thomas Prescher Signed-off-by: Julian Stecklina Link: https://lore.kernel.org/r/20240508132502.184428-1-julian.stecklina@cyberus-technology.de Signed-off-by: Sean Christopherson --- include/uapi/linux/kvm.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/kvm.h b/include/uapi/linux/kvm.h index ec998e6b6555..a6ac00ec77ad 100644 --- a/include/uapi/linux/kvm.h +++ b/include/uapi/linux/kvm.h @@ -918,6 +918,7 @@ struct kvm_enable_cap { #define KVM_CAP_GUEST_MEMFD 234 #define KVM_CAP_VM_TYPES 235 #define KVM_CAP_X86_APIC_BUS_CYCLES_NS 236 +#define KVM_CAP_X86_GUEST_MODE 237 struct kvm_irq_routing_irqchip { __u32 irqchip; -- cgit v1.2.3 From 4b23e0c199b20fa6fe9655b3d0e12d6c6f18c27f Mon Sep 17 00:00:00 2001 From: David Matlack Date: Fri, 3 May 2024 11:17:33 -0700 Subject: KVM: Ensure new code that references immediate_exit gets extra scrutiny Ensure that any new KVM code that references immediate_exit gets extra scrutiny by renaming it to immediate_exit__unsafe in kernel code. All fields in struct kvm_run are subject to TOCTOU races since they are mapped into userspace, which may be malicious or buggy. To protect KVM, introduces a new macro that appends __unsafe to select field names in struct kvm_run, hinting to developers and reviewers that accessing such fields must be done carefully. Apply the new macro to immediate_exit, since userspace can make immediate_exit inconsistent with vcpu->wants_to_run, i.e. accessing immediate_exit directly could lead to unexpected bugs in the future. Signed-off-by: David Matlack Link: https://lore.kernel.org/r/20240503181734.1467938-3-dmatlack@google.com [sean: massage changelog] Signed-off-by: Sean Christopherson --- include/uapi/linux/kvm.h | 15 ++++++++++++++- 1 file changed, 14 insertions(+), 1 deletion(-) (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/kvm.h b/include/uapi/linux/kvm.h index d03842abae57..795773f5db63 100644 --- a/include/uapi/linux/kvm.h +++ b/include/uapi/linux/kvm.h @@ -192,11 +192,24 @@ struct kvm_xen_exit { /* Flags that describe what fields in emulation_failure hold valid data. */ #define KVM_INTERNAL_ERROR_EMULATION_FLAG_INSTRUCTION_BYTES (1ULL << 0) +/* + * struct kvm_run can be modified by userspace at any time, so KVM must be + * careful to avoid TOCTOU bugs. In order to protect KVM, HINT_UNSAFE_IN_KVM() + * renames fields in struct kvm_run from to __unsafe when + * compiled into the kernel, ensuring that any use within KVM is obvious and + * gets extra scrutiny. + */ +#ifdef __KERNEL__ +#define HINT_UNSAFE_IN_KVM(_symbol) _symbol##__unsafe +#else +#define HINT_UNSAFE_IN_KVM(_symbol) _symbol +#endif + /* for KVM_RUN, returned by mmap(vcpu_fd, offset=0) */ struct kvm_run { /* in */ __u8 request_interrupt_window; - __u8 immediate_exit; + __u8 HINT_UNSAFE_IN_KVM(immediate_exit); __u8 padding1[6]; /* out */ -- cgit v1.2.3 From bc1a5cd002116552db4c3541e91f8a5b1b0cf65d Mon Sep 17 00:00:00 2001 From: Isaku Yamahata Date: Wed, 10 Apr 2024 15:07:28 -0700 Subject: KVM: Add KVM_PRE_FAULT_MEMORY vcpu ioctl to pre-populate guest memory Add a new ioctl KVM_PRE_FAULT_MEMORY in the KVM common code. It iterates on the memory range and calls the arch-specific function. The implementation is optional and enabled by a Kconfig symbol. Suggested-by: Sean Christopherson Signed-off-by: Isaku Yamahata Reviewed-by: Rick Edgecombe Message-ID: <819322b8f25971f2b9933bfa4506e618508ad782.1712785629.git.isaku.yamahata@intel.com> Signed-off-by: Paolo Bonzini --- include/uapi/linux/kvm.h | 10 ++++++++++ 1 file changed, 10 insertions(+) (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/kvm.h b/include/uapi/linux/kvm.h index d03842abae57..e5af8c692dc0 100644 --- a/include/uapi/linux/kvm.h +++ b/include/uapi/linux/kvm.h @@ -917,6 +917,7 @@ struct kvm_enable_cap { #define KVM_CAP_MEMORY_ATTRIBUTES 233 #define KVM_CAP_GUEST_MEMFD 234 #define KVM_CAP_VM_TYPES 235 +#define KVM_CAP_PRE_FAULT_MEMORY 236 struct kvm_irq_routing_irqchip { __u32 irqchip; @@ -1548,4 +1549,13 @@ struct kvm_create_guest_memfd { __u64 reserved[6]; }; +#define KVM_PRE_FAULT_MEMORY _IOWR(KVMIO, 0xd5, struct kvm_pre_fault_memory) + +struct kvm_pre_fault_memory { + __u64 gpa; + __u64 size; + __u64 flags; + __u64 padding[5]; +}; + #endif /* __LINUX_KVM_H */ -- cgit v1.2.3 From 88caf544c9305313e1c48ac1a437faa5df8fff06 Mon Sep 17 00:00:00 2001 From: Brijesh Singh Date: Mon, 1 Jul 2024 17:31:46 -0500 Subject: KVM: SEV: Provide support for SNP_GUEST_REQUEST NAE event Version 2 of GHCB specification added support for the SNP Guest Request Message NAE event. The event allows for an SEV-SNP guest to make requests to the SEV-SNP firmware through the hypervisor using the SNP_GUEST_REQUEST API defined in the SEV-SNP firmware specification. This is used by guests primarily to request attestation reports from firmware. There are other request types are available as well, but the specifics of what guest requests are being made generally does not affect how they are handled by the hypervisor, which only serves as a proxy for the guest requests and firmware responses. Implement handling for these events. When an SNP Guest Request is issued, the guest will provide its own request/response pages, which could in theory be passed along directly to firmware. However, these pages would need special care: - Both pages are from shared guest memory, so they need to be protected from migration/etc. occurring while firmware reads/writes to them. At a minimum, this requires elevating the ref counts and potentially needing an explicit pinning of the memory. This places additional restrictions on what type of memory backends userspace can use for shared guest memory since there would be some reliance on using refcounted pages. - The response page needs to be switched to Firmware-owned state before the firmware can write to it, which can lead to potential host RMP #PFs if the guest is misbehaved and hands the host a guest page that KVM is writing to for other reasons (e.g. virtio buffers). Both of these issues can be avoided completely by using separately-allocated bounce pages for both the request/response pages and passing those to firmware instead. So that's the approach taken here. Signed-off-by: Brijesh Singh Co-developed-by: Alexey Kardashevskiy Signed-off-by: Alexey Kardashevskiy Co-developed-by: Ashish Kalra Signed-off-by: Ashish Kalra Reviewed-by: Tom Lendacky Reviewed-by: Liam Merwick [mdr: ensure FW command failures are indicated to guest, drop extended request handling to be re-written as separate patch, massage commit] Signed-off-by: Michael Roth Message-ID: <20240701223148.3798365-2-michael.roth@amd.com> Signed-off-by: Paolo Bonzini --- include/uapi/linux/sev-guest.h | 3 +++ 1 file changed, 3 insertions(+) (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/sev-guest.h b/include/uapi/linux/sev-guest.h index 154a87a1eca9..fcdfea767fca 100644 --- a/include/uapi/linux/sev-guest.h +++ b/include/uapi/linux/sev-guest.h @@ -89,6 +89,9 @@ struct snp_ext_report_req { #define SNP_GUEST_FW_ERR_MASK GENMASK_ULL(31, 0) #define SNP_GUEST_VMM_ERR_SHIFT 32 #define SNP_GUEST_VMM_ERR(x) (((u64)x) << SNP_GUEST_VMM_ERR_SHIFT) +#define SNP_GUEST_FW_ERR(x) ((x) & SNP_GUEST_FW_ERR_MASK) +#define SNP_GUEST_ERR(vmm_err, fw_err) (SNP_GUEST_VMM_ERR(vmm_err) | \ + SNP_GUEST_FW_ERR(fw_err)) #define SNP_GUEST_VMM_ERR_INVALID_LEN 1 #define SNP_GUEST_VMM_ERR_BUSY 2 -- cgit v1.2.3 From 332d2c1d713e232e163386c35a3ba0c1b90df83f Mon Sep 17 00:00:00 2001 From: Michael Roth Date: Wed, 1 May 2024 03:52:10 -0500 Subject: crypto: ccp: Add the SNP_VLEK_LOAD command When requesting an attestation report a guest is able to specify whether it wants SNP firmware to sign the report using either a Versioned Chip Endorsement Key (VCEK), which is derived from chip-unique secrets, or a Versioned Loaded Endorsement Key (VLEK) which is obtained from an AMD Key Derivation Service (KDS) and derived from seeds allocated to enrolled cloud service providers (CSPs). For VLEK keys, an SNP_VLEK_LOAD SNP firmware command is used to load them into the system after obtaining them from the KDS. Add a corresponding userspace interface so to allow the loading of VLEK keys into the system. See SEV-SNP Firmware ABI 1.54, SNP_VLEK_LOAD for more details. Reviewed-by: Tom Lendacky Signed-off-by: Michael Roth Message-ID: <20240501085210.2213060-21-michael.roth@amd.com> Signed-off-by: Paolo Bonzini --- include/uapi/linux/psp-sev.h | 27 +++++++++++++++++++++++++++ 1 file changed, 27 insertions(+) (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/psp-sev.h b/include/uapi/linux/psp-sev.h index b7a2c2ee35b7..2289b7c76c59 100644 --- a/include/uapi/linux/psp-sev.h +++ b/include/uapi/linux/psp-sev.h @@ -31,6 +31,7 @@ enum { SNP_PLATFORM_STATUS, SNP_COMMIT, SNP_SET_CONFIG, + SNP_VLEK_LOAD, SEV_MAX, }; @@ -214,6 +215,32 @@ struct sev_user_data_snp_config { __u8 rsvd1[52]; } __packed; +/** + * struct sev_data_snp_vlek_load - SNP_VLEK_LOAD structure + * + * @len: length of the command buffer read by the PSP + * @vlek_wrapped_version: version of wrapped VLEK hashstick (Must be 0h) + * @rsvd: reserved + * @vlek_wrapped_address: address of a wrapped VLEK hashstick + * (struct sev_user_data_snp_wrapped_vlek_hashstick) + */ +struct sev_user_data_snp_vlek_load { + __u32 len; /* In */ + __u8 vlek_wrapped_version; /* In */ + __u8 rsvd[3]; /* In */ + __u64 vlek_wrapped_address; /* In */ +} __packed; + +/** + * struct sev_user_data_snp_vlek_wrapped_vlek_hashstick - Wrapped VLEK data + * + * @data: Opaque data provided by AMD KDS (as described in SEV-SNP Firmware ABI + * 1.54, SNP_VLEK_LOAD) + */ +struct sev_user_data_snp_wrapped_vlek_hashstick { + __u8 data[432]; /* In */ +} __packed; + /** * struct sev_issue_cmd - SEV ioctl parameters * -- cgit v1.2.3