From fc3ba56385d03501eb582e4b86691ba378e556f9 Mon Sep 17 00:00:00 2001
From: Sean Christopherson <seanjc@google.com>
Date: Tue, 16 Dec 2025 08:17:54 -0800
Subject: KVM: nSVM: Remove a user-triggerable WARN on nested_svm_load_cr3()
 failing

Drop the WARN in svm_set_nested_state() on nested_svm_load_cr3() failing
as it is trivially easy to trigger from userspace by modifying CPUID after
loading CR3.  E.g. modifying the state restoration selftest like so:

  --- tools/testing/selftests/kvm/x86/state_test.c
  +++ tools/testing/selftests/kvm/x86/state_test.c
  @@ -280,7 +280,16 @@ int main(int argc, char *argv[])

                 /* Restore state in a new VM.  */
                  vcpu = vm_recreate_with_one_vcpu(vm);
  -               vcpu_load_state(vcpu, state);
  +
  +               if (stage == 4) {
  +                       state->sregs.cr3 = BIT(44);
  +                       vcpu_load_state(vcpu, state);
  +
  +                       vcpu_set_cpuid_property(vcpu, X86_PROPERTY_MAX_PHY_ADDR, 36);
  +                       __vcpu_nested_state_set(vcpu, &state->nested);
  +               } else {
  +                       vcpu_load_state(vcpu, state);
  +               }

                  /*
                   * Restore XSAVE state in a dummy vCPU, first without doing

generates:

  WARNING: CPU: 30 PID: 938 at arch/x86/kvm/svm/nested.c:1877 svm_set_nested_state+0x34a/0x360 [kvm_amd]
  Modules linked in: kvm_amd kvm irqbypass [last unloaded: kvm]
  CPU: 30 UID: 1000 PID: 938 Comm: state_test Tainted: G        W           6.18.0-rc7-58e10b63777d-next-vm
  Tainted: [W]=WARN
  Hardware name: QEMU Standard PC (Q35 + ICH9, 2009), BIOS 0.0.0 02/06/2015
  RIP: 0010:svm_set_nested_state+0x34a/0x360 [kvm_amd]
  Call Trace:
   <TASK>
   kvm_arch_vcpu_ioctl+0xf33/0x1700 [kvm]
   kvm_vcpu_ioctl+0x4e6/0x8f0 [kvm]
   __x64_sys_ioctl+0x8f/0xd0
   do_syscall_64+0x61/0xad0
   entry_SYSCALL_64_after_hwframe+0x4b/0x53

Simply delete the WARN instead of trying to prevent userspace from shoving
"illegal" state into CR3.  For better or worse, KVM's ABI allows userspace
to set CPUID after SREGS, and vice versa, and KVM is very permissive when
it comes to guest CPUID.  I.e. attempting to enforce the virtual CPU model
when setting CPUID could break userspace.  Given that the WARN doesn't
provide any meaningful protection for KVM or benefit for userspace, simply
drop it even though the odds of breaking userspace are minuscule.

Opportunistically delete a spurious newline.

Fixes: b222b0b88162 ("KVM: nSVM: refactor the CR3 reload on migration")
Cc: stable@vger.kernel.org
Cc: Yosry Ahmed <yosry.ahmed@linux.dev>
Reviewed-by: Yosry Ahmed <yosry.ahmed@linux.dev>
Link: https://patch.msgid.link/20251216161755.1775409-1-seanjc@google.com
Signed-off-by: Sean Christopherson <seanjc@google.com>
---
 arch/x86/kvm/svm/nested.c | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)
diff --git a/arch/x86/kvm/svm/nested.c b/arch/x86/kvm/svm/nested.c
index ba0f11c68372..9be67040e94d 100644
--- a/arch/x86/kvm/svm/nested.c
+++ b/arch/x86/kvm/svm/nested.c
@@ -1870,10 +1870,9 @@ static int svm_set_nested_state(struct kvm_vcpu *vcpu,
 	 * thus MMU might not be initialized correctly.
 	 * Set it again to fix this.
 	 */
-
 	ret = nested_svm_load_cr3(&svm->vcpu, vcpu->arch.cr3,
 				  nested_npt_enabled(svm), false);
-	if (WARN_ON_ONCE(ret))
+	if (ret)
 		goto out_free;
 
 	svm->nested.force_msr_bitmap_recalc = true;
-- 
cgit v1.2.3


From 737f2a382f89f2ff3d9d6a737004d97bfb98dc56 Mon Sep 17 00:00:00 2001
From: Sean Christopherson <seanjc@google.com>
Date: Thu, 13 Nov 2025 14:16:41 -0800
Subject: KVM: SVM: Rename "fault_address" to "gpa" in npf_interception()

Rename "fault_address" to "gpa" in KVM's #NPF handler and track it as a
gpa_t to more precisely document what type of address is being captured,
and because "gpa" is much more succinct.

No functional change intended.

Link: https://patch.msgid.link/20251113221642.1673023-2-seanjc@google.com
Signed-off-by: Sean Christopherson <seanjc@google.com>
---
 arch/x86/kvm/svm/svm.c | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/arch/x86/kvm/svm/svm.c b/arch/x86/kvm/svm/svm.c
index 24d59ccfa40d..af018c1196b5 100644
--- a/arch/x86/kvm/svm/svm.c
+++ b/arch/x86/kvm/svm/svm.c
@@ -1867,8 +1867,8 @@ static int npf_interception(struct kvm_vcpu *vcpu)
 	struct vcpu_svm *svm = to_svm(vcpu);
 	int rc;
 
-	u64 fault_address = svm->vmcb->control.exit_info_2;
 	u64 error_code = svm->vmcb->control.exit_info_1;
+	gpa_t gpa = svm->vmcb->control.exit_info_2;
 
 	/*
 	 * WARN if hardware generates a fault with an error code that collides
@@ -1882,14 +1882,14 @@ static int npf_interception(struct kvm_vcpu *vcpu)
 	if (sev_snp_guest(vcpu->kvm) && (error_code & PFERR_GUEST_ENC_MASK))
 		error_code |= PFERR_PRIVATE_ACCESS;
 
-	trace_kvm_page_fault(vcpu, fault_address, error_code);
-	rc = kvm_mmu_page_fault(vcpu, fault_address, error_code,
+	trace_kvm_page_fault(vcpu, gpa, error_code);
+	rc = kvm_mmu_page_fault(vcpu, gpa, error_code,
 				static_cpu_has(X86_FEATURE_DECODEASSISTS) ?
 				svm->vmcb->control.insn_bytes : NULL,
 				svm->vmcb->control.insn_len);
 
 	if (rc > 0 && error_code & PFERR_GUEST_RMP_MASK)
-		sev_handle_rmp_fault(vcpu, fault_address, error_code);
+		sev_handle_rmp_fault(vcpu, gpa, error_code);
 
 	return rc;
 }
-- 
cgit v1.2.3


From 01cde4eaaecaf5df158234f0a52b4a1c55796858 Mon Sep 17 00:00:00 2001
From: Sean Christopherson <seanjc@google.com>
Date: Thu, 13 Nov 2025 14:16:42 -0800
Subject: KVM: SVM: Add support for expedited writes to the fast MMIO bus

Wire up SVM's #NPF handler to fast MMIO.  While SVM doesn't provide a
dedicated exit reason, it's trivial to key off PFERR_RSVD_MASK.  Like VMX,
restrict the fast path to L1 to avoid having to deal with nGPA=>GPA
translations.

For simplicity, use the fast path if and only if the next RIP is known.
While KVM could utilize EMULTYPE_SKIP, doing so would require additional
logic to deal with SEV guests, e.g. to go down the slow path if the
instruction buffer is empty.  All modern CPUs support next RIP, and in
practice the next RIP will be available for any guest fast path.

Copy+paste the kvm_io_bus_write() + trace_kvm_fast_mmio() logic even
though KVM would ideally provide a small helper, as such a helper would
need to either be a macro or non-inline to avoid including trace.h in a
header (trace.h must not be included by x86.c prior to CREATE_TRACE_POINTS
being defined).

Link: https://patch.msgid.link/20251113221642.1673023-3-seanjc@google.com
Signed-off-by: Sean Christopherson <seanjc@google.com>
---
 arch/x86/kvm/svm/svm.c | 21 +++++++++++++++++++++
 1 file changed, 21 insertions(+)

diff --git a/arch/x86/kvm/svm/svm.c b/arch/x86/kvm/svm/svm.c
index af018c1196b5..d1ff23e02ecd 100644
--- a/arch/x86/kvm/svm/svm.c
+++ b/arch/x86/kvm/svm/svm.c
@@ -1862,6 +1862,9 @@ static int pf_interception(struct kvm_vcpu *vcpu)
 			svm->vmcb->control.insn_len);
 }
 
+static int svm_check_emulate_instruction(struct kvm_vcpu *vcpu, int emul_type,
+					 void *insn, int insn_len);
+
 static int npf_interception(struct kvm_vcpu *vcpu)
 {
 	struct vcpu_svm *svm = to_svm(vcpu);
@@ -1879,6 +1882,24 @@ static int npf_interception(struct kvm_vcpu *vcpu)
 	if (WARN_ON_ONCE(error_code & PFERR_SYNTHETIC_MASK))
 		error_code &= ~PFERR_SYNTHETIC_MASK;
 
+	/*
+	 * Expedite fast MMIO kicks if the next RIP is known and KVM is allowed
+	 * to emulate a page fault, e.g. skipping the current instruction is
+	 * wrong if the #NPF occurred while vectoring an event.
+	 */
+	if ((error_code & PFERR_RSVD_MASK) && !is_guest_mode(vcpu)) {
+		const int emul_type = EMULTYPE_PF | EMULTYPE_NO_DECODE;
+
+		if (svm_check_emulate_instruction(vcpu, emul_type, NULL, 0))
+			return 1;
+
+		if (nrips && svm->vmcb->control.next_rip &&
+		    !kvm_io_bus_write(vcpu, KVM_FAST_MMIO_BUS, gpa, 0, NULL)) {
+			trace_kvm_fast_mmio(gpa);
+			return kvm_skip_emulated_instruction(vcpu);
+		}
+	}
+
 	if (sev_snp_guest(vcpu->kvm) && (error_code & PFERR_GUEST_ENC_MASK))
 		error_code |= PFERR_PRIVATE_ACCESS;
 
-- 
cgit v1.2.3


From 1d1722e52fcd70deb53d8c192f958fe34be14f5e Mon Sep 17 00:00:00 2001
From: Kevin Cheng <chengkev@google.com>
Date: Mon, 15 Dec 2025 19:25:10 +0000
Subject: KVM: SVM: Don't allow L1 intercepts for instructions not advertised

If a feature is not advertised in the guest's CPUID, prevent L1 from
intercepting the unsupported instructions by clearing the corresponding
intercept in KVM's cached vmcb12.

When an L2 guest executes an instruction that is not advertised to L1,
we expect a #UD exception to be injected by L0. However, the nested svm
exit handler first checks if the instruction intercept is set in vmcb12,
and if so, synthesizes an exit from L2 to L1 instead of a #UD exception.
If a feature is not advertised, the L1 intercept should be ignored.

While creating KVM's cached vmcb12, sanitize the intercepts for
instructions that are not advertised in the guest CPUID. This
effectively ignores the L1 intercept on nested vm exit handling. It also
ignores the L1 intercept when computing the intercepts in vmcb02, so if
L0 (for some reason) does not intercept the instruction, KVM won't
intercept it at all.

Signed-off-by: Kevin Cheng <chengkev@google.com>
Co-developed-by: Sean Christopherson <seanjc@google.com>
Reviewed-by: Yosry Ahmed <yosry.ahmed@linux.dev>
Link: https://patch.msgid.link/20251215192510.2300816-1-chengkev@google.com
Signed-off-by: Sean Christopherson <seanjc@google.com>
---
 arch/x86/kvm/svm/nested.c | 19 +++++++++++++++++++
 arch/x86/kvm/svm/svm.h    | 35 +++++++++++++++++++++++++++--------
 2 files changed, 46 insertions(+), 8 deletions(-)

diff --git a/arch/x86/kvm/svm/nested.c b/arch/x86/kvm/svm/nested.c
index 9be67040e94d..aa1bea134ace 100644
--- a/arch/x86/kvm/svm/nested.c
+++ b/arch/x86/kvm/svm/nested.c
@@ -403,6 +403,19 @@ static bool nested_vmcb_check_controls(struct kvm_vcpu *vcpu)
 	return __nested_vmcb_check_controls(vcpu, ctl);
 }
 
+/*
+ * If a feature is not advertised to L1, clear the corresponding vmcb12
+ * intercept.
+ */
+#define __nested_svm_sanitize_intercept(__vcpu, __control, fname, iname)	\
+do {										\
+	if (!guest_cpu_cap_has(__vcpu, X86_FEATURE_##fname))			\
+		vmcb12_clr_intercept(__control, INTERCEPT_##iname);		\
+} while (0)
+
+#define nested_svm_sanitize_intercept(__vcpu, __control, name)			\
+	__nested_svm_sanitize_intercept(__vcpu, __control, name, name)
+
 static
 void __nested_copy_vmcb_control_to_cache(struct kvm_vcpu *vcpu,
 					 struct vmcb_ctrl_area_cached *to,
@@ -413,6 +426,12 @@ void __nested_copy_vmcb_control_to_cache(struct kvm_vcpu *vcpu,
 	for (i = 0; i < MAX_INTERCEPT; i++)
 		to->intercepts[i] = from->intercepts[i];
 
+	__nested_svm_sanitize_intercept(vcpu, to, XSAVE, XSETBV);
+	nested_svm_sanitize_intercept(vcpu, to, INVPCID);
+	nested_svm_sanitize_intercept(vcpu, to, RDTSCP);
+	nested_svm_sanitize_intercept(vcpu, to, SKINIT);
+	nested_svm_sanitize_intercept(vcpu, to, RDPRU);
+
 	to->iopm_base_pa        = from->iopm_base_pa;
 	to->msrpm_base_pa       = from->msrpm_base_pa;
 	to->tsc_offset          = from->tsc_offset;
diff --git a/arch/x86/kvm/svm/svm.h b/arch/x86/kvm/svm/svm.h
index 01be93a53d07..806e68ba821b 100644
--- a/arch/x86/kvm/svm/svm.h
+++ b/arch/x86/kvm/svm/svm.h
@@ -434,28 +434,47 @@ static __always_inline struct vcpu_svm *to_svm(struct kvm_vcpu *vcpu)
  */
 #define SVM_REGS_LAZY_LOAD_SET	(1 << VCPU_EXREG_PDPTR)
 
-static inline void vmcb_set_intercept(struct vmcb_control_area *control, u32 bit)
+static inline void __vmcb_set_intercept(unsigned long *intercepts, u32 bit)
 {
 	WARN_ON_ONCE(bit >= 32 * MAX_INTERCEPT);
-	__set_bit(bit, (unsigned long *)&control->intercepts);
+	__set_bit(bit, intercepts);
 }
 
-static inline void vmcb_clr_intercept(struct vmcb_control_area *control, u32 bit)
+static inline void __vmcb_clr_intercept(unsigned long *intercepts, u32 bit)
 {
 	WARN_ON_ONCE(bit >= 32 * MAX_INTERCEPT);
-	__clear_bit(bit, (unsigned long *)&control->intercepts);
+	__clear_bit(bit, intercepts);
 }
 
-static inline bool vmcb_is_intercept(struct vmcb_control_area *control, u32 bit)
+static inline bool __vmcb_is_intercept(unsigned long *intercepts, u32 bit)
 {
 	WARN_ON_ONCE(bit >= 32 * MAX_INTERCEPT);
-	return test_bit(bit, (unsigned long *)&control->intercepts);
+	return test_bit(bit, intercepts);
+}
+
+static inline void vmcb_set_intercept(struct vmcb_control_area *control, u32 bit)
+{
+	__vmcb_set_intercept((unsigned long *)&control->intercepts, bit);
+}
+
+static inline void vmcb_clr_intercept(struct vmcb_control_area *control, u32 bit)
+{
+	__vmcb_clr_intercept((unsigned long *)&control->intercepts, bit);
+}
+
+static inline bool vmcb_is_intercept(struct vmcb_control_area *control, u32 bit)
+{
+	return __vmcb_is_intercept((unsigned long *)&control->intercepts, bit);
+}
+
+static inline void vmcb12_clr_intercept(struct vmcb_ctrl_area_cached *control, u32 bit)
+{
+	__vmcb_clr_intercept((unsigned long *)&control->intercepts, bit);
 }
 
 static inline bool vmcb12_is_intercept(struct vmcb_ctrl_area_cached *control, u32 bit)
 {
-	WARN_ON_ONCE(bit >= 32 * MAX_INTERCEPT);
-	return test_bit(bit, (unsigned long *)&control->intercepts);
+	return __vmcb_is_intercept((unsigned long *)&control->intercepts, bit);
 }
 
 static inline void set_exception_intercept(struct vcpu_svm *svm, u32 bit)
-- 
cgit v1.2.3


From db5e82496492b4890b1c3356581c016767ed527f Mon Sep 17 00:00:00 2001
From: Amit Shah <amit.shah@amd.com>
Date: Fri, 7 Nov 2025 10:32:39 +0100
Subject: KVM: SVM: Virtualize and advertise support for ERAPS
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

AMD CPUs with the Enhanced Return Address Predictor Security (ERAPS)
feature (available on Zen5+) obviate the need for FILL_RETURN_BUFFER
sequences right after VMEXITs.  ERAPS adds guest/host tags to entries in
the RSB (a.k.a. RAP).  This helps with speculation protection across the
VM boundary, and it also preserves host and guest entries in the RSB that
can improve software performance (which would otherwise be flushed due to
the FILL_RETURN_BUFFER sequences).

Importantly, ERAPS also improves cross-domain security by clearing the RAP
in certain situations.  Specifically, the RAP is cleared in response to
actions that are typically tied to software context switching between
tasks.  Per the APM:

  The ERAPS feature eliminates the need to execute CALL instructions to
  clear the return address predictor in most cases. On processors that
  support ERAPS, return addresses from CALL instructions executed in host
  mode are not used in guest mode, and vice versa. Additionally, the
  return address predictor is cleared in all cases when the TLB is
  implicitly invalidated and in the following cases:

  • MOV CR3 instruction
  • INVPCID other than single address invalidation (operation type 0)

ERAPS also allows CPUs to extend the size of the RSB/RAP from the older
standard (of 32 entries) to a new size, enumerated in CPUID leaf
0x80000021:EBX bits 23:16 (64 entries in Zen5 CPUs).

In hardware, ERAPS is always-on; when running in host context, the CPU
uses the full RSB/RAP size without any software changes necessary.
However, when running in guest context, the CPU utilizes the full size of
the RSB/RAP if and only if the new ALLOW_LARGER_RAP flag is set in the
VMCB; if the flag is not set, the CPU limits itself to the historical size
of 32 entries.

Requiring software to opt-in for guest usage of RAPs larger than 32 entries
allows hypervisors, i.e. KVM, to emulate the aforementioned conditions in
which the RAP is cleared as well as the guest/host split.  E.g. if the CPU
unconditionally used the full RAP for guests, failure to clear the RAP on
transitions between L1 or L2, or on emulated guest TLB flushes, would
expose the guest to RAP-based attacks as a guest without support for ERAPS
wouldn't know that its FILL_RETURN_BUFFER sequence is insufficient.

Address the ~two broad categories of ERAPS emulation, and advertise
ERAPS support to userspace, along with the RAP size enumerated in CPUID.

1. Architectural RAP clearing: as above, CPUs with ERAPS clear RAP entries
   on several conditions, including CR3 updates.  To handle scenarios
   where a relevant operation is handled in common code (emulation of
   INVPCID and to a lesser extent MOV CR3), piggyback VCPU_EXREG_CR3 and
   create an alias, VCPU_EXREG_ERAPS.  SVM doesn't utilize CR3 dirty
   tracking, and so for all intents and purposes VCPU_EXREG_CR3 is unused.
   Aliasing VCPU_EXREG_ERAPS ensures that any flow that writes CR3 will
   also clear the guest's RAP, and allows common x86 to mark ERAPS vCPUs
   as needing a RAP clear without having to add a new request (or other
   mechanism).

2. Nested guests: the ERAPS feature adds host/guest tagging to entries
   in the RSB, but does not distinguish between the guest ASIDs.  To
   prevent the case of an L2 guest poisoning the RSB to attack the L1
   guest, the CPU exposes a new VMCB bit (CLEAR_RAP).  The next
   VMRUN with a VMCB that has this bit set causes the CPU to flush the
   RSB before entering the guest context.  Set the bit in VMCB01 after a
   nested #VMEXIT to ensure the next time the L1 guest runs, its RSB
   contents aren't polluted by the L2's contents.  Similarly, before
   entry into a nested guest, set the bit for VMCB02, so that the L1
   guest's RSB contents are not leaked/used in the L2 context.

Enable ALLOW_LARGER_RAP (and emulate RAP clears) if and only if ERAPS is
exposed to the guest.  Enabling ALLOW_LARGER_RAP unconditionally wouldn't
cause any functional issues, but ignoring userspace's (and L1's) desires
would put KVM into a grey area, which is especially undesirable due to the
potential security implications.  E.g. if a use case wants to have L1 do
manual RAP clearing even when ERAPS is present in hardware, enabling
ALLOW_LARGER_RAP could result in L1 leaving stale entries in the RAP.

ERAPS is documented in AMD APM Vol 2 (Pub 24593), in revisions 3.43 and
later.

Signed-off-by: Amit Shah <amit.shah@amd.com>
Co-developed-by: Sean Christopherson <seanjc@google.com>
Signed-off-by: Sean Christopherson <seanjc@google.com>
Reviewed-by: Amit Shah <amit.shah@amd.com>
Link: https://patch.msgid.link/aR913X8EqO6meCqa@google.com
---
 arch/x86/include/asm/cpufeatures.h |  1 +
 arch/x86/include/asm/kvm_host.h    |  8 ++++++++
 arch/x86/include/asm/svm.h         |  6 +++++-
 arch/x86/kvm/cpuid.c               |  9 ++++++++-
 arch/x86/kvm/svm/nested.c          | 18 ++++++++++++++++++
 arch/x86/kvm/svm/svm.c             | 25 ++++++++++++++++++++++++-
 arch/x86/kvm/svm/svm.h             |  1 +
 arch/x86/kvm/x86.c                 | 12 ++++++++++++
 8 files changed, 77 insertions(+), 3 deletions(-)

diff --git a/arch/x86/include/asm/cpufeatures.h b/arch/x86/include/asm/cpufeatures.h
index c3b53beb1300..81f7b3b91986 100644
--- a/arch/x86/include/asm/cpufeatures.h
+++ b/arch/x86/include/asm/cpufeatures.h
@@ -472,6 +472,7 @@
 #define X86_FEATURE_GP_ON_USER_CPUID	(20*32+17) /* User CPUID faulting */
 
 #define X86_FEATURE_PREFETCHI		(20*32+20) /* Prefetch Data/Instruction to Cache Level */
+#define X86_FEATURE_ERAPS		(20*32+24) /* Enhanced Return Address Predictor Security */
 #define X86_FEATURE_SBPB		(20*32+27) /* Selective Branch Prediction Barrier */
 #define X86_FEATURE_IBPB_BRTYPE		(20*32+28) /* MSR_PRED_CMD[IBPB] flushes all branch type predictions */
 #define X86_FEATURE_SRSO_NO		(20*32+29) /* CPU is not affected by SRSO */
diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index 5a3bfa293e8b..0353d8b6988c 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -195,7 +195,15 @@ enum kvm_reg {
 
 	VCPU_EXREG_PDPTR = NR_VCPU_REGS,
 	VCPU_EXREG_CR0,
+	/*
+	 * Alias AMD's ERAPS (not a real register) to CR3 so that common code
+	 * can trigger emulation of the RAP (Return Address Predictor) with
+	 * minimal support required in common code.  Piggyback CR3 as the RAP
+	 * is cleared on writes to CR3, i.e. marking CR3 dirty will naturally
+	 * mark ERAPS dirty as well.
+	 */
 	VCPU_EXREG_CR3,
+	VCPU_EXREG_ERAPS = VCPU_EXREG_CR3,
 	VCPU_EXREG_CR4,
 	VCPU_EXREG_RFLAGS,
 	VCPU_EXREG_SEGMENTS,
diff --git a/arch/x86/include/asm/svm.h b/arch/x86/include/asm/svm.h
index 56aa99503dc4..50ece197c98a 100644
--- a/arch/x86/include/asm/svm.h
+++ b/arch/x86/include/asm/svm.h
@@ -131,7 +131,8 @@ struct __attribute__ ((__packed__)) vmcb_control_area {
 	u64 tsc_offset;
 	u32 asid;
 	u8 tlb_ctl;
-	u8 reserved_2[3];
+	u8 erap_ctl;
+	u8 reserved_2[2];
 	u32 int_ctl;
 	u32 int_vector;
 	u32 int_state;
@@ -182,6 +183,9 @@ struct __attribute__ ((__packed__)) vmcb_control_area {
 #define TLB_CONTROL_FLUSH_ASID 3
 #define TLB_CONTROL_FLUSH_ASID_LOCAL 7
 
+#define ERAP_CONTROL_ALLOW_LARGER_RAP BIT(0)
+#define ERAP_CONTROL_CLEAR_RAP BIT(1)
+
 #define V_TPR_MASK 0x0f
 
 #define V_IRQ_SHIFT 8
diff --git a/arch/x86/kvm/cpuid.c b/arch/x86/kvm/cpuid.c
index 88a5426674a1..c590a5bd3196 100644
--- a/arch/x86/kvm/cpuid.c
+++ b/arch/x86/kvm/cpuid.c
@@ -1223,6 +1223,7 @@ void kvm_set_cpu_caps(void)
 		/* PrefetchCtlMsr */
 		/* GpOnUserCpuid */
 		/* EPSF */
+		F(ERAPS),
 		SYNTHESIZED_F(SBPB),
 		SYNTHESIZED_F(IBPB_BRTYPE),
 		SYNTHESIZED_F(SRSO_NO),
@@ -1803,8 +1804,14 @@ static inline int __do_cpuid_func(struct kvm_cpuid_array *array, u32 function)
 		entry->eax = entry->ebx = entry->ecx = entry->edx = 0;
 		break;
 	case 0x80000021:
-		entry->ebx = entry->edx = 0;
+		entry->edx = 0;
 		cpuid_entry_override(entry, CPUID_8000_0021_EAX);
+
+		if (kvm_cpu_cap_has(X86_FEATURE_ERAPS))
+			entry->ebx &= GENMASK(23, 16);
+		else
+			entry->ebx = 0;
+
 		cpuid_entry_override(entry, CPUID_8000_0021_ECX);
 		break;
 	/* AMD Extended Performance Monitoring and Debug */
diff --git a/arch/x86/kvm/svm/nested.c b/arch/x86/kvm/svm/nested.c
index aa1bea134ace..5a1e1164c197 100644
--- a/arch/x86/kvm/svm/nested.c
+++ b/arch/x86/kvm/svm/nested.c
@@ -436,6 +436,7 @@ void __nested_copy_vmcb_control_to_cache(struct kvm_vcpu *vcpu,
 	to->msrpm_base_pa       = from->msrpm_base_pa;
 	to->tsc_offset          = from->tsc_offset;
 	to->tlb_ctl             = from->tlb_ctl;
+	to->erap_ctl            = from->erap_ctl;
 	to->int_ctl             = from->int_ctl;
 	to->int_vector          = from->int_vector;
 	to->int_state           = from->int_state;
@@ -885,6 +886,19 @@ static void nested_vmcb02_prepare_control(struct vcpu_svm *svm,
 		}
 	}
 
+	/*
+	 * Take ALLOW_LARGER_RAP from vmcb12 even though it should be safe to
+	 * let L2 use a larger RAP since KVM will emulate the necessary clears,
+	 * as it's possible L1 deliberately wants to restrict L2 to the legacy
+	 * RAP size.  Unconditionally clear the RAP on nested VMRUN, as KVM is
+	 * responsible for emulating the host vs. guest tags (L1 is the "host",
+	 * L2 is the "guest").
+	 */
+	if (guest_cpu_cap_has(vcpu, X86_FEATURE_ERAPS))
+		vmcb02->control.erap_ctl = (svm->nested.ctl.erap_ctl &
+					    ERAP_CONTROL_ALLOW_LARGER_RAP) |
+					   ERAP_CONTROL_CLEAR_RAP;
+
 	/*
 	 * Merge guest and host intercepts - must be called with vcpu in
 	 * guest-mode to take effect.
@@ -1180,6 +1194,9 @@ int nested_svm_vmexit(struct vcpu_svm *svm)
 
 	kvm_nested_vmexit_handle_ibrs(vcpu);
 
+	if (guest_cpu_cap_has(vcpu, X86_FEATURE_ERAPS))
+		vmcb01->control.erap_ctl |= ERAP_CONTROL_CLEAR_RAP;
+
 	svm_switch_vmcb(svm, &svm->vmcb01);
 
 	/*
@@ -1686,6 +1703,7 @@ static void nested_copy_vmcb_cache_to_control(struct vmcb_control_area *dst,
 	dst->tsc_offset           = from->tsc_offset;
 	dst->asid                 = from->asid;
 	dst->tlb_ctl              = from->tlb_ctl;
+	dst->erap_ctl             = from->erap_ctl;
 	dst->int_ctl              = from->int_ctl;
 	dst->int_vector           = from->int_vector;
 	dst->int_state            = from->int_state;
diff --git a/arch/x86/kvm/svm/svm.c b/arch/x86/kvm/svm/svm.c
index d1ff23e02ecd..34c8a94b1b81 100644
--- a/arch/x86/kvm/svm/svm.c
+++ b/arch/x86/kvm/svm/svm.c
@@ -1141,6 +1141,9 @@ static void init_vmcb(struct kvm_vcpu *vcpu, bool init_event)
 		svm_clr_intercept(svm, INTERCEPT_PAUSE);
 	}
 
+	if (guest_cpu_cap_has(vcpu, X86_FEATURE_ERAPS))
+		svm->vmcb->control.erap_ctl |= ERAP_CONTROL_ALLOW_LARGER_RAP;
+
 	if (kvm_vcpu_apicv_active(vcpu))
 		avic_init_vmcb(svm, vmcb);
 
@@ -3293,6 +3296,7 @@ static void dump_vmcb(struct kvm_vcpu *vcpu)
 	pr_err("%-20s%016llx\n", "tsc_offset:", control->tsc_offset);
 	pr_err("%-20s%d\n", "asid:", control->asid);
 	pr_err("%-20s%d\n", "tlb_ctl:", control->tlb_ctl);
+	pr_err("%-20s%d\n", "erap_ctl:", control->erap_ctl);
 	pr_err("%-20s%08x\n", "int_ctl:", control->int_ctl);
 	pr_err("%-20s%08x\n", "int_vector:", control->int_vector);
 	pr_err("%-20s%08x\n", "int_state:", control->int_state);
@@ -4004,6 +4008,13 @@ static void svm_flush_tlb_gva(struct kvm_vcpu *vcpu, gva_t gva)
 	invlpga(gva, svm->vmcb->control.asid);
 }
 
+static void svm_flush_tlb_guest(struct kvm_vcpu *vcpu)
+{
+	kvm_register_mark_dirty(vcpu, VCPU_EXREG_ERAPS);
+
+	svm_flush_tlb_asid(vcpu);
+}
+
 static inline void sync_cr8_to_lapic(struct kvm_vcpu *vcpu)
 {
 	struct vcpu_svm *svm = to_svm(vcpu);
@@ -4262,6 +4273,10 @@ static __no_kcsan fastpath_t svm_vcpu_run(struct kvm_vcpu *vcpu, u64 run_flags)
 	}
 	svm->vmcb->save.cr2 = vcpu->arch.cr2;
 
+	if (guest_cpu_cap_has(vcpu, X86_FEATURE_ERAPS) &&
+	    kvm_register_is_dirty(vcpu, VCPU_EXREG_ERAPS))
+		svm->vmcb->control.erap_ctl |= ERAP_CONTROL_CLEAR_RAP;
+
 	svm_hv_update_vp_id(svm->vmcb, vcpu);
 
 	/*
@@ -4339,6 +4354,14 @@ static __no_kcsan fastpath_t svm_vcpu_run(struct kvm_vcpu *vcpu, u64 run_flags)
 	}
 
 	svm->vmcb->control.tlb_ctl = TLB_CONTROL_DO_NOTHING;
+
+	/*
+	 * Unconditionally mask off the CLEAR_RAP bit, the AND is just as cheap
+	 * as the TEST+Jcc to avoid it.
+	 */
+	if (cpu_feature_enabled(X86_FEATURE_ERAPS))
+		svm->vmcb->control.erap_ctl &= ~ERAP_CONTROL_CLEAR_RAP;
+
 	vmcb_mark_all_clean(svm->vmcb);
 
 	/* if exit due to PF check for async PF */
@@ -5094,7 +5117,7 @@ struct kvm_x86_ops svm_x86_ops __initdata = {
 	.flush_tlb_all = svm_flush_tlb_all,
 	.flush_tlb_current = svm_flush_tlb_current,
 	.flush_tlb_gva = svm_flush_tlb_gva,
-	.flush_tlb_guest = svm_flush_tlb_asid,
+	.flush_tlb_guest = svm_flush_tlb_guest,
 
 	.vcpu_pre_run = svm_vcpu_pre_run,
 	.vcpu_run = svm_vcpu_run,
diff --git a/arch/x86/kvm/svm/svm.h b/arch/x86/kvm/svm/svm.h
index 806e68ba821b..7d28a739865f 100644
--- a/arch/x86/kvm/svm/svm.h
+++ b/arch/x86/kvm/svm/svm.h
@@ -156,6 +156,7 @@ struct vmcb_ctrl_area_cached {
 	u64 tsc_offset;
 	u32 asid;
 	u8 tlb_ctl;
+	u8 erap_ctl;
 	u32 int_ctl;
 	u32 int_vector;
 	u32 int_state;
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index ff8812f3a129..e013392fe20c 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -14130,6 +14130,13 @@ int kvm_handle_invpcid(struct kvm_vcpu *vcpu, unsigned long type, gva_t gva)
 			return 1;
 		}
 
+		/*
+		 * When ERAPS is supported, invalidating a specific PCID clears
+		 * the RAP (Return Address Predictor).
+		 */
+		if (guest_cpu_cap_has(vcpu, X86_FEATURE_ERAPS))
+			kvm_register_mark_dirty(vcpu, VCPU_EXREG_ERAPS);
+
 		kvm_invalidate_pcid(vcpu, operand.pcid);
 		return kvm_skip_emulated_instruction(vcpu);
 
@@ -14143,6 +14150,11 @@ int kvm_handle_invpcid(struct kvm_vcpu *vcpu, unsigned long type, gva_t gva)
 
 		fallthrough;
 	case INVPCID_TYPE_ALL_INCL_GLOBAL:
+		/*
+		 * Don't bother marking VCPU_EXREG_ERAPS dirty, SVM will take
+		 * care of doing so when emulating the full guest TLB flush
+		 * (the RAP is cleared on all implicit TLB flushes).
+		 */
 		kvm_make_request(KVM_REQ_TLB_FLUSH_GUEST, vcpu);
 		return kvm_skip_emulated_instruction(vcpu);
 
-- 
cgit v1.2.3


From 8312f1b9dd71340b5fff65e56c6c163187bfa5d0 Mon Sep 17 00:00:00 2001
From: Jim Mattson <jmattson@google.com>
Date: Fri, 21 Nov 2025 20:48:01 +0000
Subject: KVM: SVM: Don't set GIF when clearing EFER.SVME

Clearing EFER.SVME is not architected to set GIF. Don't set GIF when
emulating a change to EFER that clears EFER.SVME.

However, keep setting GIF if clearing EFER.SVME causes force-leaving the
nested guest through svm_leave_nested(), to maintain a sane behavior of
not leaving GIF cleared after exiting the guest.  In every other path,
setting GIF is either correct/desirable, or irrelevant because the
caller immediately and unconditionally sets/clears GIF.

This is more-or-less KVM defining HW behavior, but leaving GIF cleared
would also be defining HW behavior anyway.

Note that if force-leaving the nested guest is considered a SHUTDOWN,
then this could violate the APM-specified behavior:

  If the processor enters the shutdown state (due to a triple fault for
  instance) while GIF is clear, it can only be restarted by means of a
  RESET.

However, a SHUTDOWN leaves the VMCB undefined, so there's not a lot that
KVM can do in this case. Also, if vGIF is enabled on SHUTDOWN, KVM has
no way of finding out if GIF was cleared.

The only way for KVM to handle this without making up HW behavior is to
completely terminate the VM, so settle for doing the relatively "sane"
thing of setting GIF when force-leaving nested.

Fixes: c513f484c558 ("KVM: nSVM: leave guest mode when clearing EFER.SVME")
Signed-off-by: Jim Mattson <jmattson@google.com>
Co-developed-by: Sean Christopherson <seanjc@google.com>
Signed-off-by: Yosry Ahmed <yosry.ahmed@linux.dev>
Link: https://patch.msgid.link/20251121204803.991707-3-yosry.ahmed@linux.dev
Signed-off-by: Sean Christopherson <seanjc@google.com>
---
 arch/x86/kvm/svm/nested.c | 2 ++
 arch/x86/kvm/svm/svm.c    | 1 -
 2 files changed, 2 insertions(+), 1 deletion(-)

diff --git a/arch/x86/kvm/svm/nested.c b/arch/x86/kvm/svm/nested.c
index 5a1e1164c197..47e8ce7d360a 100644
--- a/arch/x86/kvm/svm/nested.c
+++ b/arch/x86/kvm/svm/nested.c
@@ -1399,6 +1399,8 @@ void svm_leave_nested(struct kvm_vcpu *vcpu)
 		nested_svm_uninit_mmu_context(vcpu);
 		vmcb_mark_all_dirty(svm->vmcb);
 
+		svm_set_gif(svm, true);
+
 		if (kvm_apicv_activated(vcpu->kvm))
 			kvm_make_request(KVM_REQ_APICV_UPDATE, vcpu);
 	}
diff --git a/arch/x86/kvm/svm/svm.c b/arch/x86/kvm/svm/svm.c
index 34c8a94b1b81..c7bd78f5a2c7 100644
--- a/arch/x86/kvm/svm/svm.c
+++ b/arch/x86/kvm/svm/svm.c
@@ -215,7 +215,6 @@ int svm_set_efer(struct kvm_vcpu *vcpu, u64 efer)
 	if ((old_efer & EFER_SVME) != (efer & EFER_SVME)) {
 		if (!(efer & EFER_SVME)) {
 			svm_leave_nested(vcpu);
-			svm_set_gif(svm, true);
 			/* #GP intercept is still needed for vmware backdoor */
 			if (!enable_vmware_backdoor)
 				clr_exception_intercept(svm, GP_VECTOR);
-- 
cgit v1.2.3


From 6f4d3ebc24c6ef92e196ebbd389a3f2bfdc7a144 Mon Sep 17 00:00:00 2001
From: Jim Mattson <jmattson@google.com>
Date: Fri, 21 Nov 2025 20:48:00 +0000
Subject: KVM: SVM: Allow KVM_SET_NESTED_STATE to clear GIF when SVME==0

GIF==0 together with EFER.SVME==0 is a valid architectural
state. Don't return -EINVAL for KVM_SET_NESTED_STATE when this
combination is specified.

Fixes: cc440cdad5b7 ("KVM: nSVM: implement KVM_GET_NESTED_STATE and KVM_SET_NESTED_STATE")
Signed-off-by: Jim Mattson <jmattson@google.com>
Reviewed-by: Yosry Ahmed <yosry.ahmed@linux.dev>
Signed-off-by: Yosry Ahmed <yosry.ahmed@linux.dev>
Link: https://patch.msgid.link/20251121204803.991707-2-yosry.ahmed@linux.dev
[sean: disallow KVM_STATE_NESTED_RUN_PENDING with SVME=0]
Signed-off-by: Sean Christopherson <seanjc@google.com>
---
 arch/x86/kvm/svm/nested.c | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

diff --git a/arch/x86/kvm/svm/nested.c b/arch/x86/kvm/svm/nested.c
index 47e8ce7d360a..5b741f8ed170 100644
--- a/arch/x86/kvm/svm/nested.c
+++ b/arch/x86/kvm/svm/nested.c
@@ -1821,12 +1821,12 @@ static int svm_set_nested_state(struct kvm_vcpu *vcpu,
 	/*
 	 * If in guest mode, vcpu->arch.efer actually refers to the L2 guest's
 	 * EFER.SVME, but EFER.SVME still has to be 1 for VMRUN to succeed.
+	 * If SVME is disabled, the only valid states are "none" and GIF=1
+	 * (clearing SVME does NOT set GIF, i.e. GIF=0 is allowed).
 	 */
-	if (!(vcpu->arch.efer & EFER_SVME)) {
-		/* GIF=1 and no guest mode are required if SVME=0.  */
-		if (kvm_state->flags != KVM_STATE_NESTED_GIF_SET)
-			return -EINVAL;
-	}
+	if (!(vcpu->arch.efer & EFER_SVME) && kvm_state->flags &&
+	    kvm_state->flags != KVM_STATE_NESTED_GIF_SET)
+		return -EINVAL;
 
 	/* SMM temporarily disables SVM, so we cannot be in guest mode.  */
 	if (is_smm(vcpu) && (kvm_state->flags & KVM_STATE_NESTED_GUEST_MODE))
-- 
cgit v1.2.3


From bda6ae6f29664b659671f872a2adda3c1c2f5dd6 Mon Sep 17 00:00:00 2001
From: Yosry Ahmed <yosry.ahmed@linux.dev>
Date: Fri, 21 Nov 2025 20:48:02 +0000
Subject: KVM: selftests: Use TEST_ASSERT_EQ() in test_vmx_nested_state()

The assert messages do not add much value, so use TEST_ASSERT_EQ(),
which also nicely displays the addresses in hex. While at it, also
assert the values of state->flags.

Signed-off-by: Yosry Ahmed <yosry.ahmed@linux.dev>
Link: https://patch.msgid.link/20251121204803.991707-4-yosry.ahmed@linux.dev
Signed-off-by: Sean Christopherson <seanjc@google.com>
---
 tools/testing/selftests/kvm/x86/vmx_set_nested_state_test.c | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/tools/testing/selftests/kvm/x86/vmx_set_nested_state_test.c b/tools/testing/selftests/kvm/x86/vmx_set_nested_state_test.c
index 67a62a5a8895..b59a8a17084d 100644
--- a/tools/testing/selftests/kvm/x86/vmx_set_nested_state_test.c
+++ b/tools/testing/selftests/kvm/x86/vmx_set_nested_state_test.c
@@ -241,8 +241,10 @@ void test_vmx_nested_state(struct kvm_vcpu *vcpu)
 	TEST_ASSERT(state->size >= sizeof(*state) && state->size <= state_sz,
 		    "Size must be between %ld and %d.  The size returned was %d.",
 		    sizeof(*state), state_sz, state->size);
-	TEST_ASSERT(state->hdr.vmx.vmxon_pa == -1ull, "vmxon_pa must be -1ull.");
-	TEST_ASSERT(state->hdr.vmx.vmcs12_pa == -1ull, "vmcs_pa must be -1ull.");
+
+	TEST_ASSERT_EQ(state->hdr.vmx.vmxon_pa, -1ull);
+	TEST_ASSERT_EQ(state->hdr.vmx.vmcs12_pa, -1ull);
+	TEST_ASSERT_EQ(state->flags, 0);
 
 	free(state);
 }
-- 
cgit v1.2.3


From ca2eccb953fd33ef38701e33e660b21f7e84aa14 Mon Sep 17 00:00:00 2001
From: Yosry Ahmed <yosry.ahmed@linux.dev>
Date: Fri, 21 Nov 2025 20:48:03 +0000
Subject: KVM: selftests: Extend vmx_set_nested_state_test to cover SVM

Add test cases for the validation checks in svm_set_nested_state(), and
allow the test to run with SVM as well as VMX. The SVM test also makes
sure that KVM_SET_NESTED_STATE accepts GIF being set or cleared if
EFER.SVME is cleared, verifying a recently fixed bug where GIF was
incorrectly expected to always be set when EFER.SVME is cleared.

Signed-off-by: Yosry Ahmed <yosry.ahmed@linux.dev>
Link: https://patch.msgid.link/20251121204803.991707-5-yosry.ahmed@linux.dev
Signed-off-by: Sean Christopherson <seanjc@google.com>
---
 tools/testing/selftests/kvm/Makefile.kvm           |   2 +-
 .../selftests/kvm/x86/nested_set_state_test.c      | 406 +++++++++++++++++++++
 .../selftests/kvm/x86/vmx_set_nested_state_test.c  | 306 ----------------
 3 files changed, 407 insertions(+), 307 deletions(-)
 create mode 100644 tools/testing/selftests/kvm/x86/nested_set_state_test.c
 delete mode 100644 tools/testing/selftests/kvm/x86/vmx_set_nested_state_test.c

diff --git a/tools/testing/selftests/kvm/Makefile.kvm b/tools/testing/selftests/kvm/Makefile.kvm
index ba5c2b643efa..4ddece4ee365 100644
--- a/tools/testing/selftests/kvm/Makefile.kvm
+++ b/tools/testing/selftests/kvm/Makefile.kvm
@@ -92,6 +92,7 @@ TEST_GEN_PROGS_x86 += x86/nested_close_kvm_test
 TEST_GEN_PROGS_x86 += x86/nested_emulation_test
 TEST_GEN_PROGS_x86 += x86/nested_exceptions_test
 TEST_GEN_PROGS_x86 += x86/nested_invalid_cr3_test
+TEST_GEN_PROGS_x86 += x86/nested_set_state_test
 TEST_GEN_PROGS_x86 += x86/nested_tsc_adjust_test
 TEST_GEN_PROGS_x86 += x86/nested_tsc_scaling_test
 TEST_GEN_PROGS_x86 += x86/platform_info_test
@@ -120,7 +121,6 @@ TEST_GEN_PROGS_x86 += x86/vmx_exception_with_invalid_guest_state
 TEST_GEN_PROGS_x86 += x86/vmx_msrs_test
 TEST_GEN_PROGS_x86 += x86/vmx_invalid_nested_guest_state
 TEST_GEN_PROGS_x86 += x86/vmx_nested_la57_state_test
-TEST_GEN_PROGS_x86 += x86/vmx_set_nested_state_test
 TEST_GEN_PROGS_x86 += x86/apic_bus_clock_test
 TEST_GEN_PROGS_x86 += x86/xapic_ipi_test
 TEST_GEN_PROGS_x86 += x86/xapic_state_test
diff --git a/tools/testing/selftests/kvm/x86/nested_set_state_test.c b/tools/testing/selftests/kvm/x86/nested_set_state_test.c
new file mode 100644
index 000000000000..0f2102b43629
--- /dev/null
+++ b/tools/testing/selftests/kvm/x86/nested_set_state_test.c
@@ -0,0 +1,406 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Copyright (C) 2019, Google LLC.
+ *
+ * This test verifies the integrity of calling the ioctl KVM_SET_NESTED_STATE.
+ */
+
+#include "test_util.h"
+#include "kvm_util.h"
+#include "processor.h"
+#include "vmx.h"
+#include "svm_util.h"
+
+#include <errno.h>
+#include <linux/kvm.h>
+#include <string.h>
+#include <sys/ioctl.h>
+#include <unistd.h>
+
+/*
+ * Mirror of VMCS12_REVISION in arch/x86/kvm/vmx/vmcs12.h. If that value
+ * changes this should be updated.
+ */
+#define VMCS12_REVISION 0x11e57ed0
+
+bool have_evmcs;
+
+void test_nested_state(struct kvm_vcpu *vcpu, struct kvm_nested_state *state)
+{
+	vcpu_nested_state_set(vcpu, state);
+}
+
+void test_nested_state_expect_errno(struct kvm_vcpu *vcpu,
+				    struct kvm_nested_state *state,
+				    int expected_errno)
+{
+	int rv;
+
+	rv = __vcpu_nested_state_set(vcpu, state);
+	TEST_ASSERT(rv == -1 && errno == expected_errno,
+		"Expected %s (%d) from vcpu_nested_state_set but got rv: %i errno: %s (%d)",
+		strerror(expected_errno), expected_errno, rv, strerror(errno),
+		errno);
+}
+
+void test_nested_state_expect_einval(struct kvm_vcpu *vcpu,
+				     struct kvm_nested_state *state)
+{
+	test_nested_state_expect_errno(vcpu, state, EINVAL);
+}
+
+void test_nested_state_expect_efault(struct kvm_vcpu *vcpu,
+				     struct kvm_nested_state *state)
+{
+	test_nested_state_expect_errno(vcpu, state, EFAULT);
+}
+
+void set_revision_id_for_vmcs12(struct kvm_nested_state *state,
+				u32 vmcs12_revision)
+{
+	/* Set revision_id in vmcs12 to vmcs12_revision. */
+	memcpy(&state->data, &vmcs12_revision, sizeof(u32));
+}
+
+void set_default_state(struct kvm_nested_state *state)
+{
+	memset(state, 0, sizeof(*state));
+	state->flags = KVM_STATE_NESTED_RUN_PENDING |
+		       KVM_STATE_NESTED_GUEST_MODE;
+	state->format = 0;
+	state->size = sizeof(*state);
+}
+
+void set_default_vmx_state(struct kvm_nested_state *state, int size)
+{
+	memset(state, 0, size);
+	if (have_evmcs)
+		state->flags = KVM_STATE_NESTED_EVMCS;
+	state->format = 0;
+	state->size = size;
+	state->hdr.vmx.vmxon_pa = 0x1000;
+	state->hdr.vmx.vmcs12_pa = 0x2000;
+	state->hdr.vmx.smm.flags = 0;
+	set_revision_id_for_vmcs12(state, VMCS12_REVISION);
+}
+
+void test_vmx_nested_state(struct kvm_vcpu *vcpu)
+{
+	/* Add a page for VMCS12. */
+	const int state_sz = sizeof(struct kvm_nested_state) + getpagesize();
+	struct kvm_nested_state *state =
+		(struct kvm_nested_state *)malloc(state_sz);
+
+	/* The format must be set to 0. 0 for VMX, 1 for SVM. */
+	set_default_vmx_state(state, state_sz);
+	state->format = 1;
+	test_nested_state_expect_einval(vcpu, state);
+
+	/*
+	 * We cannot virtualize anything if the guest does not have VMX
+	 * enabled.
+	 */
+	set_default_vmx_state(state, state_sz);
+	test_nested_state_expect_einval(vcpu, state);
+
+	/*
+	 * We cannot virtualize anything if the guest does not have VMX
+	 * enabled.  We expect KVM_SET_NESTED_STATE to return 0 if vmxon_pa
+	 * is set to -1ull, but the flags must be zero.
+	 */
+	set_default_vmx_state(state, state_sz);
+	state->hdr.vmx.vmxon_pa = -1ull;
+	test_nested_state_expect_einval(vcpu, state);
+
+	state->hdr.vmx.vmcs12_pa = -1ull;
+	state->flags = KVM_STATE_NESTED_EVMCS;
+	test_nested_state_expect_einval(vcpu, state);
+
+	state->flags = 0;
+	test_nested_state(vcpu, state);
+
+	/* Enable VMX in the guest CPUID. */
+	vcpu_set_cpuid_feature(vcpu, X86_FEATURE_VMX);
+
+	/*
+	 * Setting vmxon_pa == -1ull and vmcs_pa == -1ull exits early without
+	 * setting the nested state. When the eVMCS flag is not set, the
+	 * expected return value is '0'.
+	 */
+	set_default_vmx_state(state, state_sz);
+	state->flags = 0;
+	state->hdr.vmx.vmxon_pa = -1ull;
+	state->hdr.vmx.vmcs12_pa = -1ull;
+	test_nested_state(vcpu, state);
+
+	/*
+	 * When eVMCS is supported, the eVMCS flag can only be set if the
+	 * enlightened VMCS capability has been enabled.
+	 */
+	if (have_evmcs) {
+		state->flags = KVM_STATE_NESTED_EVMCS;
+		test_nested_state_expect_einval(vcpu, state);
+		vcpu_enable_evmcs(vcpu);
+		test_nested_state(vcpu, state);
+	}
+
+	/* It is invalid to have vmxon_pa == -1ull and SMM flags non-zero. */
+	state->hdr.vmx.smm.flags = 1;
+	test_nested_state_expect_einval(vcpu, state);
+
+	/* Invalid flags are rejected. */
+	set_default_vmx_state(state, state_sz);
+	state->hdr.vmx.flags = ~0;
+	test_nested_state_expect_einval(vcpu, state);
+
+	/* It is invalid to have vmxon_pa == -1ull and vmcs_pa != -1ull. */
+	set_default_vmx_state(state, state_sz);
+	state->hdr.vmx.vmxon_pa = -1ull;
+	state->flags = 0;
+	test_nested_state_expect_einval(vcpu, state);
+
+	/* It is invalid to have vmxon_pa set to a non-page aligned address. */
+	set_default_vmx_state(state, state_sz);
+	state->hdr.vmx.vmxon_pa = 1;
+	test_nested_state_expect_einval(vcpu, state);
+
+	/*
+	 * It is invalid to have KVM_STATE_NESTED_SMM_GUEST_MODE and
+	 * KVM_STATE_NESTED_GUEST_MODE set together.
+	 */
+	set_default_vmx_state(state, state_sz);
+	state->flags = KVM_STATE_NESTED_GUEST_MODE  |
+		      KVM_STATE_NESTED_RUN_PENDING;
+	state->hdr.vmx.smm.flags = KVM_STATE_NESTED_SMM_GUEST_MODE;
+	test_nested_state_expect_einval(vcpu, state);
+
+	/*
+	 * It is invalid to have any of the SMM flags set besides:
+	 *	KVM_STATE_NESTED_SMM_GUEST_MODE
+	 *	KVM_STATE_NESTED_SMM_VMXON
+	 */
+	set_default_vmx_state(state, state_sz);
+	state->hdr.vmx.smm.flags = ~(KVM_STATE_NESTED_SMM_GUEST_MODE |
+				KVM_STATE_NESTED_SMM_VMXON);
+	test_nested_state_expect_einval(vcpu, state);
+
+	/* Outside SMM, SMM flags must be zero. */
+	set_default_vmx_state(state, state_sz);
+	state->flags = 0;
+	state->hdr.vmx.smm.flags = KVM_STATE_NESTED_SMM_GUEST_MODE;
+	test_nested_state_expect_einval(vcpu, state);
+
+	/*
+	 * Size must be large enough to fit kvm_nested_state and vmcs12
+	 * if VMCS12 physical address is set
+	 */
+	set_default_vmx_state(state, state_sz);
+	state->size = sizeof(*state);
+	state->flags = 0;
+	test_nested_state_expect_einval(vcpu, state);
+
+	set_default_vmx_state(state, state_sz);
+	state->size = sizeof(*state);
+	state->flags = 0;
+	state->hdr.vmx.vmcs12_pa = -1;
+	test_nested_state(vcpu, state);
+
+	/*
+	 * KVM_SET_NESTED_STATE succeeds with invalid VMCS
+	 * contents but L2 not running.
+	 */
+	set_default_vmx_state(state, state_sz);
+	state->flags = 0;
+	test_nested_state(vcpu, state);
+
+	/* Invalid flags are rejected, even if no VMCS loaded. */
+	set_default_vmx_state(state, state_sz);
+	state->size = sizeof(*state);
+	state->flags = 0;
+	state->hdr.vmx.vmcs12_pa = -1;
+	state->hdr.vmx.flags = ~0;
+	test_nested_state_expect_einval(vcpu, state);
+
+	/* vmxon_pa cannot be the same address as vmcs_pa. */
+	set_default_vmx_state(state, state_sz);
+	state->hdr.vmx.vmxon_pa = 0;
+	state->hdr.vmx.vmcs12_pa = 0;
+	test_nested_state_expect_einval(vcpu, state);
+
+	/*
+	 * Test that if we leave nesting the state reflects that when we get
+	 * it again.
+	 */
+	set_default_vmx_state(state, state_sz);
+	state->hdr.vmx.vmxon_pa = -1ull;
+	state->hdr.vmx.vmcs12_pa = -1ull;
+	state->flags = 0;
+	test_nested_state(vcpu, state);
+	vcpu_nested_state_get(vcpu, state);
+	TEST_ASSERT(state->size >= sizeof(*state) && state->size <= state_sz,
+		    "Size must be between %ld and %d.  The size returned was %d.",
+		    sizeof(*state), state_sz, state->size);
+
+	TEST_ASSERT_EQ(state->hdr.vmx.vmxon_pa, -1ull);
+	TEST_ASSERT_EQ(state->hdr.vmx.vmcs12_pa, -1ull);
+	TEST_ASSERT_EQ(state->flags, 0);
+
+	free(state);
+}
+
+static void vcpu_efer_enable_svm(struct kvm_vcpu *vcpu)
+{
+	uint64_t old_efer = vcpu_get_msr(vcpu, MSR_EFER);
+
+	vcpu_set_msr(vcpu, MSR_EFER, old_efer | EFER_SVME);
+}
+
+static void vcpu_efer_disable_svm(struct kvm_vcpu *vcpu)
+{
+	uint64_t old_efer = vcpu_get_msr(vcpu, MSR_EFER);
+
+	vcpu_set_msr(vcpu, MSR_EFER, old_efer & ~EFER_SVME);
+}
+
+void set_default_svm_state(struct kvm_nested_state *state, int size)
+{
+	memset(state, 0, size);
+	state->format = 1;
+	state->size = size;
+	state->hdr.svm.vmcb_pa = 0x3000;
+}
+
+void test_svm_nested_state(struct kvm_vcpu *vcpu)
+{
+	/* Add a page for VMCB. */
+	const int state_sz = sizeof(struct kvm_nested_state) + getpagesize();
+	struct kvm_nested_state *state =
+		(struct kvm_nested_state *)malloc(state_sz);
+
+	vcpu_set_cpuid_feature(vcpu, X86_FEATURE_SVM);
+
+	/* The format must be set to 1. 0 for VMX, 1 for SVM. */
+	set_default_svm_state(state, state_sz);
+	state->format = 0;
+	test_nested_state_expect_einval(vcpu, state);
+
+	/* Invalid flags are rejected, KVM_STATE_NESTED_EVMCS is VMX-only  */
+	set_default_svm_state(state, state_sz);
+	state->flags = KVM_STATE_NESTED_EVMCS;
+	test_nested_state_expect_einval(vcpu, state);
+
+	/*
+	 * If EFER.SVME is clear, guest mode is disallowed and GIF can be set or
+	 * cleared.
+	 */
+	vcpu_efer_disable_svm(vcpu);
+
+	set_default_svm_state(state, state_sz);
+	state->flags = KVM_STATE_NESTED_GUEST_MODE;
+	test_nested_state_expect_einval(vcpu, state);
+
+	state->flags = 0;
+	test_nested_state(vcpu, state);
+
+	state->flags = KVM_STATE_NESTED_GIF_SET;
+	test_nested_state(vcpu, state);
+
+	/* Enable SVM in the guest EFER. */
+	vcpu_efer_enable_svm(vcpu);
+
+	/* Setting vmcb_pa to a non-aligned address is only fine when not entering guest mode */
+	set_default_svm_state(state, state_sz);
+	state->hdr.svm.vmcb_pa = -1ull;
+	state->flags = 0;
+	test_nested_state(vcpu, state);
+	state->flags = KVM_STATE_NESTED_GUEST_MODE;
+	test_nested_state_expect_einval(vcpu, state);
+
+	/*
+	 * Size must be large enough to fit kvm_nested_state and VMCB
+	 * only when entering guest mode.
+	 */
+	set_default_svm_state(state, state_sz/2);
+	state->flags = 0;
+	test_nested_state(vcpu, state);
+	state->flags = KVM_STATE_NESTED_GUEST_MODE;
+	test_nested_state_expect_einval(vcpu, state);
+
+	/*
+	 * Test that if we leave nesting the state reflects that when we get it
+	 * again, except for vmcb_pa, which is always returned as 0 when not in
+	 * guest mode.
+	 */
+	set_default_svm_state(state, state_sz);
+	state->hdr.svm.vmcb_pa = -1ull;
+	state->flags = KVM_STATE_NESTED_GIF_SET;
+	test_nested_state(vcpu, state);
+	vcpu_nested_state_get(vcpu, state);
+	TEST_ASSERT(state->size >= sizeof(*state) && state->size <= state_sz,
+		    "Size must be between %ld and %d.  The size returned was %d.",
+		    sizeof(*state), state_sz, state->size);
+
+	TEST_ASSERT_EQ(state->hdr.svm.vmcb_pa, 0);
+	TEST_ASSERT_EQ(state->flags, KVM_STATE_NESTED_GIF_SET);
+
+	free(state);
+}
+
+int main(int argc, char *argv[])
+{
+	struct kvm_vm *vm;
+	struct kvm_nested_state state;
+	struct kvm_vcpu *vcpu;
+
+	have_evmcs = kvm_check_cap(KVM_CAP_HYPERV_ENLIGHTENED_VMCS);
+
+	TEST_REQUIRE(kvm_cpu_has(X86_FEATURE_VMX) ||
+		     kvm_cpu_has(X86_FEATURE_SVM));
+	TEST_REQUIRE(kvm_has_cap(KVM_CAP_NESTED_STATE));
+
+	vm = vm_create_with_one_vcpu(&vcpu, NULL);
+
+	/*
+	 * First run tests with VMX/SVM disabled to check error handling.
+	 * test_{vmx/svm}_nested_state() will re-enable as needed.
+	 */
+	if (kvm_cpu_has(X86_FEATURE_VMX))
+		vcpu_clear_cpuid_feature(vcpu, X86_FEATURE_VMX);
+	else
+		vcpu_clear_cpuid_feature(vcpu, X86_FEATURE_SVM);
+
+	/* Passing a NULL kvm_nested_state causes a EFAULT. */
+	test_nested_state_expect_efault(vcpu, NULL);
+
+	/* 'size' cannot be smaller than sizeof(kvm_nested_state). */
+	set_default_state(&state);
+	state.size = 0;
+	test_nested_state_expect_einval(vcpu, &state);
+
+	/*
+	 * Setting the flags 0xf fails the flags check.  The only flags that
+	 * can be used are:
+	 *     KVM_STATE_NESTED_GUEST_MODE
+	 *     KVM_STATE_NESTED_RUN_PENDING
+	 *     KVM_STATE_NESTED_EVMCS
+	 */
+	set_default_state(&state);
+	state.flags = 0xf;
+	test_nested_state_expect_einval(vcpu, &state);
+
+	/*
+	 * If KVM_STATE_NESTED_RUN_PENDING is set then
+	 * KVM_STATE_NESTED_GUEST_MODE has to be set as well.
+	 */
+	set_default_state(&state);
+	state.flags = KVM_STATE_NESTED_RUN_PENDING;
+	test_nested_state_expect_einval(vcpu, &state);
+
+	if (kvm_cpu_has(X86_FEATURE_VMX))
+		test_vmx_nested_state(vcpu);
+	else
+		test_svm_nested_state(vcpu);
+
+	kvm_vm_free(vm);
+	return 0;
+}
diff --git a/tools/testing/selftests/kvm/x86/vmx_set_nested_state_test.c b/tools/testing/selftests/kvm/x86/vmx_set_nested_state_test.c
deleted file mode 100644
index b59a8a17084d..000000000000
--- a/tools/testing/selftests/kvm/x86/vmx_set_nested_state_test.c
+++ /dev/null
@@ -1,306 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0-only
-/*
- * vmx_set_nested_state_test
- *
- * Copyright (C) 2019, Google LLC.
- *
- * This test verifies the integrity of calling the ioctl KVM_SET_NESTED_STATE.
- */
-
-#include "test_util.h"
-#include "kvm_util.h"
-#include "processor.h"
-#include "vmx.h"
-
-#include <errno.h>
-#include <linux/kvm.h>
-#include <string.h>
-#include <sys/ioctl.h>
-#include <unistd.h>
-
-/*
- * Mirror of VMCS12_REVISION in arch/x86/kvm/vmx/vmcs12.h. If that value
- * changes this should be updated.
- */
-#define VMCS12_REVISION 0x11e57ed0
-
-bool have_evmcs;
-
-void test_nested_state(struct kvm_vcpu *vcpu, struct kvm_nested_state *state)
-{
-	vcpu_nested_state_set(vcpu, state);
-}
-
-void test_nested_state_expect_errno(struct kvm_vcpu *vcpu,
-				    struct kvm_nested_state *state,
-				    int expected_errno)
-{
-	int rv;
-
-	rv = __vcpu_nested_state_set(vcpu, state);
-	TEST_ASSERT(rv == -1 && errno == expected_errno,
-		"Expected %s (%d) from vcpu_nested_state_set but got rv: %i errno: %s (%d)",
-		strerror(expected_errno), expected_errno, rv, strerror(errno),
-		errno);
-}
-
-void test_nested_state_expect_einval(struct kvm_vcpu *vcpu,
-				     struct kvm_nested_state *state)
-{
-	test_nested_state_expect_errno(vcpu, state, EINVAL);
-}
-
-void test_nested_state_expect_efault(struct kvm_vcpu *vcpu,
-				     struct kvm_nested_state *state)
-{
-	test_nested_state_expect_errno(vcpu, state, EFAULT);
-}
-
-void set_revision_id_for_vmcs12(struct kvm_nested_state *state,
-				u32 vmcs12_revision)
-{
-	/* Set revision_id in vmcs12 to vmcs12_revision. */
-	memcpy(&state->data, &vmcs12_revision, sizeof(u32));
-}
-
-void set_default_state(struct kvm_nested_state *state)
-{
-	memset(state, 0, sizeof(*state));
-	state->flags = KVM_STATE_NESTED_RUN_PENDING |
-		       KVM_STATE_NESTED_GUEST_MODE;
-	state->format = 0;
-	state->size = sizeof(*state);
-}
-
-void set_default_vmx_state(struct kvm_nested_state *state, int size)
-{
-	memset(state, 0, size);
-	if (have_evmcs)
-		state->flags = KVM_STATE_NESTED_EVMCS;
-	state->format = 0;
-	state->size = size;
-	state->hdr.vmx.vmxon_pa = 0x1000;
-	state->hdr.vmx.vmcs12_pa = 0x2000;
-	state->hdr.vmx.smm.flags = 0;
-	set_revision_id_for_vmcs12(state, VMCS12_REVISION);
-}
-
-void test_vmx_nested_state(struct kvm_vcpu *vcpu)
-{
-	/* Add a page for VMCS12. */
-	const int state_sz = sizeof(struct kvm_nested_state) + getpagesize();
-	struct kvm_nested_state *state =
-		(struct kvm_nested_state *)malloc(state_sz);
-
-	/* The format must be set to 0. 0 for VMX, 1 for SVM. */
-	set_default_vmx_state(state, state_sz);
-	state->format = 1;
-	test_nested_state_expect_einval(vcpu, state);
-
-	/*
-	 * We cannot virtualize anything if the guest does not have VMX
-	 * enabled.
-	 */
-	set_default_vmx_state(state, state_sz);
-	test_nested_state_expect_einval(vcpu, state);
-
-	/*
-	 * We cannot virtualize anything if the guest does not have VMX
-	 * enabled.  We expect KVM_SET_NESTED_STATE to return 0 if vmxon_pa
-	 * is set to -1ull, but the flags must be zero.
-	 */
-	set_default_vmx_state(state, state_sz);
-	state->hdr.vmx.vmxon_pa = -1ull;
-	test_nested_state_expect_einval(vcpu, state);
-
-	state->hdr.vmx.vmcs12_pa = -1ull;
-	state->flags = KVM_STATE_NESTED_EVMCS;
-	test_nested_state_expect_einval(vcpu, state);
-
-	state->flags = 0;
-	test_nested_state(vcpu, state);
-
-	/* Enable VMX in the guest CPUID. */
-	vcpu_set_cpuid_feature(vcpu, X86_FEATURE_VMX);
-
-	/*
-	 * Setting vmxon_pa == -1ull and vmcs_pa == -1ull exits early without
-	 * setting the nested state. When the eVMCS flag is not set, the
-	 * expected return value is '0'.
-	 */
-	set_default_vmx_state(state, state_sz);
-	state->flags = 0;
-	state->hdr.vmx.vmxon_pa = -1ull;
-	state->hdr.vmx.vmcs12_pa = -1ull;
-	test_nested_state(vcpu, state);
-
-	/*
-	 * When eVMCS is supported, the eVMCS flag can only be set if the
-	 * enlightened VMCS capability has been enabled.
-	 */
-	if (have_evmcs) {
-		state->flags = KVM_STATE_NESTED_EVMCS;
-		test_nested_state_expect_einval(vcpu, state);
-		vcpu_enable_evmcs(vcpu);
-		test_nested_state(vcpu, state);
-	}
-
-	/* It is invalid to have vmxon_pa == -1ull and SMM flags non-zero. */
-	state->hdr.vmx.smm.flags = 1;
-	test_nested_state_expect_einval(vcpu, state);
-
-	/* Invalid flags are rejected. */
-	set_default_vmx_state(state, state_sz);
-	state->hdr.vmx.flags = ~0;
-	test_nested_state_expect_einval(vcpu, state);
-
-	/* It is invalid to have vmxon_pa == -1ull and vmcs_pa != -1ull. */
-	set_default_vmx_state(state, state_sz);
-	state->hdr.vmx.vmxon_pa = -1ull;
-	state->flags = 0;
-	test_nested_state_expect_einval(vcpu, state);
-
-	/* It is invalid to have vmxon_pa set to a non-page aligned address. */
-	set_default_vmx_state(state, state_sz);
-	state->hdr.vmx.vmxon_pa = 1;
-	test_nested_state_expect_einval(vcpu, state);
-
-	/*
-	 * It is invalid to have KVM_STATE_NESTED_SMM_GUEST_MODE and
-	 * KVM_STATE_NESTED_GUEST_MODE set together.
-	 */
-	set_default_vmx_state(state, state_sz);
-	state->flags = KVM_STATE_NESTED_GUEST_MODE  |
-		      KVM_STATE_NESTED_RUN_PENDING;
-	state->hdr.vmx.smm.flags = KVM_STATE_NESTED_SMM_GUEST_MODE;
-	test_nested_state_expect_einval(vcpu, state);
-
-	/*
-	 * It is invalid to have any of the SMM flags set besides:
-	 *	KVM_STATE_NESTED_SMM_GUEST_MODE
-	 *	KVM_STATE_NESTED_SMM_VMXON
-	 */
-	set_default_vmx_state(state, state_sz);
-	state->hdr.vmx.smm.flags = ~(KVM_STATE_NESTED_SMM_GUEST_MODE |
-				KVM_STATE_NESTED_SMM_VMXON);
-	test_nested_state_expect_einval(vcpu, state);
-
-	/* Outside SMM, SMM flags must be zero. */
-	set_default_vmx_state(state, state_sz);
-	state->flags = 0;
-	state->hdr.vmx.smm.flags = KVM_STATE_NESTED_SMM_GUEST_MODE;
-	test_nested_state_expect_einval(vcpu, state);
-
-	/*
-	 * Size must be large enough to fit kvm_nested_state and vmcs12
-	 * if VMCS12 physical address is set
-	 */
-	set_default_vmx_state(state, state_sz);
-	state->size = sizeof(*state);
-	state->flags = 0;
-	test_nested_state_expect_einval(vcpu, state);
-
-	set_default_vmx_state(state, state_sz);
-	state->size = sizeof(*state);
-	state->flags = 0;
-	state->hdr.vmx.vmcs12_pa = -1;
-	test_nested_state(vcpu, state);
-
-	/*
-	 * KVM_SET_NESTED_STATE succeeds with invalid VMCS
-	 * contents but L2 not running.
-	 */
-	set_default_vmx_state(state, state_sz);
-	state->flags = 0;
-	test_nested_state(vcpu, state);
-
-	/* Invalid flags are rejected, even if no VMCS loaded. */
-	set_default_vmx_state(state, state_sz);
-	state->size = sizeof(*state);
-	state->flags = 0;
-	state->hdr.vmx.vmcs12_pa = -1;
-	state->hdr.vmx.flags = ~0;
-	test_nested_state_expect_einval(vcpu, state);
-
-	/* vmxon_pa cannot be the same address as vmcs_pa. */
-	set_default_vmx_state(state, state_sz);
-	state->hdr.vmx.vmxon_pa = 0;
-	state->hdr.vmx.vmcs12_pa = 0;
-	test_nested_state_expect_einval(vcpu, state);
-
-	/*
-	 * Test that if we leave nesting the state reflects that when we get
-	 * it again.
-	 */
-	set_default_vmx_state(state, state_sz);
-	state->hdr.vmx.vmxon_pa = -1ull;
-	state->hdr.vmx.vmcs12_pa = -1ull;
-	state->flags = 0;
-	test_nested_state(vcpu, state);
-	vcpu_nested_state_get(vcpu, state);
-	TEST_ASSERT(state->size >= sizeof(*state) && state->size <= state_sz,
-		    "Size must be between %ld and %d.  The size returned was %d.",
-		    sizeof(*state), state_sz, state->size);
-
-	TEST_ASSERT_EQ(state->hdr.vmx.vmxon_pa, -1ull);
-	TEST_ASSERT_EQ(state->hdr.vmx.vmcs12_pa, -1ull);
-	TEST_ASSERT_EQ(state->flags, 0);
-
-	free(state);
-}
-
-int main(int argc, char *argv[])
-{
-	struct kvm_vm *vm;
-	struct kvm_nested_state state;
-	struct kvm_vcpu *vcpu;
-
-	have_evmcs = kvm_check_cap(KVM_CAP_HYPERV_ENLIGHTENED_VMCS);
-
-	TEST_REQUIRE(kvm_has_cap(KVM_CAP_NESTED_STATE));
-
-	/*
-	 * AMD currently does not implement set_nested_state, so for now we
-	 * just early out.
-	 */
-	TEST_REQUIRE(kvm_cpu_has(X86_FEATURE_VMX));
-
-	vm = vm_create_with_one_vcpu(&vcpu, NULL);
-
-	/*
-	 * First run tests with VMX disabled to check error handling.
-	 */
-	vcpu_clear_cpuid_feature(vcpu, X86_FEATURE_VMX);
-
-	/* Passing a NULL kvm_nested_state causes a EFAULT. */
-	test_nested_state_expect_efault(vcpu, NULL);
-
-	/* 'size' cannot be smaller than sizeof(kvm_nested_state). */
-	set_default_state(&state);
-	state.size = 0;
-	test_nested_state_expect_einval(vcpu, &state);
-
-	/*
-	 * Setting the flags 0xf fails the flags check.  The only flags that
-	 * can be used are:
-	 *     KVM_STATE_NESTED_GUEST_MODE
-	 *     KVM_STATE_NESTED_RUN_PENDING
-	 *     KVM_STATE_NESTED_EVMCS
-	 */
-	set_default_state(&state);
-	state.flags = 0xf;
-	test_nested_state_expect_einval(vcpu, &state);
-
-	/*
-	 * If KVM_STATE_NESTED_RUN_PENDING is set then
-	 * KVM_STATE_NESTED_GUEST_MODE has to be set as well.
-	 */
-	set_default_state(&state);
-	state.flags = KVM_STATE_NESTED_RUN_PENDING;
-	test_nested_state_expect_einval(vcpu, &state);
-
-	test_vmx_nested_state(vcpu);
-
-	kvm_vm_free(vm);
-	return 0;
-}
-- 
cgit v1.2.3


From 9587dd7a7ebd7be3c36815a4c4f90f7e2cedbe03 Mon Sep 17 00:00:00 2001
From: Sean Christopherson <seanjc@google.com>
Date: Thu, 8 Jan 2026 19:31:00 -0800
Subject: KVM: SVM: Drop the module param to control SEV-ES DebugSwap

Rip out the DebugSwap module param: the sequence of events that led to its
inclusion was one big mistake, and the param no longer serves any purpose.

Commit d1f85fbe836e ("KVM: SEV: Enable data breakpoints in SEV-ES") goofed
by not adding a way for the userspace VMM to control the feature.
Functionally, that was fine, but it broke attestation signatures because
SEV_FEATURES are included in the signature.

Commit 5abf6dceb066 ("SEV: disable SEV-ES DebugSwap by default") fixed that
issue, but the underlying flaw of userspace not having a way to control
SEV_FEATURES was still there.

That flaw was addressed by commit 4f5defae7089 ("KVM: SEV: introduce
KVM_SEV_INIT2 operation"), and so then 4dd5ecacb9a4 ("KVM: SEV: allow
SEV-ES DebugSwap again") re-enabled DebugSwap by default.

Now that the dust has settled, the module param doesn't serve any
meaningful purpose.

Cc: Tom Lendacky <thomas.lendacky@amd.com>
Reviewed-by: Tom Lendacky <thomas.lendacky@amd.com>
Link: https://patch.msgid.link/20260109033101.1005769-2-seanjc@google.com
Signed-off-by: Sean Christopherson <seanjc@google.com>
---
 arch/x86/kvm/svm/sev.c | 11 +++--------
 1 file changed, 3 insertions(+), 8 deletions(-)

diff --git a/arch/x86/kvm/svm/sev.c b/arch/x86/kvm/svm/sev.c
index f59c65abe3cf..9b92f0cccfe6 100644
--- a/arch/x86/kvm/svm/sev.c
+++ b/arch/x86/kvm/svm/sev.c
@@ -53,9 +53,6 @@ module_param_named(sev_es, sev_es_enabled, bool, 0444);
 static bool sev_snp_enabled = true;
 module_param_named(sev_snp, sev_snp_enabled, bool, 0444);
 
-/* enable/disable SEV-ES DebugSwap support */
-static bool sev_es_debug_swap_enabled = true;
-module_param_named(debug_swap, sev_es_debug_swap_enabled, bool, 0444);
 static u64 sev_supported_vmsa_features;
 
 static unsigned int nr_ciphertext_hiding_asids;
@@ -3150,12 +3147,10 @@ out:
 	sev_es_enabled = sev_es_supported;
 	sev_snp_enabled = sev_snp_supported;
 
-	if (!sev_es_enabled || !cpu_feature_enabled(X86_FEATURE_DEBUG_SWAP) ||
-	    !cpu_feature_enabled(X86_FEATURE_NO_NESTED_DATA_BP))
-		sev_es_debug_swap_enabled = false;
-
 	sev_supported_vmsa_features = 0;
-	if (sev_es_debug_swap_enabled)
+
+	if (sev_es_enabled && cpu_feature_enabled(X86_FEATURE_DEBUG_SWAP) &&
+	    cpu_feature_enabled(X86_FEATURE_NO_NESTED_DATA_BP))
 		sev_supported_vmsa_features |= SVM_SEV_FEAT_DEBUG_SWAP;
 
 	if (sev_snp_enabled && tsc_khz && cpu_feature_enabled(X86_FEATURE_SNP_SECURE_TSC))
-- 
cgit v1.2.3


From d23051f59a5b4eb1f6163cf27e07b8cfcaeb4758 Mon Sep 17 00:00:00 2001
From: Sean Christopherson <seanjc@google.com>
Date: Thu, 8 Jan 2026 19:31:01 -0800
Subject: KVM: SVM: Tag sev_supported_vmsa_features as read-only after init

Tag sev_supported_vmsa_features with __ro_after_init as it's configured by
sev_hardware_setup() and never written after initial configuration (and if
it were, that'd be a blatant bug).

Opportunistically relocate the variable out of the module params area now
that sev_es_debug_swap_enabled is gone (which largely motivated its
original location).

Reviewed-by: Tom Lendacky <thomas.lendacky@amd.com>
Link: https://patch.msgid.link/20260109033101.1005769-3-seanjc@google.com
Signed-off-by: Sean Christopherson <seanjc@google.com>
---
 arch/x86/kvm/svm/sev.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/arch/x86/kvm/svm/sev.c b/arch/x86/kvm/svm/sev.c
index 9b92f0cccfe6..28150506b18c 100644
--- a/arch/x86/kvm/svm/sev.c
+++ b/arch/x86/kvm/svm/sev.c
@@ -53,8 +53,6 @@ module_param_named(sev_es, sev_es_enabled, bool, 0444);
 static bool sev_snp_enabled = true;
 module_param_named(sev_snp, sev_snp_enabled, bool, 0444);
 
-static u64 sev_supported_vmsa_features;
-
 static unsigned int nr_ciphertext_hiding_asids;
 module_param_named(ciphertext_hiding_asids, nr_ciphertext_hiding_asids, uint, 0444);
 
@@ -81,6 +79,8 @@ module_param_named(ciphertext_hiding_asids, nr_ciphertext_hiding_asids, uint, 04
 
 static u64 snp_supported_policy_bits __ro_after_init;
 
+static u64 sev_supported_vmsa_features __ro_after_init;
+
 #define INITIAL_VMSA_GPA 0xFFFFFFFFF000
 
 static u8 sev_enc_bit;
-- 
cgit v1.2.3


From 217463aa329ea9a2efafd1bbfa6787e8df9091b9 Mon Sep 17 00:00:00 2001
From: Sean Christopherson <seanjc@google.com>
Date: Tue, 30 Dec 2025 13:13:40 -0800
Subject: KVM: SVM: Add a helper to detect VMRUN failures

Add a helper to detect VMRUN failures so that KVM can guard against its
own long-standing bug, where KVM neglects to set exitcode[63:32] when
synthesizing a nested VMFAIL_INVALID VM-Exit.  This will allow fixing
KVM's mess of treating exitcode as two separate 32-bit values without
breaking KVM-on-KVM when running on an older, unfixed KVM.

Cc: Jim Mattson <jmattson@google.com>
Cc: Yosry Ahmed <yosry.ahmed@linux.dev>
Reviewed-by: Yosry Ahmed <yosry.ahmed@linux.dev>
Link: https://patch.msgid.link/20251230211347.4099600-2-seanjc@google.com
Signed-off-by: Sean Christopherson <seanjc@google.com>
---
 arch/x86/kvm/svm/nested.c | 16 +++++++---------
 arch/x86/kvm/svm/svm.c    |  4 ++--
 arch/x86/kvm/svm/svm.h    |  5 +++++
 3 files changed, 14 insertions(+), 11 deletions(-)

diff --git a/arch/x86/kvm/svm/nested.c b/arch/x86/kvm/svm/nested.c
index 5b741f8ed170..666b5a36c15d 100644
--- a/arch/x86/kvm/svm/nested.c
+++ b/arch/x86/kvm/svm/nested.c
@@ -1167,7 +1167,7 @@ int nested_svm_vmexit(struct vcpu_svm *svm)
 	vmcb12->control.exit_info_1       = vmcb02->control.exit_info_1;
 	vmcb12->control.exit_info_2       = vmcb02->control.exit_info_2;
 
-	if (vmcb12->control.exit_code != SVM_EXIT_ERR)
+	if (!svm_is_vmrun_failure(vmcb12->control.exit_code))
 		nested_save_pending_event_to_vmcb12(svm, vmcb12);
 
 	if (guest_cpu_cap_has(vcpu, X86_FEATURE_NRIPS))
@@ -1463,6 +1463,9 @@ static int nested_svm_intercept(struct vcpu_svm *svm)
 	u32 exit_code = svm->vmcb->control.exit_code;
 	int vmexit = NESTED_EXIT_HOST;
 
+	if (svm_is_vmrun_failure(exit_code))
+		return NESTED_EXIT_DONE;
+
 	switch (exit_code) {
 	case SVM_EXIT_MSR:
 		vmexit = nested_svm_exit_handled_msr(svm);
@@ -1470,7 +1473,7 @@ static int nested_svm_intercept(struct vcpu_svm *svm)
 	case SVM_EXIT_IOIO:
 		vmexit = nested_svm_intercept_ioio(svm);
 		break;
-	case SVM_EXIT_EXCP_BASE ... SVM_EXIT_EXCP_BASE + 0x1f: {
+	case SVM_EXIT_EXCP_BASE ... SVM_EXIT_EXCP_BASE + 0x1f:
 		/*
 		 * Host-intercepted exceptions have been checked already in
 		 * nested_svm_exit_special.  There is nothing to do here,
@@ -1478,15 +1481,10 @@ static int nested_svm_intercept(struct vcpu_svm *svm)
 		 */
 		vmexit = NESTED_EXIT_DONE;
 		break;
-	}
-	case SVM_EXIT_ERR: {
-		vmexit = NESTED_EXIT_DONE;
-		break;
-	}
-	default: {
+	default:
 		if (vmcb12_is_intercept(&svm->nested.ctl, exit_code))
 			vmexit = NESTED_EXIT_DONE;
-	}
+		break;
 	}
 
 	return vmexit;
diff --git a/arch/x86/kvm/svm/svm.c b/arch/x86/kvm/svm/svm.c
index c7bd78f5a2c7..e20b40f346af 100644
--- a/arch/x86/kvm/svm/svm.c
+++ b/arch/x86/kvm/svm/svm.c
@@ -3564,7 +3564,7 @@ static int svm_handle_exit(struct kvm_vcpu *vcpu, fastpath_t exit_fastpath)
 			return 1;
 	}
 
-	if (svm->vmcb->control.exit_code == SVM_EXIT_ERR) {
+	if (svm_is_vmrun_failure(svm->vmcb->control.exit_code)) {
 		kvm_run->exit_reason = KVM_EXIT_FAIL_ENTRY;
 		kvm_run->fail_entry.hardware_entry_failure_reason
 			= svm->vmcb->control.exit_code;
@@ -4346,7 +4346,7 @@ static __no_kcsan fastpath_t svm_vcpu_run(struct kvm_vcpu *vcpu, u64 run_flags)
 
 		/* Track VMRUNs that have made past consistency checking */
 		if (svm->nested.nested_run_pending &&
-		    svm->vmcb->control.exit_code != SVM_EXIT_ERR)
+		    !svm_is_vmrun_failure(svm->vmcb->control.exit_code))
                         ++vcpu->stat.nested_run;
 
 		svm->nested.nested_run_pending = 0;
diff --git a/arch/x86/kvm/svm/svm.h b/arch/x86/kvm/svm/svm.h
index 7d28a739865f..3360ac36e071 100644
--- a/arch/x86/kvm/svm/svm.h
+++ b/arch/x86/kvm/svm/svm.h
@@ -425,6 +425,11 @@ static __always_inline struct vcpu_svm *to_svm(struct kvm_vcpu *vcpu)
 	return container_of(vcpu, struct vcpu_svm, vcpu);
 }
 
+static inline bool svm_is_vmrun_failure(u64 exit_code)
+{
+	return (u32)exit_code == (u32)SVM_EXIT_ERR;
+}
+
 /*
  * Only the PDPTRs are loaded on demand into the shadow MMU.  All other
  * fields are synchronized on VM-Exit, because accessing the VMCB is cheap.
-- 
cgit v1.2.3


From 2450c9774510e45c506df4a1b46d129435993ff6 Mon Sep 17 00:00:00 2001
From: Sean Christopherson <seanjc@google.com>
Date: Tue, 30 Dec 2025 13:13:41 -0800
Subject: KVM: SVM: Open code handling of unexpected exits in
 svm_invoke_exit_handler()

Fold svm_check_exit_valid() and svm_handle_invalid_exit() into their sole
caller, svm_invoke_exit_handler(), as having tiny single-use helpers makes
the code unnecessarily difficult to follow.  This will also allow for
additional cleanups in svm_invoke_exit_handler().

No functional change intended.

Suggested-by: Yosry Ahmed <yosry.ahmed@linux.dev>
Reviewed-by: Yosry Ahmed <yosry.ahmed@linux.dev>
Reviewed-by: Pankaj Gupta <pankaj.gupta@amd.com>
Link: https://patch.msgid.link/20251230211347.4099600-3-seanjc@google.com
Signed-off-by: Sean Christopherson <seanjc@google.com>
---
 arch/x86/kvm/svm/svm.c | 25 ++++++++++---------------
 1 file changed, 10 insertions(+), 15 deletions(-)

diff --git a/arch/x86/kvm/svm/svm.c b/arch/x86/kvm/svm/svm.c
index e20b40f346af..ddb07c6408de 100644
--- a/arch/x86/kvm/svm/svm.c
+++ b/arch/x86/kvm/svm/svm.c
@@ -3467,23 +3467,13 @@ no_vmsa:
 		sev_free_decrypted_vmsa(vcpu, save);
 }
 
-static bool svm_check_exit_valid(u64 exit_code)
-{
-	return (exit_code < ARRAY_SIZE(svm_exit_handlers) &&
-		svm_exit_handlers[exit_code]);
-}
-
-static int svm_handle_invalid_exit(struct kvm_vcpu *vcpu, u64 exit_code)
-{
-	dump_vmcb(vcpu);
-	kvm_prepare_unexpected_reason_exit(vcpu, exit_code);
-	return 0;
-}
-
 int svm_invoke_exit_handler(struct kvm_vcpu *vcpu, u64 exit_code)
 {
-	if (!svm_check_exit_valid(exit_code))
-		return svm_handle_invalid_exit(vcpu, exit_code);
+	if (exit_code >= ARRAY_SIZE(svm_exit_handlers))
+		goto unexpected_vmexit;
+
+	if (!svm_exit_handlers[exit_code])
+		goto unexpected_vmexit;
 
 #ifdef CONFIG_MITIGATION_RETPOLINE
 	if (exit_code == SVM_EXIT_MSR)
@@ -3502,6 +3492,11 @@ int svm_invoke_exit_handler(struct kvm_vcpu *vcpu, u64 exit_code)
 #endif
 #endif
 	return svm_exit_handlers[exit_code](vcpu);
+
+unexpected_vmexit:
+	dump_vmcb(vcpu);
+	kvm_prepare_unexpected_reason_exit(vcpu, exit_code);
+	return 0;
 }
 
 static void svm_get_exit_info(struct kvm_vcpu *vcpu, u32 *reason,
-- 
cgit v1.2.3


From 194c17bf5ebadd2fcf52ac641793e3d755a7af55 Mon Sep 17 00:00:00 2001
From: Sean Christopherson <seanjc@google.com>
Date: Tue, 30 Dec 2025 13:13:42 -0800
Subject: KVM: SVM: Check for an unexpected VM-Exit after RETPOLINE "fast"
 handling

Check for an unexpected/unhandled VM-Exit after the manual RETPOLINE=y
handling.  The entire point of the RETPOLINE checks is to optimize for
common VM-Exits, i.e. checking for the rare case of an unsupported
VM-Exit is counter-productive.  This also aligns SVM and VMX exit handling.

No functional change intended.

Reviewed-by: Yosry Ahmed <yosry.ahmed@linux.dev>
Link: https://patch.msgid.link/20251230211347.4099600-4-seanjc@google.com
Signed-off-by: Sean Christopherson <seanjc@google.com>
---
 arch/x86/kvm/svm/svm.c | 12 ++++++------
 1 file changed, 6 insertions(+), 6 deletions(-)

diff --git a/arch/x86/kvm/svm/svm.c b/arch/x86/kvm/svm/svm.c
index ddb07c6408de..d2f997965a96 100644
--- a/arch/x86/kvm/svm/svm.c
+++ b/arch/x86/kvm/svm/svm.c
@@ -3469,12 +3469,6 @@ no_vmsa:
 
 int svm_invoke_exit_handler(struct kvm_vcpu *vcpu, u64 exit_code)
 {
-	if (exit_code >= ARRAY_SIZE(svm_exit_handlers))
-		goto unexpected_vmexit;
-
-	if (!svm_exit_handlers[exit_code])
-		goto unexpected_vmexit;
-
 #ifdef CONFIG_MITIGATION_RETPOLINE
 	if (exit_code == SVM_EXIT_MSR)
 		return msr_interception(vcpu);
@@ -3491,6 +3485,12 @@ int svm_invoke_exit_handler(struct kvm_vcpu *vcpu, u64 exit_code)
 		return sev_handle_vmgexit(vcpu);
 #endif
 #endif
+	if (exit_code >= ARRAY_SIZE(svm_exit_handlers))
+		goto unexpected_vmexit;
+
+	if (!svm_exit_handlers[exit_code])
+		goto unexpected_vmexit;
+
 	return svm_exit_handlers[exit_code](vcpu);
 
 unexpected_vmexit:
-- 
cgit v1.2.3


From 405fce694bd1589082a7ffd500b5a4b841c22f0d Mon Sep 17 00:00:00 2001
From: Sean Christopherson <seanjc@google.com>
Date: Tue, 30 Dec 2025 13:13:43 -0800
Subject: KVM: SVM: Filter out 64-bit exit codes when invoking exit handlers on
 bare metal

Explicitly filter out 64-bit exit codes when invoking exit handlers, as
svm_exit_handlers[] will never be sized with entries that use bits 63:32.

Processing the non-failing exit code as a 32-bit value will allow tracking
exit_code as a single 64-bit value (which it is, architecturally).  This
will also allow hardening KVM against Spectre-like attacks without needing
to do silly things to avoid build failures on 32-bit kernels
(array_index_nospec() rightly asserts that the index fits in an "unsigned
long").

Omit the check when running as a VM, as KVM has historically failed to set
bits 63:32 appropriately when synthesizing VM-Exits, i.e. KVM could get
false positives when running as a VM on an older, broken KVM/kernel.  From
a functional perspective, omitting the check is "fine", as any unwanted
collision between e.g. VMEXIT_INVALID and a 32-bit exit code will be
fatal to KVM-on-KVM regardless of what KVM-as-L1 does.

Reviewed-by: Yosry Ahmed <yosry.ahmed@linux.dev>
Link: https://patch.msgid.link/20251230211347.4099600-5-seanjc@google.com
Signed-off-by: Sean Christopherson <seanjc@google.com>
---
 arch/x86/kvm/svm/svm.c | 18 ++++++++++++++++--
 1 file changed, 16 insertions(+), 2 deletions(-)

diff --git a/arch/x86/kvm/svm/svm.c b/arch/x86/kvm/svm/svm.c
index d2f997965a96..3caf7a21679f 100644
--- a/arch/x86/kvm/svm/svm.c
+++ b/arch/x86/kvm/svm/svm.c
@@ -3467,8 +3467,22 @@ no_vmsa:
 		sev_free_decrypted_vmsa(vcpu, save);
 }
 
-int svm_invoke_exit_handler(struct kvm_vcpu *vcpu, u64 exit_code)
+int svm_invoke_exit_handler(struct kvm_vcpu *vcpu, u64 __exit_code)
 {
+	u32 exit_code = __exit_code;
+
+	/*
+	 * SVM uses negative values, i.e. 64-bit values, to indicate that VMRUN
+	 * failed.  Report all such errors to userspace (note, VMEXIT_INVALID,
+	 * a.k.a. SVM_EXIT_ERR, is special cased by svm_handle_exit()).  Skip
+	 * the check when running as a VM, as KVM has historically left garbage
+	 * in bits 63:32, i.e. running KVM-on-KVM would hit false positives if
+	 * the underlying kernel is buggy.
+	 */
+	if (!cpu_feature_enabled(X86_FEATURE_HYPERVISOR) &&
+	    (u64)exit_code != __exit_code)
+		goto unexpected_vmexit;
+
 #ifdef CONFIG_MITIGATION_RETPOLINE
 	if (exit_code == SVM_EXIT_MSR)
 		return msr_interception(vcpu);
@@ -3495,7 +3509,7 @@ int svm_invoke_exit_handler(struct kvm_vcpu *vcpu, u64 exit_code)
 
 unexpected_vmexit:
 	dump_vmcb(vcpu);
-	kvm_prepare_unexpected_reason_exit(vcpu, exit_code);
+	kvm_prepare_unexpected_reason_exit(vcpu, __exit_code);
 	return 0;
 }
 
-- 
cgit v1.2.3


From d7507a94a07202234236d7f94bed6015ca645ae6 Mon Sep 17 00:00:00 2001
From: Sean Christopherson <seanjc@google.com>
Date: Tue, 30 Dec 2025 13:13:44 -0800
Subject: KVM: SVM: Treat exit_code as an unsigned 64-bit value through all of
 KVM

Fix KVM's long-standing buggy handling of SVM's exit_code as a 32-bit
value.  Per the APM and Xen commit d1bd157fbc ("Big merge the HVM
full-virtualisation abstractions.") (which is arguably more trustworthy
than KVM), offset 0x70 is a single 64-bit value:

  070h 63:0 EXITCODE

Track exit_code as a single u64 to prevent reintroducing bugs where KVM
neglects to correctly set bits 63:32.

Fixes: 6aa8b732ca01 ("[PATCH] kvm: userspace interface")
Cc: Jim Mattson <jmattson@google.com>
Cc: Yosry Ahmed <yosry.ahmed@linux.dev>
Reviewed-by: Yosry Ahmed <yosry.ahmed@linux.dev>
Link: https://patch.msgid.link/20251230211347.4099600-6-seanjc@google.com
Signed-off-by: Sean Christopherson <seanjc@google.com>
---
 arch/x86/include/asm/svm.h                         |  3 +-
 arch/x86/include/uapi/asm/svm.h                    | 32 +++++++++----------
 arch/x86/kvm/svm/hyperv.c                          |  1 -
 arch/x86/kvm/svm/nested.c                          | 13 ++------
 arch/x86/kvm/svm/sev.c                             | 36 ++++++++--------------
 arch/x86/kvm/svm/svm.c                             |  7 ++---
 arch/x86/kvm/svm/svm.h                             |  4 +--
 arch/x86/kvm/trace.h                               |  6 ++--
 include/hyperv/hvgdk.h                             |  2 +-
 tools/testing/selftests/kvm/include/x86/svm.h      |  3 +-
 .../kvm/x86/svm_nested_soft_inject_test.c          |  4 +--
 11 files changed, 42 insertions(+), 69 deletions(-)

diff --git a/arch/x86/include/asm/svm.h b/arch/x86/include/asm/svm.h
index 50ece197c98a..edde36097ddc 100644
--- a/arch/x86/include/asm/svm.h
+++ b/arch/x86/include/asm/svm.h
@@ -137,8 +137,7 @@ struct __attribute__ ((__packed__)) vmcb_control_area {
 	u32 int_vector;
 	u32 int_state;
 	u8 reserved_3[4];
-	u32 exit_code;
-	u32 exit_code_hi;
+	u64 exit_code;
 	u64 exit_info_1;
 	u64 exit_info_2;
 	u32 exit_int_info;
diff --git a/arch/x86/include/uapi/asm/svm.h b/arch/x86/include/uapi/asm/svm.h
index 650e3256ea7d..010a45c9f614 100644
--- a/arch/x86/include/uapi/asm/svm.h
+++ b/arch/x86/include/uapi/asm/svm.h
@@ -103,38 +103,38 @@
 #define SVM_EXIT_VMGEXIT       0x403
 
 /* SEV-ES software-defined VMGEXIT events */
-#define SVM_VMGEXIT_MMIO_READ			0x80000001
-#define SVM_VMGEXIT_MMIO_WRITE			0x80000002
-#define SVM_VMGEXIT_NMI_COMPLETE		0x80000003
-#define SVM_VMGEXIT_AP_HLT_LOOP			0x80000004
-#define SVM_VMGEXIT_AP_JUMP_TABLE		0x80000005
+#define SVM_VMGEXIT_MMIO_READ			0x80000001ull
+#define SVM_VMGEXIT_MMIO_WRITE			0x80000002ull
+#define SVM_VMGEXIT_NMI_COMPLETE		0x80000003ull
+#define SVM_VMGEXIT_AP_HLT_LOOP			0x80000004ull
+#define SVM_VMGEXIT_AP_JUMP_TABLE		0x80000005ull
 #define SVM_VMGEXIT_SET_AP_JUMP_TABLE		0
 #define SVM_VMGEXIT_GET_AP_JUMP_TABLE		1
-#define SVM_VMGEXIT_PSC				0x80000010
-#define SVM_VMGEXIT_GUEST_REQUEST		0x80000011
-#define SVM_VMGEXIT_EXT_GUEST_REQUEST		0x80000012
-#define SVM_VMGEXIT_AP_CREATION			0x80000013
+#define SVM_VMGEXIT_PSC				0x80000010ull
+#define SVM_VMGEXIT_GUEST_REQUEST		0x80000011ull
+#define SVM_VMGEXIT_EXT_GUEST_REQUEST		0x80000012ull
+#define SVM_VMGEXIT_AP_CREATION			0x80000013ull
 #define SVM_VMGEXIT_AP_CREATE_ON_INIT		0
 #define SVM_VMGEXIT_AP_CREATE			1
 #define SVM_VMGEXIT_AP_DESTROY			2
-#define SVM_VMGEXIT_SNP_RUN_VMPL		0x80000018
-#define SVM_VMGEXIT_SAVIC			0x8000001a
+#define SVM_VMGEXIT_SNP_RUN_VMPL		0x80000018ull
+#define SVM_VMGEXIT_SAVIC			0x8000001aull
 #define SVM_VMGEXIT_SAVIC_REGISTER_GPA		0
 #define SVM_VMGEXIT_SAVIC_UNREGISTER_GPA	1
 #define SVM_VMGEXIT_SAVIC_SELF_GPA		~0ULL
-#define SVM_VMGEXIT_HV_FEATURES			0x8000fffd
-#define SVM_VMGEXIT_TERM_REQUEST		0x8000fffe
+#define SVM_VMGEXIT_HV_FEATURES			0x8000fffdull
+#define SVM_VMGEXIT_TERM_REQUEST		0x8000fffeull
 #define SVM_VMGEXIT_TERM_REASON(reason_set, reason_code)	\
 	/* SW_EXITINFO1[3:0] */					\
 	(((((u64)reason_set) & 0xf)) |				\
 	/* SW_EXITINFO1[11:4] */				\
 	((((u64)reason_code) & 0xff) << 4))
-#define SVM_VMGEXIT_UNSUPPORTED_EVENT		0x8000ffff
+#define SVM_VMGEXIT_UNSUPPORTED_EVENT		0x8000ffffull
 
 /* Exit code reserved for hypervisor/software use */
-#define SVM_EXIT_SW				0xf0000000
+#define SVM_EXIT_SW				0xf0000000ull
 
-#define SVM_EXIT_ERR           -1
+#define SVM_EXIT_ERR           -1ull
 
 #define SVM_EXIT_REASONS \
 	{ SVM_EXIT_READ_CR0,    "read_cr0" }, \
diff --git a/arch/x86/kvm/svm/hyperv.c b/arch/x86/kvm/svm/hyperv.c
index 088f6429b24c..3ec580d687f5 100644
--- a/arch/x86/kvm/svm/hyperv.c
+++ b/arch/x86/kvm/svm/hyperv.c
@@ -11,7 +11,6 @@ void svm_hv_inject_synthetic_vmexit_post_tlb_flush(struct kvm_vcpu *vcpu)
 	struct vcpu_svm *svm = to_svm(vcpu);
 
 	svm->vmcb->control.exit_code = HV_SVM_EXITCODE_ENL;
-	svm->vmcb->control.exit_code_hi = 0;
 	svm->vmcb->control.exit_info_1 = HV_SVM_ENL_EXITCODE_TRAP_AFTER_FLUSH;
 	svm->vmcb->control.exit_info_2 = 0;
 	nested_svm_vmexit(svm);
diff --git a/arch/x86/kvm/svm/nested.c b/arch/x86/kvm/svm/nested.c
index 666b5a36c15d..5aa0512e09c9 100644
--- a/arch/x86/kvm/svm/nested.c
+++ b/arch/x86/kvm/svm/nested.c
@@ -45,7 +45,6 @@ static void nested_svm_inject_npf_exit(struct kvm_vcpu *vcpu,
 		 * correctly fill in the high bits of exit_info_1.
 		 */
 		vmcb->control.exit_code = SVM_EXIT_NPF;
-		vmcb->control.exit_code_hi = 0;
 		vmcb->control.exit_info_1 = (1ULL << 32);
 		vmcb->control.exit_info_2 = fault->address;
 	}
@@ -441,7 +440,6 @@ void __nested_copy_vmcb_control_to_cache(struct kvm_vcpu *vcpu,
 	to->int_vector          = from->int_vector;
 	to->int_state           = from->int_state;
 	to->exit_code           = from->exit_code;
-	to->exit_code_hi        = from->exit_code_hi;
 	to->exit_info_1         = from->exit_info_1;
 	to->exit_info_2         = from->exit_info_2;
 	to->exit_int_info       = from->exit_int_info;
@@ -747,8 +745,8 @@ static void nested_vmcb02_prepare_control(struct vcpu_svm *svm,
 	enter_guest_mode(vcpu);
 
 	/*
-	 * Filled at exit: exit_code, exit_code_hi, exit_info_1, exit_info_2,
-	 * exit_int_info, exit_int_info_err, next_rip, insn_len, insn_bytes.
+	 * Filled at exit: exit_code, exit_info_1, exit_info_2, exit_int_info,
+	 * exit_int_info_err, next_rip, insn_len, insn_bytes.
 	 */
 
 	if (guest_cpu_cap_has(vcpu, X86_FEATURE_VGIF) &&
@@ -1018,7 +1016,6 @@ int nested_svm_vmrun(struct kvm_vcpu *vcpu)
 	if (!nested_vmcb_check_save(vcpu) ||
 	    !nested_vmcb_check_controls(vcpu)) {
 		vmcb12->control.exit_code    = SVM_EXIT_ERR;
-		vmcb12->control.exit_code_hi = -1u;
 		vmcb12->control.exit_info_1  = 0;
 		vmcb12->control.exit_info_2  = 0;
 		goto out;
@@ -1051,7 +1048,6 @@ out_exit_err:
 	svm->soft_int_injected = false;
 
 	svm->vmcb->control.exit_code    = SVM_EXIT_ERR;
-	svm->vmcb->control.exit_code_hi = -1u;
 	svm->vmcb->control.exit_info_1  = 0;
 	svm->vmcb->control.exit_info_2  = 0;
 
@@ -1163,7 +1159,6 @@ int nested_svm_vmexit(struct vcpu_svm *svm)
 
 	vmcb12->control.int_state         = vmcb02->control.int_state;
 	vmcb12->control.exit_code         = vmcb02->control.exit_code;
-	vmcb12->control.exit_code_hi      = vmcb02->control.exit_code_hi;
 	vmcb12->control.exit_info_1       = vmcb02->control.exit_info_1;
 	vmcb12->control.exit_info_2       = vmcb02->control.exit_info_2;
 
@@ -1460,7 +1455,7 @@ static int nested_svm_intercept_ioio(struct vcpu_svm *svm)
 
 static int nested_svm_intercept(struct vcpu_svm *svm)
 {
-	u32 exit_code = svm->vmcb->control.exit_code;
+	u64 exit_code = svm->vmcb->control.exit_code;
 	int vmexit = NESTED_EXIT_HOST;
 
 	if (svm_is_vmrun_failure(exit_code))
@@ -1532,7 +1527,6 @@ static void nested_svm_inject_exception_vmexit(struct kvm_vcpu *vcpu)
 	struct vmcb *vmcb = svm->vmcb;
 
 	vmcb->control.exit_code = SVM_EXIT_EXCP_BASE + ex->vector;
-	vmcb->control.exit_code_hi = 0;
 
 	if (ex->has_error_code)
 		vmcb->control.exit_info_1 = ex->error_code;
@@ -1708,7 +1702,6 @@ static void nested_copy_vmcb_cache_to_control(struct vmcb_control_area *dst,
 	dst->int_vector           = from->int_vector;
 	dst->int_state            = from->int_state;
 	dst->exit_code            = from->exit_code;
-	dst->exit_code_hi         = from->exit_code_hi;
 	dst->exit_info_1          = from->exit_info_1;
 	dst->exit_info_2          = from->exit_info_2;
 	dst->exit_int_info        = from->exit_int_info;
diff --git a/arch/x86/kvm/svm/sev.c b/arch/x86/kvm/svm/sev.c
index 28150506b18c..f67525007089 100644
--- a/arch/x86/kvm/svm/sev.c
+++ b/arch/x86/kvm/svm/sev.c
@@ -3270,11 +3270,6 @@ skip_vmsa_free:
 		kvfree(svm->sev_es.ghcb_sa);
 }
 
-static u64 kvm_get_cached_sw_exit_code(struct vmcb_control_area *control)
-{
-	return (((u64)control->exit_code_hi) << 32) | control->exit_code;
-}
-
 static void dump_ghcb(struct vcpu_svm *svm)
 {
 	struct vmcb_control_area *control = &svm->vmcb->control;
@@ -3296,7 +3291,7 @@ static void dump_ghcb(struct vcpu_svm *svm)
 	 */
 	pr_err("GHCB (GPA=%016llx) snapshot:\n", svm->vmcb->control.ghcb_gpa);
 	pr_err("%-20s%016llx is_valid: %u\n", "sw_exit_code",
-	       kvm_get_cached_sw_exit_code(control), kvm_ghcb_sw_exit_code_is_valid(svm));
+	       control->exit_code, kvm_ghcb_sw_exit_code_is_valid(svm));
 	pr_err("%-20s%016llx is_valid: %u\n", "sw_exit_info_1",
 	       control->exit_info_1, kvm_ghcb_sw_exit_info_1_is_valid(svm));
 	pr_err("%-20s%016llx is_valid: %u\n", "sw_exit_info_2",
@@ -3330,7 +3325,6 @@ static void sev_es_sync_from_ghcb(struct vcpu_svm *svm)
 	struct vmcb_control_area *control = &svm->vmcb->control;
 	struct kvm_vcpu *vcpu = &svm->vcpu;
 	struct ghcb *ghcb = svm->sev_es.ghcb;
-	u64 exit_code;
 
 	/*
 	 * The GHCB protocol so far allows for the following data
@@ -3364,9 +3358,7 @@ static void sev_es_sync_from_ghcb(struct vcpu_svm *svm)
 		__kvm_emulate_msr_write(vcpu, MSR_IA32_XSS, kvm_ghcb_get_xss(svm));
 
 	/* Copy the GHCB exit information into the VMCB fields */
-	exit_code = kvm_ghcb_get_sw_exit_code(svm);
-	control->exit_code = lower_32_bits(exit_code);
-	control->exit_code_hi = upper_32_bits(exit_code);
+	control->exit_code = kvm_ghcb_get_sw_exit_code(svm);
 	control->exit_info_1 = kvm_ghcb_get_sw_exit_info_1(svm);
 	control->exit_info_2 = kvm_ghcb_get_sw_exit_info_2(svm);
 	svm->sev_es.sw_scratch = kvm_ghcb_get_sw_scratch_if_valid(svm);
@@ -3379,15 +3371,8 @@ static int sev_es_validate_vmgexit(struct vcpu_svm *svm)
 {
 	struct vmcb_control_area *control = &svm->vmcb->control;
 	struct kvm_vcpu *vcpu = &svm->vcpu;
-	u64 exit_code;
 	u64 reason;
 
-	/*
-	 * Retrieve the exit code now even though it may not be marked valid
-	 * as it could help with debugging.
-	 */
-	exit_code = kvm_get_cached_sw_exit_code(control);
-
 	/* Only GHCB Usage code 0 is supported */
 	if (svm->sev_es.ghcb->ghcb_usage) {
 		reason = GHCB_ERR_INVALID_USAGE;
@@ -3401,7 +3386,7 @@ static int sev_es_validate_vmgexit(struct vcpu_svm *svm)
 	    !kvm_ghcb_sw_exit_info_2_is_valid(svm))
 		goto vmgexit_err;
 
-	switch (exit_code) {
+	switch (control->exit_code) {
 	case SVM_EXIT_READ_DR7:
 		break;
 	case SVM_EXIT_WRITE_DR7:
@@ -3502,15 +3487,19 @@ static int sev_es_validate_vmgexit(struct vcpu_svm *svm)
 	return 0;
 
 vmgexit_err:
+	/*
+	 * Print the exit code even though it may not be marked valid as it
+	 * could help with debugging.
+	 */
 	if (reason == GHCB_ERR_INVALID_USAGE) {
 		vcpu_unimpl(vcpu, "vmgexit: ghcb usage %#x is not valid\n",
 			    svm->sev_es.ghcb->ghcb_usage);
 	} else if (reason == GHCB_ERR_INVALID_EVENT) {
 		vcpu_unimpl(vcpu, "vmgexit: exit code %#llx is not valid\n",
-			    exit_code);
+			    control->exit_code);
 	} else {
 		vcpu_unimpl(vcpu, "vmgexit: exit code %#llx input is not valid\n",
-			    exit_code);
+			    control->exit_code);
 		dump_ghcb(svm);
 	}
 
@@ -4349,7 +4338,7 @@ int sev_handle_vmgexit(struct kvm_vcpu *vcpu)
 {
 	struct vcpu_svm *svm = to_svm(vcpu);
 	struct vmcb_control_area *control = &svm->vmcb->control;
-	u64 ghcb_gpa, exit_code;
+	u64 ghcb_gpa;
 	int ret;
 
 	/* Validate the GHCB */
@@ -4391,8 +4380,7 @@ int sev_handle_vmgexit(struct kvm_vcpu *vcpu)
 
 	svm_vmgexit_success(svm, 0);
 
-	exit_code = kvm_get_cached_sw_exit_code(control);
-	switch (exit_code) {
+	switch (control->exit_code) {
 	case SVM_VMGEXIT_MMIO_READ:
 		ret = setup_vmgexit_scratch(svm, true, control->exit_info_2);
 		if (ret)
@@ -4484,7 +4472,7 @@ int sev_handle_vmgexit(struct kvm_vcpu *vcpu)
 		ret = -EINVAL;
 		break;
 	default:
-		ret = svm_invoke_exit_handler(vcpu, exit_code);
+		ret = svm_invoke_exit_handler(vcpu, control->exit_code);
 	}
 
 	return ret;
diff --git a/arch/x86/kvm/svm/svm.c b/arch/x86/kvm/svm/svm.c
index 3caf7a21679f..a28cd61d87ea 100644
--- a/arch/x86/kvm/svm/svm.c
+++ b/arch/x86/kvm/svm/svm.c
@@ -2466,7 +2466,6 @@ static bool check_selective_cr0_intercepted(struct kvm_vcpu *vcpu,
 
 	if (cr0 ^ val) {
 		svm->vmcb->control.exit_code = SVM_EXIT_CR0_SEL_WRITE;
-		svm->vmcb->control.exit_code_hi = 0;
 		ret = (nested_svm_exit_handled(svm) == NESTED_EXIT_DONE);
 	}
 
@@ -3299,7 +3298,7 @@ static void dump_vmcb(struct kvm_vcpu *vcpu)
 	pr_err("%-20s%08x\n", "int_ctl:", control->int_ctl);
 	pr_err("%-20s%08x\n", "int_vector:", control->int_vector);
 	pr_err("%-20s%08x\n", "int_state:", control->int_state);
-	pr_err("%-20s%08x\n", "exit_code:", control->exit_code);
+	pr_err("%-20s%016llx\n", "exit_code:", control->exit_code);
 	pr_err("%-20s%016llx\n", "exit_info1:", control->exit_info_1);
 	pr_err("%-20s%016llx\n", "exit_info2:", control->exit_info_2);
 	pr_err("%-20s%08x\n", "exit_int_info:", control->exit_int_info);
@@ -3549,7 +3548,6 @@ static int svm_handle_exit(struct kvm_vcpu *vcpu, fastpath_t exit_fastpath)
 {
 	struct vcpu_svm *svm = to_svm(vcpu);
 	struct kvm_run *kvm_run = vcpu->run;
-	u32 exit_code = svm->vmcb->control.exit_code;
 
 	/* SEV-ES guests must use the CR write traps to track CR registers. */
 	if (!sev_es_guest(vcpu->kvm)) {
@@ -3585,7 +3583,7 @@ static int svm_handle_exit(struct kvm_vcpu *vcpu, fastpath_t exit_fastpath)
 	if (exit_fastpath != EXIT_FASTPATH_NONE)
 		return 1;
 
-	return svm_invoke_exit_handler(vcpu, exit_code);
+	return svm_invoke_exit_handler(vcpu, svm->vmcb->control.exit_code);
 }
 
 static int pre_svm_run(struct kvm_vcpu *vcpu)
@@ -4670,7 +4668,6 @@ static int svm_check_intercept(struct kvm_vcpu *vcpu,
 	if (static_cpu_has(X86_FEATURE_NRIPS))
 		vmcb->control.next_rip  = info->next_rip;
 	vmcb->control.exit_code = icpt_info.exit_code;
-	vmcb->control.exit_code_hi = 0;
 	vmexit = nested_svm_exit_handled(svm);
 
 	ret = (vmexit == NESTED_EXIT_DONE) ? X86EMUL_INTERCEPTED
diff --git a/arch/x86/kvm/svm/svm.h b/arch/x86/kvm/svm/svm.h
index 3360ac36e071..a22433680c73 100644
--- a/arch/x86/kvm/svm/svm.h
+++ b/arch/x86/kvm/svm/svm.h
@@ -160,8 +160,7 @@ struct vmcb_ctrl_area_cached {
 	u32 int_ctl;
 	u32 int_vector;
 	u32 int_state;
-	u32 exit_code;
-	u32 exit_code_hi;
+	u64 exit_code;
 	u64 exit_info_1;
 	u64 exit_info_2;
 	u32 exit_int_info;
@@ -787,7 +786,6 @@ int nested_svm_vmexit(struct vcpu_svm *svm);
 static inline int nested_svm_simple_vmexit(struct vcpu_svm *svm, u32 exit_code)
 {
 	svm->vmcb->control.exit_code	= exit_code;
-	svm->vmcb->control.exit_code_hi	= 0;
 	svm->vmcb->control.exit_info_1	= 0;
 	svm->vmcb->control.exit_info_2	= 0;
 	return nested_svm_vmexit(svm);
diff --git a/arch/x86/kvm/trace.h b/arch/x86/kvm/trace.h
index e79bc9cb7162..e7fdbe9efc90 100644
--- a/arch/x86/kvm/trace.h
+++ b/arch/x86/kvm/trace.h
@@ -383,10 +383,10 @@ TRACE_EVENT(kvm_apic,
 #define kvm_print_exit_reason(exit_reason, isa)				\
 	(isa == KVM_ISA_VMX) ?						\
 	__print_symbolic(exit_reason & 0xffff, VMX_EXIT_REASONS) :	\
-	__print_symbolic(exit_reason, SVM_EXIT_REASONS),		\
+	__print_symbolic_u64(exit_reason, SVM_EXIT_REASONS),		\
 	(isa == KVM_ISA_VMX && exit_reason & ~0xffff) ? " " : "",	\
 	(isa == KVM_ISA_VMX) ?						\
-	__print_flags(exit_reason & ~0xffff, " ", VMX_EXIT_REASON_FLAGS) : ""
+	__print_flags_u64(exit_reason & ~0xffff, " ", VMX_EXIT_REASON_FLAGS) : ""
 
 #define TRACE_EVENT_KVM_EXIT(name)					     \
 TRACE_EVENT(name,							     \
@@ -781,7 +781,7 @@ TRACE_EVENT_KVM_EXIT(kvm_nested_vmexit);
  * Tracepoint for #VMEXIT reinjected to the guest
  */
 TRACE_EVENT(kvm_nested_vmexit_inject,
-	    TP_PROTO(__u32 exit_code,
+	    TP_PROTO(__u64 exit_code,
 		     __u64 exit_info1, __u64 exit_info2,
 		     __u32 exit_int_info, __u32 exit_int_info_err, __u32 isa),
 	    TP_ARGS(exit_code, exit_info1, exit_info2,
diff --git a/include/hyperv/hvgdk.h b/include/hyperv/hvgdk.h
index dd6d4939ea29..384c3f3ff4a5 100644
--- a/include/hyperv/hvgdk.h
+++ b/include/hyperv/hvgdk.h
@@ -281,7 +281,7 @@ struct hv_vmcb_enlightenments {
 #define HV_VMCB_NESTED_ENLIGHTENMENTS		31
 
 /* Synthetic VM-Exit */
-#define HV_SVM_EXITCODE_ENL			0xf0000000
+#define HV_SVM_EXITCODE_ENL			0xf0000000ull
 #define HV_SVM_ENL_EXITCODE_TRAP_AFTER_FLUSH	(1)
 
 /* VM_PARTITION_ASSIST_PAGE */
diff --git a/tools/testing/selftests/kvm/include/x86/svm.h b/tools/testing/selftests/kvm/include/x86/svm.h
index 29cffd0a9181..10b30b38bb3f 100644
--- a/tools/testing/selftests/kvm/include/x86/svm.h
+++ b/tools/testing/selftests/kvm/include/x86/svm.h
@@ -92,8 +92,7 @@ struct __attribute__ ((__packed__)) vmcb_control_area {
 	u32 int_vector;
 	u32 int_state;
 	u8 reserved_3[4];
-	u32 exit_code;
-	u32 exit_code_hi;
+	u64 exit_code;
 	u64 exit_info_1;
 	u64 exit_info_2;
 	u32 exit_int_info;
diff --git a/tools/testing/selftests/kvm/x86/svm_nested_soft_inject_test.c b/tools/testing/selftests/kvm/x86/svm_nested_soft_inject_test.c
index 7b6481d6c0d3..4bd1655f9e6d 100644
--- a/tools/testing/selftests/kvm/x86/svm_nested_soft_inject_test.c
+++ b/tools/testing/selftests/kvm/x86/svm_nested_soft_inject_test.c
@@ -103,7 +103,7 @@ static void l1_guest_code(struct svm_test_data *svm, uint64_t is_nmi, uint64_t i
 
 	run_guest(vmcb, svm->vmcb_gpa);
 	__GUEST_ASSERT(vmcb->control.exit_code == SVM_EXIT_VMMCALL,
-		       "Expected VMMCAL #VMEXIT, got '0x%x', info1 = '0x%lx, info2 = '0x%lx'",
+		       "Expected VMMCAL #VMEXIT, got '0x%lx', info1 = '0x%lx, info2 = '0x%lx'",
 		       vmcb->control.exit_code,
 		       vmcb->control.exit_info_1, vmcb->control.exit_info_2);
 
@@ -133,7 +133,7 @@ static void l1_guest_code(struct svm_test_data *svm, uint64_t is_nmi, uint64_t i
 
 	run_guest(vmcb, svm->vmcb_gpa);
 	__GUEST_ASSERT(vmcb->control.exit_code == SVM_EXIT_HLT,
-		       "Expected HLT #VMEXIT, got '0x%x', info1 = '0x%lx, info2 = '0x%lx'",
+		       "Expected HLT #VMEXIT, got '0x%lx', info1 = '0x%lx, info2 = '0x%lx'",
 		       vmcb->control.exit_code,
 		       vmcb->control.exit_info_1, vmcb->control.exit_info_2);
 
-- 
cgit v1.2.3


From a08ca6691fd3ab40e40eb6600193672d50c7a7ba Mon Sep 17 00:00:00 2001
From: Sean Christopherson <seanjc@google.com>
Date: Tue, 30 Dec 2025 13:13:45 -0800
Subject: KVM: SVM: Limit incorrect check on SVM_EXIT_ERR to running as a VM

Limit KVM's incorrect check for VMEXIT_INVALID, a.k.a. SVM_EXIT_ERR, to
running as a VM, as detected by X86_FEATURE_HYPERVISOR.  The exit_code and
all failure codes, e.g. VMEXIT_INVALID, are 64-bit values, and so checking
only bits 31:0 could result in false positives when running on non-broken
hardware, e.g. in the extremely unlikely scenario exit code 0xffffffffull
is ever generated by hardware.

Keep the 32-bit check to play nice with running on broken KVM (for years,
KVM has not set bits 63:32 when synthesizing nested SVM VM-Exits).

Reviewed-by: Yosry Ahmed <yosry.ahmed@linux.dev>
Link: https://patch.msgid.link/20251230211347.4099600-7-seanjc@google.com
Signed-off-by: Sean Christopherson <seanjc@google.com>
---
 arch/x86/kvm/svm/svm.h | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/arch/x86/kvm/svm/svm.h b/arch/x86/kvm/svm/svm.h
index a22433680c73..338fc4f5cc4c 100644
--- a/arch/x86/kvm/svm/svm.h
+++ b/arch/x86/kvm/svm/svm.h
@@ -426,7 +426,10 @@ static __always_inline struct vcpu_svm *to_svm(struct kvm_vcpu *vcpu)
 
 static inline bool svm_is_vmrun_failure(u64 exit_code)
 {
-	return (u32)exit_code == (u32)SVM_EXIT_ERR;
+	if (cpu_feature_enabled(X86_FEATURE_HYPERVISOR))
+		return (u32)exit_code == (u32)SVM_EXIT_ERR;
+
+	return exit_code == SVM_EXIT_ERR;
 }
 
 /*
-- 
cgit v1.2.3


From 1e3dddafeceeb8d2cd182b78456cb9ca9d042a01 Mon Sep 17 00:00:00 2001
From: Sean Christopherson <seanjc@google.com>
Date: Tue, 30 Dec 2025 13:13:46 -0800
Subject: KVM: SVM: Harden exit_code against being used in Spectre-like attacks

Explicitly clamp the exit code used to index KVM's exit handlers to guard
against Spectre-like attacks, mainly to provide consistency between VMX
and SVM (VMX was given the same treatment by commit c926f2f7230b ("KVM:
x86: Protect exit_reason from being used in Spectre-v1/L1TF attacks")).

For normal VMs, it's _extremely_ unlikely the exit code could be used to
exploit a speculation vulnerability, as the exit code is set by hardware
and unexpected/unknown exit codes should be quite well bounded (as is/was
the case with VMX).  But with SEV-ES+, the exit code is guest-controlled
as it comes from the GHCB, not from hardware, i.e. an attack from the
guest is at least somewhat plausible.

Irrespective of SEV-ES+, hardening KVM is easy and inexpensive, and such
an attack is theoretically possible.

Link: https://patch.msgid.link/20251230211347.4099600-8-seanjc@google.com
Signed-off-by: Sean Christopherson <seanjc@google.com>
---
 arch/x86/kvm/svm/svm.c | 1 +
 1 file changed, 1 insertion(+)

diff --git a/arch/x86/kvm/svm/svm.c b/arch/x86/kvm/svm/svm.c
index a28cd61d87ea..e454ae095cf7 100644
--- a/arch/x86/kvm/svm/svm.c
+++ b/arch/x86/kvm/svm/svm.c
@@ -3501,6 +3501,7 @@ int svm_invoke_exit_handler(struct kvm_vcpu *vcpu, u64 __exit_code)
 	if (exit_code >= ARRAY_SIZE(svm_exit_handlers))
 		goto unexpected_vmexit;
 
+	exit_code = array_index_nospec(exit_code, ARRAY_SIZE(svm_exit_handlers));
 	if (!svm_exit_handlers[exit_code])
 		goto unexpected_vmexit;
 
-- 
cgit v1.2.3


From d6c20d19f7d3de14d02b47221988cdb19504bb84 Mon Sep 17 00:00:00 2001
From: Sean Christopherson <seanjc@google.com>
Date: Tue, 30 Dec 2025 13:13:47 -0800
Subject: KVM: SVM: Assert that Hyper-V's HV_SVM_EXITCODE_ENL == SVM_EXIT_SW

Add a build-time assertion that Hyper-V's "enlightened" exit code is the
same as the AMD-defined "Reserved for Host" exit code, mostly to help
readers connect the dots and understand why synthesizing a software-defined
exit code is safe/ok.

Reviewed-by: Vitaly Kuznetsov <vkuznets@redhat.com>
Link: https://patch.msgid.link/20251230211347.4099600-9-seanjc@google.com
Signed-off-by: Sean Christopherson <seanjc@google.com>
---
 arch/x86/kvm/svm/hyperv.c | 6 ++++++
 1 file changed, 6 insertions(+)

diff --git a/arch/x86/kvm/svm/hyperv.c b/arch/x86/kvm/svm/hyperv.c
index 3ec580d687f5..4f24dcb45116 100644
--- a/arch/x86/kvm/svm/hyperv.c
+++ b/arch/x86/kvm/svm/hyperv.c
@@ -10,6 +10,12 @@ void svm_hv_inject_synthetic_vmexit_post_tlb_flush(struct kvm_vcpu *vcpu)
 {
 	struct vcpu_svm *svm = to_svm(vcpu);
 
+	/*
+	 * The exit code used by Hyper-V for software-defined exits is reserved
+	 * by AMD specifically for such use cases.
+	 */
+	BUILD_BUG_ON(HV_SVM_EXITCODE_ENL != SVM_EXIT_SW);
+
 	svm->vmcb->control.exit_code = HV_SVM_EXITCODE_ENL;
 	svm->vmcb->control.exit_info_1 = HV_SVM_ENL_EXITCODE_TRAP_AFTER_FLUSH;
 	svm->vmcb->control.exit_info_2 = 0;
-- 
cgit v1.2.3


From 69555130dccb39df4d40f90fafc7fc79a5d55b8a Mon Sep 17 00:00:00 2001
From: Sean Christopherson <seanjc@google.com>
Date: Thu, 8 Jan 2026 19:50:37 -0800
Subject: KVM: SVM: Fix an off-by-one typo in the comment for enabling AVIC by
 default

Fix a goof in the comment that documents KVM's logic for enabling AVIC by
default to reference Zen5+ as family 0x1A (Zen5), not family 0x19 (Zen4).
The code is correct (checks for _greater_ than 0x19), only the comment is
flawed.

Opportunistically tweak the check too, even though it's already correct,
so that both the comment and the code reference 0x1A, and so that the
checks are "ascending", i.e. check Zen4 and then Zen5+.

No functional change intended.

Fixes: ca2967de5a5b ("KVM: SVM: Enable AVIC by default for Zen4+ if x2AVIC is support")
Acked-by: Naveen N Rao (AMD) <naveen@kernel.org>
Link: https://patch.msgid.link/20260109035037.1015073-1-seanjc@google.com
Signed-off-by: Sean Christopherson <seanjc@google.com>
---
 arch/x86/kvm/svm/avic.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/arch/x86/kvm/svm/avic.c b/arch/x86/kvm/svm/avic.c
index 6b77b2033208..e8acac56da5b 100644
--- a/arch/x86/kvm/svm/avic.c
+++ b/arch/x86/kvm/svm/avic.c
@@ -1224,13 +1224,13 @@ static bool __init avic_want_avic_enabled(void)
 	 * In "auto" mode, enable AVIC by default for Zen4+ if x2AVIC is
 	 * supported (to avoid enabling partial support by default, and because
 	 * x2AVIC should be supported by all Zen4+ CPUs).  Explicitly check for
-	 * family 0x19 and later (Zen5+), as the kernel's synthetic ZenX flags
+	 * family 0x1A and later (Zen5+), as the kernel's synthetic ZenX flags
 	 * aren't inclusive of previous generations, i.e. the kernel will set
 	 * at most one ZenX feature flag.
 	 */
 	if (avic == AVIC_AUTO_MODE)
 		avic = boot_cpu_has(X86_FEATURE_X2AVIC) &&
-		       (boot_cpu_data.x86 > 0x19 || cpu_feature_enabled(X86_FEATURE_ZEN4));
+		       (cpu_feature_enabled(X86_FEATURE_ZEN4) || boot_cpu_data.x86 >= 0x1A);
 
 	if (!avic || !npt_enabled)
 		return false;
-- 
cgit v1.2.3


From f00ccdede3c84df2287e59b546fd92d58b7e07af Mon Sep 17 00:00:00 2001
From: Yosry Ahmed <yosry.ahmed@linux.dev>
Date: Tue, 13 Jan 2026 17:28:07 +0000
Subject: KVM: nSVM: Drop redundant/wrong comment in
 nested_vmcb02_prepare_save()

The comment above DR6 and DR7 initializations is redundant, because the
entire function follows the same pattern of only initializing the fields
in vmcb02 if the vmcb12 changed or the fields are dirty, which handles
the first execution case.

Also, the comment refers to new_vmcb12 as new_vmcs12. Just drop the
comment.

No functional change intended.

Signed-off-by: Yosry Ahmed <yosry.ahmed@linux.dev>
Link: https://patch.msgid.link/20260113172807.2178526-1-yosry.ahmed@linux.dev
Signed-off-by: Sean Christopherson <seanjc@google.com>
---
 arch/x86/kvm/svm/nested.c | 1 -
 1 file changed, 1 deletion(-)

diff --git a/arch/x86/kvm/svm/nested.c b/arch/x86/kvm/svm/nested.c
index 5aa0512e09c9..79cb85b8a156 100644
--- a/arch/x86/kvm/svm/nested.c
+++ b/arch/x86/kvm/svm/nested.c
@@ -681,7 +681,6 @@ static void nested_vmcb02_prepare_save(struct vcpu_svm *svm, struct vmcb *vmcb12
 	vmcb02->save.rsp = vmcb12->save.rsp;
 	vmcb02->save.rip = vmcb12->save.rip;
 
-	/* These bits will be set properly on the first execution when new_vmc12 is true */
 	if (unlikely(new_vmcb12 || vmcb_is_dirty(vmcb12, VMCB_DR))) {
 		vmcb02->save.dr7 = svm->nested.save.dr7 | DR7_FIXED_1;
 		svm->vcpu.arch.dr6  = svm->nested.save.dr6 | DR6_ACTIVE_LOW;
-- 
cgit v1.2.3


From 127ccae2c185f62e6ecb4bf24f9cb307e9b9c619 Mon Sep 17 00:00:00 2001
From: Yosry Ahmed <yosry.ahmed@linux.dev>
Date: Sat, 10 Jan 2026 00:48:18 +0000
Subject: KVM: nSVM: Always use vmcb01 in VMLOAD/VMSAVE emulation

Commit cc3ed80ae69f ("KVM: nSVM: always use vmcb01 to for vmsave/vmload
of guest state") made KVM always use vmcb01 for the fields controlled by
VMSAVE/VMLOAD, but it missed updating the VMLOAD/VMSAVE emulation code
to always use vmcb01.

As a result, if VMSAVE/VMLOAD is executed by an L2 guest and is not
intercepted by L1, KVM will mistakenly use vmcb02. Always use vmcb01
instead of the current VMCB.

Fixes: cc3ed80ae69f ("KVM: nSVM: always use vmcb01 to for vmsave/vmload of guest state")
Cc: Maxim Levitsky <mlevitsk@redhat.com>
Cc: stable@vger.kernel.org
Signed-off-by: Yosry Ahmed <yosry.ahmed@linux.dev>
Link: https://patch.msgid.link/20260110004821.3411245-2-yosry.ahmed@linux.dev
Signed-off-by: Sean Christopherson <seanjc@google.com>
---
 arch/x86/kvm/svm/svm.c | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/arch/x86/kvm/svm/svm.c b/arch/x86/kvm/svm/svm.c
index e454ae095cf7..f1a5b61bdb5b 100644
--- a/arch/x86/kvm/svm/svm.c
+++ b/arch/x86/kvm/svm/svm.c
@@ -2122,12 +2122,13 @@ static int vmload_vmsave_interception(struct kvm_vcpu *vcpu, bool vmload)
 
 	ret = kvm_skip_emulated_instruction(vcpu);
 
+	/* KVM always performs VMLOAD/VMSAVE on VMCB01 (see __svm_vcpu_run()) */
 	if (vmload) {
-		svm_copy_vmloadsave_state(svm->vmcb, vmcb12);
+		svm_copy_vmloadsave_state(svm->vmcb01.ptr, vmcb12);
 		svm->sysenter_eip_hi = 0;
 		svm->sysenter_esp_hi = 0;
 	} else {
-		svm_copy_vmloadsave_state(vmcb12, svm->vmcb);
+		svm_copy_vmloadsave_state(vmcb12, svm->vmcb01.ptr);
 	}
 
 	kvm_vcpu_unmap(vcpu, &map);
-- 
cgit v1.2.3


From 55780d8a1dcc93d2c4b33c565ada88df12c9f206 Mon Sep 17 00:00:00 2001
From: Yosry Ahmed <yosry.ahmed@linux.dev>
Date: Sat, 10 Jan 2026 00:48:19 +0000
Subject: KVM: SVM: Stop toggling virtual VMSAVE/VMLOAD on intercept recalc

Virtual VMSAVE/VMLOAD enablement (i.e.
VIRTUAL_VMLOAD_VMSAVE_ENABLE_MASK) is set/cleared by
svm_recalc_instruction_intercepts() when the intercepts are cleared/set.
This is unnecessary because the bit is meaningless when intercepts are
set and KVM emulates the instructions. Initialize the bit in vmcb01 based
on vls, and keep it unchanged.

This is similar-ish to how vGIF is handled. It is enabled in init_vmcb()
if vgif=1 and remains unchanged when the STGI intercept is enabled (e.g.
for NMI windows).

This fixes a bug in svm_recalc_instruction_intercepts(). The intercepts
for VMSAVE/VMLOAD are always toggled in vmcb01, but
VIRTUAL_VMLOAD_VMSAVE_ENABLE_MASK is toggled in the current VMCB, which
could be vmcb02 instead of vmcb01 if L2 is active.

Virtual VMSAVE/VMLOAD enablement in vmcb02 is separately controlled by
nested_vmcb02_prepare_control() based on the vCPU features and VMCB12,
and if intercepts are needed they are set by recalc_intercepts().

The bug is benign though. Not toggling the bit for vmcb01 is harmless
because it's useless anyway. For vmcb02:

- The bit could be incorrectly cleared when intercepts are set in
  vmcb01. This is harmless because VMSAVE/VMLOAD will be emulated by KVM
  anyway.

- The bit could be incorrectly set when the intercepts are cleared in
  vmcb01. However, if the bit was originally clear in vmcb02, then
  recalc_intercepts() will enable the intercepts in vmcb02 anyway and
  VMSAVE/VMLOAD will be emulated by KVM.

Signed-off-by: Yosry Ahmed <yosry.ahmed@linux.dev>
Link: https://patch.msgid.link/20260110004821.3411245-3-yosry.ahmed@linux.dev
Signed-off-by: Sean Christopherson <seanjc@google.com>
---
 arch/x86/kvm/svm/svm.c | 10 ++++++++--
 1 file changed, 8 insertions(+), 2 deletions(-)

diff --git a/arch/x86/kvm/svm/svm.c b/arch/x86/kvm/svm/svm.c
index f1a5b61bdb5b..5eadecc5246c 100644
--- a/arch/x86/kvm/svm/svm.c
+++ b/arch/x86/kvm/svm/svm.c
@@ -995,10 +995,14 @@ static void svm_recalc_instruction_intercepts(struct kvm_vcpu *vcpu)
 			svm_set_intercept(svm, INTERCEPT_RDTSCP);
 	}
 
+	/*
+	 * No need to toggle VIRTUAL_VMLOAD_VMSAVE_ENABLE_MASK here, it is
+	 * always set if vls is enabled. If the intercepts are set, the bit is
+	 * meaningless anyway.
+	 */
 	if (guest_cpuid_is_intel_compatible(vcpu)) {
 		svm_set_intercept(svm, INTERCEPT_VMLOAD);
 		svm_set_intercept(svm, INTERCEPT_VMSAVE);
-		svm->vmcb->control.virt_ext &= ~VIRTUAL_VMLOAD_VMSAVE_ENABLE_MASK;
 	} else {
 		/*
 		 * If hardware supports Virtual VMLOAD VMSAVE then enable it
@@ -1007,7 +1011,6 @@ static void svm_recalc_instruction_intercepts(struct kvm_vcpu *vcpu)
 		if (vls) {
 			svm_clr_intercept(svm, INTERCEPT_VMLOAD);
 			svm_clr_intercept(svm, INTERCEPT_VMSAVE);
-			svm->vmcb->control.virt_ext |= VIRTUAL_VMLOAD_VMSAVE_ENABLE_MASK;
 		}
 	}
 }
@@ -1155,6 +1158,9 @@ static void init_vmcb(struct kvm_vcpu *vcpu, bool init_event)
 		svm->vmcb->control.int_ctl |= V_GIF_ENABLE_MASK;
 	}
 
+	if (vls)
+		svm->vmcb->control.virt_ext |= VIRTUAL_VMLOAD_VMSAVE_ENABLE_MASK;
+
 	if (vcpu->kvm->arch.bus_lock_detection_enabled)
 		svm_set_intercept(svm, INTERCEPT_BUSLOCK);
 
-- 
cgit v1.2.3


From fa9893fadbc245e179cb17f3c371c67471b5a8a8 Mon Sep 17 00:00:00 2001
From: Michael Roth <michael.roth@amd.com>
Date: Fri, 9 Jan 2026 17:17:32 -0600
Subject: KVM: Introduce KVM_EXIT_SNP_REQ_CERTS for SNP certificate-fetching

For SEV-SNP, the host can optionally provide a certificate table to the
guest when it issues an attestation request to firmware (see GHCB 2.0
specification regarding "SNP Extended Guest Requests"). This certificate
table can then be used to verify the endorsement key used by firmware to
sign the attestation report.

While it is possible for guests to obtain the certificates through other
means, handling it via the host provides more flexibility in being able
to keep the certificate data in sync with the endorsement key throughout
host-side operations that might result in the endorsement key
changing.

In the case of KVM, userspace will be responsible for fetching the
certificate table and keeping it in sync with any modifications to the
endorsement key by other userspace management tools. Define a new
KVM_EXIT_SNP_REQ_CERTS event where userspace is provided with the GPA of
the buffer the guest has provided as part of the attestation request so
that userspace can write the certificate data into it while relying on
filesystem-based locking to keep the certificates up-to-date relative to
the endorsement keys installed/utilized by firmware at the time the
certificates are fetched.

[Melody: Update the documentation scheme about how file locking is
         expected to happen.]

Reviewed-by: Liam Merwick <liam.merwick@oracle.com>
Tested-by: Liam Merwick <liam.merwick@oracle.com>
Tested-by: Dionna Glaze <dionnaglaze@google.com>
Signed-off-by: Michael Roth <michael.roth@amd.com>
Signed-off-by: Melody Wang <huibo.wang@amd.com>
Signed-off-by: Michael Roth <michael.roth@amd.com>
Link: https://patch.msgid.link/20260109231732.1160759-2-michael.roth@amd.com
Signed-off-by: Sean Christopherson <seanjc@google.com>
---
 Documentation/virt/kvm/api.rst | 44 ++++++++++++++++++++++++++++++
 arch/x86/kvm/svm/sev.c         | 62 ++++++++++++++++++++++++++++++++++++++----
 arch/x86/kvm/svm/svm.h         |  1 +
 include/uapi/linux/kvm.h       |  9 ++++++
 4 files changed, 110 insertions(+), 6 deletions(-)

diff --git a/Documentation/virt/kvm/api.rst b/Documentation/virt/kvm/api.rst
index 01a3abef8abb..428d7d9cb4d6 100644
--- a/Documentation/virt/kvm/api.rst
+++ b/Documentation/virt/kvm/api.rst
@@ -7353,6 +7353,50 @@ Please note that the kernel is allowed to use the kvm_run structure as the
 primary storage for certain register types. Therefore, the kernel may use the
 values in kvm_run even if the corresponding bit in kvm_dirty_regs is not set.
 
+::
+
+		/* KVM_EXIT_SNP_REQ_CERTS */
+		struct kvm_exit_snp_req_certs {
+			__u64 gpa;
+			__u64 npages;
+			__u64 ret;
+		};
+
+KVM_EXIT_SNP_REQ_CERTS indicates an SEV-SNP guest with certificate-fetching
+enabled (see KVM_SEV_SNP_ENABLE_REQ_CERTS) has generated an Extended Guest
+Request NAE #VMGEXIT (SNP_GUEST_REQUEST) with message type MSG_REPORT_REQ,
+i.e. has requested an attestation report from firmware, and would like the
+certificate data corresponding to the attestation report signature to be
+provided by the hypervisor as part of the request.
+
+To allow for userspace to provide the certificate, the 'gpa' and 'npages'
+are forwarded verbatim from the guest request (the RAX and RBX GHCB fields
+respectively).  'ret' is not an "output" from KVM, and is always '0' on
+exit.  KVM verifies the 'gpa' is 4KiB aligned prior to exiting to userspace,
+but otherwise the information from the guest isn't validated.
+
+Upon the next KVM_RUN, e.g. after userspace has serviced the request (or not),
+KVM will complete the #VMGEXIT, using the 'ret' field to determine whether to
+signal success or failure to the guest, and on failure, what reason code will
+be communicated via SW_EXITINFO2.  If 'ret' is set to an unsupported value (see
+the table below), KVM_RUN will fail with -EINVAL.  For a 'ret' of 'ENOSPC', KVM
+also consumes the 'npages' field, i.e. userspace can use the field to inform
+the guest of the number of pages needed to hold all the certificate data.
+
+The supported 'ret' values and their respective SW_EXITINFO2 encodings:
+
+  ======     =============================================================
+  0          0x0, i.e. success.  KVM will emit an SNP_GUEST_REQUEST command
+             to SNP firmware.
+  ENOSPC     0x0000000100000000, i.e. not enough guest pages to hold the
+             certificate table and certificate data.  KVM will also set the
+             RBX field in the GHCB to 'npages'.
+  EAGAIN     0x0000000200000000, i.e. the host is busy and the guest should
+             retry the request.
+  EIO        0xffffffff00000000, for all other errors (this return code is
+             a KVM-defined hypervisor value, as allowed by the GHCB)
+  ======     =============================================================
+
 
 .. _cap_enable:
 
diff --git a/arch/x86/kvm/svm/sev.c b/arch/x86/kvm/svm/sev.c
index f67525007089..9e6a78e448f2 100644
--- a/arch/x86/kvm/svm/sev.c
+++ b/arch/x86/kvm/svm/sev.c
@@ -41,6 +41,16 @@
 
 #define GHCB_HV_FT_SUPPORTED	(GHCB_HV_FT_SNP | GHCB_HV_FT_SNP_AP_CREATION)
 
+/*
+ * The GHCB spec essentially states that all non-zero error codes other than
+ * those explicitly defined above should be treated as an error by the guest.
+ * Define a generic error to cover that case, and choose a value that is not
+ * likely to overlap with new explicit error codes should more be added to
+ * the GHCB spec later. KVM will use this to report generic errors when
+ * handling SNP guest requests.
+ */
+#define SNP_GUEST_VMM_ERR_GENERIC       (~0U)
+
 /* enable/disable SEV support */
 static bool sev_enabled = true;
 module_param_named(sev, sev_enabled, bool, 0444);
@@ -4139,6 +4149,36 @@ out_unlock:
 	return ret;
 }
 
+static int snp_req_certs_err(struct vcpu_svm *svm, u32 vmm_error)
+{
+	ghcb_set_sw_exit_info_2(svm->sev_es.ghcb, SNP_GUEST_ERR(vmm_error, 0));
+
+	return 1; /* resume guest */
+}
+
+static int snp_complete_req_certs(struct kvm_vcpu *vcpu)
+{
+	struct vcpu_svm *svm = to_svm(vcpu);
+	struct vmcb_control_area *control = &svm->vmcb->control;
+
+	switch (READ_ONCE(vcpu->run->snp_req_certs.ret)) {
+	case 0:
+		return snp_handle_guest_req(svm, control->exit_info_1,
+					    control->exit_info_2);
+	case ENOSPC:
+		vcpu->arch.regs[VCPU_REGS_RBX] = vcpu->run->snp_req_certs.npages;
+		return snp_req_certs_err(svm, SNP_GUEST_VMM_ERR_INVALID_LEN);
+	case EAGAIN:
+		return snp_req_certs_err(svm, SNP_GUEST_VMM_ERR_BUSY);
+	case EIO:
+		return snp_req_certs_err(svm, SNP_GUEST_VMM_ERR_GENERIC);
+	default:
+		break;
+	}
+
+	return -EINVAL;
+}
+
 static int snp_handle_ext_guest_req(struct vcpu_svm *svm, gpa_t req_gpa, gpa_t resp_gpa)
 {
 	struct kvm *kvm = svm->vcpu.kvm;
@@ -4154,14 +4194,15 @@ static int snp_handle_ext_guest_req(struct vcpu_svm *svm, gpa_t req_gpa, gpa_t r
 	/*
 	 * As per GHCB spec, requests of type MSG_REPORT_REQ also allow for
 	 * additional certificate data to be provided alongside the attestation
-	 * report via the guest-provided data pages indicated by RAX/RBX. The
-	 * certificate data is optional and requires additional KVM enablement
-	 * to provide an interface for userspace to provide it, but KVM still
-	 * needs to be able to handle extended guest requests either way. So
-	 * provide a stub implementation that will always return an empty
-	 * certificate table in the guest-provided data pages.
+	 * report via the guest-provided data pages indicated by RAX/RBX. If
+	 * userspace enables KVM_EXIT_SNP_REQ_CERTS, then exit to userspace
+	 * to give userspace an opportunity to provide the certificate data
+	 * before issuing/completing the attestation request. Otherwise, return
+	 * an empty certificate table in the guest-provided data pages and
+	 * handle the attestation request immediately.
 	 */
 	if (msg_type == SNP_MSG_REPORT_REQ) {
+		struct kvm_sev_info *sev = &to_kvm_svm(kvm)->sev_info;
 		struct kvm_vcpu *vcpu = &svm->vcpu;
 		u64 data_npages;
 		gpa_t data_gpa;
@@ -4175,6 +4216,15 @@ static int snp_handle_ext_guest_req(struct vcpu_svm *svm, gpa_t req_gpa, gpa_t r
 		if (!PAGE_ALIGNED(data_gpa))
 			goto request_invalid;
 
+		if (sev->snp_certs_enabled) {
+			vcpu->run->exit_reason = KVM_EXIT_SNP_REQ_CERTS;
+			vcpu->run->snp_req_certs.gpa = data_gpa;
+			vcpu->run->snp_req_certs.npages = data_npages;
+			vcpu->run->snp_req_certs.ret = 0;
+			vcpu->arch.complete_userspace_io = snp_complete_req_certs;
+			return 0;
+		}
+
 		/*
 		 * As per GHCB spec (see "SNP Extended Guest Request"), the
 		 * certificate table is terminated by 24-bytes of zeroes.
diff --git a/arch/x86/kvm/svm/svm.h b/arch/x86/kvm/svm/svm.h
index 338fc4f5cc4c..ebd7b36b1ceb 100644
--- a/arch/x86/kvm/svm/svm.h
+++ b/arch/x86/kvm/svm/svm.h
@@ -115,6 +115,7 @@ struct kvm_sev_info {
 	void *guest_resp_buf;   /* Bounce buffer for SNP Guest Request output */
 	struct mutex guest_req_mutex; /* Must acquire before using bounce buffers */
 	cpumask_var_t have_run_cpus; /* CPUs that have done VMRUN for this VM. */
+	bool snp_certs_enabled;	/* SNP certificate-fetching support. */
 };
 
 struct kvm_svm {
diff --git a/include/uapi/linux/kvm.h b/include/uapi/linux/kvm.h
index dddb781b0507..8cd107cdcf0b 100644
--- a/include/uapi/linux/kvm.h
+++ b/include/uapi/linux/kvm.h
@@ -135,6 +135,12 @@ struct kvm_xen_exit {
 	} u;
 };
 
+struct kvm_exit_snp_req_certs {
+	__u64 gpa;
+	__u64 npages;
+	__u64 ret;
+};
+
 #define KVM_S390_GET_SKEYS_NONE   1
 #define KVM_S390_SKEYS_MAX        1048576
 
@@ -180,6 +186,7 @@ struct kvm_xen_exit {
 #define KVM_EXIT_MEMORY_FAULT     39
 #define KVM_EXIT_TDX              40
 #define KVM_EXIT_ARM_SEA          41
+#define KVM_EXIT_SNP_REQ_CERTS    42
 
 /* For KVM_EXIT_INTERNAL_ERROR */
 /* Emulate instruction failed. */
@@ -482,6 +489,8 @@ struct kvm_run {
 			__u64 gva;
 			__u64 gpa;
 		} arm_sea;
+		/* KVM_EXIT_SNP_REQ_CERTS */
+		struct kvm_exit_snp_req_certs snp_req_certs;
 		/* Fix the size of the union. */
 		char padding[256];
 	};
-- 
cgit v1.2.3


From 20c3c4108d58f87c711bf44cb0b498b3ac5af6bf Mon Sep 17 00:00:00 2001
From: Michael Roth <michael.roth@amd.com>
Date: Fri, 9 Jan 2026 17:17:33 -0600
Subject: KVM: SEV: Add KVM_SEV_SNP_ENABLE_REQ_CERTS command

Introduce a new command for KVM_MEMORY_ENCRYPT_OP ioctl that can be used
to enable fetching of endorsement key certificates from userspace via
the new KVM_EXIT_SNP_REQ_CERTS exit type. Also introduce a new
KVM_X86_SEV_SNP_REQ_CERTS KVM device attribute so that userspace can
query whether the kernel supports the new command/exit.

Suggested-by: Sean Christopherson <seanjc@google.com>
Reviewed-by: Liam Merwick <liam.merwick@oracle.com>
Tested-by: Liam Merwick <liam.merwick@oracle.com>
Signed-off-by: Michael Roth <michael.roth@amd.com>
Link: https://patch.msgid.link/20260109231732.1160759-3-michael.roth@amd.com
Signed-off-by: Sean Christopherson <seanjc@google.com>
---
 .../virt/kvm/x86/amd-memory-encryption.rst         | 52 +++++++++++++++++++++-
 arch/x86/include/uapi/asm/kvm.h                    |  2 +
 arch/x86/kvm/svm/sev.c                             | 16 +++++++
 3 files changed, 69 insertions(+), 1 deletion(-)

diff --git a/Documentation/virt/kvm/x86/amd-memory-encryption.rst b/Documentation/virt/kvm/x86/amd-memory-encryption.rst
index 1ddb6a86ce7f..543b5e5dd8d4 100644
--- a/Documentation/virt/kvm/x86/amd-memory-encryption.rst
+++ b/Documentation/virt/kvm/x86/amd-memory-encryption.rst
@@ -572,6 +572,52 @@ Returns: 0 on success, -negative on error
 See SNP_LAUNCH_FINISH in the SEV-SNP specification [snp-fw-abi]_ for further
 details on the input parameters in ``struct kvm_sev_snp_launch_finish``.
 
+21. KVM_SEV_SNP_ENABLE_REQ_CERTS
+--------------------------------
+
+The KVM_SEV_SNP_ENABLE_REQ_CERTS command will configure KVM to exit to
+userspace with a ``KVM_EXIT_SNP_REQ_CERTS`` exit type as part of handling
+a guest attestation report, which will allow userspace to provide a
+certificate corresponding to the endorsement key used by firmware to sign
+that attestation report.
+
+Returns: 0 on success, -negative on error
+
+NOTE: The endorsement key used by firmware may change as a result of
+management activities like updating SEV-SNP firmware or loading new
+endorsement keys, so some care should be taken to keep the returned
+certificate data in sync with the actual endorsement key in use by
+firmware at the time the attestation request is sent to SNP firmware. The
+recommended scheme to do this is to use file locking (e.g. via fcntl()'s
+F_OFD_SETLK) in the following manner:
+
+  - Prior to obtaining/providing certificate data as part of servicing an
+    exit type of ``KVM_EXIT_SNP_REQ_CERTS``, the VMM should obtain a
+    shared/read or exclusive/write lock on the certificate blob file before
+    reading it and returning it to KVM, and continue to hold the lock until
+    the attestation request is actually sent to firmware. To facilitate
+    this, the VMM can set the ``immediate_exit`` flag of kvm_run just after
+    supplying the certificate data, and just before resuming the vCPU.
+    This will ensure the vCPU will exit again to userspace with ``-EINTR``
+    after it finishes fetching the attestation request from firmware, at
+    which point the VMM can safely drop the file lock.
+
+  - Tools/libraries that perform updates to SNP firmware TCB values or
+    endorsement keys (e.g. via /dev/sev interfaces such as ``SNP_COMMIT``,
+    ``SNP_SET_CONFIG``, or ``SNP_VLEK_LOAD``, see
+    Documentation/virt/coco/sev-guest.rst for more details) in such a way
+    that the certificate blob needs to be updated, should similarly take an
+    exclusive lock on the certificate blob for the duration of any updates
+    to endorsement keys or the certificate blob contents to ensure that
+    VMMs using the above scheme will not return certificate blob data that
+    is out of sync with the endorsement key used by firmware at the time
+    the attestation request is actually issued.
+
+This scheme is recommended so that tools can use a fairly generic/natural
+approach to synchronizing firmware/certificate updates via file-locking,
+which should make it easier to maintain interoperability across
+tools/VMMs/vendors.
+
 Device attribute API
 ====================
 
@@ -579,11 +625,15 @@ Attributes of the SEV implementation can be retrieved through the
 ``KVM_HAS_DEVICE_ATTR`` and ``KVM_GET_DEVICE_ATTR`` ioctls on the ``/dev/kvm``
 device node, using group ``KVM_X86_GRP_SEV``.
 
-Currently only one attribute is implemented:
+The following attributes are currently implemented:
 
 * ``KVM_X86_SEV_VMSA_FEATURES``: return the set of all bits that
   are accepted in the ``vmsa_features`` of ``KVM_SEV_INIT2``.
 
+* ``KVM_X86_SEV_SNP_REQ_CERTS``: return a value of 1 if the kernel supports the
+  ``KVM_EXIT_SNP_REQ_CERTS`` exit, which allows for fetching endorsement key
+  certificates from userspace for each SNP attestation request the guest issues.
+
 Firmware Management
 ===================
 
diff --git a/arch/x86/include/uapi/asm/kvm.h b/arch/x86/include/uapi/asm/kvm.h
index 7ceff6583652..b2c928c5965d 100644
--- a/arch/x86/include/uapi/asm/kvm.h
+++ b/arch/x86/include/uapi/asm/kvm.h
@@ -503,6 +503,7 @@ struct kvm_sync_regs {
 #define KVM_X86_GRP_SEV			1
 #  define KVM_X86_SEV_VMSA_FEATURES	0
 #  define KVM_X86_SNP_POLICY_BITS	1
+#  define KVM_X86_SEV_SNP_REQ_CERTS	2
 
 struct kvm_vmx_nested_state_data {
 	__u8 vmcs12[KVM_STATE_NESTED_VMX_VMCS_SIZE];
@@ -743,6 +744,7 @@ enum sev_cmd_id {
 	KVM_SEV_SNP_LAUNCH_START = 100,
 	KVM_SEV_SNP_LAUNCH_UPDATE,
 	KVM_SEV_SNP_LAUNCH_FINISH,
+	KVM_SEV_SNP_ENABLE_REQ_CERTS,
 
 	KVM_SEV_NR_MAX,
 };
diff --git a/arch/x86/kvm/svm/sev.c b/arch/x86/kvm/svm/sev.c
index 9e6a78e448f2..f9aad5c1447e 100644
--- a/arch/x86/kvm/svm/sev.c
+++ b/arch/x86/kvm/svm/sev.c
@@ -2158,6 +2158,9 @@ int sev_dev_get_attr(u32 group, u64 attr, u64 *val)
 		*val = snp_supported_policy_bits;
 		return 0;
 
+	case KVM_X86_SEV_SNP_REQ_CERTS:
+		*val = sev_snp_enabled ? 1 : 0;
+		return 0;
 	default:
 		return -ENXIO;
 	}
@@ -2574,6 +2577,16 @@ e_free:
 	return ret;
 }
 
+static int snp_enable_certs(struct kvm *kvm)
+{
+	if (kvm->created_vcpus || !sev_snp_guest(kvm))
+		return -EINVAL;
+
+	to_kvm_sev_info(kvm)->snp_certs_enabled = true;
+
+	return 0;
+}
+
 int sev_mem_enc_ioctl(struct kvm *kvm, void __user *argp)
 {
 	struct kvm_sev_cmd sev_cmd;
@@ -2679,6 +2692,9 @@ int sev_mem_enc_ioctl(struct kvm *kvm, void __user *argp)
 	case KVM_SEV_SNP_LAUNCH_FINISH:
 		r = snp_launch_finish(kvm, &sev_cmd);
 		break;
+	case KVM_SEV_SNP_ENABLE_REQ_CERTS:
+		r = snp_enable_certs(kvm);
+		break;
 	default:
 		r = -EINVAL;
 		goto out;
-- 
cgit v1.2.3