summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorLinus Torvalds <torvalds@linux-foundation.org>2026-01-13 09:50:07 -0800
committerLinus Torvalds <torvalds@linux-foundation.org>2026-01-13 09:50:07 -0800
commit0bb933a9fcdee14ef82970caeb8617ad59a11303 (patch)
tree63280e6ec409a0bf98948dc458195878d66de12c
parentafd12f914c8f733a9660c97bdbe539aee88e742b (diff)
parent3611ca7c12b740e250d83f8bbe3554b740c503b0 (diff)
Merge tag 'for-linus' of git://git.kernel.org/pub/scm/virt/kvm/kvm
Pull x86 kvm fixes from Paolo Bonzini: - Avoid freeing stack-allocated node in kvm_async_pf_queue_task - Clear XSTATE_BV[i] in guest XSAVE state whenever XFD[i]=1 * tag 'for-linus' of git://git.kernel.org/pub/scm/virt/kvm/kvm: selftests: kvm: Verify TILELOADD actually #NM faults when XFD[18]=1 selftests: kvm: try getting XFD and XSAVE state out of sync selftests: kvm: replace numbered sync points with actions x86/fpu: Clear XSTATE_BV[i] in guest XSAVE state whenever XFD[i]=1 x86/kvm: Avoid freeing stack-allocated node in kvm_async_pf_queue_task
-rw-r--r--arch/x86/kernel/fpu/core.c32
-rw-r--r--arch/x86/kernel/kvm.c19
-rw-r--r--arch/x86/kvm/x86.c9
-rw-r--r--tools/testing/selftests/kvm/x86/amx_test.c144
4 files changed, 139 insertions, 65 deletions
diff --git a/arch/x86/kernel/fpu/core.c b/arch/x86/kernel/fpu/core.c
index da233f20ae6f..608983806fd7 100644
--- a/arch/x86/kernel/fpu/core.c
+++ b/arch/x86/kernel/fpu/core.c
@@ -319,10 +319,29 @@ EXPORT_SYMBOL_FOR_KVM(fpu_enable_guest_xfd_features);
#ifdef CONFIG_X86_64
void fpu_update_guest_xfd(struct fpu_guest *guest_fpu, u64 xfd)
{
+ struct fpstate *fpstate = guest_fpu->fpstate;
+
fpregs_lock();
- guest_fpu->fpstate->xfd = xfd;
- if (guest_fpu->fpstate->in_use)
- xfd_update_state(guest_fpu->fpstate);
+
+ /*
+ * KVM's guest ABI is that setting XFD[i]=1 *can* immediately revert the
+ * save state to its initial configuration. Likewise, KVM_GET_XSAVE does
+ * the same as XSAVE and returns XSTATE_BV[i]=0 whenever XFD[i]=1.
+ *
+ * If the guest's FPU state is in hardware, just update XFD: the XSAVE
+ * in fpu_swap_kvm_fpstate will clear XSTATE_BV[i] whenever XFD[i]=1.
+ *
+ * If however the guest's FPU state is NOT resident in hardware, clear
+ * disabled components in XSTATE_BV now, or a subsequent XRSTOR will
+ * attempt to load disabled components and generate #NM _in the host_.
+ */
+ if (xfd && test_thread_flag(TIF_NEED_FPU_LOAD))
+ fpstate->regs.xsave.header.xfeatures &= ~xfd;
+
+ fpstate->xfd = xfd;
+ if (fpstate->in_use)
+ xfd_update_state(fpstate);
+
fpregs_unlock();
}
EXPORT_SYMBOL_FOR_KVM(fpu_update_guest_xfd);
@@ -431,6 +450,13 @@ int fpu_copy_uabi_to_guest_fpstate(struct fpu_guest *gfpu, const void *buf,
return -EINVAL;
/*
+ * Disabled features must be in their initial state, otherwise XRSTOR
+ * causes an exception.
+ */
+ if (WARN_ON_ONCE(ustate->xsave.header.xfeatures & kstate->xfd))
+ return -EINVAL;
+
+ /*
* Nullify @vpkru to preserve its current value if PKRU's bit isn't set
* in the header. KVM's odd ABI is to leave PKRU untouched in this
* case (all other components are eventually re-initialized).
diff --git a/arch/x86/kernel/kvm.c b/arch/x86/kernel/kvm.c
index df78ddee0abb..37dc8465e0f5 100644
--- a/arch/x86/kernel/kvm.c
+++ b/arch/x86/kernel/kvm.c
@@ -89,6 +89,7 @@ struct kvm_task_sleep_node {
struct swait_queue_head wq;
u32 token;
int cpu;
+ bool dummy;
};
static struct kvm_task_sleep_head {
@@ -120,15 +121,26 @@ static bool kvm_async_pf_queue_task(u32 token, struct kvm_task_sleep_node *n)
raw_spin_lock(&b->lock);
e = _find_apf_task(b, token);
if (e) {
- /* dummy entry exist -> wake up was delivered ahead of PF */
- hlist_del(&e->link);
+ struct kvm_task_sleep_node *dummy = NULL;
+
+ /*
+ * The entry can either be a 'dummy' entry (which is put on the
+ * list when wake-up happens ahead of APF handling completion)
+ * or a token from another task which should not be touched.
+ */
+ if (e->dummy) {
+ hlist_del(&e->link);
+ dummy = e;
+ }
+
raw_spin_unlock(&b->lock);
- kfree(e);
+ kfree(dummy);
return false;
}
n->token = token;
n->cpu = smp_processor_id();
+ n->dummy = false;
init_swait_queue_head(&n->wq);
hlist_add_head(&n->link, &b->list);
raw_spin_unlock(&b->lock);
@@ -231,6 +243,7 @@ again:
}
dummy->token = token;
dummy->cpu = smp_processor_id();
+ dummy->dummy = true;
init_swait_queue_head(&dummy->wq);
hlist_add_head(&dummy->link, &b->list);
dummy = NULL;
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index ff8812f3a129..63afdb6bb078 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -5807,9 +5807,18 @@ static int kvm_vcpu_ioctl_x86_get_xsave(struct kvm_vcpu *vcpu,
static int kvm_vcpu_ioctl_x86_set_xsave(struct kvm_vcpu *vcpu,
struct kvm_xsave *guest_xsave)
{
+ union fpregs_state *xstate = (union fpregs_state *)guest_xsave->region;
+
if (fpstate_is_confidential(&vcpu->arch.guest_fpu))
return vcpu->kvm->arch.has_protected_state ? -EINVAL : 0;
+ /*
+ * For backwards compatibility, do not expect disabled features to be in
+ * their initial state. XSTATE_BV[i] must still be cleared whenever
+ * XFD[i]=1, or XRSTOR would cause a #NM.
+ */
+ xstate->xsave.header.xfeatures &= ~vcpu->arch.guest_fpu.fpstate->xfd;
+
return fpu_copy_uabi_to_guest_fpstate(&vcpu->arch.guest_fpu,
guest_xsave->region,
kvm_caps.supported_xcr0,
diff --git a/tools/testing/selftests/kvm/x86/amx_test.c b/tools/testing/selftests/kvm/x86/amx_test.c
index f4ce5a185a7d..37b166260ee3 100644
--- a/tools/testing/selftests/kvm/x86/amx_test.c
+++ b/tools/testing/selftests/kvm/x86/amx_test.c
@@ -69,6 +69,12 @@ static inline void __tileloadd(void *tile)
: : "a"(tile), "d"(0));
}
+static inline int tileloadd_safe(void *tile)
+{
+ return kvm_asm_safe(".byte 0xc4,0xe2,0x7b,0x4b,0x04,0x10",
+ "a"(tile), "d"(0));
+}
+
static inline void __tilerelease(void)
{
asm volatile(".byte 0xc4, 0xe2, 0x78, 0x49, 0xc0" ::);
@@ -124,27 +130,52 @@ static void set_tilecfg(struct tile_config *cfg)
}
}
+enum {
+ /* Retrieve TMM0 from guest, stash it for TEST_RESTORE_TILEDATA */
+ TEST_SAVE_TILEDATA = 1,
+
+ /* Check TMM0 against tiledata */
+ TEST_COMPARE_TILEDATA = 2,
+
+ /* Restore TMM0 from earlier save */
+ TEST_RESTORE_TILEDATA = 4,
+
+ /* Full VM save/restore */
+ TEST_SAVE_RESTORE = 8,
+};
+
static void __attribute__((__flatten__)) guest_code(struct tile_config *amx_cfg,
struct tile_data *tiledata,
struct xstate *xstate)
{
+ int vector;
+
GUEST_ASSERT(this_cpu_has(X86_FEATURE_XSAVE) &&
this_cpu_has(X86_FEATURE_OSXSAVE));
check_xtile_info();
- GUEST_SYNC(1);
+ GUEST_SYNC(TEST_SAVE_RESTORE);
/* xfd=0, enable amx */
wrmsr(MSR_IA32_XFD, 0);
- GUEST_SYNC(2);
+ GUEST_SYNC(TEST_SAVE_RESTORE);
GUEST_ASSERT(rdmsr(MSR_IA32_XFD) == 0);
set_tilecfg(amx_cfg);
__ldtilecfg(amx_cfg);
- GUEST_SYNC(3);
+ GUEST_SYNC(TEST_SAVE_RESTORE);
/* Check save/restore when trap to userspace */
__tileloadd(tiledata);
- GUEST_SYNC(4);
+ GUEST_SYNC(TEST_SAVE_TILEDATA | TEST_COMPARE_TILEDATA | TEST_SAVE_RESTORE);
+
+ /* xfd=0x40000, disable amx tiledata */
+ wrmsr(MSR_IA32_XFD, XFEATURE_MASK_XTILE_DATA);
+
+ /* host tries setting tiledata while guest XFD is set */
+ GUEST_SYNC(TEST_RESTORE_TILEDATA);
+ GUEST_SYNC(TEST_SAVE_RESTORE);
+
+ wrmsr(MSR_IA32_XFD, 0);
__tilerelease();
- GUEST_SYNC(5);
+ GUEST_SYNC(TEST_SAVE_RESTORE);
/*
* After XSAVEC, XTILEDATA is cleared in the xstate_bv but is set in
* the xcomp_bv.
@@ -154,6 +185,8 @@ static void __attribute__((__flatten__)) guest_code(struct tile_config *amx_cfg,
GUEST_ASSERT(!(xstate->header.xstate_bv & XFEATURE_MASK_XTILE_DATA));
GUEST_ASSERT(xstate->header.xcomp_bv & XFEATURE_MASK_XTILE_DATA);
+ /* #NM test */
+
/* xfd=0x40000, disable amx tiledata */
wrmsr(MSR_IA32_XFD, XFEATURE_MASK_XTILE_DATA);
@@ -166,32 +199,33 @@ static void __attribute__((__flatten__)) guest_code(struct tile_config *amx_cfg,
GUEST_ASSERT(!(xstate->header.xstate_bv & XFEATURE_MASK_XTILE_DATA));
GUEST_ASSERT((xstate->header.xcomp_bv & XFEATURE_MASK_XTILE_DATA));
- GUEST_SYNC(6);
+ GUEST_SYNC(TEST_SAVE_RESTORE);
GUEST_ASSERT(rdmsr(MSR_IA32_XFD) == XFEATURE_MASK_XTILE_DATA);
set_tilecfg(amx_cfg);
__ldtilecfg(amx_cfg);
- /* Trigger #NM exception */
- __tileloadd(tiledata);
- GUEST_SYNC(10);
- GUEST_DONE();
-}
+ /* Trigger #NM exception */
+ vector = tileloadd_safe(tiledata);
+ __GUEST_ASSERT(vector == NM_VECTOR,
+ "Wanted #NM on tileloadd with XFD[18]=1, got %s",
+ ex_str(vector));
-void guest_nm_handler(struct ex_regs *regs)
-{
- /* Check if #NM is triggered by XFEATURE_MASK_XTILE_DATA */
- GUEST_SYNC(7);
GUEST_ASSERT(!(get_cr0() & X86_CR0_TS));
GUEST_ASSERT(rdmsr(MSR_IA32_XFD_ERR) == XFEATURE_MASK_XTILE_DATA);
GUEST_ASSERT(rdmsr(MSR_IA32_XFD) == XFEATURE_MASK_XTILE_DATA);
- GUEST_SYNC(8);
+ GUEST_SYNC(TEST_SAVE_RESTORE);
GUEST_ASSERT(rdmsr(MSR_IA32_XFD_ERR) == XFEATURE_MASK_XTILE_DATA);
GUEST_ASSERT(rdmsr(MSR_IA32_XFD) == XFEATURE_MASK_XTILE_DATA);
/* Clear xfd_err */
wrmsr(MSR_IA32_XFD_ERR, 0);
/* xfd=0, enable amx */
wrmsr(MSR_IA32_XFD, 0);
- GUEST_SYNC(9);
+ GUEST_SYNC(TEST_SAVE_RESTORE);
+
+ __tileloadd(tiledata);
+ GUEST_SYNC(TEST_COMPARE_TILEDATA | TEST_SAVE_RESTORE);
+
+ GUEST_DONE();
}
int main(int argc, char *argv[])
@@ -200,10 +234,10 @@ int main(int argc, char *argv[])
struct kvm_vcpu *vcpu;
struct kvm_vm *vm;
struct kvm_x86_state *state;
+ struct kvm_x86_state *tile_state = NULL;
int xsave_restore_size;
vm_vaddr_t amx_cfg, tiledata, xstate;
struct ucall uc;
- u32 amx_offset;
int ret;
/*
@@ -228,9 +262,6 @@ int main(int argc, char *argv[])
vcpu_regs_get(vcpu, &regs1);
- /* Register #NM handler */
- vm_install_exception_handler(vm, NM_VECTOR, guest_nm_handler);
-
/* amx cfg for guest_code */
amx_cfg = vm_vaddr_alloc_page(vm);
memset(addr_gva2hva(vm, amx_cfg), 0x0, getpagesize());
@@ -244,6 +275,7 @@ int main(int argc, char *argv[])
memset(addr_gva2hva(vm, xstate), 0, PAGE_SIZE * DIV_ROUND_UP(XSAVE_SIZE, PAGE_SIZE));
vcpu_args_set(vcpu, 3, amx_cfg, tiledata, xstate);
+ int iter = 0;
for (;;) {
vcpu_run(vcpu);
TEST_ASSERT_KVM_EXIT_REASON(vcpu, KVM_EXIT_IO);
@@ -253,37 +285,47 @@ int main(int argc, char *argv[])
REPORT_GUEST_ASSERT(uc);
/* NOT REACHED */
case UCALL_SYNC:
- switch (uc.args[1]) {
- case 1:
- case 2:
- case 3:
- case 5:
- case 6:
- case 7:
- case 8:
- fprintf(stderr, "GUEST_SYNC(%ld)\n", uc.args[1]);
- break;
- case 4:
- case 10:
- fprintf(stderr,
- "GUEST_SYNC(%ld), check save/restore status\n", uc.args[1]);
+ ++iter;
+ if (uc.args[1] & TEST_SAVE_TILEDATA) {
+ fprintf(stderr, "GUEST_SYNC #%d, save tiledata\n", iter);
+ tile_state = vcpu_save_state(vcpu);
+ }
+ if (uc.args[1] & TEST_COMPARE_TILEDATA) {
+ fprintf(stderr, "GUEST_SYNC #%d, check TMM0 contents\n", iter);
/* Compacted mode, get amx offset by xsave area
* size subtract 8K amx size.
*/
- amx_offset = xsave_restore_size - NUM_TILES*TILE_SIZE;
- state = vcpu_save_state(vcpu);
- void *amx_start = (void *)state->xsave + amx_offset;
+ u32 amx_offset = xsave_restore_size - NUM_TILES*TILE_SIZE;
+ void *amx_start = (void *)tile_state->xsave + amx_offset;
void *tiles_data = (void *)addr_gva2hva(vm, tiledata);
/* Only check TMM0 register, 1 tile */
ret = memcmp(amx_start, tiles_data, TILE_SIZE);
TEST_ASSERT(ret == 0, "memcmp failed, ret=%d", ret);
+ }
+ if (uc.args[1] & TEST_RESTORE_TILEDATA) {
+ fprintf(stderr, "GUEST_SYNC #%d, before KVM_SET_XSAVE\n", iter);
+ vcpu_xsave_set(vcpu, tile_state->xsave);
+ fprintf(stderr, "GUEST_SYNC #%d, after KVM_SET_XSAVE\n", iter);
+ }
+ if (uc.args[1] & TEST_SAVE_RESTORE) {
+ fprintf(stderr, "GUEST_SYNC #%d, save/restore VM state\n", iter);
+ state = vcpu_save_state(vcpu);
+ memset(&regs1, 0, sizeof(regs1));
+ vcpu_regs_get(vcpu, &regs1);
+
+ kvm_vm_release(vm);
+
+ /* Restore state in a new VM. */
+ vcpu = vm_recreate_with_one_vcpu(vm);
+ vcpu_load_state(vcpu, state);
kvm_x86_state_cleanup(state);
- break;
- case 9:
- fprintf(stderr,
- "GUEST_SYNC(%ld), #NM exception and enable amx\n", uc.args[1]);
- break;
+
+ memset(&regs2, 0, sizeof(regs2));
+ vcpu_regs_get(vcpu, &regs2);
+ TEST_ASSERT(!memcmp(&regs1, &regs2, sizeof(regs2)),
+ "Unexpected register values after vcpu_load_state; rdi: %lx rsi: %lx",
+ (ulong) regs2.rdi, (ulong) regs2.rsi);
}
break;
case UCALL_DONE:
@@ -293,22 +335,6 @@ int main(int argc, char *argv[])
TEST_FAIL("Unknown ucall %lu", uc.cmd);
}
- state = vcpu_save_state(vcpu);
- memset(&regs1, 0, sizeof(regs1));
- vcpu_regs_get(vcpu, &regs1);
-
- kvm_vm_release(vm);
-
- /* Restore state in a new VM. */
- vcpu = vm_recreate_with_one_vcpu(vm);
- vcpu_load_state(vcpu, state);
- kvm_x86_state_cleanup(state);
-
- memset(&regs2, 0, sizeof(regs2));
- vcpu_regs_get(vcpu, &regs2);
- TEST_ASSERT(!memcmp(&regs1, &regs2, sizeof(regs2)),
- "Unexpected register values after vcpu_load_state; rdi: %lx rsi: %lx",
- (ulong) regs2.rdi, (ulong) regs2.rsi);
}
done:
kvm_vm_free(vm);