| author | Linus Torvalds <torvalds@linux-foundation.org> | 2026-01-13 09:50:07 -0800 |
|---|---|---|
| committer | Linus Torvalds <torvalds@linux-foundation.org> | 2026-01-13 09:50:07 -0800 |
| commit | 0bb933a9fcdee14ef82970caeb8617ad59a11303 | |
| tree | 63280e6ec409a0bf98948dc458195878d66de12c | |
| parent | afd12f914c8f733a9660c97bdbe539aee88e742b | |
| parent | 3611ca7c12b740e250d83f8bbe3554b740c503b0 | |
Merge tag 'for-linus' of git://git.kernel.org/pub/scm/virt/kvm/kvm
Pull x86 kvm fixes from Paolo Bonzini:
- Avoid freeing stack-allocated node in kvm_async_pf_queue_task
- Clear XSTATE_BV[i] in guest XSAVE state whenever XFD[i]=1
* tag 'for-linus' of git://git.kernel.org/pub/scm/virt/kvm/kvm:
selftests: kvm: Verify TILELOADD actually #NM faults when XFD[18]=1
selftests: kvm: try getting XFD and XSAVE state out of sync
selftests: kvm: replace numbered sync points with actions
x86/fpu: Clear XSTATE_BV[i] in guest XSAVE state whenever XFD[i]=1
x86/kvm: Avoid freeing stack-allocated node in kvm_async_pf_queue_task
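
The XSAVE fix above pivots on a single invariant: if XFD[i]=1, the saved image must report component i as being in its initial state (XSTATE_BV[i]=0), because XRSTOR raises #NM when asked to load an XFD-disabled component. Below is a minimal user-space sketch of that masking step, not kernel code: `struct xsave_header` is simplified and `apply_xfd_to_saved_state()` is a hypothetical stand-in for the kernel's fpstate handling.

```c
#include <stdint.h>
#include <stdio.h>

#define XFEATURE_MASK_XTILE_DATA (1ULL << 18)	/* AMX TILEDATA, XSTATE component 18 */

/* Simplified XSAVE header: only the field the fix manipulates. */
struct xsave_header {
	uint64_t xstate_bv;	/* XSTATE_BV: which components hold live state */
};

/*
 * The invariant the fix enforces: any component disabled by XFD must
 * appear to be in its initial state (XSTATE_BV[i]=0), otherwise a later
 * XRSTOR of this image would raise #NM.
 */
static void apply_xfd_to_saved_state(struct xsave_header *hdr, uint64_t xfd)
{
	hdr->xstate_bv &= ~xfd;
}

int main(void)
{
	/* Image with x87/SSE (bits 0-1) and TILEDATA marked live... */
	struct xsave_header hdr = { .xstate_bv = 0x3 | XFEATURE_MASK_XTILE_DATA };

	/* ...while the guest has set XFD[18]=1 to disable TILEDATA. */
	apply_xfd_to_saved_state(&hdr, XFEATURE_MASK_XTILE_DATA);

	/* TILEDATA is dropped; x87/SSE survive: prints 0x3. */
	printf("xstate_bv = %#llx\n", (unsigned long long)hdr.xstate_bv);
	return 0;
}
```

The kernel applies this same mask in two places in the diff below: in fpu_update_guest_xfd() when the guest's FPU state is not resident in hardware, and in kvm_vcpu_ioctl_x86_set_xsave() to sanitize userspace-provided images.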
| mode | file | lines changed |
|---|---|---|
| -rw-r--r-- | arch/x86/kernel/fpu/core.c | 32 |
| -rw-r--r-- | arch/x86/kernel/kvm.c | 19 |
| -rw-r--r-- | arch/x86/kvm/x86.c | 9 |
| -rw-r--r-- | tools/testing/selftests/kvm/x86/amx_test.c | 144 |

4 files changed, 139 insertions(+), 65 deletions(-)
```diff
diff --git a/arch/x86/kernel/fpu/core.c b/arch/x86/kernel/fpu/core.c
index da233f20ae6f..608983806fd7 100644
--- a/arch/x86/kernel/fpu/core.c
+++ b/arch/x86/kernel/fpu/core.c
@@ -319,10 +319,29 @@ EXPORT_SYMBOL_FOR_KVM(fpu_enable_guest_xfd_features);
 #ifdef CONFIG_X86_64
 void fpu_update_guest_xfd(struct fpu_guest *guest_fpu, u64 xfd)
 {
+	struct fpstate *fpstate = guest_fpu->fpstate;
+
 	fpregs_lock();
-	guest_fpu->fpstate->xfd = xfd;
-	if (guest_fpu->fpstate->in_use)
-		xfd_update_state(guest_fpu->fpstate);
+
+	/*
+	 * KVM's guest ABI is that setting XFD[i]=1 *can* immediately revert the
+	 * save state to its initial configuration.  Likewise, KVM_GET_XSAVE does
+	 * the same as XSAVE and returns XSTATE_BV[i]=0 whenever XFD[i]=1.
+	 *
+	 * If the guest's FPU state is in hardware, just update XFD: the XSAVE
+	 * in fpu_swap_kvm_fpstate will clear XSTATE_BV[i] whenever XFD[i]=1.
+	 *
+	 * If however the guest's FPU state is NOT resident in hardware, clear
+	 * disabled components in XSTATE_BV now, or a subsequent XRSTOR will
+	 * attempt to load disabled components and generate #NM _in the host_.
+	 */
+	if (xfd && test_thread_flag(TIF_NEED_FPU_LOAD))
+		fpstate->regs.xsave.header.xfeatures &= ~xfd;
+
+	fpstate->xfd = xfd;
+	if (fpstate->in_use)
+		xfd_update_state(fpstate);
+
 	fpregs_unlock();
 }
 EXPORT_SYMBOL_FOR_KVM(fpu_update_guest_xfd);
@@ -431,6 +450,13 @@ int fpu_copy_uabi_to_guest_fpstate(struct fpu_guest *gfpu, const void *buf,
 		return -EINVAL;
 
 	/*
+	 * Disabled features must be in their initial state, otherwise XRSTOR
+	 * causes an exception.
+	 */
+	if (WARN_ON_ONCE(ustate->xsave.header.xfeatures & kstate->xfd))
+		return -EINVAL;
+
+	/*
 	 * Nullify @vpkru to preserve its current value if PKRU's bit isn't set
 	 * in the header.  KVM's odd ABI is to leave PKRU untouched in this
 	 * case (all other components are eventually re-initialized).
diff --git a/arch/x86/kernel/kvm.c b/arch/x86/kernel/kvm.c
index df78ddee0abb..37dc8465e0f5 100644
--- a/arch/x86/kernel/kvm.c
+++ b/arch/x86/kernel/kvm.c
@@ -89,6 +89,7 @@ struct kvm_task_sleep_node {
 	struct swait_queue_head wq;
 	u32 token;
 	int cpu;
+	bool dummy;
 };
 
 static struct kvm_task_sleep_head {
@@ -120,15 +121,26 @@ static bool kvm_async_pf_queue_task(u32 token, struct kvm_task_sleep_node *n)
 	raw_spin_lock(&b->lock);
 	e = _find_apf_task(b, token);
 	if (e) {
-		/* dummy entry exist -> wake up was delivered ahead of PF */
-		hlist_del(&e->link);
+		struct kvm_task_sleep_node *dummy = NULL;
+
+		/*
+		 * The entry can either be a 'dummy' entry (which is put on the
+		 * list when wake-up happens ahead of APF handling completion)
+		 * or a token from another task which should not be touched.
+		 */
+		if (e->dummy) {
+			hlist_del(&e->link);
+			dummy = e;
+		}
+
 		raw_spin_unlock(&b->lock);
-		kfree(e);
+		kfree(dummy);
 		return false;
 	}
 
 	n->token = token;
 	n->cpu = smp_processor_id();
+	n->dummy = false;
 	init_swait_queue_head(&n->wq);
 	hlist_add_head(&n->link, &b->list);
 	raw_spin_unlock(&b->lock);
@@ -231,6 +243,7 @@ again:
 	}
 	dummy->token = token;
 	dummy->cpu = smp_processor_id();
+	dummy->dummy = true;
 	init_swait_queue_head(&dummy->wq);
 	hlist_add_head(&dummy->link, &b->list);
 	dummy = NULL;
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index ff8812f3a129..63afdb6bb078 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -5807,9 +5807,18 @@ static int kvm_vcpu_ioctl_x86_get_xsave(struct kvm_vcpu *vcpu,
 static int kvm_vcpu_ioctl_x86_set_xsave(struct kvm_vcpu *vcpu,
 					struct kvm_xsave *guest_xsave)
 {
+	union fpregs_state *xstate = (union fpregs_state *)guest_xsave->region;
+
 	if (fpstate_is_confidential(&vcpu->arch.guest_fpu))
 		return vcpu->kvm->arch.has_protected_state ? -EINVAL : 0;
 
+	/*
+	 * For backwards compatibility, do not expect disabled features to be in
+	 * their initial state.  XSTATE_BV[i] must still be cleared whenever
+	 * XFD[i]=1, or XRSTOR would cause a #NM.
+	 */
+	xstate->xsave.header.xfeatures &= ~vcpu->arch.guest_fpu.fpstate->xfd;
+
 	return fpu_copy_uabi_to_guest_fpstate(&vcpu->arch.guest_fpu,
 					      guest_xsave->region,
 					      kvm_caps.supported_xcr0,
diff --git a/tools/testing/selftests/kvm/x86/amx_test.c b/tools/testing/selftests/kvm/x86/amx_test.c
index f4ce5a185a7d..37b166260ee3 100644
--- a/tools/testing/selftests/kvm/x86/amx_test.c
+++ b/tools/testing/selftests/kvm/x86/amx_test.c
@@ -69,6 +69,12 @@ static inline void __tileloadd(void *tile)
 		     : : "a"(tile), "d"(0));
 }
 
+static inline int tileloadd_safe(void *tile)
+{
+	return kvm_asm_safe(".byte 0xc4,0xe2,0x7b,0x4b,0x04,0x10",
+			    "a"(tile), "d"(0));
+}
+
 static inline void __tilerelease(void)
 {
 	asm volatile(".byte 0xc4, 0xe2, 0x78, 0x49, 0xc0" ::);
@@ -124,27 +130,52 @@ static void set_tilecfg(struct tile_config *cfg)
 	}
 }
 
+enum {
+	/* Retrieve TMM0 from guest, stash it for TEST_RESTORE_TILEDATA */
+	TEST_SAVE_TILEDATA = 1,
+
+	/* Check TMM0 against tiledata */
+	TEST_COMPARE_TILEDATA = 2,
+
+	/* Restore TMM0 from earlier save */
+	TEST_RESTORE_TILEDATA = 4,
+
+	/* Full VM save/restore */
+	TEST_SAVE_RESTORE = 8,
+};
+
 static void __attribute__((__flatten__)) guest_code(struct tile_config *amx_cfg,
 						    struct tile_data *tiledata,
 						    struct xstate *xstate)
 {
+	int vector;
+
 	GUEST_ASSERT(this_cpu_has(X86_FEATURE_XSAVE) &&
 		     this_cpu_has(X86_FEATURE_OSXSAVE));
 	check_xtile_info();
-	GUEST_SYNC(1);
+	GUEST_SYNC(TEST_SAVE_RESTORE);
 
 	/* xfd=0, enable amx */
 	wrmsr(MSR_IA32_XFD, 0);
-	GUEST_SYNC(2);
+	GUEST_SYNC(TEST_SAVE_RESTORE);
 	GUEST_ASSERT(rdmsr(MSR_IA32_XFD) == 0);
 	set_tilecfg(amx_cfg);
 	__ldtilecfg(amx_cfg);
-	GUEST_SYNC(3);
+	GUEST_SYNC(TEST_SAVE_RESTORE);
 	/* Check save/restore when trap to userspace */
 	__tileloadd(tiledata);
-	GUEST_SYNC(4);
+	GUEST_SYNC(TEST_SAVE_TILEDATA | TEST_COMPARE_TILEDATA | TEST_SAVE_RESTORE);
+
+	/* xfd=0x40000, disable amx tiledata */
+	wrmsr(MSR_IA32_XFD, XFEATURE_MASK_XTILE_DATA);
+
+	/* host tries setting tiledata while guest XFD is set */
+	GUEST_SYNC(TEST_RESTORE_TILEDATA);
+	GUEST_SYNC(TEST_SAVE_RESTORE);
+
+	wrmsr(MSR_IA32_XFD, 0);
 	__tilerelease();
-	GUEST_SYNC(5);
+	GUEST_SYNC(TEST_SAVE_RESTORE);
 	/*
 	 * After XSAVEC, XTILEDATA is cleared in the xstate_bv but is set in
 	 * the xcomp_bv.
@@ -154,6 +185,8 @@ static void __attribute__((__flatten__)) guest_code(struct tile_config *amx_cfg,
 	GUEST_ASSERT(!(xstate->header.xstate_bv & XFEATURE_MASK_XTILE_DATA));
 	GUEST_ASSERT(xstate->header.xcomp_bv & XFEATURE_MASK_XTILE_DATA);
 
+	/* #NM test */
+
 	/* xfd=0x40000, disable amx tiledata */
 	wrmsr(MSR_IA32_XFD, XFEATURE_MASK_XTILE_DATA);
 
@@ -166,32 +199,33 @@ static void __attribute__((__flatten__)) guest_code(struct tile_config *amx_cfg,
 	GUEST_ASSERT(!(xstate->header.xstate_bv & XFEATURE_MASK_XTILE_DATA));
 	GUEST_ASSERT((xstate->header.xcomp_bv & XFEATURE_MASK_XTILE_DATA));
-	GUEST_SYNC(6);
+	GUEST_SYNC(TEST_SAVE_RESTORE);
 	GUEST_ASSERT(rdmsr(MSR_IA32_XFD) == XFEATURE_MASK_XTILE_DATA);
 	set_tilecfg(amx_cfg);
 	__ldtilecfg(amx_cfg);
-	/* Trigger #NM exception */
-	__tileloadd(tiledata);
-	GUEST_SYNC(10);
 
-	GUEST_DONE();
-}
+	/* Trigger #NM exception */
+	vector = tileloadd_safe(tiledata);
+	__GUEST_ASSERT(vector == NM_VECTOR,
+		       "Wanted #NM on tileloadd with XFD[18]=1, got %s",
+		       ex_str(vector));
 
-void guest_nm_handler(struct ex_regs *regs)
-{
-	/* Check if #NM is triggered by XFEATURE_MASK_XTILE_DATA */
-	GUEST_SYNC(7);
 	GUEST_ASSERT(!(get_cr0() & X86_CR0_TS));
 	GUEST_ASSERT(rdmsr(MSR_IA32_XFD_ERR) == XFEATURE_MASK_XTILE_DATA);
 	GUEST_ASSERT(rdmsr(MSR_IA32_XFD) == XFEATURE_MASK_XTILE_DATA);
-	GUEST_SYNC(8);
+	GUEST_SYNC(TEST_SAVE_RESTORE);
 	GUEST_ASSERT(rdmsr(MSR_IA32_XFD_ERR) == XFEATURE_MASK_XTILE_DATA);
 	GUEST_ASSERT(rdmsr(MSR_IA32_XFD) == XFEATURE_MASK_XTILE_DATA);
 	/* Clear xfd_err */
 	wrmsr(MSR_IA32_XFD_ERR, 0);
 	/* xfd=0, enable amx */
 	wrmsr(MSR_IA32_XFD, 0);
-	GUEST_SYNC(9);
+	GUEST_SYNC(TEST_SAVE_RESTORE);
+
+	__tileloadd(tiledata);
+	GUEST_SYNC(TEST_COMPARE_TILEDATA | TEST_SAVE_RESTORE);
+
+	GUEST_DONE();
 }
 
 int main(int argc, char *argv[])
@@ -200,10 +234,10 @@
 	struct kvm_vcpu *vcpu;
 	struct kvm_vm *vm;
 	struct kvm_x86_state *state;
+	struct kvm_x86_state *tile_state = NULL;
 	int xsave_restore_size;
 	vm_vaddr_t amx_cfg, tiledata, xstate;
 	struct ucall uc;
-	u32 amx_offset;
 	int ret;
 
 	/*
@@ -228,9 +262,6 @@
 
 	vcpu_regs_get(vcpu, &regs1);
 
-	/* Register #NM handler */
-	vm_install_exception_handler(vm, NM_VECTOR, guest_nm_handler);
-
 	/* amx cfg for guest_code */
 	amx_cfg = vm_vaddr_alloc_page(vm);
 	memset(addr_gva2hva(vm, amx_cfg), 0x0, getpagesize());
@@ -244,6 +275,7 @@
 	memset(addr_gva2hva(vm, xstate), 0, PAGE_SIZE * DIV_ROUND_UP(XSAVE_SIZE, PAGE_SIZE));
 	vcpu_args_set(vcpu, 3, amx_cfg, tiledata, xstate);
 
+	int iter = 0;
 	for (;;) {
 		vcpu_run(vcpu);
 		TEST_ASSERT_KVM_EXIT_REASON(vcpu, KVM_EXIT_IO);
@@ -253,37 +285,47 @@
 			REPORT_GUEST_ASSERT(uc);
 			/* NOT REACHED */
 		case UCALL_SYNC:
-			switch (uc.args[1]) {
-			case 1:
-			case 2:
-			case 3:
-			case 5:
-			case 6:
-			case 7:
-			case 8:
-				fprintf(stderr, "GUEST_SYNC(%ld)\n", uc.args[1]);
-				break;
-			case 4:
-			case 10:
-				fprintf(stderr,
-					"GUEST_SYNC(%ld), check save/restore status\n", uc.args[1]);
+			++iter;
+			if (uc.args[1] & TEST_SAVE_TILEDATA) {
+				fprintf(stderr, "GUEST_SYNC #%d, save tiledata\n", iter);
+				tile_state = vcpu_save_state(vcpu);
+			}
+			if (uc.args[1] & TEST_COMPARE_TILEDATA) {
+				fprintf(stderr, "GUEST_SYNC #%d, check TMM0 contents\n", iter);
 
 				/* Compacted mode, get amx offset by xsave area
 				 * size subtract 8K amx size.
 				 */
-				amx_offset = xsave_restore_size - NUM_TILES*TILE_SIZE;
-				state = vcpu_save_state(vcpu);
-				void *amx_start = (void *)state->xsave + amx_offset;
+				u32 amx_offset = xsave_restore_size - NUM_TILES*TILE_SIZE;
+				void *amx_start = (void *)tile_state->xsave + amx_offset;
 				void *tiles_data = (void *)addr_gva2hva(vm, tiledata);
+
 				/* Only check TMM0 register, 1 tile */
 				ret = memcmp(amx_start, tiles_data, TILE_SIZE);
 				TEST_ASSERT(ret == 0, "memcmp failed, ret=%d", ret);
+			}
+			if (uc.args[1] & TEST_RESTORE_TILEDATA) {
+				fprintf(stderr, "GUEST_SYNC #%d, before KVM_SET_XSAVE\n", iter);
+				vcpu_xsave_set(vcpu, tile_state->xsave);
+				fprintf(stderr, "GUEST_SYNC #%d, after KVM_SET_XSAVE\n", iter);
+			}
+			if (uc.args[1] & TEST_SAVE_RESTORE) {
+				fprintf(stderr, "GUEST_SYNC #%d, save/restore VM state\n", iter);
+				state = vcpu_save_state(vcpu);
+				memset(&regs1, 0, sizeof(regs1));
+				vcpu_regs_get(vcpu, &regs1);
+
+				kvm_vm_release(vm);
+
+				/* Restore state in a new VM. */
+				vcpu = vm_recreate_with_one_vcpu(vm);
+				vcpu_load_state(vcpu, state);
 				kvm_x86_state_cleanup(state);
-				break;
-			case 9:
-				fprintf(stderr,
-					"GUEST_SYNC(%ld), #NM exception and enable amx\n", uc.args[1]);
-				break;
+
+				memset(&regs2, 0, sizeof(regs2));
+				vcpu_regs_get(vcpu, &regs2);
+				TEST_ASSERT(!memcmp(&regs1, &regs2, sizeof(regs2)),
+					    "Unexpected register values after vcpu_load_state; rdi: %lx rsi: %lx",
+					    (ulong) regs2.rdi, (ulong) regs2.rsi);
 			}
 			break;
 		case UCALL_DONE:
@@ -293,22 +335,6 @@ int main(int argc, char *argv[])
 			TEST_FAIL("Unknown ucall %lu", uc.cmd);
 		}
 
-		state = vcpu_save_state(vcpu);
-		memset(&regs1, 0, sizeof(regs1));
-		vcpu_regs_get(vcpu, &regs1);
-
-		kvm_vm_release(vm);
-
-		/* Restore state in a new VM. */
-		vcpu = vm_recreate_with_one_vcpu(vm);
-		vcpu_load_state(vcpu, state);
-		kvm_x86_state_cleanup(state);
-
-		memset(&regs2, 0, sizeof(regs2));
-		vcpu_regs_get(vcpu, &regs2);
-		TEST_ASSERT(!memcmp(&regs1, &regs2, sizeof(regs2)),
-			    "Unexpected register values after vcpu_load_state; rdi: %lx rsi: %lx",
-			    (ulong) regs2.rdi, (ulong) regs2.rsi);
 	}
 done:
 	kvm_vm_free(vm);
```
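
The async-PF fix in arch/x86/kernel/kvm.c above is an ownership bug: the wait list mixes heap-allocated 'dummy' wake-up entries with sleep nodes that live on another task's stack, and the old code kfree'd whatever entry matched the token. Below is a hypothetical miniature of the pattern; `sleep_node`, `claim_token()` and the singly-linked list layout are invented for illustration, not the kernel's types. The point is to tag each entry with who owns it and free only entries the list owns.

```c
#include <stdbool.h>
#include <stdio.h>
#include <stdlib.h>

/*
 * Miniature of kvm_task_sleep_node: one list mixes entries the list owns
 * (heap-allocated "dummy" wake-ups) with entries owned by sleeping tasks
 * (possibly stack-allocated). The flag records who owns the memory.
 */
struct sleep_node {
	struct sleep_node *next;
	unsigned int token;
	bool dummy;	/* true: heap entry the list owns, safe to free() */
};

/*
 * On a token match, consume and free the entry only if the list owns it;
 * a node belonging to another task must be left in place untouched
 * (the pre-fix code unconditionally unlinked and freed it).
 */
static bool claim_token(struct sleep_node **head, unsigned int token)
{
	for (struct sleep_node **pp = head; *pp; pp = &(*pp)->next) {
		if ((*pp)->token != token)
			continue;
		if ((*pp)->dummy) {
			struct sleep_node *e = *pp;
			*pp = e->next;	/* unlink, then reclaim */
			free(e);
		}
		return true;	/* match found either way */
	}
	return false;
}

int main(void)
{
	struct sleep_node task_node = { .token = 7, .dummy = false }; /* another task's node */
	struct sleep_node *dummy = malloc(sizeof(*dummy));

	*dummy = (struct sleep_node){ .next = &task_node, .token = 9, .dummy = true };

	struct sleep_node *head = dummy;

	printf("token 9 (dummy): %d\n", claim_token(&head, 9)); /* unlinked and freed */
	printf("token 7 (task):  %d\n", claim_token(&head, 7)); /* left untouched */
	return 0;
}
```

The kernel version does the same with the new `bool dummy` field: kvm_async_pf_queue_task() unlinks and kfree()s a matching entry only when `e->dummy` is set, so a stack-allocated node queued by another vCPU task is never freed.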
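The selftest rework ("replace numbered sync points with actions") swaps magic sync numbers for an action bitmask, so each GUEST_SYNC tells the host what to do and actions compose freely. A minimal sketch of the dispatch pattern; the `ACT_*` names and `handle_sync()` are placeholders, not the test's identifiers.

```c
#include <stdio.h>

/* Placeholder action flags in the spirit of the reworked amx_test.c. */
enum {
	ACT_SAVE    = 1 << 0,
	ACT_COMPARE = 1 << 1,
	ACT_RESTORE = 1 << 2,
};

static void handle_sync(unsigned long actions)
{
	/* Independent ifs, not a switch: one sync may request several actions. */
	if (actions & ACT_SAVE)
		puts("save vCPU state");
	if (actions & ACT_COMPARE)
		puts("compare saved state against guest memory");
	if (actions & ACT_RESTORE)
		puts("restore saved state into the vCPU");
}

int main(void)
{
	handle_sync(ACT_SAVE | ACT_COMPARE);	/* one sync point, two actions */
	handle_sync(ACT_RESTORE);
	return 0;
}
```

amx_test.c applies the same idea with independent `if` blocks over TEST_SAVE_TILEDATA, TEST_COMPARE_TILEDATA, TEST_RESTORE_TILEDATA and TEST_SAVE_RESTORE in place of the old switch over sync numbers, which is what lets GUEST_SYNC(TEST_SAVE_TILEDATA | TEST_COMPARE_TILEDATA | TEST_SAVE_RESTORE) trigger three host-side actions from a single exit.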
