diff options
Diffstat (limited to 'arch/x86')
-rw-r--r-- | arch/x86/include/asm/kvm_host.h | 14 | ||||
-rw-r--r-- | arch/x86/include/asm/pgtable.h | 6 | ||||
-rw-r--r-- | arch/x86/include/asm/special_insns.h | 16 | ||||
-rw-r--r-- | arch/x86/kernel/kvm.c | 37 | ||||
-rw-r--r-- | arch/x86/kvm/cpuid.c | 63 | ||||
-rw-r--r-- | arch/x86/kvm/cpuid.h | 8 | ||||
-rw-r--r-- | arch/x86/kvm/kvm_cache_regs.h | 5 | ||||
-rw-r--r-- | arch/x86/kvm/mmu.c | 102 | ||||
-rw-r--r-- | arch/x86/kvm/mmu.h | 42 | ||||
-rw-r--r-- | arch/x86/kvm/page_track.c | 9 | ||||
-rw-r--r-- | arch/x86/kvm/paging_tmpl.h | 30 | ||||
-rw-r--r-- | arch/x86/kvm/svm.c | 8 | ||||
-rw-r--r-- | arch/x86/kvm/vmx.c | 63 | ||||
-rw-r--r-- | arch/x86/kvm/x86.c | 24 | ||||
-rw-r--r-- | arch/x86/kvm/x86.h | 3 |
15 files changed, 350 insertions, 80 deletions
diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index 01c8b501cb6d..f62a9f37f79f 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -84,7 +84,8 @@ | X86_CR4_PSE | X86_CR4_PAE | X86_CR4_MCE \ | X86_CR4_PGE | X86_CR4_PCE | X86_CR4_OSFXSR | X86_CR4_PCIDE \ | X86_CR4_OSXSAVE | X86_CR4_SMEP | X86_CR4_FSGSBASE \ - | X86_CR4_OSXMMEXCPT | X86_CR4_VMXE | X86_CR4_SMAP)) + | X86_CR4_OSXMMEXCPT | X86_CR4_VMXE | X86_CR4_SMAP \ + | X86_CR4_PKE)) #define CR8_RESERVED_BITS (~(unsigned long)X86_CR8_TPR) @@ -187,12 +188,14 @@ enum { #define PFERR_USER_BIT 2 #define PFERR_RSVD_BIT 3 #define PFERR_FETCH_BIT 4 +#define PFERR_PK_BIT 5 #define PFERR_PRESENT_MASK (1U << PFERR_PRESENT_BIT) #define PFERR_WRITE_MASK (1U << PFERR_WRITE_BIT) #define PFERR_USER_MASK (1U << PFERR_USER_BIT) #define PFERR_RSVD_MASK (1U << PFERR_RSVD_BIT) #define PFERR_FETCH_MASK (1U << PFERR_FETCH_BIT) +#define PFERR_PK_MASK (1U << PFERR_PK_BIT) /* apic attention bits */ #define KVM_APIC_CHECK_VAPIC 0 @@ -335,6 +338,14 @@ struct kvm_mmu { */ u8 permissions[16]; + /* + * The pkru_mask indicates if protection key checks are needed. It + * consists of 16 domains indexed by page fault error code bits [4:1], + * with PFEC.RSVD replaced by ACC_USER_MASK from the page tables. + * Each domain has 2 bits which are ANDed with AD and WD from PKRU. + */ + u32 pkru_mask; + u64 *pae_root; u64 *lm_root; @@ -874,6 +885,7 @@ struct kvm_x86_ops { void (*cache_reg)(struct kvm_vcpu *vcpu, enum kvm_reg reg); unsigned long (*get_rflags)(struct kvm_vcpu *vcpu); void (*set_rflags)(struct kvm_vcpu *vcpu, unsigned long rflags); + u32 (*get_pkru)(struct kvm_vcpu *vcpu); void (*fpu_activate)(struct kvm_vcpu *vcpu); void (*fpu_deactivate)(struct kvm_vcpu *vcpu); diff --git a/arch/x86/include/asm/pgtable.h b/arch/x86/include/asm/pgtable.h index 1ff49ec29ece..97f3242e133c 100644 --- a/arch/x86/include/asm/pgtable.h +++ b/arch/x86/include/asm/pgtable.h @@ -107,6 +107,12 @@ static inline u32 read_pkru(void) return 0; } +static inline void write_pkru(u32 pkru) +{ + if (boot_cpu_has(X86_FEATURE_OSPKE)) + __write_pkru(pkru); +} + static inline int pte_young(pte_t pte) { return pte_flags(pte) & _PAGE_ACCESSED; diff --git a/arch/x86/include/asm/special_insns.h b/arch/x86/include/asm/special_insns.h index aee6e76e561e..d96d04377765 100644 --- a/arch/x86/include/asm/special_insns.h +++ b/arch/x86/include/asm/special_insns.h @@ -113,11 +113,27 @@ static inline u32 __read_pkru(void) : "c" (ecx)); return pkru; } + +static inline void __write_pkru(u32 pkru) +{ + u32 ecx = 0, edx = 0; + + /* + * "wrpkru" instruction. Loads contents in EAX to PKRU, + * requires that ecx = edx = 0. + */ + asm volatile(".byte 0x0f,0x01,0xef\n\t" + : : "a" (pkru), "c"(ecx), "d"(edx)); +} #else static inline u32 __read_pkru(void) { return 0; } + +static inline void __write_pkru(u32 pkru) +{ +} #endif static inline void native_wbinvd(void) diff --git a/arch/x86/kernel/kvm.c b/arch/x86/kernel/kvm.c index 47190bd399e7..807950860fb7 100644 --- a/arch/x86/kernel/kvm.c +++ b/arch/x86/kernel/kvm.c @@ -36,6 +36,7 @@ #include <linux/kprobes.h> #include <linux/debugfs.h> #include <linux/nmi.h> +#include <linux/swait.h> #include <asm/timer.h> #include <asm/cpu.h> #include <asm/traps.h> @@ -91,14 +92,14 @@ static void kvm_io_delay(void) struct kvm_task_sleep_node { struct hlist_node link; - wait_queue_head_t wq; + struct swait_queue_head wq; u32 token; int cpu; bool halted; }; static struct kvm_task_sleep_head { - spinlock_t lock; + raw_spinlock_t lock; struct hlist_head list; } async_pf_sleepers[KVM_TASK_SLEEP_HASHSIZE]; @@ -122,17 +123,17 @@ void kvm_async_pf_task_wait(u32 token) u32 key = hash_32(token, KVM_TASK_SLEEP_HASHBITS); struct kvm_task_sleep_head *b = &async_pf_sleepers[key]; struct kvm_task_sleep_node n, *e; - DEFINE_WAIT(wait); + DECLARE_SWAITQUEUE(wait); rcu_irq_enter(); - spin_lock(&b->lock); + raw_spin_lock(&b->lock); e = _find_apf_task(b, token); if (e) { /* dummy entry exist -> wake up was delivered ahead of PF */ hlist_del(&e->link); kfree(e); - spin_unlock(&b->lock); + raw_spin_unlock(&b->lock); rcu_irq_exit(); return; @@ -141,13 +142,13 @@ void kvm_async_pf_task_wait(u32 token) n.token = token; n.cpu = smp_processor_id(); n.halted = is_idle_task(current) || preempt_count() > 1; - init_waitqueue_head(&n.wq); + init_swait_queue_head(&n.wq); hlist_add_head(&n.link, &b->list); - spin_unlock(&b->lock); + raw_spin_unlock(&b->lock); for (;;) { if (!n.halted) - prepare_to_wait(&n.wq, &wait, TASK_UNINTERRUPTIBLE); + prepare_to_swait(&n.wq, &wait, TASK_UNINTERRUPTIBLE); if (hlist_unhashed(&n.link)) break; @@ -166,7 +167,7 @@ void kvm_async_pf_task_wait(u32 token) } } if (!n.halted) - finish_wait(&n.wq, &wait); + finish_swait(&n.wq, &wait); rcu_irq_exit(); return; @@ -178,8 +179,8 @@ static void apf_task_wake_one(struct kvm_task_sleep_node *n) hlist_del_init(&n->link); if (n->halted) smp_send_reschedule(n->cpu); - else if (waitqueue_active(&n->wq)) - wake_up(&n->wq); + else if (swait_active(&n->wq)) + swake_up(&n->wq); } static void apf_task_wake_all(void) @@ -189,14 +190,14 @@ static void apf_task_wake_all(void) for (i = 0; i < KVM_TASK_SLEEP_HASHSIZE; i++) { struct hlist_node *p, *next; struct kvm_task_sleep_head *b = &async_pf_sleepers[i]; - spin_lock(&b->lock); + raw_spin_lock(&b->lock); hlist_for_each_safe(p, next, &b->list) { struct kvm_task_sleep_node *n = hlist_entry(p, typeof(*n), link); if (n->cpu == smp_processor_id()) apf_task_wake_one(n); } - spin_unlock(&b->lock); + raw_spin_unlock(&b->lock); } } @@ -212,7 +213,7 @@ void kvm_async_pf_task_wake(u32 token) } again: - spin_lock(&b->lock); + raw_spin_lock(&b->lock); n = _find_apf_task(b, token); if (!n) { /* @@ -225,17 +226,17 @@ again: * Allocation failed! Busy wait while other cpu * handles async PF. */ - spin_unlock(&b->lock); + raw_spin_unlock(&b->lock); cpu_relax(); goto again; } n->token = token; n->cpu = smp_processor_id(); - init_waitqueue_head(&n->wq); + init_swait_queue_head(&n->wq); hlist_add_head(&n->link, &b->list); } else apf_task_wake_one(n); - spin_unlock(&b->lock); + raw_spin_unlock(&b->lock); return; } EXPORT_SYMBOL_GPL(kvm_async_pf_task_wake); @@ -486,7 +487,7 @@ void __init kvm_guest_init(void) paravirt_ops_setup(); register_reboot_notifier(&kvm_pv_reboot_nb); for (i = 0; i < KVM_TASK_SLEEP_HASHSIZE; i++) - spin_lock_init(&async_pf_sleepers[i].lock); + raw_spin_lock_init(&async_pf_sleepers[i].lock); if (kvm_para_has_feature(KVM_FEATURE_ASYNC_PF)) x86_init.irqs.trap_init = kvm_apf_trap_init; diff --git a/arch/x86/kvm/cpuid.c b/arch/x86/kvm/cpuid.c index 0029644bf09c..8efb839948e5 100644 --- a/arch/x86/kvm/cpuid.c +++ b/arch/x86/kvm/cpuid.c @@ -88,6 +88,16 @@ int kvm_update_cpuid(struct kvm_vcpu *vcpu) apic->lapic_timer.timer_mode_mask = 1 << 17; } + best = kvm_find_cpuid_entry(vcpu, 7, 0); + if (best) { + /* Update OSPKE bit */ + if (boot_cpu_has(X86_FEATURE_PKU) && best->function == 0x7) { + best->ecx &= ~F(OSPKE); + if (kvm_read_cr4_bits(vcpu, X86_CR4_PKE)) + best->ecx |= F(OSPKE); + } + } + best = kvm_find_cpuid_entry(vcpu, 0xD, 0); if (!best) { vcpu->arch.guest_supported_xcr0 = 0; @@ -305,7 +315,7 @@ static inline int __do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function, unsigned f_xsaves = kvm_x86_ops->xsaves_supported() ? F(XSAVES) : 0; /* cpuid 1.edx */ - const u32 kvm_supported_word0_x86_features = + const u32 kvm_cpuid_1_edx_x86_features = F(FPU) | F(VME) | F(DE) | F(PSE) | F(TSC) | F(MSR) | F(PAE) | F(MCE) | F(CX8) | F(APIC) | 0 /* Reserved */ | F(SEP) | @@ -315,7 +325,7 @@ static inline int __do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function, F(FXSR) | F(XMM) | F(XMM2) | F(SELFSNOOP) | 0 /* HTT, TM, Reserved, PBE */; /* cpuid 0x80000001.edx */ - const u32 kvm_supported_word1_x86_features = + const u32 kvm_cpuid_8000_0001_edx_x86_features = F(FPU) | F(VME) | F(DE) | F(PSE) | F(TSC) | F(MSR) | F(PAE) | F(MCE) | F(CX8) | F(APIC) | 0 /* Reserved */ | F(SYSCALL) | @@ -325,7 +335,7 @@ static inline int __do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function, F(FXSR) | F(FXSR_OPT) | f_gbpages | f_rdtscp | 0 /* Reserved */ | f_lm | F(3DNOWEXT) | F(3DNOW); /* cpuid 1.ecx */ - const u32 kvm_supported_word4_x86_features = + const u32 kvm_cpuid_1_ecx_x86_features = /* NOTE: MONITOR (and MWAIT) are emulated as NOP, * but *not* advertised to guests via CPUID ! */ F(XMM3) | F(PCLMULQDQ) | 0 /* DTES64, MONITOR */ | @@ -337,29 +347,32 @@ static inline int __do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function, 0 /* Reserved*/ | F(AES) | F(XSAVE) | 0 /* OSXSAVE */ | F(AVX) | F(F16C) | F(RDRAND); /* cpuid 0x80000001.ecx */ - const u32 kvm_supported_word6_x86_features = + const u32 kvm_cpuid_8000_0001_ecx_x86_features = F(LAHF_LM) | F(CMP_LEGACY) | 0 /*SVM*/ | 0 /* ExtApicSpace */ | F(CR8_LEGACY) | F(ABM) | F(SSE4A) | F(MISALIGNSSE) | F(3DNOWPREFETCH) | F(OSVW) | 0 /* IBS */ | F(XOP) | 0 /* SKINIT, WDT, LWP */ | F(FMA4) | F(TBM); /* cpuid 0xC0000001.edx */ - const u32 kvm_supported_word5_x86_features = + const u32 kvm_cpuid_C000_0001_edx_x86_features = F(XSTORE) | F(XSTORE_EN) | F(XCRYPT) | F(XCRYPT_EN) | F(ACE2) | F(ACE2_EN) | F(PHE) | F(PHE_EN) | F(PMM) | F(PMM_EN); /* cpuid 7.0.ebx */ - const u32 kvm_supported_word9_x86_features = + const u32 kvm_cpuid_7_0_ebx_x86_features = F(FSGSBASE) | F(BMI1) | F(HLE) | F(AVX2) | F(SMEP) | F(BMI2) | F(ERMS) | f_invpcid | F(RTM) | f_mpx | F(RDSEED) | F(ADX) | F(SMAP) | F(AVX512F) | F(AVX512PF) | F(AVX512ER) | F(AVX512CD) | F(CLFLUSHOPT) | F(CLWB) | F(PCOMMIT); /* cpuid 0xD.1.eax */ - const u32 kvm_supported_word10_x86_features = + const u32 kvm_cpuid_D_1_eax_x86_features = F(XSAVEOPT) | F(XSAVEC) | F(XGETBV1) | f_xsaves; + /* cpuid 7.0.ecx*/ + const u32 kvm_cpuid_7_0_ecx_x86_features = F(PKU) | 0 /*OSPKE*/; + /* all calls to cpuid_count() should be made on the same cpu */ get_cpu(); @@ -376,10 +389,10 @@ static inline int __do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function, entry->eax = min(entry->eax, (u32)0xd); break; case 1: - entry->edx &= kvm_supported_word0_x86_features; - cpuid_mask(&entry->edx, 0); - entry->ecx &= kvm_supported_word4_x86_features; - cpuid_mask(&entry->ecx, 4); + entry->edx &= kvm_cpuid_1_edx_x86_features; + cpuid_mask(&entry->edx, CPUID_1_EDX); + entry->ecx &= kvm_cpuid_1_ecx_x86_features; + cpuid_mask(&entry->ecx, CPUID_1_ECX); /* we support x2apic emulation even if host does not support * it since we emulate x2apic in software */ entry->ecx |= F(X2APIC); @@ -433,14 +446,20 @@ static inline int __do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function, entry->flags |= KVM_CPUID_FLAG_SIGNIFCANT_INDEX; /* Mask ebx against host capability word 9 */ if (index == 0) { - entry->ebx &= kvm_supported_word9_x86_features; - cpuid_mask(&entry->ebx, 9); + entry->ebx &= kvm_cpuid_7_0_ebx_x86_features; + cpuid_mask(&entry->ebx, CPUID_7_0_EBX); // TSC_ADJUST is emulated entry->ebx |= F(TSC_ADJUST); - } else + entry->ecx &= kvm_cpuid_7_0_ecx_x86_features; + cpuid_mask(&entry->ecx, CPUID_7_ECX); + /* PKU is not yet implemented for shadow paging. */ + if (!tdp_enabled) + entry->ecx &= ~F(PKU); + } else { entry->ebx = 0; + entry->ecx = 0; + } entry->eax = 0; - entry->ecx = 0; entry->edx = 0; break; } @@ -514,7 +533,7 @@ static inline int __do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function, do_cpuid_1_ent(&entry[i], function, idx); if (idx == 1) { - entry[i].eax &= kvm_supported_word10_x86_features; + entry[i].eax &= kvm_cpuid_D_1_eax_x86_features; entry[i].ebx = 0; if (entry[i].eax & (F(XSAVES)|F(XSAVEC))) entry[i].ebx = @@ -564,10 +583,10 @@ static inline int __do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function, entry->eax = min(entry->eax, 0x8000001a); break; case 0x80000001: - entry->edx &= kvm_supported_word1_x86_features; - cpuid_mask(&entry->edx, 1); - entry->ecx &= kvm_supported_word6_x86_features; - cpuid_mask(&entry->ecx, 6); + entry->edx &= kvm_cpuid_8000_0001_edx_x86_features; + cpuid_mask(&entry->edx, CPUID_8000_0001_EDX); + entry->ecx &= kvm_cpuid_8000_0001_ecx_x86_features; + cpuid_mask(&entry->ecx, CPUID_8000_0001_ECX); break; case 0x80000007: /* Advanced power management */ /* invariant TSC is CPUID.80000007H:EDX[8] */ @@ -600,8 +619,8 @@ static inline int __do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function, entry->eax = min(entry->eax, 0xC0000004); break; case 0xC0000001: - entry->edx &= kvm_supported_word5_x86_features; - cpuid_mask(&entry->edx, 5); + entry->edx &= kvm_cpuid_C000_0001_edx_x86_features; + cpuid_mask(&entry->edx, CPUID_C000_0001_EDX); break; case 3: /* Processor serial number */ case 5: /* MONITOR/MWAIT */ diff --git a/arch/x86/kvm/cpuid.h b/arch/x86/kvm/cpuid.h index 66a6581724ad..e17a74b1d852 100644 --- a/arch/x86/kvm/cpuid.h +++ b/arch/x86/kvm/cpuid.h @@ -80,6 +80,14 @@ static inline bool guest_cpuid_has_fsgsbase(struct kvm_vcpu *vcpu) return best && (best->ebx & bit(X86_FEATURE_FSGSBASE)); } +static inline bool guest_cpuid_has_pku(struct kvm_vcpu *vcpu) +{ + struct kvm_cpuid_entry2 *best; + + best = kvm_find_cpuid_entry(vcpu, 7, 0); + return best && (best->ecx & bit(X86_FEATURE_PKU)); +} + static inline bool guest_cpuid_has_longmode(struct kvm_vcpu *vcpu) { struct kvm_cpuid_entry2 *best; diff --git a/arch/x86/kvm/kvm_cache_regs.h b/arch/x86/kvm/kvm_cache_regs.h index e1e89ee4af75..762cdf2595f9 100644 --- a/arch/x86/kvm/kvm_cache_regs.h +++ b/arch/x86/kvm/kvm_cache_regs.h @@ -84,6 +84,11 @@ static inline u64 kvm_read_edx_eax(struct kvm_vcpu *vcpu) | ((u64)(kvm_register_read(vcpu, VCPU_REGS_RDX) & -1u) << 32); } +static inline u32 kvm_read_pkru(struct kvm_vcpu *vcpu) +{ + return kvm_x86_ops->get_pkru(vcpu); +} + static inline void enter_guest_mode(struct kvm_vcpu *vcpu) { vcpu->arch.hflags |= HF_GUEST_MASK; diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index c512f095cdac..6bdfbc23ecaa 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -632,12 +632,12 @@ static void walk_shadow_page_lockless_begin(struct kvm_vcpu *vcpu) * kvm_flush_remote_tlbs() IPI to all active vcpus. */ local_irq_disable(); - vcpu->mode = READING_SHADOW_PAGE_TABLES; + /* * Make sure a following spte read is not reordered ahead of the write * to vcpu->mode. */ - smp_mb(); + smp_store_mb(vcpu->mode, READING_SHADOW_PAGE_TABLES); } static void walk_shadow_page_lockless_end(struct kvm_vcpu *vcpu) @@ -647,8 +647,7 @@ static void walk_shadow_page_lockless_end(struct kvm_vcpu *vcpu) * reads to sptes. If it does, kvm_commit_zap_page() can see us * OUTSIDE_GUEST_MODE and proceed to free the shadow page table. */ - smp_mb(); - vcpu->mode = OUTSIDE_GUEST_MODE; + smp_store_release(&vcpu->mode, OUTSIDE_GUEST_MODE); local_irq_enable(); } @@ -2390,14 +2389,13 @@ static void kvm_mmu_commit_zap_page(struct kvm *kvm, return; /* - * wmb: make sure everyone sees our modifications to the page tables - * rmb: make sure we see changes to vcpu->mode - */ - smp_mb(); - - /* - * Wait for all vcpus to exit guest mode and/or lockless shadow - * page table walks. + * We need to make sure everyone sees our modifications to + * the page tables and see changes to vcpu->mode here. The barrier + * in the kvm_flush_remote_tlbs() achieves this. This pairs + * with vcpu_enter_guest and walk_shadow_page_lockless_begin/end. + * + * In addition, kvm_flush_remote_tlbs waits for all vcpus to exit + * guest mode and/or lockless shadow page table walks. */ kvm_flush_remote_tlbs(kvm); @@ -3923,6 +3921,81 @@ static void update_permission_bitmask(struct kvm_vcpu *vcpu, } } +/* +* PKU is an additional mechanism by which the paging controls access to +* user-mode addresses based on the value in the PKRU register. Protection +* key violations are reported through a bit in the page fault error code. +* Unlike other bits of the error code, the PK bit is not known at the +* call site of e.g. gva_to_gpa; it must be computed directly in +* permission_fault based on two bits of PKRU, on some machine state (CR4, +* CR0, EFER, CPL), and on other bits of the error code and the page tables. +* +* In particular the following conditions come from the error code, the +* page tables and the machine state: +* - PK is always zero unless CR4.PKE=1 and EFER.LMA=1 +* - PK is always zero if RSVD=1 (reserved bit set) or F=1 (instruction fetch) +* - PK is always zero if U=0 in the page tables +* - PKRU.WD is ignored if CR0.WP=0 and the access is a supervisor access. +* +* The PKRU bitmask caches the result of these four conditions. The error +* code (minus the P bit) and the page table's U bit form an index into the +* PKRU bitmask. Two bits of the PKRU bitmask are then extracted and ANDed +* with the two bits of the PKRU register corresponding to the protection key. +* For the first three conditions above the bits will be 00, thus masking +* away both AD and WD. For all reads or if the last condition holds, WD +* only will be masked away. +*/ +static void update_pkru_bitmask(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu, + bool ept) +{ + unsigned bit; + bool wp; + + if (ept) { + mmu->pkru_mask = 0; + return; + } + + /* PKEY is enabled only if CR4.PKE and EFER.LMA are both set. */ + if (!kvm_read_cr4_bits(vcpu, X86_CR4_PKE) || !is_long_mode(vcpu)) { + mmu->pkru_mask = 0; + return; + } + + wp = is_write_protection(vcpu); + + for (bit = 0; bit < ARRAY_SIZE(mmu->permissions); ++bit) { + unsigned pfec, pkey_bits; + bool check_pkey, check_write, ff, uf, wf, pte_user; + + pfec = bit << 1; + ff = pfec & PFERR_FETCH_MASK; + uf = pfec & PFERR_USER_MASK; + wf = pfec & PFERR_WRITE_MASK; + + /* PFEC.RSVD is replaced by ACC_USER_MASK. */ + pte_user = pfec & PFERR_RSVD_MASK; + + /* + * Only need to check the access which is not an + * instruction fetch and is to a user page. + */ + check_pkey = (!ff && pte_user); + /* + * write access is controlled by PKRU if it is a + * user access or CR0.WP = 1. + */ + check_write = check_pkey && wf && (uf || wp); + + /* PKRU.AD stops both read and write access. */ + pkey_bits = !!check_pkey; + /* PKRU.WD stops write access. */ + pkey_bits |= (!!check_write) << 1; + + mmu->pkru_mask |= (pkey_bits & 3) << pfec; + } +} + static void update_last_nonleaf_level(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu) { unsigned root_level = mmu->root_level; @@ -3941,6 +4014,7 @@ static void paging64_init_context_common(struct kvm_vcpu *vcpu, reset_rsvds_bits_mask(vcpu, context); update_permission_bitmask(vcpu, context, false); + update_pkru_bitmask(vcpu, context, false); update_last_nonleaf_level(vcpu, context); MMU_WARN_ON(!is_pae(vcpu)); @@ -3968,6 +4042,7 @@ static void paging32_init_context(struct kvm_vcpu *vcpu, reset_rsvds_bits_mask(vcpu, context); update_permission_bitmask(vcpu, context, false); + update_pkru_bitmask(vcpu, context, false); update_last_nonleaf_level(vcpu, context); context->page_fault = paging32_page_fault; @@ -4026,6 +4101,7 @@ static void init_kvm_tdp_mmu(struct kvm_vcpu *vcpu) } update_permission_bitmask(vcpu, context, false); + update_pkru_bitmask(vcpu, context, false); update_last_nonleaf_level(vcpu, context); reset_tdp_shadow_zero_bits_mask(vcpu, context); } @@ -4078,6 +4154,7 @@ void kvm_init_shadow_ept_mmu(struct kvm_vcpu *vcpu, bool execonly) context->direct_map = false; update_permission_bitmask(vcpu, context, true); + update_pkru_bitmask(vcpu, context, true); reset_rsvds_bits_mask_ept(vcpu, context, execonly); reset_ept_shadow_zero_bits_mask(vcpu, context, execonly); } @@ -4132,6 +4209,7 @@ static void init_kvm_nested_mmu(struct kvm_vcpu *vcpu) } update_permission_bitmask(vcpu, g_context, false); + update_pkru_bitmask(vcpu, g_context, false); update_last_nonleaf_level(vcpu, g_context); } diff --git a/arch/x86/kvm/mmu.h b/arch/x86/kvm/mmu.h index 58fe98a0a526..b70df72e2b33 100644 --- a/arch/x86/kvm/mmu.h +++ b/arch/x86/kvm/mmu.h @@ -10,10 +10,11 @@ #define PT32_ENT_PER_PAGE (1 << PT32_PT_BITS) #define PT_WRITABLE_SHIFT 1 +#define PT_USER_SHIFT 2 #define PT_PRESENT_MASK (1ULL << 0) #define PT_WRITABLE_MASK (1ULL << PT_WRITABLE_SHIFT) -#define PT_USER_MASK (1ULL << 2) +#define PT_USER_MASK (1ULL << PT_USER_SHIFT) #define PT_PWT_MASK (1ULL << 3) #define PT_PCD_MASK (1ULL << 4) #define PT_ACCESSED_SHIFT 5 @@ -141,11 +142,16 @@ static inline bool is_write_protection(struct kvm_vcpu *vcpu) } /* - * Will a fault with a given page-fault error code (pfec) cause a permission - * fault with the given access (in ACC_* format)? + * Check if a given access (described through the I/D, W/R and U/S bits of a + * page fault error code pfec) causes a permission fault with the given PTE + * access rights (in ACC_* format). + * + * Return zero if the access does not fault; return the page fault error code + * if the access faults. */ -static inline bool permission_fault(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu, - unsigned pte_access, unsigned pfec) +static inline u8 permission_fault(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu, + unsigned pte_access, unsigned pte_pkey, + unsigned pfec) { int cpl = kvm_x86_ops->get_cpl(vcpu); unsigned long rflags = kvm_x86_ops->get_rflags(vcpu); @@ -166,10 +172,32 @@ static inline bool permission_fault(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu, unsigned long smap = (cpl - 3) & (rflags & X86_EFLAGS_AC); int index = (pfec >> 1) + (smap >> (X86_EFLAGS_AC_BIT - PFERR_RSVD_BIT + 1)); + bool fault = (mmu->permissions[index] >> pte_access) & 1; + + WARN_ON(pfec & (PFERR_PK_MASK | PFERR_RSVD_MASK)); + pfec |= PFERR_PRESENT_MASK; + + if (unlikely(mmu->pkru_mask)) { + u32 pkru_bits, offset; + + /* + * PKRU defines 32 bits, there are 16 domains and 2 + * attribute bits per domain in pkru. pte_pkey is the + * index of the protection domain, so pte_pkey * 2 is + * is the index of the first bit for the domain. + */ + pkru_bits = (kvm_read_pkru(vcpu) >> (pte_pkey * 2)) & 3; + + /* clear present bit, replace PFEC.RSVD with ACC_USER_MASK. */ + offset = pfec - 1 + + ((pte_access & PT_USER_MASK) << (PFERR_RSVD_BIT - PT_USER_SHIFT)); - WARN_ON(pfec & PFERR_RSVD_MASK); + pkru_bits &= mmu->pkru_mask >> offset; + pfec |= -pkru_bits & PFERR_PK_MASK; + fault |= (pkru_bits != 0); + } - return (mmu->permissions[index] >> pte_access) & 1; + return -(uint32_t)fault & pfec; } void kvm_mmu_invalidate_zap_all_pages(struct kvm *kvm); diff --git a/arch/x86/kvm/page_track.c b/arch/x86/kvm/page_track.c index 11f76436f74f..b431539c3714 100644 --- a/arch/x86/kvm/page_track.c +++ b/arch/x86/kvm/page_track.c @@ -142,12 +142,17 @@ void kvm_slot_page_track_remove_page(struct kvm *kvm, bool kvm_page_track_is_active(struct kvm_vcpu *vcpu, gfn_t gfn, enum kvm_page_track_mode mode) { - struct kvm_memory_slot *slot = kvm_vcpu_gfn_to_memslot(vcpu, gfn); - int index = gfn_to_index(gfn, slot->base_gfn, PT_PAGE_TABLE_LEVEL); + struct kvm_memory_slot *slot; + int index; if (WARN_ON(!page_track_mode_is_valid(mode))) return false; + slot = kvm_vcpu_gfn_to_memslot(vcpu, gfn); + if (!slot) + return false; + + index = gfn_to_index(gfn, slot->base_gfn, PT_PAGE_TABLE_LEVEL); return !!ACCESS_ONCE(slot->arch.gfn_track[mode][index]); } diff --git a/arch/x86/kvm/paging_tmpl.h b/arch/x86/kvm/paging_tmpl.h index e159a8185ad9..1d971c7553c3 100644 --- a/arch/x86/kvm/paging_tmpl.h +++ b/arch/x86/kvm/paging_tmpl.h @@ -257,6 +257,17 @@ static int FNAME(update_accessed_dirty_bits)(struct kvm_vcpu *vcpu, return 0; } +static inline unsigned FNAME(gpte_pkeys)(struct kvm_vcpu *vcpu, u64 gpte) +{ + unsigned pkeys = 0; +#if PTTYPE == 64 + pte_t pte = {.pte = gpte}; + + pkeys = pte_flags_pkey(pte_flags(pte)); +#endif + return pkeys; +} + /* * Fetch a guest pte for a guest virtual address */ @@ -268,7 +279,7 @@ static int FNAME(walk_addr_generic)(struct guest_walker *walker, pt_element_t pte; pt_element_t __user *uninitialized_var(ptep_user); gfn_t table_gfn; - unsigned index, pt_access, pte_access, accessed_dirty; + unsigned index, pt_access, pte_access, accessed_dirty, pte_pkey; gpa_t pte_gpa; int offset; const int write_fault = access & PFERR_WRITE_MASK; @@ -359,10 +370,10 @@ retry_walk: walker->ptes[walker->level - 1] = pte; } while (!is_last_gpte(mmu, walker->level, pte)); - if (unlikely(permission_fault(vcpu, mmu, pte_access, access))) { - errcode |= PFERR_PRESENT_MASK; + pte_pkey = FNAME(gpte_pkeys)(vcpu, pte); + errcode = permission_fault(vcpu, mmu, pte_access, pte_pkey, access); + if (unlikely(errcode)) goto error; - } gfn = gpte_to_gfn_lvl(pte, walker->level); gfn += (addr & PT_LVL_OFFSET_MASK(walker->level)) >> PAGE_SHIFT; @@ -949,6 +960,12 @@ static int FNAME(sync_page)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp) return 0; if (FNAME(prefetch_invalid_gpte)(vcpu, sp, &sp->spt[i], gpte)) { + /* + * Update spte before increasing tlbs_dirty to make + * sure no tlb flush is lost after spte is zapped; see + * the comments in kvm_flush_remote_tlbs(). + */ + smp_wmb(); vcpu->kvm->tlbs_dirty++; continue; } @@ -964,6 +981,11 @@ static int FNAME(sync_page)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp) if (gfn != sp->gfns[i]) { drop_spte(vcpu->kvm, &sp->spt[i]); + /* + * The same as above where we are doing + * prefetch_invalid_gpte(). + */ + smp_wmb(); vcpu->kvm->tlbs_dirty++; continue; } diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c index 95070386d599..31346a3f20a5 100644 --- a/arch/x86/kvm/svm.c +++ b/arch/x86/kvm/svm.c @@ -1280,6 +1280,11 @@ static void svm_set_rflags(struct kvm_vcpu *vcpu, unsigned long rflags) to_svm(vcpu)->vmcb->save.rflags = rflags; } +static u32 svm_get_pkru(struct kvm_vcpu *vcpu) +{ + return 0; +} + static void svm_cache_reg(struct kvm_vcpu *vcpu, enum kvm_reg reg) { switch (reg) { @@ -4347,6 +4352,9 @@ static struct kvm_x86_ops svm_x86_ops = { .cache_reg = svm_cache_reg, .get_rflags = svm_get_rflags, .set_rflags = svm_set_rflags, + + .get_pkru = svm_get_pkru, + .fpu_activate = svm_fpu_activate, .fpu_deactivate = svm_fpu_deactivate, diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index 75173efccac5..efc243e4dabf 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -598,6 +598,10 @@ struct vcpu_vmx { struct page *pml_pg; u64 current_tsc_ratio; + + bool guest_pkru_valid; + u32 guest_pkru; + u32 host_pkru; }; enum segment_cache_field { @@ -2107,6 +2111,7 @@ static void vmx_vcpu_pi_load(struct kvm_vcpu *vcpu, int cpu) } while (cmpxchg(&pi_desc->control, old.control, new.control) != old.control); } + /* * Switches to specified vcpu, until a matching vcpu_put(), but assumes * vcpu mutex is already taken. @@ -2167,6 +2172,7 @@ static void vmx_vcpu_load(struct kvm_vcpu *vcpu, int cpu) } vmx_vcpu_pi_load(vcpu, cpu); + vmx->host_pkru = read_pkru(); } static void vmx_vcpu_pi_put(struct kvm_vcpu *vcpu) @@ -2286,6 +2292,11 @@ static void vmx_set_rflags(struct kvm_vcpu *vcpu, unsigned long rflags) vmcs_writel(GUEST_RFLAGS, rflags); } +static u32 vmx_get_pkru(struct kvm_vcpu *vcpu) +{ + return to_vmx(vcpu)->guest_pkru; +} + static u32 vmx_get_interrupt_shadow(struct kvm_vcpu *vcpu) { u32 interruptibility = vmcs_read32(GUEST_INTERRUPTIBILITY_INFO); @@ -2712,8 +2723,15 @@ static void nested_vmx_setup_ctls_msrs(struct vcpu_vmx *vmx) } else vmx->nested.nested_vmx_ept_caps = 0; + /* + * Old versions of KVM use the single-context version without + * checking for support, so declare that it is supported even + * though it is treated as global context. The alternative is + * not failing the single-context invvpid, and it is worse. + */ if (enable_vpid) vmx->nested.nested_vmx_vpid_caps = VMX_VPID_INVVPID_BIT | + VMX_VPID_EXTENT_SINGLE_CONTEXT_BIT | VMX_VPID_EXTENT_GLOBAL_CONTEXT_BIT; else vmx->nested.nested_vmx_vpid_caps = 0; @@ -3886,13 +3904,17 @@ static int vmx_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4) if (!enable_unrestricted_guest && !is_paging(vcpu)) /* - * SMEP/SMAP is disabled if CPU is in non-paging mode in - * hardware. However KVM always uses paging mode without - * unrestricted guest. - * To emulate this behavior, SMEP/SMAP needs to be manually - * disabled when guest switches to non-paging mode. + * SMEP/SMAP/PKU is disabled if CPU is in non-paging mode in + * hardware. To emulate this behavior, SMEP/SMAP/PKU needs + * to be manually disabled when guest switches to non-paging + * mode. + * + * If !enable_unrestricted_guest, the CPU is always running + * with CR0.PG=1 and CR4 needs to be modified. + * If enable_unrestricted_guest, the CPU automatically + * disables SMEP/SMAP/PKU when the guest sets CR0.PG=0. */ - hw_cr4 &= ~(X86_CR4_SMEP | X86_CR4_SMAP); + hw_cr4 &= ~(X86_CR4_SMEP | X86_CR4_SMAP | X86_CR4_PKE); vmcs_writel(CR4_READ_SHADOW, cr4); vmcs_writel(GUEST_CR4, hw_cr4); @@ -7399,6 +7421,7 @@ static int handle_invept(struct kvm_vcpu *vcpu) if (!(types & (1UL << type))) { nested_vmx_failValid(vcpu, VMXERR_INVALID_OPERAND_TO_INVEPT_INVVPID); + skip_emulated_instruction(vcpu); return 1; } @@ -7457,6 +7480,7 @@ static int handle_invvpid(struct kvm_vcpu *vcpu) if (!(types & (1UL << type))) { nested_vmx_failValid(vcpu, VMXERR_INVALID_OPERAND_TO_INVEPT_INVVPID); + skip_emulated_instruction(vcpu); return 1; } @@ -7473,12 +7497,17 @@ static int handle_invvpid(struct kvm_vcpu *vcpu) } switch (type) { + case VMX_VPID_EXTENT_SINGLE_CONTEXT: + /* + * Old versions of KVM use the single-context version so we + * have to support it; just treat it the same as all-context. + */ case VMX_VPID_EXTENT_ALL_CONTEXT: __vmx_flush_tlb(vcpu, to_vmx(vcpu)->nested.vpid02); nested_vmx_succeed(vcpu); break; default: - /* Trap single context invalidation invvpid calls */ + /* Trap individual address invalidation invvpid calls */ BUG_ON(1); break; } @@ -8621,6 +8650,9 @@ static void __noclone vmx_vcpu_run(struct kvm_vcpu *vcpu) if (vcpu->guest_debug & KVM_GUESTDBG_SINGLESTEP) vmx_set_interrupt_shadow(vcpu, 0); + if (vmx->guest_pkru_valid) + __write_pkru(vmx->guest_pkru); + atomic_switch_perf_msrs(vmx); debugctlmsr = get_debugctlmsr(); @@ -8761,6 +8793,20 @@ static void __noclone vmx_vcpu_run(struct kvm_vcpu *vcpu) vmx->exit_reason = vmcs_read32(VM_EXIT_REASON); /* + * eager fpu is enabled if PKEY is supported and CR4 is switched + * back on host, so it is safe to read guest PKRU from current + * XSAVE. + */ + if (boot_cpu_has(X86_FEATURE_OSPKE)) { + vmx->guest_pkru = __read_pkru(); + if (vmx->guest_pkru != vmx->host_pkru) { + vmx->guest_pkru_valid = true; + __write_pkru(vmx->host_pkru); + } else + vmx->guest_pkru_valid = false; + } + + /* * the KVM_REQ_EVENT optimization bit is only on for one entry, and if * we did not inject a still-pending event to L1 now because of * nested_run_pending, we need to re-enable this bit. @@ -10884,6 +10930,9 @@ static struct kvm_x86_ops vmx_x86_ops = { .cache_reg = vmx_cache_reg, .get_rflags = vmx_get_rflags, .set_rflags = vmx_set_rflags, + + .get_pkru = vmx_get_pkru, + .fpu_activate = vmx_fpu_activate, .fpu_deactivate = vmx_fpu_deactivate, diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 7236bd3a4c3d..e260ccbc8f55 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -723,7 +723,7 @@ int kvm_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4) { unsigned long old_cr4 = kvm_read_cr4(vcpu); unsigned long pdptr_bits = X86_CR4_PGE | X86_CR4_PSE | X86_CR4_PAE | - X86_CR4_SMEP | X86_CR4_SMAP; + X86_CR4_SMEP | X86_CR4_SMAP | X86_CR4_PKE; if (cr4 & CR4_RESERVED_BITS) return 1; @@ -740,6 +740,9 @@ int kvm_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4) if (!guest_cpuid_has_fsgsbase(vcpu) && (cr4 & X86_CR4_FSGSBASE)) return 1; + if (!guest_cpuid_has_pku(vcpu) && (cr4 & X86_CR4_PKE)) + return 1; + if (is_long_mode(vcpu)) { if (!(cr4 & X86_CR4_PAE)) return 1; @@ -765,7 +768,7 @@ int kvm_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4) (!(cr4 & X86_CR4_PCIDE) && (old_cr4 & X86_CR4_PCIDE))) kvm_mmu_reset_context(vcpu); - if ((cr4 ^ old_cr4) & X86_CR4_OSXSAVE) + if ((cr4 ^ old_cr4) & (X86_CR4_OSXSAVE | X86_CR4_PKE)) kvm_update_cpuid(vcpu); return 0; @@ -4326,9 +4329,14 @@ static int vcpu_mmio_gva_to_gpa(struct kvm_vcpu *vcpu, unsigned long gva, u32 access = ((kvm_x86_ops->get_cpl(vcpu) == 3) ? PFERR_USER_MASK : 0) | (write ? PFERR_WRITE_MASK : 0); + /* + * currently PKRU is only applied to ept enabled guest so + * there is no pkey in EPT page table for L1 guest or EPT + * shadow page table for L2 guest. + */ if (vcpu_match_mmio_gva(vcpu, gva) && !permission_fault(vcpu, vcpu->arch.walk_mmu, - vcpu->arch.access, access)) { + vcpu->arch.access, 0, access)) { *gpa = vcpu->arch.mmio_gfn << PAGE_SHIFT | (gva & (PAGE_SIZE - 1)); trace_vcpu_match_mmio(gva, *gpa, write, false); @@ -6588,8 +6596,12 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu) srcu_read_unlock(&vcpu->kvm->srcu, vcpu->srcu_idx); - /* We should set ->mode before check ->requests, - * see the comment in make_all_cpus_request. + /* + * We should set ->mode before check ->requests, + * Please see the comment in kvm_make_all_cpus_request. + * This also orders the write to mode from any reads + * to the page tables done while the VCPU is running. + * Please see the comment in kvm_flush_remote_tlbs. */ smp_mb__after_srcu_read_unlock(); @@ -7123,7 +7135,7 @@ int kvm_arch_vcpu_ioctl_set_sregs(struct kvm_vcpu *vcpu, mmu_reset_needed |= kvm_read_cr4(vcpu) != sregs->cr4; kvm_x86_ops->set_cr4(vcpu, sregs->cr4); - if (sregs->cr4 & X86_CR4_OSXSAVE) + if (sregs->cr4 & (X86_CR4_OSXSAVE | X86_CR4_PKE)) kvm_update_cpuid(vcpu); idx = srcu_read_lock(&vcpu->kvm->srcu); diff --git a/arch/x86/kvm/x86.h b/arch/x86/kvm/x86.h index 007940faa5c6..7ce3634ab5fe 100644 --- a/arch/x86/kvm/x86.h +++ b/arch/x86/kvm/x86.h @@ -183,7 +183,8 @@ bool kvm_vector_hashing_enabled(void); #define KVM_SUPPORTED_XCR0 (XFEATURE_MASK_FP | XFEATURE_MASK_SSE \ | XFEATURE_MASK_YMM | XFEATURE_MASK_BNDREGS \ - | XFEATURE_MASK_BNDCSR | XFEATURE_MASK_AVX512) + | XFEATURE_MASK_BNDCSR | XFEATURE_MASK_AVX512 \ + | XFEATURE_MASK_PKRU) extern u64 host_xcr0; extern u64 kvm_supported_xcr0(void); |