Diffstat (limited to 'arch/x86/kvm')
-rw-r--r--  arch/x86/kvm/cpuid.c        |  28
-rw-r--r--  arch/x86/kvm/cpuid.h        |   8
-rw-r--r--  arch/x86/kvm/emulate.c      |  17
-rw-r--r--  arch/x86/kvm/lapic.c        |  12
-rw-r--r--  arch/x86/kvm/mmu.c          |  22
-rw-r--r--  arch/x86/kvm/mmu.h          |   1
-rw-r--r--  arch/x86/kvm/page_track.c   |   8
-rw-r--r--  arch/x86/kvm/paging_tmpl.h  |   3
-rw-r--r--  arch/x86/kvm/pmu_intel.c    |   2
-rw-r--r--  arch/x86/kvm/svm.c          |   9
-rw-r--r--  arch/x86/kvm/vmx.c          | 289
-rw-r--r--  arch/x86/kvm/x86.c          | 136
12 files changed, 326 insertions(+), 209 deletions(-)
diff --git a/arch/x86/kvm/cpuid.c b/arch/x86/kvm/cpuid.c
index afa7bbb596cd..91af75e37306 100644
--- a/arch/x86/kvm/cpuid.c
+++ b/arch/x86/kvm/cpuid.c
@@ -456,7 +456,7 @@ static inline int __do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function,
entry->ecx &= kvm_cpuid_7_0_ecx_x86_features;
cpuid_mask(&entry->ecx, CPUID_7_ECX);
/* PKU is not yet implemented for shadow paging. */
- if (!tdp_enabled)
+ if (!tdp_enabled || !boot_cpu_has(X86_FEATURE_OSPKE))
entry->ecx &= ~F(PKU);
} else {
entry->ebx = 0;
@@ -765,18 +765,20 @@ out:
static int move_to_next_stateful_cpuid_entry(struct kvm_vcpu *vcpu, int i)
{
struct kvm_cpuid_entry2 *e = &vcpu->arch.cpuid_entries[i];
- int j, nent = vcpu->arch.cpuid_nent;
+ struct kvm_cpuid_entry2 *ej;
+ int j = i;
+ int nent = vcpu->arch.cpuid_nent;
e->flags &= ~KVM_CPUID_FLAG_STATE_READ_NEXT;
/* when no next entry is found, the current entry[i] is reselected */
- for (j = i + 1; ; j = (j + 1) % nent) {
- struct kvm_cpuid_entry2 *ej = &vcpu->arch.cpuid_entries[j];
- if (ej->function == e->function) {
- ej->flags |= KVM_CPUID_FLAG_STATE_READ_NEXT;
- return j;
- }
- }
- return 0; /* silence gcc, even though control never reaches here */
+ do {
+ j = (j + 1) % nent;
+ ej = &vcpu->arch.cpuid_entries[j];
+ } while (ej->function != e->function);
+
+ ej->flags |= KVM_CPUID_FLAG_STATE_READ_NEXT;
+
+ return j;
}
/* find an entry with matching function, matching index (if needed), and that
@@ -846,12 +848,6 @@ void kvm_cpuid(struct kvm_vcpu *vcpu, u32 *eax, u32 *ebx, u32 *ecx, u32 *edx)
if (!best)
best = check_cpuid_limit(vcpu, function, index);
- /*
- * Perfmon not yet supported for L2 guest.
- */
- if (is_guest_mode(vcpu) && function == 0xa)
- best = NULL;
-
if (best) {
*eax = best->eax;
*ebx = best->ebx;
diff --git a/arch/x86/kvm/cpuid.h b/arch/x86/kvm/cpuid.h
index 35058c2c0eea..9368fecca3ee 100644
--- a/arch/x86/kvm/cpuid.h
+++ b/arch/x86/kvm/cpuid.h
@@ -144,6 +144,14 @@ static inline bool guest_cpuid_has_rtm(struct kvm_vcpu *vcpu)
return best && (best->ebx & bit(X86_FEATURE_RTM));
}
+static inline bool guest_cpuid_has_mpx(struct kvm_vcpu *vcpu)
+{
+ struct kvm_cpuid_entry2 *best;
+
+ best = kvm_find_cpuid_entry(vcpu, 7, 0);
+ return best && (best->ebx & bit(X86_FEATURE_MPX));
+}
+
static inline bool guest_cpuid_has_rdtscp(struct kvm_vcpu *vcpu)
{
struct kvm_cpuid_entry2 *best;
diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c
index 9f676adcdfc2..72b737b8c9d6 100644
--- a/arch/x86/kvm/emulate.c
+++ b/arch/x86/kvm/emulate.c
@@ -2543,7 +2543,7 @@ static int em_rsm(struct x86_emulate_ctxt *ctxt)
u64 smbase;
int ret;
- if ((ctxt->emul_flags & X86EMUL_SMM_MASK) == 0)
+ if ((ctxt->ops->get_hflags(ctxt) & X86EMUL_SMM_MASK) == 0)
return emulate_ud(ctxt);
/*
@@ -2592,11 +2592,11 @@ static int em_rsm(struct x86_emulate_ctxt *ctxt)
return X86EMUL_UNHANDLEABLE;
}
- if ((ctxt->emul_flags & X86EMUL_SMM_INSIDE_NMI_MASK) == 0)
+ if ((ctxt->ops->get_hflags(ctxt) & X86EMUL_SMM_INSIDE_NMI_MASK) == 0)
ctxt->ops->set_nmi_mask(ctxt, false);
- ctxt->emul_flags &= ~X86EMUL_SMM_INSIDE_NMI_MASK;
- ctxt->emul_flags &= ~X86EMUL_SMM_MASK;
+ ctxt->ops->set_hflags(ctxt, ctxt->ops->get_hflags(ctxt) &
+ ~(X86EMUL_SMM_INSIDE_NMI_MASK | X86EMUL_SMM_MASK));
return X86EMUL_CONTINUE;
}
@@ -2738,6 +2738,7 @@ static int em_syscall(struct x86_emulate_ctxt *ctxt)
ctxt->eflags &= ~(X86_EFLAGS_VM | X86_EFLAGS_IF);
}
+ ctxt->tf = (ctxt->eflags & X86_EFLAGS_TF) != 0;
return X86EMUL_CONTINUE;
}
@@ -5312,6 +5313,7 @@ int x86_emulate_insn(struct x86_emulate_ctxt *ctxt)
const struct x86_emulate_ops *ops = ctxt->ops;
int rc = X86EMUL_CONTINUE;
int saved_dst_type = ctxt->dst.type;
+ unsigned emul_flags;
ctxt->mem_read.pos = 0;
@@ -5326,6 +5328,7 @@ int x86_emulate_insn(struct x86_emulate_ctxt *ctxt)
goto done;
}
+ emul_flags = ctxt->ops->get_hflags(ctxt);
if (unlikely(ctxt->d &
(No64|Undefined|Sse|Mmx|Intercept|CheckPerm|Priv|Prot|String))) {
if ((ctxt->mode == X86EMUL_MODE_PROT64 && (ctxt->d & No64)) ||
@@ -5359,7 +5362,7 @@ int x86_emulate_insn(struct x86_emulate_ctxt *ctxt)
fetch_possible_mmx_operand(ctxt, &ctxt->dst);
}
- if (unlikely(ctxt->emul_flags & X86EMUL_GUEST_MASK) && ctxt->intercept) {
+ if (unlikely(emul_flags & X86EMUL_GUEST_MASK) && ctxt->intercept) {
rc = emulator_check_intercept(ctxt, ctxt->intercept,
X86_ICPT_PRE_EXCEPT);
if (rc != X86EMUL_CONTINUE)
@@ -5388,7 +5391,7 @@ int x86_emulate_insn(struct x86_emulate_ctxt *ctxt)
goto done;
}
- if (unlikely(ctxt->emul_flags & X86EMUL_GUEST_MASK) && (ctxt->d & Intercept)) {
+ if (unlikely(emul_flags & X86EMUL_GUEST_MASK) && (ctxt->d & Intercept)) {
rc = emulator_check_intercept(ctxt, ctxt->intercept,
X86_ICPT_POST_EXCEPT);
if (rc != X86EMUL_CONTINUE)
@@ -5442,7 +5445,7 @@ int x86_emulate_insn(struct x86_emulate_ctxt *ctxt)
special_insn:
- if (unlikely(ctxt->emul_flags & X86EMUL_GUEST_MASK) && (ctxt->d & Intercept)) {
+ if (unlikely(emul_flags & X86EMUL_GUEST_MASK) && (ctxt->d & Intercept)) {
rc = emulator_check_intercept(ctxt, ctxt->intercept,
X86_ICPT_POST_MEMACCESS);
if (rc != X86EMUL_CONTINUE)
diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c
index 3f05c044720b..b24b3c6d686e 100644
--- a/arch/x86/kvm/lapic.c
+++ b/arch/x86/kvm/lapic.c
@@ -246,9 +246,14 @@ static inline void kvm_apic_set_ldr(struct kvm_lapic *apic, u32 id)
recalculate_apic_map(apic->vcpu->kvm);
}
+static inline u32 kvm_apic_calc_x2apic_ldr(u32 id)
+{
+ return ((id >> 4) << 16) | (1 << (id & 0xf));
+}
+
static inline void kvm_apic_set_x2apic_id(struct kvm_lapic *apic, u32 id)
{
- u32 ldr = ((id >> 4) << 16) | (1 << (id & 0xf));
+ u32 ldr = kvm_apic_calc_x2apic_ldr(id);
kvm_lapic_set_reg(apic, APIC_ID, id);
kvm_lapic_set_reg(apic, APIC_LDR, ldr);
@@ -2029,6 +2034,7 @@ static int kvm_apic_state_fixup(struct kvm_vcpu *vcpu,
{
if (apic_x2apic_mode(vcpu->arch.apic)) {
u32 *id = (u32 *)(s->regs + APIC_ID);
+ u32 *ldr = (u32 *)(s->regs + APIC_LDR);
if (vcpu->kvm->arch.x2apic_format) {
if (*id != vcpu->vcpu_id)
@@ -2039,6 +2045,10 @@ static int kvm_apic_state_fixup(struct kvm_vcpu *vcpu,
else
*id <<= 24;
}
+
+ /* In x2APIC mode, the LDR is fixed and based on the id */
+ if (set)
+ *ldr = kvm_apic_calc_x2apic_ldr(*id);
}
return 0;
diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index d9c7e986b4e4..d29c745f10ad 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -3489,12 +3489,15 @@ static int kvm_arch_setup_async_pf(struct kvm_vcpu *vcpu, gva_t gva, gfn_t gfn)
return kvm_setup_async_pf(vcpu, gva, kvm_vcpu_gfn_to_hva(vcpu, gfn), &arch);
}
-static bool can_do_async_pf(struct kvm_vcpu *vcpu)
+bool kvm_can_do_async_pf(struct kvm_vcpu *vcpu)
{
if (unlikely(!lapic_in_kernel(vcpu) ||
kvm_event_needs_reinjection(vcpu)))
return false;
+ if (is_guest_mode(vcpu))
+ return false;
+
return kvm_x86_ops->interrupt_allowed(vcpu);
}
@@ -3510,7 +3513,7 @@ static bool try_async_pf(struct kvm_vcpu *vcpu, bool prefault, gfn_t gfn,
if (!async)
return false; /* *pfn has correct page already */
- if (!prefault && can_do_async_pf(vcpu)) {
+ if (!prefault && kvm_can_do_async_pf(vcpu)) {
trace_kvm_try_async_get_page(gva, gfn);
if (kvm_find_async_pf_gfn(vcpu, gfn)) {
trace_kvm_async_pf_doublefault(gva, gfn);
@@ -3646,19 +3649,19 @@ static inline bool is_last_gpte(struct kvm_mmu *mmu,
unsigned level, unsigned gpte)
{
/*
- * PT_PAGE_TABLE_LEVEL always terminates. The RHS has bit 7 set
- * iff level <= PT_PAGE_TABLE_LEVEL, which for our purpose means
- * level == PT_PAGE_TABLE_LEVEL; set PT_PAGE_SIZE_MASK in gpte then.
- */
- gpte |= level - PT_PAGE_TABLE_LEVEL - 1;
-
- /*
* The RHS has bit 7 set iff level < mmu->last_nonleaf_level.
* If it is clear, there are no large pages at this level, so clear
* PT_PAGE_SIZE_MASK in gpte if that is the case.
*/
gpte &= level - mmu->last_nonleaf_level;
+ /*
+ * PT_PAGE_TABLE_LEVEL always terminates. The RHS has bit 7 set
+ * iff level <= PT_PAGE_TABLE_LEVEL, which for our purpose means
+ * level == PT_PAGE_TABLE_LEVEL; set PT_PAGE_SIZE_MASK in gpte then.
+ */
+ gpte |= level - PT_PAGE_TABLE_LEVEL - 1;
+
return gpte & PT_PAGE_SIZE_MASK;
}
@@ -4166,6 +4169,7 @@ void kvm_init_shadow_ept_mmu(struct kvm_vcpu *vcpu, bool execonly)
update_permission_bitmask(vcpu, context, true);
update_pkru_bitmask(vcpu, context, true);
+ update_last_nonleaf_level(vcpu, context);
reset_rsvds_bits_mask_ept(vcpu, context, execonly);
reset_ept_shadow_zero_bits_mask(vcpu, context, execonly);
}
diff --git a/arch/x86/kvm/mmu.h b/arch/x86/kvm/mmu.h
index ddc56e91f2e4..c92834c55c59 100644
--- a/arch/x86/kvm/mmu.h
+++ b/arch/x86/kvm/mmu.h
@@ -75,6 +75,7 @@ enum {
int handle_mmio_page_fault(struct kvm_vcpu *vcpu, u64 addr, bool direct);
void kvm_init_shadow_mmu(struct kvm_vcpu *vcpu);
void kvm_init_shadow_ept_mmu(struct kvm_vcpu *vcpu, bool execonly);
+bool kvm_can_do_async_pf(struct kvm_vcpu *vcpu);
static inline unsigned int kvm_mmu_available_pages(struct kvm *kvm)
{
diff --git a/arch/x86/kvm/page_track.c b/arch/x86/kvm/page_track.c
index b431539c3714..85024e0cfaa5 100644
--- a/arch/x86/kvm/page_track.c
+++ b/arch/x86/kvm/page_track.c
@@ -156,6 +156,14 @@ bool kvm_page_track_is_active(struct kvm_vcpu *vcpu, gfn_t gfn,
return !!ACCESS_ONCE(slot->arch.gfn_track[mode][index]);
}
+void kvm_page_track_cleanup(struct kvm *kvm)
+{
+ struct kvm_page_track_notifier_head *head;
+
+ head = &kvm->arch.track_notifier_head;
+ cleanup_srcu_struct(&head->track_srcu);
+}
+
void kvm_page_track_init(struct kvm *kvm)
{
struct kvm_page_track_notifier_head *head;
diff --git a/arch/x86/kvm/paging_tmpl.h b/arch/x86/kvm/paging_tmpl.h
index a01105485315..37363900297d 100644
--- a/arch/x86/kvm/paging_tmpl.h
+++ b/arch/x86/kvm/paging_tmpl.h
@@ -324,10 +324,11 @@ retry_walk:
--walker->level;
index = PT_INDEX(addr, walker->level);
-
table_gfn = gpte_to_gfn(pte);
offset = index * sizeof(pt_element_t);
pte_gpa = gfn_to_gpa(table_gfn) + offset;
+
+ BUG_ON(walker->level < 1);
walker->table_gfn[walker->level - 1] = table_gfn;
walker->pte_gpa[walker->level - 1] = pte_gpa;
diff --git a/arch/x86/kvm/pmu_intel.c b/arch/x86/kvm/pmu_intel.c
index 9d4a8504a95a..5ab4a364348e 100644
--- a/arch/x86/kvm/pmu_intel.c
+++ b/arch/x86/kvm/pmu_intel.c
@@ -294,7 +294,7 @@ static void intel_pmu_refresh(struct kvm_vcpu *vcpu)
((u64)1 << edx.split.bit_width_fixed) - 1;
}
- pmu->global_ctrl = ((1 << pmu->nr_arch_gp_counters) - 1) |
+ pmu->global_ctrl = ((1ull << pmu->nr_arch_gp_counters) - 1) |
(((1ull << pmu->nr_arch_fixed_counters) - 1) << INTEL_PMC_IDX_FIXED);
pmu->global_ctrl_mask = ~pmu->global_ctrl;
diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c
index 8ca1eca5038d..23f1a6bd7a0d 100644
--- a/arch/x86/kvm/svm.c
+++ b/arch/x86/kvm/svm.c
@@ -2149,6 +2149,8 @@ static int ud_interception(struct vcpu_svm *svm)
int er;
er = emulate_instruction(&svm->vcpu, EMULTYPE_TRAP_UD);
+ if (er == EMULATE_USER_EXIT)
+ return 0;
if (er != EMULATE_DONE)
kvm_queue_exception(&svm->vcpu, UD_VECTOR);
return 1;
@@ -3583,6 +3585,13 @@ static int svm_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr)
u32 ecx = msr->index;
u64 data = msr->data;
switch (ecx) {
+ case MSR_IA32_CR_PAT:
+ if (!kvm_mtrr_valid(vcpu, MSR_IA32_CR_PAT, data))
+ return 1;
+ vcpu->arch.pat = data;
+ svm->vmcb->save.g_pat = data;
+ mark_dirty(svm->vmcb, VMCB_NPT);
+ break;
case MSR_IA32_TSC:
kvm_write_tsc(vcpu, msr);
break;
diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index 64774f419c72..f0d3de153e29 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -2167,46 +2167,44 @@ static void vmx_vcpu_pi_load(struct kvm_vcpu *vcpu, int cpu)
struct pi_desc old, new;
unsigned int dest;
- if (!kvm_arch_has_assigned_device(vcpu->kvm) ||
- !irq_remapping_cap(IRQ_POSTING_CAP) ||
- !kvm_vcpu_apicv_active(vcpu))
+ /*
+ * In case of hot-plug or hot-unplug, we may have to undo
+ * vmx_vcpu_pi_put even if there is no assigned device. And we
+ * always keep PI.NDST up to date for simplicity: it makes the
+ * code easier, and CPU migration is not a fast path.
+ */
+ if (!pi_test_sn(pi_desc) && vcpu->cpu == cpu)
return;
+ /*
+ * First handle the simple case where no cmpxchg is necessary; just
+ * allow posting non-urgent interrupts.
+ *
+ * If the 'nv' field is POSTED_INTR_WAKEUP_VECTOR, do not change
+ * PI.NDST: pi_post_block will do it for us and the wakeup_handler
+ * expects the VCPU to be on the blocked_vcpu_list that matches
+ * PI.NDST.
+ */
+ if (pi_desc->nv == POSTED_INTR_WAKEUP_VECTOR ||
+ vcpu->cpu == cpu) {
+ pi_clear_sn(pi_desc);
+ return;
+ }
+
+ /* The full case. */
do {
old.control = new.control = pi_desc->control;
- /*
- * If 'nv' field is POSTED_INTR_WAKEUP_VECTOR, there
- * are two possible cases:
- * 1. After running 'pre_block', context switch
- * happened. For this case, 'sn' was set in
- * vmx_vcpu_put(), so we need to clear it here.
- * 2. After running 'pre_block', we were blocked,
- * and woken up by some other guy. For this case,
- * we don't need to do anything, 'pi_post_block'
- * will do everything for us. However, we cannot
- * check whether it is case #1 or case #2 here
- * (maybe, not needed), so we also clear sn here,
- * I think it is not a big deal.
- */
- if (pi_desc->nv != POSTED_INTR_WAKEUP_VECTOR) {
- if (vcpu->cpu != cpu) {
- dest = cpu_physical_id(cpu);
-
- if (x2apic_enabled())
- new.ndst = dest;
- else
- new.ndst = (dest << 8) & 0xFF00;
- }
+ dest = cpu_physical_id(cpu);
- /* set 'NV' to 'notification vector' */
- new.nv = POSTED_INTR_VECTOR;
- }
+ if (x2apic_enabled())
+ new.ndst = dest;
+ else
+ new.ndst = (dest << 8) & 0xFF00;
- /* Allow posting non-urgent interrupts */
new.sn = 0;
- } while (cmpxchg(&pi_desc->control, old.control,
- new.control) != old.control);
+ } while (cmpxchg64(&pi_desc->control, old.control,
+ new.control) != old.control);
}
static void decache_tsc_multiplier(struct vcpu_vmx *vmx)
@@ -2455,7 +2453,7 @@ static int nested_vmx_check_exception(struct kvm_vcpu *vcpu, unsigned nr)
if (!(vmcs12->exception_bitmap & (1u << nr)))
return 0;
- nested_vmx_vmexit(vcpu, to_vmx(vcpu)->exit_reason,
+ nested_vmx_vmexit(vcpu, EXIT_REASON_EXCEPTION_NMI,
vmcs_read32(VM_EXIT_INTR_INFO),
vmcs_readl(EXIT_QUALIFICATION));
return 1;
@@ -2987,7 +2985,8 @@ static int vmx_get_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
msr_info->data = vmcs_readl(GUEST_SYSENTER_ESP);
break;
case MSR_IA32_BNDCFGS:
- if (!kvm_mpx_supported())
+ if (!kvm_mpx_supported() ||
+ (!msr_info->host_initiated && !guest_cpuid_has_mpx(vcpu)))
return 1;
msr_info->data = vmcs_read64(GUEST_BNDCFGS);
break;
@@ -3069,7 +3068,11 @@ static int vmx_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
vmcs_writel(GUEST_SYSENTER_ESP, data);
break;
case MSR_IA32_BNDCFGS:
- if (!kvm_mpx_supported())
+ if (!kvm_mpx_supported() ||
+ (!msr_info->host_initiated && !guest_cpuid_has_mpx(vcpu)))
+ return 1;
+ if (is_noncanonical_address(data & PAGE_MASK) ||
+ (data & MSR_IA32_BNDCFGS_RSVD))
return 1;
vmcs_write64(GUEST_BNDCFGS, data);
break;
@@ -3693,7 +3696,7 @@ static void fix_rmode_seg(int seg, struct kvm_segment *save)
}
vmcs_write16(sf->selector, var.selector);
- vmcs_write32(sf->base, var.base);
+ vmcs_writel(sf->base, var.base);
vmcs_write32(sf->limit, var.limit);
vmcs_write32(sf->ar_bytes, vmx_segment_access_rights(&var));
}
@@ -4756,21 +4759,30 @@ static inline bool kvm_vcpu_trigger_posted_interrupt(struct kvm_vcpu *vcpu)
{
#ifdef CONFIG_SMP
if (vcpu->mode == IN_GUEST_MODE) {
- struct vcpu_vmx *vmx = to_vmx(vcpu);
-
/*
- * Currently, we don't support urgent interrupt,
- * all interrupts are recognized as non-urgent
- * interrupt, so we cannot post interrupts when
- * 'SN' is set.
+ * The vector of interrupt to be delivered to vcpu had
+ * been set in PIR before this function.
*
- * If the vcpu is in guest mode, it means it is
- * running instead of being scheduled out and
- * waiting in the run queue, and that's the only
- * case when 'SN' is set currently, warning if
- * 'SN' is set.
+ * Following cases will be reached in this block, and
+ * we always send a notification event in all cases as
+ * explained below.
+ *
+ * Case 1: vcpu keeps in non-root mode. Sending a
+ * notification event posts the interrupt to vcpu.
+ *
+ * Case 2: vcpu exits to root mode and is still
+ * runnable. PIR will be synced to vIRR before the
+ * next vcpu entry. Sending a notification event in
+ * this case has no effect, as vcpu is not in root
+ * mode.
+ *
+ * Case 3: vcpu exits to root mode and is blocked.
+ * vcpu_block() has already synced PIR to vIRR and
+ * never blocks vcpu if vIRR is not cleared. Therefore,
+ * a blocked vcpu here does not wait for any requested
+ * interrupts in PIR, and sending a notification event
+ * which has no effect is safe here.
*/
- WARN_ON_ONCE(pi_test_sn(&vmx->pi_desc));
apic->send_IPI_mask(get_cpu_mask(vcpu->cpu),
POSTED_INTR_VECTOR);
@@ -5490,6 +5502,8 @@ static int handle_exception(struct kvm_vcpu *vcpu)
return 1;
}
er = emulate_instruction(vcpu, EMULTYPE_TRAP_UD);
+ if (er == EMULATE_USER_EXIT)
+ return 0;
if (er != EMULATE_DONE)
kvm_queue_exception(vcpu, UD_VECTOR);
return 1;
@@ -6474,7 +6488,6 @@ static __init int hardware_setup(void)
vmx_disable_intercept_for_msr(MSR_IA32_SYSENTER_CS, false);
vmx_disable_intercept_for_msr(MSR_IA32_SYSENTER_ESP, false);
vmx_disable_intercept_for_msr(MSR_IA32_SYSENTER_EIP, false);
- vmx_disable_intercept_for_msr(MSR_IA32_BNDCFGS, true);
memcpy(vmx_msr_bitmap_legacy_x2apic,
vmx_msr_bitmap_legacy, PAGE_SIZE);
@@ -6925,14 +6938,20 @@ static int nested_vmx_check_vmptr(struct kvm_vcpu *vcpu, int exit_reason,
}
page = nested_get_page(vcpu, vmptr);
- if (page == NULL ||
- *(u32 *)kmap(page) != VMCS12_REVISION) {
+ if (page == NULL) {
nested_vmx_failInvalid(vcpu);
+ skip_emulated_instruction(vcpu);
+ return 1;
+ }
+ if (*(u32 *)kmap(page) != VMCS12_REVISION) {
kunmap(page);
+ nested_release_page_clean(page);
+ nested_vmx_failInvalid(vcpu);
skip_emulated_instruction(vcpu);
return 1;
}
kunmap(page);
+ nested_release_page_clean(page);
vmx->nested.vmxon_ptr = vmptr;
break;
case EXIT_REASON_VMCLEAR:
@@ -8045,8 +8064,6 @@ static bool nested_vmx_exit_handled(struct kvm_vcpu *vcpu)
case EXIT_REASON_TASK_SWITCH:
return true;
case EXIT_REASON_CPUID:
- if (kvm_register_read(vcpu, VCPU_REGS_RAX) == 0xa)
- return false;
return true;
case EXIT_REASON_HLT:
return nested_cpu_has(vmcs12, CPU_BASED_HLT_EXITING);
@@ -8131,6 +8148,9 @@ static bool nested_vmx_exit_handled(struct kvm_vcpu *vcpu)
return nested_cpu_has2(vmcs12, SECONDARY_EXEC_XSAVES);
case EXIT_REASON_PREEMPTION_TIMER:
return false;
+ case EXIT_REASON_PML_FULL:
+ /* We don't expose PML support to L1. */
+ return false;
default:
return true;
}
@@ -8202,7 +8222,7 @@ static void kvm_flush_pml_buffers(struct kvm *kvm)
static void vmx_dump_sel(char *name, uint32_t sel)
{
pr_err("%s sel=0x%04x, attr=0x%05x, limit=0x%08x, base=0x%016lx\n",
- name, vmcs_read32(sel),
+ name, vmcs_read16(sel),
vmcs_read32(sel + GUEST_ES_AR_BYTES - GUEST_ES_SELECTOR),
vmcs_read32(sel + GUEST_ES_LIMIT - GUEST_ES_SELECTOR),
vmcs_readl(sel + GUEST_ES_BASE - GUEST_ES_SELECTOR));
@@ -9176,6 +9196,13 @@ static struct kvm_vcpu *vmx_create_vcpu(struct kvm *kvm, unsigned int id)
vmx->msr_ia32_feature_control_valid_bits = FEATURE_CONTROL_LOCKED;
+ /*
+ * Enforce invariant: pi_desc.nv is always either POSTED_INTR_VECTOR
+ * or POSTED_INTR_WAKEUP_VECTOR.
+ */
+ vmx->pi_desc.nv = POSTED_INTR_VECTOR;
+ vmx->pi_desc.sn = 1;
+
return &vmx->vcpu;
free_vmcs:
@@ -9985,6 +10012,11 @@ static void prepare_vmcs02(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12)
vmcs_write64(VIRTUAL_APIC_PAGE_ADDR,
page_to_phys(vmx->nested.virtual_apic_page));
vmcs_write32(TPR_THRESHOLD, vmcs12->tpr_threshold);
+ } else {
+#ifdef CONFIG_X86_64
+ exec_control |= CPU_BASED_CR8_LOAD_EXITING |
+ CPU_BASED_CR8_STORE_EXITING;
+#endif
}
if (cpu_has_vmx_msr_bitmap() &&
@@ -10067,6 +10099,18 @@ static void prepare_vmcs02(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12)
}
+ if (enable_pml) {
+ /*
+ * Conceptually we want to copy the PML address and index from
+ * vmcs01 here, and then back to vmcs01 on nested vmexit. But,
+ * since we always flush the log on each vmexit, this happens
+ * to be equivalent to simply resetting the fields in vmcs02.
+ */
+ ASSERT(vmx->pml_pg);
+ vmcs_write64(PML_ADDRESS, page_to_phys(vmx->pml_pg));
+ vmcs_write16(GUEST_PML_INDEX, PML_ENTITY_NUM - 1);
+ }
+
if (nested_cpu_has_ept(vmcs12)) {
kvm_mmu_unload(vcpu);
nested_ept_init_mmu_context(vcpu);
@@ -10648,7 +10692,7 @@ static void load_vmcs12_host_state(struct kvm_vcpu *vcpu,
* (KVM doesn't change it)- no reason to call set_cr4_guest_host_mask();
*/
vcpu->arch.cr4_guest_owned_bits = ~vmcs_readl(CR4_GUEST_HOST_MASK);
- kvm_set_cr4(vcpu, vmcs12->host_cr4);
+ vmx_set_cr4(vcpu, vmcs12->host_cr4);
nested_ept_uninit_mmu_context(vcpu);
@@ -10673,6 +10717,8 @@ static void load_vmcs12_host_state(struct kvm_vcpu *vcpu,
vmcs_writel(GUEST_SYSENTER_EIP, vmcs12->host_ia32_sysenter_eip);
vmcs_writel(GUEST_IDTR_BASE, vmcs12->host_idtr_base);
vmcs_writel(GUEST_GDTR_BASE, vmcs12->host_gdtr_base);
+ vmcs_write32(GUEST_IDTR_LIMIT, 0xFFFF);
+ vmcs_write32(GUEST_GDTR_LIMIT, 0xFFFF);
/* If not VM_EXIT_CLEAR_BNDCFGS, the L2 value propagates to L1. */
if (vmcs12->vm_exit_controls & VM_EXIT_CLEAR_BNDCFGS)
@@ -10977,6 +11023,37 @@ static void vmx_enable_log_dirty_pt_masked(struct kvm *kvm,
kvm_mmu_clear_dirty_pt_masked(kvm, memslot, offset, mask);
}
+static void __pi_post_block(struct kvm_vcpu *vcpu)
+{
+ struct pi_desc *pi_desc = vcpu_to_pi_desc(vcpu);
+ struct pi_desc old, new;
+ unsigned int dest;
+
+ do {
+ old.control = new.control = pi_desc->control;
+ WARN(old.nv != POSTED_INTR_WAKEUP_VECTOR,
+ "Wakeup handler not enabled while the VCPU is blocked\n");
+
+ dest = cpu_physical_id(vcpu->cpu);
+
+ if (x2apic_enabled())
+ new.ndst = dest;
+ else
+ new.ndst = (dest << 8) & 0xFF00;
+
+ /* set 'NV' to 'notification vector' */
+ new.nv = POSTED_INTR_VECTOR;
+ } while (cmpxchg64(&pi_desc->control, old.control,
+ new.control) != old.control);
+
+ if (!WARN_ON_ONCE(vcpu->pre_pcpu == -1)) {
+ spin_lock(&per_cpu(blocked_vcpu_on_cpu_lock, vcpu->pre_pcpu));
+ list_del(&vcpu->blocked_vcpu_list);
+ spin_unlock(&per_cpu(blocked_vcpu_on_cpu_lock, vcpu->pre_pcpu));
+ vcpu->pre_pcpu = -1;
+ }
+}
+
/*
* This routine does the following things for vCPU which is going
* to be blocked if VT-d PI is enabled.
@@ -10992,7 +11069,6 @@ static void vmx_enable_log_dirty_pt_masked(struct kvm *kvm,
*/
static int pi_pre_block(struct kvm_vcpu *vcpu)
{
- unsigned long flags;
unsigned int dest;
struct pi_desc old, new;
struct pi_desc *pi_desc = vcpu_to_pi_desc(vcpu);
@@ -11002,34 +11078,20 @@ static int pi_pre_block(struct kvm_vcpu *vcpu)
!kvm_vcpu_apicv_active(vcpu))
return 0;
- vcpu->pre_pcpu = vcpu->cpu;
- spin_lock_irqsave(&per_cpu(blocked_vcpu_on_cpu_lock,
- vcpu->pre_pcpu), flags);
- list_add_tail(&vcpu->blocked_vcpu_list,
- &per_cpu(blocked_vcpu_on_cpu,
- vcpu->pre_pcpu));
- spin_unlock_irqrestore(&per_cpu(blocked_vcpu_on_cpu_lock,
- vcpu->pre_pcpu), flags);
+ WARN_ON(irqs_disabled());
+ local_irq_disable();
+ if (!WARN_ON_ONCE(vcpu->pre_pcpu != -1)) {
+ vcpu->pre_pcpu = vcpu->cpu;
+ spin_lock(&per_cpu(blocked_vcpu_on_cpu_lock, vcpu->pre_pcpu));
+ list_add_tail(&vcpu->blocked_vcpu_list,
+ &per_cpu(blocked_vcpu_on_cpu,
+ vcpu->pre_pcpu));
+ spin_unlock(&per_cpu(blocked_vcpu_on_cpu_lock, vcpu->pre_pcpu));
+ }
do {
old.control = new.control = pi_desc->control;
- /*
- * We should not block the vCPU if
- * an interrupt is posted for it.
- */
- if (pi_test_on(pi_desc) == 1) {
- spin_lock_irqsave(&per_cpu(blocked_vcpu_on_cpu_lock,
- vcpu->pre_pcpu), flags);
- list_del(&vcpu->blocked_vcpu_list);
- spin_unlock_irqrestore(
- &per_cpu(blocked_vcpu_on_cpu_lock,
- vcpu->pre_pcpu), flags);
- vcpu->pre_pcpu = -1;
-
- return 1;
- }
-
WARN((pi_desc->sn == 1),
"Warning: SN field of posted-interrupts "
"is set before blocking\n");
@@ -11051,10 +11113,15 @@ static int pi_pre_block(struct kvm_vcpu *vcpu)
/* set 'NV' to 'wakeup vector' */
new.nv = POSTED_INTR_WAKEUP_VECTOR;
- } while (cmpxchg(&pi_desc->control, old.control,
- new.control) != old.control);
+ } while (cmpxchg64(&pi_desc->control, old.control,
+ new.control) != old.control);
- return 0;
+ /* We should not block the vCPU if an interrupt is posted for it. */
+ if (pi_test_on(pi_desc) == 1)
+ __pi_post_block(vcpu);
+
+ local_irq_enable();
+ return (vcpu->pre_pcpu == -1);
}
static int vmx_pre_block(struct kvm_vcpu *vcpu)
@@ -11070,44 +11137,13 @@ static int vmx_pre_block(struct kvm_vcpu *vcpu)
static void pi_post_block(struct kvm_vcpu *vcpu)
{
- struct pi_desc *pi_desc = vcpu_to_pi_desc(vcpu);
- struct pi_desc old, new;
- unsigned int dest;
- unsigned long flags;
-
- if (!kvm_arch_has_assigned_device(vcpu->kvm) ||
- !irq_remapping_cap(IRQ_POSTING_CAP) ||
- !kvm_vcpu_apicv_active(vcpu))
+ if (vcpu->pre_pcpu == -1)
return;
- do {
- old.control = new.control = pi_desc->control;
-
- dest = cpu_physical_id(vcpu->cpu);
-
- if (x2apic_enabled())
- new.ndst = dest;
- else
- new.ndst = (dest << 8) & 0xFF00;
-
- /* Allow posting non-urgent interrupts */
- new.sn = 0;
-
- /* set 'NV' to 'notification vector' */
- new.nv = POSTED_INTR_VECTOR;
- } while (cmpxchg(&pi_desc->control, old.control,
- new.control) != old.control);
-
- if(vcpu->pre_pcpu != -1) {
- spin_lock_irqsave(
- &per_cpu(blocked_vcpu_on_cpu_lock,
- vcpu->pre_pcpu), flags);
- list_del(&vcpu->blocked_vcpu_list);
- spin_unlock_irqrestore(
- &per_cpu(blocked_vcpu_on_cpu_lock,
- vcpu->pre_pcpu), flags);
- vcpu->pre_pcpu = -1;
- }
+ WARN_ON(irqs_disabled());
+ local_irq_disable();
+ __pi_post_block(vcpu);
+ local_irq_enable();
}
static void vmx_post_block(struct kvm_vcpu *vcpu)
@@ -11135,7 +11171,7 @@ static int vmx_update_pi_irte(struct kvm *kvm, unsigned int host_irq,
struct kvm_lapic_irq irq;
struct kvm_vcpu *vcpu;
struct vcpu_data vcpu_info;
- int idx, ret = -EINVAL;
+ int idx, ret = 0;
if (!kvm_arch_has_assigned_device(kvm) ||
!irq_remapping_cap(IRQ_POSTING_CAP) ||
@@ -11144,7 +11180,12 @@ static int vmx_update_pi_irte(struct kvm *kvm, unsigned int host_irq,
idx = srcu_read_lock(&kvm->irq_srcu);
irq_rt = srcu_dereference(kvm->irq_routing, &kvm->irq_srcu);
- BUG_ON(guest_irq >= irq_rt->nr_rt_entries);
+ if (guest_irq >= irq_rt->nr_rt_entries ||
+ hlist_empty(&irq_rt->map[guest_irq])) {
+ pr_warn_once("no route for guest_irq %u/%u (broken user space?)\n",
+ guest_irq, irq_rt->nr_rt_entries);
+ goto out;
+ }
hlist_for_each_entry(e, &irq_rt->map[guest_irq], link) {
if (e->type != KVM_IRQ_ROUTING_MSI)
@@ -11187,12 +11228,8 @@ static int vmx_update_pi_irte(struct kvm *kvm, unsigned int host_irq,
if (set)
ret = irq_set_vcpu_affinity(host_irq, &vcpu_info);
- else {
- /* suppress notification event before unposting */
- pi_set_sn(vcpu_to_pi_desc(vcpu));
+ else
ret = irq_set_vcpu_affinity(host_irq, NULL);
- pi_clear_sn(vcpu_to_pi_desc(vcpu));
- }
if (ret < 0) {
printk(KERN_INFO "%s: failed to update PI IRTE\n",
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 731044efb195..02d45296a97c 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -1735,6 +1735,7 @@ static u64 __get_kvmclock_ns(struct kvm *kvm)
{
struct kvm_arch *ka = &kvm->arch;
struct pvclock_vcpu_time_info hv_clock;
+ u64 ret;
spin_lock(&ka->pvclock_gtod_sync_lock);
if (!ka->use_master_clock) {
@@ -1746,10 +1747,17 @@ static u64 __get_kvmclock_ns(struct kvm *kvm)
hv_clock.system_time = ka->master_kernel_ns + ka->kvmclock_offset;
spin_unlock(&ka->pvclock_gtod_sync_lock);
+ /* both __this_cpu_read() and rdtsc() should be on the same cpu */
+ get_cpu();
+
kvm_get_time_scale(NSEC_PER_SEC, __this_cpu_read(cpu_tsc_khz) * 1000LL,
&hv_clock.tsc_shift,
&hv_clock.tsc_to_system_mul);
- return __pvclock_read_cycles(&hv_clock, rdtsc());
+ ret = __pvclock_read_cycles(&hv_clock, rdtsc());
+
+ put_cpu();
+
+ return ret;
}
u64 get_kvmclock_ns(struct kvm *kvm)
@@ -1789,6 +1797,9 @@ static void kvm_setup_pvclock_page(struct kvm_vcpu *v)
*/
BUILD_BUG_ON(offsetof(struct pvclock_vcpu_time_info, version) != 0);
+ if (guest_hv_clock.version & 1)
+ ++guest_hv_clock.version; /* first time write, random junk */
+
vcpu->hv_clock.version = guest_hv_clock.version + 1;
kvm_write_guest_cached(v->kvm, &vcpu->pv_time,
&vcpu->hv_clock,
@@ -3051,6 +3062,12 @@ static int kvm_vcpu_ioctl_x86_set_vcpu_events(struct kvm_vcpu *vcpu,
(events->exception.nr > 31 || events->exception.nr == NMI_VECTOR))
return -EINVAL;
+ /* INITs are latched while in SMM */
+ if (events->flags & KVM_VCPUEVENT_VALID_SMM &&
+ (events->smi.smm || events->smi.pending) &&
+ vcpu->arch.mp_state == KVM_MP_STATE_INIT_RECEIVED)
+ return -EINVAL;
+
process_nmi(vcpu);
vcpu->arch.exception.pending = events->exception.injected;
vcpu->arch.exception.nr = events->exception.nr;
@@ -3225,11 +3242,14 @@ static void kvm_vcpu_ioctl_x86_get_xsave(struct kvm_vcpu *vcpu,
}
}
+#define XSAVE_MXCSR_OFFSET 24
+
static int kvm_vcpu_ioctl_x86_set_xsave(struct kvm_vcpu *vcpu,
struct kvm_xsave *guest_xsave)
{
u64 xstate_bv =
*(u64 *)&guest_xsave->region[XSAVE_HDR_OFFSET / sizeof(u32)];
+ u32 mxcsr = *(u32 *)&guest_xsave->region[XSAVE_MXCSR_OFFSET / sizeof(u32)];
if (boot_cpu_has(X86_FEATURE_XSAVE)) {
/*
@@ -3237,11 +3257,13 @@ static int kvm_vcpu_ioctl_x86_set_xsave(struct kvm_vcpu *vcpu,
* CPUID leaf 0xD, index 0, EDX:EAX. This is for compatibility
* with old userspace.
*/
- if (xstate_bv & ~kvm_supported_xcr0())
+ if (xstate_bv & ~kvm_supported_xcr0() ||
+ mxcsr & ~mxcsr_feature_mask)
return -EINVAL;
load_xsave(vcpu, (u8 *)guest_xsave->region);
} else {
- if (xstate_bv & ~XFEATURE_MASK_FPSSE)
+ if (xstate_bv & ~XFEATURE_MASK_FPSSE ||
+ mxcsr & ~mxcsr_feature_mask)
return -EINVAL;
memcpy(&vcpu->arch.guest_fpu.state.fxsave,
guest_xsave->region, sizeof(struct fxregs_state));
@@ -4744,16 +4766,20 @@ emul_write:
static int kernel_pio(struct kvm_vcpu *vcpu, void *pd)
{
- /* TODO: String I/O for in kernel device */
- int r;
+ int r = 0, i;
- if (vcpu->arch.pio.in)
- r = kvm_io_bus_read(vcpu, KVM_PIO_BUS, vcpu->arch.pio.port,
- vcpu->arch.pio.size, pd);
- else
- r = kvm_io_bus_write(vcpu, KVM_PIO_BUS,
- vcpu->arch.pio.port, vcpu->arch.pio.size,
- pd);
+ for (i = 0; i < vcpu->arch.pio.count; i++) {
+ if (vcpu->arch.pio.in)
+ r = kvm_io_bus_read(vcpu, KVM_PIO_BUS, vcpu->arch.pio.port,
+ vcpu->arch.pio.size, pd);
+ else
+ r = kvm_io_bus_write(vcpu, KVM_PIO_BUS,
+ vcpu->arch.pio.port, vcpu->arch.pio.size,
+ pd);
+ if (r)
+ break;
+ pd += vcpu->arch.pio.size;
+ }
return r;
}
@@ -4791,6 +4817,8 @@ static int emulator_pio_in_emulated(struct x86_emulate_ctxt *ctxt,
if (vcpu->arch.pio.count)
goto data_avail;
+ memset(vcpu->arch.pio_data, 0, size * count);
+
ret = emulator_pio_in_out(vcpu, size, port, val, count, true);
if (ret) {
data_avail:
@@ -4974,6 +5002,8 @@ static bool emulator_get_segment(struct x86_emulate_ctxt *ctxt, u16 *selector,
if (var.unusable) {
memset(desc, 0, sizeof(*desc));
+ if (base3)
+ *base3 = 0;
return false;
}
@@ -5129,6 +5159,16 @@ static void emulator_set_nmi_mask(struct x86_emulate_ctxt *ctxt, bool masked)
kvm_x86_ops->set_nmi_mask(emul_to_vcpu(ctxt), masked);
}
+static unsigned emulator_get_hflags(struct x86_emulate_ctxt *ctxt)
+{
+ return emul_to_vcpu(ctxt)->arch.hflags;
+}
+
+static void emulator_set_hflags(struct x86_emulate_ctxt *ctxt, unsigned emul_flags)
+{
+ kvm_set_hflags(emul_to_vcpu(ctxt), emul_flags);
+}
+
static const struct x86_emulate_ops emulate_ops = {
.read_gpr = emulator_read_gpr,
.write_gpr = emulator_write_gpr,
@@ -5168,6 +5208,8 @@ static const struct x86_emulate_ops emulate_ops = {
.intercept = emulator_intercept,
.get_cpuid = emulator_get_cpuid,
.set_nmi_mask = emulator_set_nmi_mask,
+ .get_hflags = emulator_get_hflags,
+ .set_hflags = emulator_set_hflags,
};
static void toggle_interruptibility(struct kvm_vcpu *vcpu, u32 mask)
@@ -5211,6 +5253,8 @@ static void init_emulate_ctxt(struct kvm_vcpu *vcpu)
kvm_x86_ops->get_cs_db_l_bits(vcpu, &cs_db, &cs_l);
ctxt->eflags = kvm_get_rflags(vcpu);
+ ctxt->tf = (ctxt->eflags & X86_EFLAGS_TF) != 0;
+
ctxt->eip = kvm_rip_read(vcpu);
ctxt->mode = (!is_protmode(vcpu)) ? X86EMUL_MODE_REAL :
(ctxt->eflags & X86_EFLAGS_VM) ? X86EMUL_MODE_VM86 :
@@ -5220,7 +5264,6 @@ static void init_emulate_ctxt(struct kvm_vcpu *vcpu)
BUILD_BUG_ON(HF_GUEST_MASK != X86EMUL_GUEST_MASK);
BUILD_BUG_ON(HF_SMM_MASK != X86EMUL_SMM_MASK);
BUILD_BUG_ON(HF_SMM_INSIDE_NMI_MASK != X86EMUL_SMM_INSIDE_NMI_MASK);
- ctxt->emul_flags = vcpu->arch.hflags;
init_decode_cache(ctxt);
vcpu->arch.emulate_regs_need_sync_from_vcpu = false;
@@ -5427,37 +5470,26 @@ static int kvm_vcpu_check_hw_bp(unsigned long addr, u32 type, u32 dr7,
return dr6;
}
-static void kvm_vcpu_check_singlestep(struct kvm_vcpu *vcpu, unsigned long rflags, int *r)
+static void kvm_vcpu_do_singlestep(struct kvm_vcpu *vcpu, int *r)
{
struct kvm_run *kvm_run = vcpu->run;
- /*
- * rflags is the old, "raw" value of the flags. The new value has
- * not been saved yet.
- *
- * This is correct even for TF set by the guest, because "the
- * processor will not generate this exception after the instruction
- * that sets the TF flag".
- */
- if (unlikely(rflags & X86_EFLAGS_TF)) {
- if (vcpu->guest_debug & KVM_GUESTDBG_SINGLESTEP) {
- kvm_run->debug.arch.dr6 = DR6_BS | DR6_FIXED_1 |
- DR6_RTM;
- kvm_run->debug.arch.pc = vcpu->arch.singlestep_rip;
- kvm_run->debug.arch.exception = DB_VECTOR;
- kvm_run->exit_reason = KVM_EXIT_DEBUG;
- *r = EMULATE_USER_EXIT;
- } else {
- vcpu->arch.emulate_ctxt.eflags &= ~X86_EFLAGS_TF;
- /*
- * "Certain debug exceptions may clear bit 0-3. The
- * remaining contents of the DR6 register are never
- * cleared by the processor".
- */
- vcpu->arch.dr6 &= ~15;
- vcpu->arch.dr6 |= DR6_BS | DR6_RTM;
- kvm_queue_exception(vcpu, DB_VECTOR);
- }
+ if (vcpu->guest_debug & KVM_GUESTDBG_SINGLESTEP) {
+ kvm_run->debug.arch.dr6 = DR6_BS | DR6_FIXED_1 | DR6_RTM;
+ kvm_run->debug.arch.pc = vcpu->arch.singlestep_rip;
+ kvm_run->debug.arch.exception = DB_VECTOR;
+ kvm_run->exit_reason = KVM_EXIT_DEBUG;
+ *r = EMULATE_USER_EXIT;
+ } else {
+ vcpu->arch.emulate_ctxt.eflags &= ~X86_EFLAGS_TF;
+ /*
+ * "Certain debug exceptions may clear bit 0-3. The
+ * remaining contents of the DR6 register are never
+ * cleared by the processor".
+ */
+ vcpu->arch.dr6 &= ~15;
+ vcpu->arch.dr6 |= DR6_BS | DR6_RTM;
+ kvm_queue_exception(vcpu, DB_VECTOR);
}
}
@@ -5547,6 +5579,8 @@ int x86_emulate_instruction(struct kvm_vcpu *vcpu,
if (reexecute_instruction(vcpu, cr2, write_fault_to_spt,
emulation_type))
return EMULATE_DONE;
+ if (ctxt->have_exception && inject_emulated_exception(vcpu))
+ return EMULATE_DONE;
if (emulation_type & EMULTYPE_SKIP)
return EMULATE_FAIL;
return handle_emulation_failure(vcpu);
@@ -5611,11 +5645,10 @@ restart:
unsigned long rflags = kvm_x86_ops->get_rflags(vcpu);
toggle_interruptibility(vcpu, ctxt->interruptibility);
vcpu->arch.emulate_regs_need_sync_to_vcpu = false;
- if (vcpu->arch.hflags != ctxt->emul_flags)
- kvm_set_hflags(vcpu, ctxt->emul_flags);
kvm_rip_write(vcpu, ctxt->eip);
- if (r == EMULATE_DONE)
- kvm_vcpu_check_singlestep(vcpu, rflags, &r);
+ if (r == EMULATE_DONE &&
+ (ctxt->tf || (vcpu->guest_debug & KVM_GUESTDBG_SINGLESTEP)))
+ kvm_vcpu_do_singlestep(vcpu, &r);
if (!ctxt->have_exception ||
exception_type(ctxt->exception.vector) == EXCPT_TRAP)
__kvm_set_rflags(vcpu, ctxt->eflags);
@@ -6086,7 +6119,8 @@ static int emulator_fix_hypercall(struct x86_emulate_ctxt *ctxt)
kvm_x86_ops->patch_hypercall(vcpu, instruction);
- return emulator_write_emulated(ctxt, rip, instruction, 3, NULL);
+ return emulator_write_emulated(ctxt, rip, instruction, 3,
+ &ctxt->exception);
}
static int dm_request_for_irq_injection(struct kvm_vcpu *vcpu)
@@ -7162,6 +7196,12 @@ int kvm_arch_vcpu_ioctl_set_mpstate(struct kvm_vcpu *vcpu,
mp_state->mp_state != KVM_MP_STATE_RUNNABLE)
return -EINVAL;
+ /* INITs are latched while in SMM */
+ if ((is_smm(vcpu) || vcpu->arch.smi_pending) &&
+ (mp_state->mp_state == KVM_MP_STATE_SIPI_RECEIVED ||
+ mp_state->mp_state == KVM_MP_STATE_INIT_RECEIVED))
+ return -EINVAL;
+
if (mp_state->mp_state == KVM_MP_STATE_SIPI_RECEIVED) {
vcpu->arch.mp_state = KVM_MP_STATE_INIT_RECEIVED;
set_bit(KVM_APIC_SIPI, &vcpu->arch.apic->pending_events);
@@ -7976,6 +8016,7 @@ void kvm_arch_destroy_vm(struct kvm *kvm)
kvm_free_vcpus(kvm);
kvfree(rcu_dereference_check(kvm->arch.apic_map, 1));
kvm_mmu_uninit_vm(kvm);
+ kvm_page_track_cleanup(kvm);
}
void kvm_arch_free_memslot(struct kvm *kvm, struct kvm_memory_slot *free,
@@ -8412,8 +8453,7 @@ bool kvm_arch_can_inject_async_page_present(struct kvm_vcpu *vcpu)
if (!(vcpu->arch.apf.msr_val & KVM_ASYNC_PF_ENABLED))
return true;
else
- return !kvm_event_needs_reinjection(vcpu) &&
- kvm_x86_ops->interrupt_allowed(vcpu);
+ return kvm_can_do_async_pf(vcpu);
}
void kvm_arch_start_assignment(struct kvm *kvm)