From 57b8daa70a179bc23cc4240420ab6fbcdd7faf77 Mon Sep 17 00:00:00 2001 From: Paul Mackerras Date: Fri, 20 Apr 2018 22:51:11 +1000 Subject: KVM: PPC: Book3S HV: Snapshot timebase offset on guest entry Currently, the HV KVM guest entry/exit code adds the timebase offset from the vcore struct to the timebase on guest entry, and subtracts it on guest exit. Which is fine, except that it is possible for userspace to change the offset using the SET_ONE_REG interface while the vcore is running, as there is only one timebase offset per vcore but potentially multiple VCPUs in the vcore. If that were to happen, KVM would subtract a different offset on guest exit from that which it had added on guest entry, leading to the timebase being out of sync between cores in the host, which then leads to bad things happening such as hangs and spurious watchdog timeouts. To fix this, we add a new field 'tb_offset_applied' to the vcore struct which stores the offset that is currently applied to the timebase. This value is set from the vcore tb_offset field on guest entry, and is what is subtracted from the timebase on guest exit. Since it is zero when the timebase offset is not applied, we can simplify the logic in kvmhv_start_timing and kvmhv_accumulate_time. In addition, we had secondary threads reading the timebase while running concurrently with code on the primary thread which would eventually add or subtract the timebase offset from the timebase. This occurred while saving or restoring the DEC register value on the secondary threads. Although no specific incorrect behaviour has been observed, this is a race which should be fixed. To fix it, we move the DEC saving code to just before we call kvmhv_commence_exit, and the DEC restoring code to after the point where we have waited for the primary thread to switch the MMU context and add the timebase offset. That way we are sure that the timebase contains the guest timebase value in both cases. Signed-off-by: Paul Mackerras --- arch/powerpc/include/asm/kvm_book3s.h | 1 + arch/powerpc/kernel/asm-offsets.c | 1 + arch/powerpc/kvm/book3s_hv.c | 1 + arch/powerpc/kvm/book3s_hv_rmhandlers.S | 89 ++++++++++++++++----------------- 4 files changed, 47 insertions(+), 45 deletions(-) (limited to 'arch') diff --git a/arch/powerpc/include/asm/kvm_book3s.h b/arch/powerpc/include/asm/kvm_book3s.h index 4c02a7378d06..e7377b73cfec 100644 --- a/arch/powerpc/include/asm/kvm_book3s.h +++ b/arch/powerpc/include/asm/kvm_book3s.h @@ -96,6 +96,7 @@ struct kvmppc_vcore { struct kvm_vcpu *runner; struct kvm *kvm; u64 tb_offset; /* guest timebase - host timebase */ + u64 tb_offset_applied; /* timebase offset currently in force */ ulong lpcr; u32 arch_compat; ulong pcr; diff --git a/arch/powerpc/kernel/asm-offsets.c b/arch/powerpc/kernel/asm-offsets.c index 6bee65f3cfd3..373dc1d6ef44 100644 --- a/arch/powerpc/kernel/asm-offsets.c +++ b/arch/powerpc/kernel/asm-offsets.c @@ -562,6 +562,7 @@ int main(void) OFFSET(VCORE_NAPPING_THREADS, kvmppc_vcore, napping_threads); OFFSET(VCORE_KVM, kvmppc_vcore, kvm); OFFSET(VCORE_TB_OFFSET, kvmppc_vcore, tb_offset); + OFFSET(VCORE_TB_OFFSET_APPL, kvmppc_vcore, tb_offset_applied); OFFSET(VCORE_LPCR, kvmppc_vcore, lpcr); OFFSET(VCORE_PCR, kvmppc_vcore, pcr); OFFSET(VCORE_DPDES, kvmppc_vcore, dpdes); diff --git a/arch/powerpc/kvm/book3s_hv.c b/arch/powerpc/kvm/book3s_hv.c index 4d07fca5121c..9963f65c212b 100644 --- a/arch/powerpc/kvm/book3s_hv.c +++ b/arch/powerpc/kvm/book3s_hv.c @@ -2441,6 +2441,7 @@ static void init_vcore_to_run(struct kvmppc_vcore *vc) vc->in_guest = 0; vc->napping_threads = 0; vc->conferring_threads = 0; + vc->tb_offset_applied = 0; } static bool can_dynamic_split(struct kvmppc_vcore *vc, struct core_info *cip) diff --git a/arch/powerpc/kvm/book3s_hv_rmhandlers.S b/arch/powerpc/kvm/book3s_hv_rmhandlers.S index bd63fa8a08b5..25c32e421b57 100644 --- a/arch/powerpc/kvm/book3s_hv_rmhandlers.S +++ b/arch/powerpc/kvm/book3s_hv_rmhandlers.S @@ -692,6 +692,7 @@ END_FTR_SECTION_IFSET(CPU_FTR_ARCH_300) 22: ld r8,VCORE_TB_OFFSET(r5) cmpdi r8,0 beq 37f + std r8, VCORE_TB_OFFSET_APPL(r5) mftb r6 /* current host timebase */ add r8,r8,r6 mtspr SPRN_TBU40,r8 /* update upper 40 bits */ @@ -940,18 +941,6 @@ FTR_SECTION_ELSE ALT_FTR_SECTION_END_IFCLR(CPU_FTR_ARCH_300) 8: - /* - * Set the decrementer to the guest decrementer. - */ - ld r8,VCPU_DEC_EXPIRES(r4) - /* r8 is a host timebase value here, convert to guest TB */ - ld r5,HSTATE_KVM_VCORE(r13) - ld r6,VCORE_TB_OFFSET(r5) - add r8,r8,r6 - mftb r7 - subf r3,r7,r8 - mtspr SPRN_DEC,r3 - ld r5, VCPU_SPRG0(r4) ld r6, VCPU_SPRG1(r4) ld r7, VCPU_SPRG2(r4) @@ -1005,6 +994,18 @@ ALT_FTR_SECTION_END_IFCLR(CPU_FTR_ARCH_300) mtspr SPRN_LPCR,r8 isync + /* + * Set the decrementer to the guest decrementer. + */ + ld r8,VCPU_DEC_EXPIRES(r4) + /* r8 is a host timebase value here, convert to guest TB */ + ld r5,HSTATE_KVM_VCORE(r13) + ld r6,VCORE_TB_OFFSET_APPL(r5) + add r8,r8,r6 + mftb r7 + subf r3,r7,r8 + mtspr SPRN_DEC,r3 + /* Check if HDEC expires soon */ mfspr r3, SPRN_HDEC EXTEND_HDEC(r3) @@ -1597,8 +1598,27 @@ END_MMU_FTR_SECTION_IFSET(MMU_FTR_TYPE_RADIX) guest_bypass: stw r12, STACK_SLOT_TRAP(r1) - mr r3, r12 + + /* Save DEC */ + /* Do this before kvmhv_commence_exit so we know TB is guest TB */ + ld r3, HSTATE_KVM_VCORE(r13) + mfspr r5,SPRN_DEC + mftb r6 + /* On P9, if the guest has large decr enabled, don't sign extend */ +BEGIN_FTR_SECTION + ld r4, VCORE_LPCR(r3) + andis. r4, r4, LPCR_LD@h + bne 16f +END_FTR_SECTION_IFSET(CPU_FTR_ARCH_300) + extsw r5,r5 +16: add r5,r5,r6 + /* r5 is a guest timebase value here, convert to host TB */ + ld r4,VCORE_TB_OFFSET_APPL(r3) + subf r5,r4,r5 + std r5,VCPU_DEC_EXPIRES(r9) + /* Increment exit count, poke other threads to exit */ + mr r3, r12 bl kvmhv_commence_exit nop ld r9, HSTATE_KVM_VCPU(r13) @@ -1639,23 +1659,6 @@ guest_bypass: mtspr SPRN_PURR,r3 mtspr SPRN_SPURR,r4 - /* Save DEC */ - ld r3, HSTATE_KVM_VCORE(r13) - mfspr r5,SPRN_DEC - mftb r6 - /* On P9, if the guest has large decr enabled, don't sign extend */ -BEGIN_FTR_SECTION - ld r4, VCORE_LPCR(r3) - andis. r4, r4, LPCR_LD@h - bne 16f -END_FTR_SECTION_IFSET(CPU_FTR_ARCH_300) - extsw r5,r5 -16: add r5,r5,r6 - /* r5 is a guest timebase value here, convert to host TB */ - ld r4,VCORE_TB_OFFSET(r3) - subf r5,r4,r5 - std r5,VCPU_DEC_EXPIRES(r9) - BEGIN_FTR_SECTION b 8f END_FTR_SECTION_IFCLR(CPU_FTR_ARCH_207S) @@ -2017,9 +2020,11 @@ END_FTR_SECTION_IFSET(CPU_FTR_ARCH_207S) 27: /* Subtract timebase offset from timebase */ - ld r8,VCORE_TB_OFFSET(r5) + ld r8, VCORE_TB_OFFSET_APPL(r5) cmpdi r8,0 beq 17f + li r0, 0 + std r0, VCORE_TB_OFFSET_APPL(r5) mftb r6 /* current guest timebase */ subf r8,r8,r6 mtspr SPRN_TBU40,r8 /* update upper 40 bits */ @@ -2700,7 +2705,7 @@ END_FTR_SECTION_IFSET(CPU_FTR_ARCH_300) add r3, r3, r5 ld r4, HSTATE_KVM_VCPU(r13) ld r5, HSTATE_KVM_VCORE(r13) - ld r6, VCORE_TB_OFFSET(r5) + ld r6, VCORE_TB_OFFSET_APPL(r5) subf r3, r6, r3 /* convert to host TB value */ std r3, VCPU_DEC_EXPIRES(r4) @@ -2799,7 +2804,7 @@ END_FTR_SECTION(CPU_FTR_TM | CPU_FTR_P9_TM_HV_ASSIST, 0) /* Restore guest decrementer */ ld r3, VCPU_DEC_EXPIRES(r4) ld r5, HSTATE_KVM_VCORE(r13) - ld r6, VCORE_TB_OFFSET(r5) + ld r6, VCORE_TB_OFFSET_APPL(r5) add r3, r3, r6 /* convert host TB to guest TB value */ mftb r7 subf r3, r7, r3 @@ -3606,12 +3611,9 @@ kvmppc_fix_pmao: */ kvmhv_start_timing: ld r5, HSTATE_KVM_VCORE(r13) - lbz r6, VCORE_IN_GUEST(r5) - cmpwi r6, 0 - beq 5f /* if in guest, need to */ - ld r6, VCORE_TB_OFFSET(r5) /* subtract timebase offset */ -5: mftb r5 - subf r5, r6, r5 + ld r6, VCORE_TB_OFFSET_APPL(r5) + mftb r5 + subf r5, r6, r5 /* subtract current timebase offset */ std r3, VCPU_CUR_ACTIVITY(r4) std r5, VCPU_ACTIVITY_START(r4) blr @@ -3622,15 +3624,12 @@ kvmhv_start_timing: */ kvmhv_accumulate_time: ld r5, HSTATE_KVM_VCORE(r13) - lbz r8, VCORE_IN_GUEST(r5) - cmpwi r8, 0 - beq 4f /* if in guest, need to */ - ld r8, VCORE_TB_OFFSET(r5) /* subtract timebase offset */ -4: ld r5, VCPU_CUR_ACTIVITY(r4) + ld r8, VCORE_TB_OFFSET_APPL(r5) + ld r5, VCPU_CUR_ACTIVITY(r4) ld r6, VCPU_ACTIVITY_START(r4) std r3, VCPU_CUR_ACTIVITY(r4) mftb r7 - subf r7, r8, r7 + subf r7, r8, r7 /* subtract current timebase offset */ std r7, VCPU_ACTIVITY_START(r4) cmpdi r5, 0 beqlr -- cgit v1.2.3 From e2560b108fb1375b5fab196c1ec0d910bbe8a38b Mon Sep 17 00:00:00 2001 From: Nicholas Piggin Date: Wed, 9 May 2018 12:20:14 +1000 Subject: KVM: PPC: Book3S HV: Make radix use correct tlbie sequence in kvmppc_radix_tlbie_page The standard eieio ; tlbsync ; ptesync must follow tlbie to ensure it is ordered with respect to subsequent operations. Signed-off-by: Nicholas Piggin Signed-off-by: Paul Mackerras --- arch/powerpc/kvm/book3s_64_mmu_radix.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'arch') diff --git a/arch/powerpc/kvm/book3s_64_mmu_radix.c b/arch/powerpc/kvm/book3s_64_mmu_radix.c index a57eafec4dc2..a6870288c0e0 100644 --- a/arch/powerpc/kvm/book3s_64_mmu_radix.c +++ b/arch/powerpc/kvm/book3s_64_mmu_radix.c @@ -162,7 +162,7 @@ static void kvmppc_radix_tlbie_page(struct kvm *kvm, unsigned long addr, if (cpu_has_feature(CPU_FTR_P9_TLBIE_BUG)) asm volatile(PPC_TLBIE_5(%0, %1, 0, 0, 1) : : "r" (addr), "r" (kvm->arch.lpid) : "memory"); - asm volatile("ptesync": : :"memory"); + asm volatile("eieio ; tlbsync ; ptesync": : :"memory"); } static void kvmppc_radix_flush_pwc(struct kvm *kvm, unsigned long addr) @@ -173,7 +173,7 @@ static void kvmppc_radix_flush_pwc(struct kvm *kvm, unsigned long addr) /* RIC=1 PRS=0 R=1 IS=2 */ asm volatile(PPC_TLBIE_5(%0, %1, 1, 0, 1) : : "r" (rb), "r" (kvm->arch.lpid) : "memory"); - asm volatile("ptesync": : :"memory"); + asm volatile("eieio ; tlbsync ; ptesync": : :"memory"); } unsigned long kvmppc_radix_update_pte(struct kvm *kvm, pte_t *ptep, -- cgit v1.2.3 From 7e3d9a1d0f2c681456a2e04b8ba9a2fb448fe515 Mon Sep 17 00:00:00 2001 From: Nicholas Piggin Date: Wed, 9 May 2018 12:20:15 +1000 Subject: KVM: PPC: Book3S HV: Make radix clear pte when unmapping The current partition table unmap code clears the _PAGE_PRESENT bit out of the pte, which leaves pud_huge/pmd_huge true and does not clear pud_present/pmd_present. This can confuse subsequent page faults and possibly lead to the guest looping doing continual hypervisor page faults. Signed-off-by: Nicholas Piggin Signed-off-by: Paul Mackerras --- arch/powerpc/kvm/book3s_64_mmu_radix.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch') diff --git a/arch/powerpc/kvm/book3s_64_mmu_radix.c b/arch/powerpc/kvm/book3s_64_mmu_radix.c index a6870288c0e0..361f42c8c73e 100644 --- a/arch/powerpc/kvm/book3s_64_mmu_radix.c +++ b/arch/powerpc/kvm/book3s_64_mmu_radix.c @@ -584,7 +584,7 @@ int kvm_unmap_radix(struct kvm *kvm, struct kvm_memory_slot *memslot, ptep = __find_linux_pte(kvm->arch.pgtable, gpa, NULL, &shift); if (ptep && pte_present(*ptep)) { - old = kvmppc_radix_update_pte(kvm, ptep, _PAGE_PRESENT, 0, + old = kvmppc_radix_update_pte(kvm, ptep, ~0UL, 0, gpa, shift); kvmppc_radix_tlbie_page(kvm, gpa, shift); if ((old & _PAGE_DIRTY) && memslot->dirty_bitmap) { -- cgit v1.2.3 From 9dc81d6b0f1e3c40bdf97671dd26a24f128e1182 Mon Sep 17 00:00:00 2001 From: Benjamin Herrenschmidt Date: Thu, 10 May 2018 13:06:42 +1000 Subject: KVM: PPC: Book3S HV: XIVE: Resend re-routed interrupts on CPU priority change When a vcpu priority (CPPR) is set to a lower value (masking more interrupts), we stop processing interrupts already in the queue for the priorities that have now been masked. If those interrupts were previously re-routed to a different CPU, they might still be stuck until the older one that has them in its queue processes them. In the case of guest CPU unplug, that can be never. To address that without creating additional overhead for the normal interrupt processing path, this changes H_CPPR handling so that when such a priority change occurs, we scan the interrupt queue for that vCPU, and for any interrupt in there that has been re-routed, we replace it with a dummy and force a re-trigger. Signed-off-by: Benjamin Herrenschmidt Tested-by: Alexey Kardashevskiy Signed-off-by: Paul Mackerras --- arch/powerpc/kvm/book3s_xive_template.c | 108 +++++++++++++++++++++++++++++--- 1 file changed, 101 insertions(+), 7 deletions(-) (limited to 'arch') diff --git a/arch/powerpc/kvm/book3s_xive_template.c b/arch/powerpc/kvm/book3s_xive_template.c index c7a5deadd1cc..99c3620b40d9 100644 --- a/arch/powerpc/kvm/book3s_xive_template.c +++ b/arch/powerpc/kvm/book3s_xive_template.c @@ -11,6 +11,9 @@ #define XGLUE(a,b) a##b #define GLUE(a,b) XGLUE(a,b) +/* Dummy interrupt used when taking interrupts out of a queue in H_CPPR */ +#define XICS_DUMMY 1 + static void GLUE(X_PFX,ack_pending)(struct kvmppc_xive_vcpu *xc) { u8 cppr; @@ -205,6 +208,10 @@ skip_ipi: goto skip_ipi; } + /* If it's the dummy interrupt, continue searching */ + if (hirq == XICS_DUMMY) + goto skip_ipi; + /* If fetching, update queue pointers */ if (scan_type == scan_fetch) { q->idx = idx; @@ -385,9 +392,76 @@ static void GLUE(X_PFX,push_pending_to_hw)(struct kvmppc_xive_vcpu *xc) __x_writeb(prio, __x_tima + TM_SPC_SET_OS_PENDING); } +static void GLUE(X_PFX,scan_for_rerouted_irqs)(struct kvmppc_xive *xive, + struct kvmppc_xive_vcpu *xc) +{ + unsigned int prio; + + /* For each priority that is now masked */ + for (prio = xc->cppr; prio < KVMPPC_XIVE_Q_COUNT; prio++) { + struct xive_q *q = &xc->queues[prio]; + struct kvmppc_xive_irq_state *state; + struct kvmppc_xive_src_block *sb; + u32 idx, toggle, entry, irq, hw_num; + struct xive_irq_data *xd; + __be32 *qpage; + u16 src; + + idx = q->idx; + toggle = q->toggle; + qpage = READ_ONCE(q->qpage); + if (!qpage) + continue; + + /* For each interrupt in the queue */ + for (;;) { + entry = be32_to_cpup(qpage + idx); + + /* No more ? */ + if ((entry >> 31) == toggle) + break; + irq = entry & 0x7fffffff; + + /* Skip dummies and IPIs */ + if (irq == XICS_DUMMY || irq == XICS_IPI) + goto next; + sb = kvmppc_xive_find_source(xive, irq, &src); + if (!sb) + goto next; + state = &sb->irq_state[src]; + + /* Has it been rerouted ? */ + if (xc->server_num == state->act_server) + goto next; + + /* + * Allright, it *has* been re-routed, kill it from + * the queue. + */ + qpage[idx] = cpu_to_be32((entry & 0x80000000) | XICS_DUMMY); + + /* Find the HW interrupt */ + kvmppc_xive_select_irq(state, &hw_num, &xd); + + /* If it's not an LSI, set PQ to 11 the EOI will force a resend */ + if (!(xd->flags & XIVE_IRQ_FLAG_LSI)) + GLUE(X_PFX,esb_load)(xd, XIVE_ESB_SET_PQ_11); + + /* EOI the source */ + GLUE(X_PFX,source_eoi)(hw_num, xd); + + next: + idx = (idx + 1) & q->msk; + if (idx == 0) + toggle ^= 1; + } + } +} + X_STATIC int GLUE(X_PFX,h_cppr)(struct kvm_vcpu *vcpu, unsigned long cppr) { struct kvmppc_xive_vcpu *xc = vcpu->arch.xive_vcpu; + struct kvmppc_xive *xive = vcpu->kvm->arch.xive; u8 old_cppr; pr_devel("H_CPPR(cppr=%ld)\n", cppr); @@ -407,14 +481,34 @@ X_STATIC int GLUE(X_PFX,h_cppr)(struct kvm_vcpu *vcpu, unsigned long cppr) */ smp_mb(); - /* - * We are masking less, we need to look for pending things - * to deliver and set VP pending bits accordingly to trigger - * a new interrupt otherwise we might miss MFRR changes for - * which we have optimized out sending an IPI signal. - */ - if (cppr > old_cppr) + if (cppr > old_cppr) { + /* + * We are masking less, we need to look for pending things + * to deliver and set VP pending bits accordingly to trigger + * a new interrupt otherwise we might miss MFRR changes for + * which we have optimized out sending an IPI signal. + */ GLUE(X_PFX,push_pending_to_hw)(xc); + } else { + /* + * We are masking more, we need to check the queue for any + * interrupt that has been routed to another CPU, take + * it out (replace it with the dummy) and retrigger it. + * + * This is necessary since those interrupts may otherwise + * never be processed, at least not until this CPU restores + * its CPPR. + * + * This is in theory racy vs. HW adding new interrupts to + * the queue. In practice this works because the interesting + * cases are when the guest has done a set_xive() to move the + * interrupt away, which flushes the xive, followed by the + * target CPU doing a H_CPPR. So any new interrupt coming into + * the queue must still be routed to us and isn't a source + * of concern. + */ + GLUE(X_PFX,scan_for_rerouted_irqs)(xive, xc); + } /* Apply new CPPR */ xc->hw_cppr = cppr; -- cgit v1.2.3 From df158189dbcc2e0ee29dc4b917d45ee5bf25a35e Mon Sep 17 00:00:00 2001 From: Paul Mackerras Date: Thu, 17 May 2018 14:47:59 +1000 Subject: KVM: PPC: Book 3S HV: Do ptesync in radix guest exit path A radix guest can execute tlbie instructions to invalidate TLB entries. After a tlbie or a group of tlbies, it must then do the architected sequence eieio; tlbsync; ptesync to ensure that the TLB invalidation has been processed by all CPUs in the system before it can rely on no CPU using any translation that it just invalidated. In fact it is the ptesync which does the actual synchronization in this sequence, and hardware has a requirement that the ptesync must be executed on the same CPU thread as the tlbies which it is expected to order. Thus, if a vCPU gets moved from one physical CPU to another after it has done some tlbies but before it can get to do the ptesync, the ptesync will not have the desired effect when it is executed on the second physical CPU. To fix this, we do a ptesync in the exit path for radix guests. If there are any pending tlbies, this will wait for them to complete. If there aren't, then ptesync will just do the same as sync. Signed-off-by: Paul Mackerras --- arch/powerpc/kvm/book3s_hv_rmhandlers.S | 8 ++++++++ 1 file changed, 8 insertions(+) (limited to 'arch') diff --git a/arch/powerpc/kvm/book3s_hv_rmhandlers.S b/arch/powerpc/kvm/book3s_hv_rmhandlers.S index 25c32e421b57..07ca1b2a7966 100644 --- a/arch/powerpc/kvm/book3s_hv_rmhandlers.S +++ b/arch/powerpc/kvm/book3s_hv_rmhandlers.S @@ -1908,6 +1908,14 @@ END_FTR_SECTION_IFSET(CPU_FTR_ARCH_300) cmpwi cr2, r0, 0 beq cr2, 4f + /* + * Radix: do eieio; tlbsync; ptesync sequence in case we + * interrupted the guest between a tlbie and a ptesync. + */ + eieio + tlbsync + ptesync + /* Radix: Handle the case where the guest used an illegal PID */ LOAD_REG_ADDR(r4, mmu_base_pid) lwz r3, VCPU_GUEST_PID(r9) -- cgit v1.2.3 From f4a551b72358facbbe5714248dff78404272feee Mon Sep 17 00:00:00 2001 From: David Hildenbrand Date: Wed, 9 May 2018 16:12:17 +0200 Subject: KVM: s390: vsie: fix < 8k check for the itdba By missing an "L", we might detect some addresses to be <8k, although they are not. e.g. for itdba = 100001fff !(gpa & ~0x1fffU) -> 1 !(gpa & ~0x1fffUL) -> 0 So we would report a SIE validity intercept although everything is fine. Fixes: 166ecb3 ("KVM: s390: vsie: support transactional execution") Reported-by: Dan Carpenter Reviewed-by: Christian Borntraeger Reviewed-by: Janosch Frank Reviewed-by: Cornelia Huck Signed-off-by: David Hildenbrand Signed-off-by: Janosch Frank Cc: stable@vger.kernel.org # v4.8+ Signed-off-by: Christian Borntraeger --- arch/s390/kvm/vsie.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch') diff --git a/arch/s390/kvm/vsie.c b/arch/s390/kvm/vsie.c index 8961e3970901..969882b54266 100644 --- a/arch/s390/kvm/vsie.c +++ b/arch/s390/kvm/vsie.c @@ -578,7 +578,7 @@ static int pin_blocks(struct kvm_vcpu *vcpu, struct vsie_page *vsie_page) gpa = READ_ONCE(scb_o->itdba) & ~0xffUL; if (gpa && (scb_s->ecb & ECB_TE)) { - if (!(gpa & ~0x1fffU)) { + if (!(gpa & ~0x1fffUL)) { rc = set_validity_icpt(scb_s, 0x0080U); goto unpin; } -- cgit v1.2.3 From d8f2f498d9ed0c5010bc1bbc1146f94c8bf9f8cc Mon Sep 17 00:00:00 2001 From: David Vrabel Date: Fri, 18 May 2018 16:55:46 +0100 Subject: x86/kvm: fix LAPIC timer drift when guest uses periodic mode MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Since 4.10, commit 8003c9ae204e (KVM: LAPIC: add APIC Timer periodic/oneshot mode VMX preemption timer support), guests using periodic LAPIC timers (such as FreeBSD 8.4) would see their timers drift significantly over time. Differences in the underlying clocks and numerical errors means the periods of the two timers (hv and sw) are not the same. This difference will accumulate with every expiry resulting in a large error between the hv and sw timer. This means the sw timer may be running slow when compared to the hv timer. When the timer is switched from hv to sw, the now active sw timer will expire late. The guest VCPU is reentered and it switches to using the hv timer. This timer catches up, injecting multiple IRQs into the guest (of which the guest only sees one as it does not get to run until the hv timer has caught up) and thus the guest's timer rate is low (and becomes increasing slower over time as the sw timer lags further and further behind). I believe a similar problem would occur if the hv timer is the slower one, but I have not observed this. Fix this by synchronizing the deadlines for both timers to the same time source on every tick. This prevents the errors from accumulating. Fixes: 8003c9ae204e21204e49816c5ea629357e283b06 Cc: Wanpeng Li Signed-off-by: David Vrabel Cc: stable@vger.kernel.org Reviewed-by: Paolo Bonzini Reviewed-by: Wanpeng Li Signed-off-by: Radim Krčmář --- arch/x86/kvm/lapic.c | 16 ++++++++++++++-- 1 file changed, 14 insertions(+), 2 deletions(-) (limited to 'arch') diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c index b74c9c1405b9..3773c4625114 100644 --- a/arch/x86/kvm/lapic.c +++ b/arch/x86/kvm/lapic.c @@ -1522,11 +1522,23 @@ static bool set_target_expiration(struct kvm_lapic *apic) static void advance_periodic_target_expiration(struct kvm_lapic *apic) { - apic->lapic_timer.tscdeadline += - nsec_to_cycles(apic->vcpu, apic->lapic_timer.period); + ktime_t now = ktime_get(); + u64 tscl = rdtsc(); + ktime_t delta; + + /* + * Synchronize both deadlines to the same time source or + * differences in the periods (caused by differences in the + * underlying clocks or numerical approximation errors) will + * cause the two to drift apart over time as the errors + * accumulate. + */ apic->lapic_timer.target_expiration = ktime_add_ns(apic->lapic_timer.target_expiration, apic->lapic_timer.period); + delta = ktime_sub(apic->lapic_timer.target_expiration, now); + apic->lapic_timer.tscdeadline = kvm_read_l1_tsc(apic->vcpu, tscl) + + nsec_to_cycles(apic->vcpu, delta); } static void start_sw_period(struct kvm_lapic *apic) -- cgit v1.2.3 From c4d2188206bafa177ea58e9a25b952baa0bf7712 Mon Sep 17 00:00:00 2001 From: Wei Huang Date: Tue, 1 May 2018 09:49:54 -0500 Subject: KVM: x86: Update cpuid properly when CR4.OSXAVE or CR4.PKE is changed MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The CPUID bits of OSXSAVE (function=0x1) and OSPKE (func=0x7, leaf=0x0) allows user apps to detect if OS has set CR4.OSXSAVE or CR4.PKE. KVM is supposed to update these CPUID bits when CR4 is updated. Current KVM code doesn't handle some special cases when updates come from emulator. Here is one example: Step 1: guest boots Step 2: guest OS enables XSAVE ==> CR4.OSXSAVE=1 and CPUID.OSXSAVE=1 Step 3: guest hot reboot ==> QEMU reset CR4 to 0, but CPUID.OSXAVE==1 Step 4: guest os checks CPUID.OSXAVE, detects 1, then executes xgetbv Step 4 above will cause an #UD and guest crash because guest OS hasn't turned on OSXAVE yet. This patch solves the problem by comparing the the old_cr4 with cr4. If the related bits have been changed, kvm_update_cpuid() needs to be called. Signed-off-by: Wei Huang Reviewed-by: Bandan Das Cc: stable@vger.kernel.org Signed-off-by: Radim Krčmář --- arch/x86/kvm/x86.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) (limited to 'arch') diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 59371de5d722..e3103279eadd 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -7985,6 +7985,7 @@ static int __set_sregs(struct kvm_vcpu *vcpu, struct kvm_sregs *sregs) { struct msr_data apic_base_msr; int mmu_reset_needed = 0; + int cpuid_update_needed = 0; int pending_vec, max_bits, idx; struct desc_ptr dt; int ret = -EINVAL; @@ -8023,8 +8024,10 @@ static int __set_sregs(struct kvm_vcpu *vcpu, struct kvm_sregs *sregs) vcpu->arch.cr0 = sregs->cr0; mmu_reset_needed |= kvm_read_cr4(vcpu) != sregs->cr4; + cpuid_update_needed |= ((kvm_read_cr4(vcpu) ^ sregs->cr4) & + (X86_CR4_OSXSAVE | X86_CR4_PKE)); kvm_x86_ops->set_cr4(vcpu, sregs->cr4); - if (sregs->cr4 & (X86_CR4_OSXSAVE | X86_CR4_PKE)) + if (cpuid_update_needed) kvm_update_cpuid(vcpu); idx = srcu_read_lock(&vcpu->kvm->srcu); -- cgit v1.2.3 From 1eaafe91a0df4157521b6417b3dd8430bf5f52f0 Mon Sep 17 00:00:00 2001 From: Jim Mattson Date: Wed, 9 May 2018 14:29:35 -0700 Subject: kvm: x86: IA32_ARCH_CAPABILITIES is always supported MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit If there is a possibility that a VM may migrate to a Skylake host, then the hypervisor should report IA32_ARCH_CAPABILITIES.RSBA[bit 2] as being set (future work, of course). This implies that CPUID.(EAX=7,ECX=0):EDX.ARCH_CAPABILITIES[bit 29] should be set. Therefore, kvm should report this CPUID bit as being supported whether or not the host supports it. Userspace is still free to clear the bit if it chooses. For more information on RSBA, see Intel's white paper, "Retpoline: A Branch Target Injection Mitigation" (Document Number 337131-001), currently available at https://bugzilla.kernel.org/show_bug.cgi?id=199511. Since the IA32_ARCH_CAPABILITIES MSR is emulated in kvm, there is no dependency on hardware support for this feature. Signed-off-by: Jim Mattson Reviewed-by: Konrad Rzeszutek Wilk Fixes: 28c1c9fabf48 ("KVM/VMX: Emulate MSR_IA32_ARCH_CAPABILITIES") Cc: stable@vger.kernel.org Signed-off-by: Radim Krčmář --- arch/x86/kvm/cpuid.c | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'arch') diff --git a/arch/x86/kvm/cpuid.c b/arch/x86/kvm/cpuid.c index 82055b90a8b3..beadfe6e6893 100644 --- a/arch/x86/kvm/cpuid.c +++ b/arch/x86/kvm/cpuid.c @@ -495,6 +495,11 @@ static inline int __do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function, entry->ecx &= ~F(PKU); entry->edx &= kvm_cpuid_7_0_edx_x86_features; cpuid_mask(&entry->edx, CPUID_7_EDX); + /* + * We emulate ARCH_CAPABILITIES in software even + * if the host doesn't support it. + */ + entry->edx |= F(ARCH_CAPABILITIES); } else { entry->ebx = 0; entry->ecx = 0; -- cgit v1.2.3 From 696ca779a928d0e93d61c38ffc3a4d8914a9b9a0 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Radim=20Kr=C4=8Dm=C3=A1=C5=99?= Date: Thu, 24 May 2018 17:50:56 +0200 Subject: KVM: x86: fix #UD address of failed Hyper-V hypercalls MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit If the hypercall was called from userspace or real mode, KVM injects #UD and then advances RIP, so it looks like #UD was caused by the following instruction. This probably won't cause more than confusion, but could give an unexpected access to guest OS' instruction emulator. Also, refactor the code to count hv hypercalls that were handled by the virt userspace. Fixes: 6356ee0c9602 ("x86: Delay skip of emulated hypercall instruction") Reviewed-by: Paolo Bonzini Signed-off-by: Radim Krčmář --- arch/x86/kvm/hyperv.c | 19 +++++++++++-------- arch/x86/kvm/x86.c | 12 ++++-------- 2 files changed, 15 insertions(+), 16 deletions(-) (limited to 'arch') diff --git a/arch/x86/kvm/hyperv.c b/arch/x86/kvm/hyperv.c index 5708e951a5c6..46ff64da44ca 100644 --- a/arch/x86/kvm/hyperv.c +++ b/arch/x86/kvm/hyperv.c @@ -1260,14 +1260,18 @@ static void kvm_hv_hypercall_set_result(struct kvm_vcpu *vcpu, u64 result) } } -static int kvm_hv_hypercall_complete_userspace(struct kvm_vcpu *vcpu) +static int kvm_hv_hypercall_complete(struct kvm_vcpu *vcpu, u64 result) { - struct kvm_run *run = vcpu->run; - - kvm_hv_hypercall_set_result(vcpu, run->hyperv.u.hcall.result); + kvm_hv_hypercall_set_result(vcpu, result); + ++vcpu->stat.hypercalls; return kvm_skip_emulated_instruction(vcpu); } +static int kvm_hv_hypercall_complete_userspace(struct kvm_vcpu *vcpu) +{ + return kvm_hv_hypercall_complete(vcpu, vcpu->run->hyperv.u.hcall.result); +} + static u16 kvm_hvcall_signal_event(struct kvm_vcpu *vcpu, bool fast, u64 param) { struct eventfd_ctx *eventfd; @@ -1350,7 +1354,7 @@ int kvm_hv_hypercall(struct kvm_vcpu *vcpu) /* Hypercall continuation is not supported yet */ if (rep_cnt || rep_idx) { ret = HV_STATUS_INVALID_HYPERCALL_CODE; - goto set_result; + goto out; } switch (code) { @@ -1381,9 +1385,8 @@ int kvm_hv_hypercall(struct kvm_vcpu *vcpu) break; } -set_result: - kvm_hv_hypercall_set_result(vcpu, ret); - return 1; +out: + return kvm_hv_hypercall_complete(vcpu, ret); } void kvm_hv_init_vm(struct kvm *kvm) diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index e3103279eadd..b7618b30b7d6 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -6676,11 +6676,8 @@ int kvm_emulate_hypercall(struct kvm_vcpu *vcpu) unsigned long nr, a0, a1, a2, a3, ret; int op_64_bit; - if (kvm_hv_hypercall_enabled(vcpu->kvm)) { - if (!kvm_hv_hypercall(vcpu)) - return 0; - goto out; - } + if (kvm_hv_hypercall_enabled(vcpu->kvm)) + return kvm_hv_hypercall(vcpu); nr = kvm_register_read(vcpu, VCPU_REGS_RAX); a0 = kvm_register_read(vcpu, VCPU_REGS_RBX); @@ -6701,7 +6698,7 @@ int kvm_emulate_hypercall(struct kvm_vcpu *vcpu) if (kvm_x86_ops->get_cpl(vcpu) != 0) { ret = -KVM_EPERM; - goto out_error; + goto out; } switch (nr) { @@ -6721,12 +6718,11 @@ int kvm_emulate_hypercall(struct kvm_vcpu *vcpu) ret = -KVM_ENOSYS; break; } -out_error: +out: if (!op_64_bit) ret = (u32)ret; kvm_register_write(vcpu, VCPU_REGS_RAX, ret); -out: ++vcpu->stat.hypercalls; return kvm_skip_emulated_instruction(vcpu); } -- cgit v1.2.3