From 510043da8582ad49d22a1e9a6b211e6ede10cd2e Mon Sep 17 00:00:00 2001
From: Dor Laor
Date: Mon, 19 Feb 2007 18:25:43 +0200
Subject: KVM: Use the generic skip_emulated_instruction() in hypercall code

Instead of twiddling the rip registers directly, use the
skip_emulated_instruction() function to do that for us.

Signed-off-by: Dor Laor
Signed-off-by: Avi Kivity
---
 drivers/kvm/vmx.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'drivers/kvm/vmx.c')

diff --git a/drivers/kvm/vmx.c b/drivers/kvm/vmx.c
index fbbf9d6b299f..a721b60f7385 100644
--- a/drivers/kvm/vmx.c
+++ b/drivers/kvm/vmx.c
@@ -1658,7 +1658,7 @@ static int handle_halt(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
 static int handle_vmcall(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
 {
-	vmcs_writel(GUEST_RIP, vmcs_readl(GUEST_RIP)+3);
+	skip_emulated_instruction(vcpu);
 	return kvm_hypercall(vcpu, kvm_run);
 }
--
cgit v1.2.3

From 46fc1477887c41c8e900f2c95485e222b9a54822 Mon Sep 17 00:00:00 2001
From: Avi Kivity
Date: Thu, 22 Feb 2007 19:39:30 +0200
Subject: KVM: Do not communicate to userspace through cpu registers during PIO

Currently when passing a PIO emulation request to userspace, we rely on
userspace updating %rax (on 'in' instructions) and %rsi/%rdi/%rcx (on
string instructions).  This (a) requires two extra ioctls for getting and
setting the registers and (b) is unfriendly to non-x86 archs, when they
get kvm ports.

So fix by doing the register fixups in the kernel and passing to userspace
only an abstract description of the PIO to be done.

Signed-off-by: Avi Kivity
---
 drivers/kvm/vmx.c | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'drivers/kvm/vmx.c')

diff --git a/drivers/kvm/vmx.c b/drivers/kvm/vmx.c
index a721b60f7385..4d5f40fcb651 100644
--- a/drivers/kvm/vmx.c
+++ b/drivers/kvm/vmx.c
@@ -1459,12 +1459,14 @@ static int handle_io(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
 		= (vmcs_readl(GUEST_RFLAGS) & X86_EFLAGS_DF) != 0;
 	kvm_run->io.rep = (exit_qualification & 32) != 0;
 	kvm_run->io.port = exit_qualification >> 16;
+	kvm_run->io.count = 1;
 	if (kvm_run->io.string) {
 		if (!get_io_count(vcpu, &kvm_run->io.count))
 			return 1;
 		kvm_run->io.address = vmcs_readl(GUEST_LINEAR_ADDRESS);
 	} else
 		kvm_run->io.value = vcpu->regs[VCPU_REGS_RAX]; /* rax */
+	vcpu->pio_pending = 1;
 	return 0;
 }
--
cgit v1.2.3
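
The "abstract description" mentioned above is the kvm_run::io block that the hunk fills in. Below is a minimal sketch of how a userspace monitor might consume such a description after a KVM_EXIT_IO exit; the struct only mirrors the fields visible in this patch series (direction, size, port, value) and is not the real <linux/kvm.h> layout, and handle_port_read()/handle_port_write() are hypothetical device-model hooks.

#include <stdint.h>
#include <stdio.h>

/* Illustrative mirror of the fields this series routes through kvm_run;
 * not the real <linux/kvm.h> ABI. */
struct io_exit_desc {
	int      in;     /* direction: non-zero for an 'in' instruction */
	int      size;   /* 1, 2 or 4 bytes */
	uint16_t port;
	uint64_t value;  /* filled in by userspace for 'in', read for 'out' */
};

/* Hypothetical device-model hooks. */
static uint64_t handle_port_read(uint16_t port, int size)
{
	(void)port; (void)size;
	return 0xff;            /* float the bus for unknown ports */
}

static void handle_port_write(uint16_t port, int size, uint64_t value)
{
	printf("out port 0x%x size %d value 0x%llx\n",
	       port, size, (unsigned long long)value);
}

/* With this patch the monitor only fills in io->value; the guest's %rax
 * (and, for string ops, %rsi/%rdi/%rcx) are fixed up in the kernel. */
static void emulate_pio(struct io_exit_desc *io)
{
	if (io->in)
		io->value = handle_port_read(io->port, io->size);
	else
		handle_port_write(io->port, io->size, io->value);
}

int main(void)
{
	struct io_exit_desc io = { .in = 0, .size = 1, .port = 0x3f8, .value = 'A' };
	emulate_pio(&io);
	return 0;
}
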
From 06465c5a3aa9948a7b00af49cd22ed8f235cdb0f Mon Sep 17 00:00:00 2001
From: Avi Kivity
Date: Wed, 28 Feb 2007 20:46:53 +0200
Subject: KVM: Handle cpuid in the kernel instead of punting to userspace

KVM used to handle cpuid by letting userspace decide what values to
return to the guest.  We now handle cpuid completely in the kernel.  We
still let userspace decide which values the guest will see by having
userspace set up the value table beforehand (this is necessary to allow
management software to set the cpu features to the least common
denominator, so that live migration can work).

The motivation for the change is that kvm kernel code can be impacted by
cpuid features, for example the x86 emulator.

Signed-off-by: Avi Kivity
---
 drivers/kvm/vmx.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'drivers/kvm/vmx.c')

diff --git a/drivers/kvm/vmx.c b/drivers/kvm/vmx.c
index 4d5f40fcb651..71410a65bb90 100644
--- a/drivers/kvm/vmx.c
+++ b/drivers/kvm/vmx.c
@@ -1585,8 +1585,8 @@ static int handle_dr(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
 static int handle_cpuid(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
 {
-	kvm_run->exit_reason = KVM_EXIT_CPUID;
-	return 0;
+	kvm_emulate_cpuid(vcpu);
+	return 1;
 }
 
 static int handle_rdmsr(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
--
cgit v1.2.3

From 8eb7d334bd8e693340ee198280f7d45035cdab8c Mon Sep 17 00:00:00 2001
From: Avi Kivity
Date: Sun, 4 Mar 2007 14:17:08 +0200
Subject: KVM: Fold kvm_run::exit_type into kvm_run::exit_reason

Currently, userspace is told about the nature of the last exit from the
guest using two fields, exit_type and exit_reason, where exit_type has
just two enumerations (and no need for more).  So fold exit_type into
exit_reason, reducing the complexity of determining what really happened.

Signed-off-by: Avi Kivity
---
 drivers/kvm/vmx.c | 7 +++----
 1 file changed, 3 insertions(+), 4 deletions(-)

(limited to 'drivers/kvm/vmx.c')

diff --git a/drivers/kvm/vmx.c b/drivers/kvm/vmx.c
index 71410a65bb90..cf9568fbe8a5 100644
--- a/drivers/kvm/vmx.c
+++ b/drivers/kvm/vmx.c
@@ -1922,10 +1922,10 @@ again:
 	asm ("mov %0, %%ds; mov %0, %%es" : : "r"(__USER_DS));
 
-	kvm_run->exit_type = 0;
 	if (fail) {
-		kvm_run->exit_type = KVM_EXIT_TYPE_FAIL_ENTRY;
-		kvm_run->exit_reason = vmcs_read32(VM_INSTRUCTION_ERROR);
+		kvm_run->exit_reason = KVM_EXIT_FAIL_ENTRY;
+		kvm_run->fail_entry.hardware_entry_failure_reason
+			= vmcs_read32(VM_INSTRUCTION_ERROR);
 		r = 0;
 	} else {
 		/*
@@ -1935,7 +1935,6 @@ again:
 			profile_hit(KVM_PROFILING, (void *)vmcs_readl(GUEST_RIP));
 
 		vcpu->launched = 1;
-		kvm_run->exit_type = KVM_EXIT_TYPE_VM_EXIT;
 		r = kvm_handle_exit(kvm_run, vcpu);
 		if (r > 0) {
 			/* Give scheduler a change to reschedule. */
--
cgit v1.2.3

From 1b19f3e61d7e1edb395dd64bf7d63621a37af8ca Mon Sep 17 00:00:00 2001
From: Avi Kivity
Date: Sun, 4 Mar 2007 14:24:03 +0200
Subject: KVM: Add a special exit reason when exiting due to an interrupt

This is redundant, as we also return -EINTR from the ioctl, but it
allows us to examine the exit_reason field on resume without seeing old
data.

Signed-off-by: Avi Kivity
---
 drivers/kvm/vmx.c | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'drivers/kvm/vmx.c')

diff --git a/drivers/kvm/vmx.c b/drivers/kvm/vmx.c
index cf9568fbe8a5..e69bab6d811d 100644
--- a/drivers/kvm/vmx.c
+++ b/drivers/kvm/vmx.c
@@ -1941,12 +1941,14 @@ again:
 			if (signal_pending(current)) {
 				++kvm_stat.signal_exits;
 				post_kvm_run_save(vcpu, kvm_run);
+				kvm_run->exit_reason = KVM_EXIT_INTR;
 				return -EINTR;
 			}
 
 			if (dm_request_for_irq_injection(vcpu, kvm_run)) {
 				++kvm_stat.request_irq_exits;
 				post_kvm_run_save(vcpu, kvm_run);
+				kvm_run->exit_reason = KVM_EXIT_INTR;
 				return -EINTR;
 			}
--
cgit v1.2.3
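
With exit_type folded away, a userspace run loop only has to look at kvm_run->exit_reason after KVM_RUN returns. Below is a hedged sketch of such a dispatch; the enum values and struct are stand-ins that merely echo the names appearing in these patches (KVM_EXIT_IO, KVM_EXIT_MMIO, KVM_EXIT_FAIL_ENTRY, KVM_EXIT_INTR, KVM_EXIT_HLT), not the real UAPI definitions or numbering.

#include <stdint.h>
#include <stdio.h>

/* Stand-in constants and struct; the real definitions live in <linux/kvm.h>. */
enum { EXIT_IO, EXIT_MMIO, EXIT_FAIL_ENTRY, EXIT_INTR, EXIT_HLT };

struct run_state {
	int exit_reason;
	struct { uint32_t hardware_entry_failure_reason; } fail_entry;
};

/* One iteration of a hypothetical userspace run loop: a single switch on
 * exit_reason replaces the old exit_type/exit_reason pair.
 * Returns non-zero if the guest should be re-entered. */
static int handle_exit(struct run_state *run)
{
	switch (run->exit_reason) {
	case EXIT_IO:
		/* emulate the port access, then re-enter the guest */
		return 1;
	case EXIT_MMIO:
		/* emulate the memory-mapped access */
		return 1;
	case EXIT_INTR:
		/* the ioctl returned -EINTR; nothing to emulate, just re-enter */
		return 1;
	case EXIT_HLT:
		/* guest halted; wait until there is an interrupt to inject */
		return 1;
	case EXIT_FAIL_ENTRY:
		fprintf(stderr, "hardware entry failure: %u\n",
			run->fail_entry.hardware_entry_failure_reason);
		return 0;
	default:
		fprintf(stderr, "unhandled exit %d\n", run->exit_reason);
		return 0;
	}
}

int main(void)
{
	struct run_state run = { .exit_reason = EXIT_INTR };
	return handle_exit(&run) ? 0 : 1;
}
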
From 039576c03c35e2f990ad9bb9c39e1bad3cd60d34 Mon Sep 17 00:00:00 2001
From: Avi Kivity
Date: Tue, 20 Mar 2007 12:46:50 +0200
Subject: KVM: Avoid guest virtual addresses in string pio userspace interface

The current string pio interface communicates using guest virtual
addresses, relying on userspace to translate addresses and to check
permissions.  This interface cannot fully support guest smp, as the check
needs to take into account two pages at once in case an unaligned string
transfer straddles a page boundary.

Change the interface not to communicate guest addresses at all; instead
use a buffer page (mmaped by userspace) and do transfers there.  The
kernel manages the virtual to physical translation and can perform the
checks atomically by taking the appropriate locks.

Signed-off-by: Avi Kivity
---
 drivers/kvm/vmx.c | 40 ++++++++++++++++++++--------------------
 1 file changed, 20 insertions(+), 20 deletions(-)

(limited to 'drivers/kvm/vmx.c')

diff --git a/drivers/kvm/vmx.c b/drivers/kvm/vmx.c
index e69bab6d811d..0d9bf0b36d37 100644
--- a/drivers/kvm/vmx.c
+++ b/drivers/kvm/vmx.c
@@ -1394,7 +1394,7 @@ static int handle_triple_fault(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
 	return 0;
 }
 
-static int get_io_count(struct kvm_vcpu *vcpu, u64 *count)
+static int get_io_count(struct kvm_vcpu *vcpu, unsigned long *count)
 {
 	u64 inst;
 	gva_t rip;
@@ -1439,35 +1439,35 @@ static int get_io_count(struct kvm_vcpu *vcpu, u64 *count)
 done:
 	countr_size *= 8;
 	*count = vcpu->regs[VCPU_REGS_RCX] & (~0ULL >> (64 - countr_size));
+	//printk("cx: %lx\n", vcpu->regs[VCPU_REGS_RCX]);
 	return 1;
 }
 
 static int handle_io(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
 {
 	u64 exit_qualification;
+	int size, down, in, string, rep;
+	unsigned port;
+	unsigned long count;
+	gva_t address;
 
 	++kvm_stat.io_exits;
 	exit_qualification = vmcs_read64(EXIT_QUALIFICATION);
-	kvm_run->exit_reason = KVM_EXIT_IO;
-	if (exit_qualification & 8)
-		kvm_run->io.direction = KVM_EXIT_IO_IN;
-	else
-		kvm_run->io.direction = KVM_EXIT_IO_OUT;
-	kvm_run->io.size = (exit_qualification & 7) + 1;
-	kvm_run->io.string = (exit_qualification & 16) != 0;
-	kvm_run->io.string_down
-		= (vmcs_readl(GUEST_RFLAGS) & X86_EFLAGS_DF) != 0;
-	kvm_run->io.rep = (exit_qualification & 32) != 0;
-	kvm_run->io.port = exit_qualification >> 16;
-	kvm_run->io.count = 1;
-	if (kvm_run->io.string) {
-		if (!get_io_count(vcpu, &kvm_run->io.count))
+	in = (exit_qualification & 8) != 0;
+	size = (exit_qualification & 7) + 1;
+	string = (exit_qualification & 16) != 0;
+	down = (vmcs_readl(GUEST_RFLAGS) & X86_EFLAGS_DF) != 0;
+	count = 1;
+	rep = (exit_qualification & 32) != 0;
+	port = exit_qualification >> 16;
+	address = 0;
+	if (string) {
+		if (rep && !get_io_count(vcpu, &count))
 			return 1;
-		kvm_run->io.address = vmcs_readl(GUEST_LINEAR_ADDRESS);
-	} else
-		kvm_run->io.value = vcpu->regs[VCPU_REGS_RAX]; /* rax */
-	vcpu->pio_pending = 1;
-	return 0;
+		address = vmcs_readl(GUEST_LINEAR_ADDRESS);
+	}
+	return kvm_setup_pio(vcpu, kvm_run, in, size, count, string, down,
+			     address, rep, port);
 }
 
 static void
--
cgit v1.2.3

From 8cb5b0333250beb382624f626851a31f601b4830 Mon Sep 17 00:00:00 2001
From: Avi Kivity
Date: Tue, 20 Mar 2007 18:40:40 +0200
Subject: KVM: Workaround vmx inability to virtualize the reset state

The reset state has cs.selector == 0xf000 and cs.base == 0xffff0000,
which aren't compatible with vm86 mode, which is used for real mode
virtualization.

When we create a vcpu, we set cs.base to 0xf0000, but if we get there by
way of a reset, the values are inconsistent and vmx refuses to enter
guest mode.

Workaround by detecting the state and munging it appropriately.
Signed-off-by: Avi Kivity
---
 drivers/kvm/vmx.c | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'drivers/kvm/vmx.c')

diff --git a/drivers/kvm/vmx.c b/drivers/kvm/vmx.c
index 0d9bf0b36d37..aa7e2ba6fb5a 100644
--- a/drivers/kvm/vmx.c
+++ b/drivers/kvm/vmx.c
@@ -712,6 +712,8 @@ static void enter_rmode(struct kvm_vcpu *vcpu)
 	vmcs_write32(GUEST_CS_AR_BYTES, 0xf3);
 	vmcs_write32(GUEST_CS_LIMIT, 0xffff);
+	if (vmcs_readl(GUEST_CS_BASE) == 0xffff0000)
+		vmcs_writel(GUEST_CS_BASE, 0xf0000);
 	vmcs_write16(GUEST_CS_SELECTOR, vmcs_readl(GUEST_CS_BASE) >> 4);
 
 	fix_rmode_seg(VCPU_SREG_ES, &vcpu->rmode.es);
--
cgit v1.2.3

From f6528b03f167785301908bf124db7be591e983ca Mon Sep 17 00:00:00 2001
From: Avi Kivity
Date: Tue, 20 Mar 2007 18:44:51 +0200
Subject: KVM: Remove set_cr0_no_modeswitch() arch op

set_cr0_no_modeswitch() was a hack to avoid corrupting segment registers.
As we now cache the protected mode values on entry to real mode, this
isn't an issue anymore, and it interferes with reboot (which usually _is_
a modeswitch).

Signed-off-by: Avi Kivity
---
 drivers/kvm/vmx.c | 17 -----------------
 1 file changed, 17 deletions(-)

(limited to 'drivers/kvm/vmx.c')

diff --git a/drivers/kvm/vmx.c b/drivers/kvm/vmx.c
index aa7e2ba6fb5a..027a9625ef90 100644
--- a/drivers/kvm/vmx.c
+++ b/drivers/kvm/vmx.c
@@ -788,22 +788,6 @@ static void vmx_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0)
 	vcpu->cr0 = cr0;
 }
 
-/*
- * Used when restoring the VM to avoid corrupting segment registers
- */
-static void vmx_set_cr0_no_modeswitch(struct kvm_vcpu *vcpu, unsigned long cr0)
-{
-	if (!vcpu->rmode.active && !(cr0 & CR0_PE_MASK))
-		enter_rmode(vcpu);
-
-	vcpu->rmode.active = ((cr0 & CR0_PE_MASK) == 0);
-	update_exception_bitmap(vcpu);
-	vmcs_writel(CR0_READ_SHADOW, cr0);
-	vmcs_writel(GUEST_CR0,
-		    (cr0 & ~KVM_GUEST_CR0_MASK) | KVM_VM_CR0_ALWAYS_ON);
-	vcpu->cr0 = cr0;
-}
-
 static void vmx_set_cr3(struct kvm_vcpu *vcpu, unsigned long cr3)
 {
 	vmcs_writel(GUEST_CR3, cr3);
@@ -2069,7 +2053,6 @@ static struct kvm_arch_ops vmx_arch_ops = {
 	.get_cs_db_l_bits = vmx_get_cs_db_l_bits,
 	.decache_cr0_cr4_guest_bits = vmx_decache_cr0_cr4_guest_bits,
 	.set_cr0 = vmx_set_cr0,
-	.set_cr0_no_modeswitch = vmx_set_cr0_no_modeswitch,
 	.set_cr3 = vmx_set_cr3,
 	.set_cr4 = vmx_set_cr4,
 #ifdef CONFIG_X86_64
--
cgit v1.2.3

From 038881c8bec0e9a796d1782c56e29e7c2456626d Mon Sep 17 00:00:00 2001
From: Avi Kivity
Date: Wed, 21 Mar 2007 17:58:32 +0200
Subject: KVM: Hack real-mode segments on vmx from KVM_SET_SREGS

As usual, we need to mangle segment registers when emulating real mode
as vm86 has specific constraints.  We special case the reset segment
base, and set the "access rights" (or descriptor flags) to vm86
compatible values.

This fixes reboot on vmx.

Signed-off-by: Avi Kivity
---
 drivers/kvm/vmx.c | 9 ++++++++-
 1 file changed, 8 insertions(+), 1 deletion(-)

(limited to 'drivers/kvm/vmx.c')

diff --git a/drivers/kvm/vmx.c b/drivers/kvm/vmx.c
index 027a9625ef90..578dff5424e3 100644
--- a/drivers/kvm/vmx.c
+++ b/drivers/kvm/vmx.c
@@ -864,7 +864,14 @@ static void vmx_set_segment(struct kvm_vcpu *vcpu,
 	vmcs_writel(sf->base, var->base);
 	vmcs_write32(sf->limit, var->limit);
 	vmcs_write16(sf->selector, var->selector);
-	if (var->unusable)
+	if (vcpu->rmode.active && var->s) {
+		/*
+		 * Hack real-mode segments into vm86 compatibility.
+		 */
+		if (var->base == 0xffff0000 && var->selector == 0xf000)
+			vmcs_writel(sf->base, 0xf0000);
+		ar = 0xf3;
+	} else if (var->unusable)
 		ar = 1 << 16;
 	else {
 		ar = var->type & 15;
--
cgit v1.2.3

From afeb1f14c5478560262b37431726eb0eb1a42e9e Mon Sep 17 00:00:00 2001
From: Avi Kivity
Date: Tue, 27 Mar 2007 17:50:20 +0200
Subject: KVM: Remove debug message

No longer interesting.

Signed-off-by: Avi Kivity
---
 drivers/kvm/vmx.c | 1 -
 1 file changed, 1 deletion(-)

(limited to 'drivers/kvm/vmx.c')

diff --git a/drivers/kvm/vmx.c b/drivers/kvm/vmx.c
index 578dff5424e3..b64b7b792e84 100644
--- a/drivers/kvm/vmx.c
+++ b/drivers/kvm/vmx.c
@@ -1131,7 +1131,6 @@ static int vmx_vcpu_setup(struct kvm_vcpu *vcpu)
 		vcpu->guest_msrs[j] = vcpu->host_msrs[j];
 		++vcpu->nmsrs;
 	}
-	printk(KERN_DEBUG "kvm: msrs: %d\n", vcpu->nmsrs);
 
 	nr_good_msrs = vcpu->nmsrs - NR_BAD_MSRS;
 	vmcs_writel(VM_ENTRY_MSR_LOAD_ADDR,
--
cgit v1.2.3

From 954bbbc236afe23b368abdf4942f313a5f6e1d50 Mon Sep 17 00:00:00 2001
From: Avi Kivity
Date: Fri, 30 Mar 2007 14:02:32 +0300
Subject: KVM: Simplify gfn_to_page()

Mapping a guest page to a host page is a common operation.  Currently,
one has first to find the memory slot where the page belongs
(gfn_to_memslot), then locate the page itself (gfn_to_page()).

This is clumsy, and also won't work well with memory aliases.  So
simplify gfn_to_page() not to require memory slot translation first,
and instead do it internally.

Signed-off-by: Avi Kivity
---
 drivers/kvm/vmx.c | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

(limited to 'drivers/kvm/vmx.c')

diff --git a/drivers/kvm/vmx.c b/drivers/kvm/vmx.c
index b64b7b792e84..61a611691e50 100644
--- a/drivers/kvm/vmx.c
+++ b/drivers/kvm/vmx.c
@@ -926,9 +926,9 @@ static int init_rmode_tss(struct kvm* kvm)
 	gfn_t fn = rmode_tss_base(kvm) >> PAGE_SHIFT;
 	char *page;
 
-	p1 = _gfn_to_page(kvm, fn++);
-	p2 = _gfn_to_page(kvm, fn++);
-	p3 = _gfn_to_page(kvm, fn);
+	p1 = gfn_to_page(kvm, fn++);
+	p2 = gfn_to_page(kvm, fn++);
+	p3 = gfn_to_page(kvm, fn);
 	if (!p1 || !p2 || !p3) {
 		kvm_printf(kvm,"%s: gfn_to_page failed\n", __FUNCTION__);
--
cgit v1.2.3

From 3964994bb5ba85a3d8b54ae618f7be1cecce916d Mon Sep 17 00:00:00 2001
From: Eric Sesterhenn / Snakebyte
Date: Mon, 9 Apr 2007 16:15:05 +0200
Subject: KVM: Fix overflow bug in overflow detection code

The expression sp - 6 < sp where sp is a u16 is undefined in C since
'sp - 6' is promoted to int, and signed overflow is undefined in C.
gcc 4.2 actually warns about it.  Replace with a simpler test.

Signed-off-by: Eric Sesterhenn
Signed-off-by: Avi Kivity
---
 drivers/kvm/vmx.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'drivers/kvm/vmx.c')

diff --git a/drivers/kvm/vmx.c b/drivers/kvm/vmx.c
index 61a611691e50..8c0115b54802 100644
--- a/drivers/kvm/vmx.c
+++ b/drivers/kvm/vmx.c
@@ -1182,7 +1182,7 @@ static void inject_rmode_irq(struct kvm_vcpu *vcpu, int irq)
 	u16 sp = vmcs_readl(GUEST_RSP);
 	u32 ss_limit = vmcs_read32(GUEST_SS_LIMIT);
 
-	if (sp > ss_limit || sp - 6 > sp) {
+	if (sp > ss_limit || sp < 6 ) {
 		vcpu_printf(vcpu, "%s: #SS, rsp 0x%lx ss 0x%lx limit 0x%x\n",
 			    __FUNCTION__,
 			    vmcs_readl(GUEST_RSP),
--
cgit v1.2.3
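
The guard being replaced above depends on unsigned wraparound that never happens here: with sp a u16, both operands of `sp - 6` undergo integer promotion to int, so the subtraction simply goes negative and the old comparison can never be true, whereas `sp < 6` expresses the intent (not enough stack room) directly. A small standalone illustration, assuming nothing beyond standard C (the variable names are illustrative, not taken from the kernel):

#include <stdint.h>
#include <stdio.h>

int main(void)
{
	uint16_t sp = 3;

	/* Integer promotion: both operands become int, so sp - 6 evaluates
	 * to -3 instead of wrapping, and the old guard is always false. */
	if (sp - 6 > sp)
		printf("old test fires\n");
	else
		printf("old test never fires: sp - 6 == %d\n", sp - 6);

	/* The replacement checks the intended condition directly. */
	if (sp < 6)
		printf("new test fires: not enough room below sp\n");

	return 0;
}
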
From 2345df8c555ecb92c0c36172c07d5ac321a92dc7 Mon Sep 17 00:00:00 2001
From: Avi Kivity
Date: Tue, 17 Apr 2007 15:30:24 +0300
Subject: KVM: VMX: Reduce unnecessary saving of host msrs

The automatically switched msrs are never changed on the host (with the
exception of MSR_KERNEL_GS_BASE) and thus there is no need to save them
on every vm entry.

This reduces vmexit latency by ~400 cycles on i386 and by ~900 cycles
(10%) on x86_64.

Signed-off-by: Avi Kivity
---
 drivers/kvm/vmx.c | 12 +++++++++++-
 1 file changed, 11 insertions(+), 1 deletion(-)

(limited to 'drivers/kvm/vmx.c')

diff --git a/drivers/kvm/vmx.c b/drivers/kvm/vmx.c
index 8c0115b54802..3745e6ccc5b4 100644
--- a/drivers/kvm/vmx.c
+++ b/drivers/kvm/vmx.c
@@ -78,6 +78,10 @@ static const u32 vmx_msr_index[] = {
 };
 #define NR_VMX_MSR ARRAY_SIZE(vmx_msr_index)
 
+#ifdef CONFIG_X86_64
+static unsigned msr_offset_kernel_gs_base;
+#endif
+
 static inline int is_page_fault(u32 intr_info)
 {
 	return (intr_info & (INTR_INFO_INTR_TYPE_MASK | INTR_INFO_VECTOR_MASK |
@@ -1129,6 +1133,10 @@ static int vmx_vcpu_setup(struct kvm_vcpu *vcpu)
 		vcpu->host_msrs[j].reserved = 0;
 		vcpu->host_msrs[j].data = data;
 		vcpu->guest_msrs[j] = vcpu->host_msrs[j];
+#ifdef CONFIG_X86_64
+		if (index == MSR_KERNEL_GS_BASE)
+			msr_offset_kernel_gs_base = j;
+#endif
 		++vcpu->nmsrs;
 	}
 
@@ -1760,7 +1768,9 @@ again:
 	fx_save(vcpu->host_fx_image);
 	fx_restore(vcpu->guest_fx_image);
 
-	save_msrs(vcpu->host_msrs, vcpu->nmsrs);
+#ifdef CONFIG_X86_64
+	save_msrs(vcpu->host_msrs + msr_offset_kernel_gs_base, 1);
+#endif
 	load_msrs(vcpu->guest_msrs, NR_BAD_MSRS);
 
 	asm (
--
cgit v1.2.3

From e38aea3e9330624d19a233c05f3e69c57519edd5 Mon Sep 17 00:00:00 2001
From: Avi Kivity
Date: Thu, 19 Apr 2007 13:22:48 +0300
Subject: KVM: VMX: Don't switch 64-bit msrs for 32-bit guests

Some msrs are only used by x86_64 instructions, and are therefore not
needed when the guest is in legacy mode.  By not bothering to switch
them, we reduce vmexit latency by 2400 cycles (from about 8800) when
running a 32-bit guest on a 64-bit host.

Signed-off-by: Avi Kivity
---
 drivers/kvm/vmx.c | 58 ++++++++++++++++++++++++++++++++++++++++---------------
 1 file changed, 42 insertions(+), 16 deletions(-)

(limited to 'drivers/kvm/vmx.c')

diff --git a/drivers/kvm/vmx.c b/drivers/kvm/vmx.c
index 3745e6ccc5b4..6270df58e055 100644
--- a/drivers/kvm/vmx.c
+++ b/drivers/kvm/vmx.c
@@ -80,6 +80,9 @@ static const u32 vmx_msr_index[] = {
 
 #ifdef CONFIG_X86_64
 static unsigned msr_offset_kernel_gs_base;
+#define NR_64BIT_MSRS 4
+#else
+#define NR_64BIT_MSRS 0
 #endif
 
 static inline int is_page_fault(u32 intr_info)
@@ -300,6 +303,32 @@ static void vmx_inject_gp(struct kvm_vcpu *vcpu, unsigned error_code)
 		     INTR_INFO_VALID_MASK);
 }
 
+/*
+ * Set up the vmcs to automatically save and restore system
+ * msrs.  Don't touch the 64-bit msrs if the guest is in legacy
+ * mode, as fiddling with msrs is very expensive.
+ */
+static void setup_msrs(struct kvm_vcpu *vcpu)
+{
+	int nr_skip, nr_good_msrs;
+
+	if (is_long_mode(vcpu))
+		nr_skip = NR_BAD_MSRS;
+	else
+		nr_skip = NR_64BIT_MSRS;
+	nr_good_msrs = vcpu->nmsrs - nr_skip;
+
+	vmcs_writel(VM_ENTRY_MSR_LOAD_ADDR,
+		    virt_to_phys(vcpu->guest_msrs + nr_skip));
+	vmcs_writel(VM_EXIT_MSR_STORE_ADDR,
+		    virt_to_phys(vcpu->guest_msrs + nr_skip));
+	vmcs_writel(VM_EXIT_MSR_LOAD_ADDR,
+		    virt_to_phys(vcpu->host_msrs + nr_skip));
+	vmcs_write32(VM_EXIT_MSR_STORE_COUNT, nr_good_msrs); /* 22.2.2 */
+	vmcs_write32(VM_EXIT_MSR_LOAD_COUNT, nr_good_msrs);  /* 22.2.2 */
+	vmcs_write32(VM_ENTRY_MSR_LOAD_COUNT, nr_good_msrs); /* 22.2.2 */
+}
+
 /*
  * reads and returns guest's timestamp counter "register"
  * guest_tsc = host_tsc + tsc_offset    -- 21.3
@@ -825,6 +854,7 @@ static void vmx_set_efer(struct kvm_vcpu *vcpu, u64 efer)
 
 		msr->data = efer & ~EFER_LME;
 	}
+	setup_msrs(vcpu);
 }
 
 #endif
@@ -988,7 +1018,6 @@ static int vmx_vcpu_setup(struct kvm_vcpu *vcpu)
 	struct descriptor_table dt;
 	int i;
 	int ret = 0;
-	int nr_good_msrs;
 	extern asmlinkage void kvm_vmx_return(void);
 
 	if (!init_rmode_tss(vcpu->kvm)) {
@@ -1140,19 +1169,10 @@ static int vmx_vcpu_setup(struct kvm_vcpu *vcpu)
 		++vcpu->nmsrs;
 	}
 
-	nr_good_msrs = vcpu->nmsrs - NR_BAD_MSRS;
-	vmcs_writel(VM_ENTRY_MSR_LOAD_ADDR,
-		    virt_to_phys(vcpu->guest_msrs + NR_BAD_MSRS));
-	vmcs_writel(VM_EXIT_MSR_STORE_ADDR,
-		    virt_to_phys(vcpu->guest_msrs + NR_BAD_MSRS));
-	vmcs_writel(VM_EXIT_MSR_LOAD_ADDR,
-		    virt_to_phys(vcpu->host_msrs + NR_BAD_MSRS));
+	setup_msrs(vcpu);
+
 	vmcs_write32_fixedbits(MSR_IA32_VMX_EXIT_CTLS, VM_EXIT_CONTROLS,
 			       (HOST_IS_64 << 9));  /* 22.2,1, 20.7.1 */
-	vmcs_write32(VM_EXIT_MSR_STORE_COUNT, nr_good_msrs); /* 22.2.2 */
-	vmcs_write32(VM_EXIT_MSR_LOAD_COUNT, nr_good_msrs);  /* 22.2.2 */
-	vmcs_write32(VM_ENTRY_MSR_LOAD_COUNT, nr_good_msrs); /* 22.2.2 */
-
 	/* 22.2.1, 20.8.1 */
 	vmcs_write32_fixedbits(MSR_IA32_VMX_ENTRY_CTLS,
@@ -1769,9 +1789,11 @@ again:
 	fx_restore(vcpu->guest_fx_image);
 
 #ifdef CONFIG_X86_64
-	save_msrs(vcpu->host_msrs + msr_offset_kernel_gs_base, 1);
+	if (is_long_mode(vcpu)) {
+		save_msrs(vcpu->host_msrs + msr_offset_kernel_gs_base, 1);
+		load_msrs(vcpu->guest_msrs, NR_BAD_MSRS);
+	}
 #endif
-	load_msrs(vcpu->guest_msrs, NR_BAD_MSRS);
 
 	asm (
 		/* Store host registers */
@@ -1915,8 +1937,12 @@ again:
 	}
 	++kvm_stat.exits;
-	save_msrs(vcpu->guest_msrs, NR_BAD_MSRS);
-	load_msrs(vcpu->host_msrs, NR_BAD_MSRS);
+#ifdef CONFIG_X86_64
+	if (is_long_mode(vcpu)) {
+		save_msrs(vcpu->guest_msrs, NR_BAD_MSRS);
+		load_msrs(vcpu->host_msrs, NR_BAD_MSRS);
+	}
+#endif
 	fx_save(vcpu->guest_fx_image);
 	fx_restore(vcpu->host_fx_image);
--
cgit v1.2.3

From 35cc7f971188366f5a5c0d5da1456bb38cef5da9 Mon Sep 17 00:00:00 2001
From: Avi Kivity
Date: Thu, 19 Apr 2007 13:26:39 +0300
Subject: KVM: Fold drivers/kvm/kvm_vmx.h into drivers/kvm/vmx.c

No meat in that file.
Signed-off-by: Avi Kivity
---
 drivers/kvm/vmx.c | 7 ++++++-
 1 file changed, 6 insertions(+), 1 deletion(-)

(limited to 'drivers/kvm/vmx.c')

diff --git a/drivers/kvm/vmx.c b/drivers/kvm/vmx.c
index 6270df58e055..b61d4dd804e3 100644
--- a/drivers/kvm/vmx.c
+++ b/drivers/kvm/vmx.c
@@ -17,7 +17,6 @@
 #include "kvm.h"
 #include "vmx.h"
-#include "kvm_vmx.h"
 #include
 #include
 #include
@@ -81,8 +80,14 @@ static const u32 vmx_msr_index[] = {
 
 #ifdef CONFIG_X86_64
 static unsigned msr_offset_kernel_gs_base;
 #define NR_64BIT_MSRS 4
+/*
+ * avoid save/load MSR_SYSCALL_MASK and MSR_LSTAR by std vt
+ * mechanism (cpu bug AA24)
+ */
+#define NR_BAD_MSRS 2
 #else
 #define NR_64BIT_MSRS 0
+#define NR_BAD_MSRS 0
 #endif
--
cgit v1.2.3

From 4d56c8a787aefb2e3fc4ac4be966db96c14d1ad8 Mon Sep 17 00:00:00 2001
From: Avi Kivity
Date: Thu, 19 Apr 2007 14:28:44 +0300
Subject: KVM: VMX: Only save/restore MSR_K6_STAR if necessary

Intel hosts only support syscall/sysret in long mode (and only if
efer.sce is enabled), so only reload the related MSR_K6_STAR if the
guest will actually be able to use it.

This reduces vmexit cost by about 500 cycles (6400 -> 5870) on my setup.

Signed-off-by: Avi Kivity
---
 drivers/kvm/vmx.c | 16 ++++++++++++++++
 1 file changed, 16 insertions(+)

(limited to 'drivers/kvm/vmx.c')

diff --git a/drivers/kvm/vmx.c b/drivers/kvm/vmx.c
index b61d4dd804e3..37537af126d1 100644
--- a/drivers/kvm/vmx.c
+++ b/drivers/kvm/vmx.c
@@ -69,6 +69,10 @@ static struct kvm_vmx_segment_field {
 	VMX_SEGMENT_FIELD(LDTR),
 };
 
+/*
+ * Keep MSR_K6_STAR at the end, as setup_msrs() will try to optimize it
+ * away by decrementing the array size.
+ */
 static const u32 vmx_msr_index[] = {
 #ifdef CONFIG_X86_64
 	MSR_SYSCALL_MASK, MSR_LSTAR, MSR_CSTAR, MSR_KERNEL_GS_BASE,
@@ -323,6 +327,18 @@ static void setup_msrs(struct kvm_vcpu *vcpu)
 		nr_skip = NR_64BIT_MSRS;
 	nr_good_msrs = vcpu->nmsrs - nr_skip;
 
+	/*
+	 * MSR_K6_STAR is only needed on long mode guests, and only
+	 * if efer.sce is enabled.
+	 */
+	if (find_msr_entry(vcpu, MSR_K6_STAR)) {
+		--nr_good_msrs;
+#ifdef CONFIG_X86_64
+		if (is_long_mode(vcpu) && (vcpu->shadow_efer & EFER_SCE))
+			++nr_good_msrs;
+#endif
+	}
+
 	vmcs_writel(VM_ENTRY_MSR_LOAD_ADDR,
 		    virt_to_phys(vcpu->guest_msrs + nr_skip));
--
cgit v1.2.3

From 1165f5fec18c077bdba88e7125fd41f8e3617cb4 Mon Sep 17 00:00:00 2001
From: Avi Kivity
Date: Thu, 19 Apr 2007 17:27:43 +0300
Subject: KVM: Per-vcpu statistics

Make the exit statistics per-vcpu instead of global.  This gives a 3.5%
boost when running one virtual machine per core on my two socket dual
core (4 cores total) machine.
Signed-off-by: Avi Kivity
---
 drivers/kvm/vmx.c | 18 +++++++++---------
 1 file changed, 9 insertions(+), 9 deletions(-)

(limited to 'drivers/kvm/vmx.c')

diff --git a/drivers/kvm/vmx.c b/drivers/kvm/vmx.c
index 37537af126d1..10845b7ff297 100644
--- a/drivers/kvm/vmx.c
+++ b/drivers/kvm/vmx.c
@@ -1396,7 +1396,7 @@ static int handle_exception(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
 		case EMULATE_DONE:
 			return 1;
 		case EMULATE_DO_MMIO:
-			++kvm_stat.mmio_exits;
+			++vcpu->stat.mmio_exits;
 			kvm_run->exit_reason = KVM_EXIT_MMIO;
 			return 0;
 		case EMULATE_FAIL:
@@ -1425,7 +1425,7 @@ static int handle_exception(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
 static int handle_external_interrupt(struct kvm_vcpu *vcpu,
 				     struct kvm_run *kvm_run)
 {
-	++kvm_stat.irq_exits;
+	++vcpu->stat.irq_exits;
 	return 1;
 }
 
@@ -1492,7 +1492,7 @@ static int handle_io(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
 	unsigned long count;
 	gva_t address;
 
-	++kvm_stat.io_exits;
+	++vcpu->stat.io_exits;
 	exit_qualification = vmcs_read64(EXIT_QUALIFICATION);
 	in = (exit_qualification & 8) != 0;
 	size = (exit_qualification & 7) + 1;
@@ -1682,7 +1682,7 @@ static int handle_interrupt_window(struct kvm_vcpu *vcpu,
 	if (kvm_run->request_interrupt_window &&
 	    !vcpu->irq_summary) {
 		kvm_run->exit_reason = KVM_EXIT_IRQ_WINDOW_OPEN;
-		++kvm_stat.irq_window_exits;
+		++vcpu->stat.irq_window_exits;
 		return 0;
 	}
 	return 1;
@@ -1695,7 +1695,7 @@ static int handle_halt(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
 		return 1;
 	kvm_run->exit_reason = KVM_EXIT_HLT;
-	++kvm_stat.halt_exits;
+	++vcpu->stat.halt_exits;
 	return 0;
 }
 
@@ -1956,7 +1956,7 @@ again:
 		reload_tss();
 	}
-	++kvm_stat.exits;
+	++vcpu->stat.exits;
 
 #ifdef CONFIG_X86_64
 	if (is_long_mode(vcpu)) {
@@ -1988,14 +1988,14 @@ again:
 		if (r > 0) {
 			/* Give scheduler a change to reschedule. */
 			if (signal_pending(current)) {
-				++kvm_stat.signal_exits;
+				++vcpu->stat.signal_exits;
 				post_kvm_run_save(vcpu, kvm_run);
 				kvm_run->exit_reason = KVM_EXIT_INTR;
 				return -EINTR;
 			}
 
 			if (dm_request_for_irq_injection(vcpu, kvm_run)) {
-				++kvm_stat.request_irq_exits;
+				++vcpu->stat.request_irq_exits;
 				post_kvm_run_save(vcpu, kvm_run);
 				kvm_run->exit_reason = KVM_EXIT_INTR;
 				return -EINTR;
 			}
@@ -2021,7 +2021,7 @@ static void vmx_inject_page_fault(struct kvm_vcpu *vcpu,
 {
 	u32 vect_info = vmcs_read32(IDT_VECTORING_INFO_FIELD);
 
-	++kvm_stat.pf_guest;
+	++vcpu->stat.pf_guest;
 
 	if (is_page_fault(vect_info)) {
 		printk(KERN_DEBUG "inject_page_fault: "
--
cgit v1.2.3

From e0e5127d06957e76da3906b7a58d5d2665e81f59 Mon Sep 17 00:00:00 2001
From: Avi Kivity
Date: Wed, 25 Apr 2007 10:59:52 +0300
Subject: KVM: Don't complain about cpu erratum AA15

It slows down Windows x64 horribly.

Signed-off-by: Avi Kivity
---
 drivers/kvm/vmx.c | 2 --
 1 file changed, 2 deletions(-)

(limited to 'drivers/kvm/vmx.c')

diff --git a/drivers/kvm/vmx.c b/drivers/kvm/vmx.c
index 10845b7ff297..d28c848138ce 100644
--- a/drivers/kvm/vmx.c
+++ b/drivers/kvm/vmx.c
@@ -1566,8 +1566,6 @@ static int handle_cr(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
 		skip_emulated_instruction(vcpu);
 		return 1;
 	case 8:
-		printk(KERN_DEBUG "handle_cr: read CR8 "
-		       "cpu erratum AA15\n");
 		vcpu_load_rsp_rip(vcpu);
 		vcpu->regs[reg] = vcpu->cr8;
 		vcpu_put_rsp_rip(vcpu);
--
cgit v1.2.3

From 25c4c2762e31a75403eca0dd59f2cab85e3a2532 Mon Sep 17 00:00:00 2001
From: Anthony Liguori
Date: Fri, 27 Apr 2007 09:29:21 +0300
Subject: KVM: VMX: Properly shadow the CR0 register in the vcpu struct

Set all of the host mask bits for CR0 so that we can maintain a proper
shadow of CR0.
This exposes CR0.TS, paving the way for lazy fpu handling.

Signed-off-by: Anthony Liguori
Signed-off-by: Avi Kivity
---
 drivers/kvm/vmx.c | 14 ++++++++------
 1 file changed, 8 insertions(+), 6 deletions(-)

(limited to 'drivers/kvm/vmx.c')

diff --git a/drivers/kvm/vmx.c b/drivers/kvm/vmx.c
index d28c848138ce..09608114e29a 100644
--- a/drivers/kvm/vmx.c
+++ b/drivers/kvm/vmx.c
@@ -810,11 +810,8 @@ static void exit_lmode(struct kvm_vcpu *vcpu)
 
 #endif
 
-static void vmx_decache_cr0_cr4_guest_bits(struct kvm_vcpu *vcpu)
+static void vmx_decache_cr4_guest_bits(struct kvm_vcpu *vcpu)
 {
-	vcpu->cr0 &= KVM_GUEST_CR0_MASK;
-	vcpu->cr0 |= vmcs_readl(GUEST_CR0) & ~KVM_GUEST_CR0_MASK;
-
 	vcpu->cr4 &= KVM_GUEST_CR4_MASK;
 	vcpu->cr4 |= vmcs_readl(GUEST_CR4) & ~KVM_GUEST_CR4_MASK;
 }
@@ -1205,7 +1202,7 @@ static int vmx_vcpu_setup(struct kvm_vcpu *vcpu)
 	vmcs_writel(TPR_THRESHOLD, 0);
 #endif
 
-	vmcs_writel(CR0_GUEST_HOST_MASK, KVM_GUEST_CR0_MASK);
+	vmcs_writel(CR0_GUEST_HOST_MASK, ~0UL);
 	vmcs_writel(CR4_GUEST_HOST_MASK, KVM_GUEST_CR4_MASK);
 
 	vcpu->cr0 = 0x60000010;
@@ -1557,6 +1554,11 @@ static int handle_cr(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
 			return 1;
 		};
 		break;
+	case 2: /* clts */
+		vcpu_load_rsp_rip(vcpu);
+		set_cr0(vcpu, vcpu->cr0 & ~CR0_TS_MASK);
+		skip_emulated_instruction(vcpu);
+		return 1;
 	case 1: /*mov from cr*/
 		switch (cr) {
 		case 3:
@@ -2112,7 +2114,7 @@ static struct kvm_arch_ops vmx_arch_ops = {
 	.get_segment = vmx_get_segment,
 	.set_segment = vmx_set_segment,
 	.get_cs_db_l_bits = vmx_get_cs_db_l_bits,
-	.decache_cr0_cr4_guest_bits = vmx_decache_cr0_cr4_guest_bits,
+	.decache_cr4_guest_bits = vmx_decache_cr4_guest_bits,
 	.set_cr0 = vmx_set_cr0,
 	.set_cr3 = vmx_set_cr3,
 	.set_cr4 = vmx_set_cr4,
--
cgit v1.2.3

From 2ab455ccceb07945368709ba852e49f4c3119331 Mon Sep 17 00:00:00 2001
From: Anthony Liguori
Date: Fri, 27 Apr 2007 09:29:49 +0300
Subject: KVM: VMX: Add lazy FPU support for VT

Only save/restore the FPU host state when the guest is actually using
the FPU.

Signed-off-by: Anthony Liguori
Signed-off-by: Avi Kivity
---
 drivers/kvm/vmx.c | 61 ++++++++++++++++++++++++++++++++++++++++++++++++++-----
 1 file changed, 56 insertions(+), 5 deletions(-)

(limited to 'drivers/kvm/vmx.c')

diff --git a/drivers/kvm/vmx.c b/drivers/kvm/vmx.c
index 09608114e29a..5a2a68dec6bf 100644
--- a/drivers/kvm/vmx.c
+++ b/drivers/kvm/vmx.c
@@ -101,6 +101,13 @@ static inline int is_page_fault(u32 intr_info)
 		(INTR_TYPE_EXCEPTION | PF_VECTOR | INTR_INFO_VALID_MASK);
 }
 
+static inline int is_no_device(u32 intr_info)
+{
+	return (intr_info & (INTR_INFO_INTR_TYPE_MASK | INTR_INFO_VECTOR_MASK |
+			     INTR_INFO_VALID_MASK)) ==
+		(INTR_TYPE_EXCEPTION | NM_VECTOR | INTR_INFO_VALID_MASK);
+}
+
 static inline int is_external_interrupt(u32 intr_info)
 {
 	return (intr_info & (INTR_INFO_INTR_TYPE_MASK | INTR_INFO_VALID_MASK))
@@ -216,6 +223,16 @@ static void vmcs_write64(unsigned long field, u64 value)
 #endif
 }
 
+static void vmcs_clear_bits(unsigned long field, u32 mask)
+{
+	vmcs_writel(field, vmcs_readl(field) & ~mask);
+}
+
+static void vmcs_set_bits(unsigned long field, u32 mask)
+{
+	vmcs_writel(field, vmcs_readl(field) | mask);
+}
+
 /*
  * Switches to specified vcpu, until a matching vcpu_put(), but assumes
  * vcpu mutex is already taken.
@@ -833,6 +850,11 @@ static void vmx_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0) } #endif + if (!(cr0 & CR0_TS_MASK)) { + vcpu->fpu_active = 1; + vmcs_clear_bits(EXCEPTION_BITMAP, CR0_TS_MASK); + } + vmcs_writel(CR0_READ_SHADOW, cr0); vmcs_writel(GUEST_CR0, (cr0 & ~KVM_GUEST_CR0_MASK) | KVM_VM_CR0_ALWAYS_ON); @@ -842,6 +864,12 @@ static void vmx_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0) static void vmx_set_cr3(struct kvm_vcpu *vcpu, unsigned long cr3) { vmcs_writel(GUEST_CR3, cr3); + + if (!(vcpu->cr0 & CR0_TS_MASK)) { + vcpu->fpu_active = 0; + vmcs_set_bits(GUEST_CR0, CR0_TS_MASK); + vmcs_set_bits(EXCEPTION_BITMAP, 1 << NM_VECTOR); + } } static void vmx_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4) @@ -1368,6 +1396,15 @@ static int handle_exception(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) asm ("int $2"); return 1; } + + if (is_no_device(intr_info)) { + vcpu->fpu_active = 1; + vmcs_clear_bits(EXCEPTION_BITMAP, 1 << NM_VECTOR); + if (!(vcpu->cr0 & CR0_TS_MASK)) + vmcs_clear_bits(GUEST_CR0, CR0_TS_MASK); + return 1; + } + error_code = 0; rip = vmcs_readl(GUEST_RIP); if (intr_info & INTR_INFO_DELIEVER_CODE_MASK) @@ -1556,7 +1593,11 @@ static int handle_cr(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) break; case 2: /* clts */ vcpu_load_rsp_rip(vcpu); - set_cr0(vcpu, vcpu->cr0 & ~CR0_TS_MASK); + vcpu->fpu_active = 1; + vmcs_clear_bits(EXCEPTION_BITMAP, 1 << NM_VECTOR); + vmcs_clear_bits(GUEST_CR0, CR0_TS_MASK); + vcpu->cr0 &= ~CR0_TS_MASK; + vmcs_writel(CR0_READ_SHADOW, vcpu->cr0); skip_emulated_instruction(vcpu); return 1; case 1: /*mov from cr*/ @@ -1806,8 +1847,14 @@ again: if (vcpu->guest_debug.enabled) kvm_guest_debug_pre(vcpu); - fx_save(vcpu->host_fx_image); - fx_restore(vcpu->guest_fx_image); + if (vcpu->fpu_active) { + fx_save(vcpu->host_fx_image); + fx_restore(vcpu->guest_fx_image); + } + /* + * Loading guest fpu may have cleared host cr0.ts + */ + vmcs_writel(HOST_CR0, read_cr0()); #ifdef CONFIG_X86_64 if (is_long_mode(vcpu)) { @@ -1965,8 +2012,11 @@ again: } #endif - fx_save(vcpu->guest_fx_image); - fx_restore(vcpu->host_fx_image); + if (vcpu->fpu_active) { + fx_save(vcpu->guest_fx_image); + fx_restore(vcpu->host_fx_image); + } + vcpu->interrupt_window_open = (vmcs_read32(GUEST_INTERRUPTIBILITY_INFO) & 3) == 0; asm ("mov %0, %%ds; mov %0, %%es" : : "r"(__USER_DS)); @@ -2078,6 +2128,7 @@ static int vmx_create_vcpu(struct kvm_vcpu *vcpu) vmcs_clear(vmcs); vcpu->vmcs = vmcs; vcpu->launched = 0; + vcpu->fpu_active = 1; return 0; -- cgit v1.2.3 From 2ff81f70b56dc1cdd3bf2f08414608069db6ef1a Mon Sep 17 00:00:00 2001 From: Avi Kivity Date: Sun, 29 Apr 2007 16:25:49 +0300 Subject: KVM: Remove unused 'instruction_length' As we no longer emulate in userspace, this is meaningless. We don't compute it on SVM anyway. Signed-off-by: Avi Kivity --- drivers/kvm/vmx.c | 1 - 1 file changed, 1 deletion(-) (limited to 'drivers/kvm/vmx.c') diff --git a/drivers/kvm/vmx.c b/drivers/kvm/vmx.c index 5a2a68dec6bf..724db0027f00 100644 --- a/drivers/kvm/vmx.c +++ b/drivers/kvm/vmx.c @@ -1783,7 +1783,6 @@ static int kvm_handle_exit(struct kvm_run *kvm_run, struct kvm_vcpu *vcpu) exit_reason != EXIT_REASON_EXCEPTION_NMI ) printk(KERN_WARNING "%s: unexpected, valid vectoring info and " "exit reason is 0x%x\n", __FUNCTION__, exit_reason); - kvm_run->instruction_length = vmcs_read32(VM_EXIT_INSTRUCTION_LEN); if (exit_reason < kvm_vmx_max_exit_handlers && kvm_vmx_exit_handlers[exit_reason]) return kvm_vmx_exit_handlers[exit_reason](vcpu, kvm_run); -- cgit v1.2.3