From fc3dffe12148b9612870eb21b24f2aecefa9ea24 Mon Sep 17 00:00:00 2001 From: Avi Kivity Date: Mon, 22 Jan 2007 20:40:40 -0800 Subject: [PATCH] KVM: fix bogus pagefault on writable pages If a page is marked as dirty in the guest pte, set_pte_common() can set the writable bit on a newly-instantiated shadow pte. This optimization avoids a write fault after the initial read fault. However, if a write fault instantiates the pte, fix_write_pf() incorrectly reports the fault as a guest page fault, and the guest oopses on what appears to be a correctly-mapped page. The fix is to detect the condition and only report a guest page fault on a user access to a kernel page. With the fix, a kvm guest can survive a whole night of running the kernel hacker's screensaver (make -j9 in a loop). Signed-off-by: Avi Kivity Cc: Ingo Molnar Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- drivers/kvm/paging_tmpl.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'drivers/kvm/paging_tmpl.h') diff --git a/drivers/kvm/paging_tmpl.h b/drivers/kvm/paging_tmpl.h index 2dbf4307ed9e..6bc41950fbb3 100644 --- a/drivers/kvm/paging_tmpl.h +++ b/drivers/kvm/paging_tmpl.h @@ -274,7 +274,7 @@ static int FNAME(fix_write_pf)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *page; if (is_writeble_pte(*shadow_ent)) - return 0; + return !user || (*shadow_ent & PT_USER_MASK); writable_shadow = *shadow_ent & PT_SHADOW_WRITABLE_MASK; if (user) { -- cgit v1.2.3 From 7993ba43db1c07245ada067791f91dbf018095ac Mon Sep 17 00:00:00 2001 From: Avi Kivity Date: Fri, 26 Jan 2007 00:56:41 -0800 Subject: [PATCH] KVM: MMU: Perform access checks in walk_addr() Check pte permission bits in walk_addr(), instead of scattering the checks all over the code. This has the following benefits: 1. We no longer set the accessed bit for accesses which fail permission checks. 2. Setting the accessed bit is simplified. 3. 
Under some circumstances, we used to pretend a page fault was fixed when it would actually fail the access checks. This caused an unnecessary vmexit. 4. The error code for guest page faults is now correct. The fix helps netbsd further along booting, and allows kvm to pass the new mmu testsuite. Signed-off-by: Avi Kivity Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- drivers/kvm/paging_tmpl.h | 68 ++++++++++++++++++++++++++--------------------- 1 file changed, 38 insertions(+), 30 deletions(-) (limited to 'drivers/kvm/paging_tmpl.h') diff --git a/drivers/kvm/paging_tmpl.h b/drivers/kvm/paging_tmpl.h index 6bc41950fbb3..afcd2a8f45bb 100644 --- a/drivers/kvm/paging_tmpl.h +++ b/drivers/kvm/paging_tmpl.h @@ -63,13 +63,15 @@ struct guest_walker { pt_element_t *ptep; pt_element_t inherited_ar; gfn_t gfn; + u32 error_code; }; /* * Fetch a guest pte for a guest virtual address */ -static void FNAME(walk_addr)(struct guest_walker *walker, - struct kvm_vcpu *vcpu, gva_t addr) +static int FNAME(walk_addr)(struct guest_walker *walker, + struct kvm_vcpu *vcpu, gva_t addr, + int write_fault, int user_fault) { hpa_t hpa; struct kvm_memory_slot *slot; @@ -86,7 +88,7 @@ static void FNAME(walk_addr)(struct guest_walker *walker, walker->ptep = &vcpu->pdptrs[(addr >> 30) & 3]; root = *walker->ptep; if (!(root & PT_PRESENT_MASK)) - return; + goto not_present; --walker->level; } #endif @@ -111,11 +113,18 @@ static void FNAME(walk_addr)(struct guest_walker *walker, ASSERT(((unsigned long)walker->table & PAGE_MASK) == ((unsigned long)ptep & PAGE_MASK)); - if (is_present_pte(*ptep) && !(*ptep & PT_ACCESSED_MASK)) - *ptep |= PT_ACCESSED_MASK; - if (!is_present_pte(*ptep)) - break; + goto not_present; + + if (write_fault && !is_writeble_pte(*ptep)) + if (user_fault || is_write_protection(vcpu)) + goto access_error; + + if (user_fault && !(*ptep & PT_USER_MASK)) + goto access_error; + + if (!(*ptep & PT_ACCESSED_MASK)) + *ptep |= PT_ACCESSED_MASK; /* avoid rmw */ if 
(walker->level == PT_PAGE_TABLE_LEVEL) { walker->gfn = (*ptep & PT_BASE_ADDR_MASK) @@ -146,6 +155,21 @@ static void FNAME(walk_addr)(struct guest_walker *walker, } walker->ptep = ptep; pgprintk("%s: pte %llx\n", __FUNCTION__, (u64)*ptep); + return 1; + +not_present: + walker->error_code = 0; + goto err; + +access_error: + walker->error_code = PFERR_PRESENT_MASK; + +err: + if (write_fault) + walker->error_code |= PFERR_WRITE_MASK; + if (user_fault) + walker->error_code |= PFERR_USER_MASK; + return 0; } static void FNAME(release_walker)(struct guest_walker *walker) @@ -347,7 +371,6 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gva_t addr, u32 error_code) { int write_fault = error_code & PFERR_WRITE_MASK; - int pte_present = error_code & PFERR_PRESENT_MASK; int user_fault = error_code & PFERR_USER_MASK; struct guest_walker walker; u64 *shadow_pte; @@ -365,19 +388,19 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gva_t addr, /* * Look up the shadow pte for the faulting address. */ - FNAME(walk_addr)(&walker, vcpu, addr); - shadow_pte = FNAME(fetch)(vcpu, addr, &walker); + r = FNAME(walk_addr)(&walker, vcpu, addr, write_fault, user_fault); /* * The page is not mapped by the guest. Let the guest handle it. */ - if (!shadow_pte) { - pgprintk("%s: not mapped\n", __FUNCTION__); - inject_page_fault(vcpu, addr, error_code); + if (!r) { + pgprintk("%s: guest page fault\n", __FUNCTION__); + inject_page_fault(vcpu, addr, walker.error_code); FNAME(release_walker)(&walker); return 0; } + shadow_pte = FNAME(fetch)(vcpu, addr, &walker); pgprintk("%s: shadow pte %p %llx\n", __FUNCTION__, shadow_pte, *shadow_pte); @@ -399,22 +422,7 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gva_t addr, * mmio: emulate if accessible, otherwise its a guest fault. 
*/ if (is_io_pte(*shadow_pte)) { - if (may_access(*shadow_pte, write_fault, user_fault)) - return 1; - pgprintk("%s: io work, no access\n", __FUNCTION__); - inject_page_fault(vcpu, addr, - error_code | PFERR_PRESENT_MASK); - kvm_mmu_audit(vcpu, "post page fault (io)"); - return 0; - } - - /* - * pte not present, guest page fault. - */ - if (pte_present && !fixed && !write_pt) { - inject_page_fault(vcpu, addr, error_code); - kvm_mmu_audit(vcpu, "post page fault (guest)"); - return 0; + return 1; } ++kvm_stat.pf_fixed; @@ -429,7 +437,7 @@ static gpa_t FNAME(gva_to_gpa)(struct kvm_vcpu *vcpu, gva_t vaddr) pt_element_t guest_pte; gpa_t gpa; - FNAME(walk_addr)(&walker, vcpu, vaddr); + FNAME(walk_addr)(&walker, vcpu, vaddr, 0, 0); guest_pte = *walker.ptep; FNAME(release_walker)(&walker); -- cgit v1.2.3 From 73b1087e6176a34c01eea3db269848f72fad72c1 Mon Sep 17 00:00:00 2001 From: Avi Kivity Date: Fri, 26 Jan 2007 00:56:41 -0800 Subject: [PATCH] KVM: MMU: Report nx faults to the guest With the recent guest page fault change, we perform access checks on our own instead of relying on the cpu. This means we have to perform the nx checks as well. Software like the google toolbar on windows appears to rely on this somehow. 
Signed-off-by: Avi Kivity Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- drivers/kvm/paging_tmpl.h | 15 ++++++++++++--- 1 file changed, 12 insertions(+), 3 deletions(-) (limited to 'drivers/kvm/paging_tmpl.h') diff --git a/drivers/kvm/paging_tmpl.h b/drivers/kvm/paging_tmpl.h index afcd2a8f45bb..149fa45fd9a5 100644 --- a/drivers/kvm/paging_tmpl.h +++ b/drivers/kvm/paging_tmpl.h @@ -71,7 +71,7 @@ struct guest_walker { */ static int FNAME(walk_addr)(struct guest_walker *walker, struct kvm_vcpu *vcpu, gva_t addr, - int write_fault, int user_fault) + int write_fault, int user_fault, int fetch_fault) { hpa_t hpa; struct kvm_memory_slot *slot; @@ -123,6 +123,11 @@ static int FNAME(walk_addr)(struct guest_walker *walker, if (user_fault && !(*ptep & PT_USER_MASK)) goto access_error; +#if PTTYPE == 64 + if (fetch_fault && is_nx(vcpu) && (*ptep & PT64_NX_MASK)) + goto access_error; +#endif + if (!(*ptep & PT_ACCESSED_MASK)) *ptep |= PT_ACCESSED_MASK; /* avoid rmw */ @@ -169,6 +174,8 @@ err: walker->error_code |= PFERR_WRITE_MASK; if (user_fault) walker->error_code |= PFERR_USER_MASK; + if (fetch_fault) + walker->error_code |= PFERR_FETCH_MASK; return 0; } @@ -372,6 +379,7 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gva_t addr, { int write_fault = error_code & PFERR_WRITE_MASK; int user_fault = error_code & PFERR_USER_MASK; + int fetch_fault = error_code & PFERR_FETCH_MASK; struct guest_walker walker; u64 *shadow_pte; int fixed; @@ -388,7 +396,8 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gva_t addr, /* * Look up the shadow pte for the faulting address. */ - r = FNAME(walk_addr)(&walker, vcpu, addr, write_fault, user_fault); + r = FNAME(walk_addr)(&walker, vcpu, addr, write_fault, user_fault, + fetch_fault); /* * The page is not mapped by the guest. Let the guest handle it. 
@@ -437,7 +446,7 @@ static gpa_t FNAME(gva_to_gpa)(struct kvm_vcpu *vcpu, gva_t vaddr) pt_element_t guest_pte; gpa_t gpa; - FNAME(walk_addr)(&walker, vcpu, vaddr, 0, 0); + FNAME(walk_addr)(&walker, vcpu, vaddr, 0, 0, 0); guest_pte = *walker.ptep; FNAME(release_walker)(&walker); -- cgit v1.2.3