Diffstat (limited to 'drivers')
 drivers/lguest/core.c            |   7
 drivers/lguest/hypercalls.c      |   6
 drivers/lguest/lguest_device.c   |  11
 drivers/lguest/lguest_user.c     | 100
 drivers/lguest/page_tables.c     |  84
 drivers/lguest/x86/core.c        |   2
 drivers/lguest/x86/switcher_32.S |   6
7 files changed, 176 insertions, 40 deletions
diff --git a/drivers/lguest/core.c b/drivers/lguest/core.c
index cd058bc903ff..1e2cb846b3c9 100644
--- a/drivers/lguest/core.c
+++ b/drivers/lguest/core.c
@@ -217,10 +217,15 @@ int run_guest(struct lg_cpu *cpu, unsigned long __user *user)
 		/*
 		 * It's possible the Guest did a NOTIFY hypercall to the
-		 * Launcher, in which case we return from the read() now.
+		 * Launcher.
 		 */
 		if (cpu->pending_notify) {
+			/*
+			 * Does it just need to write to a registered
+			 * eventfd (ie. the appropriate virtqueue thread)?
+			 */
 			if (!send_notify_to_eventfd(cpu)) {
+				/* OK, we tell the main Launcher. */
 				if (put_user(cpu->pending_notify, user))
 					return -EFAULT;
 				return sizeof(cpu->pending_notify);
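To see the other side of that read() protocol, here is a minimal sketch of a
Launcher main loop, assuming a /dev/lguest fd already set up with
LHREQ_INITIALIZE.  The helper names are illustrative, not the real Launcher's:

#include <unistd.h>
#include <errno.h>
#include <stdlib.h>

static void handle_notify(unsigned long addr)
{
	(void)addr;	/* device work for a NOTIFY no eventfd claimed */
}

static void run_guest_loop(int lguest_fd)
{
	for (;;) {
		unsigned long notify_addr;
		ssize_t r = read(lguest_fd, &notify_addr, sizeof(notify_addr));

		if (r == sizeof(notify_addr))
			handle_notify(notify_addr);	/* Guest did LHCALL_NOTIFY */
		else if (r < 0 && errno == EINTR)
			continue;	/* eg. a signal used to inject an interrupt */
		else
			exit(1);	/* Guest died, or a real error */
	}
}

The eventfd path added above is what lets most notifications bypass this loop
entirely and go straight to a virtqueue thread.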
diff --git a/drivers/lguest/hypercalls.c b/drivers/lguest/hypercalls.c
index 787ab4bc09f0..83511eb0923d 100644
--- a/drivers/lguest/hypercalls.c
+++ b/drivers/lguest/hypercalls.c
@@ -59,7 +59,7 @@ static void do_hcall(struct lg_cpu *cpu, struct hcall_args *args)
 	case LHCALL_SHUTDOWN: {
 		char msg[128];
 		/*
-		 * Shutdown is such a trivial hypercall that we do it in four
+		 * Shutdown is such a trivial hypercall that we do it in five
 		 * lines right here.
 		 *
 		 * If the lgread fails, it will call kill_guest() itself; the
@@ -245,6 +245,10 @@ static void initialize(struct lg_cpu *cpu)
  * device), the Guest will still see the old page.  In practice, this never
  * happens: why would the Guest read a page which it has never written to?  But
  * a similar scenario might one day bite us, so it's worth mentioning.
+ *
+ * Note that if we used a shared anonymous mapping in the Launcher instead of
+ * mapping /dev/zero private, we wouldn't worry about copy-on-write.  And we
+ * need that to switch the Launcher to processes (away from threads) anyway.
 :*/

 /*H:100
diff --git a/drivers/lguest/lguest_device.c b/drivers/lguest/lguest_device.c
index cc000e79c3d1..1401c1ace1ec 100644
--- a/drivers/lguest/lguest_device.c
+++ b/drivers/lguest/lguest_device.c
@@ -236,7 +236,7 @@ static void lg_notify(struct virtqueue *vq)
 extern void lguest_setup_irq(unsigned int irq);

 /*
- * This routine finds the first virtqueue described in the configuration of
+ * This routine finds the Nth virtqueue described in the configuration of
  * this device and sets it up.
  *
  * This is kind of an ugly duckling.  It'd be nicer to have a standard
@@ -244,9 +244,6 @@ extern void lguest_setup_irq(unsigned int irq);
  * everyone wants to do it differently.  The KVM coders want the Guest to
  * allocate its own pages and tell the Host where they are, but for lguest it's
  * simpler for the Host to simply tell us where the pages are.
- *
- * So we provide drivers with a "find the Nth virtqueue and set it up"
- * function.
  */
 static struct virtqueue *lg_find_vq(struct virtio_device *vdev,
 				    unsigned index,
@@ -422,7 +419,11 @@ static void add_lguest_device(struct lguest_device_desc *d,
 	/* This device's parent is the lguest/ dir. */
 	ldev->vdev.dev.parent = lguest_root;

-	/* We have a unique device index thanks to the dev_index counter. */
+	/*
+	 * The device type comes straight from the descriptor.  There's also a
+	 * device vendor field in the virtio_device struct, which we leave as
+	 * 0.
+	 */
 	ldev->vdev.id.device = d->type;
 	/*
 	 * We have a simple set of routines for querying the device's
diff --git a/drivers/lguest/lguest_user.c b/drivers/lguest/lguest_user.c
index 7e92017103dc..b4d3f7ca554f 100644
--- a/drivers/lguest/lguest_user.c
+++ b/drivers/lguest/lguest_user.c
@@ -1,9 +1,8 @@
-/*P:200
- * This contains all the /dev/lguest code, whereby the userspace launcher
+/*P:200 This contains all the /dev/lguest code, whereby the userspace launcher
  * controls and communicates with the Guest.  For example, the first write will
- * tell us the Guest's memory layout, pagetable, entry point and kernel address
- * offset.  A read will run the Guest until something happens, such as a signal
- * or the Guest doing a NOTIFY out to the Launcher.
+ * tell us the Guest's memory layout and entry point.  A read will run the
+ * Guest until something happens, such as a signal or the Guest doing a NOTIFY
+ * out to the Launcher.
 :*/
 #include <linux/uaccess.h>
 #include <linux/miscdevice.h>
@@ -13,14 +12,41 @@
 #include <linux/file.h>
 #include "lg.h"

+/*L:056
+ * Before we move on, let's jump ahead and look at what the kernel does when
+ * it needs to look up the eventfds.  That will complete our picture of how we
+ * use RCU.
+ *
+ * The notification value is in cpu->pending_notify: we return true if it went
+ * to an eventfd.
+ */
 bool send_notify_to_eventfd(struct lg_cpu *cpu)
 {
 	unsigned int i;
 	struct lg_eventfd_map *map;

-	/* lg->eventfds is RCU-protected */
+	/*
+	 * This "rcu_read_lock()" helps track when someone is still looking at
+	 * the (RCU-using) eventfds array.  It's not actually a lock at all;
+	 * indeed it's a noop in many configurations.  (You didn't expect me to
+	 * explain all the RCU secrets here, did you?)
+	 */
 	rcu_read_lock();
+	/*
+	 * rcu_dereference is the counter-side of rcu_assign_pointer(); it
+	 * makes sure we don't access the memory pointed to by
+	 * cpu->lg->eventfds before cpu->lg->eventfds is set.  Sounds crazy,
+	 * but Alpha allows this!  Paul McKenney points out that a really
+	 * aggressive compiler could have the same effect:
+	 *   http://lists.ozlabs.org/pipermail/lguest/2009-July/001560.html
+	 *
+	 * So play safe, use rcu_dereference to get the rcu-protected pointer:
+	 */
 	map = rcu_dereference(cpu->lg->eventfds);
+	/*
+	 * Simple array search: even if they add an eventfd while we do this,
+	 * we'll continue to use the old array and just won't see the new one.
+	 */
 	for (i = 0; i < map->num; i++) {
 		if (map->map[i].addr == cpu->pending_notify) {
 			eventfd_signal(map->map[i].event, 1);
@@ -28,14 +54,43 @@ bool send_notify_to_eventfd(struct lg_cpu *cpu)
 			break;
 		}
 	}
+	/* We're done with the rcu-protected variable cpu->lg->eventfds. */
 	rcu_read_unlock();
+
+	/* If we cleared the notification, it's because we found a match. */
 	return cpu->pending_notify == 0;
 }

+/*L:055
+ * One of the more tricksy tricks in the Linux Kernel is a technique called
+ * Read Copy Update.  Since one point of lguest is to teach lguest journeyers
+ * about kernel coding, I use it here.  (In case you're curious, other purposes
+ * include learning about virtualization and instilling a deep appreciation for
+ * simplicity and puppies).
+ *
+ * We keep a simple array which maps LHCALL_NOTIFY values to eventfds, but we
+ * add new eventfds without ever blocking readers from accessing the array.
+ * The current Launcher only does this during boot, so that never happens.  But
+ * Read Copy Update is cool, and adding a lock risks damaging even more puppies
+ * than this code does.
+ *
+ * We allocate a brand new one-larger array, copy the old one and add our new
+ * element.  Then we make the lg eventfd pointer point to the new array.
+ * That's the easy part: now we need to free the old one, but we need to make
+ * sure no slow CPU somewhere is still looking at it.  That's what
+ * synchronize_rcu does for us: it waits until every CPU has indicated that it
+ * has moved on, so we know no one is still using the old one.
+ *
+ * If that's unclear, see http://en.wikipedia.org/wiki/Read-copy-update.
+ */
 static int add_eventfd(struct lguest *lg, unsigned long addr, int fd)
 {
 	struct lg_eventfd_map *new, *old = lg->eventfds;

+	/*
+	 * We don't allow notifications on value 0 anyway (pending_notify of
+	 * 0 means "nothing pending").
+	 */
 	if (!addr)
 		return -EINVAL;
@@ -62,12 +117,20 @@ static int add_eventfd(struct lguest *lg, unsigned long addr, int fd)
 	}
 	new->num++;

-	/* Now put new one in place. */
+	/*
+	 * Now put new one in place: rcu_assign_pointer() is a fancy way of
+	 * doing "lg->eventfds = new", but it uses memory barriers to make
+	 * absolutely sure that the contents of "new" written above are nailed
+	 * down before we actually do the assignment.
+	 *
+	 * We have to think about these kinds of things when we're operating on
+	 * live data without locks.
+	 */
 	rcu_assign_pointer(lg->eventfds, new);

 	/*
 	 * We're not in a big hurry.  Wait until no one's looking at the old
-	 * version, then delete it.
+	 * version, then free it.
 	 */
 	synchronize_rcu();
 	kfree(old);
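That's the entire RCU dance: readers take no lock, and the updater copies,
publishes with rcu_assign_pointer(), then waits with synchronize_rcu() before
freeing.  You can try the same pattern outside the kernel with the userspace
RCU library (liburcu); a minimal sketch, with a bare array of addresses
standing in for the eventfd map (build with -lurcu; all names illustrative):

#include <urcu.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

struct map {
	unsigned int num;
	unsigned long addr[];	/* flexible array, like map->map[] above */
};

static struct map *eventfds;	/* the RCU-protected pointer */

/* Reader: the shape of send_notify_to_eventfd(). */
static int lookup(unsigned long want)
{
	struct map *m;
	unsigned int i;
	int found = 0;

	rcu_read_lock();
	m = rcu_dereference(eventfds);
	for (i = 0; m && i < m->num; i++)
		if (m->addr[i] == want)
			found = 1;
	rcu_read_unlock();
	return found;
}

/*
 * Updater: the shape of add_eventfd(): copy, extend, publish, wait, free.
 * There's a single updater (lguest holds lguest_lock here), so the plain
 * read of the old pointer is fine.
 */
static int add(unsigned long addr)
{
	struct map *old = eventfds;
	unsigned int num = old ? old->num : 0;
	struct map *new = malloc(sizeof(*new) + (num + 1) * sizeof(new->addr[0]));

	if (!new)
		return -1;
	if (old)
		memcpy(new->addr, old->addr, num * sizeof(new->addr[0]));
	new->addr[num] = addr;
	new->num = num + 1;

	rcu_assign_pointer(eventfds, new);	/* publish (with barriers) */
	synchronize_rcu();			/* wait out all readers */
	free(old);
	return 0;
}

int main(void)
{
	rcu_register_thread();
	add(0x1234);
	printf("found: %d\n", lookup(0x1234));
	rcu_unregister_thread();
	return 0;
}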
@@ -75,6 +138,14 @@ static int add_eventfd(struct lguest *lg, unsigned long addr, int fd)
 	return 0;
 }

+/*L:052
+ * Receiving notifications from the Guest is usually done by attaching a
+ * particular LHCALL_NOTIFY value to an event file descriptor.  The eventfd
+ * will become readable when the Guest does an LHCALL_NOTIFY with that value.
+ *
+ * This is really convenient for processing each virtqueue in a separate
+ * thread.
+ */
 static int attach_eventfd(struct lguest *lg, const unsigned long __user *input)
 {
 	unsigned long addr, fd;
@@ -86,6 +157,11 @@ static int attach_eventfd(struct lguest *lg, const unsigned long __user *input)
 	if (get_user(fd, input) != 0)
 		return -EFAULT;

+	/*
+	 * Just make sure two callers don't add eventfds at once.  We really
+	 * only need to lock against callers adding to the same Guest, so using
+	 * the Big Lguest Lock is overkill.  But this is setup, not a fast path.
+	 */
 	mutex_lock(&lguest_lock);
 	err = add_eventfd(lg, addr, fd);
 	mutex_unlock(&lguest_lock);
@@ -106,6 +182,10 @@ static int user_send_irq(struct lg_cpu *cpu, const unsigned long __user *input)
 	if (irq >= LGUEST_IRQS)
 		return -EINVAL;

+	/*
+	 * Next time the Guest runs, the core code will see if it can deliver
+	 * this interrupt.
+	 */
 	set_interrupt(cpu, irq);
 	return 0;
 }
@@ -307,10 +387,10 @@ unlock:
  * The first operation the Launcher does must be a write.  All writes
  * start with an unsigned long number: for the first write this must be
  * LHREQ_INITIALIZE to set up the Guest.  After that the Launcher can use
- * writes of other values to send interrupts.
+ * writes of other values to send interrupts or set up receipt of notifications.
  *
  * Note that we overload the "offset" in the /dev/lguest file to indicate what
- * CPU number we're dealing with.  Currently this is always 0, since we only
+ * CPU number we're dealing with.  Currently this is always 0 since we only
  * support uniprocessor Guests, but you can see the beginnings of SMP support
  * here.
  */
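From the Launcher's side, attach_eventfd() is driven by a three-word write:
the request code, then the address and fd in the order the get_user() calls
above read them.  A hedged sketch: the LHREQ_EVENTFD name and value are
assumptions (check linux/lguest_launcher.h for the real enum), and the rest
is illustrative:

#include <sys/eventfd.h>
#include <unistd.h>

#define LHREQ_EVENTFD 4	/* assumed value; see linux/lguest_launcher.h */

/*
 * Returns an fd which becomes readable when the Guest does LHCALL_NOTIFY
 * on notify_addr, or -1 on error.
 */
static int attach_notify_eventfd(int lguest_fd, unsigned long notify_addr)
{
	unsigned long args[3];
	int efd = eventfd(0, 0);

	if (efd < 0)
		return -1;

	args[0] = LHREQ_EVENTFD;
	args[1] = notify_addr;
	args[2] = efd;
	if (write(lguest_fd, args, sizeof(args)) != (ssize_t)sizeof(args)) {
		close(efd);
		return -1;
	}
	return efd;	/* hand this to the virtqueue thread's read() loop */
}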
diff --git a/drivers/lguest/page_tables.c b/drivers/lguest/page_tables.c
index 3da902e4b4cb..a8d0aee3bc0e 100644
--- a/drivers/lguest/page_tables.c
+++ b/drivers/lguest/page_tables.c
@@ -29,10 +29,10 @@
 /*H:300
  * The Page Table Code
  *
- * We use two-level page tables for the Guest.  If you're not entirely
- * comfortable with virtual addresses, physical addresses and page tables then
- * I recommend you review arch/x86/lguest/boot.c's "Page Table Handling" (with
- * diagrams!).
+ * We use two-level page tables for the Guest, or three-level with PAE.  If
+ * you're not entirely comfortable with virtual addresses, physical addresses
+ * and page tables then I recommend you review arch/x86/lguest/boot.c's "Page
+ * Table Handling" (with diagrams!).
  *
  * The Guest keeps page tables, but we maintain the actual ones here: these are
  * called "shadow" page tables.  Which is a very Guest-centric name: these are
@@ -52,9 +52,8 @@
 :*/

 /*
- * 1024 entries in a page table page maps 1024 pages: 4MB.  The Switcher is
- * conveniently placed at the top 4MB, so it uses a separate, complete PTE
- * page.
+ * The Switcher uses the complete top PTE page.  That's 1024 PTE entries (4MB)
+ * or 512 PTE entries with PAE (2MB).
  */
 #define SWITCHER_PGD_INDEX (PTRS_PER_PGD - 1)
@@ -81,7 +80,8 @@ static DEFINE_PER_CPU(pte_t *, switcher_pte_pages);
 /*H:320
  * The page table code is curly enough to need helper functions to keep it
- * clear and clean.
+ * clear and clean.  The kernel itself provides many of them; that's one
+ * advantage of insisting that the Guest and Host use the same CONFIG_PAE setting.
  *
  * There are two functions which return pointers to the shadow (aka "real")
  * page tables.
@@ -155,7 +155,7 @@ static pte_t *spte_addr(struct lg_cpu *cpu, pgd_t spgd, unsigned long vaddr)
 }

 /*
- * These two functions just like the above two, except they access the Guest
+ * These functions are just like the above two, except they access the Guest
  * page tables.  Hence they return a Guest address.
  */
 static unsigned long gpgd_addr(struct lg_cpu *cpu, unsigned long vaddr)
@@ -165,6 +165,7 @@ static unsigned long gpgd_addr(struct lg_cpu *cpu, unsigned long vaddr)
 }

 #ifdef CONFIG_X86_PAE
+/* Follow the PGD to the PMD. */
 static unsigned long gpmd_addr(pgd_t gpgd, unsigned long vaddr)
 {
 	unsigned long gpage = pgd_pfn(gpgd) << PAGE_SHIFT;
@@ -172,6 +173,7 @@ static unsigned long gpmd_addr(pgd_t gpgd, unsigned long vaddr)
 	return gpage + pmd_index(vaddr) * sizeof(pmd_t);
 }

+/* Follow the PMD to the PTE. */
 static unsigned long gpte_addr(struct lg_cpu *cpu,
 			       pmd_t gpmd, unsigned long vaddr)
 {
@@ -181,6 +183,7 @@ static unsigned long gpte_addr(struct lg_cpu *cpu,
 	return gpage + pte_index(vaddr) * sizeof(pte_t);
 }
 #else
+/* Follow the PGD to the PTE (no mid-level for !PAE). */
 static unsigned long gpte_addr(struct lg_cpu *cpu,
 				pgd_t gpgd, unsigned long vaddr)
 {
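All these helpers are one trick: table base plus index times entry size.  A
standalone toy showing the non-PAE arithmetic (constants hard-coded for the
classic 32-bit two-level layout; purely illustrative):

#include <stdio.h>

#define PAGE_SHIFT	12
#define PGDIR_SHIFT	22		/* each PGD entry covers 4MB */
#define PTRS_PER_PTE	1024

/* Like gpgd_addr(): address of the PGD entry covering vaddr. */
static unsigned long pgd_entry(unsigned long pgdir, unsigned long vaddr)
{
	return pgdir + (vaddr >> PGDIR_SHIFT) * 4;	/* 4-byte entries */
}

/* Like the !PAE gpte_addr(): address of the PTE entry in a PTE page. */
static unsigned long pte_entry(unsigned long pte_page, unsigned long vaddr)
{
	return pte_page + ((vaddr >> PAGE_SHIFT) & (PTRS_PER_PTE - 1)) * 4;
}

int main(void)
{
	unsigned long vaddr = 0xC0123456;	/* a typical kernel address */

	/* PGD slot 768 (ie. 3G), PTE slot 0x123. */
	printf("PGD entry at %#lx, PTE entry at %#lx\n",
	       pgd_entry(0x1000, vaddr), pte_entry(0x2000, vaddr));
	return 0;
}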
@@ -314,6 +317,7 @@ bool demand_page(struct lg_cpu *cpu, unsigned long vaddr, int errcode)
 	pte_t gpte;
 	pte_t *spte;

+	/* Mid level for PAE. */
 #ifdef CONFIG_X86_PAE
 	pmd_t *spmd;
 	pmd_t gpmd;
@@ -391,6 +395,8 @@ bool demand_page(struct lg_cpu *cpu, unsigned long vaddr, int errcode)
 	 */
 	gpte_ptr = gpte_addr(cpu, gpgd, vaddr);
 #endif
+
+	/* Read the actual PTE value. */
 	gpte = lgread(cpu, gpte_ptr, pte_t);

 	/* If this page isn't in the Guest page tables, we can't page it in. */
@@ -507,6 +513,7 @@ void pin_page(struct lg_cpu *cpu, unsigned long vaddr)
 	if (!page_writable(cpu, vaddr) && !demand_page(cpu, vaddr, 2))
 		kill_guest(cpu, "bad stack page %#lx", vaddr);
 }
+/*:*/

 #ifdef CONFIG_X86_PAE
 static void release_pmd(pmd_t *spmd)
@@ -543,7 +550,11 @@ static void release_pgd(pgd_t *spgd)
 }

 #else /* !CONFIG_X86_PAE */
-/*H:450 If we chase down the release_pgd() code, it looks like this: */
+/*H:450
+ * If we chase down the release_pgd() code, the non-PAE version looks like
+ * this.  The PAE version is almost identical, but instead of calling
+ * release_pte it calls release_pmd(), which looks much like this.
+ */
 static void release_pgd(pgd_t *spgd)
 {
 	/* If the entry's not present, there's nothing to release. */
@@ -898,17 +909,21 @@ void guest_set_pgd(struct lguest *lg, unsigned long gpgdir, u32 idx)
 		/* ... throw it away. */
 		release_pgd(lg->pgdirs[pgdir].pgdir + idx);
 }
+
 #ifdef CONFIG_X86_PAE
+/* For setting a mid-level, we just throw everything away.  It's easy. */
 void guest_set_pmd(struct lguest *lg, unsigned long pmdp, u32 idx)
 {
 	guest_pagetable_clear_all(&lg->cpus[0]);
 }
 #endif

-/*
- * Once we know how much memory we have we can construct simple identity (which
+/*H:505
+ * To get through boot, we construct simple identity page mappings (which
  * set virtual == physical) and linear mappings which will get the Guest far
- * enough into the boot to create its own.
+ * enough into the boot to create its own.  The linear mapping simplifies the
+ * Guest's boot, but it makes assumptions about the Guest's PAGE_OFFSET, as
+ * you'll see.
  *
  * We lay them out of the way, just below the initrd (which is why we need to
  * know its size here).
@@ -944,6 +959,10 @@ static unsigned long setup_pagetables(struct lguest *lg,
 	linear = (void *)pgdir - linear_pages * PAGE_SIZE;

 #ifdef CONFIG_X86_PAE
+	/*
+	 * And the single mid page goes below that.  We only use one, but
+	 * that's enough to map 1G, which definitely gets us through boot.
+	 */
 	pmds = (void *)linear - PAGE_SIZE;
 #endif
 	/*
@@ -957,13 +976,14 @@ static unsigned long setup_pagetables(struct lguest *lg,
 			return -EFAULT;
 	}

+#ifdef CONFIG_X86_PAE
 	/*
-	 * The top level points to the linear page table pages above.
-	 * We setup the identity and linear mappings here.
+	 * Make the Guest PMD entries point to the corresponding place in the
+	 * linear mapping (up to one page's worth of PMD entries).
 	 */
-#ifdef CONFIG_X86_PAE
 	for (i = j = 0; i < mapped_pages && j < PTRS_PER_PMD;
 	     i += PTRS_PER_PTE, j++) {
+		/* FIXME: native_set_pmd is overkill here. */
 		native_set_pmd(&pmd, __pmd(((unsigned long)(linear + i)
 		- mem_base) | _PAGE_PRESENT | _PAGE_RW | _PAGE_USER));
@@ -971,18 +991,36 @@ static unsigned long setup_pagetables(struct lguest *lg,
 			return -EFAULT;
 	}

+	/* One PGD entry, pointing to that PMD page. */
 	set_pgd(&pgd, __pgd(((u32)pmds - mem_base) | _PAGE_PRESENT));
+	/* Copy it in as the first PGD entry (ie. addresses 0-1G). */
 	if (copy_to_user(&pgdir[0], &pgd, sizeof(pgd)) != 0)
 		return -EFAULT;
+	/*
+	 * And the fourth PGD entry (ie. index 3, addresses 3G-4G).
+	 *
+	 * FIXME: This assumes that PAGE_OFFSET for the Guest is 0xC0000000.
+	 */
 	if (copy_to_user(&pgdir[3], &pgd, sizeof(pgd)) != 0)
 		return -EFAULT;
 #else
+	/*
+	 * The top level points to the linear page table pages above.
+	 * We set up the identity and linear mappings here.
+	 */
 	phys_linear = (unsigned long)linear - mem_base;
 	for (i = 0; i < mapped_pages; i += PTRS_PER_PTE) {
 		pgd_t pgd;
+		/*
+		 * Create a PGD entry which points to the right part of the
+		 * linear PTE pages.
+		 */
 		pgd = __pgd((phys_linear + i * sizeof(pte_t)) |
 			    (_PAGE_PRESENT | _PAGE_RW | _PAGE_USER));

+		/*
+		 * Copy it into the PGD page at 0 and at PAGE_OFFSET.
+		 */
 		if (copy_to_user(&pgdir[i / PTRS_PER_PTE], &pgd, sizeof(pgd))
 		    || copy_to_user(&pgdir[pgd_index(PAGE_OFFSET)
 					   + i / PTRS_PER_PTE],
@@ -992,8 +1030,8 @@ static unsigned long setup_pagetables(struct lguest *lg,
 #endif

 	/*
-	 * We return the top level (guest-physical) address: remember where
-	 * this is.
+	 * We return the top level (guest-physical) address: we remember where
+	 * this is to write it into lguest_data when the Guest initializes.
 	 */
 	return (unsigned long)pgdir - mem_base;
 }
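Numbers make the identity-plus-linear trick easier to see: each PTE page gets
installed twice, at PGD slot N (identity) and at pgd_index(PAGE_OFFSET) + N
(linear).  A toy model, assuming the usual 0xC0000000 PAGE_OFFSET and a 64MB
Guest:

#include <stdio.h>

#define PTRS_PER_PTE	1024
#define PGDIR_SHIFT	22
#define PAGE_OFFSET	0xC0000000UL	/* the usual 32-bit 3G/1G split */

int main(void)
{
	unsigned long mapped_pages = 16384;	/* 64MB of Guest memory */
	unsigned long i;

	for (i = 0; i < mapped_pages; i += PTRS_PER_PTE) {
		unsigned long slot = i / PTRS_PER_PTE;

		printf("PTE page %2lu -> PGD slots %3lu (identity) and %lu (linear)\n",
		       slot, slot, PAGE_OFFSET / (1UL << PGDIR_SHIFT) + slot);
	}
	return 0;
}

Sixteen PTE pages land in slots 0-15 and again in slots 768-783: exactly the
two copy_to_user() targets in the loop above.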
@@ -1031,7 +1069,9 @@ int init_guest_pagetable(struct lguest *lg)
 	lg->pgdirs[0].pgdir = (pgd_t *)get_zeroed_page(GFP_KERNEL);
 	if (!lg->pgdirs[0].pgdir)
 		return -ENOMEM;
+
 #ifdef CONFIG_X86_PAE
+	/* For PAE, we also create the initial mid-level. */
 	pgd = lg->pgdirs[0].pgdir;
 	pmd_table = (pmd_t *) get_zeroed_page(GFP_KERNEL);
 	if (!pmd_table)
@@ -1040,11 +1080,13 @@ int init_guest_pagetable(struct lguest *lg)
 	set_pgd(pgd + SWITCHER_PGD_INDEX,
 		__pgd(__pa(pmd_table) | _PAGE_PRESENT));
 #endif
+
+	/* This is the current page table. */
 	lg->cpus[0].cpu_pgd = 0;
 	return 0;
 }

-/* When the Guest calls LHCALL_LGUEST_INIT we do more setup. */
+/*H:508 When the Guest calls LHCALL_LGUEST_INIT we do more setup. */
 void page_table_guest_data_init(struct lg_cpu *cpu)
 {
 	/* We get the kernel address: above this is all kernel memory. */
@@ -1105,12 +1147,16 @@ void map_switcher_in_guest(struct lg_cpu *cpu, struct lguest_pages *pages)
 	pmd_t switcher_pmd;
 	pmd_t *pmd_table;

+	/* FIXME: native_set_pmd is overkill here. */
 	native_set_pmd(&switcher_pmd, pfn_pmd(__pa(switcher_pte_page) >>
 		       PAGE_SHIFT, PAGE_KERNEL_EXEC));

+	/*
+	 * Figure out where the PMD page is, by reading the PGD, and converting
+	 * it to a virtual address.
+	 */
 	pmd_table = __va(pgd_pfn(cpu->lg->
 			pgdirs[cpu->cpu_pgd].pgdir[SWITCHER_PGD_INDEX])
 								<< PAGE_SHIFT);
+	/* Now write it into the shadow page table. */
 	native_set_pmd(&pmd_table[SWITCHER_PMD_INDEX], switcher_pmd);
 #else
 	pgd_t switcher_pgd;
diff --git a/drivers/lguest/x86/core.c b/drivers/lguest/x86/core.c
index 96f7d88ec7f8..6ae388849a3b 100644
--- a/drivers/lguest/x86/core.c
+++ b/drivers/lguest/x86/core.c
@@ -187,7 +187,7 @@ static void run_guest_once(struct lg_cpu *cpu, struct lguest_pages *pages)
  * also simplify copy_in_guest_info().  Note that we'd still need to restore
  * things when we exit to Launcher userspace, but that's fairly easy.
  *
- * We could also try using this hooks for PGE, but that might be too expensive.
+ * We could also try using these hooks for PGE, but that might be too expensive.
  *
  * The hooks were designed for KVM, but we can also put them to good use.
 :*/
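One quick cross-check before the Switcher itself: SWITCHER_PGD_INDEX claims
the top PGD slot, and the top PTE page maps 4MB (1024 entries) without PAE or
2MB (512 entries) with it.  That is exactly where the two addresses in the
next file come from:

#include <stdio.h>

int main(void)
{
	unsigned long long top = 1ULL << 32;	/* 4GB */

	printf("!PAE: %#llx\n", top - 1024 * 4096);	/* 0xffc00000 */
	printf(" PAE: %#llx\n", top - 512 * 4096);	/* 0xffe00000 */
	return 0;
}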
diff --git a/drivers/lguest/x86/switcher_32.S b/drivers/lguest/x86/switcher_32.S
index 6dec09793836..40634b0db9f7 100644
--- a/drivers/lguest/x86/switcher_32.S
+++ b/drivers/lguest/x86/switcher_32.S
@@ -1,7 +1,7 @@
 /*P:900
- * This is the Switcher: code which sits at 0xFFC00000 astride both the
- * Host and Guest to do the low-level Guest<->Host switch.  It is as simple as
- * it can be made, but it's naturally very specific to x86.
+ * This is the Switcher: code which sits at 0xFFC00000 (or 0xFFE00000) astride
+ * both the Host and Guest to do the low-level Guest<->Host switch.  It is as
+ * simple as it can be made, but it's naturally very specific to x86.
  *
  * You have now completed Preparation.  If this has whetted your appetite; if you
  * are feeling invigorated and refreshed then the next, more challenging stage
