From f73d1e6ca6985b43a1871467463cba632fbc624d Mon Sep 17 00:00:00 2001 From: Eugene Teo Date: Sat, 9 Feb 2008 23:53:17 +0800 Subject: lguest: make sure cpu is initialized before accessing it If req is LHREQ_INITIALIZE, and the guest has been initialized before (unlikely), it will attempt to access cpu->tsk even though cpu is not yet initialized. Signed-off-by: Eugene Teo Signed-off-by: Rusty Russell --- drivers/lguest/lguest_user.c | 15 ++++++++------- 1 file changed, 8 insertions(+), 7 deletions(-) diff --git a/drivers/lguest/lguest_user.c b/drivers/lguest/lguest_user.c index 85d42d3d01a9..2221485b0773 100644 --- a/drivers/lguest/lguest_user.c +++ b/drivers/lguest/lguest_user.c @@ -241,15 +241,16 @@ static ssize_t write(struct file *file, const char __user *in, cpu = &lg->cpus[cpu_id]; if (!cpu) return -EINVAL; - } - /* Once the Guest is dead, all you can do is read() why it died. */ - if (lg && lg->dead) - return -ENOENT; + /* Once the Guest is dead, you can only read() why it died. */ + if (lg->dead) + return -ENOENT; - /* If you're not the task which owns the Guest, you can only break */ - if (lg && current != cpu->tsk && req != LHREQ_BREAK) - return -EPERM; + /* If you're not the task which owns the Guest, all you can do + * is break the Launcher out of running the Guest. */ + if (current != cpu->tsk && req != LHREQ_BREAK) + return -EPERM; + } switch (req) { case LHREQ_INITIALIZE: -- cgit v1.2.3 From f14ae652baa3d72ae378f0c06b89cc2c4ef15ff8 Mon Sep 17 00:00:00 2001 From: Rusty Russell Date: Tue, 11 Mar 2008 09:35:56 -0500 Subject: lguest: fix __get_vm_area usage. Robert Bragg's 5dc331852848a38ca00a2817e5b98a1d0561b116 tightened (ie. fixed) the checking in __get_vm_area, and it broke lguest. lguest should pass the exact "end" it wants, not some random constant (it was possible previously that it would actually get an address different from SWITCHER_ADDR). Also, Fabio Checconi pointed out that we should make sure we're not hitting the fixmap area. Signed-off-by: Rusty Russell Cc: Robert Bragg --- drivers/lguest/core.c | 15 +++++++++++++-- 1 file changed, 13 insertions(+), 2 deletions(-) diff --git a/drivers/lguest/core.c b/drivers/lguest/core.c index 7743d73768df..c632c08cbbdc 100644 --- a/drivers/lguest/core.c +++ b/drivers/lguest/core.c @@ -69,11 +69,22 @@ static __init int map_switcher(void) switcher_page[i] = virt_to_page(addr); } + /* First we check that the Switcher won't overlap the fixmap area at + * the top of memory. It's currently nowhere near, but it could have + * very strange effects if it ever happened. */ + if (SWITCHER_ADDR + (TOTAL_SWITCHER_PAGES+1)*PAGE_SIZE > FIXADDR_START){ + err = -ENOMEM; + printk("lguest: mapping switcher would thwack fixmap\n"); + goto free_pages; + } + /* Now we reserve the "virtual memory area" we want: 0xFFC00000 * (SWITCHER_ADDR). We might not get it in theory, but in practice - * it's worked so far. */ + * it's worked so far. The end address needs +1 because __get_vm_area + * allocates an extra guard page, so we need space for that. */ switcher_vma = __get_vm_area(TOTAL_SWITCHER_PAGES * PAGE_SIZE, - VM_ALLOC, SWITCHER_ADDR, VMALLOC_END); + VM_ALLOC, SWITCHER_ADDR, SWITCHER_ADDR + + (TOTAL_SWITCHER_PAGES+1) * PAGE_SIZE); if (!switcher_vma) { err = -ENOMEM; printk("lguest: could not map switcher pages high\n"); -- cgit v1.2.3 From 3fabc55f34b72720e8a10aa442bd3415a211edb3 Mon Sep 17 00:00:00 2001 From: Rusty Russell Date: Tue, 11 Mar 2008 09:35:56 -0500 Subject: lguest: Sanitize the lguest clock. Now the TSC code handles a zero return from calculate_cpu_khz(), lguest can simply pass through the value it gets from the Host: if non-zero, all the normal TSC code applies. Otherwise (or if the Host really doesn't support TSC), the clocksource code will fall back to the slower but reasonable lguest clock. Signed-off-by: Rusty Russell --- arch/x86/lguest/boot.c | 53 ++++++++++++++++++++------------------------------ 1 file changed, 21 insertions(+), 32 deletions(-) diff --git a/arch/x86/lguest/boot.c b/arch/x86/lguest/boot.c index cccb38a59653..9c27c104d83c 100644 --- a/arch/x86/lguest/boot.c +++ b/arch/x86/lguest/boot.c @@ -84,7 +84,6 @@ struct lguest_data lguest_data = { .blocked_interrupts = { 1 }, /* Block timer interrupts */ .syscall_vec = SYSCALL_VECTOR, }; -static cycle_t clock_base; /*G:037 async_hcall() is pretty simple: I'm quite proud of it really. We have a * ring buffer of stored hypercalls which the Host will run though next time we @@ -327,8 +326,8 @@ static void lguest_cpuid(unsigned int *ax, unsigned int *bx, case 1: /* Basic feature request. */ /* We only allow kernel to see SSE3, CMPXCHG16B and SSSE3 */ *cx &= 0x00002201; - /* SSE, SSE2, FXSR, MMX, CMOV, CMPXCHG8B, FPU. */ - *dx &= 0x07808101; + /* SSE, SSE2, FXSR, MMX, CMOV, CMPXCHG8B, TSC, FPU. */ + *dx &= 0x07808111; /* The Host can do a nice optimization if it knows that the * kernel mappings (addresses above 0xC0000000 or whatever * PAGE_OFFSET is set to) haven't changed. But Linux calls @@ -595,19 +594,25 @@ static unsigned long lguest_get_wallclock(void) return lguest_data.time.tv_sec; } +/* The TSC is a Time Stamp Counter. The Host tells us what speed it runs at, + * or 0 if it's unusable as a reliable clock source. This matches what we want + * here: if we return 0 from this function, the x86 TSC clock will not register + * itself. */ +static unsigned long lguest_cpu_khz(void) +{ + return lguest_data.tsc_khz; +} + +/* If we can't use the TSC, the kernel falls back to our "lguest_clock", where + * we read the time value given to us by the Host. */ static cycle_t lguest_clock_read(void) { unsigned long sec, nsec; - /* If the Host tells the TSC speed, we can trust that. */ - if (lguest_data.tsc_khz) - return native_read_tsc(); - - /* If we can't use the TSC, we read the time value written by the Host. - * Since it's in two parts (seconds and nanoseconds), we risk reading - * it just as it's changing from 99 & 0.999999999 to 100 and 0, and - * getting 99 and 0. As Linux tends to come apart under the stress of - * time travel, we must be careful: */ + /* Since the time is in two parts (seconds and nanoseconds), we risk + * reading it just as it's changing from 99 & 0.999999999 to 100 and 0, + * and getting 99 and 0. As Linux tends to come apart under the stress + * of time travel, we must be careful: */ do { /* First we read the seconds part. */ sec = lguest_data.time.tv_sec; @@ -622,14 +627,14 @@ static cycle_t lguest_clock_read(void) /* Now if the seconds part has changed, try again. */ } while (unlikely(lguest_data.time.tv_sec != sec)); - /* Our non-TSC clock is in real nanoseconds. */ + /* Our lguest clock is in real nanoseconds. */ return sec*1000000000ULL + nsec; } -/* This is what we tell the kernel is our clocksource. */ +/* This is the fallback clocksource: lower priority than the TSC clocksource. */ static struct clocksource lguest_clock = { .name = "lguest", - .rating = 400, + .rating = 200, .read = lguest_clock_read, .mask = CLOCKSOURCE_MASK(64), .mult = 1 << 22, @@ -637,12 +642,6 @@ static struct clocksource lguest_clock = { .flags = CLOCK_SOURCE_IS_CONTINUOUS, }; -/* The "scheduler clock" is just our real clock, adjusted to start at zero */ -static unsigned long long lguest_sched_clock(void) -{ - return cyc2ns(&lguest_clock, lguest_clock_read() - clock_base); -} - /* We also need a "struct clock_event_device": Linux asks us to set it to go * off some time in the future. Actually, James Morris figured all this out, I * just applied the patch. */ @@ -712,19 +711,8 @@ static void lguest_time_init(void) /* Set up the timer interrupt (0) to go to our simple timer routine */ set_irq_handler(0, lguest_time_irq); - /* Our clock structure looks like arch/x86/kernel/tsc_32.c if we can - * use the TSC, otherwise it's a dumb nanosecond-resolution clock. - * Either way, the "rating" is set so high that it's always chosen over - * any other clocksource. */ - if (lguest_data.tsc_khz) - lguest_clock.mult = clocksource_khz2mult(lguest_data.tsc_khz, - lguest_clock.shift); - clock_base = lguest_clock_read(); clocksource_register(&lguest_clock); - /* Now we've set up our clock, we can use it as the scheduler clock */ - pv_time_ops.sched_clock = lguest_sched_clock; - /* We can't set cpumask in the initializer: damn C limitations! Set it * here and register our timer device. */ lguest_clockevent.cpumask = cpumask_of_cpu(0); @@ -995,6 +983,7 @@ __init void lguest_init(void) /* time operations */ pv_time_ops.get_wallclock = lguest_get_wallclock; pv_time_ops.time_init = lguest_time_init; + pv_time_ops.get_cpu_khz = lguest_cpu_khz; /* Now is a good time to look at the implementations of these functions * before returning to the rest of lguest_init(). */ -- cgit v1.2.3 From 4357bd9453b81e0a41db1dec16e06d74256b7560 Mon Sep 17 00:00:00 2001 From: Rusty Russell Date: Tue, 11 Mar 2008 09:35:57 -0500 Subject: lguest: Revert 1ce70c4fac3c3954bd48c035f448793867592bc0, fix real problem. Ahmed managed to crash the Host in release_pgd(), which cannot be a Guest bug, and indeed it wasn't. The bug was that handing a 0 as the address of the toplevel page table being manipulated can cause the lookup code in find_pgdir() to return an uninitialized cache entry (we shadow up to 4 top level page tables for each Guest). Commit 37cc8d7f963ba2deec29c9b68716944516a3244f introduced this behaviour in the Guest, uncovering the bug. The patch which he submitted (which removed the /4 from the index calculation) simply ensured that these high-indexed entries hit the early exit path of guest_set_pmd(). But you get lots of segfaults in guest userspace as the PMDs aren't being updated. Signed-off-by: Rusty Russell --- arch/x86/lguest/boot.c | 2 +- drivers/lguest/page_tables.c | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/arch/x86/lguest/boot.c b/arch/x86/lguest/boot.c index 9c27c104d83c..a104c532ff70 100644 --- a/arch/x86/lguest/boot.c +++ b/arch/x86/lguest/boot.c @@ -480,7 +480,7 @@ static void lguest_set_pmd(pmd_t *pmdp, pmd_t pmdval) { *pmdp = pmdval; lazy_hcall(LHCALL_SET_PMD, __pa(pmdp)&PAGE_MASK, - (__pa(pmdp)&(PAGE_SIZE-1)), 0); + (__pa(pmdp)&(PAGE_SIZE-1))/4, 0); } /* There are a couple of legacy places where the kernel sets a PTE, but we diff --git a/drivers/lguest/page_tables.c b/drivers/lguest/page_tables.c index 275f23c2deb4..a7f64a9d67e0 100644 --- a/drivers/lguest/page_tables.c +++ b/drivers/lguest/page_tables.c @@ -391,7 +391,7 @@ static unsigned int find_pgdir(struct lguest *lg, unsigned long pgtable) { unsigned int i; for (i = 0; i < ARRAY_SIZE(lg->pgdirs); i++) - if (lg->pgdirs[i].gpgdir == pgtable) + if (lg->pgdirs[i].pgdir && lg->pgdirs[i].gpgdir == pgtable) break; return i; } -- cgit v1.2.3 From 1ef36fa64e65079de18ff5179a51af58e44d49a6 Mon Sep 17 00:00:00 2001 From: Paul Bolle Date: Mon, 10 Mar 2008 16:39:03 +0100 Subject: lguest: Do not append space to guests kernel command line The lguest launcher appends a space to the kernel command line (if kernel arguments are specified on its command line). This space is unneeded. More importantly, this appended space will make Red Hat's nash script interpreter (used in a Fedora style initramfs) add an empty argument to init's command line. This empty argument will make kernel arguments like "init=/bin/bash" fail (because the shell will try to execute a script with an empty name). This could be considered a bug in nash, but is easily fixed in the lguest launcher too. Signed-off-by: Paul Bolle Signed-off-by: Rusty Russell --- Documentation/lguest/lguest.c | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/Documentation/lguest/lguest.c b/Documentation/lguest/lguest.c index 0f23d67f958f..bec5a32e4095 100644 --- a/Documentation/lguest/lguest.c +++ b/Documentation/lguest/lguest.c @@ -486,9 +486,12 @@ static void concat(char *dst, char *args[]) unsigned int i, len = 0; for (i = 0; args[i]; i++) { + if (i) { + strcat(dst+len, " "); + len++; + } strcpy(dst+len, args[i]); - strcat(dst+len, " "); - len += strlen(args[i]) + 1; + len += strlen(args[i]); } /* In case it's empty. */ dst[len] = '\0'; -- cgit v1.2.3