| author | Ingo Molnar <mingo@elte.hu> | 2008-06-25 12:23:59 +0200 | 
|---|---|---|
| committer | Ingo Molnar <mingo@elte.hu> | 2008-06-25 12:23:59 +0200 | 
| commit | ace7f1b79670aa0c1d9f4b0442be82e565827333 | |
| tree | ed4625a9cbc8e28db2b1ff9fad111c4c012508a3 | |
| parent | a60b33cf59d1c9e0e363287fce799cb23d45660c | |
| parent | 543cf4cb3fe6f6cae3651ba918b9c56200b257d0 | |
Merge branch 'linus' into core/softirq (tag: tip-core-softirq-2008-06-25_10.23_Wed)
49 files changed, 646 insertions, 499 deletions
| diff --git a/Documentation/DocBook/kgdb.tmpl b/Documentation/DocBook/kgdb.tmpl index 028a8444d95e..e8acd1f03456 100644 --- a/Documentation/DocBook/kgdb.tmpl +++ b/Documentation/DocBook/kgdb.tmpl @@ -84,10 +84,9 @@      runs an instance of gdb against the vmlinux file which contains      the symbols (not boot image such as bzImage, zImage, uImage...).      In gdb the developer specifies the connection parameters and -    connects to kgdb.  Depending on which kgdb I/O modules exist in -    the kernel for a given architecture, it may be possible to debug -    the test machine's kernel with the development machine using a -    rs232 or ethernet connection. +    connects to kgdb.  The type of connection a developer makes with +    gdb depends on the availability of kgdb I/O modules compiled as +    builtin's or kernel modules in the test machine's kernel.      </para>    </chapter>    <chapter id="CompilingAKernel"> @@ -223,7 +222,7 @@    </para>    <para>    IMPORTANT NOTE: Using this option with kgdb over the console -  (kgdboc) or kgdb over ethernet (kgdboe) is not supported. +  (kgdboc) is not supported.    </para>    </sect1>    </chapter> @@ -249,18 +248,11 @@      (gdb) target remote /dev/ttyS0      </programlisting>      <para> -    Example (kgdb to a terminal server): +    Example (kgdb to a terminal server on tcp port 2012):      </para>      <programlisting>      % gdb ./vmlinux -    (gdb) target remote udp:192.168.2.2:6443 -    </programlisting> -    <para> -    Example (kgdb over ethernet): -    </para> -    <programlisting> -    % gdb ./vmlinux -    (gdb) target remote udp:192.168.2.2:6443 +    (gdb) target remote 192.168.2.2:2012      </programlisting>      <para>      Once connected, you can debug a kernel the way you would debug an @@ -1,7 +1,7 @@  VERSION = 2  PATCHLEVEL = 6  SUBLEVEL = 26 -EXTRAVERSION = -rc7 +EXTRAVERSION = -rc8  NAME = Rotary Wombat  # *DOCUMENTATION* diff --git a/arch/ia64/kernel/iosapic.c b/arch/ia64/kernel/iosapic.c index 082c31dcfd99..39752cdef6ff 100644 --- a/arch/ia64/kernel/iosapic.c +++ b/arch/ia64/kernel/iosapic.c @@ -558,8 +558,6 @@ static struct iosapic_rte_info * __init_refok iosapic_alloc_rte (void)  	if (!iosapic_kmalloc_ok && list_empty(&free_rte_list)) {  		rte = alloc_bootmem(sizeof(struct iosapic_rte_info) *  				    NR_PREALLOCATE_RTE_ENTRIES); -		if (!rte) -			return NULL;  		for (i = 0; i < NR_PREALLOCATE_RTE_ENTRIES; i++, rte++)  			list_add(&rte->rte_list, &free_rte_list);  	} diff --git a/arch/ia64/kernel/setup.c b/arch/ia64/kernel/setup.c index f48a809c686d..4ae15c8c2488 100644 --- a/arch/ia64/kernel/setup.c +++ b/arch/ia64/kernel/setup.c @@ -578,8 +578,6 @@ setup_arch (char **cmdline_p)  	cpu_init();	/* initialize the bootstrap CPU */  	mmu_context_init();	/* initialize context_id bitmap */ -	check_sal_cache_flush(); -  #ifdef CONFIG_ACPI  	acpi_boot_init();  #endif @@ -607,6 +605,7 @@ setup_arch (char **cmdline_p)  		ia64_mca_init();  	platform_setup(cmdline_p); +	check_sal_cache_flush();  	paging_init();  } diff --git a/arch/ia64/sn/kernel/sn2/sn2_smp.c b/arch/ia64/sn/kernel/sn2/sn2_smp.c index 6dd886c5d860..e585f9a2afb9 100644 --- a/arch/ia64/sn/kernel/sn2/sn2_smp.c +++ b/arch/ia64/sn/kernel/sn2/sn2_smp.c @@ -512,7 +512,7 @@ static ssize_t sn2_ptc_proc_write(struct file *file, const char __user *user, si  	int cpu;  	char optstr[64]; -	if (count > sizeof(optstr)) +	if (count == 0 || count > sizeof(optstr))  		return -EINVAL;  	if (copy_from_user(optstr, user, count))  		return -EFAULT; diff --git a/arch/x86/Kconfig 
b/arch/x86/Kconfig index 52e18e6d2ba0..e0edaaa6920a 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -383,6 +383,7 @@ config VMI  config KVM_CLOCK  	bool "KVM paravirtualized clock"  	select PARAVIRT +	select PARAVIRT_CLOCK  	depends on !(X86_VISWS || X86_VOYAGER)  	help  	  Turning on this option will allow you to run a paravirtualized clock @@ -410,6 +411,10 @@ config PARAVIRT  	  over full virtualization.  However, when run without a hypervisor  	  the kernel is theoretically slower and slightly larger. +config PARAVIRT_CLOCK +	bool +	default n +  endif  config MEMTEST_BOOTPARAM diff --git a/arch/x86/kernel/Makefile b/arch/x86/kernel/Makefile index 5e618c3b4720..77807d4769c9 100644 --- a/arch/x86/kernel/Makefile +++ b/arch/x86/kernel/Makefile @@ -82,6 +82,7 @@ obj-$(CONFIG_VMI)		+= vmi_32.o vmiclock_32.o  obj-$(CONFIG_KVM_GUEST)		+= kvm.o  obj-$(CONFIG_KVM_CLOCK)		+= kvmclock.o  obj-$(CONFIG_PARAVIRT)		+= paravirt.o paravirt_patch_$(BITS).o +obj-$(CONFIG_PARAVIRT_CLOCK)	+= pvclock.o  obj-$(CONFIG_PCSPKR_PLATFORM)	+= pcspeaker.o diff --git a/arch/x86/kernel/kvmclock.c b/arch/x86/kernel/kvmclock.c index 08a30986d472..87edf1ceb1df 100644 --- a/arch/x86/kernel/kvmclock.c +++ b/arch/x86/kernel/kvmclock.c @@ -18,6 +18,7 @@  #include <linux/clocksource.h>  #include <linux/kvm_para.h> +#include <asm/pvclock.h>  #include <asm/arch_hooks.h>  #include <asm/msr.h>  #include <asm/apic.h> @@ -36,18 +37,9 @@ static int parse_no_kvmclock(char *arg)  early_param("no-kvmclock", parse_no_kvmclock);  /* The hypervisor will put information about time periodically here */ -static DEFINE_PER_CPU_SHARED_ALIGNED(struct kvm_vcpu_time_info, hv_clock); -#define get_clock(cpu, field) per_cpu(hv_clock, cpu).field +static DEFINE_PER_CPU_SHARED_ALIGNED(struct pvclock_vcpu_time_info, hv_clock); +static struct pvclock_wall_clock wall_clock; -static inline u64 kvm_get_delta(u64 last_tsc) -{ -	int cpu = smp_processor_id(); -	u64 delta = native_read_tsc() - last_tsc; -	return (delta * get_clock(cpu, tsc_to_system_mul)) >> KVM_SCALE; -} - -static struct kvm_wall_clock wall_clock; -static cycle_t kvm_clock_read(void);  /*   * The wallclock is the time of day when we booted. Since then, some time may   * have elapsed since the hypervisor wrote the data. So we try to account for @@ -55,64 +47,37 @@ static cycle_t kvm_clock_read(void);   */  static unsigned long kvm_get_wallclock(void)  { -	u32 wc_sec, wc_nsec; -	u64 delta; +	struct pvclock_vcpu_time_info *vcpu_time;  	struct timespec ts; -	int version, nsec;  	int low, high;  	low = (int)__pa(&wall_clock);  	high = ((u64)__pa(&wall_clock) >> 32); +	native_write_msr(MSR_KVM_WALL_CLOCK, low, high); -	delta = kvm_clock_read(); +	vcpu_time = &get_cpu_var(hv_clock); +	pvclock_read_wallclock(&wall_clock, vcpu_time, &ts); +	put_cpu_var(hv_clock); -	native_write_msr(MSR_KVM_WALL_CLOCK, low, high); -	do { -		version = wall_clock.wc_version; -		rmb(); -		wc_sec = wall_clock.wc_sec; -		wc_nsec = wall_clock.wc_nsec; -		rmb(); -	} while ((wall_clock.wc_version != version) || (version & 1)); - -	delta = kvm_clock_read() - delta; -	delta += wc_nsec; -	nsec = do_div(delta, NSEC_PER_SEC); -	set_normalized_timespec(&ts, wc_sec + delta, nsec); -	/* -	 * Of all mechanisms of time adjustment I've tested, this one -	 * was the champion! -	 */ -	return ts.tv_sec + 1; +	return ts.tv_sec;  }  static int kvm_set_wallclock(unsigned long now)  { -	return 0; +	return -1;  } -/* - * This is our read_clock function. The host puts an tsc timestamp each time - * it updates a new time. 
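The low/high split in kvm_get_wallclock() above is just a 64-bit guest-physical address fed to a 64-bit MSR as two 32-bit halves; in kvm_register_clock() later in this patch, bit 0 of the value additionally acts as an enable flag (the "| 1"). A minimal sketch of that encoding; the helper name is illustrative, not kernel API:

    #include <stdint.h>

    /* Illustrative helper (not kernel code): encode a guest-physical
     * address for wrmsr as two 32-bit halves, optionally setting bit 0
     * as the enable flag, the way kvm_register_clock() does with "| 1". */
    static void encode_msr_addr(uint64_t gpa, int enable,
                                uint32_t *low, uint32_t *high)
    {
        *low  = (uint32_t)gpa | (enable ? 1u : 0u);  /* bits 0..31 + enable */
        *high = (uint32_t)(gpa >> 32);               /* bits 32..63 */
    }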
Without the tsc adjustment, we can have a situation - * in which a vcpu starts to run earlier (smaller system_time), but probes - * time later (compared to another vcpu), leading to backwards time - */  static cycle_t kvm_clock_read(void)  { -	u64 last_tsc, now; -	int cpu; +	struct pvclock_vcpu_time_info *src; +	cycle_t ret; -	preempt_disable(); -	cpu = smp_processor_id(); - -	last_tsc = get_clock(cpu, tsc_timestamp); -	now = get_clock(cpu, system_time); - -	now += kvm_get_delta(last_tsc); -	preempt_enable(); - -	return now; +	src = &get_cpu_var(hv_clock); +	ret = pvclock_clocksource_read(src); +	put_cpu_var(hv_clock); +	return ret;  } +  static struct clocksource kvm_clock = {  	.name = "kvm-clock",  	.read = kvm_clock_read, @@ -123,13 +88,14 @@ static struct clocksource kvm_clock = {  	.flags = CLOCK_SOURCE_IS_CONTINUOUS,  }; -static int kvm_register_clock(void) +static int kvm_register_clock(char *txt)  {  	int cpu = smp_processor_id();  	int low, high;  	low = (int)__pa(&per_cpu(hv_clock, cpu)) | 1;  	high = ((u64)__pa(&per_cpu(hv_clock, cpu)) >> 32); - +	printk(KERN_INFO "kvm-clock: cpu %d, msr %x:%x, %s\n", +	       cpu, high, low, txt);  	return native_write_msr_safe(MSR_KVM_SYSTEM_TIME, low, high);  } @@ -140,12 +106,20 @@ static void kvm_setup_secondary_clock(void)  	 * Now that the first cpu already had this clocksource initialized,  	 * we shouldn't fail.  	 */ -	WARN_ON(kvm_register_clock()); +	WARN_ON(kvm_register_clock("secondary cpu clock"));  	/* ok, done with our trickery, call native */  	setup_secondary_APIC_clock();  }  #endif +#ifdef CONFIG_SMP +void __init kvm_smp_prepare_boot_cpu(void) +{ +	WARN_ON(kvm_register_clock("primary cpu clock")); +	native_smp_prepare_boot_cpu(); +} +#endif +  /*   * After the clock is registered, the host will keep writing to the   * registered memory location. If the guest happens to shutdown, this memory @@ -174,7 +148,7 @@ void __init kvmclock_init(void)  		return;  	if (kvmclock && kvm_para_has_feature(KVM_FEATURE_CLOCKSOURCE)) { -		if (kvm_register_clock()) +		if (kvm_register_clock("boot clock"))  			return;  		pv_time_ops.get_wallclock = kvm_get_wallclock;  		pv_time_ops.set_wallclock = kvm_set_wallclock; @@ -182,6 +156,9 @@ void __init kvmclock_init(void)  #ifdef CONFIG_X86_LOCAL_APIC  		pv_apic_ops.setup_secondary_clock = kvm_setup_secondary_clock;  #endif +#ifdef CONFIG_SMP +		smp_ops.smp_prepare_boot_cpu = kvm_smp_prepare_boot_cpu; +#endif  		machine_ops.shutdown  = kvm_shutdown;  #ifdef CONFIG_KEXEC  		machine_ops.crash_shutdown  = kvm_crash_shutdown; diff --git a/arch/x86/kernel/pvclock.c b/arch/x86/kernel/pvclock.c new file mode 100644 index 000000000000..05fbe9a0325a --- /dev/null +++ b/arch/x86/kernel/pvclock.c @@ -0,0 +1,141 @@ +/*  paravirtual clock -- common code used by kvm/xen + +    This program is free software; you can redistribute it and/or modify +    it under the terms of the GNU General Public License as published by +    the Free Software Foundation; either version 2 of the License, or +    (at your option) any later version. + +    This program is distributed in the hope that it will be useful, +    but WITHOUT ANY WARRANTY; without even the implied warranty of +    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the +    GNU General Public License for more details. 
+ +    You should have received a copy of the GNU General Public License +    along with this program; if not, write to the Free Software +    Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA +*/ + +#include <linux/kernel.h> +#include <linux/percpu.h> +#include <asm/pvclock.h> + +/* + * These are perodically updated + *    xen: magic shared_info page + *    kvm: gpa registered via msr + * and then copied here. + */ +struct pvclock_shadow_time { +	u64 tsc_timestamp;     /* TSC at last update of time vals.  */ +	u64 system_timestamp;  /* Time, in nanosecs, since boot.    */ +	u32 tsc_to_nsec_mul; +	int tsc_shift; +	u32 version; +}; + +/* + * Scale a 64-bit delta by scaling and multiplying by a 32-bit fraction, + * yielding a 64-bit result. + */ +static inline u64 scale_delta(u64 delta, u32 mul_frac, int shift) +{ +	u64 product; +#ifdef __i386__ +	u32 tmp1, tmp2; +#endif + +	if (shift < 0) +		delta >>= -shift; +	else +		delta <<= shift; + +#ifdef __i386__ +	__asm__ ( +		"mul  %5       ; " +		"mov  %4,%%eax ; " +		"mov  %%edx,%4 ; " +		"mul  %5       ; " +		"xor  %5,%5    ; " +		"add  %4,%%eax ; " +		"adc  %5,%%edx ; " +		: "=A" (product), "=r" (tmp1), "=r" (tmp2) +		: "a" ((u32)delta), "1" ((u32)(delta >> 32)), "2" (mul_frac) ); +#elif __x86_64__ +	__asm__ ( +		"mul %%rdx ; shrd $32,%%rdx,%%rax" +		: "=a" (product) : "0" (delta), "d" ((u64)mul_frac) ); +#else +#error implement me! +#endif + +	return product; +} + +static u64 pvclock_get_nsec_offset(struct pvclock_shadow_time *shadow) +{ +	u64 delta = native_read_tsc() - shadow->tsc_timestamp; +	return scale_delta(delta, shadow->tsc_to_nsec_mul, shadow->tsc_shift); +} + +/* + * Reads a consistent set of time-base values from hypervisor, + * into a shadow data area. + */ +static unsigned pvclock_get_time_values(struct pvclock_shadow_time *dst, +					struct pvclock_vcpu_time_info *src) +{ +	do { +		dst->version = src->version; +		rmb();		/* fetch version before data */ +		dst->tsc_timestamp     = src->tsc_timestamp; +		dst->system_timestamp  = src->system_time; +		dst->tsc_to_nsec_mul   = src->tsc_to_system_mul; +		dst->tsc_shift         = src->tsc_shift; +		rmb();		/* test version after fetching data */ +	} while ((src->version & 1) || (dst->version != src->version)); + +	return dst->version; +} + +cycle_t pvclock_clocksource_read(struct pvclock_vcpu_time_info *src) +{ +	struct pvclock_shadow_time shadow; +	unsigned version; +	cycle_t ret, offset; + +	do { +		version = pvclock_get_time_values(&shadow, src); +		barrier(); +		offset = pvclock_get_nsec_offset(&shadow); +		ret = shadow.system_timestamp + offset; +		barrier(); +	} while (version != src->version); + +	return ret; +} + +void pvclock_read_wallclock(struct pvclock_wall_clock *wall_clock, +			    struct pvclock_vcpu_time_info *vcpu_time, +			    struct timespec *ts) +{ +	u32 version; +	u64 delta; +	struct timespec now; + +	/* get wallclock at system boot */ +	do { +		version = wall_clock->version; +		rmb();		/* fetch version before time */ +		now.tv_sec  = wall_clock->sec; +		now.tv_nsec = wall_clock->nsec; +		rmb();		/* fetch time before checking version */ +	} while ((wall_clock->version & 1) || (version != wall_clock->version)); + +	delta = pvclock_clocksource_read(vcpu_time);	/* time since system boot */ +	delta += now.tv_sec * (u64)NSEC_PER_SEC + now.tv_nsec; + +	now.tv_nsec = do_div(delta, NSEC_PER_SEC); +	now.tv_sec = delta; + +	set_normalized_timespec(ts, now.tv_sec, now.tv_nsec); +} diff --git a/arch/x86/kvm/i8254.c b/arch/x86/kvm/i8254.c index 
f2f5d260874e..3829aa7b663f 100644 --- a/arch/x86/kvm/i8254.c +++ b/arch/x86/kvm/i8254.c @@ -200,9 +200,12 @@ int __pit_timer_fn(struct kvm_kpit_state *ps)  	atomic_inc(&pt->pending);  	smp_mb__after_atomic_inc(); -	if (vcpu0 && waitqueue_active(&vcpu0->wq)) { -		vcpu0->arch.mp_state = KVM_MP_STATE_RUNNABLE; -		wake_up_interruptible(&vcpu0->wq); +	if (vcpu0) { +		set_bit(KVM_REQ_PENDING_TIMER, &vcpu0->requests); +		if (waitqueue_active(&vcpu0->wq)) { +			vcpu0->arch.mp_state = KVM_MP_STATE_RUNNABLE; +			wake_up_interruptible(&vcpu0->wq); +		}  	}  	pt->timer.expires = ktime_add_ns(pt->timer.expires, pt->period); diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c index c297c50eba63..ebc03f5ae162 100644 --- a/arch/x86/kvm/lapic.c +++ b/arch/x86/kvm/lapic.c @@ -940,6 +940,7 @@ static int __apic_timer_fn(struct kvm_lapic *apic)  	wait_queue_head_t *q = &apic->vcpu->wq;  	atomic_inc(&apic->timer.pending); +	set_bit(KVM_REQ_PENDING_TIMER, &apic->vcpu->requests);  	if (waitqueue_active(q)) {  		apic->vcpu->arch.mp_state = KVM_MP_STATE_RUNNABLE;  		wake_up_interruptible(q); diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index ee3f53098f0c..7e7c3969f7a2 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -640,6 +640,7 @@ static void rmap_write_protect(struct kvm *kvm, u64 gfn)  			rmap_remove(kvm, spte);  			--kvm->stat.lpages;  			set_shadow_pte(spte, shadow_trap_nonpresent_pte); +			spte = NULL;  			write_protected = 1;  		}  		spte = rmap_next(kvm, rmapp, spte); @@ -1082,10 +1083,6 @@ static void mmu_set_spte(struct kvm_vcpu *vcpu, u64 *shadow_pte,  		struct kvm_mmu_page *shadow;  		spte |= PT_WRITABLE_MASK; -		if (user_fault) { -			mmu_unshadow(vcpu->kvm, gfn); -			goto unshadowed; -		}  		shadow = kvm_mmu_lookup_page(vcpu->kvm, gfn);  		if (shadow || @@ -1102,8 +1099,6 @@ static void mmu_set_spte(struct kvm_vcpu *vcpu, u64 *shadow_pte,  		}  	} -unshadowed: -  	if (pte_access & ACC_WRITE_MASK)  		mark_page_dirty(vcpu->kvm, gfn); @@ -1580,11 +1575,13 @@ static void mmu_pte_write_new_pte(struct kvm_vcpu *vcpu,  				  u64 *spte,  				  const void *new)  { -	if ((sp->role.level != PT_PAGE_TABLE_LEVEL) -	    && !vcpu->arch.update_pte.largepage) { -		++vcpu->kvm->stat.mmu_pde_zapped; -		return; -	} +	if (sp->role.level != PT_PAGE_TABLE_LEVEL) { +		if (!vcpu->arch.update_pte.largepage || +		    sp->role.glevels == PT32_ROOT_LEVEL) { +			++vcpu->kvm->stat.mmu_pde_zapped; +			return; +		} +        }  	++vcpu->kvm->stat.mmu_pte_updated;  	if (sp->role.glevels == PT32_ROOT_LEVEL) diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index 02efbe75f317..540e95179074 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -566,7 +566,7 @@ static void vmx_save_host_state(struct kvm_vcpu *vcpu)  	load_transition_efer(vmx);  } -static void vmx_load_host_state(struct vcpu_vmx *vmx) +static void __vmx_load_host_state(struct vcpu_vmx *vmx)  {  	unsigned long flags; @@ -596,6 +596,13 @@ static void vmx_load_host_state(struct vcpu_vmx *vmx)  	reload_host_efer(vmx);  } +static void vmx_load_host_state(struct vcpu_vmx *vmx) +{ +	preempt_disable(); +	__vmx_load_host_state(vmx); +	preempt_enable(); +} +  /*   * Switches to specified vcpu, until a matching vcpu_put(), but assumes   * vcpu mutex is already taken. 
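The new pvclock code above relies on a lock-free version-counter handshake: the writer makes the version odd while the fields are in flux and even again once they are consistent, and a reader retries whenever it saw an odd value or the counter changed underneath it. A self-contained sketch of both sides, with __sync_synchronize() standing in for the kernel's rmb()/wmb() barriers (all names here are illustrative):

    #include <stdint.h>

    /* Shared record: one writer, many readers. */
    struct time_record {
        volatile uint32_t version;      /* odd while an update is in flight */
        volatile uint64_t tsc_timestamp;
        volatile uint64_t system_time;
    };

    /* Writer: bump to odd, update, bump back to even -- cf. the
     * "version += 2" in kvm_write_guest_time() later in this patch. */
    static void publish(struct time_record *r, uint64_t tsc, uint64_t sys)
    {
        r->version++;                   /* now odd: readers will retry */
        __sync_synchronize();           /* order version before data */
        r->tsc_timestamp = tsc;
        r->system_time = sys;
        __sync_synchronize();           /* order data before version */
        r->version++;                   /* even again: snapshot is valid */
    }

    /* Reader: the same loop pvclock_get_time_values() runs above. */
    static void snapshot(const struct time_record *r, struct time_record *out)
    {
        uint32_t v;
        do {
            v = r->version;
            __sync_synchronize();       /* fetch version before data */
            out->tsc_timestamp = r->tsc_timestamp;
            out->system_time = r->system_time;
            __sync_synchronize();       /* fetch data before re-check */
        } while ((v & 1) || v != r->version);
        out->version = v;
    }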
@@ -654,7 +661,7 @@ static void vmx_vcpu_load(struct kvm_vcpu *vcpu, int cpu)  static void vmx_vcpu_put(struct kvm_vcpu *vcpu)  { -	vmx_load_host_state(to_vmx(vcpu)); +	__vmx_load_host_state(to_vmx(vcpu));  }  static void vmx_fpu_activate(struct kvm_vcpu *vcpu) @@ -884,11 +891,8 @@ static int vmx_set_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 data)  	switch (msr_index) {  #ifdef CONFIG_X86_64  	case MSR_EFER: +		vmx_load_host_state(vmx);  		ret = kvm_set_msr_common(vcpu, msr_index, data); -		if (vmx->host_state.loaded) { -			reload_host_efer(vmx); -			load_transition_efer(vmx); -		}  		break;  	case MSR_FS_BASE:  		vmcs_writel(GUEST_FS_BASE, data); @@ -910,11 +914,10 @@ static int vmx_set_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 data)  		guest_write_tsc(data);  		break;  	default: +		vmx_load_host_state(vmx);  		msr = find_msr_entry(vmx, msr_index);  		if (msr) {  			msr->data = data; -			if (vmx->host_state.loaded) -				load_msrs(vmx->guest_msrs, vmx->save_nmsrs);  			break;  		}  		ret = kvm_set_msr_common(vcpu, msr_index, data); diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 00acf1301a15..63a77caa59f1 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -492,8 +492,8 @@ static int do_set_msr(struct kvm_vcpu *vcpu, unsigned index, u64 *data)  static void kvm_write_wall_clock(struct kvm *kvm, gpa_t wall_clock)  {  	static int version; -	struct kvm_wall_clock wc; -	struct timespec wc_ts; +	struct pvclock_wall_clock wc; +	struct timespec now, sys, boot;  	if (!wall_clock)  		return; @@ -502,10 +502,19 @@ static void kvm_write_wall_clock(struct kvm *kvm, gpa_t wall_clock)  	kvm_write_guest(kvm, wall_clock, &version, sizeof(version)); -	wc_ts = current_kernel_time(); -	wc.wc_sec = wc_ts.tv_sec; -	wc.wc_nsec = wc_ts.tv_nsec; -	wc.wc_version = version; +	/* +	 * The guest calculates current wall clock time by adding +	 * system time (updated by kvm_write_guest_time below) to the +	 * wall clock specified here.  guest system time equals host +	 * system time for us, thus we must fill in host boot time here. 
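The comment above spells out the arithmetic: the host publishes its boot time (wall-clock time minus monotonic time since boot), and the guest adds its own system time to recover the current wall clock. A rough user-space analogue, under the assumption that CLOCK_MONOTONIC approximates time since boot:

    #include <stdio.h>
    #include <time.h>

    /* Sketch of the boot-time calculation the code below performs with
     * current_kernel_time()/ktime_get_ts(): boot = wall - uptime.
     * CLOCK_MONOTONIC is only an approximation of "time since boot". */
    int main(void)
    {
        struct timespec now, sys;
        clock_gettime(CLOCK_REALTIME, &now);   /* wall clock */
        clock_gettime(CLOCK_MONOTONIC, &sys);  /* ~time since boot */

        long long boot_ns = (now.tv_sec - sys.tv_sec) * 1000000000LL
                          + (now.tv_nsec - sys.tv_nsec);
        printf("boot epoch ~ %lld ns after 1970\n", boot_ns);
        return 0;
    }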
+	 */ +	now = current_kernel_time(); +	ktime_get_ts(&sys); +	boot = ns_to_timespec(timespec_to_ns(&now) - timespec_to_ns(&sys)); + +	wc.sec = boot.tv_sec; +	wc.nsec = boot.tv_nsec; +	wc.version = version;  	kvm_write_guest(kvm, wall_clock, &wc, sizeof(wc)); @@ -513,6 +522,45 @@ static void kvm_write_wall_clock(struct kvm *kvm, gpa_t wall_clock)  	kvm_write_guest(kvm, wall_clock, &version, sizeof(version));  } +static uint32_t div_frac(uint32_t dividend, uint32_t divisor) +{ +	uint32_t quotient, remainder; + +	/* Don't try to replace with do_div(), this one calculates +	 * "(dividend << 32) / divisor" */ +	__asm__ ( "divl %4" +		  : "=a" (quotient), "=d" (remainder) +		  : "0" (0), "1" (dividend), "r" (divisor) ); +	return quotient; +} + +static void kvm_set_time_scale(uint32_t tsc_khz, struct pvclock_vcpu_time_info *hv_clock) +{ +	uint64_t nsecs = 1000000000LL; +	int32_t  shift = 0; +	uint64_t tps64; +	uint32_t tps32; + +	tps64 = tsc_khz * 1000LL; +	while (tps64 > nsecs*2) { +		tps64 >>= 1; +		shift--; +	} + +	tps32 = (uint32_t)tps64; +	while (tps32 <= (uint32_t)nsecs) { +		tps32 <<= 1; +		shift++; +	} + +	hv_clock->tsc_shift = shift; +	hv_clock->tsc_to_system_mul = div_frac(nsecs, tps32); + +	pr_debug("%s: tsc_khz %u, tsc_shift %d, tsc_mul %u\n", +		 __FUNCTION__, tsc_khz, hv_clock->tsc_shift, +		 hv_clock->tsc_to_system_mul); +} +  static void kvm_write_guest_time(struct kvm_vcpu *v)  {  	struct timespec ts; @@ -523,6 +571,11 @@ static void kvm_write_guest_time(struct kvm_vcpu *v)  	if ((!vcpu->time_page))  		return; +	if (unlikely(vcpu->hv_clock_tsc_khz != tsc_khz)) { +		kvm_set_time_scale(tsc_khz, &vcpu->hv_clock); +		vcpu->hv_clock_tsc_khz = tsc_khz; +	} +  	/* Keep irq disabled to prevent changes to the clock */  	local_irq_save(flags);  	kvm_get_msr(v, MSR_IA32_TIME_STAMP_COUNTER, @@ -537,14 +590,14 @@ static void kvm_write_guest_time(struct kvm_vcpu *v)  	/*  	 * The interface expects us to write an even number signaling that the  	 * update is finished. Since the guest won't see the intermediate -	 * state, we just write "2" at the end +	 * state, we just increase by 2 at the end.  	 
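div_frac() above computes (dividend << 32) / divisor in inline assembly because i386 C has no 64/32 divide with a 64-bit dividend; on top of it, kvm_set_time_scale() normalizes an arbitrary TSC rate into a 32.32 fixed-point multiplier plus a shift. A portable sketch of both, driven with a hypothetical 2.4 GHz TSC:

    #include <stdint.h>
    #include <stdio.h>

    /* Portable equivalent of div_frac(): (dividend << 32) / divisor. */
    static uint32_t div_frac_portable(uint32_t dividend, uint32_t divisor)
    {
        return (uint32_t)(((uint64_t)dividend << 32) / divisor);
    }

    /* Same normalization as kvm_set_time_scale(): halve/double the TSC
     * rate until it sits in (1e9, 2e9], recording the shifts, then derive
     * the 32.32 multiplier against one second in nanoseconds. */
    static void time_scale(uint32_t tsc_khz, uint32_t *mul, int *shift)
    {
        const uint64_t nsecs = 1000000000ULL;
        uint64_t tps64 = (uint64_t)tsc_khz * 1000;
        uint32_t tps32;
        int s = 0;

        while (tps64 > nsecs * 2) {
            tps64 >>= 1;
            s--;
        }
        tps32 = (uint32_t)tps64;
        while (tps32 <= (uint32_t)nsecs) {
            tps32 <<= 1;
            s++;
        }

        *mul = div_frac_portable((uint32_t)nsecs, tps32);
        *shift = s;
    }

    int main(void)
    {
        uint32_t mul;
        int shift;

        time_scale(2400000, &mul, &shift);   /* hypothetical 2.4 GHz TSC */
        printf("tsc_to_system_mul=%u tsc_shift=%d\n", mul, shift);
        return 0;
    }

For 2.4 GHz this yields tsc_to_system_mul = 3579139413 and tsc_shift = -1, i.e. nanoseconds = (tsc_delta >> 1) * (3579139413 / 2^32).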
*/ -	vcpu->hv_clock.version = 2; +	vcpu->hv_clock.version += 2;  	shared_kaddr = kmap_atomic(vcpu->time_page, KM_USER0);  	memcpy(shared_kaddr + vcpu->time_offset, &vcpu->hv_clock, -		sizeof(vcpu->hv_clock)); +	       sizeof(vcpu->hv_clock));  	kunmap_atomic(shared_kaddr, KM_USER0); @@ -599,10 +652,6 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 data)  		/* ...but clean it before doing the actual write */  		vcpu->arch.time_offset = data & ~(PAGE_MASK | 1); -		vcpu->arch.hv_clock.tsc_to_system_mul = -					clocksource_khz2mult(tsc_khz, 22); -		vcpu->arch.hv_clock.tsc_shift = 22; -  		down_read(&current->mm->mmap_sem);  		vcpu->arch.time_page =  				gfn_to_page(vcpu->kvm, data >> PAGE_SHIFT); @@ -2759,6 +2808,8 @@ again:  	if (vcpu->requests) {  		if (test_and_clear_bit(KVM_REQ_MIGRATE_TIMER, &vcpu->requests))  			__kvm_migrate_timers(vcpu); +		if (test_and_clear_bit(KVM_REQ_TLB_FLUSH, &vcpu->requests)) +			kvm_x86_ops->tlb_flush(vcpu);  		if (test_and_clear_bit(KVM_REQ_REPORT_TPR_ACCESS,  				       &vcpu->requests)) {  			kvm_run->exit_reason = KVM_EXIT_TPR_ACCESS; @@ -2772,6 +2823,7 @@ again:  		}  	} +	clear_bit(KVM_REQ_PENDING_TIMER, &vcpu->requests);  	kvm_inject_pending_timer_irqs(vcpu);  	preempt_disable(); @@ -2781,21 +2833,13 @@ again:  	local_irq_disable(); -	if (need_resched()) { +	if (vcpu->requests || need_resched()) {  		local_irq_enable();  		preempt_enable();  		r = 1;  		goto out;  	} -	if (vcpu->requests) -		if (test_bit(KVM_REQ_MMU_RELOAD, &vcpu->requests)) { -			local_irq_enable(); -			preempt_enable(); -			r = 1; -			goto out; -		} -  	if (signal_pending(current)) {  		local_irq_enable();  		preempt_enable(); @@ -2825,9 +2869,6 @@ again:  	kvm_guest_enter(); -	if (vcpu->requests) -		if (test_and_clear_bit(KVM_REQ_TLB_FLUSH, &vcpu->requests)) -			kvm_x86_ops->tlb_flush(vcpu);  	KVMTRACE_0D(VMENTRY, vcpu, entryexit);  	kvm_x86_ops->run(vcpu, kvm_run); diff --git a/arch/x86/xen/Kconfig b/arch/x86/xen/Kconfig index 2e641be2737e..6c388e593bc8 100644 --- a/arch/x86/xen/Kconfig +++ b/arch/x86/xen/Kconfig @@ -5,8 +5,9 @@  config XEN  	bool "Xen guest support"  	select PARAVIRT +	select PARAVIRT_CLOCK  	depends on X86_32 -	depends on X86_CMPXCHG && X86_TSC && !(X86_VISWS || X86_VOYAGER) +	depends on X86_CMPXCHG && X86_TSC && X86_PAE && !(X86_VISWS || X86_VOYAGER)  	help  	  This is the Linux Xen port.  Enabling this will allow the  	  kernel to boot in a paravirtualized environment under the diff --git a/arch/x86/xen/enlighten.c b/arch/x86/xen/enlighten.c index c8a56e457d61..f09c1c69c37a 100644 --- a/arch/x86/xen/enlighten.c +++ b/arch/x86/xen/enlighten.c @@ -785,38 +785,35 @@ static __init void xen_set_pte_init(pte_t *ptep, pte_t pte)  static __init void xen_pagetable_setup_start(pgd_t *base)  {  	pgd_t *xen_pgd = (pgd_t *)xen_start_info->pt_base; +	int i;  	/* special set_pte for pagetable initialization */  	pv_mmu_ops.set_pte = xen_set_pte_init;  	init_mm.pgd = base;  	/* -	 * copy top-level of Xen-supplied pagetable into place.	 For -	 * !PAE we can use this as-is, but for PAE it is a stand-in -	 * while we copy the pmd pages. +	 * copy top-level of Xen-supplied pagetable into place.  This +	 * is a stand-in while we copy the pmd pages.  	 */  	memcpy(base, xen_pgd, PTRS_PER_PGD * sizeof(pgd_t)); -	if (PTRS_PER_PMD > 1) { -		int i; -		/* -		 * For PAE, need to allocate new pmds, rather than -		 * share Xen's, since Xen doesn't like pmd's being -		 * shared between address spaces.
-		 */ -		for (i = 0; i < PTRS_PER_PGD; i++) { -			if (pgd_val_ma(xen_pgd[i]) & _PAGE_PRESENT) { -				pmd_t *pmd = (pmd_t *)alloc_bootmem_low_pages(PAGE_SIZE); +	/* +	 * For PAE, need to allocate new pmds, rather than +	 * share Xen's, since Xen doesn't like pmd's being +	 * shared between address spaces. +	 */ +	for (i = 0; i < PTRS_PER_PGD; i++) { +		if (pgd_val_ma(xen_pgd[i]) & _PAGE_PRESENT) { +			pmd_t *pmd = (pmd_t *)alloc_bootmem_low_pages(PAGE_SIZE); -				memcpy(pmd, (void *)pgd_page_vaddr(xen_pgd[i]), -				       PAGE_SIZE); +			memcpy(pmd, (void *)pgd_page_vaddr(xen_pgd[i]), +			       PAGE_SIZE); -				make_lowmem_page_readonly(pmd); +			make_lowmem_page_readonly(pmd); -				set_pgd(&base[i], __pgd(1 + __pa(pmd))); -			} else -				pgd_clear(&base[i]); -		} +			set_pgd(&base[i], __pgd(1 + __pa(pmd))); +		} else +			pgd_clear(&base[i]);  	}  	/* make sure zero_page is mapped RO so we can use it in pagetables */ @@ -873,17 +870,7 @@ static __init void xen_pagetable_setup_done(pgd_t *base)  	/* Actually pin the pagetable down, but we can't set PG_pinned  	   yet because the page structures don't exist yet. */ -	{ -		unsigned level; - -#ifdef CONFIG_X86_PAE -		level = MMUEXT_PIN_L3_TABLE; -#else -		level = MMUEXT_PIN_L2_TABLE; -#endif - -		pin_pagetable_pfn(level, PFN_DOWN(__pa(base))); -	} +	pin_pagetable_pfn(MMUEXT_PIN_L3_TABLE, PFN_DOWN(__pa(base)));  }  /* This is called once we have the cpu_possible_map */ @@ -1093,7 +1080,6 @@ static const struct pv_mmu_ops xen_mmu_ops __initdata = {  	.make_pte = xen_make_pte,  	.make_pgd = xen_make_pgd, -#ifdef CONFIG_X86_PAE  	.set_pte_atomic = xen_set_pte_atomic,  	.set_pte_present = xen_set_pte_at,  	.set_pud = xen_set_pud, @@ -1102,7 +1088,6 @@ static const struct pv_mmu_ops xen_mmu_ops __initdata = {  	.make_pmd = xen_make_pmd,  	.pmd_val = xen_pmd_val, -#endif	/* PAE */  	.activate_mm = xen_activate_mm,  	.dup_mmap = xen_dup_mmap, @@ -1228,6 +1213,11 @@ asmlinkage void __init xen_start_kernel(void)  	if (xen_feature(XENFEAT_supervisor_mode_kernel))  		pv_info.kernel_rpl = 0; +	/* Prevent unwanted bits from being set in PTEs. */ +	__supported_pte_mask &= ~_PAGE_GLOBAL; +	if (!is_initial_xendomain()) +		__supported_pte_mask &= ~(_PAGE_PWT | _PAGE_PCD); +  	/* set the limit of our address space */  	xen_reserve_top(); diff --git a/arch/x86/xen/mmu.c b/arch/x86/xen/mmu.c index 3525ef523a74..df40bf74ea75 100644 --- a/arch/x86/xen/mmu.c +++ b/arch/x86/xen/mmu.c @@ -179,50 +179,56 @@ out:  		preempt_enable();  } -pteval_t xen_pte_val(pte_t pte) +/* Assume pteval_t is equivalent to all the other *val_t types. 
*/ +static pteval_t pte_mfn_to_pfn(pteval_t val) +{ +	if (val & _PAGE_PRESENT) { +		unsigned long mfn = (val & PTE_MASK) >> PAGE_SHIFT; +		pteval_t flags = val & ~PTE_MASK; +		val = (mfn_to_pfn(mfn) << PAGE_SHIFT) | flags; +	} + +	return val; +} + +static pteval_t pte_pfn_to_mfn(pteval_t val)  { -	pteval_t ret = pte.pte; +	if (val & _PAGE_PRESENT) { +		unsigned long pfn = (val & PTE_MASK) >> PAGE_SHIFT; +		pteval_t flags = val & ~PTE_MASK; +		val = (pfn_to_mfn(pfn) << PAGE_SHIFT) | flags; +	} -	if (ret & _PAGE_PRESENT) -		ret = machine_to_phys(XMADDR(ret)).paddr | _PAGE_PRESENT; +	return val; +} -	return ret; +pteval_t xen_pte_val(pte_t pte) +{ +	return pte_mfn_to_pfn(pte.pte);  }  pgdval_t xen_pgd_val(pgd_t pgd)  { -	pgdval_t ret = pgd.pgd; -	if (ret & _PAGE_PRESENT) -		ret = machine_to_phys(XMADDR(ret)).paddr | _PAGE_PRESENT; -	return ret; +	return pte_mfn_to_pfn(pgd.pgd);  }  pte_t xen_make_pte(pteval_t pte)  { -	if (pte & _PAGE_PRESENT) { -		pte = phys_to_machine(XPADDR(pte)).maddr; -		pte &= ~(_PAGE_PCD | _PAGE_PWT); -	} - -	return (pte_t){ .pte = pte }; +	pte = pte_pfn_to_mfn(pte); +	return native_make_pte(pte);  }  pgd_t xen_make_pgd(pgdval_t pgd)  { -	if (pgd & _PAGE_PRESENT) -		pgd = phys_to_machine(XPADDR(pgd)).maddr; - -	return (pgd_t){ pgd }; +	pgd = pte_pfn_to_mfn(pgd); +	return native_make_pgd(pgd);  }  pmdval_t xen_pmd_val(pmd_t pmd)  { -	pmdval_t ret = native_pmd_val(pmd); -	if (ret & _PAGE_PRESENT) -		ret = machine_to_phys(XMADDR(ret)).paddr | _PAGE_PRESENT; -	return ret; +	return pte_mfn_to_pfn(pmd.pmd);  } -#ifdef CONFIG_X86_PAE +  void xen_set_pud(pud_t *ptr, pud_t val)  {  	struct multicall_space mcs; @@ -267,17 +273,9 @@ void xen_pmd_clear(pmd_t *pmdp)  pmd_t xen_make_pmd(pmdval_t pmd)  { -	if (pmd & _PAGE_PRESENT) -		pmd = phys_to_machine(XPADDR(pmd)).maddr; - +	pmd = pte_pfn_to_mfn(pmd);  	return native_make_pmd(pmd);  } -#else  /* !PAE */ -void xen_set_pte(pte_t *ptep, pte_t pte) -{ -	*ptep = pte; -} -#endif	/* CONFIG_X86_PAE */  /*    (Yet another) pagetable walker.  This one is intended for pinning a @@ -430,8 +428,6 @@ static int pin_page(struct page *page, enum pt_level level)     read-only, and can be pinned. 
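The new pte_mfn_to_pfn()/pte_pfn_to_mfn() helpers above share one shape: if the entry is present, split it into a frame number and flag bits, translate the frame between machine and pseudo-physical space, and reassemble. A generic sketch of that shape; the mask value and the translate callback are illustrative stand-ins for PTE_MASK and mfn_to_pfn()/pfn_to_mfn():

    #include <stdint.h>

    #define EX_PAGE_SHIFT   12
    #define EX_PTE_MASK     0x000ffffffffff000ULL  /* frame bits (illustrative) */
    #define EX_PAGE_PRESENT 0x1ULL

    typedef uint64_t (*frame_translate_fn)(uint64_t frame);

    /* Translate the frame number of a present entry while preserving
     * the flag bits -- the pattern both helpers above follow. */
    static uint64_t translate_entry(uint64_t val, frame_translate_fn translate)
    {
        if (val & EX_PAGE_PRESENT) {
            uint64_t frame = (val & EX_PTE_MASK) >> EX_PAGE_SHIFT;
            uint64_t flags = val & ~EX_PTE_MASK;
            val = (translate(frame) << EX_PAGE_SHIFT) | flags;
        }
        return val;
    }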
*/  void xen_pgd_pin(pgd_t *pgd)  { -	unsigned level; -  	xen_mc_batch();  	if (pgd_walk(pgd, pin_page, TASK_SIZE)) { @@ -441,14 +437,7 @@ void xen_pgd_pin(pgd_t *pgd)  		xen_mc_batch();  	} -#ifdef CONFIG_X86_PAE -	level = MMUEXT_PIN_L3_TABLE; -#else -	level = MMUEXT_PIN_L2_TABLE; -#endif - -	xen_do_pin(level, PFN_DOWN(__pa(pgd))); - +	xen_do_pin(MMUEXT_PIN_L3_TABLE, PFN_DOWN(__pa(pgd)));  	xen_mc_issue(0);  } diff --git a/arch/x86/xen/mmu.h b/arch/x86/xen/mmu.h index b5e189b1519d..5fe961caffd4 100644 --- a/arch/x86/xen/mmu.h +++ b/arch/x86/xen/mmu.h @@ -37,14 +37,13 @@ void xen_exit_mmap(struct mm_struct *mm);  void xen_pgd_pin(pgd_t *pgd);  //void xen_pgd_unpin(pgd_t *pgd); -#ifdef CONFIG_X86_PAE -unsigned long long xen_pte_val(pte_t); -unsigned long long xen_pmd_val(pmd_t); -unsigned long long xen_pgd_val(pgd_t); +pteval_t xen_pte_val(pte_t); +pmdval_t xen_pmd_val(pmd_t); +pgdval_t xen_pgd_val(pgd_t); -pte_t xen_make_pte(unsigned long long); -pmd_t xen_make_pmd(unsigned long long); -pgd_t xen_make_pgd(unsigned long long); +pte_t xen_make_pte(pteval_t); +pmd_t xen_make_pmd(pmdval_t); +pgd_t xen_make_pgd(pgdval_t);  void xen_set_pte_at(struct mm_struct *mm, unsigned long addr,  		    pte_t *ptep, pte_t pteval); @@ -53,15 +52,4 @@ void xen_set_pud(pud_t *ptr, pud_t val);  void xen_pte_clear(struct mm_struct *mm, unsigned long addr, pte_t *ptep);  void xen_pmd_clear(pmd_t *pmdp); - -#else -unsigned long xen_pte_val(pte_t); -unsigned long xen_pmd_val(pmd_t); -unsigned long xen_pgd_val(pgd_t); - -pte_t xen_make_pte(unsigned long); -pmd_t xen_make_pmd(unsigned long); -pgd_t xen_make_pgd(unsigned long); -#endif -  #endif	/* _XEN_MMU_H */ diff --git a/arch/x86/xen/time.c b/arch/x86/xen/time.c index 52b2e3856980..41e217503c96 100644 --- a/arch/x86/xen/time.c +++ b/arch/x86/xen/time.c @@ -14,6 +14,7 @@  #include <linux/kernel_stat.h>  #include <linux/math64.h> +#include <asm/pvclock.h>  #include <asm/xen/hypervisor.h>  #include <asm/xen/hypercall.h> @@ -31,17 +32,6 @@  static cycle_t xen_clocksource_read(void); -/* These are perodically updated in shared_info, and then copied here. */ -struct shadow_time_info { -	u64 tsc_timestamp;     /* TSC at last update of time vals.  */ -	u64 system_timestamp;  /* Time, in nanosecs, since boot.    */ -	u32 tsc_to_nsec_mul; -	int tsc_shift; -	u32 version; -}; - -static DEFINE_PER_CPU(struct shadow_time_info, shadow_time); -  /* runstate info updated by Xen */  static DEFINE_PER_CPU(struct vcpu_runstate_info, runstate); @@ -211,7 +201,7 @@ unsigned long long xen_sched_clock(void)  unsigned long xen_cpu_khz(void)  {  	u64 xen_khz = 1000000ULL << 32; -	const struct vcpu_time_info *info = +	const struct pvclock_vcpu_time_info *info =  		&HYPERVISOR_shared_info->vcpu_info[0].time;  	do_div(xen_khz, info->tsc_to_system_mul); @@ -223,121 +213,26 @@ unsigned long xen_cpu_khz(void)  	return xen_khz;  } -/* - * Reads a consistent set of time-base values from Xen, into a shadow data - * area. - */ -static unsigned get_time_values_from_xen(void) -{ -	struct vcpu_time_info   *src; -	struct shadow_time_info *dst; - -	/* src is shared memory with the hypervisor, so we need to -	   make sure we get a consistent snapshot, even in the face of -	   being preempted. 
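xen_cpu_khz() above runs the fixed-point scale backwards: divide (10^6 << 32) by tsc_to_system_mul and undo tsc_shift to recover the TSC frequency in kHz. A portable sketch of that calculation:

    #include <stdint.h>

    /* Portable form of the xen_cpu_khz() arithmetic: invert the 32.32
     * tsc_to_system_mul fraction, then undo tsc_shift. */
    static uint64_t cpu_khz_from_scale(uint32_t tsc_to_system_mul, int tsc_shift)
    {
        uint64_t khz = (1000000ULL << 32) / tsc_to_system_mul;

        if (tsc_shift < 0)
            khz <<= -tsc_shift;
        else
            khz >>= tsc_shift;
        return khz;
    }

Fed the values from the earlier 2.4 GHz example (mul 3579139413, shift -1), this recovers exactly 2400000 kHz.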
*/ -	src = &__get_cpu_var(xen_vcpu)->time; -	dst = &__get_cpu_var(shadow_time); - -	do { -		dst->version = src->version; -		rmb();		/* fetch version before data */ -		dst->tsc_timestamp     = src->tsc_timestamp; -		dst->system_timestamp  = src->system_time; -		dst->tsc_to_nsec_mul   = src->tsc_to_system_mul; -		dst->tsc_shift         = src->tsc_shift; -		rmb();		/* test version after fetching data */ -	} while ((src->version & 1) | (dst->version ^ src->version)); - -	return dst->version; -} - -/* - * Scale a 64-bit delta by scaling and multiplying by a 32-bit fraction, - * yielding a 64-bit result. - */ -static inline u64 scale_delta(u64 delta, u32 mul_frac, int shift) -{ -	u64 product; -#ifdef __i386__ -	u32 tmp1, tmp2; -#endif - -	if (shift < 0) -		delta >>= -shift; -	else -		delta <<= shift; - -#ifdef __i386__ -	__asm__ ( -		"mul  %5       ; " -		"mov  %4,%%eax ; " -		"mov  %%edx,%4 ; " -		"mul  %5       ; " -		"xor  %5,%5    ; " -		"add  %4,%%eax ; " -		"adc  %5,%%edx ; " -		: "=A" (product), "=r" (tmp1), "=r" (tmp2) -		: "a" ((u32)delta), "1" ((u32)(delta >> 32)), "2" (mul_frac) ); -#elif __x86_64__ -	__asm__ ( -		"mul %%rdx ; shrd $32,%%rdx,%%rax" -		: "=a" (product) : "0" (delta), "d" ((u64)mul_frac) ); -#else -#error implement me! -#endif - -	return product; -} - -static u64 get_nsec_offset(struct shadow_time_info *shadow) -{ -	u64 now, delta; -	now = native_read_tsc(); -	delta = now - shadow->tsc_timestamp; -	return scale_delta(delta, shadow->tsc_to_nsec_mul, shadow->tsc_shift); -} -  static cycle_t xen_clocksource_read(void)  { -	struct shadow_time_info *shadow = &get_cpu_var(shadow_time); +        struct pvclock_vcpu_time_info *src;  	cycle_t ret; -	unsigned version; - -	do { -		version = get_time_values_from_xen(); -		barrier(); -		ret = shadow->system_timestamp + get_nsec_offset(shadow); -		barrier(); -	} while (version != __get_cpu_var(xen_vcpu)->time.version); - -	put_cpu_var(shadow_time); +	src = &get_cpu_var(xen_vcpu)->time; +	ret = pvclock_clocksource_read(src); +	put_cpu_var(xen_vcpu);  	return ret;  }  static void xen_read_wallclock(struct timespec *ts)  { -	const struct shared_info *s = HYPERVISOR_shared_info; -	u32 version; -	u64 delta; -	struct timespec now; - -	/* get wallclock at system boot */ -	do { -		version = s->wc_version; -		rmb();		/* fetch version before time */ -		now.tv_sec  = s->wc_sec; -		now.tv_nsec = s->wc_nsec; -		rmb();		/* fetch time before checking version */ -	} while ((s->wc_version & 1) | (version ^ s->wc_version)); +	struct shared_info *s = HYPERVISOR_shared_info; +	struct pvclock_wall_clock *wall_clock = &(s->wc); +        struct pvclock_vcpu_time_info *vcpu_time; -	delta = xen_clocksource_read();	/* time since system boot */ -	delta += now.tv_sec * (u64)NSEC_PER_SEC + now.tv_nsec; - -	now.tv_nsec = do_div(delta, NSEC_PER_SEC); -	now.tv_sec = delta; - -	set_normalized_timespec(ts, now.tv_sec, now.tv_nsec); +	vcpu_time = &get_cpu_var(xen_vcpu)->time; +	pvclock_read_wallclock(wall_clock, vcpu_time, ts); +	put_cpu_var(xen_vcpu);  }  unsigned long xen_get_wallclock(void) @@ -345,7 +240,6 @@ unsigned long xen_get_wallclock(void)  	struct timespec ts;  	xen_read_wallclock(&ts); -  	return ts.tv_sec;  } @@ -569,8 +463,6 @@ __init void xen_time_init(void)  {  	int cpu = smp_processor_id(); -	get_time_values_from_xen(); -  	clocksource_register(&xen_clocksource);  	if (HYPERVISOR_vcpu_op(VCPUOP_stop_periodic_timer, cpu, NULL) == 0) { diff --git a/arch/x86/xen/xen-head.S b/arch/x86/xen/xen-head.S index 288d587ce73c..6ec3b4f7719b 100644 --- 
a/arch/x86/xen/xen-head.S +++ b/arch/x86/xen/xen-head.S @@ -17,7 +17,7 @@ ENTRY(startup_xen)  	__FINIT -.pushsection .bss.page_aligned +.pushsection .text  	.align PAGE_SIZE_asm  ENTRY(hypercall_page)  	.skip 0x1000 @@ -30,11 +30,7 @@ ENTRY(hypercall_page)  	ELFNOTE(Xen, XEN_ELFNOTE_ENTRY,          .long  startup_xen)  	ELFNOTE(Xen, XEN_ELFNOTE_HYPERCALL_PAGE, .long  hypercall_page)  	ELFNOTE(Xen, XEN_ELFNOTE_FEATURES,       .asciz "!writable_page_tables|pae_pgdir_above_4gb") -#ifdef CONFIG_X86_PAE  	ELFNOTE(Xen, XEN_ELFNOTE_PAE_MODE,       .asciz "yes") -#else -	ELFNOTE(Xen, XEN_ELFNOTE_PAE_MODE,       .asciz "no") -#endif  	ELFNOTE(Xen, XEN_ELFNOTE_LOADER,         .asciz "generic")  #endif /*CONFIG_XEN */ diff --git a/drivers/char/drm/i915_drv.c b/drivers/char/drm/i915_drv.c index e8f3d682e3b1..93aed1c38bd2 100644 --- a/drivers/char/drm/i915_drv.c +++ b/drivers/char/drm/i915_drv.c @@ -389,6 +389,7 @@ static int i915_resume(struct drm_device *dev)  	pci_restore_state(dev->pdev);  	if (pci_enable_device(dev->pdev))  		return -1; +	pci_set_master(dev->pdev);  	pci_write_config_byte(dev->pdev, LBB, dev_priv->saveLBB); diff --git a/drivers/char/tty_ioctl.c b/drivers/char/tty_ioctl.c index b1a757a5ee27..8f81139d6194 100644 --- a/drivers/char/tty_ioctl.c +++ b/drivers/char/tty_ioctl.c @@ -981,16 +981,9 @@ EXPORT_SYMBOL_GPL(tty_perform_flush);  int n_tty_ioctl(struct tty_struct *tty, struct file *file,  		       unsigned int cmd, unsigned long arg)  { -	struct tty_struct *real_tty;  	unsigned long flags;  	int retval; -	if (tty->driver->type == TTY_DRIVER_TYPE_PTY && -	    tty->driver->subtype == PTY_TYPE_MASTER) -		real_tty = tty->link; -	else -		real_tty = tty; -  	switch (cmd) {  	case TCXONC:  		retval = tty_check_change(tty); diff --git a/drivers/infiniband/hw/mthca/mthca_memfree.c b/drivers/infiniband/hw/mthca/mthca_memfree.c index b224079d4e1f..d5862e5d99a0 100644 --- a/drivers/infiniband/hw/mthca/mthca_memfree.c +++ b/drivers/infiniband/hw/mthca/mthca_memfree.c @@ -109,7 +109,11 @@ static int mthca_alloc_icm_pages(struct scatterlist *mem, int order, gfp_t gfp_m  {  	struct page *page; -	page = alloc_pages(gfp_mask, order); +	/* +	 * Use __GFP_ZERO because buggy firmware assumes ICM pages are +	 * cleared, and subtle failures are seen if they aren't. +	 */ +	page = alloc_pages(gfp_mask | __GFP_ZERO, order);  	if (!page)  		return -ENOMEM; diff --git a/drivers/lguest/x86/core.c b/drivers/lguest/x86/core.c index 5126d5d9ea0e..2e554a4ab337 100644 --- a/drivers/lguest/x86/core.c +++ b/drivers/lguest/x86/core.c @@ -176,7 +176,7 @@ void lguest_arch_run_guest(struct lg_cpu *cpu)  	 * we set it now, so we can trap and pass that trap to the Guest if it  	 * uses the FPU. */  	if (cpu->ts) -		lguest_set_ts(); +		unlazy_fpu(current);  	/* SYSENTER is an optimized way of doing system calls.  We can't allow  	 * it because it always jumps to privilege level 0.  A normal Guest @@ -196,6 +196,10 @@ void lguest_arch_run_guest(struct lg_cpu *cpu)  	 * trap made the switcher code come back, and an error code which some  	 * traps set.  */ +	 /* Restore SYSENTER if it's supposed to be on. */ +	 if (boot_cpu_has(X86_FEATURE_SEP)) +		wrmsr(MSR_IA32_SYSENTER_CS, __KERNEL_CS, 0); +  	/* If the Guest page faulted, then the cr2 register will tell us the  	 * bad virtual address.  
We have to grab this now, because once we  	 * re-enable interrupts an interrupt could fault and thus overwrite @@ -203,13 +207,12 @@ void lguest_arch_run_guest(struct lg_cpu *cpu)  	if (cpu->regs->trapnum == 14)  		cpu->arch.last_pagefault = read_cr2();  	/* Similarly, if we took a trap because the Guest used the FPU, -	 * we have to restore the FPU it expects to see. */ +	 * we have to restore the FPU it expects to see. +	 * math_state_restore() may sleep and we may even move off to +	 * a different CPU. So all the critical stuff should be done +	 * before this.  */  	else if (cpu->regs->trapnum == 7)  		math_state_restore(); - -	/* Restore SYSENTER if it's supposed to be on. */ -	if (boot_cpu_has(X86_FEATURE_SEP)) -		wrmsr(MSR_IA32_SYSENTER_CS, __KERNEL_CS, 0);  }  /*H:130 Now we've examined the hypercall code; our Guest can make requests. diff --git a/drivers/watchdog/Makefile b/drivers/watchdog/Makefile index 8662a6b7a30b..25b352b664d9 100644 --- a/drivers/watchdog/Makefile +++ b/drivers/watchdog/Makefile @@ -68,7 +68,6 @@ obj-$(CONFIG_WAFER_WDT) += wafer5823wdt.o  obj-$(CONFIG_I6300ESB_WDT) += i6300esb.o  obj-$(CONFIG_ITCO_WDT) += iTCO_wdt.o iTCO_vendor_support.o  obj-$(CONFIG_IT8712F_WDT) += it8712f_wdt.o -CFLAGS_hpwdt.o += -O  obj-$(CONFIG_HP_WATCHDOG) += hpwdt.o  obj-$(CONFIG_SC1200_WDT) += sc1200wdt.o  obj-$(CONFIG_SCx200_WDT) += scx200_wdt.o diff --git a/drivers/xen/events.c b/drivers/xen/events.c index 4f0f22b020ea..76e5b7386af9 100644 --- a/drivers/xen/events.c +++ b/drivers/xen/events.c @@ -529,7 +529,7 @@ void xen_evtchn_do_upcall(struct pt_regs *regs)  #ifndef CONFIG_X86 /* No need for a barrier -- XCHG is a barrier on x86. */  		/* Clear master flag /before/ clearing selector flag. */ -		rmb(); +		wmb();  #endif  		pending_words = xchg(&vcpu_info->evtchn_pending_sel, 0);  		while (pending_words != 0) { diff --git a/fs/gfs2/bmap.c b/fs/gfs2/bmap.c index c19184f2e70e..bec76b1c2bb0 100644 --- a/fs/gfs2/bmap.c +++ b/fs/gfs2/bmap.c @@ -246,15 +246,11 @@ static void find_metapath(const struct gfs2_sbd *sdp, u64 block,  } -static inline unsigned int zero_metapath_length(const struct metapath *mp, -						unsigned height) +static inline unsigned int metapath_branch_start(const struct metapath *mp)  { -	unsigned int i; -	for (i = 0; i < height - 1; i++) { -		if (mp->mp_list[i] != 0) -			return i; -	} -	return height; +	if (mp->mp_list[0] == 0) +		return 2; +	return 1;  }  /** @@ -436,7 +432,7 @@ static int gfs2_bmap_alloc(struct inode *inode, const sector_t lblock,  	struct gfs2_sbd *sdp = GFS2_SB(inode);  	struct buffer_head *dibh = mp->mp_bh[0];  	u64 bn, dblock = 0; -	unsigned n, i, blks, alloced = 0, iblks = 0, zmpl = 0; +	unsigned n, i, blks, alloced = 0, iblks = 0, branch_start = 0;  	unsigned dblks = 0;  	unsigned ptrs_per_blk;  	const unsigned end_of_metadata = height - 1; @@ -471,9 +467,8 @@ static int gfs2_bmap_alloc(struct inode *inode, const sector_t lblock,  			/* Building up tree height */  			state = ALLOC_GROW_HEIGHT;  			iblks = height - ip->i_height; -			zmpl = zero_metapath_length(mp, height); -			iblks -= zmpl; -			iblks += height; +			branch_start = metapath_branch_start(mp); +			iblks += (height - branch_start);  		}  	} @@ -509,13 +504,13 @@ static int gfs2_bmap_alloc(struct inode *inode, const sector_t lblock,  					sizeof(struct gfs2_meta_header));  				*ptr = zero_bn;  				state = ALLOC_GROW_DEPTH; -				for(i = zmpl; i < height; i++) { +				for(i = branch_start; i < height; i++) {  					if (mp->mp_bh[i] == NULL)  						break;  					brelse(mp->mp_bh[i]);  			
		mp->mp_bh[i] = NULL;  				} -				i = zmpl; +				i = branch_start;  			}  			if (n == 0)  				break; diff --git a/fs/gfs2/rgrp.c b/fs/gfs2/rgrp.c index 6387523a3153..3401628d742b 100644 --- a/fs/gfs2/rgrp.c +++ b/fs/gfs2/rgrp.c @@ -195,7 +195,7 @@ ulong_aligned:  	   depending on architecture.  I've experimented with several ways  	   of writing this section such as using an else before the goto  	   but this one seems to be the fastest. */ -	while ((unsigned char *)plong < end - 1) { +	while ((unsigned char *)plong < end - sizeof(unsigned long)) {  		prefetch(plong + 1);  		if (((*plong) & LBITMASK) != lskipval)  			break; diff --git a/fs/nfs/mount_clnt.c b/fs/nfs/mount_clnt.c index 49c7cd0502cc..779d2eb649c5 100644 --- a/fs/nfs/mount_clnt.c +++ b/fs/nfs/mount_clnt.c @@ -130,10 +130,11 @@ static int xdr_decode_fhstatus3(struct rpc_rqst *req, __be32 *p,  				struct mnt_fhstatus *res)  {  	struct nfs_fh *fh = res->fh; +	unsigned size;  	if ((res->status = ntohl(*p++)) == 0) { -		int size = ntohl(*p++); -		if (size <= NFS3_FHSIZE) { +		size = ntohl(*p++); +		if (size <= NFS3_FHSIZE && size != 0) {  			fh->size = size;  			memcpy(fh->data, p, size);  		} else diff --git a/fs/nfs/super.c b/fs/nfs/super.c index 2a4a024a4e7b..614efeed5437 100644 --- a/fs/nfs/super.c +++ b/fs/nfs/super.c @@ -1216,8 +1216,6 @@ static int nfs_validate_mount_data(void *options,  {  	struct nfs_mount_data *data = (struct nfs_mount_data *)options; -	memset(args, 0, sizeof(*args)); -  	if (data == NULL)  		goto out_no_data; @@ -1251,13 +1249,13 @@ static int nfs_validate_mount_data(void *options,  	case 5:  		memset(data->context, 0, sizeof(data->context));  	case 6: -		if (data->flags & NFS_MOUNT_VER3) +		if (data->flags & NFS_MOUNT_VER3) { +			if (data->root.size > NFS3_FHSIZE || data->root.size == 0) +				goto out_invalid_fh;  			mntfh->size = data->root.size; -		else +		} else  			mntfh->size = NFS2_FHSIZE; -		if (mntfh->size > sizeof(mntfh->data)) -			goto out_invalid_fh;  		memcpy(mntfh->data, data->root.data, mntfh->size);  		if (mntfh->size < sizeof(mntfh->data)) @@ -1585,24 +1583,29 @@ static int nfs_get_sb(struct file_system_type *fs_type,  {  	struct nfs_server *server = NULL;  	struct super_block *s; -	struct nfs_fh mntfh; -	struct nfs_parsed_mount_data data; +	struct nfs_parsed_mount_data *data; +	struct nfs_fh *mntfh;  	struct dentry *mntroot;  	int (*compare_super)(struct super_block *, void *) = nfs_compare_super;  	struct nfs_sb_mountdata sb_mntdata = {  		.mntflags = flags,  	}; -	int error; +	int error = -ENOMEM; + +	data = kzalloc(sizeof(*data), GFP_KERNEL); +	mntfh = kzalloc(sizeof(*mntfh), GFP_KERNEL); +	if (data == NULL || mntfh == NULL) +		goto out_free_fh; -	security_init_mnt_opts(&data.lsm_opts); +	security_init_mnt_opts(&data->lsm_opts);  	/* Validate the mount data */ -	error = nfs_validate_mount_data(raw_data, &data, &mntfh, dev_name); +	error = nfs_validate_mount_data(raw_data, data, mntfh, dev_name);  	if (error < 0)  		goto out;  	/* Get a volume representation */ -	server = nfs_create_server(&data, &mntfh); +	server = nfs_create_server(data, mntfh);  	if (IS_ERR(server)) {  		error = PTR_ERR(server);  		goto out; @@ -1630,16 +1633,16 @@ static int nfs_get_sb(struct file_system_type *fs_type,  	if (!s->s_root) {  		/* initial superblock/root creation */ -		nfs_fill_super(s, &data); +		nfs_fill_super(s, data);  	} -	mntroot = nfs_get_root(s, &mntfh); +	mntroot = nfs_get_root(s, mntfh);  	if (IS_ERR(mntroot)) {  		error = PTR_ERR(mntroot);  		goto error_splat_super;  	} -	error = 
security_sb_set_mnt_opts(s, &data.lsm_opts); +	error = security_sb_set_mnt_opts(s, &data->lsm_opts);  	if (error)  		goto error_splat_root; @@ -1649,9 +1652,12 @@ static int nfs_get_sb(struct file_system_type *fs_type,  	error = 0;  out: -	kfree(data.nfs_server.hostname); -	kfree(data.mount_server.hostname); -	security_free_mnt_opts(&data.lsm_opts); +	kfree(data->nfs_server.hostname); +	kfree(data->mount_server.hostname); +	security_free_mnt_opts(&data->lsm_opts); +out_free_fh: +	kfree(mntfh); +	kfree(data);  	return error;  out_err_nosb: @@ -1800,8 +1806,6 @@ static int nfs4_validate_mount_data(void *options,  	struct nfs4_mount_data *data = (struct nfs4_mount_data *)options;  	char *c; -	memset(args, 0, sizeof(*args)); -  	if (data == NULL)  		goto out_no_data; @@ -1959,26 +1963,31 @@ out_no_client_address:  static int nfs4_get_sb(struct file_system_type *fs_type,  	int flags, const char *dev_name, void *raw_data, struct vfsmount *mnt)  { -	struct nfs_parsed_mount_data data; +	struct nfs_parsed_mount_data *data;  	struct super_block *s;  	struct nfs_server *server; -	struct nfs_fh mntfh; +	struct nfs_fh *mntfh;  	struct dentry *mntroot;  	int (*compare_super)(struct super_block *, void *) = nfs_compare_super;  	struct nfs_sb_mountdata sb_mntdata = {  		.mntflags = flags,  	}; -	int error; +	int error = -ENOMEM; -	security_init_mnt_opts(&data.lsm_opts); +	data = kzalloc(sizeof(*data), GFP_KERNEL); +	mntfh = kzalloc(sizeof(*mntfh), GFP_KERNEL); +	if (data == NULL || mntfh == NULL) +		goto out_free_fh; + +	security_init_mnt_opts(&data->lsm_opts);  	/* Validate the mount data */ -	error = nfs4_validate_mount_data(raw_data, &data, dev_name); +	error = nfs4_validate_mount_data(raw_data, data, dev_name);  	if (error < 0)  		goto out;  	/* Get a volume representation */ -	server = nfs4_create_server(&data, &mntfh); +	server = nfs4_create_server(data, mntfh);  	if (IS_ERR(server)) {  		error = PTR_ERR(server);  		goto out; @@ -2009,13 +2018,13 @@ static int nfs4_get_sb(struct file_system_type *fs_type,  		nfs4_fill_super(s);  	} -	mntroot = nfs4_get_root(s, &mntfh); +	mntroot = nfs4_get_root(s, mntfh);  	if (IS_ERR(mntroot)) {  		error = PTR_ERR(mntroot);  		goto error_splat_super;  	} -	error = security_sb_set_mnt_opts(s, &data.lsm_opts); +	error = security_sb_set_mnt_opts(s, &data->lsm_opts);  	if (error)  		goto error_splat_root; @@ -2025,10 +2034,13 @@ static int nfs4_get_sb(struct file_system_type *fs_type,  	error = 0;  out: -	kfree(data.client_address); -	kfree(data.nfs_server.export_path); -	kfree(data.nfs_server.hostname); -	security_free_mnt_opts(&data.lsm_opts); +	kfree(data->client_address); +	kfree(data->nfs_server.export_path); +	kfree(data->nfs_server.hostname); +	security_free_mnt_opts(&data->lsm_opts); +out_free_fh: +	kfree(mntfh); +	kfree(data);  	return error;  out_free: diff --git a/fs/nfs/write.c b/fs/nfs/write.c index 6d8ace3e3259..f333848fd3be 100644 --- a/fs/nfs/write.c +++ b/fs/nfs/write.c @@ -739,12 +739,13 @@ int nfs_updatepage(struct file *file, struct page *page,  	}  	status = nfs_writepage_setup(ctx, page, offset, count); -	__set_page_dirty_nobuffers(page); +	if (status < 0) +		nfs_set_pageerror(page); +	else +		__set_page_dirty_nobuffers(page);          dprintk("NFS:      nfs_updatepage returns %d (isize %Ld)\n",  			status, (long long)i_size_read(inode)); -	if (status < 0) -		nfs_set_pageerror(page);  	return status;  } diff --git a/fs/select.c b/fs/select.c index 8dda969614a9..da0e88201c3a 100644 --- a/fs/select.c +++ b/fs/select.c @@ -249,7 +249,6 @@ int 
do_select(int n, fd_set_bits *fds, s64 *timeout)  						retval++;  					}  				} -				cond_resched();  			}  			if (res_in)  				*rinp = res_in; @@ -257,6 +256,7 @@ int do_select(int n, fd_set_bits *fds, s64 *timeout)  				*routp = res_out;  			if (res_ex)  				*rexp = res_ex; +			cond_resched();  		}  		wait = NULL;  		if (retval || !*timeout || signal_pending(current)) diff --git a/include/asm-alpha/percpu.h b/include/asm-alpha/percpu.h index 82e8a94b4b2f..3495e8e00d70 100644 --- a/include/asm-alpha/percpu.h +++ b/include/asm-alpha/percpu.h @@ -69,6 +69,8 @@ extern unsigned long __per_cpu_offset[NR_CPUS];  #define __get_cpu_var(var)		per_cpu_var(var)  #define __raw_get_cpu_var(var)		per_cpu_var(var) +#define PER_CPU_ATTRIBUTES +  #endif /* SMP */  #define DECLARE_PER_CPU(type, name) extern __typeof__(type) per_cpu_var(name) diff --git a/include/asm-x86/kvm_host.h b/include/asm-x86/kvm_host.h index 1d8cd01fa514..844f2a89afbc 100644 --- a/include/asm-x86/kvm_host.h +++ b/include/asm-x86/kvm_host.h @@ -18,6 +18,7 @@  #include <linux/kvm_para.h>  #include <linux/kvm_types.h> +#include <asm/pvclock-abi.h>  #include <asm/desc.h>  #define KVM_MAX_VCPUS 16 @@ -282,7 +283,8 @@ struct kvm_vcpu_arch {  	struct x86_emulate_ctxt emulate_ctxt;  	gpa_t time; -	struct kvm_vcpu_time_info hv_clock; +	struct pvclock_vcpu_time_info hv_clock; +	unsigned int hv_clock_tsc_khz;  	unsigned int time_offset;  	struct page *time_page;  }; diff --git a/include/asm-x86/kvm_para.h b/include/asm-x86/kvm_para.h index 509845942070..bfd9900742bf 100644 --- a/include/asm-x86/kvm_para.h +++ b/include/asm-x86/kvm_para.h @@ -48,24 +48,6 @@ struct kvm_mmu_op_release_pt {  #ifdef __KERNEL__  #include <asm/processor.h> -/* xen binary-compatible interface. See xen headers for details */ -struct kvm_vcpu_time_info { -	uint32_t version; -	uint32_t pad0; -	uint64_t tsc_timestamp; -	uint64_t system_time; -	uint32_t tsc_to_system_mul; -	int8_t   tsc_shift; -	int8_t	 pad[3]; -} __attribute__((__packed__)); /* 32 bytes */ - -struct kvm_wall_clock { -	uint32_t wc_version; -	uint32_t wc_sec; -	uint32_t wc_nsec; -} __attribute__((__packed__)); - -  extern void kvmclock_init(void); diff --git a/include/asm-x86/pvclock-abi.h b/include/asm-x86/pvclock-abi.h new file mode 100644 index 000000000000..6857f840b243 --- /dev/null +++ b/include/asm-x86/pvclock-abi.h @@ -0,0 +1,42 @@ +#ifndef _ASM_X86_PVCLOCK_ABI_H_ +#define _ASM_X86_PVCLOCK_ABI_H_ +#ifndef __ASSEMBLY__ + +/* + * These structs MUST NOT be changed. + * They are the ABI between hypervisor and guest OS. + * Both Xen and KVM are using this. + * + * pvclock_vcpu_time_info holds the system time and the tsc timestamp + * of the last update. So the guest can use the tsc delta to get a + * more precise system time.  There is one per virtual cpu. + * + * pvclock_wall_clock references the point in time when the system + * time was zero (usually boot time), thus the guest calculates the + * current wall clock by adding the system time. + * + * Protocol for the "version" fields is: hypervisor raises it (making + * it uneven) before it starts updating the fields and raises it again + * (making it even) when it is done.  Thus the guest can make sure the + * time values it got are consistent by checking the version before + * and after reading them. 
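Concretely, a guest turns those ABI fields into nanoseconds since boot as system_time plus the TSC delta scaled by the 32.32 fraction. A portable sketch; the split multiply is a stand-in for the inline-asm scale_delta() earlier in this patch and is exact as long as the high-half product fits in 64 bits:

    #include <stdint.h>

    /* ns since boot = system_time + scale(tsc_now - tsc_timestamp),
     * where scale() applies tsc_shift and the 32.32 multiplier. */
    static uint64_t pv_ns_since_boot(uint64_t tsc_now, uint64_t tsc_timestamp,
                                     uint64_t system_time,
                                     uint32_t tsc_to_system_mul, int tsc_shift)
    {
        uint64_t delta = tsc_now - tsc_timestamp;
        uint64_t lo, hi;

        if (tsc_shift < 0)
            delta >>= -tsc_shift;
        else
            delta <<= tsc_shift;

        lo = (uint32_t)delta;
        hi = delta >> 32;
        /* (delta * mul) >> 32, split so no intermediate needs 128 bits */
        return system_time + ((lo * (uint64_t)tsc_to_system_mul) >> 32)
                           + hi * tsc_to_system_mul;
    }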
+ */ + +struct pvclock_vcpu_time_info { +	u32   version; +	u32   pad0; +	u64   tsc_timestamp; +	u64   system_time; +	u32   tsc_to_system_mul; +	s8    tsc_shift; +	u8    pad[3]; +} __attribute__((__packed__)); /* 32 bytes */ + +struct pvclock_wall_clock { +	u32   version; +	u32   sec; +	u32   nsec; +} __attribute__((__packed__)); + +#endif /* __ASSEMBLY__ */ +#endif /* _ASM_X86_PVCLOCK_ABI_H_ */ diff --git a/include/asm-x86/pvclock.h b/include/asm-x86/pvclock.h new file mode 100644 index 000000000000..85b1bba8e0a3 --- /dev/null +++ b/include/asm-x86/pvclock.h @@ -0,0 +1,13 @@ +#ifndef _ASM_X86_PVCLOCK_H_ +#define _ASM_X86_PVCLOCK_H_ + +#include <linux/clocksource.h> +#include <asm/pvclock-abi.h> + +/* some helper functions for xen and kvm pv clock sources */ +cycle_t pvclock_clocksource_read(struct pvclock_vcpu_time_info *src); +void pvclock_read_wallclock(struct pvclock_wall_clock *wall, +			    struct pvclock_vcpu_time_info *vcpu, +			    struct timespec *ts); + +#endif /* _ASM_X86_PVCLOCK_H_ */ diff --git a/include/asm-x86/xen/page.h b/include/asm-x86/xen/page.h index baf3a4dce28c..e11f24038b1d 100644 --- a/include/asm-x86/xen/page.h +++ b/include/asm-x86/xen/page.h @@ -150,13 +150,9 @@ static inline pte_t __pte_ma(pteval_t x)  	return (pte_t) { .pte = x };  } -#ifdef CONFIG_X86_PAE  #define pmd_val_ma(v) ((v).pmd)  #define pud_val_ma(v) ((v).pgd.pgd)  #define __pmd_ma(x)	((pmd_t) { (x) } ) -#else  /* !X86_PAE */ -#define pmd_val_ma(v)	((v).pud.pgd.pgd) -#endif	/* CONFIG_X86_PAE */  #define pgd_val_ma(x)	((x).pgd) diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h index 092b1b25291d..de9d1df4bba2 100644 --- a/include/linux/kvm_host.h +++ b/include/linux/kvm_host.h @@ -33,6 +33,7 @@  #define KVM_REQ_REPORT_TPR_ACCESS  2  #define KVM_REQ_MMU_RELOAD         3  #define KVM_REQ_TRIPLE_FAULT       4 +#define KVM_REQ_PENDING_TIMER      5  struct kvm_vcpu;  extern struct kmem_cache *kvm_vcpu_cache; diff --git a/include/linux/tty_driver.h b/include/linux/tty_driver.h index 59f1c0bd8f9c..d2a003586761 100644 --- a/include/linux/tty_driver.h +++ b/include/linux/tty_driver.h @@ -27,8 +27,7 @@   * 	This routine is called by the kernel to write a series of   * 	characters to the tty device.  The characters may come from   * 	user space or kernel space.  This routine will return the - *	number of characters actually accepted for writing.  This - *	routine is mandatory. + *	number of characters actually accepted for writing.   *   *	Optional: Required for writable devices.   * @@ -134,7 +133,7 @@   * 	This routine notifies the tty driver that it should hangup the   * 	tty device.   * - *	Required: + *	Optional:   *   * void (*break_ctl)(struct tty_stuct *tty, int state);   * diff --git a/include/xen/interface/xen.h b/include/xen/interface/xen.h index 9b018da48cf3..819a0331cda9 100644 --- a/include/xen/interface/xen.h +++ b/include/xen/interface/xen.h @@ -10,6 +10,7 @@  #define __XEN_PUBLIC_XEN_H__  #include <asm/xen/interface.h> +#include <asm/pvclock-abi.h>  /*   * XEN "SYSTEM CALLS" (a.k.a. HYPERCALLS). @@ -336,7 +337,7 @@ struct vcpu_info {  	uint8_t evtchn_upcall_mask;  	unsigned long evtchn_pending_sel;  	struct arch_vcpu_info arch; -	struct vcpu_time_info time; +	struct pvclock_vcpu_time_info time;  }; /* 64 bytes (x86) */  /* @@ -384,9 +385,7 @@ struct shared_info {  	 * Wallclock time: updated only by control software. Guests should base  	 * their gettimeofday() syscall on this wallclock-base value.  	 */ -	uint32_t wc_version;      /* Version counter: see vcpu_time_info_t. 
diff --git a/include/xen/interface/xen.h b/include/xen/interface/xen.h
index 9b018da48cf3..819a0331cda9 100644
--- a/include/xen/interface/xen.h
+++ b/include/xen/interface/xen.h
@@ -10,6 +10,7 @@
 #define __XEN_PUBLIC_XEN_H__
 
 #include <asm/xen/interface.h>
+#include <asm/pvclock-abi.h>
 
 /*
  * XEN "SYSTEM CALLS" (a.k.a. HYPERCALLS).
@@ -336,7 +337,7 @@ struct vcpu_info {
 	uint8_t evtchn_upcall_mask;
 	unsigned long evtchn_pending_sel;
 	struct arch_vcpu_info arch;
-	struct vcpu_time_info time;
+	struct pvclock_vcpu_time_info time;
 }; /* 64 bytes (x86) */
 
 /*
@@ -384,9 +385,7 @@ struct shared_info {
 	 * Wallclock time: updated only by control software. Guests should base
 	 * their gettimeofday() syscall on this wallclock-base value.
 	 */
-	uint32_t wc_version;      /* Version counter: see vcpu_time_info_t. */
-	uint32_t wc_sec;          /* Secs  00:00:00 UTC, Jan 1, 1970.  */
-	uint32_t wc_nsec;         /* Nsecs 00:00:00 UTC, Jan 1, 1970.  */
+	struct pvclock_wall_clock wc;
 
 	struct arch_shared_info arch;
diff --git a/kernel/futex.c b/kernel/futex.c
index 449def8074fe..7d1136e97c14 100644
--- a/kernel/futex.c
+++ b/kernel/futex.c
@@ -1096,21 +1096,64 @@ static void unqueue_me_pi(struct futex_q *q)
 * private futexes.
 */
 static int fixup_pi_state_owner(u32 __user *uaddr, struct futex_q *q,
-				struct task_struct *newowner)
+				struct task_struct *newowner,
+				struct rw_semaphore *fshared)
 {
 	u32 newtid = task_pid_vnr(newowner) | FUTEX_WAITERS;
 	struct futex_pi_state *pi_state = q->pi_state;
+	struct task_struct *oldowner = pi_state->owner;
 	u32 uval, curval, newval;
-	int ret;
+	int ret, attempt = 0;
 
 	/* Owner died? */
+	if (!pi_state->owner)
+		newtid |= FUTEX_OWNER_DIED;
+
+	/*
+	 * We are here either because we stole the rtmutex from the
+	 * pending owner or we are the pending owner which failed to
+	 * get the rtmutex. We have to replace the pending owner TID
+	 * in the user space variable. This must be atomic as we have
+	 * to preserve the owner died bit here.
+	 *
+	 * Note: We write the user space value _before_ changing the
+	 * pi_state because we can fault here. Imagine swapped out
+	 * pages or a fork, which was running right before we acquired
+	 * mmap_sem, that marked all the anonymous memory readonly for
+	 * cow.
+	 *
+	 * Modifying pi_state _before_ the user space value would
+	 * leave the pi_state in an inconsistent state when we fault
+	 * here, because we need to drop the hash bucket lock to
+	 * handle the fault. This might be observed in the PID check
+	 * in lookup_pi_state.
+	 */
+retry:
+	if (get_futex_value_locked(&uval, uaddr))
+		goto handle_fault;
+
+	while (1) {
+		newval = (uval & FUTEX_OWNER_DIED) | newtid;
+
+		curval = cmpxchg_futex_value_locked(uaddr, uval, newval);
+
+		if (curval == -EFAULT)
+			goto handle_fault;
+		if (curval == uval)
+			break;
+		uval = curval;
+	}
+
+	/*
+	 * We fixed up user space. Now we need to fix the pi_state
+	 * itself.
+	 */
 	if (pi_state->owner != NULL) {
 		spin_lock_irq(&pi_state->owner->pi_lock);
 		WARN_ON(list_empty(&pi_state->list));
 		list_del_init(&pi_state->list);
 		spin_unlock_irq(&pi_state->owner->pi_lock);
-	} else
-		newtid |= FUTEX_OWNER_DIED;
+	}
 
 	pi_state->owner = newowner;
 
@@ -1118,26 +1161,35 @@ static int fixup_pi_state_owner(u32 __user *uaddr, struct futex_q *q,
 	WARN_ON(!list_empty(&pi_state->list));
 	list_add(&pi_state->list, &newowner->pi_state_list);
 	spin_unlock_irq(&newowner->pi_lock);
+	return 0;
 
 	/*
-	 * We own it, so we have to replace the pending owner
-	 * TID. This must be atomic as we have preserve the
-	 * owner died bit here.
+	 * To handle the page fault we need to drop the hash bucket
+	 * lock here. That gives the other task (either the pending
+	 * owner itself or the task which stole the rtmutex) the
+	 * chance to try the fixup of the pi_state. So once we are
+	 * back from handling the fault we need to check the pi_state
+	 * after reacquiring the hash bucket lock and before trying to
+	 * do another fixup. When the fixup has been done already we
+	 * simply return.
 	 */
-	ret = get_futex_value_locked(&uval, uaddr);
+handle_fault:
+	spin_unlock(q->lock_ptr);
 
-	while (!ret) {
-		newval = (uval & FUTEX_OWNER_DIED) | newtid;
+	ret = futex_handle_fault((unsigned long)uaddr, fshared, attempt++);
 
-		curval = cmpxchg_futex_value_locked(uaddr, uval, newval);
+	spin_lock(q->lock_ptr);
 
-		if (curval == -EFAULT)
-			ret = -EFAULT;
-		if (curval == uval)
-			break;
-		uval = curval;
-	}
-	return ret;
+	/*
+	 * Check if someone else fixed it for us:
+	 */
+	if (pi_state->owner != oldowner)
+		return 0;
+
+	if (ret)
+		return ret;
+
+	goto retry;
 }
 
 /*
@@ -1507,7 +1559,7 @@ static int futex_lock_pi(u32 __user *uaddr, struct rw_semaphore *fshared,
 		 * that case:
 		 */
 		if (q.pi_state->owner != curr)
-			ret = fixup_pi_state_owner(uaddr, &q, curr);
+			ret = fixup_pi_state_owner(uaddr, &q, curr, fshared);
 	} else {
 		/*
 		 * Catch the rare case, where the lock was released
@@ -1539,7 +1591,8 @@ static int futex_lock_pi(u32 __user *uaddr, struct rw_semaphore *fshared,
 				int res;
 
 				owner = rt_mutex_owner(&q.pi_state->pi_mutex);
-				res = fixup_pi_state_owner(uaddr, &q, owner);
+				res = fixup_pi_state_owner(uaddr, &q, owner,
+							   fshared);
 
 				/* propagate -EFAULT, if the fixup failed */
 				if (res)
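The long comment block above is the heart of the futex fix: the owner TID in the user space word must be swapped without ever losing the FUTEX_OWNER_DIED bit, and a fault must bounce out to a handler that has dropped the hash bucket lock. Stripped of the locking and fault handling, the preserve-a-flag-while-replacing-the-rest pattern looks like this (function name illustrative; the helpers are the same ones the patch uses):

/* Sketch only: atomically install newtid, keeping the owner-died flag */
static int replace_tid_keep_died_bit(u32 __user *uaddr, u32 newtid)
{
	u32 uval, curval, newval;

	if (get_futex_value_locked(&uval, uaddr))
		return -EFAULT;		/* caller must handle the fault */

	for (;;) {
		/* keep the owner-died flag, replace everything else */
		newval = (uval & FUTEX_OWNER_DIED) | newtid;

		curval = cmpxchg_futex_value_locked(uaddr, uval, newval);
		if (curval == -EFAULT)
			return -EFAULT;
		if (curval == uval)
			return 0;	/* swap happened atomically */
		uval = curval;		/* lost a race, retry with fresh value */
	}
}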
diff --git a/kernel/kgdb.c b/kernel/kgdb.c
index 79e3c90113c2..3ec23c3ec97f 100644
--- a/kernel/kgdb.c
+++ b/kernel/kgdb.c
@@ -1499,7 +1499,8 @@ int kgdb_nmicallback(int cpu, void *regs)
 	return 1;
 }
 
-void kgdb_console_write(struct console *co, const char *s, unsigned count)
+static void kgdb_console_write(struct console *co, const char *s,
+   unsigned count)
 {
 	unsigned long flags;
 
diff --git a/kernel/sched.c b/kernel/sched.c
index 33680bc17cf4..56958359d20c 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -4398,22 +4398,20 @@ do_wait_for_common(struct completion *x, long timeout, int state)
 			     signal_pending(current)) ||
 			    (state == TASK_KILLABLE &&
 			     fatal_signal_pending(current))) {
-				__remove_wait_queue(&x->wait, &wait);
-				return -ERESTARTSYS;
+				timeout = -ERESTARTSYS;
+				break;
 			}
 			__set_current_state(state);
 			spin_unlock_irq(&x->wait.lock);
 			timeout = schedule_timeout(timeout);
 			spin_lock_irq(&x->wait.lock);
-			if (!timeout) {
-				__remove_wait_queue(&x->wait, &wait);
-				return timeout;
-			}
-		} while (!x->done);
+		} while (!x->done && timeout);
 		__remove_wait_queue(&x->wait, &wait);
+		if (!x->done)
+			return timeout;
 	}
 	x->done--;
-	return timeout;
+	return timeout ?: 1;
 }
 
 static long __sched
diff --git a/kernel/sched_rt.c b/kernel/sched_rt.c
index 1dad5bbb59b6..0f3c19197fa4 100644
--- a/kernel/sched_rt.c
+++ b/kernel/sched_rt.c
@@ -250,7 +250,8 @@ static int do_sched_rt_period_timer(struct rt_bandwidth *rt_b, int overrun)
 			if (rt_rq->rt_time || rt_rq->rt_nr_running)
 				idle = 0;
 			spin_unlock(&rt_rq->rt_runtime_lock);
-		}
+		} else if (rt_rq->rt_nr_running)
+			idle = 0;
 
 		if (enqueue)
 			sched_rt_rq_enqueue(rt_rq);
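The do_wait_for_common() rework above also fixes an edge case callers rely on: if the completion was already done when the timeout expired, the function now returns 1 (via `timeout ?: 1`) instead of 0, so a zero return always means a genuine timeout. A typical caller, sketched (the function name and the 100ms value are illustrative):

/* Usage sketch: zero now reliably means "timed out, not completed" */
static int example_wait_for_device(struct completion *done)
{
	unsigned long left;

	left = wait_for_completion_timeout(done, msecs_to_jiffies(100));
	if (!left)
		return -ETIMEDOUT;	/* really not completed */

	return 0;	/* completed; at least 1 is reported as time left */
}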
diff --git a/mm/memory.c b/mm/memory.c
index 9aefaae46858..d14b251a25a6 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -1045,6 +1045,26 @@ no_page_table:
 	return page;
 }
 
+/* Can we do the FOLL_ANON optimization? */
+static inline int use_zero_page(struct vm_area_struct *vma)
+{
+	/*
+	 * We don't want to optimize FOLL_ANON for make_pages_present()
+	 * when it tries to page in a VM_LOCKED region. As to VM_SHARED,
+	 * we want to get the page from the page tables to make sure
+	 * that we serialize and update with any other user of that
+	 * mapping.
+	 */
+	if (vma->vm_flags & (VM_LOCKED | VM_SHARED))
+		return 0;
+	/*
+	 * And if we have a fault or a nopfn routine, it's not an
+	 * anonymous region.
+	 */
+	return !vma->vm_ops ||
+		(!vma->vm_ops->fault && !vma->vm_ops->nopfn);
+}
+
 int get_user_pages(struct task_struct *tsk, struct mm_struct *mm,
 		unsigned long start, int len, int write, int force,
 		struct page **pages, struct vm_area_struct **vmas)
@@ -1119,8 +1139,7 @@ int get_user_pages(struct task_struct *tsk, struct mm_struct *mm,
 		foll_flags = FOLL_TOUCH;
 		if (pages)
 			foll_flags |= FOLL_GET;
-		if (!write && !(vma->vm_flags & VM_LOCKED) &&
-		    (!vma->vm_ops || !vma->vm_ops->fault))
+		if (!write && use_zero_page(vma))
 			foll_flags |= FOLL_ANON;
 
 		do {
@@ -1766,7 +1785,6 @@ gotten:
 	page_table = pte_offset_map_lock(mm, pmd, address, &ptl);
 	if (likely(pte_same(*page_table, orig_pte))) {
 		if (old_page) {
-			page_remove_rmap(old_page, vma);
 			if (!PageAnon(old_page)) {
 				dec_mm_counter(mm, file_rss);
 				inc_mm_counter(mm, anon_rss);
@@ -1788,6 +1806,32 @@ gotten:
 		lru_cache_add_active(new_page);
 		page_add_new_anon_rmap(new_page, vma, address);
 
+		if (old_page) {
+			/*
+			 * Only after switching the pte to the new page may
+			 * we remove the mapcount here. Otherwise another
+			 * process may come and find the rmap count decremented
+			 * before the pte is switched to the new page, and
+			 * "reuse" the old page writing into it while our pte
+			 * here still points into it and can be read by other
+			 * threads.
+			 *
+			 * The critical issue is to order this
+			 * page_remove_rmap with the ptp_clear_flush above.
+			 * Those stores are ordered by (if nothing else,)
+			 * the barrier present in the atomic_add_negative
+			 * in page_remove_rmap.
+			 *
+			 * Then the TLB flush in ptep_clear_flush ensures that
+			 * no process can access the old page before the
+			 * decremented mapcount is visible. And the old page
+			 * cannot be reused until after the decremented
+			 * mapcount is visible. So transitively, TLBs to
+			 * old page will be flushed before it can be reused.
+			 */
+			page_remove_rmap(old_page, vma);
+		}
+
 		/* Free the old page.. */
 		new_page = old_page;
 		ret |= VM_FAULT_WRITE;
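use_zero_page() above gates the FOLL_ANON optimization, which lets get_user_pages() hand back the shared zero page for untouched private anonymous memory instead of faulting real pages in. The behaviour it builds on is visible from plain userspace (illustration only, not kernel code):

#include <assert.h>
#include <sys/mman.h>

int main(void)
{
	/* private anonymous memory is backed by the zero page until written */
	char *p = mmap(NULL, 4096, PROT_READ | PROT_WRITE,
		       MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);

	assert(p != MAP_FAILED);
	assert(p[123] == 0);	/* a read allocates nothing; it sees zeroes */

	p[0] = 1;		/* the first write faults in a real page (COW) */
	return 0;
}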
diff --git a/sound/isa/sb/sb_mixer.c b/sound/isa/sb/sb_mixer.c
index 91d14224f6b3..73d4572d136b 100644
--- a/sound/isa/sb/sb_mixer.c
+++ b/sound/isa/sb/sb_mixer.c
@@ -925,7 +925,7 @@ static unsigned char als4000_saved_regs[] = {
 static void save_mixer(struct snd_sb *chip, unsigned char *regs, int num_regs)
 {
 	unsigned char *val = chip->saved_regs;
-	snd_assert(num_regs > ARRAY_SIZE(chip->saved_regs), return);
+	snd_assert(num_regs <= ARRAY_SIZE(chip->saved_regs), return);
 	for (; num_regs; num_regs--)
 		*val++ = snd_sbmixer_read(chip, *regs++);
 }
@@ -933,7 +933,7 @@ static void save_mixer(struct snd_sb *chip, unsigned char *regs, int num_regs)
 static void restore_mixer(struct snd_sb *chip, unsigned char *regs, int num_regs)
 {
 	unsigned char *val = chip->saved_regs;
-	snd_assert(num_regs > ARRAY_SIZE(chip->saved_regs), return);
+	snd_assert(num_regs <= ARRAY_SIZE(chip->saved_regs), return);
 	for (; num_regs; num_regs--)
 		snd_sbmixer_write(chip, *regs++, *val++);
 }
diff --git a/sound/pci/aw2/aw2-alsa.c b/sound/pci/aw2/aw2-alsa.c
index 56f87cd33c19..3f00ddf450f8 100644
--- a/sound/pci/aw2/aw2-alsa.c
+++ b/sound/pci/aw2/aw2-alsa.c
@@ -316,6 +316,8 @@ static int __devinit snd_aw2_create(struct snd_card *card,
 		return -ENOMEM;
 	}
 
+	/* (2) initialization of the chip hardware */
+	snd_aw2_saa7146_setup(&chip->saa7146, chip->iobase_virt);
 
 	if (request_irq(pci->irq, snd_aw2_saa7146_interrupt,
 			IRQF_SHARED, "Audiowerk2", chip)) {
@@ -329,8 +331,6 @@ static int __devinit snd_aw2_create(struct snd_card *card,
 	}
 	chip->irq = pci->irq;
 
-	/* (2) initialization of the chip hardware */
-	snd_aw2_saa7146_setup(&chip->saa7146, chip->iobase_virt);
 	err = snd_device_new(card, SNDRV_DEV_LOWLEVEL, chip, &ops);
 	if (err < 0) {
 		free_irq(chip->irq, (void *)chip);
diff --git a/virt/kvm/ioapic.c b/virt/kvm/ioapic.c
index 98778cb69c6e..1dcf9f3d1107 100644
--- a/virt/kvm/ioapic.c
+++ b/virt/kvm/ioapic.c
@@ -269,28 +269,9 @@ void kvm_ioapic_set_irq(struct kvm_ioapic *ioapic, int irq, int level)
 	}
 }
 
-static int get_eoi_gsi(struct kvm_ioapic *ioapic, int vector)
+static void __kvm_ioapic_update_eoi(struct kvm_ioapic *ioapic, int gsi)
 {
-	int i;
-
-	for (i = 0; i < IOAPIC_NUM_PINS; i++)
-		if (ioapic->redirtbl[i].fields.vector == vector)
-			return i;
-	return -1;
-}
-
-void kvm_ioapic_update_eoi(struct kvm *kvm, int vector)
-{
-	struct kvm_ioapic *ioapic = kvm->arch.vioapic;
 	union ioapic_redir_entry *ent;
-	int gsi;
-
-	gsi = get_eoi_gsi(ioapic, vector);
-	if (gsi == -1) {
-		printk(KERN_WARNING "Can't find redir item for %d EOI\n",
-		       vector);
-		return;
-	}
 
 	ent = &ioapic->redirtbl[gsi];
 	ASSERT(ent->fields.trig_mode == IOAPIC_LEVEL_TRIG);
@@ -300,6 +281,16 @@ void kvm_ioapic_update_eoi(struct kvm *kvm, int vector)
 		ioapic_deliver(ioapic, gsi);
 }
 
+void kvm_ioapic_update_eoi(struct kvm *kvm, int vector)
+{
+	struct kvm_ioapic *ioapic = kvm->arch.vioapic;
+	int i;
+
+	for (i = 0; i < IOAPIC_NUM_PINS; i++)
+		if (ioapic->redirtbl[i].fields.vector == vector)
+			__kvm_ioapic_update_eoi(ioapic, i);
+}
+
 static int ioapic_in_range(struct kvm_io_device *this, gpa_t addr)
 {
 	struct kvm_ioapic *ioapic = (struct kvm_ioapic *)this->private;