diff options
| author | Ingo Molnar <mingo@elte.hu> | 2008-06-25 12:28:47 +0200 | 
|---|---|---|
| committer | Ingo Molnar <mingo@elte.hu> | 2008-06-25 12:28:47 +0200 | 
| commit | f4628e644c34d9e6242ea18487b2ed58ee04e3eb (patch) | |
| tree | 3e1cd355fb6959d5b9f569342e4533a48e53aa11 /arch/x86/kernel | |
| parent | cb9aa97c21c59ad01c9514d7faf45dc166fba226 (diff) | |
| parent | 543cf4cb3fe6f6cae3651ba918b9c56200b257d0 (diff) | |
Merge branch 'linus' into tracing/mmiotrace-mergefixups (tag: tip-tracing-mmiotrace-mergefixups-2008-06-25_10.28_Wed)
Diffstat (limited to 'arch/x86/kernel')
| -rw-r--r-- | arch/x86/kernel/Makefile | 1 | ||||
| -rw-r--r-- | arch/x86/kernel/geode_32.c | 5 | ||||
| -rw-r--r-- | arch/x86/kernel/kvmclock.c | 89 | ||||
| -rw-r--r-- | arch/x86/kernel/process_32.c | 1 | ||||
| -rw-r--r-- | arch/x86/kernel/process_64.c | 1 | ||||
| -rw-r--r-- | arch/x86/kernel/pvclock.c | 141 | ||||
| -rw-r--r-- | arch/x86/kernel/setup_32.c | 10 | ||||
| -rw-r--r-- | arch/x86/kernel/tsc_32.c | 18 | 
8 files changed, 197 insertions, 69 deletions
| diff --git a/arch/x86/kernel/Makefile b/arch/x86/kernel/Makefile index 739d49acd2f1..5ff67208d4ae 100644 --- a/arch/x86/kernel/Makefile +++ b/arch/x86/kernel/Makefile @@ -90,6 +90,7 @@ obj-$(CONFIG_VMI)		+= vmi_32.o vmiclock_32.o  obj-$(CONFIG_KVM_GUEST)		+= kvm.o  obj-$(CONFIG_KVM_CLOCK)		+= kvmclock.o  obj-$(CONFIG_PARAVIRT)		+= paravirt.o paravirt_patch_$(BITS).o +obj-$(CONFIG_PARAVIRT_CLOCK)	+= pvclock.o  obj-$(CONFIG_PCSPKR_PLATFORM)	+= pcspeaker.o diff --git a/arch/x86/kernel/geode_32.c b/arch/x86/kernel/geode_32.c index e8edd63ab000..9b08e852fd1a 100644 --- a/arch/x86/kernel/geode_32.c +++ b/arch/x86/kernel/geode_32.c @@ -166,6 +166,8 @@ int geode_has_vsa2(void)  	static int has_vsa2 = -1;  	if (has_vsa2 == -1) { +		u16 val; +  		/*  		 * The VSA has virtual registers that we can query for a  		 * signature. @@ -173,7 +175,8 @@ int geode_has_vsa2(void)  		outw(VSA_VR_UNLOCK, VSA_VRC_INDEX);  		outw(VSA_VR_SIGNATURE, VSA_VRC_INDEX); -		has_vsa2 = (inw(VSA_VRC_DATA) == VSA_SIG); +		val = inw(VSA_VRC_DATA); +		has_vsa2 = (val == AMD_VSA_SIG || val == GSW_VSA_SIG);  	}  	return has_vsa2; diff --git a/arch/x86/kernel/kvmclock.c b/arch/x86/kernel/kvmclock.c index 08a30986d472..87edf1ceb1df 100644 --- a/arch/x86/kernel/kvmclock.c +++ b/arch/x86/kernel/kvmclock.c @@ -18,6 +18,7 @@  #include <linux/clocksource.h>  #include <linux/kvm_para.h> +#include <asm/pvclock.h>  #include <asm/arch_hooks.h>  #include <asm/msr.h>  #include <asm/apic.h> @@ -36,18 +37,9 @@ static int parse_no_kvmclock(char *arg)  early_param("no-kvmclock", parse_no_kvmclock);  /* The hypervisor will put information about time periodically here */ -static DEFINE_PER_CPU_SHARED_ALIGNED(struct kvm_vcpu_time_info, hv_clock); -#define get_clock(cpu, field) per_cpu(hv_clock, cpu).field +static DEFINE_PER_CPU_SHARED_ALIGNED(struct pvclock_vcpu_time_info, hv_clock); +static struct pvclock_wall_clock wall_clock; -static inline u64 kvm_get_delta(u64 last_tsc) -{ -	int cpu = smp_processor_id(); -	u64 delta 
= native_read_tsc() - last_tsc; -	return (delta * get_clock(cpu, tsc_to_system_mul)) >> KVM_SCALE; -} - -static struct kvm_wall_clock wall_clock; -static cycle_t kvm_clock_read(void);  /*   * The wallclock is the time of day when we booted. Since then, some time may   * have elapsed since the hypervisor wrote the data. So we try to account for @@ -55,64 +47,37 @@ static cycle_t kvm_clock_read(void);   */  static unsigned long kvm_get_wallclock(void)  { -	u32 wc_sec, wc_nsec; -	u64 delta; +	struct pvclock_vcpu_time_info *vcpu_time;  	struct timespec ts; -	int version, nsec;  	int low, high;  	low = (int)__pa(&wall_clock);  	high = ((u64)__pa(&wall_clock) >> 32); +	native_write_msr(MSR_KVM_WALL_CLOCK, low, high); -	delta = kvm_clock_read(); +	vcpu_time = &get_cpu_var(hv_clock); +	pvclock_read_wallclock(&wall_clock, vcpu_time, &ts); +	put_cpu_var(hv_clock); -	native_write_msr(MSR_KVM_WALL_CLOCK, low, high); -	do { -		version = wall_clock.wc_version; -		rmb(); -		wc_sec = wall_clock.wc_sec; -		wc_nsec = wall_clock.wc_nsec; -		rmb(); -	} while ((wall_clock.wc_version != version) || (version & 1)); - -	delta = kvm_clock_read() - delta; -	delta += wc_nsec; -	nsec = do_div(delta, NSEC_PER_SEC); -	set_normalized_timespec(&ts, wc_sec + delta, nsec); -	/* -	 * Of all mechanisms of time adjustment I've tested, this one -	 * was the champion! -	 */ -	return ts.tv_sec + 1; +	return ts.tv_sec;  }  static int kvm_set_wallclock(unsigned long now)  { -	return 0; +	return -1;  } -/* - * This is our read_clock function. The host puts an tsc timestamp each time - * it updates a new time. 
Without the tsc adjustment, we can have a situation - * in which a vcpu starts to run earlier (smaller system_time), but probes - * time later (compared to another vcpu), leading to backwards time - */  static cycle_t kvm_clock_read(void)  { -	u64 last_tsc, now; -	int cpu; +	struct pvclock_vcpu_time_info *src; +	cycle_t ret; -	preempt_disable(); -	cpu = smp_processor_id(); - -	last_tsc = get_clock(cpu, tsc_timestamp); -	now = get_clock(cpu, system_time); - -	now += kvm_get_delta(last_tsc); -	preempt_enable(); - -	return now; +	src = &get_cpu_var(hv_clock); +	ret = pvclock_clocksource_read(src); +	put_cpu_var(hv_clock); +	return ret;  } +  static struct clocksource kvm_clock = {  	.name = "kvm-clock",  	.read = kvm_clock_read, @@ -123,13 +88,14 @@ static struct clocksource kvm_clock = {  	.flags = CLOCK_SOURCE_IS_CONTINUOUS,  }; -static int kvm_register_clock(void) +static int kvm_register_clock(char *txt)  {  	int cpu = smp_processor_id();  	int low, high;  	low = (int)__pa(&per_cpu(hv_clock, cpu)) | 1;  	high = ((u64)__pa(&per_cpu(hv_clock, cpu)) >> 32); - +	printk(KERN_INFO "kvm-clock: cpu %d, msr %x:%x, %s\n", +	       cpu, high, low, txt);  	return native_write_msr_safe(MSR_KVM_SYSTEM_TIME, low, high);  } @@ -140,12 +106,20 @@ static void kvm_setup_secondary_clock(void)  	 * Now that the first cpu already had this clocksource initialized,  	 * we shouldn't fail.  	 */ -	WARN_ON(kvm_register_clock()); +	WARN_ON(kvm_register_clock("secondary cpu clock"));  	/* ok, done with our trickery, call native */  	setup_secondary_APIC_clock();  }  #endif +#ifdef CONFIG_SMP +void __init kvm_smp_prepare_boot_cpu(void) +{ +	WARN_ON(kvm_register_clock("primary cpu clock")); +	native_smp_prepare_boot_cpu(); +} +#endif +  /*   * After the clock is registered, the host will keep writing to the   * registered memory location. 
If the guest happens to shutdown, this memory @@ -174,7 +148,7 @@ void __init kvmclock_init(void)  		return;  	if (kvmclock && kvm_para_has_feature(KVM_FEATURE_CLOCKSOURCE)) { -		if (kvm_register_clock()) +		if (kvm_register_clock("boot clock"))  			return;  		pv_time_ops.get_wallclock = kvm_get_wallclock;  		pv_time_ops.set_wallclock = kvm_set_wallclock; @@ -182,6 +156,9 @@ void __init kvmclock_init(void)  #ifdef CONFIG_X86_LOCAL_APIC  		pv_apic_ops.setup_secondary_clock = kvm_setup_secondary_clock;  #endif +#ifdef CONFIG_SMP +		smp_ops.smp_prepare_boot_cpu = kvm_smp_prepare_boot_cpu; +#endif  		machine_ops.shutdown  = kvm_shutdown;  #ifdef CONFIG_KEXEC  		machine_ops.crash_shutdown  = kvm_crash_shutdown; diff --git a/arch/x86/kernel/process_32.c b/arch/x86/kernel/process_32.c index 61f7481c31dd..347a7aba8b16 100644 --- a/arch/x86/kernel/process_32.c +++ b/arch/x86/kernel/process_32.c @@ -336,6 +336,7 @@ void flush_thread(void)  	/*  	 * Forget coprocessor state..  	 */ +	tsk->fpu_counter = 0;  	clear_fpu(tsk);  	clear_used_math();  } diff --git a/arch/x86/kernel/process_64.c b/arch/x86/kernel/process_64.c index dc534f40c8d3..ea090e6cfe39 100644 --- a/arch/x86/kernel/process_64.c +++ b/arch/x86/kernel/process_64.c @@ -297,6 +297,7 @@ void flush_thread(void)  	/*  	 * Forget coprocessor state..  	 */ +	tsk->fpu_counter = 0;  	clear_fpu(tsk);  	clear_used_math();  } diff --git a/arch/x86/kernel/pvclock.c b/arch/x86/kernel/pvclock.c new file mode 100644 index 000000000000..05fbe9a0325a --- /dev/null +++ b/arch/x86/kernel/pvclock.c @@ -0,0 +1,141 @@ +/*  paravirtual clock -- common code used by kvm/xen + +    This program is free software; you can redistribute it and/or modify +    it under the terms of the GNU General Public License as published by +    the Free Software Foundation; either version 2 of the License, or +    (at your option) any later version. 
+ +    This program is distributed in the hope that it will be useful, +    but WITHOUT ANY WARRANTY; without even the implied warranty of +    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the +    GNU General Public License for more details. + +    You should have received a copy of the GNU General Public License +    along with this program; if not, write to the Free Software +    Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA +*/ + +#include <linux/kernel.h> +#include <linux/percpu.h> +#include <asm/pvclock.h> + +/* + * These are perodically updated + *    xen: magic shared_info page + *    kvm: gpa registered via msr + * and then copied here. + */ +struct pvclock_shadow_time { +	u64 tsc_timestamp;     /* TSC at last update of time vals.  */ +	u64 system_timestamp;  /* Time, in nanosecs, since boot.    */ +	u32 tsc_to_nsec_mul; +	int tsc_shift; +	u32 version; +}; + +/* + * Scale a 64-bit delta by scaling and multiplying by a 32-bit fraction, + * yielding a 64-bit result. + */ +static inline u64 scale_delta(u64 delta, u32 mul_frac, int shift) +{ +	u64 product; +#ifdef __i386__ +	u32 tmp1, tmp2; +#endif + +	if (shift < 0) +		delta >>= -shift; +	else +		delta <<= shift; + +#ifdef __i386__ +	__asm__ ( +		"mul  %5       ; " +		"mov  %4,%%eax ; " +		"mov  %%edx,%4 ; " +		"mul  %5       ; " +		"xor  %5,%5    ; " +		"add  %4,%%eax ; " +		"adc  %5,%%edx ; " +		: "=A" (product), "=r" (tmp1), "=r" (tmp2) +		: "a" ((u32)delta), "1" ((u32)(delta >> 32)), "2" (mul_frac) ); +#elif __x86_64__ +	__asm__ ( +		"mul %%rdx ; shrd $32,%%rdx,%%rax" +		: "=a" (product) : "0" (delta), "d" ((u64)mul_frac) ); +#else +#error implement me! 
+#endif + +	return product; +} + +static u64 pvclock_get_nsec_offset(struct pvclock_shadow_time *shadow) +{ +	u64 delta = native_read_tsc() - shadow->tsc_timestamp; +	return scale_delta(delta, shadow->tsc_to_nsec_mul, shadow->tsc_shift); +} + +/* + * Reads a consistent set of time-base values from hypervisor, + * into a shadow data area. + */ +static unsigned pvclock_get_time_values(struct pvclock_shadow_time *dst, +					struct pvclock_vcpu_time_info *src) +{ +	do { +		dst->version = src->version; +		rmb();		/* fetch version before data */ +		dst->tsc_timestamp     = src->tsc_timestamp; +		dst->system_timestamp  = src->system_time; +		dst->tsc_to_nsec_mul   = src->tsc_to_system_mul; +		dst->tsc_shift         = src->tsc_shift; +		rmb();		/* test version after fetching data */ +	} while ((src->version & 1) || (dst->version != src->version)); + +	return dst->version; +} + +cycle_t pvclock_clocksource_read(struct pvclock_vcpu_time_info *src) +{ +	struct pvclock_shadow_time shadow; +	unsigned version; +	cycle_t ret, offset; + +	do { +		version = pvclock_get_time_values(&shadow, src); +		barrier(); +		offset = pvclock_get_nsec_offset(&shadow); +		ret = shadow.system_timestamp + offset; +		barrier(); +	} while (version != src->version); + +	return ret; +} + +void pvclock_read_wallclock(struct pvclock_wall_clock *wall_clock, +			    struct pvclock_vcpu_time_info *vcpu_time, +			    struct timespec *ts) +{ +	u32 version; +	u64 delta; +	struct timespec now; + +	/* get wallclock at system boot */ +	do { +		version = wall_clock->version; +		rmb();		/* fetch version before time */ +		now.tv_sec  = wall_clock->sec; +		now.tv_nsec = wall_clock->nsec; +		rmb();		/* fetch time before checking version */ +	} while ((wall_clock->version & 1) || (version != wall_clock->version)); + +	delta = pvclock_clocksource_read(vcpu_time);	/* time since system boot */ +	delta += now.tv_sec * (u64)NSEC_PER_SEC + now.tv_nsec; + +	now.tv_nsec = do_div(delta, NSEC_PER_SEC); +	now.tv_sec = delta; + +	
set_normalized_timespec(ts, now.tv_sec, now.tv_nsec); +} diff --git a/arch/x86/kernel/setup_32.c b/arch/x86/kernel/setup_32.c index 2c5f8b213e86..5a2f8e063887 100644 --- a/arch/x86/kernel/setup_32.c +++ b/arch/x86/kernel/setup_32.c @@ -532,10 +532,16 @@ static void __init reserve_crashkernel(void)  					(unsigned long)(crash_size >> 20),  					(unsigned long)(crash_base >> 20),  					(unsigned long)(total_mem >> 20)); + +			if (reserve_bootmem(crash_base, crash_size, +					BOOTMEM_EXCLUSIVE) < 0) { +				printk(KERN_INFO "crashkernel reservation " +					"failed - memory is in use\n"); +				return; +			} +  			crashk_res.start = crash_base;  			crashk_res.end   = crash_base + crash_size - 1; -			reserve_bootmem(crash_base, crash_size, -					BOOTMEM_DEFAULT);  		} else  			printk(KERN_INFO "crashkernel reservation failed - "  					"you have to specify a base address\n"); diff --git a/arch/x86/kernel/tsc_32.c b/arch/x86/kernel/tsc_32.c index 068759db63dd..65b70637ad97 100644 --- a/arch/x86/kernel/tsc_32.c +++ b/arch/x86/kernel/tsc_32.c @@ -14,7 +14,10 @@  #include "mach_timer.h" -static int tsc_disabled; +/* native_sched_clock() is called before tsc_init(), so +   we must start with the TSC soft disabled to prevent +   erroneous rdtsc usage on !cpu_has_tsc processors */ +static int tsc_disabled = -1;  /*   * On some systems the TSC frequency does not @@ -402,25 +405,20 @@ void __init tsc_init(void)  {  	int cpu; -	if (!cpu_has_tsc || tsc_disabled) { -		/* Disable the TSC in case of !cpu_has_tsc */ -		tsc_disabled = 1; +	if (!cpu_has_tsc || tsc_disabled > 0)  		return; -	}  	cpu_khz = calculate_cpu_khz();  	tsc_khz = cpu_khz;  	if (!cpu_khz) {  		mark_tsc_unstable("could not calculate TSC khz"); -		/* -		 * We need to disable the TSC completely in this case -		 * to prevent sched_clock() from using it. 
-		 */ -		tsc_disabled = 1;  		return;  	} +	/* now allow native_sched_clock() to use rdtsc */ +	tsc_disabled = 0; +  	printk("Detected %lu.%03lu MHz processor.\n",  				(unsigned long)cpu_khz / 1000,  				(unsigned long)cpu_khz % 1000); | 
