Diffstat (limited to 'arch/x86')
123 files changed, 5405 insertions, 1803 deletions
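Among other things, the series below introduces text_poke_smp() (arch/x86/include/asm/alternative.h and arch/x86/kernel/alternative.c) for cross-modifying kernel text via stop_machine(), which the new optimized-kprobes support builds on. Below is a minimal, hypothetical sketch of how a caller could patch a 5-byte relative jump with it; the helper and call site are invented for illustration, while the opcode/size macros and the locking rules (get_online_cpus() plus text_mutex) come from the hunks that follow.

	/*
	 * Hypothetical example only -- not part of the patch series.
	 * Overwrites a 5-byte instruction at @site with "jmp target",
	 * using the text_poke_smp() interface added below.
	 */
	#include <linux/cpu.h>
	#include <linux/memory.h>	/* text_mutex */
	#include <linux/mutex.h>
	#include <linux/string.h>
	#include <linux/types.h>
	#include <asm/alternative.h>	/* text_poke_smp() */
	#include <asm/kprobes.h>	/* RELATIVEJUMP_* macros from this series */

	static void example_patch_jump(void *site, void *target)
	{
		u8 insn[RELATIVEJUMP_SIZE];
		s32 rel = (s32)((long)target - ((long)site + RELATIVEJUMP_SIZE));

		insn[0] = RELATIVEJUMP_OPCODE;		/* 0xe9 */
		memcpy(insn + 1, &rel, RELATIVE_ADDR_SIZE);

		get_online_cpus();
		mutex_lock(&text_mutex);
		text_poke_smp(site, insn, RELATIVEJUMP_SIZE);	/* stop_machine() based */
		mutex_unlock(&text_mutex);
		put_online_cpus();
	}

As the new kerneldoc in alternative.c spells out, text_poke_smp() relies on stop_machine() and therefore must not be used on code reachable from NMI/MCE handlers.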
| diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index 57ccdcec1469..0eacb1ffb421 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -31,6 +31,7 @@ config X86  	select ARCH_WANT_FRAME_POINTERS  	select HAVE_DMA_ATTRS  	select HAVE_KRETPROBES +	select HAVE_OPTPROBES  	select HAVE_FTRACE_MCOUNT_RECORD  	select HAVE_DYNAMIC_FTRACE  	select HAVE_FUNCTION_TRACER @@ -101,6 +102,9 @@ config ZONE_DMA  config SBUS  	bool +config NEED_DMA_MAP_STATE +       def_bool (X86_64 || DMAR || DMA_API_DEBUG) +  config GENERIC_ISA_DMA  	def_bool y @@ -392,8 +396,12 @@ config X86_ELAN  config X86_MRST         bool "Moorestown MID platform" +	depends on PCI +	depends on PCI_GOANY  	depends on X86_32  	depends on X86_EXTENDED_PLATFORM +	depends on X86_IO_APIC +	select APB_TIMER  	---help---  	  Moorestown is Intel's Low Power Intel Architecture (LPIA) based Moblin  	  Internet Device(MID) platform. Moorestown consists of two chips: @@ -428,6 +436,7 @@ config X86_32_NON_STANDARD  config X86_NUMAQ  	bool "NUMAQ (IBM/Sequent)"  	depends on X86_32_NON_STANDARD +	depends on PCI  	select NUMA  	select X86_MPPARSE  	---help--- @@ -628,6 +637,16 @@ config HPET_EMULATE_RTC  	def_bool y  	depends on HPET_TIMER && (RTC=y || RTC=m || RTC_DRV_CMOS=m || RTC_DRV_CMOS=y) +config APB_TIMER +       def_bool y if MRST +       prompt "Langwell APB Timer Support" if X86_MRST +       help +         APB timer is the replacement for 8254, HPET on X86 MID platforms. +         The APBT provides a stable time base on SMP +         systems, unlike the TSC, but it is more expensive to access, +         as it is off-chip. APB timers are always running regardless of CPU +         C states, they are used as per CPU clockevent device when possible. +  # Mark as embedded because too many people got it wrong.  # The code disables itself when not needed.  config DMI @@ -643,7 +662,7 @@ config GART_IOMMU  	bool "GART IOMMU support" if EMBEDDED  	default y  	select SWIOTLB -	depends on X86_64 && PCI +	depends on X86_64 && PCI && K8_NB  	---help---  	  Support for full DMA access of devices with 32bit memory access only  	  on systems with more than 3GB. 
This is usually needed for USB, @@ -2042,7 +2061,7 @@ endif # X86_32  config K8_NB  	def_bool y -	depends on AGP_AMD64 || (X86_64 && (GART_IOMMU || (PCI && NUMA))) +	depends on CPU_SUP_AMD && PCI  source "drivers/pcmcia/Kconfig" diff --git a/arch/x86/crypto/twofish-i586-asm_32.S b/arch/x86/crypto/twofish-i586-asm_32.S index 39b98ed2c1b9..575331cb2a8a 100644 --- a/arch/x86/crypto/twofish-i586-asm_32.S +++ b/arch/x86/crypto/twofish-i586-asm_32.S @@ -22,7 +22,7 @@  #include <asm/asm-offsets.h> -/* return adress at 0 */ +/* return address at 0 */  #define in_blk    12  /* input byte array address parameter*/  #define out_blk   8  /* output byte array address parameter*/ @@ -230,8 +230,8 @@ twofish_enc_blk:  	push    %edi  	mov	tfm + 16(%esp),	%ebp	/* abuse the base pointer: set new base bointer to the crypto tfm */ -	add	$crypto_tfm_ctx_offset, %ebp	/* ctx adress */ -	mov     in_blk+16(%esp),%edi	/* input adress in edi */ +	add	$crypto_tfm_ctx_offset, %ebp	/* ctx address */ +	mov     in_blk+16(%esp),%edi	/* input address in edi */  	mov	(%edi),		%eax  	mov	b_offset(%edi),	%ebx @@ -286,8 +286,8 @@ twofish_dec_blk:  	mov	tfm + 16(%esp),	%ebp	/* abuse the base pointer: set new base bointer to the crypto tfm */ -	add	$crypto_tfm_ctx_offset, %ebp	/* ctx adress */ -	mov     in_blk+16(%esp),%edi	/* input adress in edi */ +	add	$crypto_tfm_ctx_offset, %ebp	/* ctx address */ +	mov     in_blk+16(%esp),%edi	/* input address in edi */  	mov	(%edi),		%eax  	mov	b_offset(%edi),	%ebx diff --git a/arch/x86/crypto/twofish-x86_64-asm_64.S b/arch/x86/crypto/twofish-x86_64-asm_64.S index 35974a586615..573aa102542e 100644 --- a/arch/x86/crypto/twofish-x86_64-asm_64.S +++ b/arch/x86/crypto/twofish-x86_64-asm_64.S @@ -221,11 +221,11 @@  twofish_enc_blk:  	pushq    R1 -	/* %rdi contains the crypto tfm adress */ -	/* %rsi contains the output adress */ -	/* %rdx contains the input adress */ -	add	$crypto_tfm_ctx_offset, %rdi	/* set ctx adress */ -	/* ctx adress is moved to free one non-rex register +	/* %rdi contains the crypto tfm address */ +	/* %rsi contains the output address */ +	/* %rdx contains the input address */ +	add	$crypto_tfm_ctx_offset, %rdi	/* set ctx address */ +	/* ctx address is moved to free one non-rex register  	as target for the 8bit high operations */  	mov	%rdi,		%r11 @@ -274,11 +274,11 @@ twofish_enc_blk:  twofish_dec_blk:  	pushq    R1 -	/* %rdi contains the crypto tfm adress */ -	/* %rsi contains the output adress */ -	/* %rdx contains the input adress */ -	add	$crypto_tfm_ctx_offset, %rdi	/* set ctx adress */ -	/* ctx adress is moved to free one non-rex register +	/* %rdi contains the crypto tfm address */ +	/* %rsi contains the output address */ +	/* %rdx contains the input address */ +	add	$crypto_tfm_ctx_offset, %rdi	/* set ctx address */ +	/* ctx address is moved to free one non-rex register  	as target for the 8bit high operations */  	mov	%rdi,		%r11 diff --git a/arch/x86/ia32/ia32entry.S b/arch/x86/ia32/ia32entry.S index 53147ad85b96..59b4556a5b92 100644 --- a/arch/x86/ia32/ia32entry.S +++ b/arch/x86/ia32/ia32entry.S @@ -563,7 +563,7 @@ ia32_sys_call_table:  	.quad quiet_ni_syscall			/* old mpx syscall holder */  	.quad sys_setpgid  	.quad quiet_ni_syscall			/* old ulimit syscall holder */ -	.quad sys32_olduname +	.quad sys_olduname  	.quad sys_umask		/* 60 */  	.quad sys_chroot  	.quad compat_sys_ustat @@ -586,7 +586,7 @@ ia32_sys_call_table:  	.quad compat_sys_settimeofday  	.quad sys_getgroups16	/* 80 */  	.quad sys_setgroups16 -	.quad sys32_old_select +	.quad 
compat_sys_old_select  	.quad sys_symlink  	.quad sys_lstat  	.quad sys_readlink		/* 85 */ @@ -613,7 +613,7 @@ ia32_sys_call_table:  	.quad compat_sys_newstat  	.quad compat_sys_newlstat  	.quad compat_sys_newfstat -	.quad sys32_uname +	.quad sys_uname  	.quad stub32_iopl		/* 110 */  	.quad sys_vhangup  	.quad quiet_ni_syscall	/* old "idle" system call */ diff --git a/arch/x86/ia32/sys_ia32.c b/arch/x86/ia32/sys_ia32.c index 422572c77923..74c35431b7d8 100644 --- a/arch/x86/ia32/sys_ia32.c +++ b/arch/x86/ia32/sys_ia32.c @@ -143,7 +143,7 @@ asmlinkage long sys32_fstatat(unsigned int dfd, char __user *filename,   * block for parameter passing..   */ -struct mmap_arg_struct { +struct mmap_arg_struct32 {  	unsigned int addr;  	unsigned int len;  	unsigned int prot; @@ -152,9 +152,9 @@ struct mmap_arg_struct {  	unsigned int offset;  }; -asmlinkage long sys32_mmap(struct mmap_arg_struct __user *arg) +asmlinkage long sys32_mmap(struct mmap_arg_struct32 __user *arg)  { -	struct mmap_arg_struct a; +	struct mmap_arg_struct32 a;  	if (copy_from_user(&a, arg, sizeof(a)))  		return -EFAULT; @@ -332,24 +332,6 @@ asmlinkage long sys32_alarm(unsigned int seconds)  	return alarm_setitimer(seconds);  } -struct sel_arg_struct { -	unsigned int n; -	unsigned int inp; -	unsigned int outp; -	unsigned int exp; -	unsigned int tvp; -}; - -asmlinkage long sys32_old_select(struct sel_arg_struct __user *arg) -{ -	struct sel_arg_struct a; - -	if (copy_from_user(&a, arg, sizeof(a))) -		return -EFAULT; -	return compat_sys_select(a.n, compat_ptr(a.inp), compat_ptr(a.outp), -				 compat_ptr(a.exp), compat_ptr(a.tvp)); -} -  asmlinkage long sys32_waitpid(compat_pid_t pid, unsigned int *stat_addr,  			      int options)  { @@ -466,58 +448,6 @@ asmlinkage long sys32_sendfile(int out_fd, int in_fd,  	return ret;  } -asmlinkage long sys32_olduname(struct oldold_utsname __user *name) -{ -	char *arch = "x86_64"; -	int err; - -	if (!name) -		return -EFAULT; -	if (!access_ok(VERIFY_WRITE, name, sizeof(struct oldold_utsname))) -		return -EFAULT; - -	down_read(&uts_sem); - -	err = __copy_to_user(&name->sysname, &utsname()->sysname, -			     __OLD_UTS_LEN); -	err |= __put_user(0, name->sysname+__OLD_UTS_LEN); -	err |= __copy_to_user(&name->nodename, &utsname()->nodename, -			      __OLD_UTS_LEN); -	err |= __put_user(0, name->nodename+__OLD_UTS_LEN); -	err |= __copy_to_user(&name->release, &utsname()->release, -			      __OLD_UTS_LEN); -	err |= __put_user(0, name->release+__OLD_UTS_LEN); -	err |= __copy_to_user(&name->version, &utsname()->version, -			      __OLD_UTS_LEN); -	err |= __put_user(0, name->version+__OLD_UTS_LEN); - -	if (personality(current->personality) == PER_LINUX32) -		arch = "i686"; - -	err |= __copy_to_user(&name->machine, arch, strlen(arch) + 1); - -	up_read(&uts_sem); - -	err = err ? -EFAULT : 0; - -	return err; -} - -long sys32_uname(struct old_utsname __user *name) -{ -	int err; - -	if (!name) -		return -EFAULT; -	down_read(&uts_sem); -	err = copy_to_user(name, utsname(), sizeof(*name)); -	up_read(&uts_sem); -	if (personality(current->personality) == PER_LINUX32) -		err |= copy_to_user(&name->machine, "i686", 5); - -	return err ? 
-EFAULT : 0; -} -  asmlinkage long sys32_execve(char __user *name, compat_uptr_t __user *argv,  			     compat_uptr_t __user *envp, struct pt_regs *regs)  { diff --git a/arch/x86/include/asm/Kbuild b/arch/x86/include/asm/Kbuild index 9f828f87ca35..493092efaa3b 100644 --- a/arch/x86/include/asm/Kbuild +++ b/arch/x86/include/asm/Kbuild @@ -11,6 +11,7 @@ header-y += sigcontext32.h  header-y += ucontext.h  header-y += processor-flags.h  header-y += hw_breakpoint.h +header-y += hyperv.h  unifdef-y += e820.h  unifdef-y += ist.h diff --git a/arch/x86/include/asm/alternative.h b/arch/x86/include/asm/alternative.h index f1e253ceba4b..b09ec55650b3 100644 --- a/arch/x86/include/asm/alternative.h +++ b/arch/x86/include/asm/alternative.h @@ -165,10 +165,12 @@ static inline void apply_paravirt(struct paravirt_patch_site *start,   * invalid instruction possible) or if the instructions are changed from a   * consistent state to another consistent state atomically.   * More care must be taken when modifying code in the SMP case because of - * Intel's errata. + * Intel's errata. text_poke_smp() takes care that errata, but still + * doesn't support NMI/MCE handler code modifying.   * On the local CPU you need to be protected again NMI or MCE handlers seeing an   * inconsistent instruction while you patch.   */  extern void *text_poke(void *addr, const void *opcode, size_t len); +extern void *text_poke_smp(void *addr, const void *opcode, size_t len);  #endif /* _ASM_X86_ALTERNATIVE_H */ diff --git a/arch/x86/include/asm/apb_timer.h b/arch/x86/include/asm/apb_timer.h new file mode 100644 index 000000000000..c74a2eebe570 --- /dev/null +++ b/arch/x86/include/asm/apb_timer.h @@ -0,0 +1,70 @@ +/* + * apb_timer.h: Driver for Langwell APB timer based on Synopsis DesignWare + * + * (C) Copyright 2009 Intel Corporation + * Author: Jacob Pan (jacob.jun.pan@intel.com) + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; version 2 + * of the License. 
+ * + * Note: + */ + +#ifndef ASM_X86_APBT_H +#define ASM_X86_APBT_H +#include <linux/sfi.h> + +#ifdef CONFIG_APB_TIMER + +/* Langwell DW APB timer registers */ +#define APBTMR_N_LOAD_COUNT    0x00 +#define APBTMR_N_CURRENT_VALUE 0x04 +#define APBTMR_N_CONTROL       0x08 +#define APBTMR_N_EOI           0x0c +#define APBTMR_N_INT_STATUS    0x10 + +#define APBTMRS_INT_STATUS     0xa0 +#define APBTMRS_EOI            0xa4 +#define APBTMRS_RAW_INT_STATUS 0xa8 +#define APBTMRS_COMP_VERSION   0xac +#define APBTMRS_REG_SIZE       0x14 + +/* register bits */ +#define APBTMR_CONTROL_ENABLE  (1<<0) +#define APBTMR_CONTROL_MODE_PERIODIC   (1<<1) /*1: periodic 0:free running */ +#define APBTMR_CONTROL_INT     (1<<2) + +/* default memory mapped register base */ +#define LNW_SCU_ADDR           0xFF100000 +#define LNW_EXT_TIMER_OFFSET   0x1B800 +#define APBT_DEFAULT_BASE      (LNW_SCU_ADDR+LNW_EXT_TIMER_OFFSET) +#define LNW_EXT_TIMER_PGOFFSET         0x800 + +/* APBT clock speed range from PCLK to fabric base, 25-100MHz */ +#define APBT_MAX_FREQ          50 +#define APBT_MIN_FREQ          1 +#define APBT_MMAP_SIZE         1024 + +#define APBT_DEV_USED  1 + +extern void apbt_time_init(void); +extern struct clock_event_device *global_clock_event; +extern unsigned long apbt_quick_calibrate(void); +extern int arch_setup_apbt_irqs(int irq, int trigger, int mask, int cpu); +extern void apbt_setup_secondary_clock(void); +extern unsigned int boot_cpu_id; +extern int disable_apbt_percpu; + +extern struct sfi_timer_table_entry *sfi_get_mtmr(int hint); +extern void sfi_free_mtmr(struct sfi_timer_table_entry *mtmr); +extern int sfi_mtimer_num; + +#else /* CONFIG_APB_TIMER */ + +static inline unsigned long apbt_quick_calibrate(void) {return 0; } +static inline void apbt_time_init(void) {return 0; } + +#endif +#endif /* ASM_X86_APBT_H */ diff --git a/arch/x86/include/asm/compat.h b/arch/x86/include/asm/compat.h index 9a9c7bdc923d..306160e58b48 100644 --- a/arch/x86/include/asm/compat.h +++ b/arch/x86/include/asm/compat.h @@ -8,7 +8,8 @@  #include <linux/sched.h>  #include <asm/user32.h> -#define COMPAT_USER_HZ	100 +#define COMPAT_USER_HZ		100 +#define COMPAT_UTS_MACHINE	"i686\0\0"  typedef u32		compat_size_t;  typedef s32		compat_ssize_t; diff --git a/arch/x86/include/asm/fixmap.h b/arch/x86/include/asm/fixmap.h index 635f03bb4995..d07b44f7d1dc 100644 --- a/arch/x86/include/asm/fixmap.h +++ b/arch/x86/include/asm/fixmap.h @@ -82,6 +82,9 @@ enum fixed_addresses {  #endif  	FIX_DBGP_BASE,  	FIX_EARLYCON_MEM_BASE, +#ifdef CONFIG_PROVIDE_OHCI1394_DMA_INIT +	FIX_OHCI1394_BASE, +#endif  #ifdef CONFIG_X86_LOCAL_APIC  	FIX_APIC_BASE,	/* local (CPU) APIC) -- required for SMP or not */  #endif @@ -132,9 +135,6 @@ enum fixed_addresses {  	   (__end_of_permanent_fixed_addresses & (TOTAL_FIX_BTMAPS - 1))  	 : __end_of_permanent_fixed_addresses,  	FIX_BTMAP_BEGIN = FIX_BTMAP_END + TOTAL_FIX_BTMAPS - 1, -#ifdef CONFIG_PROVIDE_OHCI1394_DMA_INIT -	FIX_OHCI1394_BASE, -#endif  #ifdef CONFIG_X86_32  	FIX_WP_TEST,  #endif diff --git a/arch/x86/include/asm/hw_breakpoint.h b/arch/x86/include/asm/hw_breakpoint.h index 0675a7c4c20e..2a1bd8f4f23a 100644 --- a/arch/x86/include/asm/hw_breakpoint.h +++ b/arch/x86/include/asm/hw_breakpoint.h @@ -10,7 +10,6 @@   * (display/resolving)   */  struct arch_hw_breakpoint { -	char		*name; /* Contains name of the symbol to set bkpt */  	unsigned long	address;  	u8		len;  	u8		type; diff --git a/arch/x86/include/asm/hw_irq.h b/arch/x86/include/asm/hw_irq.h index eeac829a0f44..46c0fe05f230 100644 --- 
a/arch/x86/include/asm/hw_irq.h +++ b/arch/x86/include/asm/hw_irq.h @@ -53,13 +53,6 @@ extern void threshold_interrupt(void);  extern void call_function_interrupt(void);  extern void call_function_single_interrupt(void); -/* PIC specific functions */ -extern void disable_8259A_irq(unsigned int irq); -extern void enable_8259A_irq(unsigned int irq); -extern int i8259A_irq_pending(unsigned int irq); -extern void make_8259A_irq(unsigned int irq); -extern void init_8259A(int aeoi); -  /* IOAPIC */  #define IO_APIC_IRQ(x) (((x) >= NR_IRQS_LEGACY) || ((1<<(x)) & io_apic_irqs))  extern unsigned long io_apic_irqs; @@ -140,6 +133,7 @@ extern void (*__initconst interrupt[NR_VECTORS-FIRST_EXTERNAL_VECTOR])(void);  typedef int vector_irq_t[NR_VECTORS];  DECLARE_PER_CPU(vector_irq_t, vector_irq); +extern void setup_vector_irq(int cpu);  #ifdef CONFIG_X86_IO_APIC  extern void lock_vector_lock(void); diff --git a/arch/x86/include/asm/hyperv.h b/arch/x86/include/asm/hyperv.h new file mode 100644 index 000000000000..e153a2b3889a --- /dev/null +++ b/arch/x86/include/asm/hyperv.h @@ -0,0 +1,186 @@ +#ifndef _ASM_X86_KVM_HYPERV_H +#define _ASM_X86_KVM_HYPERV_H + +#include <linux/types.h> + +/* + * The below CPUID leaves are present if VersionAndFeatures.HypervisorPresent + * is set by CPUID(HvCpuIdFunctionVersionAndFeatures). + */ +#define HYPERV_CPUID_VENDOR_AND_MAX_FUNCTIONS	0x40000000 +#define HYPERV_CPUID_INTERFACE			0x40000001 +#define HYPERV_CPUID_VERSION			0x40000002 +#define HYPERV_CPUID_FEATURES			0x40000003 +#define HYPERV_CPUID_ENLIGHTMENT_INFO		0x40000004 +#define HYPERV_CPUID_IMPLEMENT_LIMITS		0x40000005 + +/* + * Feature identification. EAX indicates which features are available + * to the partition based upon the current partition privileges. + */ + +/* VP Runtime (HV_X64_MSR_VP_RUNTIME) available */ +#define HV_X64_MSR_VP_RUNTIME_AVAILABLE		(1 << 0) +/* Partition Reference Counter (HV_X64_MSR_TIME_REF_COUNT) available*/ +#define HV_X64_MSR_TIME_REF_COUNT_AVAILABLE	(1 << 1) +/* + * Basic SynIC MSRs (HV_X64_MSR_SCONTROL through HV_X64_MSR_EOM + * and HV_X64_MSR_SINT0 through HV_X64_MSR_SINT15) available + */ +#define HV_X64_MSR_SYNIC_AVAILABLE		(1 << 2) +/* + * Synthetic Timer MSRs (HV_X64_MSR_STIMER0_CONFIG through + * HV_X64_MSR_STIMER3_COUNT) available + */ +#define HV_X64_MSR_SYNTIMER_AVAILABLE		(1 << 3) +/* + * APIC access MSRs (HV_X64_MSR_EOI, HV_X64_MSR_ICR and HV_X64_MSR_TPR) + * are available + */ +#define HV_X64_MSR_APIC_ACCESS_AVAILABLE	(1 << 4) +/* Hypercall MSRs (HV_X64_MSR_GUEST_OS_ID and HV_X64_MSR_HYPERCALL) available*/ +#define HV_X64_MSR_HYPERCALL_AVAILABLE		(1 << 5) +/* Access virtual processor index MSR (HV_X64_MSR_VP_INDEX) available*/ +#define HV_X64_MSR_VP_INDEX_AVAILABLE		(1 << 6) +/* Virtual system reset MSR (HV_X64_MSR_RESET) is available*/ +#define HV_X64_MSR_RESET_AVAILABLE		(1 << 7) + /* +  * Access statistics pages MSRs (HV_X64_MSR_STATS_PARTITION_RETAIL_PAGE, +  * HV_X64_MSR_STATS_PARTITION_INTERNAL_PAGE, HV_X64_MSR_STATS_VP_RETAIL_PAGE, +  * HV_X64_MSR_STATS_VP_INTERNAL_PAGE) available +  */ +#define HV_X64_MSR_STAT_PAGES_AVAILABLE		(1 << 8) + +/* + * Feature identification: EBX indicates which flags were specified at + * partition creation. The format is the same as the partition creation + * flag structure defined in section Partition Creation Flags. 
+ */ +#define HV_X64_CREATE_PARTITIONS		(1 << 0) +#define HV_X64_ACCESS_PARTITION_ID		(1 << 1) +#define HV_X64_ACCESS_MEMORY_POOL		(1 << 2) +#define HV_X64_ADJUST_MESSAGE_BUFFERS		(1 << 3) +#define HV_X64_POST_MESSAGES			(1 << 4) +#define HV_X64_SIGNAL_EVENTS			(1 << 5) +#define HV_X64_CREATE_PORT			(1 << 6) +#define HV_X64_CONNECT_PORT			(1 << 7) +#define HV_X64_ACCESS_STATS			(1 << 8) +#define HV_X64_DEBUGGING			(1 << 11) +#define HV_X64_CPU_POWER_MANAGEMENT		(1 << 12) +#define HV_X64_CONFIGURE_PROFILER		(1 << 13) + +/* + * Feature identification. EDX indicates which miscellaneous features + * are available to the partition. + */ +/* The MWAIT instruction is available (per section MONITOR / MWAIT) */ +#define HV_X64_MWAIT_AVAILABLE				(1 << 0) +/* Guest debugging support is available */ +#define HV_X64_GUEST_DEBUGGING_AVAILABLE		(1 << 1) +/* Performance Monitor support is available*/ +#define HV_X64_PERF_MONITOR_AVAILABLE			(1 << 2) +/* Support for physical CPU dynamic partitioning events is available*/ +#define HV_X64_CPU_DYNAMIC_PARTITIONING_AVAILABLE	(1 << 3) +/* + * Support for passing hypercall input parameter block via XMM + * registers is available + */ +#define HV_X64_HYPERCALL_PARAMS_XMM_AVAILABLE		(1 << 4) +/* Support for a virtual guest idle state is available */ +#define HV_X64_GUEST_IDLE_STATE_AVAILABLE		(1 << 5) + +/* + * Implementation recommendations. Indicates which behaviors the hypervisor + * recommends the OS implement for optimal performance. + */ + /* +  * Recommend using hypercall for address space switches rather +  * than MOV to CR3 instruction +  */ +#define HV_X64_MWAIT_RECOMMENDED		(1 << 0) +/* Recommend using hypercall for local TLB flushes rather + * than INVLPG or MOV to CR3 instructions */ +#define HV_X64_LOCAL_TLB_FLUSH_RECOMMENDED	(1 << 1) +/* + * Recommend using hypercall for remote TLB flushes rather + * than inter-processor interrupts + */ +#define HV_X64_REMOTE_TLB_FLUSH_RECOMMENDED	(1 << 2) +/* + * Recommend using MSRs for accessing APIC registers + * EOI, ICR and TPR rather than their memory-mapped counterparts + */ +#define HV_X64_APIC_ACCESS_RECOMMENDED		(1 << 3) +/* Recommend using the hypervisor-provided MSR to initiate a system RESET */ +#define HV_X64_SYSTEM_RESET_RECOMMENDED		(1 << 4) +/* + * Recommend using relaxed timing for this partition. If used, + * the VM should disable any watchdog timeouts that rely on the + * timely delivery of external interrupts + */ +#define HV_X64_RELAXED_TIMING_RECOMMENDED	(1 << 5) + +/* MSR used to identify the guest OS. */ +#define HV_X64_MSR_GUEST_OS_ID			0x40000000 + +/* MSR used to setup pages used to communicate with the hypervisor. */ +#define HV_X64_MSR_HYPERCALL			0x40000001 + +/* MSR used to provide vcpu index */ +#define HV_X64_MSR_VP_INDEX			0x40000002 + +/* Define the virtual APIC registers */ +#define HV_X64_MSR_EOI				0x40000070 +#define HV_X64_MSR_ICR				0x40000071 +#define HV_X64_MSR_TPR				0x40000072 +#define HV_X64_MSR_APIC_ASSIST_PAGE		0x40000073 + +/* Define synthetic interrupt controller model specific registers. 
*/ +#define HV_X64_MSR_SCONTROL			0x40000080 +#define HV_X64_MSR_SVERSION			0x40000081 +#define HV_X64_MSR_SIEFP			0x40000082 +#define HV_X64_MSR_SIMP				0x40000083 +#define HV_X64_MSR_EOM				0x40000084 +#define HV_X64_MSR_SINT0			0x40000090 +#define HV_X64_MSR_SINT1			0x40000091 +#define HV_X64_MSR_SINT2			0x40000092 +#define HV_X64_MSR_SINT3			0x40000093 +#define HV_X64_MSR_SINT4			0x40000094 +#define HV_X64_MSR_SINT5			0x40000095 +#define HV_X64_MSR_SINT6			0x40000096 +#define HV_X64_MSR_SINT7			0x40000097 +#define HV_X64_MSR_SINT8			0x40000098 +#define HV_X64_MSR_SINT9			0x40000099 +#define HV_X64_MSR_SINT10			0x4000009A +#define HV_X64_MSR_SINT11			0x4000009B +#define HV_X64_MSR_SINT12			0x4000009C +#define HV_X64_MSR_SINT13			0x4000009D +#define HV_X64_MSR_SINT14			0x4000009E +#define HV_X64_MSR_SINT15			0x4000009F + + +#define HV_X64_MSR_HYPERCALL_ENABLE		0x00000001 +#define HV_X64_MSR_HYPERCALL_PAGE_ADDRESS_SHIFT	12 +#define HV_X64_MSR_HYPERCALL_PAGE_ADDRESS_MASK	\ +		(~((1ull << HV_X64_MSR_HYPERCALL_PAGE_ADDRESS_SHIFT) - 1)) + +/* Declare the various hypercall operations. */ +#define HV_X64_HV_NOTIFY_LONG_SPIN_WAIT		0x0008 + +#define HV_X64_MSR_APIC_ASSIST_PAGE_ENABLE		0x00000001 +#define HV_X64_MSR_APIC_ASSIST_PAGE_ADDRESS_SHIFT	12 +#define HV_X64_MSR_APIC_ASSIST_PAGE_ADDRESS_MASK	\ +		(~((1ull << HV_X64_MSR_APIC_ASSIST_PAGE_ADDRESS_SHIFT) - 1)) + +#define HV_PROCESSOR_POWER_STATE_C0		0 +#define HV_PROCESSOR_POWER_STATE_C1		1 +#define HV_PROCESSOR_POWER_STATE_C2		2 +#define HV_PROCESSOR_POWER_STATE_C3		3 + +/* hypercall status code */ +#define HV_STATUS_SUCCESS			0 +#define HV_STATUS_INVALID_HYPERCALL_CODE	2 +#define HV_STATUS_INVALID_HYPERCALL_INPUT	3 +#define HV_STATUS_INVALID_ALIGNMENT		4 + +#endif diff --git a/arch/x86/include/asm/i8259.h b/arch/x86/include/asm/i8259.h index 7ec65b18085d..1655147646aa 100644 --- a/arch/x86/include/asm/i8259.h +++ b/arch/x86/include/asm/i8259.h @@ -26,11 +26,6 @@ extern unsigned int cached_irq_mask;  extern raw_spinlock_t i8259A_lock; -extern void init_8259A(int auto_eoi); -extern void enable_8259A_irq(unsigned int irq); -extern void disable_8259A_irq(unsigned int irq); -extern unsigned int startup_8259A_irq(unsigned int irq); -  /* the PIC may need a careful delay on some platforms, hence specific calls */  static inline unsigned char inb_pic(unsigned int port)  { @@ -57,7 +52,17 @@ static inline void outb_pic(unsigned char value, unsigned int port)  extern struct irq_chip i8259A_chip; -extern void mask_8259A(void); -extern void unmask_8259A(void); +struct legacy_pic { +	int nr_legacy_irqs; +	struct irq_chip *chip; +	void (*mask_all)(void); +	void (*restore_mask)(void); +	void (*init)(int auto_eoi); +	int (*irq_pending)(unsigned int irq); +	void (*make_irq)(unsigned int irq); +}; + +extern struct legacy_pic *legacy_pic; +extern struct legacy_pic null_legacy_pic;  #endif /* _ASM_X86_I8259_H */ diff --git a/arch/x86/include/asm/io_apic.h b/arch/x86/include/asm/io_apic.h index 5f61f6e0ffdd..35832a03a515 100644 --- a/arch/x86/include/asm/io_apic.h +++ b/arch/x86/include/asm/io_apic.h @@ -143,8 +143,6 @@ extern int noioapicreroute;  /* 1 if the timer IRQ uses the '8259A Virtual Wire' mode */  extern int timer_through_8259; -extern void io_apic_disable_legacy(void); -  /*   * If we use the IO-APIC for IRQ routing, disable automatic   * assignment of PCI IRQ's. 
@@ -189,6 +187,7 @@ extern struct mp_ioapic_gsi  mp_gsi_routing[];  int mp_find_ioapic(int gsi);  int mp_find_ioapic_pin(int ioapic, int gsi);  void __init mp_register_ioapic(int id, u32 address, u32 gsi_base); +extern void __init pre_init_apic_IRQ0(void);  #else  /* !CONFIG_X86_IO_APIC */ @@ -198,7 +197,11 @@ static const int timer_through_8259 = 0;  static inline void ioapic_init_mappings(void)	{ }  static inline void ioapic_insert_resources(void) { }  static inline void probe_nr_irqs_gsi(void)	{ } +static inline int mp_find_ioapic(int gsi) { return 0; } +struct io_apic_irq_attr; +static inline int io_apic_set_pci_routing(struct device *dev, int irq, +		 struct io_apic_irq_attr *irq_attr) { return 0; }  #endif  #endif /* _ASM_X86_IO_APIC_H */ diff --git a/arch/x86/include/asm/irq.h b/arch/x86/include/asm/irq.h index 262292729fc4..5458380b6ef8 100644 --- a/arch/x86/include/asm/irq.h +++ b/arch/x86/include/asm/irq.h @@ -48,6 +48,5 @@ extern DECLARE_BITMAP(used_vectors, NR_VECTORS);  extern int vector_used_by_percpu_irq(unsigned int vector);  extern void init_ISA_irqs(void); -extern int nr_legacy_irqs;  #endif /* _ASM_X86_IRQ_H */ diff --git a/arch/x86/include/asm/kprobes.h b/arch/x86/include/asm/kprobes.h index 4fe681de1e76..4ffa345a8ccb 100644 --- a/arch/x86/include/asm/kprobes.h +++ b/arch/x86/include/asm/kprobes.h @@ -32,7 +32,10 @@ struct kprobe;  typedef u8 kprobe_opcode_t;  #define BREAKPOINT_INSTRUCTION	0xcc -#define RELATIVEJUMP_INSTRUCTION 0xe9 +#define RELATIVEJUMP_OPCODE 0xe9 +#define RELATIVEJUMP_SIZE 5 +#define RELATIVECALL_OPCODE 0xe8 +#define RELATIVE_ADDR_SIZE 4  #define MAX_INSN_SIZE 16  #define MAX_STACK_SIZE 64  #define MIN_STACK_SIZE(ADDR)					       \ @@ -44,6 +47,17 @@ typedef u8 kprobe_opcode_t;  #define flush_insn_slot(p)	do { } while (0) +/* optinsn template addresses */ +extern kprobe_opcode_t optprobe_template_entry; +extern kprobe_opcode_t optprobe_template_val; +extern kprobe_opcode_t optprobe_template_call; +extern kprobe_opcode_t optprobe_template_end; +#define MAX_OPTIMIZED_LENGTH (MAX_INSN_SIZE + RELATIVE_ADDR_SIZE) +#define MAX_OPTINSN_SIZE 				\ +	(((unsigned long)&optprobe_template_end -	\ +	  (unsigned long)&optprobe_template_entry) +	\ +	 MAX_OPTIMIZED_LENGTH + RELATIVEJUMP_SIZE) +  extern const int kretprobe_blacklist_size;  void arch_remove_kprobe(struct kprobe *p); @@ -64,6 +78,21 @@ struct arch_specific_insn {  	int boostable;  }; +struct arch_optimized_insn { +	/* copy of the original instructions */ +	kprobe_opcode_t copied_insn[RELATIVE_ADDR_SIZE]; +	/* detour code buffer */ +	kprobe_opcode_t *insn; +	/* the size of instructions copied to detour code buffer */ +	size_t size; +}; + +/* Return true (!0) if optinsn is prepared for optimization. */ +static inline int arch_prepared_optinsn(struct arch_optimized_insn *optinsn) +{ +	return optinsn->size; +} +  struct prev_kprobe {  	struct kprobe *kp;  	unsigned long status; diff --git a/arch/x86/include/asm/kvm_emulate.h b/arch/x86/include/asm/kvm_emulate.h index 7c18e1230f54..7a6f54fa13ba 100644 --- a/arch/x86/include/asm/kvm_emulate.h +++ b/arch/x86/include/asm/kvm_emulate.h @@ -54,13 +54,23 @@ struct x86_emulate_ctxt;  struct x86_emulate_ops {  	/*  	 * read_std: Read bytes of standard (non-emulated/special) memory. -	 *           Used for instruction fetch, stack operations, and others. +	 *           Used for descriptor reading.  	 *  @addr:  [IN ] Linear address from which to read.  	 *  @val:   [OUT] Value read from memory, zero-extended to 'u_long'.  	 
*  @bytes: [IN ] Number of bytes to read from memory.  	 */  	int (*read_std)(unsigned long addr, void *val, -			unsigned int bytes, struct kvm_vcpu *vcpu); +			unsigned int bytes, struct kvm_vcpu *vcpu, u32 *error); + +	/* +	 * fetch: Read bytes of standard (non-emulated/special) memory. +	 *        Used for instruction fetch. +	 *  @addr:  [IN ] Linear address from which to read. +	 *  @val:   [OUT] Value read from memory, zero-extended to 'u_long'. +	 *  @bytes: [IN ] Number of bytes to read from memory. +	 */ +	int (*fetch)(unsigned long addr, void *val, +			unsigned int bytes, struct kvm_vcpu *vcpu, u32 *error);  	/*  	 * read_emulated: Read bytes from emulated/special memory area. @@ -74,7 +84,7 @@ struct x86_emulate_ops {  			     struct kvm_vcpu *vcpu);  	/* -	 * write_emulated: Read bytes from emulated/special memory area. +	 * write_emulated: Write bytes to emulated/special memory area.  	 *  @addr:  [IN ] Linear address to which to write.  	 *  @val:   [IN ] Value to write to memory (low-order bytes used as  	 *                required). @@ -168,6 +178,7 @@ struct x86_emulate_ctxt {  /* Execution mode, passed to the emulator. */  #define X86EMUL_MODE_REAL     0	/* Real mode.             */ +#define X86EMUL_MODE_VM86     1	/* Virtual 8086 mode.     */  #define X86EMUL_MODE_PROT16   2	/* 16-bit protected mode. */  #define X86EMUL_MODE_PROT32   4	/* 32-bit protected mode. */  #define X86EMUL_MODE_PROT64   8	/* 64-bit (long) mode.    */ diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index 4f865e8b8540..06d9e79ca37d 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -25,7 +25,7 @@  #include <asm/mtrr.h>  #include <asm/msr-index.h> -#define KVM_MAX_VCPUS 16 +#define KVM_MAX_VCPUS 64  #define KVM_MEMORY_SLOTS 32  /* memory slots that does not exposed to userspace */  #define KVM_PRIVATE_MEM_SLOTS 4 @@ -38,19 +38,6 @@  #define CR3_L_MODE_RESERVED_BITS (CR3_NONPAE_RESERVED_BITS |	\  				  0xFFFFFF0000000000ULL) -#define KVM_GUEST_CR0_MASK_UNRESTRICTED_GUEST				\ -	(X86_CR0_WP | X86_CR0_NE | X86_CR0_NW | X86_CR0_CD) -#define KVM_GUEST_CR0_MASK						\ -	(KVM_GUEST_CR0_MASK_UNRESTRICTED_GUEST | X86_CR0_PG | X86_CR0_PE) -#define KVM_VM_CR0_ALWAYS_ON_UNRESTRICTED_GUEST				\ -	(X86_CR0_WP | X86_CR0_NE | X86_CR0_TS | X86_CR0_MP) -#define KVM_VM_CR0_ALWAYS_ON						\ -	(KVM_VM_CR0_ALWAYS_ON_UNRESTRICTED_GUEST | X86_CR0_PG | X86_CR0_PE) -#define KVM_GUEST_CR4_MASK						\ -	(X86_CR4_VME | X86_CR4_PSE | X86_CR4_PAE | X86_CR4_PGE | X86_CR4_VMXE) -#define KVM_PMODE_VM_CR4_ALWAYS_ON (X86_CR4_PAE | X86_CR4_VMXE) -#define KVM_RMODE_VM_CR4_ALWAYS_ON (X86_CR4_VME | X86_CR4_PAE | X86_CR4_VMXE) -  #define INVALID_PAGE (~(hpa_t)0)  #define UNMAPPED_GVA (~(gpa_t)0) @@ -256,7 +243,8 @@ struct kvm_mmu {  	void (*new_cr3)(struct kvm_vcpu *vcpu);  	int (*page_fault)(struct kvm_vcpu *vcpu, gva_t gva, u32 err);  	void (*free)(struct kvm_vcpu *vcpu); -	gpa_t (*gva_to_gpa)(struct kvm_vcpu *vcpu, gva_t gva); +	gpa_t (*gva_to_gpa)(struct kvm_vcpu *vcpu, gva_t gva, u32 access, +			    u32 *error);  	void (*prefetch_page)(struct kvm_vcpu *vcpu,  			      struct kvm_mmu_page *page);  	int (*sync_page)(struct kvm_vcpu *vcpu, @@ -282,13 +270,15 @@ struct kvm_vcpu_arch {  	u32 regs_dirty;  	unsigned long cr0; +	unsigned long cr0_guest_owned_bits;  	unsigned long cr2;  	unsigned long cr3;  	unsigned long cr4; +	unsigned long cr4_guest_owned_bits;  	unsigned long cr8;  	u32 hflags;  	u64 pdptrs[4]; /* pae */ -	u64 shadow_efer; +	u64 efer;  	u64 apic_base;  	
struct kvm_lapic *apic;    /* kernel irqchip context */  	int32_t apic_arb_prio; @@ -374,17 +364,27 @@ struct kvm_vcpu_arch {  	/* used for guest single stepping over the given code position */  	u16 singlestep_cs;  	unsigned long singlestep_rip; +	/* fields used by HYPER-V emulation */ +	u64 hv_vapic;  };  struct kvm_mem_alias {  	gfn_t base_gfn;  	unsigned long npages;  	gfn_t target_gfn; +#define KVM_ALIAS_INVALID     1UL +	unsigned long flags;  }; -struct kvm_arch{ -	int naliases; +#define KVM_ARCH_HAS_UNALIAS_INSTANTIATION + +struct kvm_mem_aliases {  	struct kvm_mem_alias aliases[KVM_ALIAS_SLOTS]; +	int naliases; +}; + +struct kvm_arch { +	struct kvm_mem_aliases *aliases;  	unsigned int n_free_mmu_pages;  	unsigned int n_requested_mmu_pages; @@ -416,6 +416,10 @@ struct kvm_arch{  	s64 kvmclock_offset;  	struct kvm_xen_hvm_config xen_hvm_config; + +	/* fields used by HYPER-V emulation */ +	u64 hv_guest_os_id; +	u64 hv_hypercall;  };  struct kvm_vm_stat { @@ -471,6 +475,7 @@ struct kvm_x86_ops {  	int (*hardware_setup)(void);               /* __init */  	void (*hardware_unsetup)(void);            /* __exit */  	bool (*cpu_has_accelerated_tpr)(void); +	void (*cpuid_update)(struct kvm_vcpu *vcpu);  	/* Create, but do not attach this VCPU */  	struct kvm_vcpu *(*vcpu_create)(struct kvm *kvm, unsigned id); @@ -492,6 +497,7 @@ struct kvm_x86_ops {  	void (*set_segment)(struct kvm_vcpu *vcpu,  			    struct kvm_segment *var, int seg);  	void (*get_cs_db_l_bits)(struct kvm_vcpu *vcpu, int *db, int *l); +	void (*decache_cr0_guest_bits)(struct kvm_vcpu *vcpu);  	void (*decache_cr4_guest_bits)(struct kvm_vcpu *vcpu);  	void (*set_cr0)(struct kvm_vcpu *vcpu, unsigned long cr0);  	void (*set_cr3)(struct kvm_vcpu *vcpu, unsigned long cr3); @@ -501,12 +507,13 @@ struct kvm_x86_ops {  	void (*set_idt)(struct kvm_vcpu *vcpu, struct descriptor_table *dt);  	void (*get_gdt)(struct kvm_vcpu *vcpu, struct descriptor_table *dt);  	void (*set_gdt)(struct kvm_vcpu *vcpu, struct descriptor_table *dt); -	unsigned long (*get_dr)(struct kvm_vcpu *vcpu, int dr); -	void (*set_dr)(struct kvm_vcpu *vcpu, int dr, unsigned long value, -		       int *exception); +	int (*get_dr)(struct kvm_vcpu *vcpu, int dr, unsigned long *dest); +	int (*set_dr)(struct kvm_vcpu *vcpu, int dr, unsigned long value);  	void (*cache_reg)(struct kvm_vcpu *vcpu, enum kvm_reg reg);  	unsigned long (*get_rflags)(struct kvm_vcpu *vcpu);  	void (*set_rflags)(struct kvm_vcpu *vcpu, unsigned long rflags); +	void (*fpu_activate)(struct kvm_vcpu *vcpu); +	void (*fpu_deactivate)(struct kvm_vcpu *vcpu);  	void (*tlb_flush)(struct kvm_vcpu *vcpu); @@ -531,7 +538,8 @@ struct kvm_x86_ops {  	int (*set_tss_addr)(struct kvm *kvm, unsigned int addr);  	int (*get_tdp_level)(void);  	u64 (*get_mt_mask)(struct kvm_vcpu *vcpu, gfn_t gfn, bool is_mmio); -	bool (*gb_page_enable)(void); +	int (*get_lpage_level)(void); +	bool (*rdtscp_supported)(void);  	const struct trace_print_flags *exit_reasons_str;  }; @@ -606,8 +614,7 @@ int emulator_set_dr(struct x86_emulate_ctxt *ctxt, int dr,  		    unsigned long value);  void kvm_get_segment(struct kvm_vcpu *vcpu, struct kvm_segment *var, int seg); -int kvm_load_segment_descriptor(struct kvm_vcpu *vcpu, u16 selector, -				int type_bits, int seg); +int kvm_load_segment_descriptor(struct kvm_vcpu *vcpu, u16 selector, int seg);  int kvm_task_switch(struct kvm_vcpu *vcpu, u16 tss_selector, int reason); @@ -653,6 +660,10 @@ void __kvm_mmu_free_some_pages(struct kvm_vcpu *vcpu);  int kvm_mmu_load(struct kvm_vcpu *vcpu);  void 
kvm_mmu_unload(struct kvm_vcpu *vcpu);  void kvm_mmu_sync_roots(struct kvm_vcpu *vcpu); +gpa_t kvm_mmu_gva_to_gpa_read(struct kvm_vcpu *vcpu, gva_t gva, u32 *error); +gpa_t kvm_mmu_gva_to_gpa_fetch(struct kvm_vcpu *vcpu, gva_t gva, u32 *error); +gpa_t kvm_mmu_gva_to_gpa_write(struct kvm_vcpu *vcpu, gva_t gva, u32 *error); +gpa_t kvm_mmu_gva_to_gpa_system(struct kvm_vcpu *vcpu, gva_t gva, u32 *error);  int kvm_emulate_hypercall(struct kvm_vcpu *vcpu); @@ -666,6 +677,7 @@ void kvm_disable_tdp(void);  int load_pdptrs(struct kvm_vcpu *vcpu, unsigned long cr3);  int complete_pio(struct kvm_vcpu *vcpu); +bool kvm_check_iopl(struct kvm_vcpu *vcpu);  struct kvm_memory_slot *gfn_to_memslot_unaliased(struct kvm *kvm, gfn_t gfn); diff --git a/arch/x86/include/asm/kvm_para.h b/arch/x86/include/asm/kvm_para.h index c584076a47f4..ffae1420e7d7 100644 --- a/arch/x86/include/asm/kvm_para.h +++ b/arch/x86/include/asm/kvm_para.h @@ -2,6 +2,7 @@  #define _ASM_X86_KVM_PARA_H  #include <linux/types.h> +#include <asm/hyperv.h>  /* This CPUID returns the signature 'KVMKVMKVM' in ebx, ecx, and edx.  It   * should be used to determine that a VM is running under KVM. diff --git a/arch/x86/include/asm/mrst.h b/arch/x86/include/asm/mrst.h new file mode 100644 index 000000000000..451d30e7f62d --- /dev/null +++ b/arch/x86/include/asm/mrst.h @@ -0,0 +1,19 @@ +/* + * mrst.h: Intel Moorestown platform specific setup code + * + * (C) Copyright 2009 Intel Corporation + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; version 2 + * of the License. + */ +#ifndef _ASM_X86_MRST_H +#define _ASM_X86_MRST_H +extern int pci_mrst_init(void); +int __init sfi_parse_mrtc(struct sfi_table_header *table); + +#define SFI_MTMR_MAX_NUM 8 +#define SFI_MRTC_MAX	8 + +#endif /* _ASM_X86_MRST_H */ diff --git a/arch/x86/include/asm/msr-index.h b/arch/x86/include/asm/msr-index.h index 1cd58cdbc03f..4604e6a54d36 100644 --- a/arch/x86/include/asm/msr-index.h +++ b/arch/x86/include/asm/msr-index.h @@ -105,6 +105,8 @@  #define MSR_AMD64_PATCH_LEVEL		0x0000008b  #define MSR_AMD64_NB_CFG		0xc001001f  #define MSR_AMD64_PATCH_LOADER		0xc0010020 +#define MSR_AMD64_OSVW_ID_LENGTH	0xc0010140 +#define MSR_AMD64_OSVW_STATUS		0xc0010141  #define MSR_AMD64_IBSFETCHCTL		0xc0011030  #define MSR_AMD64_IBSFETCHLINAD		0xc0011031  #define MSR_AMD64_IBSFETCHPHYSAD	0xc0011032 diff --git a/arch/x86/include/asm/numaq.h b/arch/x86/include/asm/numaq.h index 13370b95ea94..37c516545ec8 100644 --- a/arch/x86/include/asm/numaq.h +++ b/arch/x86/include/asm/numaq.h @@ -30,6 +30,7 @@  extern int found_numaq;  extern int get_memcfg_numaq(void); +extern int pci_numaq_init(void);  extern void *xquad_portio; diff --git a/arch/x86/include/asm/olpc.h b/arch/x86/include/asm/olpc.h index 3a57385d9fa7..101229b0d8ed 100644 --- a/arch/x86/include/asm/olpc.h +++ b/arch/x86/include/asm/olpc.h @@ -13,7 +13,6 @@ struct olpc_platform_t {  #define OLPC_F_PRESENT		0x01  #define OLPC_F_DCON		0x02 -#define OLPC_F_VSA		0x04  #ifdef CONFIG_OLPC @@ -51,18 +50,6 @@ static inline int olpc_has_dcon(void)  }  /* - * The VSA is software from AMD that typical Geode bioses will include. - * It is used to emulate the PCI bus, VGA, etc.  OLPC's Open Firmware does - * not include the VSA; instead, PCI is emulated by the kernel. - * - * The VSA is described further in arch/x86/pci/olpc.c. - */ -static inline int olpc_has_vsa(void) -{ -	return (olpc_platform_info.flags & OLPC_F_VSA) ? 
1 : 0; -} - -/*   * The "Mass Production" version of OLPC's XO is identified as being model   * C2.  During the prototype phase, the following models (in chronological   * order) were created: A1, B1, B2, B3, B4, C1.  The A1 through B2 models @@ -87,13 +74,10 @@ static inline int olpc_has_dcon(void)  	return 0;  } -static inline int olpc_has_vsa(void) -{ -	return 0; -} -  #endif +extern int pci_olpc_init(void); +  /* EC related functions */  extern int olpc_ec_cmd(unsigned char cmd, unsigned char *inbuf, size_t inlen, diff --git a/arch/x86/include/asm/pci.h b/arch/x86/include/asm/pci.h index b4a00dd4eed5..404a880ea325 100644 --- a/arch/x86/include/asm/pci.h +++ b/arch/x86/include/asm/pci.h @@ -45,8 +45,15 @@ static inline int pci_proc_domain(struct pci_bus *bus)  #ifdef CONFIG_PCI  extern unsigned int pcibios_assign_all_busses(void); +extern int pci_legacy_init(void); +# ifdef CONFIG_ACPI +#  define x86_default_pci_init pci_acpi_init +# else +#  define x86_default_pci_init pci_legacy_init +# endif  #else -#define pcibios_assign_all_busses()	0 +# define pcibios_assign_all_busses()	0 +# define x86_default_pci_init		NULL  #endif  extern unsigned long pci_mem_start; @@ -90,34 +97,6 @@ extern void pci_iommu_alloc(void);  #define PCI_DMA_BUS_IS_PHYS (dma_ops->is_phys) -#if defined(CONFIG_X86_64) || defined(CONFIG_DMAR) || defined(CONFIG_DMA_API_DEBUG) - -#define DECLARE_PCI_UNMAP_ADDR(ADDR_NAME)       \ -	        dma_addr_t ADDR_NAME; -#define DECLARE_PCI_UNMAP_LEN(LEN_NAME)         \ -	        __u32 LEN_NAME; -#define pci_unmap_addr(PTR, ADDR_NAME)                  \ -	        ((PTR)->ADDR_NAME) -#define pci_unmap_addr_set(PTR, ADDR_NAME, VAL)         \ -	        (((PTR)->ADDR_NAME) = (VAL)) -#define pci_unmap_len(PTR, LEN_NAME)                    \ -	        ((PTR)->LEN_NAME) -#define pci_unmap_len_set(PTR, LEN_NAME, VAL)           \ -	        (((PTR)->LEN_NAME) = (VAL)) - -#else - -#define DECLARE_PCI_UNMAP_ADDR(ADDR_NAME)       dma_addr_t ADDR_NAME[0]; -#define DECLARE_PCI_UNMAP_LEN(LEN_NAME) unsigned LEN_NAME[0]; -#define pci_unmap_addr(PTR, ADDR_NAME)  sizeof((PTR)->ADDR_NAME) -#define pci_unmap_addr_set(PTR, ADDR_NAME, VAL) \ -	        do { break; } while (pci_unmap_addr(PTR, ADDR_NAME)) -#define pci_unmap_len(PTR, LEN_NAME)            sizeof((PTR)->LEN_NAME) -#define pci_unmap_len_set(PTR, LEN_NAME, VAL) \ -	        do { break; } while (pci_unmap_len(PTR, LEN_NAME)) - -#endif -  #endif  /* __KERNEL__ */  #ifdef CONFIG_X86_64 diff --git a/arch/x86/include/asm/pci_x86.h b/arch/x86/include/asm/pci_x86.h index 05b58ccb2e82..1a0422348d6d 100644 --- a/arch/x86/include/asm/pci_x86.h +++ b/arch/x86/include/asm/pci_x86.h @@ -83,7 +83,6 @@ struct irq_routing_table {  extern unsigned int pcibios_irq_mask; -extern int pcibios_scanned;  extern spinlock_t pci_config_lock;  extern int (*pcibios_enable_irq)(struct pci_dev *dev); @@ -106,16 +105,15 @@ extern bool port_cf9_safe;  extern int pci_direct_probe(void);  extern void pci_direct_init(int type);  extern void pci_pcbios_init(void); -extern int pci_olpc_init(void);  extern void __init dmi_check_pciprobe(void);  extern void __init dmi_check_skip_isa_align(void);  /* some common used subsys_initcalls */  extern int __init pci_acpi_init(void); -extern int __init pcibios_irq_init(void); -extern int __init pci_visws_init(void); -extern int __init pci_numaq_init(void); +extern void __init pcibios_irq_init(void);  extern int __init pcibios_init(void); +extern int pci_legacy_init(void); +extern void pcibios_fixup_irqs(void);  /* pci-mmconfig.c */ @@ -183,3 
+181,17 @@ static inline void mmio_config_writel(void __iomem *pos, u32 val)  {  	asm volatile("movl %%eax,(%1)" : : "a" (val), "r" (pos) : "memory");  } + +#ifdef CONFIG_PCI +# ifdef CONFIG_ACPI +#  define x86_default_pci_init		pci_acpi_init +# else +#  define x86_default_pci_init		pci_legacy_init +# endif +# define x86_default_pci_init_irq	pcibios_irq_init +# define x86_default_pci_fixup_irqs	pcibios_fixup_irqs +#else +# define x86_default_pci_init		NULL +# define x86_default_pci_init_irq	NULL +# define x86_default_pci_fixup_irqs	NULL +#endif diff --git a/arch/x86/include/asm/perf_event.h b/arch/x86/include/asm/perf_event.h index befd172c82ad..db6109a885a7 100644 --- a/arch/x86/include/asm/perf_event.h +++ b/arch/x86/include/asm/perf_event.h @@ -18,7 +18,7 @@  #define MSR_ARCH_PERFMON_EVENTSEL0			     0x186  #define MSR_ARCH_PERFMON_EVENTSEL1			     0x187 -#define ARCH_PERFMON_EVENTSEL0_ENABLE			  (1 << 22) +#define ARCH_PERFMON_EVENTSEL_ENABLE			  (1 << 22)  #define ARCH_PERFMON_EVENTSEL_ANY			  (1 << 21)  #define ARCH_PERFMON_EVENTSEL_INT			  (1 << 20)  #define ARCH_PERFMON_EVENTSEL_OS			  (1 << 17) @@ -50,7 +50,7 @@  	 INTEL_ARCH_INV_MASK| \  	 INTEL_ARCH_EDGE_MASK|\  	 INTEL_ARCH_UNIT_MASK|\ -	 INTEL_ARCH_EVTSEL_MASK) +	 INTEL_ARCH_EVENT_MASK)  #define ARCH_PERFMON_UNHALTED_CORE_CYCLES_SEL		      0x3c  #define ARCH_PERFMON_UNHALTED_CORE_CYCLES_UMASK		(0x00 << 8) @@ -117,6 +117,18 @@ union cpuid10_edx {   */  #define X86_PMC_IDX_FIXED_BTS				(X86_PMC_IDX_FIXED + 16) +/* IbsFetchCtl bits/masks */ +#define IBS_FETCH_RAND_EN		(1ULL<<57) +#define IBS_FETCH_VAL			(1ULL<<49) +#define IBS_FETCH_ENABLE		(1ULL<<48) +#define IBS_FETCH_CNT			0xFFFF0000ULL +#define IBS_FETCH_MAX_CNT		0x0000FFFFULL + +/* IbsOpCtl bits */ +#define IBS_OP_CNT_CTL			(1ULL<<19) +#define IBS_OP_VAL			(1ULL<<18) +#define IBS_OP_ENABLE			(1ULL<<17) +#define IBS_OP_MAX_CNT			0x0000FFFFULL  #ifdef CONFIG_PERF_EVENTS  extern void init_hw_perf_events(void); diff --git a/arch/x86/include/asm/ptrace.h b/arch/x86/include/asm/ptrace.h index 20102808b191..69a686a7dff0 100644 --- a/arch/x86/include/asm/ptrace.h +++ b/arch/x86/include/asm/ptrace.h @@ -274,14 +274,7 @@ static inline unsigned long regs_get_kernel_stack_nth(struct pt_regs *regs,  		return 0;  } -/* - * These are defined as per linux/ptrace.h, which see. 
- */  #define arch_has_single_step()	(1) -extern void user_enable_single_step(struct task_struct *); -extern void user_disable_single_step(struct task_struct *); - -extern void user_enable_block_step(struct task_struct *);  #ifdef CONFIG_X86_DEBUGCTLMSR  #define arch_has_block_step()	(1)  #else diff --git a/arch/x86/include/asm/setup.h b/arch/x86/include/asm/setup.h index 18e496c98ff0..86b1506f4179 100644 --- a/arch/x86/include/asm/setup.h +++ b/arch/x86/include/asm/setup.h @@ -37,10 +37,8 @@ void setup_bios_corruption_check(void);  #ifdef CONFIG_X86_VISWS  extern void visws_early_detect(void); -extern int is_visws_box(void);  #else  static inline void visws_early_detect(void) { } -static inline int is_visws_box(void) { return 0; }  #endif  extern unsigned long saved_video_mode; diff --git a/arch/x86/include/asm/svm.h b/arch/x86/include/asm/svm.h index 1fecb7e61130..38638cd2fa4c 100644 --- a/arch/x86/include/asm/svm.h +++ b/arch/x86/include/asm/svm.h @@ -313,7 +313,7 @@ struct __attribute__ ((__packed__)) vmcb {  #define SVM_EXIT_ERR		-1 -#define SVM_CR0_SELECTIVE_MASK (1 << 3 | 1) /* TS and MP */ +#define SVM_CR0_SELECTIVE_MASK (X86_CR0_TS | X86_CR0_MP)  #define SVM_VMLOAD ".byte 0x0f, 0x01, 0xda"  #define SVM_VMRUN  ".byte 0x0f, 0x01, 0xd8" diff --git a/arch/x86/include/asm/sys_ia32.h b/arch/x86/include/asm/sys_ia32.h index d5f69045c100..3ad421784ae7 100644 --- a/arch/x86/include/asm/sys_ia32.h +++ b/arch/x86/include/asm/sys_ia32.h @@ -26,8 +26,8 @@ asmlinkage long sys32_lstat64(char __user *, struct stat64 __user *);  asmlinkage long sys32_fstat64(unsigned int, struct stat64 __user *);  asmlinkage long sys32_fstatat(unsigned int, char __user *,  			      struct stat64 __user *, int); -struct mmap_arg_struct; -asmlinkage long sys32_mmap(struct mmap_arg_struct __user *); +struct mmap_arg_struct32; +asmlinkage long sys32_mmap(struct mmap_arg_struct32 __user *);  asmlinkage long sys32_mprotect(unsigned long, size_t, unsigned long);  struct sigaction32; @@ -40,8 +40,6 @@ asmlinkage long sys32_rt_sigprocmask(int, compat_sigset_t __user *,  				     compat_sigset_t __user *, unsigned int);  asmlinkage long sys32_alarm(unsigned int); -struct sel_arg_struct; -asmlinkage long sys32_old_select(struct sel_arg_struct __user *);  asmlinkage long sys32_waitpid(compat_pid_t, unsigned int *, int);  asmlinkage long sys32_sysfs(int, u32, u32); @@ -56,11 +54,6 @@ asmlinkage long sys32_pwrite(unsigned int, char __user *, u32, u32, u32);  asmlinkage long sys32_personality(unsigned long);  asmlinkage long sys32_sendfile(int, int, compat_off_t __user *, s32); -struct oldold_utsname; -struct old_utsname; -asmlinkage long sys32_olduname(struct oldold_utsname __user *); -long sys32_uname(struct old_utsname __user *); -  asmlinkage long sys32_execve(char __user *, compat_uptr_t __user *,  			     compat_uptr_t __user *, struct pt_regs *);  asmlinkage long sys32_clone(unsigned int, unsigned int, struct pt_regs *); diff --git a/arch/x86/include/asm/syscalls.h b/arch/x86/include/asm/syscalls.h index 8868b9420b0e..5c044b43e9a7 100644 --- a/arch/x86/include/asm/syscalls.h +++ b/arch/x86/include/asm/syscalls.h @@ -50,18 +50,6 @@ asmlinkage int sys_sigaction(int, const struct old_sigaction __user *,  			     struct old_sigaction __user *);  unsigned long sys_sigreturn(struct pt_regs *); -/* kernel/sys_i386_32.c */ -struct mmap_arg_struct; -struct sel_arg_struct; -struct oldold_utsname; -struct old_utsname; - -asmlinkage int old_mmap(struct mmap_arg_struct __user *); -asmlinkage int old_select(struct sel_arg_struct __user 
*); -asmlinkage int sys_ipc(uint, int, int, int, void __user *, long); -asmlinkage int sys_uname(struct old_utsname __user *); -asmlinkage int sys_olduname(struct oldold_utsname __user *); -  /* kernel/vm86_32.c */  int sys_vm86old(struct vm86_struct __user *, struct pt_regs *);  int sys_vm86(unsigned long, unsigned long, struct pt_regs *); @@ -73,11 +61,8 @@ int sys_vm86(unsigned long, unsigned long, struct pt_regs *);  long sys_arch_prctl(int, unsigned long);  /* kernel/sys_x86_64.c */ -struct new_utsname; -  asmlinkage long sys_mmap(unsigned long, unsigned long, unsigned long,  			 unsigned long, unsigned long, unsigned long); -asmlinkage long sys_uname(struct new_utsname __user *);  #endif /* CONFIG_X86_32 */  #endif /* _ASM_X86_SYSCALLS_H */ diff --git a/arch/x86/include/asm/unistd_32.h b/arch/x86/include/asm/unistd_32.h index 3baf379fa840..beb9b5f8f8a4 100644 --- a/arch/x86/include/asm/unistd_32.h +++ b/arch/x86/include/asm/unistd_32.h @@ -354,6 +354,7 @@  #define __ARCH_WANT_STAT64  #define __ARCH_WANT_SYS_ALARM  #define __ARCH_WANT_SYS_GETHOSTNAME +#define __ARCH_WANT_SYS_IPC  #define __ARCH_WANT_SYS_PAUSE  #define __ARCH_WANT_SYS_SGETMASK  #define __ARCH_WANT_SYS_SIGNAL @@ -366,6 +367,9 @@  #define __ARCH_WANT_SYS_LLSEEK  #define __ARCH_WANT_SYS_NICE  #define __ARCH_WANT_SYS_OLD_GETRLIMIT +#define __ARCH_WANT_SYS_OLD_UNAME +#define __ARCH_WANT_SYS_OLD_MMAP +#define __ARCH_WANT_SYS_OLD_SELECT  #define __ARCH_WANT_SYS_OLDUMOUNT  #define __ARCH_WANT_SYS_SIGPENDING  #define __ARCH_WANT_SYS_SIGPROCMASK diff --git a/arch/x86/include/asm/unistd_64.h b/arch/x86/include/asm/unistd_64.h index 4843f7ba754a..ff4307b0e81e 100644 --- a/arch/x86/include/asm/unistd_64.h +++ b/arch/x86/include/asm/unistd_64.h @@ -146,7 +146,7 @@ __SYSCALL(__NR_wait4, sys_wait4)  #define __NR_kill				62  __SYSCALL(__NR_kill, sys_kill)  #define __NR_uname				63 -__SYSCALL(__NR_uname, sys_uname) +__SYSCALL(__NR_uname, sys_newuname)  #define __NR_semget				64  __SYSCALL(__NR_semget, sys_semget) @@ -680,6 +680,7 @@ __SYSCALL(__NR_recvmmsg, sys_recvmmsg)  #define __ARCH_WANT_SYS_LLSEEK  #define __ARCH_WANT_SYS_NICE  #define __ARCH_WANT_SYS_OLD_GETRLIMIT +#define __ARCH_WANT_SYS_OLD_UNAME  #define __ARCH_WANT_SYS_OLDUMOUNT  #define __ARCH_WANT_SYS_SIGPENDING  #define __ARCH_WANT_SYS_SIGPROCMASK diff --git a/arch/x86/include/asm/visws/cobalt.h b/arch/x86/include/asm/visws/cobalt.h index 166adf61e770..2edb37637ead 100644 --- a/arch/x86/include/asm/visws/cobalt.h +++ b/arch/x86/include/asm/visws/cobalt.h @@ -122,4 +122,6 @@ extern char visws_board_type;  extern char visws_board_rev; +extern int pci_visws_init(void); +  #endif /* _ASM_X86_VISWS_COBALT_H */ diff --git a/arch/x86/include/asm/vmx.h b/arch/x86/include/asm/vmx.h index 2b4945419a84..fb9a080740ec 100644 --- a/arch/x86/include/asm/vmx.h +++ b/arch/x86/include/asm/vmx.h @@ -53,6 +53,7 @@   */  #define SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES 0x00000001  #define SECONDARY_EXEC_ENABLE_EPT               0x00000002 +#define SECONDARY_EXEC_RDTSCP			0x00000008  #define SECONDARY_EXEC_ENABLE_VPID              0x00000020  #define SECONDARY_EXEC_WBINVD_EXITING		0x00000040  #define SECONDARY_EXEC_UNRESTRICTED_GUEST	0x00000080 @@ -251,6 +252,7 @@ enum vmcs_field {  #define EXIT_REASON_MSR_READ            31  #define EXIT_REASON_MSR_WRITE           32  #define EXIT_REASON_MWAIT_INSTRUCTION   36 +#define EXIT_REASON_MONITOR_INSTRUCTION 39  #define EXIT_REASON_PAUSE_INSTRUCTION   40  #define EXIT_REASON_MCE_DURING_VMENTRY	 41  #define EXIT_REASON_TPR_BELOW_THRESHOLD 43 @@ -362,6 
+364,7 @@ enum vmcs_field {  #define VMX_EPTP_UC_BIT				(1ull << 8)  #define VMX_EPTP_WB_BIT				(1ull << 14)  #define VMX_EPT_2MB_PAGE_BIT			(1ull << 16) +#define VMX_EPT_1GB_PAGE_BIT			(1ull << 17)  #define VMX_EPT_EXTENT_INDIVIDUAL_BIT		(1ull << 24)  #define VMX_EPT_EXTENT_CONTEXT_BIT		(1ull << 25)  #define VMX_EPT_EXTENT_GLOBAL_BIT		(1ull << 26) @@ -374,7 +377,7 @@ enum vmcs_field {  #define VMX_EPT_READABLE_MASK			0x1ull  #define VMX_EPT_WRITABLE_MASK			0x2ull  #define VMX_EPT_EXECUTABLE_MASK			0x4ull -#define VMX_EPT_IGMT_BIT    			(1ull << 6) +#define VMX_EPT_IPAT_BIT    			(1ull << 6)  #define VMX_EPT_IDENTITY_PAGETABLE_ADDR		0xfffbc000ul diff --git a/arch/x86/include/asm/x86_init.h b/arch/x86/include/asm/x86_init.h index 60cc35269083..519b54327d75 100644 --- a/arch/x86/include/asm/x86_init.h +++ b/arch/x86/include/asm/x86_init.h @@ -99,6 +99,20 @@ struct x86_init_iommu {  };  /** + * struct x86_init_pci - platform specific pci init functions + * @arch_init:			platform specific pci arch init call + * @init:			platform specific pci subsystem init + * @init_irq:			platform specific pci irq init + * @fixup_irqs:			platform specific pci irq fixup + */ +struct x86_init_pci { +	int (*arch_init)(void); +	int (*init)(void); +	void (*init_irq)(void); +	void (*fixup_irqs)(void); +}; + +/**   * struct x86_init_ops - functions for platform specific setup   *   */ @@ -110,6 +124,7 @@ struct x86_init_ops {  	struct x86_init_paging		paging;  	struct x86_init_timers		timers;  	struct x86_init_iommu		iommu; +	struct x86_init_pci		pci;  };  /** diff --git a/arch/x86/kernel/Makefile b/arch/x86/kernel/Makefile index d87f09bc5a52..4c58352209e0 100644 --- a/arch/x86/kernel/Makefile +++ b/arch/x86/kernel/Makefile @@ -87,6 +87,7 @@ obj-$(CONFIG_VM86)		+= vm86_32.o  obj-$(CONFIG_EARLY_PRINTK)	+= early_printk.o  obj-$(CONFIG_HPET_TIMER) 	+= hpet.o +obj-$(CONFIG_APB_TIMER)		+= apb_timer.o  obj-$(CONFIG_K8_NB)		+= k8.o  obj-$(CONFIG_DEBUG_RODATA_TEST)	+= test_rodata.o diff --git a/arch/x86/kernel/acpi/boot.c b/arch/x86/kernel/acpi/boot.c index 738fcb60e708..0061ea263061 100644 --- a/arch/x86/kernel/acpi/boot.c +++ b/arch/x86/kernel/acpi/boot.c @@ -35,6 +35,7 @@  #include <linux/ioport.h>  #include <linux/pci.h> +#include <asm/pci_x86.h>  #include <asm/pgtable.h>  #include <asm/io_apic.h>  #include <asm/apic.h> @@ -489,6 +490,7 @@ int acpi_register_gsi(struct device *dev, u32 gsi, int trigger, int polarity)   *  ACPI based hotplug support for CPU   */  #ifdef CONFIG_ACPI_HOTPLUG_CPU +#include <acpi/processor.h>  static void acpi_map_cpu2node(acpi_handle handle, int cpu, int physid)  { @@ -566,6 +568,8 @@ static int __cpuinit _acpi_map_lsapic(acpi_handle handle, int *pcpu)  		goto free_new_map;  	} +	acpi_processor_set_pdc(handle); +  	cpu = cpumask_first(new_map);  	acpi_map_cpu2node(handle, cpu, physid); @@ -1292,23 +1296,6 @@ static int __init dmi_disable_acpi(const struct dmi_system_id *d)  }  /* - * Limit ACPI to CPU enumeration for HT - */ -static int __init force_acpi_ht(const struct dmi_system_id *d) -{ -	if (!acpi_force) { -		printk(KERN_NOTICE "%s detected: force use of acpi=ht\n", -		       d->ident); -		disable_acpi(); -		acpi_ht = 1; -	} else { -		printk(KERN_NOTICE -		       "Warning: acpi=force overrules DMI blacklist: acpi=ht\n"); -	} -	return 0; -} - -/*   * Force ignoring BIOS IRQ0 pin2 override   */  static int __init dmi_ignore_irq0_timer_override(const struct dmi_system_id *d) @@ -1344,82 +1331,6 @@ static struct dmi_system_id __initdata acpi_dmi_table[] = {  	 },  	/* -	 * Boxes that need 
acpi=ht -	 */ -	{ -	 .callback = force_acpi_ht, -	 .ident = "FSC Primergy T850", -	 .matches = { -		     DMI_MATCH(DMI_SYS_VENDOR, "FUJITSU SIEMENS"), -		     DMI_MATCH(DMI_PRODUCT_NAME, "PRIMERGY T850"), -		     }, -	 }, -	{ -	 .callback = force_acpi_ht, -	 .ident = "HP VISUALIZE NT Workstation", -	 .matches = { -		     DMI_MATCH(DMI_BOARD_VENDOR, "Hewlett-Packard"), -		     DMI_MATCH(DMI_PRODUCT_NAME, "HP VISUALIZE NT Workstation"), -		     }, -	 }, -	{ -	 .callback = force_acpi_ht, -	 .ident = "Compaq Workstation W8000", -	 .matches = { -		     DMI_MATCH(DMI_SYS_VENDOR, "Compaq"), -		     DMI_MATCH(DMI_PRODUCT_NAME, "Workstation W8000"), -		     }, -	 }, -	{ -	 .callback = force_acpi_ht, -	 .ident = "ASUS CUR-DLS", -	 .matches = { -		     DMI_MATCH(DMI_BOARD_VENDOR, "ASUSTeK Computer INC."), -		     DMI_MATCH(DMI_BOARD_NAME, "CUR-DLS"), -		     }, -	 }, -	{ -	 .callback = force_acpi_ht, -	 .ident = "ABIT i440BX-W83977", -	 .matches = { -		     DMI_MATCH(DMI_BOARD_VENDOR, "ABIT <http://www.abit.com>"), -		     DMI_MATCH(DMI_BOARD_NAME, "i440BX-W83977 (BP6)"), -		     }, -	 }, -	{ -	 .callback = force_acpi_ht, -	 .ident = "IBM Bladecenter", -	 .matches = { -		     DMI_MATCH(DMI_BOARD_VENDOR, "IBM"), -		     DMI_MATCH(DMI_BOARD_NAME, "IBM eServer BladeCenter HS20"), -		     }, -	 }, -	{ -	 .callback = force_acpi_ht, -	 .ident = "IBM eServer xSeries 360", -	 .matches = { -		     DMI_MATCH(DMI_BOARD_VENDOR, "IBM"), -		     DMI_MATCH(DMI_BOARD_NAME, "eServer xSeries 360"), -		     }, -	 }, -	{ -	 .callback = force_acpi_ht, -	 .ident = "IBM eserver xSeries 330", -	 .matches = { -		     DMI_MATCH(DMI_BOARD_VENDOR, "IBM"), -		     DMI_MATCH(DMI_BOARD_NAME, "eserver xSeries 330"), -		     }, -	 }, -	{ -	 .callback = force_acpi_ht, -	 .ident = "IBM eserver xSeries 440", -	 .matches = { -		     DMI_MATCH(DMI_BOARD_VENDOR, "IBM"), -		     DMI_MATCH(DMI_PRODUCT_NAME, "eserver xSeries 440"), -		     }, -	 }, - -	/*  	 * Boxes that need ACPI PCI IRQ routing disabled  	 */  	{ @@ -1624,6 +1535,9 @@ int __init acpi_boot_init(void)  	acpi_table_parse(ACPI_SIG_HPET, acpi_parse_hpet); +	if (!acpi_noirq) +		x86_init.pci.init = pci_acpi_init; +  	return 0;  } @@ -1648,8 +1562,10 @@ static int __init parse_acpi(char *arg)  	}  	/* Limit ACPI just to boot-time to enable HT */  	else if (strcmp(arg, "ht") == 0) { -		if (!acpi_force) +		if (!acpi_force) { +			printk(KERN_WARNING "acpi=ht will be removed in Linux-2.6.35\n");  			disable_acpi(); +		}  		acpi_ht = 1;  	}  	/* acpi=rsdt use RSDT instead of XSDT */ diff --git a/arch/x86/kernel/alternative.c b/arch/x86/kernel/alternative.c index e6ea0342c8f8..3a4bf35c179b 100644 --- a/arch/x86/kernel/alternative.c +++ b/arch/x86/kernel/alternative.c @@ -7,6 +7,7 @@  #include <linux/mm.h>  #include <linux/vmalloc.h>  #include <linux/memory.h> +#include <linux/stop_machine.h>  #include <asm/alternative.h>  #include <asm/sections.h>  #include <asm/pgtable.h> @@ -572,3 +573,62 @@ void *__kprobes text_poke(void *addr, const void *opcode, size_t len)  	local_irq_restore(flags);  	return addr;  } + +/* + * Cross-modifying kernel text with stop_machine(). + * This code originally comes from immediate value. 
+ */ +static atomic_t stop_machine_first; +static int wrote_text; + +struct text_poke_params { +	void *addr; +	const void *opcode; +	size_t len; +}; + +static int __kprobes stop_machine_text_poke(void *data) +{ +	struct text_poke_params *tpp = data; + +	if (atomic_dec_and_test(&stop_machine_first)) { +		text_poke(tpp->addr, tpp->opcode, tpp->len); +		smp_wmb();	/* Make sure other cpus see that this has run */ +		wrote_text = 1; +	} else { +		while (!wrote_text) +			cpu_relax(); +		smp_mb();	/* Load wrote_text before following execution */ +	} + +	flush_icache_range((unsigned long)tpp->addr, +			   (unsigned long)tpp->addr + tpp->len); +	return 0; +} + +/** + * text_poke_smp - Update instructions on a live kernel on SMP + * @addr: address to modify + * @opcode: source of the copy + * @len: length to copy + * + * Modify multi-byte instruction by using stop_machine() on SMP. This allows + * user to poke/set multi-byte text on SMP. Only non-NMI/MCE code modifying + * should be allowed, since stop_machine() does _not_ protect code against + * NMI and MCE. + * + * Note: Must be called under get_online_cpus() and text_mutex. + */ +void *__kprobes text_poke_smp(void *addr, const void *opcode, size_t len) +{ +	struct text_poke_params tpp; + +	tpp.addr = addr; +	tpp.opcode = opcode; +	tpp.len = len; +	atomic_set(&stop_machine_first, 1); +	wrote_text = 0; +	stop_machine(stop_machine_text_poke, (void *)&tpp, NULL); +	return addr; +} + diff --git a/arch/x86/kernel/apb_timer.c b/arch/x86/kernel/apb_timer.c new file mode 100644 index 000000000000..4b7099526d2c --- /dev/null +++ b/arch/x86/kernel/apb_timer.c @@ -0,0 +1,784 @@ +/* + * apb_timer.c: Driver for Langwell APB timers + * + * (C) Copyright 2009 Intel Corporation + * Author: Jacob Pan (jacob.jun.pan@intel.com) + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; version 2 + * of the License. + * + * Note: + * Langwell is the south complex of Intel Moorestown MID platform. There are + * eight external timers in total that can be used by the operating system. + * The timer information, such as frequency and addresses, is provided to the + * OS via SFI tables. + * Timer interrupts are routed via FW/HW emulated IOAPIC independently via + * individual redirection table entries (RTE). + * Unlike HPET, there is no master counter, therefore one of the timers are + * used as clocksource. The overall allocation looks like: + *  - timer 0 - NR_CPUs for per cpu timer + *  - one timer for clocksource + *  - one timer for watchdog driver. + * It is also worth notice that APB timer does not support true one-shot mode, + * free-running mode will be used here to emulate one-shot mode. + * APB timer can also be used as broadcast timer along with per cpu local APIC + * timer, but by default APB timer has higher rating than local APIC timers. 
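The text_poke_smp() hunk above funnels all CPUs into stop_machine_text_poke(), where a single first-comer performs the write and raises wrote_text while the others spin on it; the smp_wmb()/smp_mb() pair orders the flag against the patched bytes. Below is a rough userspace analogue of that first-writer rendezvous using C11 atomics and pthreads. It models only the ordering; stop_machine(), the icache flush and the kprobe context are not represented.

    /* build with: cc -pthread rendezvous.c */
    #include <pthread.h>
    #include <stdatomic.h>
    #include <stdint.h>
    #include <stdio.h>

    static atomic_int first = 1;    /* plays the role of stop_machine_first */
    static atomic_int wrote = 0;    /* plays the role of wrote_text */
    static int patch_site;          /* stands in for the poked instruction bytes */

    static void *rendezvous(void *arg)
    {
        if (atomic_fetch_sub(&first, 1) == 1) {
            patch_site = 42;                            /* the one writer "pokes" */
            atomic_store_explicit(&wrote, 1, memory_order_release);
        } else {
            while (!atomic_load_explicit(&wrote, memory_order_acquire))
                ;                                       /* cpu_relax() equivalent */
        }
        /* every thread leaves only after the write is visible to it */
        printf("thread %ld sees %d\n", (long)(intptr_t)arg, patch_site);
        return NULL;
    }

    int main(void)
    {
        pthread_t t[4];
        for (long i = 0; i < 4; i++)
            pthread_create(&t[i], NULL, rendezvous, (void *)(intptr_t)i);
        for (int i = 0; i < 4; i++)
            pthread_join(t[i], NULL);
        return 0;
    }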
+ */ + +#include <linux/clocksource.h> +#include <linux/clockchips.h> +#include <linux/delay.h> +#include <linux/errno.h> +#include <linux/init.h> +#include <linux/sysdev.h> +#include <linux/pm.h> +#include <linux/pci.h> +#include <linux/sfi.h> +#include <linux/interrupt.h> +#include <linux/cpu.h> +#include <linux/irq.h> + +#include <asm/fixmap.h> +#include <asm/apb_timer.h> + +#define APBT_MASK			CLOCKSOURCE_MASK(32) +#define APBT_SHIFT			22 +#define APBT_CLOCKEVENT_RATING		150 +#define APBT_CLOCKSOURCE_RATING		250 +#define APBT_MIN_DELTA_USEC		200 + +#define EVT_TO_APBT_DEV(evt) container_of(evt, struct apbt_dev, evt) +#define APBT_CLOCKEVENT0_NUM   (0) +#define APBT_CLOCKEVENT1_NUM   (1) +#define APBT_CLOCKSOURCE_NUM   (2) + +static unsigned long apbt_address; +static int apb_timer_block_enabled; +static void __iomem *apbt_virt_address; +static int phy_cs_timer_id; + +/* + * Common DW APB timer info + */ +static uint64_t apbt_freq; + +static void apbt_set_mode(enum clock_event_mode mode, +			  struct clock_event_device *evt); +static int apbt_next_event(unsigned long delta, +			   struct clock_event_device *evt); +static cycle_t apbt_read_clocksource(struct clocksource *cs); +static void apbt_restart_clocksource(struct clocksource *cs); + +struct apbt_dev { +	struct clock_event_device evt; +	unsigned int num; +	int cpu; +	unsigned int irq; +	unsigned int tick; +	unsigned int count; +	unsigned int flags; +	char name[10]; +}; + +int disable_apbt_percpu __cpuinitdata; + +static DEFINE_PER_CPU(struct apbt_dev, cpu_apbt_dev); + +#ifdef CONFIG_SMP +static unsigned int apbt_num_timers_used; +static struct apbt_dev *apbt_devs; +#endif + +static	inline unsigned long apbt_readl_reg(unsigned long a) +{ +	return readl(apbt_virt_address + a); +} + +static inline void apbt_writel_reg(unsigned long d, unsigned long a) +{ +	writel(d, apbt_virt_address + a); +} + +static inline unsigned long apbt_readl(int n, unsigned long a) +{ +	return readl(apbt_virt_address + a + n * APBTMRS_REG_SIZE); +} + +static inline void apbt_writel(int n, unsigned long d, unsigned long a) +{ +	writel(d, apbt_virt_address + a + n * APBTMRS_REG_SIZE); +} + +static inline void apbt_set_mapping(void) +{ +	struct sfi_timer_table_entry *mtmr; + +	if (apbt_virt_address) { +		pr_debug("APBT base already mapped\n"); +		return; +	} +	mtmr = sfi_get_mtmr(APBT_CLOCKEVENT0_NUM); +	if (mtmr == NULL) { +		printk(KERN_ERR "Failed to get MTMR %d from SFI\n", +		       APBT_CLOCKEVENT0_NUM); +		return; +	} +	apbt_address = (unsigned long)mtmr->phys_addr; +	if (!apbt_address) { +		printk(KERN_WARNING "No timer base from SFI, use default\n"); +		apbt_address = APBT_DEFAULT_BASE; +	} +	apbt_virt_address = ioremap_nocache(apbt_address, APBT_MMAP_SIZE); +	if (apbt_virt_address) { +		pr_debug("Mapped APBT physical addr %p at virtual addr %p\n",\ +			 (void *)apbt_address, (void *)apbt_virt_address); +	} else { +		pr_debug("Failed mapping APBT phy address at %p\n",\ +			 (void *)apbt_address); +		goto panic_noapbt; +	} +	apbt_freq = mtmr->freq_hz / USEC_PER_SEC; +	sfi_free_mtmr(mtmr); + +	/* Now figure out the physical timer id for clocksource device */ +	mtmr = sfi_get_mtmr(APBT_CLOCKSOURCE_NUM); +	if (mtmr == NULL) +		goto panic_noapbt; + +	/* Now figure out the physical timer id */ +	phy_cs_timer_id = (unsigned int)(mtmr->phys_addr & 0xff) +		/ APBTMRS_REG_SIZE; +	pr_debug("Use timer %d for clocksource\n", phy_cs_timer_id); +	return; + +panic_noapbt: +	panic("Failed to setup APB system timer\n"); + +} + +static inline void apbt_clear_mapping(void) 
+{ +	iounmap(apbt_virt_address); +	apbt_virt_address = NULL; +} + +/* + * APBT timer interrupt enable / disable + */ +static inline int is_apbt_capable(void) +{ +	return apbt_virt_address ? 1 : 0; +} + +static struct clocksource clocksource_apbt = { +	.name		= "apbt", +	.rating		= APBT_CLOCKSOURCE_RATING, +	.read		= apbt_read_clocksource, +	.mask		= APBT_MASK, +	.shift		= APBT_SHIFT, +	.flags		= CLOCK_SOURCE_IS_CONTINUOUS, +	.resume		= apbt_restart_clocksource, +}; + +/* boot APB clock event device */ +static struct clock_event_device apbt_clockevent = { +	.name		= "apbt0", +	.features	= CLOCK_EVT_FEAT_PERIODIC | CLOCK_EVT_FEAT_ONESHOT, +	.set_mode	= apbt_set_mode, +	.set_next_event = apbt_next_event, +	.shift		= APBT_SHIFT, +	.irq		= 0, +	.rating		= APBT_CLOCKEVENT_RATING, +}; + +/* + * if user does not want to use per CPU apb timer, just give it a lower rating + * than local apic timer and skip the late per cpu timer init. + */ +static inline int __init setup_x86_mrst_timer(char *arg) +{ +	if (!arg) +		return -EINVAL; + +	if (strcmp("apbt_only", arg) == 0) +		disable_apbt_percpu = 0; +	else if (strcmp("lapic_and_apbt", arg) == 0) +		disable_apbt_percpu = 1; +	else { +		pr_warning("X86 MRST timer option %s not recognised" +			   " use x86_mrst_timer=apbt_only or lapic_and_apbt\n", +			   arg); +		return -EINVAL; +	} +	return 0; +} +__setup("x86_mrst_timer=", setup_x86_mrst_timer); + +/* + * start count down from 0xffff_ffff. this is done by toggling the enable bit + * then load initial load count to ~0. + */ +static void apbt_start_counter(int n) +{ +	unsigned long ctrl = apbt_readl(n, APBTMR_N_CONTROL); + +	ctrl &= ~APBTMR_CONTROL_ENABLE; +	apbt_writel(n, ctrl, APBTMR_N_CONTROL); +	apbt_writel(n, ~0, APBTMR_N_LOAD_COUNT); +	/* enable, mask interrupt */ +	ctrl &= ~APBTMR_CONTROL_MODE_PERIODIC; +	ctrl |= (APBTMR_CONTROL_ENABLE | APBTMR_CONTROL_INT); +	apbt_writel(n, ctrl, APBTMR_N_CONTROL); +	/* read it once to get cached counter value initialized */ +	apbt_read_clocksource(&clocksource_apbt); +} + +static irqreturn_t apbt_interrupt_handler(int irq, void *data) +{ +	struct apbt_dev *dev = (struct apbt_dev *)data; +	struct clock_event_device *aevt = &dev->evt; + +	if (!aevt->event_handler) { +		printk(KERN_INFO "Spurious APBT timer interrupt on %d\n", +		       dev->num); +		return IRQ_NONE; +	} +	aevt->event_handler(aevt); +	return IRQ_HANDLED; +} + +static void apbt_restart_clocksource(struct clocksource *cs) +{ +	apbt_start_counter(phy_cs_timer_id); +} + +/* Setup IRQ routing via IOAPIC */ +#ifdef CONFIG_SMP +static void apbt_setup_irq(struct apbt_dev *adev) +{ +	struct irq_chip *chip; +	struct irq_desc *desc; + +	/* timer0 irq has been setup early */ +	if (adev->irq == 0) +		return; +	desc = irq_to_desc(adev->irq); +	chip = get_irq_chip(adev->irq); +	disable_irq(adev->irq); +	desc->status |= IRQ_MOVE_PCNTXT; +	irq_set_affinity(adev->irq, cpumask_of(adev->cpu)); +	/* APB timer irqs are set up as mp_irqs, timer is edge triggerred */ +	set_irq_chip_and_handler_name(adev->irq, chip, handle_edge_irq, "edge"); +	enable_irq(adev->irq); +	if (system_state == SYSTEM_BOOTING) +		if (request_irq(adev->irq, apbt_interrupt_handler, +				IRQF_TIMER | IRQF_DISABLED | IRQF_NOBALANCING, +				adev->name, adev)) { +			printk(KERN_ERR "Failed request IRQ for APBT%d\n", +			       adev->num); +		} +} +#endif + +static void apbt_enable_int(int n) +{ +	unsigned long ctrl = apbt_readl(n, APBTMR_N_CONTROL); +	/* clear pending intr */ +	apbt_readl(n, APBTMR_N_EOI); +	ctrl &= ~APBTMR_CONTROL_INT; +	apbt_writel(n, 
ctrl, APBTMR_N_CONTROL); +} + +static void apbt_disable_int(int n) +{ +	unsigned long ctrl = apbt_readl(n, APBTMR_N_CONTROL); + +	ctrl |= APBTMR_CONTROL_INT; +	apbt_writel(n, ctrl, APBTMR_N_CONTROL); +} + + +static int __init apbt_clockevent_register(void) +{ +	struct sfi_timer_table_entry *mtmr; +	struct apbt_dev *adev = &__get_cpu_var(cpu_apbt_dev); + +	mtmr = sfi_get_mtmr(APBT_CLOCKEVENT0_NUM); +	if (mtmr == NULL) { +		printk(KERN_ERR "Failed to get MTMR %d from SFI\n", +		       APBT_CLOCKEVENT0_NUM); +		return -ENODEV; +	} + +	/* +	 * We need to calculate the scaled math multiplication factor for +	 * nanosecond to apbt tick conversion. +	 * mult = (nsec/cycle)*2^APBT_SHIFT +	 */ +	apbt_clockevent.mult = div_sc((unsigned long) mtmr->freq_hz +				      , NSEC_PER_SEC, APBT_SHIFT); + +	/* Calculate the min / max delta */ +	apbt_clockevent.max_delta_ns = clockevent_delta2ns(0x7FFFFFFF, +							   &apbt_clockevent); +	apbt_clockevent.min_delta_ns = clockevent_delta2ns( +		APBT_MIN_DELTA_USEC*apbt_freq, +		&apbt_clockevent); +	/* +	 * Start apbt with the boot cpu mask and make it +	 * global if not used for per cpu timer. +	 */ +	apbt_clockevent.cpumask = cpumask_of(smp_processor_id()); +	adev->num = smp_processor_id(); +	memcpy(&adev->evt, &apbt_clockevent, sizeof(struct clock_event_device)); + +	if (disable_apbt_percpu) { +		apbt_clockevent.rating = APBT_CLOCKEVENT_RATING - 100; +		global_clock_event = &adev->evt; +		printk(KERN_DEBUG "%s clockevent registered as global\n", +		       global_clock_event->name); +	} + +	if (request_irq(apbt_clockevent.irq, apbt_interrupt_handler, +			IRQF_TIMER | IRQF_DISABLED | IRQF_NOBALANCING, +			apbt_clockevent.name, adev)) { +		printk(KERN_ERR "Failed request IRQ for APBT%d\n", +		       apbt_clockevent.irq); +	} + +	clockevents_register_device(&adev->evt); +	/* Start APBT 0 interrupts */ +	apbt_enable_int(APBT_CLOCKEVENT0_NUM); + +	sfi_free_mtmr(mtmr); +	return 0; +} + +#ifdef CONFIG_SMP +/* Should be called with per cpu */ +void apbt_setup_secondary_clock(void) +{ +	struct apbt_dev *adev; +	struct clock_event_device *aevt; +	int cpu; + +	/* Don't register boot CPU clockevent */ +	cpu = smp_processor_id(); +	if (cpu == boot_cpu_id) +		return; +	/* +	 * We need to calculate the scaled math multiplication factor for +	 * nanosecond to apbt tick conversion. +	 * mult = (nsec/cycle)*2^APBT_SHIFT +	 */ +	printk(KERN_INFO "Init per CPU clockevent %d\n", cpu); +	adev = &per_cpu(cpu_apbt_dev, cpu); +	aevt = &adev->evt; + +	memcpy(aevt, &apbt_clockevent, sizeof(*aevt)); +	aevt->cpumask = cpumask_of(cpu); +	aevt->name = adev->name; +	aevt->mode = CLOCK_EVT_MODE_UNUSED; + +	printk(KERN_INFO "Registering CPU %d clockevent device %s, mask %08x\n", +	       cpu, aevt->name, *(u32 *)aevt->cpumask); + +	apbt_setup_irq(adev); + +	clockevents_register_device(aevt); + +	apbt_enable_int(cpu); + +	return; +} + +/* + * this notify handler process CPU hotplug events. in case of S0i3, nonboot + * cpus are disabled/enabled frequently, for performance reasons, we keep the + * per cpu timer irq registered so that we do need to do free_irq/request_irq. + * + * TODO: it might be more reliable to directly disable percpu clockevent device + * without the notifier chain. currently, cpu 0 may get interrupts from other + * cpu timers during the offline process due to the ordering of notification. + * the extra interrupt is harmless. 
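apbt_clockevent_register() above computes the clockevent scaling factor as mult = div_sc(freq_hz, NSEC_PER_SEC, APBT_SHIFT), i.e. (freq << 22) / 1e9, after which a requested interval converts as ticks = (ns * mult) >> shift; the periodic case in apbt_set_mode() (next hunk) uses exactly that product for one jiffy. The snippet below redoes that fixed-point arithmetic standalone; the 14.318 MHz example frequency is made up and is not Langwell's actual APB clock.

    #include <stdint.h>
    #include <stdio.h>

    #define SHIFT        22U
    #define NSEC_PER_SEC 1000000000ULL

    /* mult = (cycles/ns) * 2^shift, rounded down, same idea as div_sc() */
    static uint32_t calc_mult(uint64_t freq_hz, unsigned int shift)
    {
        return (uint32_t)((freq_hz << shift) / NSEC_PER_SEC);
    }

    static uint64_t ns_to_ticks(uint64_t ns, uint32_t mult, unsigned int shift)
    {
        return (ns * mult) >> shift;
    }

    int main(void)
    {
        uint64_t freq = 14318180;           /* example 14.318 MHz clock */
        uint32_t mult = calc_mult(freq, SHIFT);

        /* ticks needed for one 10 ms (HZ=100) period */
        printf("mult=%u, 10ms = %llu ticks\n", mult,
               (unsigned long long)ns_to_ticks(10000000ULL, mult, SHIFT));
        return 0;
    }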
+ */ +static int apbt_cpuhp_notify(struct notifier_block *n, +			     unsigned long action, void *hcpu) +{ +	unsigned long cpu = (unsigned long)hcpu; +	struct apbt_dev *adev = &per_cpu(cpu_apbt_dev, cpu); + +	switch (action & 0xf) { +	case CPU_DEAD: +		apbt_disable_int(cpu); +		if (system_state == SYSTEM_RUNNING) +			pr_debug("skipping APBT CPU %lu offline\n", cpu); +		else if (adev) { +			pr_debug("APBT clockevent for cpu %lu offline\n", cpu); +			free_irq(adev->irq, adev); +		} +		break; +	default: +		pr_debug(KERN_INFO "APBT notified %lu, no action\n", action); +	} +	return NOTIFY_OK; +} + +static __init int apbt_late_init(void) +{ +	if (disable_apbt_percpu) +		return 0; +	/* This notifier should be called after workqueue is ready */ +	hotcpu_notifier(apbt_cpuhp_notify, -20); +	return 0; +} +fs_initcall(apbt_late_init); +#else + +void apbt_setup_secondary_clock(void) {} + +#endif /* CONFIG_SMP */ + +static void apbt_set_mode(enum clock_event_mode mode, +			  struct clock_event_device *evt) +{ +	unsigned long ctrl; +	uint64_t delta; +	int timer_num; +	struct apbt_dev *adev = EVT_TO_APBT_DEV(evt); + +	timer_num = adev->num; +	pr_debug("%s CPU %d timer %d mode=%d\n", +		 __func__, first_cpu(*evt->cpumask), timer_num, mode); + +	switch (mode) { +	case CLOCK_EVT_MODE_PERIODIC: +		delta = ((uint64_t)(NSEC_PER_SEC/HZ)) * apbt_clockevent.mult; +		delta >>= apbt_clockevent.shift; +		ctrl = apbt_readl(timer_num, APBTMR_N_CONTROL); +		ctrl |= APBTMR_CONTROL_MODE_PERIODIC; +		apbt_writel(timer_num, ctrl, APBTMR_N_CONTROL); +		/* +		 * DW APB p. 46, have to disable timer before load counter, +		 * may cause sync problem. +		 */ +		ctrl &= ~APBTMR_CONTROL_ENABLE; +		apbt_writel(timer_num, ctrl, APBTMR_N_CONTROL); +		udelay(1); +		pr_debug("Setting clock period %d for HZ %d\n", (int)delta, HZ); +		apbt_writel(timer_num, delta, APBTMR_N_LOAD_COUNT); +		ctrl |= APBTMR_CONTROL_ENABLE; +		apbt_writel(timer_num, ctrl, APBTMR_N_CONTROL); +		break; +		/* APB timer does not have one-shot mode, use free running mode */ +	case CLOCK_EVT_MODE_ONESHOT: +		ctrl = apbt_readl(timer_num, APBTMR_N_CONTROL); +		/* +		 * set free running mode, this mode will let timer reload max +		 * timeout which will give time (3min on 25MHz clock) to rearm +		 * the next event, therefore emulate the one-shot mode. +		 */ +		ctrl &= ~APBTMR_CONTROL_ENABLE; +		ctrl &= ~APBTMR_CONTROL_MODE_PERIODIC; + +		apbt_writel(timer_num, ctrl, APBTMR_N_CONTROL); +		/* write again to set free running mode */ +		apbt_writel(timer_num, ctrl, APBTMR_N_CONTROL); + +		/* +		 * DW APB p. 46, load counter with all 1s before starting free +		 * running mode. 
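Since the hardware has no true one-shot mode, the driver leaves the channel in free-running mode and re-arms it for every event: disable, write the delta into the load-count register, re-enable (see apbt_next_event() just below). The fragment sketches that reprogramming step against a hypothetical register layout; the offsets and the mmio_read()/mmio_write() helpers are invented for illustration and only loosely echo the APBTMR_N_* names.

    #include <stdint.h>

    /* Hypothetical per-channel registers, loosely modelled on a DW APB timer. */
    #define TMR_LOAD_COUNT  0x00
    #define TMR_CONTROL     0x08
    #define TMR_CTRL_ENABLE (1u << 0)

    /* Illustrative accessors; a real driver would use readl()/writel() on MMIO. */
    static inline uint32_t mmio_read(volatile uint32_t *base, uint32_t off)
    {
        return base[off / 4];
    }

    static inline void mmio_write(volatile uint32_t *base, uint32_t off, uint32_t val)
    {
        base[off / 4] = val;
    }

    /* Emulated one-shot: stop the down-counter, load the new delta, restart it. */
    static void program_next_event(volatile uint32_t *base, uint32_t delta_ticks)
    {
        uint32_t ctrl = mmio_read(base, TMR_CONTROL);

        mmio_write(base, TMR_CONTROL, ctrl & ~TMR_CTRL_ENABLE);
        mmio_write(base, TMR_LOAD_COUNT, delta_ticks);
        mmio_write(base, TMR_CONTROL, ctrl | TMR_CTRL_ENABLE);
    }

    int main(void)
    {
        static uint32_t regs[4];            /* fake register file for the sketch */
        program_next_event(regs, 12500);
        return regs[TMR_LOAD_COUNT / 4] == 12500 ? 0 : 1;
    }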
+		 */ +		apbt_writel(timer_num, ~0, APBTMR_N_LOAD_COUNT); +		ctrl &= ~APBTMR_CONTROL_INT; +		ctrl |= APBTMR_CONTROL_ENABLE; +		apbt_writel(timer_num, ctrl, APBTMR_N_CONTROL); +		break; + +	case CLOCK_EVT_MODE_UNUSED: +	case CLOCK_EVT_MODE_SHUTDOWN: +		apbt_disable_int(timer_num); +		ctrl = apbt_readl(timer_num, APBTMR_N_CONTROL); +		ctrl &= ~APBTMR_CONTROL_ENABLE; +		apbt_writel(timer_num, ctrl, APBTMR_N_CONTROL); +		break; + +	case CLOCK_EVT_MODE_RESUME: +		apbt_enable_int(timer_num); +		break; +	} +} + +static int apbt_next_event(unsigned long delta, +			   struct clock_event_device *evt) +{ +	unsigned long ctrl; +	int timer_num; + +	struct apbt_dev *adev = EVT_TO_APBT_DEV(evt); + +	timer_num = adev->num; +	/* Disable timer */ +	ctrl = apbt_readl(timer_num, APBTMR_N_CONTROL); +	ctrl &= ~APBTMR_CONTROL_ENABLE; +	apbt_writel(timer_num, ctrl, APBTMR_N_CONTROL); +	/* write new count */ +	apbt_writel(timer_num, delta, APBTMR_N_LOAD_COUNT); +	ctrl |= APBTMR_CONTROL_ENABLE; +	apbt_writel(timer_num, ctrl, APBTMR_N_CONTROL); +	return 0; +} + +/* + * APB timer clock is not in sync with pclk on Langwell, which translates to + * unreliable read value caused by sampling error. the error does not add up + * overtime and only happens when sampling a 0 as a 1 by mistake. so the time + * would go backwards. the following code is trying to prevent time traveling + * backwards. little bit paranoid. + */ +static cycle_t apbt_read_clocksource(struct clocksource *cs) +{ +	unsigned long t0, t1, t2; +	static unsigned long last_read; + +bad_count: +	t1 = apbt_readl(phy_cs_timer_id, +			APBTMR_N_CURRENT_VALUE); +	t2 = apbt_readl(phy_cs_timer_id, +			APBTMR_N_CURRENT_VALUE); +	if (unlikely(t1 < t2)) { +		pr_debug("APBT: read current count error %lx:%lx:%lx\n", +			 t1, t2, t2 - t1); +		goto bad_count; +	} +	/* +	 * check against cached last read, makes sure time does not go back. +	 * it could be a normal rollover but we will do tripple check anyway +	 */ +	if (unlikely(t2 > last_read)) { +		/* check if we have a normal rollover */ +		unsigned long raw_intr_status = +			apbt_readl_reg(APBTMRS_RAW_INT_STATUS); +		/* +		 * cs timer interrupt is masked but raw intr bit is set if +		 * rollover occurs. then we read EOI reg to clear it. +		 */ +		if (raw_intr_status & (1 << phy_cs_timer_id)) { +			apbt_readl(phy_cs_timer_id, APBTMR_N_EOI); +			goto out; +		} +		pr_debug("APB CS going back %lx:%lx:%lx ", +			 t2, last_read, t2 - last_read); +bad_count_x3: +		pr_debug(KERN_INFO "tripple check enforced\n"); +		t0 = apbt_readl(phy_cs_timer_id, +				APBTMR_N_CURRENT_VALUE); +		udelay(1); +		t1 = apbt_readl(phy_cs_timer_id, +				APBTMR_N_CURRENT_VALUE); +		udelay(1); +		t2 = apbt_readl(phy_cs_timer_id, +				APBTMR_N_CURRENT_VALUE); +		if ((t2 > t1) || (t1 > t0)) { +			printk(KERN_ERR "Error: APB CS tripple check failed\n"); +			goto bad_count_x3; +		} +	} +out: +	last_read = t2; +	return (cycle_t)~t2; +} + +static int apbt_clocksource_register(void) +{ +	u64 start, now; +	cycle_t t1; + +	/* Start the counter, use timer 2 as source, timer 0/1 for event */ +	apbt_start_counter(phy_cs_timer_id); + +	/* Verify whether apbt counter works */ +	t1 = apbt_read_clocksource(&clocksource_apbt); +	rdtscll(start); + +	/* +	 * We don't know the TSC frequency yet, but waiting for +	 * 200000 TSC cycles is safe: +	 * 4 GHz == 50us +	 * 1 GHz == 200us +	 */ +	do { +		rep_nop(); +		rdtscll(now); +	} while ((now - start) < 200000UL); + +	/* APBT is the only always on clocksource, it has to work! 
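apbt_read_clocksource() above works around the pclk sampling glitch by reading the down-counter twice and retrying whenever the second sample is larger than the first, with a delayed triple read as a last resort before giving up. A simplified userspace model of the double-read guard follows; the fake_counter stands in for the MMIO current-value register and the glitch itself is not simulated.

    #include <stdint.h>
    #include <stdio.h>

    /* Stand-in for the hardware down-counter; decrements on every sample. */
    static uint32_t fake_counter = 0xffffffffu;

    static uint32_t read_raw(void)
    {
        return fake_counter -= 3;   /* pretend time passed between reads */
    }

    /* Read until two consecutive samples are consistent for a down-counter. */
    static uint32_t read_stable(void)
    {
        uint32_t t1, t2;

        do {
            t1 = read_raw();
            t2 = read_raw();
        } while (t2 > t1);          /* a down-counter must not increase */

        return t2;
    }

    int main(void)
    {
        /* Invert the down-count into an up-counting cycle value, as the
         * driver does with ~t2 before handing it to the clocksource core. */
        printf("cycles=0x%08x\n", (unsigned)~read_stable());
        return 0;
    }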
*/ +	if (t1 == apbt_read_clocksource(&clocksource_apbt)) +		panic("APBT counter not counting. APBT disabled\n"); + +	/* +	 * initialize and register APBT clocksource +	 * convert that to ns/clock cycle +	 * mult = (ns/c) * 2^APBT_SHIFT +	 */ +	clocksource_apbt.mult = div_sc(MSEC_PER_SEC, +				       (unsigned long) apbt_freq, APBT_SHIFT); +	clocksource_register(&clocksource_apbt); + +	return 0; +} + +/* + * Early setup the APBT timer, only use timer 0 for booting then switch to + * per CPU timer if possible. + * returns 1 if per cpu apbt is setup + * returns 0 if no per cpu apbt is chosen + * panic if set up failed, this is the only platform timer on Moorestown. + */ +void __init apbt_time_init(void) +{ +#ifdef CONFIG_SMP +	int i; +	struct sfi_timer_table_entry *p_mtmr; +	unsigned int percpu_timer; +	struct apbt_dev *adev; +#endif + +	if (apb_timer_block_enabled) +		return; +	apbt_set_mapping(); +	if (apbt_virt_address) { +		pr_debug("Found APBT version 0x%lx\n",\ +			 apbt_readl_reg(APBTMRS_COMP_VERSION)); +	} else +		goto out_noapbt; +	/* +	 * Read the frequency and check for a sane value, for ESL model +	 * we extend the possible clock range to allow time scaling. +	 */ + +	if (apbt_freq < APBT_MIN_FREQ || apbt_freq > APBT_MAX_FREQ) { +		pr_debug("APBT has invalid freq 0x%llx\n", apbt_freq); +		goto out_noapbt; +	} +	if (apbt_clocksource_register()) { +		pr_debug("APBT has failed to register clocksource\n"); +		goto out_noapbt; +	} +	if (!apbt_clockevent_register()) +		apb_timer_block_enabled = 1; +	else { +		pr_debug("APBT has failed to register clockevent\n"); +		goto out_noapbt; +	} +#ifdef CONFIG_SMP +	/* kernel cmdline disable apb timer, so we will use lapic timers */ +	if (disable_apbt_percpu) { +		printk(KERN_INFO "apbt: disabled per cpu timer\n"); +		return; +	} +	pr_debug("%s: %d CPUs online\n", __func__, num_online_cpus()); +	if (num_possible_cpus() <= sfi_mtimer_num) { +		percpu_timer = 1; +		apbt_num_timers_used = num_possible_cpus(); +	} else { +		percpu_timer = 0; +		apbt_num_timers_used = 1; +		adev = &per_cpu(cpu_apbt_dev, 0); +		adev->flags &= ~APBT_DEV_USED; +	} +	pr_debug("%s: %d APB timers used\n", __func__, apbt_num_timers_used); + +	/* here we set up per CPU timer data structure */ +	apbt_devs = kzalloc(sizeof(struct apbt_dev) * apbt_num_timers_used, +			    GFP_KERNEL); +	if (!apbt_devs) { +		printk(KERN_ERR "Failed to allocate APB timer devices\n"); +		return; +	} +	for (i = 0; i < apbt_num_timers_used; i++) { +		adev = &per_cpu(cpu_apbt_dev, i); +		adev->num = i; +		adev->cpu = i; +		p_mtmr = sfi_get_mtmr(i); +		if (p_mtmr) { +			adev->tick = p_mtmr->freq_hz; +			adev->irq = p_mtmr->irq; +		} else +			printk(KERN_ERR "Failed to get timer for cpu %d\n", i); +		adev->count = 0; +		sprintf(adev->name, "apbt%d", i); +	} +#endif + +	return; + +out_noapbt: +	apbt_clear_mapping(); +	apb_timer_block_enabled = 0; +	panic("failed to enable APB timer\n"); +} + +static inline void apbt_disable(int n) +{ +	if (is_apbt_capable()) { +		unsigned long ctrl =  apbt_readl(n, APBTMR_N_CONTROL); +		ctrl &= ~APBTMR_CONTROL_ENABLE; +		apbt_writel(n, ctrl, APBTMR_N_CONTROL); +	} +} + +/* called before apb_timer_enable, use early map */ +unsigned long apbt_quick_calibrate() +{ +	int i, scale; +	u64 old, new; +	cycle_t t1, t2; +	unsigned long khz = 0; +	u32 loop, shift; + +	apbt_set_mapping(); +	apbt_start_counter(phy_cs_timer_id); + +	/* check if the timer can count down, otherwise return */ +	old = apbt_read_clocksource(&clocksource_apbt); +	i = 10000; +	while (--i) { +		if (old != 
apbt_read_clocksource(&clocksource_apbt)) +			break; +	} +	if (!i) +		goto failed; + +	/* count 16 ms */ +	loop = (apbt_freq * 1000) << 4; + +	/* restart the timer to ensure it won't get to 0 in the calibration */ +	apbt_start_counter(phy_cs_timer_id); + +	old = apbt_read_clocksource(&clocksource_apbt); +	old += loop; + +	t1 = __native_read_tsc(); + +	do { +		new = apbt_read_clocksource(&clocksource_apbt); +	} while (new < old); + +	t2 = __native_read_tsc(); + +	shift = 5; +	if (unlikely(loop >> shift == 0)) { +		printk(KERN_INFO +		       "APBT TSC calibration failed, not enough resolution\n"); +		return 0; +	} +	scale = (int)div_u64((t2 - t1), loop >> shift); +	khz = (scale * apbt_freq * 1000) >> shift; +	printk(KERN_INFO "TSC freq calculated by APB timer is %lu khz\n", khz); +	return khz; +failed: +	return 0; +} diff --git a/arch/x86/kernel/aperture_64.c b/arch/x86/kernel/aperture_64.c index f147a95fd84a..3704997e8b25 100644 --- a/arch/x86/kernel/aperture_64.c +++ b/arch/x86/kernel/aperture_64.c @@ -31,7 +31,6 @@  #include <asm/x86_init.h>  int gart_iommu_aperture; -EXPORT_SYMBOL_GPL(gart_iommu_aperture);  int gart_iommu_aperture_disabled __initdata;  int gart_iommu_aperture_allowed __initdata; diff --git a/arch/x86/kernel/apic/apic.c b/arch/x86/kernel/apic/apic.c index 6e29b2a77aa8..00187f1fcfb7 100644 --- a/arch/x86/kernel/apic/apic.c +++ b/arch/x86/kernel/apic/apic.c @@ -1390,7 +1390,7 @@ void __init enable_IR_x2apic(void)  	}  	local_irq_save(flags); -	mask_8259A(); +	legacy_pic->mask_all();  	mask_IO_APIC_setup(ioapic_entries);  	if (dmar_table_init_ret) @@ -1422,7 +1422,7 @@ void __init enable_IR_x2apic(void)  nox2apic:  	if (!ret) /* IR enabling failed */  		restore_IO_APIC_setup(ioapic_entries); -	unmask_8259A(); +	legacy_pic->restore_mask();  	local_irq_restore(flags);  out: @@ -2018,7 +2018,7 @@ static int lapic_resume(struct sys_device *dev)  		}  		mask_IO_APIC_setup(ioapic_entries); -		mask_8259A(); +		legacy_pic->mask_all();  	}  	if (x2apic_mode) @@ -2062,7 +2062,7 @@ static int lapic_resume(struct sys_device *dev)  	if (intr_remapping_enabled) {  		reenable_intr_remapping(x2apic_mode); -		unmask_8259A(); +		legacy_pic->restore_mask();  		restore_IO_APIC_setup(ioapic_entries);  		free_ioapic_entries(ioapic_entries);  	} diff --git a/arch/x86/kernel/apic/apic_flat_64.c b/arch/x86/kernel/apic/apic_flat_64.c index e3c3d820c325..09d3b17ce0c2 100644 --- a/arch/x86/kernel/apic/apic_flat_64.c +++ b/arch/x86/kernel/apic/apic_flat_64.c @@ -223,7 +223,7 @@ struct apic apic_flat =  {  };  /* - * Physflat mode is used when there are more than 8 CPUs on a AMD system. + * Physflat mode is used when there are more than 8 CPUs on a system.   * We cannot use logical delivery in this case because the mask   * overflows, so use physical mode.   
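apbt_quick_calibrate() above waits for a known number of APB ticks (loop, roughly 16 ms worth, with apbt_freq kept in ticks per microsecond) and then derives the TSC frequency as khz = (scale * apbt_freq * 1000) >> shift, where scale = (t2 - t1) / (loop >> shift). The helper below reproduces just that arithmetic with made-up sample numbers, so the expected output is about 1.6 GHz; it performs no hardware measurement.

    #include <stdint.h>
    #include <stdio.h>

    /* ref_ticks_per_us mirrors apbt_freq, stored as ticks per microsecond. */
    static unsigned long tsc_khz_from_ref(uint64_t tsc_delta,
                                          uint32_t ref_ticks,
                                          unsigned long ref_ticks_per_us,
                                          unsigned int shift)
    {
        /* TSC cycles seen per (ref_ticks >> shift) reference ticks */
        unsigned long scale = (unsigned long)(tsc_delta / (ref_ticks >> shift));

        /* scale times ticks-per-ms, shifted back down, gives kHz */
        return (scale * ref_ticks_per_us * 1000) >> shift;
    }

    int main(void)
    {
        /* Example: ~14 MHz reference clock, ~1.6 GHz TSC, 16 ms window. */
        unsigned long ref_per_us = 14;
        uint32_t loop = (14 * 1000) << 4;   /* ~16 ms of reference ticks */
        uint64_t tsc_delta = 25600000;      /* TSC cycles seen in that window */

        printf("TSC ~ %lu kHz\n",
               tsc_khz_from_ref(tsc_delta, loop, ref_per_us, 5));
        return 0;
    }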
*/ diff --git a/arch/x86/kernel/apic/io_apic.c b/arch/x86/kernel/apic/io_apic.c index 14862f11cc4a..463de9a858ad 100644 --- a/arch/x86/kernel/apic/io_apic.c +++ b/arch/x86/kernel/apic/io_apic.c @@ -143,12 +143,6 @@ static struct irq_cfg irq_cfgx[NR_IRQS_LEGACY];  static struct irq_cfg irq_cfgx[NR_IRQS];  #endif -void __init io_apic_disable_legacy(void) -{ -	nr_legacy_irqs = 0; -	nr_irqs_gsi = 0; -} -  int __init arch_early_irq_init(void)  {  	struct irq_cfg *cfg; @@ -157,6 +151,11 @@ int __init arch_early_irq_init(void)  	int node;  	int i; +	if (!legacy_pic->nr_legacy_irqs) { +		nr_irqs_gsi = 0; +		io_apic_irqs = ~0UL; +	} +  	cfg = irq_cfgx;  	count = ARRAY_SIZE(irq_cfgx);  	node= cpu_to_node(boot_cpu_id); @@ -170,7 +169,7 @@ int __init arch_early_irq_init(void)  		 * For legacy IRQ's, start with assigning irq0 to irq15 to  		 * IRQ0_VECTOR to IRQ15_VECTOR on cpu 0.  		 */ -		if (i < nr_legacy_irqs) { +		if (i < legacy_pic->nr_legacy_irqs) {  			cfg[i].vector = IRQ0_VECTOR + i;  			cpumask_set_cpu(0, cfg[i].domain);  		} @@ -852,7 +851,7 @@ static int __init find_isa_irq_apic(int irq, int type)   */  static int EISA_ELCR(unsigned int irq)  { -	if (irq < nr_legacy_irqs) { +	if (irq < legacy_pic->nr_legacy_irqs) {  		unsigned int port = 0x4d0 + (irq >> 3);  		return (inb(port) >> (irq & 7)) & 1;  	} @@ -1269,6 +1268,14 @@ void __setup_vector_irq(int cpu)  	/* Mark the inuse vectors */  	for_each_irq_desc(irq, desc) {  		cfg = desc->chip_data; + +		/* +		 * If it is a legacy IRQ handled by the legacy PIC, this cpu +		 * will be part of the irq_cfg's domain. +		 */ +		if (irq < legacy_pic->nr_legacy_irqs && !IO_APIC_IRQ(irq)) +			cpumask_set_cpu(cpu, cfg->domain); +  		if (!cpumask_test_cpu(cpu, cfg->domain))  			continue;  		vector = cfg->vector; @@ -1439,7 +1446,7 @@ static void setup_IO_APIC_irq(int apic_id, int pin, unsigned int irq, struct irq  	 * controllers like 8259. Now that IO-APIC can handle this irq, update  	 * the cfg->domain.  	 
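The io_apic.c changes in this and the following hunks stop calling the 8259 helpers directly and instead go through a legacy_pic descriptor, so a PIC-less platform such as Moorestown can install a null implementation that reports nr_legacy_irqs == 0. The sketch below shows that ops-plus-null-object pattern in isolation; the struct and instances only loosely mirror the real struct legacy_pic and are not the kernel's definitions.

    #include <stdio.h>

    struct pic_ops {
        int  nr_legacy_irqs;
        void (*mask)(int irq);
        void (*unmask)(int irq);
    };

    static void i8259_mask(int irq)   { printf("8259: mask irq %d\n", irq); }
    static void i8259_unmask(int irq) { printf("8259: unmask irq %d\n", irq); }
    static void null_op(int irq)      { (void)irq; /* no PIC present: nothing to do */ }

    static const struct pic_ops i8259_pic = {
        .nr_legacy_irqs = 16,
        .mask   = i8259_mask,
        .unmask = i8259_unmask,
    };

    static const struct pic_ops null_pic = {
        .nr_legacy_irqs = 0,
        .mask   = null_op,
        .unmask = null_op,
    };

    /* Callers test nr_legacy_irqs instead of hard-coding "irq < 16". */
    static void setup_irq(const struct pic_ops *pic, int irq)
    {
        if (irq < pic->nr_legacy_irqs)
            pic->mask(irq);             /* quiesce the legacy line first */
        printf("irq %d routed through the IO-APIC\n", irq);
    }

    int main(void)
    {
        setup_irq(&i8259_pic, 3);   /* legacy PC: PIC line masked first */
        setup_irq(&null_pic, 3);    /* MID platform: no PIC to touch */
        return 0;
    }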
*/ -	if (irq < nr_legacy_irqs && cpumask_test_cpu(0, cfg->domain)) +	if (irq < legacy_pic->nr_legacy_irqs && cpumask_test_cpu(0, cfg->domain))  		apic->vector_allocation_domain(0, cfg->domain);  	if (assign_irq_vector(irq, cfg, apic->target_cpus())) @@ -1463,8 +1470,8 @@ static void setup_IO_APIC_irq(int apic_id, int pin, unsigned int irq, struct irq  	}  	ioapic_register_intr(irq, desc, trigger); -	if (irq < nr_legacy_irqs) -		disable_8259A_irq(irq); +	if (irq < legacy_pic->nr_legacy_irqs) +		legacy_pic->chip->mask(irq);  	ioapic_write_entry(apic_id, pin, entry);  } @@ -1873,7 +1880,7 @@ __apicdebuginit(void) print_PIC(void)  	unsigned int v;  	unsigned long flags; -	if (!nr_legacy_irqs) +	if (!legacy_pic->nr_legacy_irqs)  		return;  	printk(KERN_DEBUG "\nprinting PIC contents\n"); @@ -1957,7 +1964,7 @@ void __init enable_IO_APIC(void)  		nr_ioapic_registers[apic] = reg_01.bits.entries+1;  	} -	if (!nr_legacy_irqs) +	if (!legacy_pic->nr_legacy_irqs)  		return;  	for(apic = 0; apic < nr_ioapics; apic++) { @@ -2014,7 +2021,7 @@ void disable_IO_APIC(void)  	 */  	clear_IO_APIC(); -	if (!nr_legacy_irqs) +	if (!legacy_pic->nr_legacy_irqs)  		return;  	/* @@ -2247,9 +2254,9 @@ static unsigned int startup_ioapic_irq(unsigned int irq)  	struct irq_cfg *cfg;  	raw_spin_lock_irqsave(&ioapic_lock, flags); -	if (irq < nr_legacy_irqs) { -		disable_8259A_irq(irq); -		if (i8259A_irq_pending(irq)) +	if (irq < legacy_pic->nr_legacy_irqs) { +		legacy_pic->chip->mask(irq); +		if (legacy_pic->irq_pending(irq))  			was_pending = 1;  	}  	cfg = irq_cfg(irq); @@ -2782,8 +2789,8 @@ static inline void init_IO_APIC_traps(void)  			 * so default to an old-fashioned 8259  			 * interrupt if we can..  			 */ -			if (irq < nr_legacy_irqs) -				make_8259A_irq(irq); +			if (irq < legacy_pic->nr_legacy_irqs) +				legacy_pic->make_irq(irq);  			else  				/* Strange. Oh, well.. */  				desc->chip = &no_irq_chip; @@ -2940,7 +2947,7 @@ static inline void __init check_timer(void)  	/*  	 * get/set the timer IRQ vector:  	 */ -	disable_8259A_irq(0); +	legacy_pic->chip->mask(0);  	assign_irq_vector(0, cfg, apic->target_cpus());  	/* @@ -2953,7 +2960,7 @@ static inline void __init check_timer(void)  	 * automatically.  	 */  	apic_write(APIC_LVT0, APIC_LVT_MASKED | APIC_DM_EXTINT); -	init_8259A(1); +	legacy_pic->init(1);  #ifdef CONFIG_X86_32  	{  		unsigned int ver; @@ -3012,7 +3019,7 @@ static inline void __init check_timer(void)  		if (timer_irq_works()) {  			if (nmi_watchdog == NMI_IO_APIC) {  				setup_nmi(); -				enable_8259A_irq(0); +				legacy_pic->chip->unmask(0);  			}  			if (disable_timer_pin_1 > 0)  				clear_IO_APIC_pin(0, pin1); @@ -3035,14 +3042,14 @@ static inline void __init check_timer(void)  		 */  		replace_pin_at_irq_node(cfg, node, apic1, pin1, apic2, pin2);  		setup_timer_IRQ0_pin(apic2, pin2, cfg->vector); -		enable_8259A_irq(0); +		legacy_pic->chip->unmask(0);  		if (timer_irq_works()) {  			apic_printk(APIC_QUIET, KERN_INFO "....... works.\n");  			timer_through_8259 = 1;  			if (nmi_watchdog == NMI_IO_APIC) { -				disable_8259A_irq(0); +				legacy_pic->chip->mask(0);  				setup_nmi(); -				enable_8259A_irq(0); +				legacy_pic->chip->unmask(0);  			}  			goto out;  		} @@ -3050,7 +3057,7 @@ static inline void __init check_timer(void)  		 * Cleanup, just in case ...  		 */  		local_irq_disable(); -		disable_8259A_irq(0); +		legacy_pic->chip->mask(0);  		clear_IO_APIC_pin(apic2, pin2);  		apic_printk(APIC_QUIET, KERN_INFO "....... 
failed.\n");  	} @@ -3069,22 +3076,22 @@ static inline void __init check_timer(void)  	lapic_register_intr(0, desc);  	apic_write(APIC_LVT0, APIC_DM_FIXED | cfg->vector);	/* Fixed mode */ -	enable_8259A_irq(0); +	legacy_pic->chip->unmask(0);  	if (timer_irq_works()) {  		apic_printk(APIC_QUIET, KERN_INFO "..... works.\n");  		goto out;  	}  	local_irq_disable(); -	disable_8259A_irq(0); +	legacy_pic->chip->mask(0);  	apic_write(APIC_LVT0, APIC_LVT_MASKED | APIC_DM_FIXED | cfg->vector);  	apic_printk(APIC_QUIET, KERN_INFO "..... failed.\n");  	apic_printk(APIC_QUIET, KERN_INFO  		    "...trying to set up timer as ExtINT IRQ...\n"); -	init_8259A(0); -	make_8259A_irq(0); +	legacy_pic->init(0); +	legacy_pic->make_irq(0);  	apic_write(APIC_LVT0, APIC_DM_EXTINT);  	unlock_ExtINT_logic(); @@ -3126,7 +3133,7 @@ void __init setup_IO_APIC(void)  	/*  	 * calling enable_IO_APIC() is moved to setup_local_APIC for BP  	 */ -	io_apic_irqs = nr_legacy_irqs ? ~PIC_IRQS : ~0UL; +	io_apic_irqs = legacy_pic->nr_legacy_irqs ? ~PIC_IRQS : ~0UL;  	apic_printk(APIC_VERBOSE, "ENABLING IO-APIC IRQs\n");  	/* @@ -3137,7 +3144,7 @@ void __init setup_IO_APIC(void)  	sync_Arb_IDs();  	setup_IO_APIC_irqs();  	init_IO_APIC_traps(); -	if (nr_legacy_irqs) +	if (legacy_pic->nr_legacy_irqs)  		check_timer();  } @@ -3928,7 +3935,7 @@ static int __io_apic_set_pci_routing(struct device *dev, int irq,  	/*  	 * IRQs < 16 are already in the irq_2_pin[] map  	 */ -	if (irq >= nr_legacy_irqs) { +	if (irq >= legacy_pic->nr_legacy_irqs) {  		cfg = desc->chip_data;  		if (add_pin_to_irq_node_nopanic(cfg, node, ioapic, pin)) {  			printk(KERN_INFO "can not add pin %d for irq %d\n", @@ -4302,3 +4309,24 @@ void __init mp_register_ioapic(int id, u32 address, u32 gsi_base)  	nr_ioapics++;  } + +/* Enable IOAPIC early just for system timer */ +void __init pre_init_apic_IRQ0(void) +{ +	struct irq_cfg *cfg; +	struct irq_desc *desc; + +	printk(KERN_INFO "Early APIC setup for system timer0\n"); +#ifndef CONFIG_SMP +	phys_cpu_present_map = physid_mask_of_physid(boot_cpu_physical_apicid); +#endif +	desc = irq_to_desc_alloc_node(0, 0); + +	setup_local_APIC(); + +	cfg = irq_cfg(0); +	add_pin_to_irq_node(cfg, 0, 0, 0); +	set_irq_chip_and_handler_name(0, &ioapic_chip, handle_edge_irq, "edge"); + +	setup_IO_APIC_irq(0, 0, 0, desc, 0, 0); +} diff --git a/arch/x86/kernel/apic/nmi.c b/arch/x86/kernel/apic/nmi.c index bd7c96b5e8d8..8aa65adbd25d 100644 --- a/arch/x86/kernel/apic/nmi.c +++ b/arch/x86/kernel/apic/nmi.c @@ -177,7 +177,7 @@ int __init check_nmi_watchdog(void)  error:  	if (nmi_watchdog == NMI_IO_APIC) {  		if (!timer_through_8259) -			disable_8259A_irq(0); +			legacy_pic->chip->mask(0);  		on_each_cpu(__acpi_nmi_disable, NULL, 1);  	} diff --git a/arch/x86/kernel/apic/numaq_32.c b/arch/x86/kernel/apic/numaq_32.c index 47dd856708e5..3e28401f161c 100644 --- a/arch/x86/kernel/apic/numaq_32.c +++ b/arch/x86/kernel/apic/numaq_32.c @@ -277,6 +277,7 @@ static __init void early_check_numaq(void)  		x86_init.mpparse.mpc_oem_pci_bus = mpc_oem_pci_bus;  		x86_init.mpparse.mpc_oem_bus_info = mpc_oem_bus_info;  		x86_init.timers.tsc_pre_init = numaq_tsc_init; +		x86_init.pci.init = pci_numaq_init;  	}  } diff --git a/arch/x86/kernel/apic/x2apic_uv_x.c b/arch/x86/kernel/apic/x2apic_uv_x.c index 3740c8a4eae7..49dbeaef2a27 100644 --- a/arch/x86/kernel/apic/x2apic_uv_x.c +++ b/arch/x86/kernel/apic/x2apic_uv_x.c @@ -120,11 +120,9 @@ EXPORT_SYMBOL_GPL(uv_possible_blades);  unsigned long sn_rtc_cycles_per_second;  EXPORT_SYMBOL(sn_rtc_cycles_per_second); -/* 
Start with all IRQs pointing to boot CPU.  IRQ balancing will shift them. */ -  static const struct cpumask *uv_target_cpus(void)  { -	return cpumask_of(0); +	return cpu_online_mask;  }  static void uv_vector_allocation_domain(int cpu, struct cpumask *retmask) diff --git a/arch/x86/kernel/cpu/cpufreq/Kconfig b/arch/x86/kernel/cpu/cpufreq/Kconfig index f138c6c389b9..870e6cc6ad28 100644 --- a/arch/x86/kernel/cpu/cpufreq/Kconfig +++ b/arch/x86/kernel/cpu/cpufreq/Kconfig @@ -10,6 +10,20 @@ if CPU_FREQ  comment "CPUFreq processor drivers" +config X86_PCC_CPUFREQ +	tristate "Processor Clocking Control interface driver" +	depends on ACPI && ACPI_PROCESSOR +	help +	  This driver adds support for the PCC interface. + +	  For details, take a look at: +	  <file:Documentation/cpu-freq/pcc-cpufreq.txt>. + +	  To compile this driver as a module, choose M here: the +	  module will be called pcc-cpufreq. + +	  If in doubt, say N. +  config X86_ACPI_CPUFREQ  	tristate "ACPI Processor P-States driver"  	select CPU_FREQ_TABLE diff --git a/arch/x86/kernel/cpu/cpufreq/Makefile b/arch/x86/kernel/cpu/cpufreq/Makefile index 509296df294d..1840c0a5170b 100644 --- a/arch/x86/kernel/cpu/cpufreq/Makefile +++ b/arch/x86/kernel/cpu/cpufreq/Makefile @@ -4,6 +4,7 @@  obj-$(CONFIG_X86_POWERNOW_K8)		+= powernow-k8.o  obj-$(CONFIG_X86_ACPI_CPUFREQ)		+= acpi-cpufreq.o +obj-$(CONFIG_X86_PCC_CPUFREQ)		+= pcc-cpufreq.o  obj-$(CONFIG_X86_POWERNOW_K6)		+= powernow-k6.o  obj-$(CONFIG_X86_POWERNOW_K7)		+= powernow-k7.o  obj-$(CONFIG_X86_LONGHAUL)		+= longhaul.o diff --git a/arch/x86/kernel/cpu/cpufreq/pcc-cpufreq.c b/arch/x86/kernel/cpu/cpufreq/pcc-cpufreq.c new file mode 100644 index 000000000000..ff36d2979a90 --- /dev/null +++ b/arch/x86/kernel/cpu/cpufreq/pcc-cpufreq.c @@ -0,0 +1,620 @@ +/* + *  pcc-cpufreq.c - Processor Clocking Control firmware cpufreq interface + * + *  Copyright (C) 2009 Red Hat, Matthew Garrett <mjg@redhat.com> + *  Copyright (C) 2009 Hewlett-Packard Development Company, L.P. + *	Nagananda Chumbalkar <nagananda.chumbalkar@hp.com> + * + * ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + * + *  This program is free software; you can redistribute it and/or modify + *  it under the terms of the GNU General Public License as published by + *  the Free Software Foundation; version 2 of the License. + * + *  This program is distributed in the hope that it will be useful, but + *  WITHOUT ANY WARRANTY; without even the implied warranty of + *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or NON + *  INFRINGEMENT. See the GNU General Public License for more details. + * + *  You should have received a copy of the GNU General Public License along + *  with this program; if not, write to the Free Software Foundation, Inc., + *  675 Mass Ave, Cambridge, MA 02139, USA. + * + * ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + */ + +#include <linux/kernel.h> +#include <linux/module.h> +#include <linux/init.h> +#include <linux/smp.h> +#include <linux/sched.h> +#include <linux/cpufreq.h> +#include <linux/compiler.h> + +#include <linux/acpi.h> +#include <linux/io.h> +#include <linux/spinlock.h> +#include <linux/uaccess.h> + +#include <acpi/processor.h> + +#define PCC_VERSION 	"1.00.00" +#define POLL_LOOPS 	300 + +#define CMD_COMPLETE 	0x1 +#define CMD_GET_FREQ 	0x0 +#define CMD_SET_FREQ 	0x1 + +#define BUF_SZ		4 + +#define dprintk(msg...) 
cpufreq_debug_printk(CPUFREQ_DEBUG_DRIVER,	\ +					     "pcc-cpufreq", msg) + +struct pcc_register_resource { +	u8 descriptor; +	u16 length; +	u8 space_id; +	u8 bit_width; +	u8 bit_offset; +	u8 access_size; +	u64 address; +} __attribute__ ((packed)); + +struct pcc_memory_resource { +	u8 descriptor; +	u16 length; +	u8 space_id; +	u8 resource_usage; +	u8 type_specific; +	u64 granularity; +	u64 minimum; +	u64 maximum; +	u64 translation_offset; +	u64 address_length; +} __attribute__ ((packed)); + +static struct cpufreq_driver pcc_cpufreq_driver; + +struct pcc_header { +	u32 signature; +	u16 length; +	u8 major; +	u8 minor; +	u32 features; +	u16 command; +	u16 status; +	u32 latency; +	u32 minimum_time; +	u32 maximum_time; +	u32 nominal; +	u32 throttled_frequency; +	u32 minimum_frequency; +}; + +static void __iomem *pcch_virt_addr; +static struct pcc_header __iomem *pcch_hdr; + +static DEFINE_SPINLOCK(pcc_lock); + +static struct acpi_generic_address doorbell; + +static u64 doorbell_preserve; +static u64 doorbell_write; + +static u8 OSC_UUID[16] = {0x63, 0x9B, 0x2C, 0x9F, 0x70, 0x91, 0x49, 0x1f, +			  0xBB, 0x4F, 0xA5, 0x98, 0x2F, 0xA1, 0xB5, 0x46}; + +struct pcc_cpu { +	u32 input_offset; +	u32 output_offset; +}; + +static struct pcc_cpu *pcc_cpu_info; + +static int pcc_cpufreq_verify(struct cpufreq_policy *policy) +{ +	cpufreq_verify_within_limits(policy, policy->cpuinfo.min_freq, +				     policy->cpuinfo.max_freq); +	return 0; +} + +static inline void pcc_cmd(void) +{ +	u64 doorbell_value; +	int i; + +	acpi_read(&doorbell_value, &doorbell); +	acpi_write((doorbell_value & doorbell_preserve) | doorbell_write, +		   &doorbell); + +	for (i = 0; i < POLL_LOOPS; i++) { +		if (ioread16(&pcch_hdr->status) & CMD_COMPLETE) +			break; +	} +} + +static inline void pcc_clear_mapping(void) +{ +	if (pcch_virt_addr) +		iounmap(pcch_virt_addr); +	pcch_virt_addr = NULL; +} + +static unsigned int pcc_get_freq(unsigned int cpu) +{ +	struct pcc_cpu *pcc_cpu_data; +	unsigned int curr_freq; +	unsigned int freq_limit; +	u16 status; +	u32 input_buffer; +	u32 output_buffer; + +	spin_lock(&pcc_lock); + +	dprintk("get: get_freq for CPU %d\n", cpu); +	pcc_cpu_data = per_cpu_ptr(pcc_cpu_info, cpu); + +	input_buffer = 0x1; +	iowrite32(input_buffer, +			(pcch_virt_addr + pcc_cpu_data->input_offset)); +	iowrite16(CMD_GET_FREQ, &pcch_hdr->command); + +	pcc_cmd(); + +	output_buffer = +		ioread32(pcch_virt_addr + pcc_cpu_data->output_offset); + +	/* Clear the input buffer - we are done with the current command */ +	memset_io((pcch_virt_addr + pcc_cpu_data->input_offset), 0, BUF_SZ); + +	status = ioread16(&pcch_hdr->status); +	if (status != CMD_COMPLETE) { +		dprintk("get: FAILED: for CPU %d, status is %d\n", +			cpu, status); +		goto cmd_incomplete; +	} +	iowrite16(0, &pcch_hdr->status); +	curr_freq = (((ioread32(&pcch_hdr->nominal) * (output_buffer & 0xff)) +			/ 100) * 1000); + +	dprintk("get: SUCCESS: (virtual) output_offset for cpu %d is " +		"0x%x, contains a value of: 0x%x. 
Speed is: %d MHz\n", +		cpu, (pcch_virt_addr + pcc_cpu_data->output_offset), +		output_buffer, curr_freq); + +	freq_limit = (output_buffer >> 8) & 0xff; +	if (freq_limit != 0xff) { +		dprintk("get: frequency for cpu %d is being temporarily" +			" capped at %d\n", cpu, curr_freq); +	} + +	spin_unlock(&pcc_lock); +	return curr_freq; + +cmd_incomplete: +	iowrite16(0, &pcch_hdr->status); +	spin_unlock(&pcc_lock); +	return -EINVAL; +} + +static int pcc_cpufreq_target(struct cpufreq_policy *policy, +			      unsigned int target_freq, +			      unsigned int relation) +{ +	struct pcc_cpu *pcc_cpu_data; +	struct cpufreq_freqs freqs; +	u16 status; +	u32 input_buffer; +	int cpu; + +	spin_lock(&pcc_lock); +	cpu = policy->cpu; +	pcc_cpu_data = per_cpu_ptr(pcc_cpu_info, cpu); + +	dprintk("target: CPU %d should go to target freq: %d " +		"(virtual) input_offset is 0x%x\n", +		cpu, target_freq, +		(pcch_virt_addr + pcc_cpu_data->input_offset)); + +	freqs.new = target_freq; +	freqs.cpu = cpu; +	cpufreq_notify_transition(&freqs, CPUFREQ_PRECHANGE); + +	input_buffer = 0x1 | (((target_freq * 100) +			       / (ioread32(&pcch_hdr->nominal) * 1000)) << 8); +	iowrite32(input_buffer, +			(pcch_virt_addr + pcc_cpu_data->input_offset)); +	iowrite16(CMD_SET_FREQ, &pcch_hdr->command); + +	pcc_cmd(); + +	/* Clear the input buffer - we are done with the current command */ +	memset_io((pcch_virt_addr + pcc_cpu_data->input_offset), 0, BUF_SZ); + +	status = ioread16(&pcch_hdr->status); +	if (status != CMD_COMPLETE) { +		dprintk("target: FAILED for cpu %d, with status: 0x%x\n", +			cpu, status); +		goto cmd_incomplete; +	} +	iowrite16(0, &pcch_hdr->status); + +	cpufreq_notify_transition(&freqs, CPUFREQ_POSTCHANGE); +	dprintk("target: was SUCCESSFUL for cpu %d\n", cpu); +	spin_unlock(&pcc_lock); + +	return 0; + +cmd_incomplete: +	iowrite16(0, &pcch_hdr->status); +	spin_unlock(&pcc_lock); +	return -EINVAL; +} + +static int pcc_get_offset(int cpu) +{ +	acpi_status status; +	struct acpi_buffer buffer = {ACPI_ALLOCATE_BUFFER, NULL}; +	union acpi_object *pccp, *offset; +	struct pcc_cpu *pcc_cpu_data; +	struct acpi_processor *pr; +	int ret = 0; + +	pr = per_cpu(processors, cpu); +	pcc_cpu_data = per_cpu_ptr(pcc_cpu_info, cpu); + +	status = acpi_evaluate_object(pr->handle, "PCCP", NULL, &buffer); +	if (ACPI_FAILURE(status)) +		return -ENODEV; + +	pccp = buffer.pointer; +	if (!pccp || pccp->type != ACPI_TYPE_PACKAGE) { +		ret = -ENODEV; +		goto out_free; +	}; + +	offset = &(pccp->package.elements[0]); +	if (!offset || offset->type != ACPI_TYPE_INTEGER) { +		ret = -ENODEV; +		goto out_free; +	} + +	pcc_cpu_data->input_offset = offset->integer.value; + +	offset = &(pccp->package.elements[1]); +	if (!offset || offset->type != ACPI_TYPE_INTEGER) { +		ret = -ENODEV; +		goto out_free; +	} + +	pcc_cpu_data->output_offset = offset->integer.value; + +	memset_io((pcch_virt_addr + pcc_cpu_data->input_offset), 0, BUF_SZ); +	memset_io((pcch_virt_addr + pcc_cpu_data->output_offset), 0, BUF_SZ); + +	dprintk("pcc_get_offset: for CPU %d: pcc_cpu_data " +		"input_offset: 0x%x, pcc_cpu_data output_offset: 0x%x\n", +		cpu, pcc_cpu_data->input_offset, pcc_cpu_data->output_offset); +out_free: +	kfree(buffer.pointer); +	return ret; +} + +static int __init pcc_cpufreq_do_osc(acpi_handle *handle) +{ +	acpi_status status; +	struct acpi_object_list input; +	struct acpi_buffer output = {ACPI_ALLOCATE_BUFFER, NULL}; +	union acpi_object in_params[4]; +	union acpi_object *out_obj; +	u32 capabilities[2]; +	u32 errors; +	u32 supported; +	int ret = 0; + +	
input.count = 4; +	input.pointer = in_params; +	input.count = 4; +	input.pointer = in_params; +	in_params[0].type               = ACPI_TYPE_BUFFER; +	in_params[0].buffer.length      = 16; +	in_params[0].buffer.pointer     = OSC_UUID; +	in_params[1].type               = ACPI_TYPE_INTEGER; +	in_params[1].integer.value      = 1; +	in_params[2].type               = ACPI_TYPE_INTEGER; +	in_params[2].integer.value      = 2; +	in_params[3].type               = ACPI_TYPE_BUFFER; +	in_params[3].buffer.length      = 8; +	in_params[3].buffer.pointer     = (u8 *)&capabilities; + +	capabilities[0] = OSC_QUERY_ENABLE; +	capabilities[1] = 0x1; + +	status = acpi_evaluate_object(*handle, "_OSC", &input, &output); +	if (ACPI_FAILURE(status)) +		return -ENODEV; + +	if (!output.length) +		return -ENODEV; + +	out_obj = output.pointer; +	if (out_obj->type != ACPI_TYPE_BUFFER) { +		ret = -ENODEV; +		goto out_free; +	} + +	errors = *((u32 *)out_obj->buffer.pointer) & ~(1 << 0); +	if (errors) { +		ret = -ENODEV; +		goto out_free; +	} + +	supported = *((u32 *)(out_obj->buffer.pointer + 4)); +	if (!(supported & 0x1)) { +		ret = -ENODEV; +		goto out_free; +	} + +	kfree(output.pointer); +	capabilities[0] = 0x0; +	capabilities[1] = 0x1; + +	status = acpi_evaluate_object(*handle, "_OSC", &input, &output); +	if (ACPI_FAILURE(status)) +		return -ENODEV; + +	if (!output.length) +		return -ENODEV; + +	out_obj = output.pointer; +	if (out_obj->type != ACPI_TYPE_BUFFER) { +		ret = -ENODEV; +		goto out_free; +	} + +	errors = *((u32 *)out_obj->buffer.pointer) & ~(1 << 0); +	if (errors) { +		ret = -ENODEV; +		goto out_free; +	} + +	supported = *((u32 *)(out_obj->buffer.pointer + 4)); +	if (!(supported & 0x1)) { +		ret = -ENODEV; +		goto out_free; +	} + +out_free: +	kfree(output.pointer); +	return ret; +} + +static int __init pcc_cpufreq_probe(void) +{ +	acpi_status status; +	struct acpi_buffer output = {ACPI_ALLOCATE_BUFFER, NULL}; +	struct pcc_memory_resource *mem_resource; +	struct pcc_register_resource *reg_resource; +	union acpi_object *out_obj, *member; +	acpi_handle handle, osc_handle; +	int ret = 0; + +	status = acpi_get_handle(NULL, "\\_SB", &handle); +	if (ACPI_FAILURE(status)) +		return -ENODEV; + +	status = acpi_get_handle(handle, "_OSC", &osc_handle); +	if (ACPI_SUCCESS(status)) { +		ret = pcc_cpufreq_do_osc(&osc_handle); +		if (ret) +			dprintk("probe: _OSC evaluation did not succeed\n"); +		/* Firmware's use of _OSC is optional */ +		ret = 0; +	} + +	status = acpi_evaluate_object(handle, "PCCH", NULL, &output); +	if (ACPI_FAILURE(status)) +		return -ENODEV; + +	out_obj = output.pointer; +	if (out_obj->type != ACPI_TYPE_PACKAGE) { +		ret = -ENODEV; +		goto out_free; +	} + +	member = &out_obj->package.elements[0]; +	if (member->type != ACPI_TYPE_BUFFER) { +		ret = -ENODEV; +		goto out_free; +	} + +	mem_resource = (struct pcc_memory_resource *)member->buffer.pointer; + +	dprintk("probe: mem_resource descriptor: 0x%x," +		" length: %d, space_id: %d, resource_usage: %d," +		" type_specific: %d, granularity: 0x%llx," +		" minimum: 0x%llx, maximum: 0x%llx," +		" translation_offset: 0x%llx, address_length: 0x%llx\n", +		mem_resource->descriptor, mem_resource->length, +		mem_resource->space_id, mem_resource->resource_usage, +		mem_resource->type_specific, mem_resource->granularity, +		mem_resource->minimum, mem_resource->maximum, +		mem_resource->translation_offset, +		mem_resource->address_length); + +	if (mem_resource->space_id != ACPI_ADR_SPACE_SYSTEM_MEMORY) { +		ret = -ENODEV; +		goto out_free; +	} + +	pcch_virt_addr = 
ioremap_nocache(mem_resource->minimum, +					mem_resource->address_length); +	if (pcch_virt_addr == NULL) { +		dprintk("probe: could not map shared mem region\n"); +		goto out_free; +	} +	pcch_hdr = pcch_virt_addr; + +	dprintk("probe: PCCH header (virtual) addr: 0x%p\n", pcch_hdr); +	dprintk("probe: PCCH header is at physical address: 0x%llx," +		" signature: 0x%x, length: %d bytes, major: %d, minor: %d," +		" supported features: 0x%x, command field: 0x%x," +		" status field: 0x%x, nominal latency: %d us\n", +		mem_resource->minimum, ioread32(&pcch_hdr->signature), +		ioread16(&pcch_hdr->length), ioread8(&pcch_hdr->major), +		ioread8(&pcch_hdr->minor), ioread32(&pcch_hdr->features), +		ioread16(&pcch_hdr->command), ioread16(&pcch_hdr->status), +		ioread32(&pcch_hdr->latency)); + +	dprintk("probe: min time between commands: %d us," +		" max time between commands: %d us," +		" nominal CPU frequency: %d MHz," +		" minimum CPU frequency: %d MHz," +		" minimum CPU frequency without throttling: %d MHz\n", +		ioread32(&pcch_hdr->minimum_time), +		ioread32(&pcch_hdr->maximum_time), +		ioread32(&pcch_hdr->nominal), +		ioread32(&pcch_hdr->throttled_frequency), +		ioread32(&pcch_hdr->minimum_frequency)); + +	member = &out_obj->package.elements[1]; +	if (member->type != ACPI_TYPE_BUFFER) { +		ret = -ENODEV; +		goto pcch_free; +	} + +	reg_resource = (struct pcc_register_resource *)member->buffer.pointer; + +	doorbell.space_id = reg_resource->space_id; +	doorbell.bit_width = reg_resource->bit_width; +	doorbell.bit_offset = reg_resource->bit_offset; +	doorbell.access_width = 64; +	doorbell.address = reg_resource->address; + +	dprintk("probe: doorbell: space_id is %d, bit_width is %d, " +		"bit_offset is %d, access_width is %d, address is 0x%llx\n", +		doorbell.space_id, doorbell.bit_width, doorbell.bit_offset, +		doorbell.access_width, reg_resource->address); + +	member = &out_obj->package.elements[2]; +	if (member->type != ACPI_TYPE_INTEGER) { +		ret = -ENODEV; +		goto pcch_free; +	} + +	doorbell_preserve = member->integer.value; + +	member = &out_obj->package.elements[3]; +	if (member->type != ACPI_TYPE_INTEGER) { +		ret = -ENODEV; +		goto pcch_free; +	} + +	doorbell_write = member->integer.value; + +	dprintk("probe: doorbell_preserve: 0x%llx," +		" doorbell_write: 0x%llx\n", +		doorbell_preserve, doorbell_write); + +	pcc_cpu_info = alloc_percpu(struct pcc_cpu); +	if (!pcc_cpu_info) { +		ret = -ENOMEM; +		goto pcch_free; +	} + +	printk(KERN_DEBUG "pcc-cpufreq: (v%s) driver loaded with frequency" +	       " limits: %d MHz, %d MHz\n", PCC_VERSION, +	       ioread32(&pcch_hdr->minimum_frequency), +	       ioread32(&pcch_hdr->nominal)); +	kfree(output.pointer); +	return ret; +pcch_free: +	pcc_clear_mapping(); +out_free: +	kfree(output.pointer); +	return ret; +} + +static int pcc_cpufreq_cpu_init(struct cpufreq_policy *policy) +{ +	unsigned int cpu = policy->cpu; +	unsigned int result = 0; + +	if (!pcch_virt_addr) { +		result = -1; +		goto pcch_null; +	} + +	result = pcc_get_offset(cpu); +	if (result) { +		dprintk("init: PCCP evaluation failed\n"); +		goto free; +	} + +	policy->max = policy->cpuinfo.max_freq = +		ioread32(&pcch_hdr->nominal) * 1000; +	policy->min = policy->cpuinfo.min_freq = +		ioread32(&pcch_hdr->minimum_frequency) * 1000; +	policy->cur = pcc_get_freq(cpu); + +	dprintk("init: policy->max is %d, policy->min is %d\n", +		policy->max, policy->min); + +	return 0; +free: +	pcc_clear_mapping(); +	free_percpu(pcc_cpu_info); +pcch_null: +	return result; +} + +static int pcc_cpufreq_cpu_exit(struct 
cpufreq_policy *policy) +{ +	return 0; +} + +static struct cpufreq_driver pcc_cpufreq_driver = { +	.flags = CPUFREQ_CONST_LOOPS, +	.get = pcc_get_freq, +	.verify = pcc_cpufreq_verify, +	.target = pcc_cpufreq_target, +	.init = pcc_cpufreq_cpu_init, +	.exit = pcc_cpufreq_cpu_exit, +	.name = "pcc-cpufreq", +	.owner = THIS_MODULE, +}; + +static int __init pcc_cpufreq_init(void) +{ +	int ret; + +	if (acpi_disabled) +		return 0; + +	ret = pcc_cpufreq_probe(); +	if (ret) { +		dprintk("pcc_cpufreq_init: PCCH evaluation failed\n"); +		return ret; +	} + +	ret = cpufreq_register_driver(&pcc_cpufreq_driver); + +	return ret; +} + +static void __exit pcc_cpufreq_exit(void) +{ +	cpufreq_unregister_driver(&pcc_cpufreq_driver); + +	pcc_clear_mapping(); + +	free_percpu(pcc_cpu_info); +} + +MODULE_AUTHOR("Matthew Garrett, Naga Chumbalkar"); +MODULE_VERSION(PCC_VERSION); +MODULE_DESCRIPTION("Processor Clocking Control interface driver"); +MODULE_LICENSE("GPL"); + +late_initcall(pcc_cpufreq_init); +module_exit(pcc_cpufreq_exit); diff --git a/arch/x86/kernel/cpu/intel.c b/arch/x86/kernel/cpu/intel.c index 879666f4d871..7e1cca13af35 100644 --- a/arch/x86/kernel/cpu/intel.c +++ b/arch/x86/kernel/cpu/intel.c @@ -70,7 +70,8 @@ static void __cpuinit early_init_intel(struct cpuinfo_x86 *c)  	if (c->x86_power & (1 << 8)) {  		set_cpu_cap(c, X86_FEATURE_CONSTANT_TSC);  		set_cpu_cap(c, X86_FEATURE_NONSTOP_TSC); -		sched_clock_stable = 1; +		if (!check_tsc_unstable()) +			sched_clock_stable = 1;  	}  	/* diff --git a/arch/x86/kernel/cpu/intel_cacheinfo.c b/arch/x86/kernel/cpu/intel_cacheinfo.c index eddb1bdd1b8f..b3eeb66c0a51 100644 --- a/arch/x86/kernel/cpu/intel_cacheinfo.c +++ b/arch/x86/kernel/cpu/intel_cacheinfo.c @@ -903,7 +903,7 @@ static ssize_t store(struct kobject *kobj, struct attribute *attr,  	return ret;  } -static struct sysfs_ops sysfs_ops = { +static const struct sysfs_ops sysfs_ops = {  	.show   = show,  	.store  = store,  }; diff --git a/arch/x86/kernel/cpu/mcheck/mce.c b/arch/x86/kernel/cpu/mcheck/mce.c index a8aacd4b513c..3ab9c886b613 100644 --- a/arch/x86/kernel/cpu/mcheck/mce.c +++ b/arch/x86/kernel/cpu/mcheck/mce.c @@ -46,6 +46,13 @@  #include "mce-internal.h" +static DEFINE_MUTEX(mce_read_mutex); + +#define rcu_dereference_check_mce(p) \ +	rcu_dereference_check((p), \ +			      rcu_read_lock_sched_held() || \ +			      lockdep_is_held(&mce_read_mutex)) +  #define CREATE_TRACE_POINTS  #include <trace/events/mce.h> @@ -158,7 +165,7 @@ void mce_log(struct mce *mce)  	mce->finished = 0;  	wmb();  	for (;;) { -		entry = rcu_dereference(mcelog.next); +		entry = rcu_dereference_check_mce(mcelog.next);  		for (;;) {  			/*  			 * When the buffer fills up discard new entries. 
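Going back to the pcc-cpufreq driver added earlier in this patch: pcc_cmd(), pcc_get_freq() and pcc_cpufreq_target() all follow the same mailbox handshake against the PCCH shared-memory region: fill the per-cpu input word, write the command, ring the doorbell, poll the status word for CMD_COMPLETE, then read the output and clear the status. The toy model below walks through that sequence in a single process; fake_firmware() stands in for the platform side and none of this is the driver's real register I/O.

    #include <stdint.h>
    #include <stdio.h>

    #define CMD_GET_FREQ 0x0
    #define CMD_COMPLETE 0x1

    /* Toy "shared memory" region: header fields plus one per-cpu slot. */
    static struct {
        uint16_t command;
        uint16_t status;
        uint32_t nominal_mhz;
        uint32_t input;
        uint32_t output;
    } pcch = { .nominal_mhz = 2400 };

    /* Stands in for the platform firmware reacting to the doorbell. */
    static void fake_firmware(void)
    {
        if (pcch.command == CMD_GET_FREQ) {
            pcch.output = 75;           /* running at 75% of nominal */
            pcch.status = CMD_COMPLETE;
        }
    }

    static unsigned int get_freq_khz(void)
    {
        pcch.input = 0x1;               /* request for this cpu */
        pcch.command = CMD_GET_FREQ;
        fake_firmware();                /* "ring the doorbell" */

        while (!(pcch.status & CMD_COMPLETE))
            ;                           /* the real driver polls a bounded loop */

        pcch.status = 0;                /* ack, ready for the next command */
        return pcch.nominal_mhz * pcch.output / 100 * 1000;
    }

    int main(void)
    {
        printf("current freq: %u kHz\n", get_freq_khz());
        return 0;
    }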
@@ -1485,8 +1492,6 @@ static void collect_tscs(void *data)  	rdtscll(cpu_tsc[smp_processor_id()]);  } -static DEFINE_MUTEX(mce_read_mutex); -  static ssize_t mce_read(struct file *filp, char __user *ubuf, size_t usize,  			loff_t *off)  { @@ -1500,7 +1505,7 @@ static ssize_t mce_read(struct file *filp, char __user *ubuf, size_t usize,  		return -ENOMEM;  	mutex_lock(&mce_read_mutex); -	next = rcu_dereference(mcelog.next); +	next = rcu_dereference_check_mce(mcelog.next);  	/* Only supports full reads right now */  	if (*off != 0 || usize < MCE_LOG_LEN*sizeof(struct mce)) { @@ -1565,7 +1570,7 @@ timeout:  static unsigned int mce_poll(struct file *file, poll_table *wait)  {  	poll_wait(file, &mce_wait, wait); -	if (rcu_dereference(mcelog.next)) +	if (rcu_dereference_check_mce(mcelog.next))  		return POLLIN | POLLRDNORM;  	return 0;  } @@ -2044,6 +2049,7 @@ static __init void mce_init_banks(void)  		struct mce_bank *b = &mce_banks[i];  		struct sysdev_attribute *a = &b->attr; +		sysfs_attr_init(&a->attr);  		a->attr.name	= b->attrname;  		snprintf(b->attrname, ATTR_LEN, "bank%d", i); diff --git a/arch/x86/kernel/cpu/mcheck/mce_amd.c b/arch/x86/kernel/cpu/mcheck/mce_amd.c index 83a3d1f4efca..cda932ca3ade 100644 --- a/arch/x86/kernel/cpu/mcheck/mce_amd.c +++ b/arch/x86/kernel/cpu/mcheck/mce_amd.c @@ -388,7 +388,7 @@ static ssize_t store(struct kobject *kobj, struct attribute *attr,  	return ret;  } -static struct sysfs_ops threshold_ops = { +static const struct sysfs_ops threshold_ops = {  	.show			= show,  	.store			= store,  }; diff --git a/arch/x86/kernel/cpu/mcheck/mce_intel.c b/arch/x86/kernel/cpu/mcheck/mce_intel.c index 7c785634af2b..d15df6e49bf0 100644 --- a/arch/x86/kernel/cpu/mcheck/mce_intel.c +++ b/arch/x86/kernel/cpu/mcheck/mce_intel.c @@ -95,7 +95,7 @@ static void cmci_discover(int banks, int boot)  		/* Already owned by someone else? */  		if (val & CMCI_EN) { -			if (test_and_clear_bit(i, owned) || boot) +			if (test_and_clear_bit(i, owned) && !boot)  				print_update("SHD", &hdr, i);  			__clear_bit(i, __get_cpu_var(mce_poll_banks));  			continue; @@ -107,7 +107,7 @@ static void cmci_discover(int banks, int boot)  		/* Did the enable bit stick? -- the bank supports CMCI */  		if (val & CMCI_EN) { -			if (!test_and_set_bit(i, owned) || boot) +			if (!test_and_set_bit(i, owned) && !boot)  				print_update("CMCI", &hdr, i);  			__clear_bit(i, __get_cpu_var(mce_poll_banks));  		} else { diff --git a/arch/x86/kernel/cpu/mtrr/main.c b/arch/x86/kernel/cpu/mtrr/main.c index fe4622e8c837..79556bd9b602 100644 --- a/arch/x86/kernel/cpu/mtrr/main.c +++ b/arch/x86/kernel/cpu/mtrr/main.c @@ -145,6 +145,7 @@ struct set_mtrr_data {  /**   * ipi_handler - Synchronisation handler. Executed by "other" CPUs. + * @info: pointer to mtrr configuration data   *   * Returns nothing.   
*/ diff --git a/arch/x86/kernel/cpu/perf_event.c b/arch/x86/kernel/cpu/perf_event.c index 641ccb9dddbc..60398a0d947c 100644 --- a/arch/x86/kernel/cpu/perf_event.c +++ b/arch/x86/kernel/cpu/perf_event.c @@ -73,10 +73,10 @@ struct debug_store {  struct event_constraint {  	union {  		unsigned long	idxmsk[BITS_TO_LONGS(X86_PMC_IDX_MAX)]; -		u64		idxmsk64[1]; +		u64		idxmsk64;  	}; -	int	code; -	int	cmask; +	u64	code; +	u64	cmask;  	int	weight;  }; @@ -103,7 +103,7 @@ struct cpu_hw_events {  };  #define __EVENT_CONSTRAINT(c, n, m, w) {\ -	{ .idxmsk64[0] = (n) },		\ +	{ .idxmsk64 = (n) },		\  	.code = (c),			\  	.cmask = (m),			\  	.weight = (w),			\ @@ -116,7 +116,7 @@ struct cpu_hw_events {  	EVENT_CONSTRAINT(c, n, INTEL_ARCH_EVTSEL_MASK)  #define FIXED_EVENT_CONSTRAINT(c, n)	\ -	EVENT_CONSTRAINT(c, n, INTEL_ARCH_FIXED_MASK) +	EVENT_CONSTRAINT(c, (1ULL << (32+n)), INTEL_ARCH_FIXED_MASK)  #define EVENT_CONSTRAINT_END		\  	EVENT_CONSTRAINT(0, 0, 0) @@ -133,8 +133,8 @@ struct x86_pmu {  	int		(*handle_irq)(struct pt_regs *);  	void		(*disable_all)(void);  	void		(*enable_all)(void); -	void		(*enable)(struct hw_perf_event *, int); -	void		(*disable)(struct hw_perf_event *, int); +	void		(*enable)(struct perf_event *); +	void		(*disable)(struct perf_event *);  	unsigned	eventsel;  	unsigned	perfctr;  	u64		(*event_map)(int); @@ -157,6 +157,11 @@ struct x86_pmu {  	void		(*put_event_constraints)(struct cpu_hw_events *cpuc,  						 struct perf_event *event);  	struct event_constraint *event_constraints; + +	void		(*cpu_prepare)(int cpu); +	void		(*cpu_starting)(int cpu); +	void		(*cpu_dying)(int cpu); +	void		(*cpu_dead)(int cpu);  };  static struct x86_pmu x86_pmu __read_mostly; @@ -165,8 +170,7 @@ static DEFINE_PER_CPU(struct cpu_hw_events, cpu_hw_events) = {  	.enabled = 1,  }; -static int x86_perf_event_set_period(struct perf_event *event, -			     struct hw_perf_event *hwc, int idx); +static int x86_perf_event_set_period(struct perf_event *event);  /*   * Generalized hw caching related hw_event table, filled @@ -189,11 +193,12 @@ static u64 __read_mostly hw_cache_event_ids   * Returns the delta events processed.   
*/  static u64 -x86_perf_event_update(struct perf_event *event, -			struct hw_perf_event *hwc, int idx) +x86_perf_event_update(struct perf_event *event)  { +	struct hw_perf_event *hwc = &event->hw;  	int shift = 64 - x86_pmu.event_bits;  	u64 prev_raw_count, new_raw_count; +	int idx = hwc->idx;  	s64 delta;  	if (idx == X86_PMC_IDX_FIXED_BTS) @@ -293,7 +298,7 @@ static inline bool bts_available(void)  	return x86_pmu.enable_bts != NULL;  } -static inline void init_debug_store_on_cpu(int cpu) +static void init_debug_store_on_cpu(int cpu)  {  	struct debug_store *ds = per_cpu(cpu_hw_events, cpu).ds; @@ -305,7 +310,7 @@ static inline void init_debug_store_on_cpu(int cpu)  		     (u32)((u64)(unsigned long)ds >> 32));  } -static inline void fini_debug_store_on_cpu(int cpu) +static void fini_debug_store_on_cpu(int cpu)  {  	if (!per_cpu(cpu_hw_events, cpu).ds)  		return; @@ -503,6 +508,9 @@ static int __hw_perf_event_init(struct perf_event *event)  	 */  	if (attr->type == PERF_TYPE_RAW) {  		hwc->config |= x86_pmu.raw_event(attr->config); +		if ((hwc->config & ARCH_PERFMON_EVENTSEL_ANY) && +		    perf_paranoid_cpu() && !capable(CAP_SYS_ADMIN)) +			return -EACCES;  		return 0;  	} @@ -553,9 +561,9 @@ static void x86_pmu_disable_all(void)  		if (!test_bit(idx, cpuc->active_mask))  			continue;  		rdmsrl(x86_pmu.eventsel + idx, val); -		if (!(val & ARCH_PERFMON_EVENTSEL0_ENABLE)) +		if (!(val & ARCH_PERFMON_EVENTSEL_ENABLE))  			continue; -		val &= ~ARCH_PERFMON_EVENTSEL0_ENABLE; +		val &= ~ARCH_PERFMON_EVENTSEL_ENABLE;  		wrmsrl(x86_pmu.eventsel + idx, val);  	}  } @@ -590,7 +598,7 @@ static void x86_pmu_enable_all(void)  			continue;  		val = event->hw.config; -		val |= ARCH_PERFMON_EVENTSEL0_ENABLE; +		val |= ARCH_PERFMON_EVENTSEL_ENABLE;  		wrmsrl(x86_pmu.eventsel + idx, val);  	}  } @@ -612,8 +620,8 @@ static int x86_schedule_events(struct cpu_hw_events *cpuc, int n, int *assign)  	bitmap_zero(used_mask, X86_PMC_IDX_MAX);  	for (i = 0; i < n; i++) { -		constraints[i] = -		  x86_pmu.get_event_constraints(cpuc, cpuc->event_list[i]); +		c = x86_pmu.get_event_constraints(cpuc, cpuc->event_list[i]); +		constraints[i] = c;  	}  	/* @@ -635,7 +643,7 @@ static int x86_schedule_events(struct cpu_hw_events *cpuc, int n, int *assign)  		if (test_bit(hwc->idx, used_mask))  			break; -		set_bit(hwc->idx, used_mask); +		__set_bit(hwc->idx, used_mask);  		if (assign)  			assign[i] = hwc->idx;  	} @@ -676,7 +684,7 @@ static int x86_schedule_events(struct cpu_hw_events *cpuc, int n, int *assign)  			if (c->weight != w)  				continue; -			for_each_bit(j, c->idxmsk, X86_PMC_IDX_MAX) { +			for_each_set_bit(j, c->idxmsk, X86_PMC_IDX_MAX) {  				if (!test_bit(j, used_mask))  					break;  			} @@ -684,7 +692,7 @@ static int x86_schedule_events(struct cpu_hw_events *cpuc, int n, int *assign)  			if (j == X86_PMC_IDX_MAX)  				break; -			set_bit(j, used_mask); +			__set_bit(j, used_mask);  			if (assign)  				assign[i] = j; @@ -777,6 +785,7 @@ static inline int match_prev_assignment(struct hw_perf_event *hwc,  		hwc->last_tag == cpuc->tags[i];  } +static int x86_pmu_start(struct perf_event *event);  static void x86_pmu_stop(struct perf_event *event);  void hw_perf_enable(void) @@ -793,6 +802,7 @@ void hw_perf_enable(void)  		return;  	if (cpuc->n_added) { +		int n_running = cpuc->n_events - cpuc->n_added;  		/*  		 * apply assignment obtained either from  		 * hw_perf_group_sched_in() or x86_pmu_enable() @@ -800,8 +810,7 @@ void hw_perf_enable(void)  		 * step1: save events moving to new counters  		 * step2: 
reprogram moved events into new counters  		 */ -		for (i = 0; i < cpuc->n_events; i++) { - +		for (i = 0; i < n_running; i++) {  			event = cpuc->event_list[i];  			hwc = &event->hw; @@ -816,29 +825,18 @@ void hw_perf_enable(void)  				continue;  			x86_pmu_stop(event); - -			hwc->idx = -1;  		}  		for (i = 0; i < cpuc->n_events; i++) { -  			event = cpuc->event_list[i];  			hwc = &event->hw; -			if (hwc->idx == -1) { +			if (!match_prev_assignment(hwc, cpuc, i))  				x86_assign_hw_event(event, cpuc, i); -				x86_perf_event_set_period(event, hwc, hwc->idx); -			} -			/* -			 * need to mark as active because x86_pmu_disable() -			 * clear active_mask and events[] yet it preserves -			 * idx -			 */ -			set_bit(hwc->idx, cpuc->active_mask); -			cpuc->events[hwc->idx] = event; +			else if (i < n_running) +				continue; -			x86_pmu.enable(hwc, hwc->idx); -			perf_event_update_userpage(event); +			x86_pmu_start(event);  		}  		cpuc->n_added = 0;  		perf_events_lapic_init(); @@ -850,15 +848,16 @@ void hw_perf_enable(void)  	x86_pmu.enable_all();  } -static inline void __x86_pmu_enable_event(struct hw_perf_event *hwc, int idx) +static inline void __x86_pmu_enable_event(struct hw_perf_event *hwc)  { -	(void)checking_wrmsrl(hwc->config_base + idx, -			      hwc->config | ARCH_PERFMON_EVENTSEL0_ENABLE); +	(void)checking_wrmsrl(hwc->config_base + hwc->idx, +			      hwc->config | ARCH_PERFMON_EVENTSEL_ENABLE);  } -static inline void x86_pmu_disable_event(struct hw_perf_event *hwc, int idx) +static inline void x86_pmu_disable_event(struct perf_event *event)  { -	(void)checking_wrmsrl(hwc->config_base + idx, hwc->config); +	struct hw_perf_event *hwc = &event->hw; +	(void)checking_wrmsrl(hwc->config_base + hwc->idx, hwc->config);  }  static DEFINE_PER_CPU(u64 [X86_PMC_IDX_MAX], pmc_prev_left); @@ -868,12 +867,12 @@ static DEFINE_PER_CPU(u64 [X86_PMC_IDX_MAX], pmc_prev_left);   * To be called with the event disabled in hw:   */  static int -x86_perf_event_set_period(struct perf_event *event, -			     struct hw_perf_event *hwc, int idx) +x86_perf_event_set_period(struct perf_event *event)  { +	struct hw_perf_event *hwc = &event->hw;  	s64 left = atomic64_read(&hwc->period_left);  	s64 period = hwc->sample_period; -	int err, ret = 0; +	int err, ret = 0, idx = hwc->idx;  	if (idx == X86_PMC_IDX_FIXED_BTS)  		return 0; @@ -919,11 +918,11 @@ x86_perf_event_set_period(struct perf_event *event,  	return ret;  } -static void x86_pmu_enable_event(struct hw_perf_event *hwc, int idx) +static void x86_pmu_enable_event(struct perf_event *event)  {  	struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);  	if (cpuc->enabled) -		__x86_pmu_enable_event(hwc, idx); +		__x86_pmu_enable_event(&event->hw);  }  /* @@ -959,34 +958,32 @@ static int x86_pmu_enable(struct perf_event *event)  	memcpy(cpuc->assign, assign, n*sizeof(int));  	cpuc->n_events = n; -	cpuc->n_added  = n - n0; +	cpuc->n_added += n - n0;  	return 0;  }  static int x86_pmu_start(struct perf_event *event)  { -	struct hw_perf_event *hwc = &event->hw; +	struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events); +	int idx = event->hw.idx; -	if (hwc->idx == -1) +	if (idx == -1)  		return -EAGAIN; -	x86_perf_event_set_period(event, hwc, hwc->idx); -	x86_pmu.enable(hwc, hwc->idx); +	x86_perf_event_set_period(event); +	cpuc->events[idx] = event; +	__set_bit(idx, cpuc->active_mask); +	x86_pmu.enable(event); +	perf_event_update_userpage(event);  	return 0;  }  static void x86_pmu_unthrottle(struct perf_event *event)  { -	struct cpu_hw_events *cpuc = 
&__get_cpu_var(cpu_hw_events); -	struct hw_perf_event *hwc = &event->hw; - -	if (WARN_ON_ONCE(hwc->idx >= X86_PMC_IDX_MAX || -				cpuc->events[hwc->idx] != event)) -		return; - -	x86_pmu.enable(hwc, hwc->idx); +	int ret = x86_pmu_start(event); +	WARN_ON_ONCE(ret);  }  void perf_event_print_debug(void) @@ -1046,18 +1043,16 @@ static void x86_pmu_stop(struct perf_event *event)  	struct hw_perf_event *hwc = &event->hw;  	int idx = hwc->idx; -	/* -	 * Must be done before we disable, otherwise the nmi handler -	 * could reenable again: -	 */ -	clear_bit(idx, cpuc->active_mask); -	x86_pmu.disable(hwc, idx); +	if (!__test_and_clear_bit(idx, cpuc->active_mask)) +		return; + +	x86_pmu.disable(event);  	/*  	 * Drain the remaining delta count out of a event  	 * that we are disabling:  	 */ -	x86_perf_event_update(event, hwc, idx); +	x86_perf_event_update(event);  	cpuc->events[idx] = NULL;  } @@ -1094,8 +1089,7 @@ static int x86_pmu_handle_irq(struct pt_regs *regs)  	int idx, handled = 0;  	u64 val; -	data.addr = 0; -	data.raw = NULL; +	perf_sample_data_init(&data, 0);  	cpuc = &__get_cpu_var(cpu_hw_events); @@ -1106,7 +1100,7 @@ static int x86_pmu_handle_irq(struct pt_regs *regs)  		event = cpuc->events[idx];  		hwc = &event->hw; -		val = x86_perf_event_update(event, hwc, idx); +		val = x86_perf_event_update(event);  		if (val & (1ULL << (x86_pmu.event_bits - 1)))  			continue; @@ -1116,11 +1110,11 @@ static int x86_pmu_handle_irq(struct pt_regs *regs)  		handled		= 1;  		data.period	= event->hw.last_period; -		if (!x86_perf_event_set_period(event, hwc, idx)) +		if (!x86_perf_event_set_period(event))  			continue;  		if (perf_event_overflow(event, 1, &data, regs)) -			x86_pmu.disable(hwc, idx); +			x86_pmu_stop(event);  	}  	if (handled) @@ -1307,7 +1301,7 @@ int hw_perf_group_sched_in(struct perf_event *leader,  	memcpy(cpuc->assign, assign, n0*sizeof(int));  	cpuc->n_events  = n0; -	cpuc->n_added   = n1; +	cpuc->n_added  += n1;  	ctx->nr_active += n1;  	/* @@ -1335,6 +1329,39 @@ undo:  #include "perf_event_p6.c"  #include "perf_event_intel.c" +static int __cpuinit +x86_pmu_notifier(struct notifier_block *self, unsigned long action, void *hcpu) +{ +	unsigned int cpu = (long)hcpu; + +	switch (action & ~CPU_TASKS_FROZEN) { +	case CPU_UP_PREPARE: +		if (x86_pmu.cpu_prepare) +			x86_pmu.cpu_prepare(cpu); +		break; + +	case CPU_STARTING: +		if (x86_pmu.cpu_starting) +			x86_pmu.cpu_starting(cpu); +		break; + +	case CPU_DYING: +		if (x86_pmu.cpu_dying) +			x86_pmu.cpu_dying(cpu); +		break; + +	case CPU_DEAD: +		if (x86_pmu.cpu_dead) +			x86_pmu.cpu_dead(cpu); +		break; + +	default: +		break; +	} + +	return NOTIFY_OK; +} +  static void __init pmu_check_apic(void)  {  	if (cpu_has_apic) @@ -1347,6 +1374,7 @@ static void __init pmu_check_apic(void)  void __init init_hw_perf_events(void)  { +	struct event_constraint *c;  	int err;  	pr_info("Performance Events: "); @@ -1395,6 +1423,16 @@ void __init init_hw_perf_events(void)  		__EVENT_CONSTRAINT(0, (1ULL << x86_pmu.num_events) - 1,  				   0, x86_pmu.num_events); +	if (x86_pmu.event_constraints) { +		for_each_event_constraint(c, x86_pmu.event_constraints) { +			if (c->cmask != INTEL_ARCH_FIXED_MASK) +				continue; + +			c->idxmsk64 |= (1ULL << x86_pmu.num_events) - 1; +			c->weight += x86_pmu.num_events; +		} +	} +  	pr_info("... version:                %d\n",     x86_pmu.version);  	pr_info("... bit width:              %d\n",     x86_pmu.event_bits);  	pr_info("... 
generic registers:      %d\n",     x86_pmu.num_events); @@ -1402,11 +1440,13 @@ void __init init_hw_perf_events(void)  	pr_info("... max period:             %016Lx\n", x86_pmu.max_period);  	pr_info("... fixed-purpose events:   %d\n",     x86_pmu.num_events_fixed);  	pr_info("... event mask:             %016Lx\n", perf_event_mask); + +	perf_cpu_notifier(x86_pmu_notifier);  }  static inline void x86_pmu_read(struct perf_event *event)  { -	x86_perf_event_update(event, &event->hw, event->hw.idx); +	x86_perf_event_update(event);  }  static const struct pmu pmu = { @@ -1662,28 +1702,16 @@ struct perf_callchain_entry *perf_callchain(struct pt_regs *regs)  	return entry;  } -void hw_perf_event_setup_online(int cpu) +#ifdef CONFIG_EVENT_TRACING +void perf_arch_fetch_caller_regs(struct pt_regs *regs, unsigned long ip, int skip)  { -	init_debug_store_on_cpu(cpu); - -	switch (boot_cpu_data.x86_vendor) { -	case X86_VENDOR_AMD: -		amd_pmu_cpu_online(cpu); -		break; -	default: -		return; -	} -} - -void hw_perf_event_setup_offline(int cpu) -{ -	init_debug_store_on_cpu(cpu); - -	switch (boot_cpu_data.x86_vendor) { -	case X86_VENDOR_AMD: -		amd_pmu_cpu_offline(cpu); -		break; -	default: -		return; -	} +	regs->ip = ip; +	/* +	 * perf_arch_fetch_caller_regs adds another call, we need to increment +	 * the skip level +	 */ +	regs->bp = rewind_frame_pointer(skip + 1); +	regs->cs = __KERNEL_CS; +	local_save_flags(regs->flags);  } +#endif diff --git a/arch/x86/kernel/cpu/perf_event_amd.c b/arch/x86/kernel/cpu/perf_event_amd.c index 8f3dbfda3c4f..b87e0b6970cb 100644 --- a/arch/x86/kernel/cpu/perf_event_amd.c +++ b/arch/x86/kernel/cpu/perf_event_amd.c @@ -271,28 +271,6 @@ done:  	return &emptyconstraint;  } -static __initconst struct x86_pmu amd_pmu = { -	.name			= "AMD", -	.handle_irq		= x86_pmu_handle_irq, -	.disable_all		= x86_pmu_disable_all, -	.enable_all		= x86_pmu_enable_all, -	.enable			= x86_pmu_enable_event, -	.disable		= x86_pmu_disable_event, -	.eventsel		= MSR_K7_EVNTSEL0, -	.perfctr		= MSR_K7_PERFCTR0, -	.event_map		= amd_pmu_event_map, -	.raw_event		= amd_pmu_raw_event, -	.max_events		= ARRAY_SIZE(amd_perfmon_event_map), -	.num_events		= 4, -	.event_bits		= 48, -	.event_mask		= (1ULL << 48) - 1, -	.apic			= 1, -	/* use highest bit to detect overflow */ -	.max_period		= (1ULL << 47) - 1, -	.get_event_constraints	= amd_get_event_constraints, -	.put_event_constraints	= amd_put_event_constraints -}; -  static struct amd_nb *amd_alloc_nb(int cpu, int nb_id)  {  	struct amd_nb *nb; @@ -309,7 +287,7 @@ static struct amd_nb *amd_alloc_nb(int cpu, int nb_id)  	 * initialize all possible NB constraints  	 */  	for (i = 0; i < x86_pmu.num_events; i++) { -		set_bit(i, nb->event_constraints[i].idxmsk); +		__set_bit(i, nb->event_constraints[i].idxmsk);  		nb->event_constraints[i].weight = 1;  	}  	return nb; @@ -370,14 +348,41 @@ static void amd_pmu_cpu_offline(int cpu)  	raw_spin_lock(&amd_nb_lock); -	if (--cpuhw->amd_nb->refcnt == 0) -		kfree(cpuhw->amd_nb); +	if (cpuhw->amd_nb) { +		if (--cpuhw->amd_nb->refcnt == 0) +			kfree(cpuhw->amd_nb); -	cpuhw->amd_nb = NULL; +		cpuhw->amd_nb = NULL; +	}  	raw_spin_unlock(&amd_nb_lock);  } +static __initconst struct x86_pmu amd_pmu = { +	.name			= "AMD", +	.handle_irq		= x86_pmu_handle_irq, +	.disable_all		= x86_pmu_disable_all, +	.enable_all		= x86_pmu_enable_all, +	.enable			= x86_pmu_enable_event, +	.disable		= x86_pmu_disable_event, +	.eventsel		= MSR_K7_EVNTSEL0, +	.perfctr		= MSR_K7_PERFCTR0, +	.event_map		= amd_pmu_event_map, +	.raw_event		= amd_pmu_raw_event, +	
.max_events		= ARRAY_SIZE(amd_perfmon_event_map), +	.num_events		= 4, +	.event_bits		= 48, +	.event_mask		= (1ULL << 48) - 1, +	.apic			= 1, +	/* use highest bit to detect overflow */ +	.max_period		= (1ULL << 47) - 1, +	.get_event_constraints	= amd_get_event_constraints, +	.put_event_constraints	= amd_put_event_constraints, + +	.cpu_prepare		= amd_pmu_cpu_online, +	.cpu_dead		= amd_pmu_cpu_offline, +}; +  static __init int amd_pmu_init(void)  {  	/* Performance-monitoring supported from K7 and later: */ @@ -390,11 +395,6 @@ static __init int amd_pmu_init(void)  	memcpy(hw_cache_event_ids, amd_hw_cache_event_ids,  	       sizeof(hw_cache_event_ids)); -	/* -	 * explicitly initialize the boot cpu, other cpus will get -	 * the cpu hotplug callbacks from smp_init() -	 */ -	amd_pmu_cpu_online(smp_processor_id());  	return 0;  } @@ -405,12 +405,4 @@ static int amd_pmu_init(void)  	return 0;  } -static void amd_pmu_cpu_online(int cpu) -{ -} - -static void amd_pmu_cpu_offline(int cpu) -{ -} -  #endif diff --git a/arch/x86/kernel/cpu/perf_event_intel.c b/arch/x86/kernel/cpu/perf_event_intel.c index cf6590cf4a5f..84bfde64a337 100644 --- a/arch/x86/kernel/cpu/perf_event_intel.c +++ b/arch/x86/kernel/cpu/perf_event_intel.c @@ -1,7 +1,7 @@  #ifdef CONFIG_CPU_SUP_INTEL  /* - * Intel PerfMon v3. Used on Core2 and later. + * Intel PerfMon, used on Core and later.   */  static const u64 intel_perfmon_event_map[] =  { @@ -27,8 +27,14 @@ static struct event_constraint intel_core_event_constraints[] =  static struct event_constraint intel_core2_event_constraints[] =  { -	FIXED_EVENT_CONSTRAINT(0xc0, (0x3|(1ULL<<32))), /* INSTRUCTIONS_RETIRED */ -	FIXED_EVENT_CONSTRAINT(0x3c, (0x3|(1ULL<<33))), /* UNHALTED_CORE_CYCLES */ +	FIXED_EVENT_CONSTRAINT(0x00c0, 0), /* INST_RETIRED.ANY */ +	FIXED_EVENT_CONSTRAINT(0x003c, 1), /* CPU_CLK_UNHALTED.CORE */ +	/* +	 * Core2 has Fixed Counter 2 listed as CPU_CLK_UNHALTED.REF and event +	 * 0x013c as CPU_CLK_UNHALTED.BUS and specifies there is a fixed +	 * ratio between these counters. 
+	 */ +	/* FIXED_EVENT_CONSTRAINT(0x013c, 2),  CPU_CLK_UNHALTED.REF */  	INTEL_EVENT_CONSTRAINT(0x10, 0x1), /* FP_COMP_OPS_EXE */  	INTEL_EVENT_CONSTRAINT(0x11, 0x2), /* FP_ASSIST */  	INTEL_EVENT_CONSTRAINT(0x12, 0x2), /* MUL */ @@ -37,14 +43,16 @@ static struct event_constraint intel_core2_event_constraints[] =  	INTEL_EVENT_CONSTRAINT(0x18, 0x1), /* IDLE_DURING_DIV */  	INTEL_EVENT_CONSTRAINT(0x19, 0x2), /* DELAYED_BYPASS */  	INTEL_EVENT_CONSTRAINT(0xa1, 0x1), /* RS_UOPS_DISPATCH_CYCLES */ +	INTEL_EVENT_CONSTRAINT(0xc9, 0x1), /* ITLB_MISS_RETIRED (T30-9) */  	INTEL_EVENT_CONSTRAINT(0xcb, 0x1), /* MEM_LOAD_RETIRED */  	EVENT_CONSTRAINT_END  };  static struct event_constraint intel_nehalem_event_constraints[] =  { -	FIXED_EVENT_CONSTRAINT(0xc0, (0xf|(1ULL<<32))), /* INSTRUCTIONS_RETIRED */ -	FIXED_EVENT_CONSTRAINT(0x3c, (0xf|(1ULL<<33))), /* UNHALTED_CORE_CYCLES */ +	FIXED_EVENT_CONSTRAINT(0x00c0, 0), /* INST_RETIRED.ANY */ +	FIXED_EVENT_CONSTRAINT(0x003c, 1), /* CPU_CLK_UNHALTED.CORE */ +	/* FIXED_EVENT_CONSTRAINT(0x013c, 2), CPU_CLK_UNHALTED.REF */  	INTEL_EVENT_CONSTRAINT(0x40, 0x3), /* L1D_CACHE_LD */  	INTEL_EVENT_CONSTRAINT(0x41, 0x3), /* L1D_CACHE_ST */  	INTEL_EVENT_CONSTRAINT(0x42, 0x3), /* L1D_CACHE_LOCK */ @@ -58,8 +66,9 @@ static struct event_constraint intel_nehalem_event_constraints[] =  static struct event_constraint intel_westmere_event_constraints[] =  { -	FIXED_EVENT_CONSTRAINT(0xc0, (0xf|(1ULL<<32))), /* INSTRUCTIONS_RETIRED */ -	FIXED_EVENT_CONSTRAINT(0x3c, (0xf|(1ULL<<33))), /* UNHALTED_CORE_CYCLES */ +	FIXED_EVENT_CONSTRAINT(0x00c0, 0), /* INST_RETIRED.ANY */ +	FIXED_EVENT_CONSTRAINT(0x003c, 1), /* CPU_CLK_UNHALTED.CORE */ +	/* FIXED_EVENT_CONSTRAINT(0x013c, 2), CPU_CLK_UNHALTED.REF */  	INTEL_EVENT_CONSTRAINT(0x51, 0x3), /* L1D */  	INTEL_EVENT_CONSTRAINT(0x60, 0x1), /* OFFCORE_REQUESTS_OUTSTANDING */  	INTEL_EVENT_CONSTRAINT(0x63, 0x3), /* CACHE_LOCK_CYCLES */ @@ -68,8 +77,9 @@ static struct event_constraint intel_westmere_event_constraints[] =  static struct event_constraint intel_gen_event_constraints[] =  { -	FIXED_EVENT_CONSTRAINT(0xc0, (0x3|(1ULL<<32))), /* INSTRUCTIONS_RETIRED */ -	FIXED_EVENT_CONSTRAINT(0x3c, (0x3|(1ULL<<33))), /* UNHALTED_CORE_CYCLES */ +	FIXED_EVENT_CONSTRAINT(0x00c0, 0), /* INST_RETIRED.ANY */ +	FIXED_EVENT_CONSTRAINT(0x003c, 1), /* CPU_CLK_UNHALTED.CORE */ +	/* FIXED_EVENT_CONSTRAINT(0x013c, 2), CPU_CLK_UNHALTED.REF */  	EVENT_CONSTRAINT_END  }; @@ -538,9 +548,9 @@ static inline void intel_pmu_ack_status(u64 ack)  }  static inline void -intel_pmu_disable_fixed(struct hw_perf_event *hwc, int __idx) +intel_pmu_disable_fixed(struct hw_perf_event *hwc)  { -	int idx = __idx - X86_PMC_IDX_FIXED; +	int idx = hwc->idx - X86_PMC_IDX_FIXED;  	u64 ctrl_val, mask;  	mask = 0xfULL << (idx * 4); @@ -580,10 +590,9 @@ static void intel_pmu_drain_bts_buffer(void)  	ds->bts_index = ds->bts_buffer_base; +	perf_sample_data_init(&data, 0);  	data.period	= event->hw.last_period; -	data.addr	= 0; -	data.raw	= NULL;  	regs.ip		= 0;  	/* @@ -612,26 +621,28 @@ static void intel_pmu_drain_bts_buffer(void)  }  static inline void -intel_pmu_disable_event(struct hw_perf_event *hwc, int idx) +intel_pmu_disable_event(struct perf_event *event)  { -	if (unlikely(idx == X86_PMC_IDX_FIXED_BTS)) { +	struct hw_perf_event *hwc = &event->hw; + +	if (unlikely(hwc->idx == X86_PMC_IDX_FIXED_BTS)) {  		intel_pmu_disable_bts();  		intel_pmu_drain_bts_buffer();  		return;  	}  	if (unlikely(hwc->config_base == MSR_ARCH_PERFMON_FIXED_CTR_CTRL)) { -		intel_pmu_disable_fixed(hwc, 
idx); +		intel_pmu_disable_fixed(hwc);  		return;  	} -	x86_pmu_disable_event(hwc, idx); +	x86_pmu_disable_event(event);  }  static inline void -intel_pmu_enable_fixed(struct hw_perf_event *hwc, int __idx) +intel_pmu_enable_fixed(struct hw_perf_event *hwc)  { -	int idx = __idx - X86_PMC_IDX_FIXED; +	int idx = hwc->idx - X86_PMC_IDX_FIXED;  	u64 ctrl_val, bits, mask;  	int err; @@ -661,9 +672,11 @@ intel_pmu_enable_fixed(struct hw_perf_event *hwc, int __idx)  	err = checking_wrmsrl(hwc->config_base, ctrl_val);  } -static void intel_pmu_enable_event(struct hw_perf_event *hwc, int idx) +static void intel_pmu_enable_event(struct perf_event *event)  { -	if (unlikely(idx == X86_PMC_IDX_FIXED_BTS)) { +	struct hw_perf_event *hwc = &event->hw; + +	if (unlikely(hwc->idx == X86_PMC_IDX_FIXED_BTS)) {  		if (!__get_cpu_var(cpu_hw_events).enabled)  			return; @@ -672,11 +685,11 @@ static void intel_pmu_enable_event(struct hw_perf_event *hwc, int idx)  	}  	if (unlikely(hwc->config_base == MSR_ARCH_PERFMON_FIXED_CTR_CTRL)) { -		intel_pmu_enable_fixed(hwc, idx); +		intel_pmu_enable_fixed(hwc);  		return;  	} -	__x86_pmu_enable_event(hwc, idx); +	__x86_pmu_enable_event(hwc);  }  /* @@ -685,14 +698,8 @@ static void intel_pmu_enable_event(struct hw_perf_event *hwc, int idx)   */  static int intel_pmu_save_and_restart(struct perf_event *event)  { -	struct hw_perf_event *hwc = &event->hw; -	int idx = hwc->idx; -	int ret; - -	x86_perf_event_update(event, hwc, idx); -	ret = x86_perf_event_set_period(event, hwc, idx); - -	return ret; +	x86_perf_event_update(event); +	return x86_perf_event_set_period(event);  }  static void intel_pmu_reset(void) @@ -732,16 +739,15 @@ static int intel_pmu_handle_irq(struct pt_regs *regs)  	int bit, loops;  	u64 ack, status; -	data.addr = 0; -	data.raw = NULL; +	perf_sample_data_init(&data, 0);  	cpuc = &__get_cpu_var(cpu_hw_events); -	perf_disable(); +	intel_pmu_disable_all();  	intel_pmu_drain_bts_buffer();  	status = intel_pmu_get_status();  	if (!status) { -		perf_enable(); +		intel_pmu_enable_all();  		return 0;  	} @@ -751,16 +757,14 @@ again:  		WARN_ONCE(1, "perfevents: irq loop stuck!\n");  		perf_event_print_debug();  		intel_pmu_reset(); -		perf_enable(); -		return 1; +		goto done;  	}  	inc_irq_stat(apic_perf_irqs);  	ack = status; -	for_each_bit(bit, (unsigned long *)&status, X86_PMC_IDX_MAX) { +	for_each_set_bit(bit, (unsigned long *)&status, X86_PMC_IDX_MAX) {  		struct perf_event *event = cpuc->events[bit]; -		clear_bit(bit, (unsigned long *) &status);  		if (!test_bit(bit, cpuc->active_mask))  			continue; @@ -770,7 +774,7 @@ again:  		data.period = event->hw.last_period;  		if (perf_event_overflow(event, 1, &data, regs)) -			intel_pmu_disable_event(&event->hw, bit); +			x86_pmu_stop(event);  	}  	intel_pmu_ack_status(ack); @@ -782,8 +786,8 @@ again:  	if (status)  		goto again; -	perf_enable(); - +done: +	intel_pmu_enable_all();  	return 1;  } @@ -862,7 +866,10 @@ static __initconst struct x86_pmu intel_pmu = {  	.max_period		= (1ULL << 31) - 1,  	.enable_bts		= intel_pmu_enable_bts,  	.disable_bts		= intel_pmu_disable_bts, -	.get_event_constraints	= intel_get_event_constraints +	.get_event_constraints	= intel_get_event_constraints, + +	.cpu_starting		= init_debug_store_on_cpu, +	.cpu_dying		= fini_debug_store_on_cpu,  };  static __init int intel_pmu_init(void) @@ -935,7 +942,7 @@ static __init int intel_pmu_init(void)  		x86_pmu.event_constraints = intel_nehalem_event_constraints;  		pr_cont("Nehalem/Corei7 events, ");  		break; -	case 28: +	case 28: /* Atom */  		
memcpy(hw_cache_event_ids, atom_hw_cache_event_ids,  		       sizeof(hw_cache_event_ids)); @@ -951,6 +958,7 @@ static __init int intel_pmu_init(void)  		x86_pmu.event_constraints = intel_westmere_event_constraints;  		pr_cont("Westmere events, ");  		break; +  	default:  		/*  		 * default constraints for v2 and up diff --git a/arch/x86/kernel/cpu/perf_event_p6.c b/arch/x86/kernel/cpu/perf_event_p6.c index 1ca5ba078afd..a330485d14da 100644 --- a/arch/x86/kernel/cpu/perf_event_p6.c +++ b/arch/x86/kernel/cpu/perf_event_p6.c @@ -62,7 +62,7 @@ static void p6_pmu_disable_all(void)  	/* p6 only has one enable register */  	rdmsrl(MSR_P6_EVNTSEL0, val); -	val &= ~ARCH_PERFMON_EVENTSEL0_ENABLE; +	val &= ~ARCH_PERFMON_EVENTSEL_ENABLE;  	wrmsrl(MSR_P6_EVNTSEL0, val);  } @@ -72,32 +72,34 @@ static void p6_pmu_enable_all(void)  	/* p6 only has one enable register */  	rdmsrl(MSR_P6_EVNTSEL0, val); -	val |= ARCH_PERFMON_EVENTSEL0_ENABLE; +	val |= ARCH_PERFMON_EVENTSEL_ENABLE;  	wrmsrl(MSR_P6_EVNTSEL0, val);  }  static inline void -p6_pmu_disable_event(struct hw_perf_event *hwc, int idx) +p6_pmu_disable_event(struct perf_event *event)  {  	struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events); +	struct hw_perf_event *hwc = &event->hw;  	u64 val = P6_NOP_EVENT;  	if (cpuc->enabled) -		val |= ARCH_PERFMON_EVENTSEL0_ENABLE; +		val |= ARCH_PERFMON_EVENTSEL_ENABLE; -	(void)checking_wrmsrl(hwc->config_base + idx, val); +	(void)checking_wrmsrl(hwc->config_base + hwc->idx, val);  } -static void p6_pmu_enable_event(struct hw_perf_event *hwc, int idx) +static void p6_pmu_enable_event(struct perf_event *event)  {  	struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events); +	struct hw_perf_event *hwc = &event->hw;  	u64 val;  	val = hwc->config;  	if (cpuc->enabled) -		val |= ARCH_PERFMON_EVENTSEL0_ENABLE; +		val |= ARCH_PERFMON_EVENTSEL_ENABLE; -	(void)checking_wrmsrl(hwc->config_base + idx, val); +	(void)checking_wrmsrl(hwc->config_base + hwc->idx, val);  }  static __initconst struct x86_pmu p6_pmu = { diff --git a/arch/x86/kernel/cpu/perfctr-watchdog.c b/arch/x86/kernel/cpu/perfctr-watchdog.c index 74f4e85a5727..fb329e9f8494 100644 --- a/arch/x86/kernel/cpu/perfctr-watchdog.c +++ b/arch/x86/kernel/cpu/perfctr-watchdog.c @@ -680,7 +680,7 @@ static int setup_intel_arch_watchdog(unsigned nmi_hz)  	cpu_nmi_set_wd_enabled();  	apic_write(APIC_LVTPC, APIC_DM_NMI); -	evntsel |= ARCH_PERFMON_EVENTSEL0_ENABLE; +	evntsel |= ARCH_PERFMON_EVENTSEL_ENABLE;  	wrmsr(evntsel_msr, evntsel, 0);  	intel_arch_wd_ops.checkbit = 1ULL << (eax.split.bit_width - 1);  	return 1; diff --git a/arch/x86/kernel/dumpstack.h b/arch/x86/kernel/dumpstack.h index 4fd1420faffa..29e5f7c845b2 100644 --- a/arch/x86/kernel/dumpstack.h +++ b/arch/x86/kernel/dumpstack.h @@ -29,4 +29,19 @@ struct stack_frame {  	struct stack_frame *next_frame;  	unsigned long return_address;  }; + +static inline unsigned long rewind_frame_pointer(int n) +{ +	struct stack_frame *frame; + +	get_bp(frame); + +#ifdef CONFIG_FRAME_POINTER +	while (n--) +		frame = frame->next_frame;  #endif + +	return (unsigned long)frame; +} + +#endif /* DUMPSTACK_H */ diff --git a/arch/x86/kernel/dumpstack_64.c b/arch/x86/kernel/dumpstack_64.c index dce99abb4496..272c9f1f05f3 100644 --- a/arch/x86/kernel/dumpstack_64.c +++ b/arch/x86/kernel/dumpstack_64.c @@ -120,9 +120,15 @@ fixup_bp_irq_link(unsigned long bp, unsigned long *stack,  {  #ifdef CONFIG_FRAME_POINTER  	struct stack_frame *frame = (struct stack_frame *)bp; +	unsigned long next; -	if (!in_irq_stack(stack, irq_stack, 
irq_stack_end)) -		return (unsigned long)frame->next_frame; +	if (!in_irq_stack(stack, irq_stack, irq_stack_end)) { +		if (!probe_kernel_address(&frame->next_frame, next)) +			return next; +		else +			WARN_ONCE(1, "Perf: bad frame pointer = %p in " +				  "callchain\n", &frame->next_frame); +	}  #endif  	return bp;  } @@ -202,7 +208,7 @@ void dump_trace(struct task_struct *task, struct pt_regs *regs,  			if (in_irq_stack(stack, irq_stack, irq_stack_end)) {  				if (ops->stack(data, "IRQ") < 0)  					break; -				bp = print_context_stack(tinfo, stack, bp, +				bp = ops->walk_stack(tinfo, stack, bp,  					ops, data, irq_stack_end, &graph);  				/*  				 * We link to the next stack (which would be @@ -223,7 +229,7 @@ void dump_trace(struct task_struct *task, struct pt_regs *regs,  	/*  	 * This handles the process stack:  	 */ -	bp = print_context_stack(tinfo, stack, bp, ops, data, NULL, &graph); +	bp = ops->walk_stack(tinfo, stack, bp, ops, data, NULL, &graph);  	put_cpu();  }  EXPORT_SYMBOL(dump_trace); diff --git a/arch/x86/kernel/head32.c b/arch/x86/kernel/head32.c index adedeef1dedc..b2e246037392 100644 --- a/arch/x86/kernel/head32.c +++ b/arch/x86/kernel/head32.c @@ -7,6 +7,7 @@  #include <linux/init.h>  #include <linux/start_kernel.h> +#include <linux/mm.h>  #include <asm/setup.h>  #include <asm/sections.h> @@ -44,9 +45,10 @@ void __init i386_start_kernel(void)  #ifdef CONFIG_BLK_DEV_INITRD  	/* Reserve INITRD */  	if (boot_params.hdr.type_of_loader && boot_params.hdr.ramdisk_image) { +		/* Assume only end is not page aligned */  		u64 ramdisk_image = boot_params.hdr.ramdisk_image;  		u64 ramdisk_size  = boot_params.hdr.ramdisk_size; -		u64 ramdisk_end   = ramdisk_image + ramdisk_size; +		u64 ramdisk_end   = PAGE_ALIGN(ramdisk_image + ramdisk_size);  		reserve_early(ramdisk_image, ramdisk_end, "RAMDISK");  	}  #endif diff --git a/arch/x86/kernel/head64.c b/arch/x86/kernel/head64.c index b5a9896ca1e7..7147143fd614 100644 --- a/arch/x86/kernel/head64.c +++ b/arch/x86/kernel/head64.c @@ -103,9 +103,10 @@ void __init x86_64_start_reservations(char *real_mode_data)  #ifdef CONFIG_BLK_DEV_INITRD  	/* Reserve INITRD */  	if (boot_params.hdr.type_of_loader && boot_params.hdr.ramdisk_image) { +		/* Assume only end is not page aligned */  		unsigned long ramdisk_image = boot_params.hdr.ramdisk_image;  		unsigned long ramdisk_size  = boot_params.hdr.ramdisk_size; -		unsigned long ramdisk_end   = ramdisk_image + ramdisk_size; +		unsigned long ramdisk_end   = PAGE_ALIGN(ramdisk_image + ramdisk_size);  		reserve_early(ramdisk_image, ramdisk_end, "RAMDISK");  	}  #endif diff --git a/arch/x86/kernel/head_64.S b/arch/x86/kernel/head_64.S index 2d8b5035371c..3d1e6f16b7a6 100644 --- a/arch/x86/kernel/head_64.S +++ b/arch/x86/kernel/head_64.S @@ -27,7 +27,7 @@  #define GET_CR2_INTO_RCX movq %cr2, %rcx  #endif -/* we are not able to switch in one step to the final KERNEL ADRESS SPACE +/* we are not able to switch in one step to the final KERNEL ADDRESS SPACE   * because we need identity-mapped pages.   *   */ diff --git a/arch/x86/kernel/hw_breakpoint.c b/arch/x86/kernel/hw_breakpoint.c index dca2802c666f..d6cc065f519f 100644 --- a/arch/x86/kernel/hw_breakpoint.c +++ b/arch/x86/kernel/hw_breakpoint.c @@ -344,13 +344,6 @@ int arch_validate_hwbkpt_settings(struct perf_event *bp,  	}  	/* -	 * For kernel-addresses, either the address or symbol name can be -	 * specified. 
-	 */ -	if (info->name) -		info->address = (unsigned long) -				kallsyms_lookup_name(info->name); -	/*  	 * Check that the low-order bits of the address are appropriate  	 * for the alignment implied by len.  	 */ @@ -535,8 +528,3 @@ void hw_breakpoint_pmu_read(struct perf_event *bp)  {  	/* TODO */  } - -void hw_breakpoint_pmu_unthrottle(struct perf_event *bp) -{ -	/* TODO */ -} diff --git a/arch/x86/kernel/i8259.c b/arch/x86/kernel/i8259.c index 8c93a84bb627..fb725ee15f55 100644 --- a/arch/x86/kernel/i8259.c +++ b/arch/x86/kernel/i8259.c @@ -34,6 +34,12 @@  static int i8259A_auto_eoi;  DEFINE_RAW_SPINLOCK(i8259A_lock);  static void mask_and_ack_8259A(unsigned int); +static void mask_8259A(void); +static void unmask_8259A(void); +static void disable_8259A_irq(unsigned int irq); +static void enable_8259A_irq(unsigned int irq); +static void init_8259A(int auto_eoi); +static int i8259A_irq_pending(unsigned int irq);  struct irq_chip i8259A_chip = {  	.name		= "XT-PIC", @@ -63,7 +69,7 @@ unsigned int cached_irq_mask = 0xffff;   */  unsigned long io_apic_irqs; -void disable_8259A_irq(unsigned int irq) +static void disable_8259A_irq(unsigned int irq)  {  	unsigned int mask = 1 << irq;  	unsigned long flags; @@ -77,7 +83,7 @@ void disable_8259A_irq(unsigned int irq)  	raw_spin_unlock_irqrestore(&i8259A_lock, flags);  } -void enable_8259A_irq(unsigned int irq) +static void enable_8259A_irq(unsigned int irq)  {  	unsigned int mask = ~(1 << irq);  	unsigned long flags; @@ -91,7 +97,7 @@ void enable_8259A_irq(unsigned int irq)  	raw_spin_unlock_irqrestore(&i8259A_lock, flags);  } -int i8259A_irq_pending(unsigned int irq) +static int i8259A_irq_pending(unsigned int irq)  {  	unsigned int mask = 1<<irq;  	unsigned long flags; @@ -107,7 +113,7 @@ int i8259A_irq_pending(unsigned int irq)  	return ret;  } -void make_8259A_irq(unsigned int irq) +static void make_8259A_irq(unsigned int irq)  {  	disable_irq_nosync(irq);  	io_apic_irqs &= ~(1<<irq); @@ -281,7 +287,7 @@ static int __init i8259A_init_sysfs(void)  device_initcall(i8259A_init_sysfs); -void mask_8259A(void) +static void mask_8259A(void)  {  	unsigned long flags; @@ -293,7 +299,7 @@ void mask_8259A(void)  	raw_spin_unlock_irqrestore(&i8259A_lock, flags);  } -void unmask_8259A(void) +static void unmask_8259A(void)  {  	unsigned long flags; @@ -305,7 +311,7 @@ void unmask_8259A(void)  	raw_spin_unlock_irqrestore(&i8259A_lock, flags);  } -void init_8259A(int auto_eoi) +static void init_8259A(int auto_eoi)  {  	unsigned long flags; @@ -358,3 +364,47 @@ void init_8259A(int auto_eoi)  	raw_spin_unlock_irqrestore(&i8259A_lock, flags);  } + +/* + * make i8259 a driver so that we can select pic functions at run time. the goal + * is to make x86 binary compatible among pc compatible and non-pc compatible + * platforms, such as x86 MID. 
+ */ + +static void legacy_pic_noop(void) { }; +static void legacy_pic_uint_noop(unsigned int unused) { }; +static void legacy_pic_int_noop(int unused) { }; + +static struct irq_chip dummy_pic_chip  = { +	.name = "dummy pic", +	.mask = legacy_pic_uint_noop, +	.unmask = legacy_pic_uint_noop, +	.disable = legacy_pic_uint_noop, +	.mask_ack = legacy_pic_uint_noop, +}; +static int legacy_pic_irq_pending_noop(unsigned int irq) +{ +	return 0; +} + +struct legacy_pic null_legacy_pic = { +	.nr_legacy_irqs = 0, +	.chip = &dummy_pic_chip, +	.mask_all = legacy_pic_noop, +	.restore_mask = legacy_pic_noop, +	.init = legacy_pic_int_noop, +	.irq_pending = legacy_pic_irq_pending_noop, +	.make_irq = legacy_pic_uint_noop, +}; + +struct legacy_pic default_legacy_pic = { +	.nr_legacy_irqs = NR_IRQS_LEGACY, +	.chip  = &i8259A_chip, +	.mask_all  = mask_8259A, +	.restore_mask = unmask_8259A, +	.init = init_8259A, +	.irq_pending = i8259A_irq_pending, +	.make_irq = make_8259A_irq, +}; + +struct legacy_pic *legacy_pic = &default_legacy_pic; diff --git a/arch/x86/kernel/irqinit.c b/arch/x86/kernel/irqinit.c index fce55d532631..f01d390f9c5b 100644 --- a/arch/x86/kernel/irqinit.c +++ b/arch/x86/kernel/irqinit.c @@ -99,9 +99,6 @@ int vector_used_by_percpu_irq(unsigned int vector)  	return 0;  } -/* Number of legacy interrupts */ -int nr_legacy_irqs __read_mostly = NR_IRQS_LEGACY; -  void __init init_ISA_irqs(void)  {  	int i; @@ -109,12 +106,12 @@ void __init init_ISA_irqs(void)  #if defined(CONFIG_X86_64) || defined(CONFIG_X86_LOCAL_APIC)  	init_bsp_APIC();  #endif -	init_8259A(0); +	legacy_pic->init(0);  	/*  	 * 16 old-style INTA-cycle interrupts:  	 */ -	for (i = 0; i < NR_IRQS_LEGACY; i++) { +	for (i = 0; i < legacy_pic->nr_legacy_irqs; i++) {  		struct irq_desc *desc = irq_to_desc(i);  		desc->status = IRQ_DISABLED; @@ -138,12 +135,34 @@ void __init init_IRQ(void)  	 * then this vector space can be freed and re-used dynamically as the  	 * irq's migrate etc.  	 */ -	for (i = 0; i < nr_legacy_irqs; i++) +	for (i = 0; i < legacy_pic->nr_legacy_irqs; i++)  		per_cpu(vector_irq, 0)[IRQ0_VECTOR + i] = i;  	x86_init.irqs.intr_init();  } +/* + * Setup the vector to irq mappings. + */ +void setup_vector_irq(int cpu) +{ +#ifndef CONFIG_X86_IO_APIC +	int irq; + +	/* +	 * On most of the platforms, legacy PIC delivers the interrupts on the +	 * boot cpu. But there are certain platforms where PIC interrupts are +	 * delivered to multiple cpu's. 
If the legacy IRQ is handled by the +	 * legacy PIC, for the new cpu that is coming online, setup the static +	 * legacy vector to irq mapping: +	 */ +	for (irq = 0; irq < legacy_pic->nr_legacy_irqs; irq++) +		per_cpu(vector_irq, cpu)[IRQ0_VECTOR + irq] = irq; +#endif + +	__setup_vector_irq(cpu); +} +  static void __init smp_intr_init(void)  {  #ifdef CONFIG_SMP diff --git a/arch/x86/kernel/k8.c b/arch/x86/kernel/k8.c index cbc4332a77b2..9b895464dd03 100644 --- a/arch/x86/kernel/k8.c +++ b/arch/x86/kernel/k8.c @@ -121,3 +121,17 @@ void k8_flush_garts(void)  }  EXPORT_SYMBOL_GPL(k8_flush_garts); +static __init int init_k8_nbs(void) +{ +	int err = 0; + +	err = cache_k8_northbridges(); + +	if (err < 0) +		printk(KERN_NOTICE "K8 NB: Cannot enumerate AMD northbridges.\n"); + +	return err; +} + +/* This has to go after the PCI subsystem */ +fs_initcall(init_k8_nbs); diff --git a/arch/x86/kernel/kprobes.c b/arch/x86/kernel/kprobes.c index 5de9f4a9c3fd..b43bbaebe2c0 100644 --- a/arch/x86/kernel/kprobes.c +++ b/arch/x86/kernel/kprobes.c @@ -49,6 +49,7 @@  #include <linux/module.h>  #include <linux/kdebug.h>  #include <linux/kallsyms.h> +#include <linux/ftrace.h>  #include <asm/cacheflush.h>  #include <asm/desc.h> @@ -106,16 +107,22 @@ struct kretprobe_blackpoint kretprobe_blacklist[] = {  };  const int kretprobe_blacklist_size = ARRAY_SIZE(kretprobe_blacklist); -/* Insert a jump instruction at address 'from', which jumps to address 'to'.*/ -static void __kprobes set_jmp_op(void *from, void *to) +static void __kprobes __synthesize_relative_insn(void *from, void *to, u8 op)  { -	struct __arch_jmp_op { -		char op; +	struct __arch_relative_insn { +		u8 op;  		s32 raddr; -	} __attribute__((packed)) * jop; -	jop = (struct __arch_jmp_op *)from; -	jop->raddr = (s32)((long)(to) - ((long)(from) + 5)); -	jop->op = RELATIVEJUMP_INSTRUCTION; +	} __attribute__((packed)) *insn; + +	insn = (struct __arch_relative_insn *)from; +	insn->raddr = (s32)((long)(to) - ((long)(from) + 5)); +	insn->op = op; +} + +/* Insert a jump instruction at address 'from', which jumps to address 'to'.*/ +static void __kprobes synthesize_reljump(void *from, void *to) +{ +	__synthesize_relative_insn(from, to, RELATIVEJUMP_OPCODE);  }  /* @@ -202,7 +209,7 @@ static int recover_probed_instruction(kprobe_opcode_t *buf, unsigned long addr)  	/*  	 *  Basically, kp->ainsn.insn has an original instruction.  	 *  However, RIP-relative instruction can not do single-stepping -	 *  at different place, fix_riprel() tweaks the displacement of +	 *  at different place, __copy_instruction() tweaks the displacement of  	 *  that instruction. In that case, we can't recover the instruction  	 *  from the kp->ainsn.insn.  	 * @@ -284,21 +291,37 @@ static int __kprobes is_IF_modifier(kprobe_opcode_t *insn)  }  /* - * Adjust the displacement if the instruction uses the %rip-relative - * addressing mode. + * Copy an instruction and adjust the displacement if the instruction + * uses the %rip-relative addressing mode.   * If it does, Return the address of the 32-bit displacement word.   * If not, return null.   * Only applicable to 64-bit x86.   
*/ -static void __kprobes fix_riprel(struct kprobe *p) +static int __kprobes __copy_instruction(u8 *dest, u8 *src, int recover)  { -#ifdef CONFIG_X86_64  	struct insn insn; -	kernel_insn_init(&insn, p->ainsn.insn); +	int ret; +	kprobe_opcode_t buf[MAX_INSN_SIZE]; +	kernel_insn_init(&insn, src); +	if (recover) { +		insn_get_opcode(&insn); +		if (insn.opcode.bytes[0] == BREAKPOINT_INSTRUCTION) { +			ret = recover_probed_instruction(buf, +							 (unsigned long)src); +			if (ret) +				return 0; +			kernel_insn_init(&insn, buf); +		} +	} +	insn_get_length(&insn); +	memcpy(dest, insn.kaddr, insn.length); + +#ifdef CONFIG_X86_64  	if (insn_rip_relative(&insn)) {  		s64 newdisp;  		u8 *disp; +		kernel_insn_init(&insn, dest);  		insn_get_displacement(&insn);  		/*  		 * The copied instruction uses the %rip-relative addressing @@ -312,20 +335,23 @@ static void __kprobes fix_riprel(struct kprobe *p)  		 * extension of the original signed 32-bit displacement would  		 * have given.  		 */ -		newdisp = (u8 *) p->addr + (s64) insn.displacement.value - -			  (u8 *) p->ainsn.insn; +		newdisp = (u8 *) src + (s64) insn.displacement.value - +			  (u8 *) dest;  		BUG_ON((s64) (s32) newdisp != newdisp); /* Sanity check.  */ -		disp = (u8 *) p->ainsn.insn + insn_offset_displacement(&insn); +		disp = (u8 *) dest + insn_offset_displacement(&insn);  		*(s32 *) disp = (s32) newdisp;  	}  #endif +	return insn.length;  }  static void __kprobes arch_copy_kprobe(struct kprobe *p)  { -	memcpy(p->ainsn.insn, p->addr, MAX_INSN_SIZE * sizeof(kprobe_opcode_t)); - -	fix_riprel(p); +	/* +	 * Copy an instruction without recovering int3, because it will be +	 * put by another subsystem. +	 */ +	__copy_instruction(p->ainsn.insn, p->addr, 0);  	if (can_boost(p->addr))  		p->ainsn.boostable = 0; @@ -406,18 +432,6 @@ static void __kprobes restore_btf(void)  		update_debugctlmsr(current->thread.debugctlmsr);  } -static void __kprobes prepare_singlestep(struct kprobe *p, struct pt_regs *regs) -{ -	clear_btf(); -	regs->flags |= X86_EFLAGS_TF; -	regs->flags &= ~X86_EFLAGS_IF; -	/* single step inline if the instruction is an int3 */ -	if (p->opcode == BREAKPOINT_INSTRUCTION) -		regs->ip = (unsigned long)p->addr; -	else -		regs->ip = (unsigned long)p->ainsn.insn; -} -  void __kprobes arch_prepare_kretprobe(struct kretprobe_instance *ri,  				      struct pt_regs *regs)  { @@ -429,20 +443,50 @@ void __kprobes arch_prepare_kretprobe(struct kretprobe_instance *ri,  	*sara = (unsigned long) &kretprobe_trampoline;  } +#ifdef CONFIG_OPTPROBES +static int  __kprobes setup_detour_execution(struct kprobe *p, +					     struct pt_regs *regs, +					     int reenter); +#else +#define setup_detour_execution(p, regs, reenter) (0) +#endif +  static void __kprobes setup_singlestep(struct kprobe *p, struct pt_regs *regs, -				       struct kprobe_ctlblk *kcb) +				       struct kprobe_ctlblk *kcb, int reenter)  { +	if (setup_detour_execution(p, regs, reenter)) +		return; +  #if !defined(CONFIG_PREEMPT)  	if (p->ainsn.boostable == 1 && !p->post_handler) {  		/* Boost up -- we can execute copied instructions directly */ -		reset_current_kprobe(); +		if (!reenter) +			reset_current_kprobe(); +		/* +		 * Reentering boosted probe doesn't reset current_kprobe, +		 * nor set current_kprobe, because it doesn't use single +		 * stepping. 
+		 */  		regs->ip = (unsigned long)p->ainsn.insn;  		preempt_enable_no_resched();  		return;  	}  #endif -	prepare_singlestep(p, regs); -	kcb->kprobe_status = KPROBE_HIT_SS; +	if (reenter) { +		save_previous_kprobe(kcb); +		set_current_kprobe(p, regs, kcb); +		kcb->kprobe_status = KPROBE_REENTER; +	} else +		kcb->kprobe_status = KPROBE_HIT_SS; +	/* Prepare real single stepping */ +	clear_btf(); +	regs->flags |= X86_EFLAGS_TF; +	regs->flags &= ~X86_EFLAGS_IF; +	/* single step inline if the instruction is an int3 */ +	if (p->opcode == BREAKPOINT_INSTRUCTION) +		regs->ip = (unsigned long)p->addr; +	else +		regs->ip = (unsigned long)p->ainsn.insn;  }  /* @@ -456,11 +500,8 @@ static int __kprobes reenter_kprobe(struct kprobe *p, struct pt_regs *regs,  	switch (kcb->kprobe_status) {  	case KPROBE_HIT_SSDONE:  	case KPROBE_HIT_ACTIVE: -		save_previous_kprobe(kcb); -		set_current_kprobe(p, regs, kcb);  		kprobes_inc_nmissed_count(p); -		prepare_singlestep(p, regs); -		kcb->kprobe_status = KPROBE_REENTER; +		setup_singlestep(p, regs, kcb, 1);  		break;  	case KPROBE_HIT_SS:  		/* A probe has been hit in the codepath leading up to, or just @@ -535,13 +576,13 @@ static int __kprobes kprobe_handler(struct pt_regs *regs)  			 * more here.  			 */  			if (!p->pre_handler || !p->pre_handler(p, regs)) -				setup_singlestep(p, regs, kcb); +				setup_singlestep(p, regs, kcb, 0);  			return 1;  		}  	} else if (kprobe_running()) {  		p = __get_cpu_var(current_kprobe);  		if (p->break_handler && p->break_handler(p, regs)) { -			setup_singlestep(p, regs, kcb); +			setup_singlestep(p, regs, kcb, 0);  			return 1;  		}  	} /* else: not a kprobe fault; let the kernel handle it */ @@ -550,6 +591,69 @@ static int __kprobes kprobe_handler(struct pt_regs *regs)  	return 0;  } +#ifdef CONFIG_X86_64 +#define SAVE_REGS_STRING		\ +	/* Skip cs, ip, orig_ax. */	\ +	"	subq $24, %rsp\n"	\ +	"	pushq %rdi\n"		\ +	"	pushq %rsi\n"		\ +	"	pushq %rdx\n"		\ +	"	pushq %rcx\n"		\ +	"	pushq %rax\n"		\ +	"	pushq %r8\n"		\ +	"	pushq %r9\n"		\ +	"	pushq %r10\n"		\ +	"	pushq %r11\n"		\ +	"	pushq %rbx\n"		\ +	"	pushq %rbp\n"		\ +	"	pushq %r12\n"		\ +	"	pushq %r13\n"		\ +	"	pushq %r14\n"		\ +	"	pushq %r15\n" +#define RESTORE_REGS_STRING		\ +	"	popq %r15\n"		\ +	"	popq %r14\n"		\ +	"	popq %r13\n"		\ +	"	popq %r12\n"		\ +	"	popq %rbp\n"		\ +	"	popq %rbx\n"		\ +	"	popq %r11\n"		\ +	"	popq %r10\n"		\ +	"	popq %r9\n"		\ +	"	popq %r8\n"		\ +	"	popq %rax\n"		\ +	"	popq %rcx\n"		\ +	"	popq %rdx\n"		\ +	"	popq %rsi\n"		\ +	"	popq %rdi\n"		\ +	/* Skip orig_ax, ip, cs */	\ +	"	addq $24, %rsp\n" +#else +#define SAVE_REGS_STRING		\ +	/* Skip cs, ip, orig_ax and gs. */	\ +	"	subl $16, %esp\n"	\ +	"	pushl %fs\n"		\ +	"	pushl %ds\n"		\ +	"	pushl %es\n"		\ +	"	pushl %eax\n"		\ +	"	pushl %ebp\n"		\ +	"	pushl %edi\n"		\ +	"	pushl %esi\n"		\ +	"	pushl %edx\n"		\ +	"	pushl %ecx\n"		\ +	"	pushl %ebx\n" +#define RESTORE_REGS_STRING		\ +	"	popl %ebx\n"		\ +	"	popl %ecx\n"		\ +	"	popl %edx\n"		\ +	"	popl %esi\n"		\ +	"	popl %edi\n"		\ +	"	popl %ebp\n"		\ +	"	popl %eax\n"		\ +	/* Skip ds, es, fs, gs, orig_ax, and ip. Note: don't pop cs here*/\ +	"	addl $24, %esp\n" +#endif +  /*   * When a retprobed function returns, this code saves registers and   * calls trampoline_handler() runs, which calls the kretprobe's handler. @@ -563,65 +667,16 @@ static void __used __kprobes kretprobe_trampoline_holder(void)  			/* We don't bother saving the ss register */  			"	pushq %rsp\n"  			"	pushfq\n" -			/* -			 * Skip cs, ip, orig_ax. 
-			 * trampoline_handler() will plug in these values -			 */ -			"	subq $24, %rsp\n" -			"	pushq %rdi\n" -			"	pushq %rsi\n" -			"	pushq %rdx\n" -			"	pushq %rcx\n" -			"	pushq %rax\n" -			"	pushq %r8\n" -			"	pushq %r9\n" -			"	pushq %r10\n" -			"	pushq %r11\n" -			"	pushq %rbx\n" -			"	pushq %rbp\n" -			"	pushq %r12\n" -			"	pushq %r13\n" -			"	pushq %r14\n" -			"	pushq %r15\n" +			SAVE_REGS_STRING  			"	movq %rsp, %rdi\n"  			"	call trampoline_handler\n"  			/* Replace saved sp with true return address. */  			"	movq %rax, 152(%rsp)\n" -			"	popq %r15\n" -			"	popq %r14\n" -			"	popq %r13\n" -			"	popq %r12\n" -			"	popq %rbp\n" -			"	popq %rbx\n" -			"	popq %r11\n" -			"	popq %r10\n" -			"	popq %r9\n" -			"	popq %r8\n" -			"	popq %rax\n" -			"	popq %rcx\n" -			"	popq %rdx\n" -			"	popq %rsi\n" -			"	popq %rdi\n" -			/* Skip orig_ax, ip, cs */ -			"	addq $24, %rsp\n" +			RESTORE_REGS_STRING  			"	popfq\n"  #else  			"	pushf\n" -			/* -			 * Skip cs, ip, orig_ax and gs. -			 * trampoline_handler() will plug in these values -			 */ -			"	subl $16, %esp\n" -			"	pushl %fs\n" -			"	pushl %es\n" -			"	pushl %ds\n" -			"	pushl %eax\n" -			"	pushl %ebp\n" -			"	pushl %edi\n" -			"	pushl %esi\n" -			"	pushl %edx\n" -			"	pushl %ecx\n" -			"	pushl %ebx\n" +			SAVE_REGS_STRING  			"	movl %esp, %eax\n"  			"	call trampoline_handler\n"  			/* Move flags to cs */ @@ -629,15 +684,7 @@ static void __used __kprobes kretprobe_trampoline_holder(void)  			"	movl %edx, 52(%esp)\n"  			/* Replace saved flags with true return address. */  			"	movl %eax, 56(%esp)\n" -			"	popl %ebx\n" -			"	popl %ecx\n" -			"	popl %edx\n" -			"	popl %esi\n" -			"	popl %edi\n" -			"	popl %ebp\n" -			"	popl %eax\n" -			/* Skip ds, es, fs, gs, orig_ax and ip */ -			"	addl $24, %esp\n" +			RESTORE_REGS_STRING  			"	popf\n"  #endif  			"	ret\n"); @@ -805,8 +852,8 @@ static void __kprobes resume_execution(struct kprobe *p,  			 * These instructions can be executed directly if it  			 * jumps back to correct address.  			 */ -			set_jmp_op((void *)regs->ip, -				   (void *)orig_ip + (regs->ip - copy_ip)); +			synthesize_reljump((void *)regs->ip, +				(void *)orig_ip + (regs->ip - copy_ip));  			p->ainsn.boostable = 1;  		} else {  			p->ainsn.boostable = -1; @@ -1033,6 +1080,358 @@ int __kprobes longjmp_break_handler(struct kprobe *p, struct pt_regs *regs)  	return 0;  } + +#ifdef CONFIG_OPTPROBES + +/* Insert a call instruction at address 'from', which calls address 'to'.*/ +static void __kprobes synthesize_relcall(void *from, void *to) +{ +	__synthesize_relative_insn(from, to, RELATIVECALL_OPCODE); +} + +/* Insert a move instruction which sets a pointer to eax/rdi (1st arg). 
*/ +static void __kprobes synthesize_set_arg1(kprobe_opcode_t *addr, +					  unsigned long val) +{ +#ifdef CONFIG_X86_64 +	*addr++ = 0x48; +	*addr++ = 0xbf; +#else +	*addr++ = 0xb8; +#endif +	*(unsigned long *)addr = val; +} + +void __kprobes kprobes_optinsn_template_holder(void) +{ +	asm volatile ( +			".global optprobe_template_entry\n" +			"optprobe_template_entry: \n" +#ifdef CONFIG_X86_64 +			/* We don't bother saving the ss register */ +			"	pushq %rsp\n" +			"	pushfq\n" +			SAVE_REGS_STRING +			"	movq %rsp, %rsi\n" +			".global optprobe_template_val\n" +			"optprobe_template_val: \n" +			ASM_NOP5 +			ASM_NOP5 +			".global optprobe_template_call\n" +			"optprobe_template_call: \n" +			ASM_NOP5 +			/* Move flags to rsp */ +			"	movq 144(%rsp), %rdx\n" +			"	movq %rdx, 152(%rsp)\n" +			RESTORE_REGS_STRING +			/* Skip flags entry */ +			"	addq $8, %rsp\n" +			"	popfq\n" +#else /* CONFIG_X86_32 */ +			"	pushf\n" +			SAVE_REGS_STRING +			"	movl %esp, %edx\n" +			".global optprobe_template_val\n" +			"optprobe_template_val: \n" +			ASM_NOP5 +			".global optprobe_template_call\n" +			"optprobe_template_call: \n" +			ASM_NOP5 +			RESTORE_REGS_STRING +			"	addl $4, %esp\n"	/* skip cs */ +			"	popf\n" +#endif +			".global optprobe_template_end\n" +			"optprobe_template_end: \n"); +} + +#define TMPL_MOVE_IDX \ +	((long)&optprobe_template_val - (long)&optprobe_template_entry) +#define TMPL_CALL_IDX \ +	((long)&optprobe_template_call - (long)&optprobe_template_entry) +#define TMPL_END_IDX \ +	((long)&optprobe_template_end - (long)&optprobe_template_entry) + +#define INT3_SIZE sizeof(kprobe_opcode_t) + +/* Optimized kprobe call back function: called from optinsn */ +static void __kprobes optimized_callback(struct optimized_kprobe *op, +					 struct pt_regs *regs) +{ +	struct kprobe_ctlblk *kcb = get_kprobe_ctlblk(); + +	preempt_disable(); +	if (kprobe_running()) { +		kprobes_inc_nmissed_count(&op->kp); +	} else { +		/* Save skipped registers */ +#ifdef CONFIG_X86_64 +		regs->cs = __KERNEL_CS; +#else +		regs->cs = __KERNEL_CS | get_kernel_rpl(); +		regs->gs = 0; +#endif +		regs->ip = (unsigned long)op->kp.addr + INT3_SIZE; +		regs->orig_ax = ~0UL; + +		__get_cpu_var(current_kprobe) = &op->kp; +		kcb->kprobe_status = KPROBE_HIT_ACTIVE; +		opt_pre_handler(&op->kp, regs); +		__get_cpu_var(current_kprobe) = NULL; +	} +	preempt_enable_no_resched(); +} + +static int __kprobes copy_optimized_instructions(u8 *dest, u8 *src) +{ +	int len = 0, ret; + +	while (len < RELATIVEJUMP_SIZE) { +		ret = __copy_instruction(dest + len, src + len, 1); +		if (!ret || !can_boost(dest + len)) +			return -EINVAL; +		len += ret; +	} +	/* Check whether the address range is reserved */ +	if (ftrace_text_reserved(src, src + len - 1) || +	    alternatives_text_reserved(src, src + len - 1)) +		return -EBUSY; + +	return len; +} + +/* Check whether insn is indirect jump */ +static int __kprobes insn_is_indirect_jump(struct insn *insn) +{ +	return ((insn->opcode.bytes[0] == 0xff && +		(X86_MODRM_REG(insn->modrm.value) & 6) == 4) || /* Jump */ +		insn->opcode.bytes[0] == 0xea);	/* Segment based jump */ +} + +/* Check whether insn jumps into specified address range */ +static int insn_jump_into_range(struct insn *insn, unsigned long start, int len) +{ +	unsigned long target = 0; + +	switch (insn->opcode.bytes[0]) { +	case 0xe0:	/* loopne */ +	case 0xe1:	/* loope */ +	case 0xe2:	/* loop */ +	case 0xe3:	/* jcxz */ +	case 0xe9:	/* near relative jump */ +	case 0xeb:	/* short relative jump */ +		break; +	case 0x0f: +		if 
((insn->opcode.bytes[1] & 0xf0) == 0x80) /* jcc near */ +			break; +		return 0; +	default: +		if ((insn->opcode.bytes[0] & 0xf0) == 0x70) /* jcc short */ +			break; +		return 0; +	} +	target = (unsigned long)insn->next_byte + insn->immediate.value; + +	return (start <= target && target <= start + len); +} + +/* Decode whole function to ensure any instructions don't jump into target */ +static int __kprobes can_optimize(unsigned long paddr) +{ +	int ret; +	unsigned long addr, size = 0, offset = 0; +	struct insn insn; +	kprobe_opcode_t buf[MAX_INSN_SIZE]; +	/* Dummy buffers for lookup_symbol_attrs */ +	static char __dummy_buf[KSYM_NAME_LEN]; + +	/* Lookup symbol including addr */ +	if (!kallsyms_lookup(paddr, &size, &offset, NULL, __dummy_buf)) +		return 0; + +	/* Check there is enough space for a relative jump. */ +	if (size - offset < RELATIVEJUMP_SIZE) +		return 0; + +	/* Decode instructions */ +	addr = paddr - offset; +	while (addr < paddr - offset + size) { /* Decode until function end */ +		if (search_exception_tables(addr)) +			/* +			 * Since some fixup code will jumps into this function, +			 * we can't optimize kprobe in this function. +			 */ +			return 0; +		kernel_insn_init(&insn, (void *)addr); +		insn_get_opcode(&insn); +		if (insn.opcode.bytes[0] == BREAKPOINT_INSTRUCTION) { +			ret = recover_probed_instruction(buf, addr); +			if (ret) +				return 0; +			kernel_insn_init(&insn, buf); +		} +		insn_get_length(&insn); +		/* Recover address */ +		insn.kaddr = (void *)addr; +		insn.next_byte = (void *)(addr + insn.length); +		/* Check any instructions don't jump into target */ +		if (insn_is_indirect_jump(&insn) || +		    insn_jump_into_range(&insn, paddr + INT3_SIZE, +					 RELATIVE_ADDR_SIZE)) +			return 0; +		addr += insn.length; +	} + +	return 1; +} + +/* Check optimized_kprobe can actually be optimized. */ +int __kprobes arch_check_optimized_kprobe(struct optimized_kprobe *op) +{ +	int i; +	struct kprobe *p; + +	for (i = 1; i < op->optinsn.size; i++) { +		p = get_kprobe(op->kp.addr + i); +		if (p && !kprobe_disabled(p)) +			return -EEXIST; +	} + +	return 0; +} + +/* Check the addr is within the optimized instructions. */ +int __kprobes arch_within_optimized_kprobe(struct optimized_kprobe *op, +					   unsigned long addr) +{ +	return ((unsigned long)op->kp.addr <= addr && +		(unsigned long)op->kp.addr + op->optinsn.size > addr); +} + +/* Free optimized instruction slot */ +static __kprobes +void __arch_remove_optimized_kprobe(struct optimized_kprobe *op, int dirty) +{ +	if (op->optinsn.insn) { +		free_optinsn_slot(op->optinsn.insn, dirty); +		op->optinsn.insn = NULL; +		op->optinsn.size = 0; +	} +} + +void __kprobes arch_remove_optimized_kprobe(struct optimized_kprobe *op) +{ +	__arch_remove_optimized_kprobe(op, 1); +} + +/* + * Copy replacing target instructions + * Target instructions MUST be relocatable (checked inside) + */ +int __kprobes arch_prepare_optimized_kprobe(struct optimized_kprobe *op) +{ +	u8 *buf; +	int ret; +	long rel; + +	if (!can_optimize((unsigned long)op->kp.addr)) +		return -EILSEQ; + +	op->optinsn.insn = get_optinsn_slot(); +	if (!op->optinsn.insn) +		return -ENOMEM; + +	/* +	 * Verify if the address gap is in 2GB range, because this uses +	 * a relative jump. 
+	 */ +	rel = (long)op->optinsn.insn - (long)op->kp.addr + RELATIVEJUMP_SIZE; +	if (abs(rel) > 0x7fffffff) +		return -ERANGE; + +	buf = (u8 *)op->optinsn.insn; + +	/* Copy instructions into the out-of-line buffer */ +	ret = copy_optimized_instructions(buf + TMPL_END_IDX, op->kp.addr); +	if (ret < 0) { +		__arch_remove_optimized_kprobe(op, 0); +		return ret; +	} +	op->optinsn.size = ret; + +	/* Copy arch-dep-instance from template */ +	memcpy(buf, &optprobe_template_entry, TMPL_END_IDX); + +	/* Set probe information */ +	synthesize_set_arg1(buf + TMPL_MOVE_IDX, (unsigned long)op); + +	/* Set probe function call */ +	synthesize_relcall(buf + TMPL_CALL_IDX, optimized_callback); + +	/* Set returning jmp instruction at the tail of out-of-line buffer */ +	synthesize_reljump(buf + TMPL_END_IDX + op->optinsn.size, +			   (u8 *)op->kp.addr + op->optinsn.size); + +	flush_icache_range((unsigned long) buf, +			   (unsigned long) buf + TMPL_END_IDX + +			   op->optinsn.size + RELATIVEJUMP_SIZE); +	return 0; +} + +/* Replace a breakpoint (int3) with a relative jump.  */ +int __kprobes arch_optimize_kprobe(struct optimized_kprobe *op) +{ +	unsigned char jmp_code[RELATIVEJUMP_SIZE]; +	s32 rel = (s32)((long)op->optinsn.insn - +			((long)op->kp.addr + RELATIVEJUMP_SIZE)); + +	/* Backup instructions which will be replaced by jump address */ +	memcpy(op->optinsn.copied_insn, op->kp.addr + INT3_SIZE, +	       RELATIVE_ADDR_SIZE); + +	jmp_code[0] = RELATIVEJUMP_OPCODE; +	*(s32 *)(&jmp_code[1]) = rel; + +	/* +	 * text_poke_smp doesn't support NMI/MCE code modifying. +	 * However, since kprobes itself also doesn't support NMI/MCE +	 * code probing, it's not a problem. +	 */ +	text_poke_smp(op->kp.addr, jmp_code, RELATIVEJUMP_SIZE); +	return 0; +} + +/* Replace a relative jump with a breakpoint (int3).  */ +void __kprobes arch_unoptimize_kprobe(struct optimized_kprobe *op) +{ +	u8 buf[RELATIVEJUMP_SIZE]; + +	/* Set int3 to first byte for kprobes */ +	buf[0] = BREAKPOINT_INSTRUCTION; +	memcpy(buf + 1, op->optinsn.copied_insn, RELATIVE_ADDR_SIZE); +	text_poke_smp(op->kp.addr, buf, RELATIVEJUMP_SIZE); +} + +static int  __kprobes setup_detour_execution(struct kprobe *p, +					     struct pt_regs *regs, +					     int reenter) +{ +	struct optimized_kprobe *op; + +	if (p->flags & KPROBE_FLAG_OPTIMIZED) { +		/* This kprobe is really able to run optimized path. */ +		op = container_of(p, struct optimized_kprobe, kp); +		/* Detour through copied instructions */ +		regs->ip = (unsigned long)op->optinsn.insn + TMPL_END_IDX; +		if (!reenter) +			reset_current_kprobe(); +		preempt_enable_no_resched(); +		return 1; +	} +	return 0; +} +#endif +  int __init arch_init_kprobes(void)  {  	return 0; diff --git a/arch/x86/kernel/mrst.c b/arch/x86/kernel/mrst.c index 3b7078abc871..0aad8670858e 100644 --- a/arch/x86/kernel/mrst.c +++ b/arch/x86/kernel/mrst.c @@ -10,8 +10,211 @@   * of the License.   
*/  #include <linux/init.h> +#include <linux/kernel.h> +#include <linux/sfi.h> +#include <linux/irq.h> +#include <linux/module.h>  #include <asm/setup.h> +#include <asm/mpspec_def.h> +#include <asm/hw_irq.h> +#include <asm/apic.h> +#include <asm/io_apic.h> +#include <asm/mrst.h> +#include <asm/io.h> +#include <asm/i8259.h> +#include <asm/apb_timer.h> + +static u32 sfi_mtimer_usage[SFI_MTMR_MAX_NUM]; +static struct sfi_timer_table_entry sfi_mtimer_array[SFI_MTMR_MAX_NUM]; +int sfi_mtimer_num; + +struct sfi_rtc_table_entry sfi_mrtc_array[SFI_MRTC_MAX]; +EXPORT_SYMBOL_GPL(sfi_mrtc_array); +int sfi_mrtc_num; + +static inline void assign_to_mp_irq(struct mpc_intsrc *m, +				    struct mpc_intsrc *mp_irq) +{ +	memcpy(mp_irq, m, sizeof(struct mpc_intsrc)); +} + +static inline int mp_irq_cmp(struct mpc_intsrc *mp_irq, +				struct mpc_intsrc *m) +{ +	return memcmp(mp_irq, m, sizeof(struct mpc_intsrc)); +} + +static void save_mp_irq(struct mpc_intsrc *m) +{ +	int i; + +	for (i = 0; i < mp_irq_entries; i++) { +		if (!mp_irq_cmp(&mp_irqs[i], m)) +			return; +	} + +	assign_to_mp_irq(m, &mp_irqs[mp_irq_entries]); +	if (++mp_irq_entries == MAX_IRQ_SOURCES) +		panic("Max # of irq sources exceeded!!\n"); +} + +/* parse all the mtimer info to a static mtimer array */ +static int __init sfi_parse_mtmr(struct sfi_table_header *table) +{ +	struct sfi_table_simple *sb; +	struct sfi_timer_table_entry *pentry; +	struct mpc_intsrc mp_irq; +	int totallen; + +	sb = (struct sfi_table_simple *)table; +	if (!sfi_mtimer_num) { +		sfi_mtimer_num = SFI_GET_NUM_ENTRIES(sb, +					struct sfi_timer_table_entry); +		pentry = (struct sfi_timer_table_entry *) sb->pentry; +		totallen = sfi_mtimer_num * sizeof(*pentry); +		memcpy(sfi_mtimer_array, pentry, totallen); +	} + +	printk(KERN_INFO "SFI: MTIMER info (num = %d):\n", sfi_mtimer_num); +	pentry = sfi_mtimer_array; +	for (totallen = 0; totallen < sfi_mtimer_num; totallen++, pentry++) { +		printk(KERN_INFO "timer[%d]: paddr = 0x%08x, freq = %dHz," +			" irq = %d\n", totallen, (u32)pentry->phys_addr, +			pentry->freq_hz, pentry->irq); +			if (!pentry->irq) +				continue; +			mp_irq.type = MP_IOAPIC; +			mp_irq.irqtype = mp_INT; +/* triggering mode edge bit 2-3, active high polarity bit 0-1 */ +			mp_irq.irqflag = 5; +			mp_irq.srcbus = 0; +			mp_irq.srcbusirq = pentry->irq;	/* IRQ */ +			mp_irq.dstapic = MP_APIC_ALL; +			mp_irq.dstirq = pentry->irq; +			save_mp_irq(&mp_irq); +	} + +	return 0; +} + +struct sfi_timer_table_entry *sfi_get_mtmr(int hint) +{ +	int i; +	if (hint < sfi_mtimer_num) { +		if (!sfi_mtimer_usage[hint]) { +			pr_debug("hint taken for timer %d irq %d\n",\ +				hint, sfi_mtimer_array[hint].irq); +			sfi_mtimer_usage[hint] = 1; +			return &sfi_mtimer_array[hint]; +		} +	} +	/* take the first timer available */ +	for (i = 0; i < sfi_mtimer_num;) { +		if (!sfi_mtimer_usage[i]) { +			sfi_mtimer_usage[i] = 1; +			return &sfi_mtimer_array[i]; +		} +		i++; +	} +	return NULL; +} + +void sfi_free_mtmr(struct sfi_timer_table_entry *mtmr) +{ +	int i; +	for (i = 0; i < sfi_mtimer_num;) { +		if (mtmr->irq == sfi_mtimer_array[i].irq) { +			sfi_mtimer_usage[i] = 0; +			return; +		} +		i++; +	} +} + +/* parse all the mrtc info to a global mrtc array */ +int __init sfi_parse_mrtc(struct sfi_table_header *table) +{ +	struct sfi_table_simple *sb; +	struct sfi_rtc_table_entry *pentry; +	struct mpc_intsrc mp_irq; + +	int totallen; + +	sb = (struct sfi_table_simple *)table; +	if (!sfi_mrtc_num) { +		sfi_mrtc_num = SFI_GET_NUM_ENTRIES(sb, +						struct sfi_rtc_table_entry); +		
pentry = (struct sfi_rtc_table_entry *)sb->pentry; +		totallen = sfi_mrtc_num * sizeof(*pentry); +		memcpy(sfi_mrtc_array, pentry, totallen); +	} + +	printk(KERN_INFO "SFI: RTC info (num = %d):\n", sfi_mrtc_num); +	pentry = sfi_mrtc_array; +	for (totallen = 0; totallen < sfi_mrtc_num; totallen++, pentry++) { +		printk(KERN_INFO "RTC[%d]: paddr = 0x%08x, irq = %d\n", +			totallen, (u32)pentry->phys_addr, pentry->irq); +		mp_irq.type = MP_IOAPIC; +		mp_irq.irqtype = mp_INT; +		mp_irq.irqflag = 0; +		mp_irq.srcbus = 0; +		mp_irq.srcbusirq = pentry->irq;	/* IRQ */ +		mp_irq.dstapic = MP_APIC_ALL; +		mp_irq.dstirq = pentry->irq; +		save_mp_irq(&mp_irq); +	} +	return 0; +} + +/* + * The secondary clock in Moorestown can be the APBT or the LAPIC clock; it + * defaults to the APBT, but a cmdline option can override it. + */ +static void __cpuinit mrst_setup_secondary_clock(void) +{ +	/* restore default lapic clock if disabled by cmdline */ +	if (disable_apbt_percpu) +		return setup_secondary_APIC_clock(); +	apbt_setup_secondary_clock(); +} + +static unsigned long __init mrst_calibrate_tsc(void) +{ +	unsigned long flags, fast_calibrate; + +	local_irq_save(flags); +	fast_calibrate = apbt_quick_calibrate(); +	local_irq_restore(flags); + +	if (fast_calibrate) +		return fast_calibrate; + +	return 0; +} + +void __init mrst_time_init(void) +{ +	sfi_table_parse(SFI_SIG_MTMR, NULL, NULL, sfi_parse_mtmr); +	pre_init_apic_IRQ0(); +	apbt_time_init(); +} + +void __init mrst_rtc_init(void) +{ +	sfi_table_parse(SFI_SIG_MRTC, NULL, NULL, sfi_parse_mrtc); +} + +/* + * If we use the per-cpu APB timer, the boot clock is already set up. If we + * use the LAPIC timer and one APBT timer for broadcast, we need to set up + * the LAPIC boot clock. + */ +static void __init mrst_setup_boot_clock(void) +{ +	pr_info("%s: per cpu apbt flag %d \n", __func__, disable_apbt_percpu); +	if (disable_apbt_percpu) +		setup_boot_APIC_clock(); +};  /*   * Moorestown specific x86_init function overrides and early setup @@ -21,4 +224,17 @@ void __init x86_mrst_early_setup(void)  {  	x86_init.resources.probe_roms = x86_init_noop;  	x86_init.resources.reserve_resources = x86_init_noop; + +	x86_init.timers.timer_init = mrst_time_init; +	x86_init.timers.setup_percpu_clockev = mrst_setup_boot_clock; + +	x86_init.irqs.pre_vector_init = x86_init_noop; + +	x86_cpuinit.setup_percpu_clockev = mrst_setup_secondary_clock; + +	x86_platform.calibrate_tsc = mrst_calibrate_tsc; +	x86_init.pci.init = pci_mrst_init; +	x86_init.pci.fixup_irqs = x86_init_noop; + +	legacy_pic = &null_legacy_pic;  } diff --git a/arch/x86/kernel/olpc.c b/arch/x86/kernel/olpc.c index 9d1d263f786f..8297160c41b3 100644 --- a/arch/x86/kernel/olpc.c +++ b/arch/x86/kernel/olpc.c @@ -17,7 +17,9 @@  #include <linux/spinlock.h>  #include <linux/io.h>  #include <linux/string.h> +  #include <asm/geode.h> +#include <asm/setup.h>  #include <asm/olpc.h>  #ifdef CONFIG_OPEN_FIRMWARE @@ -243,9 +245,11 @@ static int __init olpc_init(void)  	olpc_ec_cmd(EC_FIRMWARE_REV, NULL, 0,  			(unsigned char *) &olpc_platform_info.ecver, 1); -	/* check to see if the VSA exists */ -	if (cs5535_has_vsa2()) -		olpc_platform_info.flags |= OLPC_F_VSA; +#ifdef CONFIG_PCI_OLPC +	/* If the VSA exists, let it emulate PCI; if not, emulate in the kernel */ +	if (!cs5535_has_vsa2()) +		x86_init.pci.arch_init = pci_olpc_init; +#endif  	printk(KERN_INFO "OLPC board revision %s%X (EC=%x)\n",  			((olpc_platform_info.boardrev & 0xf) < 8) ? 
"pre" : "", diff --git a/arch/x86/kernel/pci-calgary_64.c b/arch/x86/kernel/pci-calgary_64.c index 2bbde6078143..fb99f7edb341 100644 --- a/arch/x86/kernel/pci-calgary_64.c +++ b/arch/x86/kernel/pci-calgary_64.c @@ -1309,7 +1309,7 @@ static void calgary_init_bitmap_from_tce_table(struct iommu_table *tbl)  /*   * get_tce_space_from_tar():   * Function for kdump case. Get the tce tables from first kernel - * by reading the contents of the base adress register of calgary iommu + * by reading the contents of the base address register of calgary iommu   */  static void __init get_tce_space_from_tar(void)  { diff --git a/arch/x86/kernel/pci-dma.c b/arch/x86/kernel/pci-dma.c index 1aa966c565f9..a4ac764a6880 100644 --- a/arch/x86/kernel/pci-dma.c +++ b/arch/x86/kernel/pci-dma.c @@ -38,7 +38,7 @@ int iommu_detected __read_mostly = 0;   * This variable becomes 1 if iommu=pt is passed on the kernel command line.   * If this variable is 1, IOMMU implementations do no DMA translation for   * devices and allow every device to access to whole physical memory. This is - * useful if a user want to use an IOMMU only for KVM device assignment to + * useful if a user wants to use an IOMMU only for KVM device assignment to   * guests and not for driver dma translation.   */  int iommu_pass_through __read_mostly; diff --git a/arch/x86/kernel/pci-gart_64.c b/arch/x86/kernel/pci-gart_64.c index 34de53b46f87..f3af115a573a 100644 --- a/arch/x86/kernel/pci-gart_64.c +++ b/arch/x86/kernel/pci-gart_64.c @@ -735,7 +735,7 @@ int __init gart_iommu_init(void)  	unsigned long scratch;  	long i; -	if (cache_k8_northbridges() < 0 || num_k8_northbridges == 0) +	if (num_k8_northbridges == 0)  		return 0;  #ifndef CONFIG_AGP_AMD64 diff --git a/arch/x86/kernel/process.c b/arch/x86/kernel/process.c index 02d678065d7d..28ad9f4d8b94 100644 --- a/arch/x86/kernel/process.c +++ b/arch/x86/kernel/process.c @@ -526,21 +526,37 @@ static int __cpuinit mwait_usable(const struct cpuinfo_x86 *c)  }  /* - * Check for AMD CPUs, which have potentially C1E support + * Check for AMD CPUs, where APIC timer interrupt does not wake up CPU from C1e. 
+ * For more information see + * - Erratum #400 for NPT family 0xf and family 0x10 CPUs + * - Erratum #365 for family 0x11 (not affected because C1e not in use)   */  static int __cpuinit check_c1e_idle(const struct cpuinfo_x86 *c)  { +	u64 val;  	if (c->x86_vendor != X86_VENDOR_AMD) -		return 0; - -	if (c->x86 < 0x0F) -		return 0; +		goto no_c1e_idle;  	/* Family 0x0f models < rev F do not have C1E */ -	if (c->x86 == 0x0f && c->x86_model < 0x40) -		return 0; +	if (c->x86 == 0x0F && c->x86_model >= 0x40) +		return 1; -	return 1; +	if (c->x86 == 0x10) { +		/* +		 * check OSVW bit for CPUs that are not affected +		 * by erratum #400 +		 */ +		rdmsrl(MSR_AMD64_OSVW_ID_LENGTH, val); +		if (val >= 2) { +			rdmsrl(MSR_AMD64_OSVW_STATUS, val); +			if (!(val & BIT(1))) +				goto no_c1e_idle; +		} +		return 1; +	} + +no_c1e_idle: +	return 0;  }  static cpumask_var_t c1e_mask; @@ -607,7 +623,7 @@ void __cpuinit select_idle_routine(const struct cpuinfo_x86 *c)  {  #ifdef CONFIG_SMP  	if (pm_idle == poll_idle && smp_num_siblings > 1) { -		printk(KERN_WARNING "WARNING: polling idle and HT enabled," +		printk_once(KERN_WARNING "WARNING: polling idle and HT enabled,"  			" performance may degrade.\n");  	}  #endif diff --git a/arch/x86/kernel/ptrace.c b/arch/x86/kernel/ptrace.c index 2d96aab82a48..a503b1fd04e5 100644 --- a/arch/x86/kernel/ptrace.c +++ b/arch/x86/kernel/ptrace.c @@ -581,7 +581,7 @@ ptrace_modify_breakpoint(struct perf_event *bp, int len, int type,  	struct perf_event_attr attr;  	/* -	 * We shoud have at least an inactive breakpoint at this +	 * We should have at least an inactive breakpoint at this  	 * slot. It means the user is writing dr7 without having  	 * written the address register first  	 */ diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c index 5d7ba1a449bd..d76e18570c60 100644 --- a/arch/x86/kernel/setup.c +++ b/arch/x86/kernel/setup.c @@ -314,16 +314,17 @@ static void __init reserve_brk(void)  #define MAX_MAP_CHUNK	(NR_FIX_BTMAPS << PAGE_SHIFT)  static void __init relocate_initrd(void)  { - +	/* Assume only end is not page aligned */  	u64 ramdisk_image = boot_params.hdr.ramdisk_image;  	u64 ramdisk_size  = boot_params.hdr.ramdisk_size; +	u64 area_size     = PAGE_ALIGN(ramdisk_size);  	u64 end_of_lowmem = max_low_pfn_mapped << PAGE_SHIFT;  	u64 ramdisk_here;  	unsigned long slop, clen, mapaddr;  	char *p, *q;  	/* We need to move the initrd down into lowmem */ -	ramdisk_here = find_e820_area(0, end_of_lowmem, ramdisk_size, +	ramdisk_here = find_e820_area(0, end_of_lowmem, area_size,  					 PAGE_SIZE);  	if (ramdisk_here == -1ULL) @@ -332,7 +333,7 @@ static void __init relocate_initrd(void)  	/* Note: this includes all the lowmem currently occupied by  	   the initrd, we rely on that fact to keep the data intact. 
*/ -	reserve_early(ramdisk_here, ramdisk_here + ramdisk_size, +	reserve_early(ramdisk_here, ramdisk_here + area_size,  			 "NEW RAMDISK");  	initrd_start = ramdisk_here + PAGE_OFFSET;  	initrd_end   = initrd_start + ramdisk_size; @@ -376,9 +377,10 @@ static void __init relocate_initrd(void)  static void __init reserve_initrd(void)  { +	/* Assume only end is not page aligned */  	u64 ramdisk_image = boot_params.hdr.ramdisk_image;  	u64 ramdisk_size  = boot_params.hdr.ramdisk_size; -	u64 ramdisk_end   = ramdisk_image + ramdisk_size; +	u64 ramdisk_end   = PAGE_ALIGN(ramdisk_image + ramdisk_size);  	u64 end_of_lowmem = max_low_pfn_mapped << PAGE_SHIFT;  	if (!boot_params.hdr.type_of_loader || diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c index a435c76d714e..06d98ae5a802 100644 --- a/arch/x86/kernel/smpboot.c +++ b/arch/x86/kernel/smpboot.c @@ -48,6 +48,7 @@  #include <linux/err.h>  #include <linux/nmi.h>  #include <linux/tboot.h> +#include <linux/stackprotector.h>  #include <asm/acpi.h>  #include <asm/desc.h> @@ -67,6 +68,7 @@  #include <linux/mc146818rtc.h>  #include <asm/smpboot_hooks.h> +#include <asm/i8259.h>  #ifdef CONFIG_X86_32  u8 apicid_2_node[MAX_APICID]; @@ -245,7 +247,7 @@ static void __cpuinit smp_callin(void)  	/*  	 * Need to setup vector mappings before we enable interrupts.  	 */ -	__setup_vector_irq(smp_processor_id()); +	setup_vector_irq(smp_processor_id());  	/*  	 * Get our bogomips.  	 * @@ -291,9 +293,9 @@ notrace static void __cpuinit start_secondary(void *unused)  	check_tsc_sync_target();  	if (nmi_watchdog == NMI_IO_APIC) { -		disable_8259A_irq(0); +		legacy_pic->chip->mask(0);  		enable_NMI_through_LVT0(); -		enable_8259A_irq(0); +		legacy_pic->chip->unmask(0);  	}  #ifdef CONFIG_X86_32 @@ -329,6 +331,9 @@ notrace static void __cpuinit start_secondary(void *unused)  	/* enable local interrupts */  	local_irq_enable(); +	/* to prevent fake stack check failure in clock setup */ +	boot_init_stack_canary(); +  	x86_cpuinit.setup_percpu_clockev();  	wmb(); diff --git a/arch/x86/kernel/sys_i386_32.c b/arch/x86/kernel/sys_i386_32.c index dee1ff7cba58..196552bb412c 100644 --- a/arch/x86/kernel/sys_i386_32.c +++ b/arch/x86/kernel/sys_i386_32.c @@ -25,191 +25,6 @@  #include <asm/syscalls.h>  /* - * Perform the select(nd, in, out, ex, tv) and mmap() system - * calls. Linux/i386 didn't use to be able to handle more than - * 4 system call parameters, so these system calls used a memory - * block for parameter passing.. - */ - -struct mmap_arg_struct { -	unsigned long addr; -	unsigned long len; -	unsigned long prot; -	unsigned long flags; -	unsigned long fd; -	unsigned long offset; -}; - -asmlinkage int old_mmap(struct mmap_arg_struct __user *arg) -{ -	struct mmap_arg_struct a; -	int err = -EFAULT; - -	if (copy_from_user(&a, arg, sizeof(a))) -		goto out; - -	err = -EINVAL; -	if (a.offset & ~PAGE_MASK) -		goto out; - -	err = sys_mmap_pgoff(a.addr, a.len, a.prot, a.flags, -			a.fd, a.offset >> PAGE_SHIFT); -out: -	return err; -} - - -struct sel_arg_struct { -	unsigned long n; -	fd_set __user *inp, *outp, *exp; -	struct timeval __user *tvp; -}; - -asmlinkage int old_select(struct sel_arg_struct __user *arg) -{ -	struct sel_arg_struct a; - -	if (copy_from_user(&a, arg, sizeof(a))) -		return -EFAULT; -	/* sys_select() does the appropriate kernel locking */ -	return sys_select(a.n, a.inp, a.outp, a.exp, a.tvp); -} - -/* - * sys_ipc() is the de-multiplexer for the SysV IPC calls.. - * - * This is really horribly ugly. 
- */ -asmlinkage int sys_ipc(uint call, int first, int second, -			int third, void __user *ptr, long fifth) -{ -	int version, ret; - -	version = call >> 16; /* hack for backward compatibility */ -	call &= 0xffff; - -	switch (call) { -	case SEMOP: -		return sys_semtimedop(first, (struct sembuf __user *)ptr, second, NULL); -	case SEMTIMEDOP: -		return sys_semtimedop(first, (struct sembuf __user *)ptr, second, -					(const struct timespec __user *)fifth); - -	case SEMGET: -		return sys_semget(first, second, third); -	case SEMCTL: { -		union semun fourth; -		if (!ptr) -			return -EINVAL; -		if (get_user(fourth.__pad, (void __user * __user *) ptr)) -			return -EFAULT; -		return sys_semctl(first, second, third, fourth); -	} - -	case MSGSND: -		return sys_msgsnd(first, (struct msgbuf __user *) ptr, -				   second, third); -	case MSGRCV: -		switch (version) { -		case 0: { -			struct ipc_kludge tmp; -			if (!ptr) -				return -EINVAL; - -			if (copy_from_user(&tmp, -					   (struct ipc_kludge __user *) ptr, -					   sizeof(tmp))) -				return -EFAULT; -			return sys_msgrcv(first, tmp.msgp, second, -					   tmp.msgtyp, third); -		} -		default: -			return sys_msgrcv(first, -					   (struct msgbuf __user *) ptr, -					   second, fifth, third); -		} -	case MSGGET: -		return sys_msgget((key_t) first, second); -	case MSGCTL: -		return sys_msgctl(first, second, (struct msqid_ds __user *) ptr); - -	case SHMAT: -		switch (version) { -		default: { -			ulong raddr; -			ret = do_shmat(first, (char __user *) ptr, second, &raddr); -			if (ret) -				return ret; -			return put_user(raddr, (ulong __user *) third); -		} -		case 1:	/* iBCS2 emulator entry point */ -			if (!segment_eq(get_fs(), get_ds())) -				return -EINVAL; -			/* The "(ulong *) third" is valid _only_ because of the kernel segment thing */ -			return do_shmat(first, (char __user *) ptr, second, (ulong *) third); -		} -	case SHMDT: -		return sys_shmdt((char __user *)ptr); -	case SHMGET: -		return sys_shmget(first, second, third); -	case SHMCTL: -		return sys_shmctl(first, second, -				   (struct shmid_ds __user *) ptr); -	default: -		return -ENOSYS; -	} -} - -/* - * Old cruft - */ -asmlinkage int sys_uname(struct old_utsname __user *name) -{ -	int err; -	if (!name) -		return -EFAULT; -	down_read(&uts_sem); -	err = copy_to_user(name, utsname(), sizeof(*name)); -	up_read(&uts_sem); -	return err? -EFAULT:0; -} - -asmlinkage int sys_olduname(struct oldold_utsname __user *name) -{ -	int error; - -	if (!name) -		return -EFAULT; -	if (!access_ok(VERIFY_WRITE, name, sizeof(struct oldold_utsname))) -		return -EFAULT; - -	down_read(&uts_sem); - -	error = __copy_to_user(&name->sysname, &utsname()->sysname, -			       __OLD_UTS_LEN); -	error |= __put_user(0, name->sysname + __OLD_UTS_LEN); -	error |= __copy_to_user(&name->nodename, &utsname()->nodename, -				__OLD_UTS_LEN); -	error |= __put_user(0, name->nodename + __OLD_UTS_LEN); -	error |= __copy_to_user(&name->release, &utsname()->release, -				__OLD_UTS_LEN); -	error |= __put_user(0, name->release + __OLD_UTS_LEN); -	error |= __copy_to_user(&name->version, &utsname()->version, -				__OLD_UTS_LEN); -	error |= __put_user(0, name->version + __OLD_UTS_LEN); -	error |= __copy_to_user(&name->machine, &utsname()->machine, -				__OLD_UTS_LEN); -	error |= __put_user(0, name->machine + __OLD_UTS_LEN); - -	up_read(&uts_sem); - -	error = error ? -EFAULT : 0; - -	return error; -} - - -/*   * Do a system call from kernel instead of calling sys_execve so we   * end up with proper pt_regs.   
*/ diff --git a/arch/x86/kernel/sys_x86_64.c b/arch/x86/kernel/sys_x86_64.c index 8aa2057efd12..ff14a5044ce6 100644 --- a/arch/x86/kernel/sys_x86_64.c +++ b/arch/x86/kernel/sys_x86_64.c @@ -209,15 +209,3 @@ bottomup:  	return addr;  } - - -SYSCALL_DEFINE1(uname, struct new_utsname __user *, name) -{ -	int err; -	down_read(&uts_sem); -	err = copy_to_user(name, utsname(), sizeof(*name)); -	up_read(&uts_sem); -	if (personality(current->personality) == PER_LINUX32) -		err |= copy_to_user(&name->machine, "i686", 5); -	return err ? -EFAULT : 0; -} diff --git a/arch/x86/kernel/syscall_table_32.S b/arch/x86/kernel/syscall_table_32.S index 15228b5d3eb7..8b3729341216 100644 --- a/arch/x86/kernel/syscall_table_32.S +++ b/arch/x86/kernel/syscall_table_32.S @@ -81,7 +81,7 @@ ENTRY(sys_call_table)  	.long sys_settimeofday  	.long sys_getgroups16	/* 80 */  	.long sys_setgroups16 -	.long old_select +	.long sys_old_select  	.long sys_symlink  	.long sys_lstat  	.long sys_readlink	/* 85 */ @@ -89,7 +89,7 @@ ENTRY(sys_call_table)  	.long sys_swapon  	.long sys_reboot  	.long sys_old_readdir -	.long old_mmap		/* 90 */ +	.long sys_old_mmap	/* 90 */  	.long sys_munmap  	.long sys_truncate  	.long sys_ftruncate diff --git a/arch/x86/kernel/tsc.c b/arch/x86/kernel/tsc.c index 208a857c679f..9faf91ae1841 100644 --- a/arch/x86/kernel/tsc.c +++ b/arch/x86/kernel/tsc.c @@ -50,7 +50,7 @@ u64 native_sched_clock(void)  	 *   unstable. We do this because unlike Time Of Day,  	 *   the scheduler clock tolerates small errors and it's  	 *   very important for it to be as fast as the platform -	 *   can achive it. ) +	 *   can achieve it. )  	 */  	if (unlikely(tsc_disabled)) {  		/* No locking but a rare wrong value is not a big deal: */ diff --git a/arch/x86/kernel/visws_quirks.c b/arch/x86/kernel/visws_quirks.c index ab38ce0984fa..e680ea52db9b 100644 --- a/arch/x86/kernel/visws_quirks.c +++ b/arch/x86/kernel/visws_quirks.c @@ -49,11 +49,6 @@ extern int no_broadcast;  char visws_board_type	= -1;  char visws_board_rev	= -1; -int is_visws_box(void) -{ -	return visws_board_type >= 0; -} -  static void __init visws_time_init(void)  {  	printk(KERN_INFO "Starting Cobalt Timer system clock\n"); @@ -242,6 +237,8 @@ void __init visws_early_detect(void)  	x86_init.irqs.pre_vector_init = visws_pre_intr_init;  	x86_init.irqs.trap_init = visws_trap_init;  	x86_init.timers.timer_init = visws_time_init; +	x86_init.pci.init = pci_visws_init; +	x86_init.pci.init_irq = x86_init_noop;  	/*  	 * Install reboot quirks: @@ -508,7 +505,7 @@ static struct irq_chip cobalt_irq_type = {   */  static unsigned int startup_piix4_master_irq(unsigned int irq)  { -	init_8259A(0); +	legacy_pic->init(0);  	return startup_cobalt_irq(irq);  } @@ -532,9 +529,6 @@ static struct irq_chip piix4_master_irq_type = {  static struct irq_chip piix4_virtual_irq_type = {  	.name =		"PIIX4-virtual", -	.shutdown =	disable_8259A_irq, -	.enable =	enable_8259A_irq, -	.disable =	disable_8259A_irq,  }; @@ -609,7 +603,7 @@ static irqreturn_t piix4_master_intr(int irq, void *dev_id)  		handle_IRQ_event(realirq, desc->action);  	if (!(desc->status & IRQ_DISABLED)) -		enable_8259A_irq(realirq); +		legacy_pic->chip->unmask(realirq);  	return IRQ_HANDLED; @@ -628,6 +622,12 @@ static struct irqaction cascade_action = {  	.name =		"cascade",  }; +static inline void set_piix4_virtual_irq_type(void) +{ +	piix4_virtual_irq_type.shutdown = i8259A_chip.mask; +	piix4_virtual_irq_type.enable =	i8259A_chip.unmask; +	piix4_virtual_irq_type.disable = i8259A_chip.mask; +}  void 
init_VISWS_APIC_irqs(void)  { @@ -653,6 +653,7 @@ void init_VISWS_APIC_irqs(void)  			desc->chip = &piix4_master_irq_type;  		}  		else if (i < CO_IRQ_APIC0) { +			set_piix4_virtual_irq_type();  			desc->chip = &piix4_virtual_irq_type;  		}  		else if (IS_CO_APIC(i)) { diff --git a/arch/x86/kernel/vmiclock_32.c b/arch/x86/kernel/vmiclock_32.c index 2f1ca5614292..5e1ff66ecd73 100644 --- a/arch/x86/kernel/vmiclock_32.c +++ b/arch/x86/kernel/vmiclock_32.c @@ -167,7 +167,7 @@ static int vmi_timer_next_event(unsigned long delta,  {  	/* Unfortunately, set_next_event interface only passes relative  	 * expiry, but we want absolute expiry.  It'd be better if were -	 * were passed an aboslute expiry, since a bunch of time may +	 * were passed an absolute expiry, since a bunch of time may  	 * have been stolen between the time the delta is computed and  	 * when we set the alarm below. */  	cycle_t now = vmi_timer_ops.get_cycle_counter(vmi_counter(VMI_ONESHOT)); diff --git a/arch/x86/kernel/vmlinux.lds.S b/arch/x86/kernel/vmlinux.lds.S index 44879df55696..2cc249718c46 100644 --- a/arch/x86/kernel/vmlinux.lds.S +++ b/arch/x86/kernel/vmlinux.lds.S @@ -291,8 +291,8 @@ SECTIONS  	.smp_locks : AT(ADDR(.smp_locks) - LOAD_OFFSET) {  		__smp_locks = .;  		*(.smp_locks) -		__smp_locks_end = .;  		. = ALIGN(PAGE_SIZE); +		__smp_locks_end = .;  	}  #ifdef CONFIG_X86_64 diff --git a/arch/x86/kernel/vsyscall_64.c b/arch/x86/kernel/vsyscall_64.c index 9055e5872ff0..1c0c6ab9c60f 100644 --- a/arch/x86/kernel/vsyscall_64.c +++ b/arch/x86/kernel/vsyscall_64.c @@ -301,7 +301,8 @@ static int __init vsyscall_init(void)  	register_sysctl_table(kernel_root_table2);  #endif  	on_each_cpu(cpu_vsyscall_init, NULL, 1); -	hotcpu_notifier(cpu_vsyscall_notifier, 0); +	/* notifier priority > KVM */ +	hotcpu_notifier(cpu_vsyscall_notifier, 30);  	return 0;  } diff --git a/arch/x86/kernel/x86_init.c b/arch/x86/kernel/x86_init.c index ee5746c94628..61a1e8c7e19f 100644 --- a/arch/x86/kernel/x86_init.c +++ b/arch/x86/kernel/x86_init.c @@ -4,9 +4,11 @@   *  For licencing details see kernel-base/COPYING   */  #include <linux/init.h> +#include <linux/ioport.h>  #include <asm/bios_ebda.h>  #include <asm/paravirt.h> +#include <asm/pci_x86.h>  #include <asm/mpspec.h>  #include <asm/setup.h>  #include <asm/apic.h> @@ -70,6 +72,12 @@ struct x86_init_ops x86_init __initdata = {  	.iommu = {  		.iommu_init		= iommu_init_noop,  	}, + +	.pci = { +		.init			= x86_default_pci_init, +		.init_irq		= x86_default_pci_init_irq, +		.fixup_irqs		= x86_default_pci_fixup_irqs, +	},  };  struct x86_cpuinit_ops x86_cpuinit __cpuinitdata = { diff --git a/arch/x86/kvm/Kconfig b/arch/x86/kvm/Kconfig index 3c4d0109ad20..970bbd479516 100644 --- a/arch/x86/kvm/Kconfig +++ b/arch/x86/kvm/Kconfig @@ -29,6 +29,7 @@ config KVM  	select HAVE_KVM_EVENTFD  	select KVM_APIC_ARCHITECTURE  	select USER_RETURN_NOTIFIER +	select KVM_MMIO  	---help---  	  Support hosting fully virtualized guest machines using hardware  	  virtualization extensions.  You will need a fairly recent diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c index 7e8faea4651e..4dade6ac0827 100644 --- a/arch/x86/kvm/emulate.c +++ b/arch/x86/kvm/emulate.c @@ -32,7 +32,7 @@  #include <linux/module.h>  #include <asm/kvm_emulate.h> -#include "mmu.h"		/* for is_long_mode() */ +#include "x86.h"  /*   * Opcode effective-address decode tables. 
@@ -76,6 +76,8 @@  #define GroupDual   (1<<15)     /* Alternate decoding of mod == 3 */  #define GroupMask   0xff        /* Group number stored in bits 0:7 */  /* Misc flags */ +#define Lock        (1<<26) /* lock prefix is allowed for the instruction */ +#define Priv        (1<<27) /* instruction generates #GP if current CPL != 0 */  #define No64	    (1<<28)  /* Source 2 operand type */  #define Src2None    (0<<29) @@ -88,39 +90,40 @@  enum {  	Group1_80, Group1_81, Group1_82, Group1_83,  	Group1A, Group3_Byte, Group3, Group4, Group5, Group7, +	Group8, Group9,  };  static u32 opcode_table[256] = {  	/* 0x00 - 0x07 */ -	ByteOp | DstMem | SrcReg | ModRM, DstMem | SrcReg | ModRM, +	ByteOp | DstMem | SrcReg | ModRM | Lock, DstMem | SrcReg | ModRM | Lock,  	ByteOp | DstReg | SrcMem | ModRM, DstReg | SrcMem | ModRM,  	ByteOp | DstAcc | SrcImm, DstAcc | SrcImm,  	ImplicitOps | Stack | No64, ImplicitOps | Stack | No64,  	/* 0x08 - 0x0F */ -	ByteOp | DstMem | SrcReg | ModRM, DstMem | SrcReg | ModRM, +	ByteOp | DstMem | SrcReg | ModRM | Lock, DstMem | SrcReg | ModRM | Lock,  	ByteOp | DstReg | SrcMem | ModRM, DstReg | SrcMem | ModRM,  	ByteOp | DstAcc | SrcImm, DstAcc | SrcImm,  	ImplicitOps | Stack | No64, 0,  	/* 0x10 - 0x17 */ -	ByteOp | DstMem | SrcReg | ModRM, DstMem | SrcReg | ModRM, +	ByteOp | DstMem | SrcReg | ModRM | Lock, DstMem | SrcReg | ModRM | Lock,  	ByteOp | DstReg | SrcMem | ModRM, DstReg | SrcMem | ModRM,  	ByteOp | DstAcc | SrcImm, DstAcc | SrcImm,  	ImplicitOps | Stack | No64, ImplicitOps | Stack | No64,  	/* 0x18 - 0x1F */ -	ByteOp | DstMem | SrcReg | ModRM, DstMem | SrcReg | ModRM, +	ByteOp | DstMem | SrcReg | ModRM | Lock, DstMem | SrcReg | ModRM | Lock,  	ByteOp | DstReg | SrcMem | ModRM, DstReg | SrcMem | ModRM,  	ByteOp | DstAcc | SrcImm, DstAcc | SrcImm,  	ImplicitOps | Stack | No64, ImplicitOps | Stack | No64,  	/* 0x20 - 0x27 */ -	ByteOp | DstMem | SrcReg | ModRM, DstMem | SrcReg | ModRM, +	ByteOp | DstMem | SrcReg | ModRM | Lock, DstMem | SrcReg | ModRM | Lock,  	ByteOp | DstReg | SrcMem | ModRM, DstReg | SrcMem | ModRM,  	DstAcc | SrcImmByte, DstAcc | SrcImm, 0, 0,  	/* 0x28 - 0x2F */ -	ByteOp | DstMem | SrcReg | ModRM, DstMem | SrcReg | ModRM, +	ByteOp | DstMem | SrcReg | ModRM | Lock, DstMem | SrcReg | ModRM | Lock,  	ByteOp | DstReg | SrcMem | ModRM, DstReg | SrcMem | ModRM,  	0, 0, 0, 0,  	/* 0x30 - 0x37 */ -	ByteOp | DstMem | SrcReg | ModRM, DstMem | SrcReg | ModRM, +	ByteOp | DstMem | SrcReg | ModRM | Lock, DstMem | SrcReg | ModRM | Lock,  	ByteOp | DstReg | SrcMem | ModRM, DstReg | SrcMem | ModRM,  	0, 0, 0, 0,  	/* 0x38 - 0x3F */ @@ -156,7 +159,7 @@ static u32 opcode_table[256] = {  	Group | Group1_80, Group | Group1_81,  	Group | Group1_82, Group | Group1_83,  	ByteOp | DstMem | SrcReg | ModRM, DstMem | SrcReg | ModRM, -	ByteOp | DstMem | SrcReg | ModRM, DstMem | SrcReg | ModRM, +	ByteOp | DstMem | SrcReg | ModRM | Lock, DstMem | SrcReg | ModRM | Lock,  	/* 0x88 - 0x8F */  	ByteOp | DstMem | SrcReg | ModRM | Mov, DstMem | SrcReg | ModRM | Mov,  	ByteOp | DstReg | SrcMem | ModRM | Mov, DstReg | SrcMem | ModRM | Mov, @@ -210,7 +213,7 @@ static u32 opcode_table[256] = {  	SrcNone | ByteOp | ImplicitOps, SrcNone | ImplicitOps,  	/* 0xF0 - 0xF7 */  	0, 0, 0, 0, -	ImplicitOps, ImplicitOps, Group | Group3_Byte, Group | Group3, +	ImplicitOps | Priv, ImplicitOps, Group | Group3_Byte, Group | Group3,  	/* 0xF8 - 0xFF */  	ImplicitOps, 0, ImplicitOps, ImplicitOps,  	ImplicitOps, ImplicitOps, Group | Group4, Group | Group5, @@ -218,16 +221,20 @@ static u32 
opcode_table[256] = {  static u32 twobyte_table[256] = {  	/* 0x00 - 0x0F */ -	0, Group | GroupDual | Group7, 0, 0, 0, ImplicitOps, ImplicitOps, 0, -	ImplicitOps, ImplicitOps, 0, 0, 0, ImplicitOps | ModRM, 0, 0, +	0, Group | GroupDual | Group7, 0, 0, +	0, ImplicitOps, ImplicitOps | Priv, 0, +	ImplicitOps | Priv, ImplicitOps | Priv, 0, 0, +	0, ImplicitOps | ModRM, 0, 0,  	/* 0x10 - 0x1F */  	0, 0, 0, 0, 0, 0, 0, 0, ImplicitOps | ModRM, 0, 0, 0, 0, 0, 0, 0,  	/* 0x20 - 0x2F */ -	ModRM | ImplicitOps, ModRM, ModRM | ImplicitOps, ModRM, 0, 0, 0, 0, +	ModRM | ImplicitOps | Priv, ModRM | Priv, +	ModRM | ImplicitOps | Priv, ModRM | Priv, +	0, 0, 0, 0,  	0, 0, 0, 0, 0, 0, 0, 0,  	/* 0x30 - 0x3F */ -	ImplicitOps, 0, ImplicitOps, 0, -	ImplicitOps, ImplicitOps, 0, 0, +	ImplicitOps | Priv, 0, ImplicitOps | Priv, 0, +	ImplicitOps, ImplicitOps | Priv, 0, 0,  	0, 0, 0, 0, 0, 0, 0, 0,  	/* 0x40 - 0x47 */  	DstReg | SrcMem | ModRM | Mov, DstReg | SrcMem | ModRM | Mov, @@ -257,21 +264,23 @@ static u32 twobyte_table[256] = {  	DstMem | SrcReg | Src2CL | ModRM, 0, 0,  	/* 0xA8 - 0xAF */  	ImplicitOps | Stack, ImplicitOps | Stack, -	0, DstMem | SrcReg | ModRM | BitOp, +	0, DstMem | SrcReg | ModRM | BitOp | Lock,  	DstMem | SrcReg | Src2ImmByte | ModRM,  	DstMem | SrcReg | Src2CL | ModRM,  	ModRM, 0,  	/* 0xB0 - 0xB7 */ -	ByteOp | DstMem | SrcReg | ModRM, DstMem | SrcReg | ModRM, 0, -	    DstMem | SrcReg | ModRM | BitOp, +	ByteOp | DstMem | SrcReg | ModRM | Lock, DstMem | SrcReg | ModRM | Lock, +	0, DstMem | SrcReg | ModRM | BitOp | Lock,  	0, 0, ByteOp | DstReg | SrcMem | ModRM | Mov,  	    DstReg | SrcMem16 | ModRM | Mov,  	/* 0xB8 - 0xBF */ -	0, 0, DstMem | SrcImmByte | ModRM, DstMem | SrcReg | ModRM | BitOp, +	0, 0, +	Group | Group8, DstMem | SrcReg | ModRM | BitOp | Lock,  	0, 0, ByteOp | DstReg | SrcMem | ModRM | Mov,  	    DstReg | SrcMem16 | ModRM | Mov,  	/* 0xC0 - 0xCF */ -	0, 0, 0, DstMem | SrcReg | ModRM | Mov, 0, 0, 0, ImplicitOps | ModRM, +	0, 0, 0, DstMem | SrcReg | ModRM | Mov, +	0, 0, 0, Group | GroupDual | Group9,  	0, 0, 0, 0, 0, 0, 0, 0,  	/* 0xD0 - 0xDF */  	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, @@ -283,25 +292,41 @@ static u32 twobyte_table[256] = {  static u32 group_table[] = {  	[Group1_80*8] = -	ByteOp | DstMem | SrcImm | ModRM, ByteOp | DstMem | SrcImm | ModRM, -	ByteOp | DstMem | SrcImm | ModRM, ByteOp | DstMem | SrcImm | ModRM, -	ByteOp | DstMem | SrcImm | ModRM, ByteOp | DstMem | SrcImm | ModRM, -	ByteOp | DstMem | SrcImm | ModRM, ByteOp | DstMem | SrcImm | ModRM, +	ByteOp | DstMem | SrcImm | ModRM | Lock, +	ByteOp | DstMem | SrcImm | ModRM | Lock, +	ByteOp | DstMem | SrcImm | ModRM | Lock, +	ByteOp | DstMem | SrcImm | ModRM | Lock, +	ByteOp | DstMem | SrcImm | ModRM | Lock, +	ByteOp | DstMem | SrcImm | ModRM | Lock, +	ByteOp | DstMem | SrcImm | ModRM | Lock, +	ByteOp | DstMem | SrcImm | ModRM,  	[Group1_81*8] = -	DstMem | SrcImm | ModRM, DstMem | SrcImm | ModRM, -	DstMem | SrcImm | ModRM, DstMem | SrcImm | ModRM, -	DstMem | SrcImm | ModRM, DstMem | SrcImm | ModRM, -	DstMem | SrcImm | ModRM, DstMem | SrcImm | ModRM, +	DstMem | SrcImm | ModRM | Lock, +	DstMem | SrcImm | ModRM | Lock, +	DstMem | SrcImm | ModRM | Lock, +	DstMem | SrcImm | ModRM | Lock, +	DstMem | SrcImm | ModRM | Lock, +	DstMem | SrcImm | ModRM | Lock, +	DstMem | SrcImm | ModRM | Lock, +	DstMem | SrcImm | ModRM,  	[Group1_82*8] = -	ByteOp | DstMem | SrcImm | ModRM, ByteOp | DstMem | SrcImm | ModRM, -	ByteOp | DstMem | SrcImm | ModRM, ByteOp | DstMem | SrcImm | ModRM, -	ByteOp | DstMem | SrcImm | ModRM, ByteOp 
| DstMem | SrcImm | ModRM, -	ByteOp | DstMem | SrcImm | ModRM, ByteOp | DstMem | SrcImm | ModRM, +	ByteOp | DstMem | SrcImm | ModRM | No64 | Lock, +	ByteOp | DstMem | SrcImm | ModRM | No64 | Lock, +	ByteOp | DstMem | SrcImm | ModRM | No64 | Lock, +	ByteOp | DstMem | SrcImm | ModRM | No64 | Lock, +	ByteOp | DstMem | SrcImm | ModRM | No64 | Lock, +	ByteOp | DstMem | SrcImm | ModRM | No64 | Lock, +	ByteOp | DstMem | SrcImm | ModRM | No64 | Lock, +	ByteOp | DstMem | SrcImm | ModRM | No64,  	[Group1_83*8] = -	DstMem | SrcImmByte | ModRM, DstMem | SrcImmByte | ModRM, -	DstMem | SrcImmByte | ModRM, DstMem | SrcImmByte | ModRM, -	DstMem | SrcImmByte | ModRM, DstMem | SrcImmByte | ModRM, -	DstMem | SrcImmByte | ModRM, DstMem | SrcImmByte | ModRM, +	DstMem | SrcImmByte | ModRM | Lock, +	DstMem | SrcImmByte | ModRM | Lock, +	DstMem | SrcImmByte | ModRM | Lock, +	DstMem | SrcImmByte | ModRM | Lock, +	DstMem | SrcImmByte | ModRM | Lock, +	DstMem | SrcImmByte | ModRM | Lock, +	DstMem | SrcImmByte | ModRM | Lock, +	DstMem | SrcImmByte | ModRM,  	[Group1A*8] =  	DstMem | SrcNone | ModRM | Mov | Stack, 0, 0, 0, 0, 0, 0, 0,  	[Group3_Byte*8] = @@ -320,24 +345,39 @@ static u32 group_table[] = {  	SrcMem | ModRM | Stack, 0,  	SrcMem | ModRM | Stack, 0, SrcMem | ModRM | Stack, 0,  	[Group7*8] = -	0, 0, ModRM | SrcMem, ModRM | SrcMem, +	0, 0, ModRM | SrcMem | Priv, ModRM | SrcMem | Priv,  	SrcNone | ModRM | DstMem | Mov, 0, -	SrcMem16 | ModRM | Mov, SrcMem | ModRM | ByteOp, +	SrcMem16 | ModRM | Mov | Priv, SrcMem | ModRM | ByteOp | Priv, +	[Group8*8] = +	0, 0, 0, 0, +	DstMem | SrcImmByte | ModRM, DstMem | SrcImmByte | ModRM | Lock, +	DstMem | SrcImmByte | ModRM | Lock, DstMem | SrcImmByte | ModRM | Lock, +	[Group9*8] = +	0, ImplicitOps | ModRM | Lock, 0, 0, 0, 0, 0, 0,  };  static u32 group2_table[] = {  	[Group7*8] = -	SrcNone | ModRM, 0, 0, SrcNone | ModRM, +	SrcNone | ModRM | Priv, 0, 0, SrcNone | ModRM,  	SrcNone | ModRM | DstMem | Mov, 0,  	SrcMem16 | ModRM | Mov, 0, +	[Group9*8] = +	0, 0, 0, 0, 0, 0, 0, 0,  };  /* EFLAGS bit definitions. 
*/ +#define EFLG_ID (1<<21) +#define EFLG_VIP (1<<20) +#define EFLG_VIF (1<<19) +#define EFLG_AC (1<<18)  #define EFLG_VM (1<<17)  #define EFLG_RF (1<<16) +#define EFLG_IOPL (3<<12) +#define EFLG_NT (1<<14)  #define EFLG_OF (1<<11)  #define EFLG_DF (1<<10)  #define EFLG_IF (1<<9) +#define EFLG_TF (1<<8)  #define EFLG_SF (1<<7)  #define EFLG_ZF (1<<6)  #define EFLG_AF (1<<4) @@ -606,7 +646,7 @@ static int do_fetch_insn_byte(struct x86_emulate_ctxt *ctxt,  	if (linear < fc->start || linear >= fc->end) {  		size = min(15UL, PAGE_SIZE - offset_in_page(linear)); -		rc = ops->read_std(linear, fc->data, size, ctxt->vcpu); +		rc = ops->fetch(linear, fc->data, size, ctxt->vcpu, NULL);  		if (rc)  			return rc;  		fc->start = linear; @@ -661,11 +701,11 @@ static int read_descriptor(struct x86_emulate_ctxt *ctxt,  		op_bytes = 3;  	*address = 0;  	rc = ops->read_std((unsigned long)ptr, (unsigned long *)size, 2, -			   ctxt->vcpu); +			   ctxt->vcpu, NULL);  	if (rc)  		return rc;  	rc = ops->read_std((unsigned long)ptr + 2, address, op_bytes, -			   ctxt->vcpu); +			   ctxt->vcpu, NULL);  	return rc;  } @@ -889,6 +929,7 @@ x86_decode_insn(struct x86_emulate_ctxt *ctxt, struct x86_emulate_ops *ops)  	switch (mode) {  	case X86EMUL_MODE_REAL: +	case X86EMUL_MODE_VM86:  	case X86EMUL_MODE_PROT16:  		def_op_bytes = def_ad_bytes = 2;  		break; @@ -975,7 +1016,7 @@ done_prefixes:  	}  	if (mode == X86EMUL_MODE_PROT64 && (c->d & No64)) { -		kvm_report_emulation_failure(ctxt->vcpu, "invalid x86/64 instruction");; +		kvm_report_emulation_failure(ctxt->vcpu, "invalid x86/64 instruction");  		return -1;  	} @@ -1196,13 +1237,56 @@ static int emulate_pop(struct x86_emulate_ctxt *ctxt,  	rc = ops->read_emulated(register_address(c, ss_base(ctxt),  						 c->regs[VCPU_REGS_RSP]),  				dest, len, ctxt->vcpu); -	if (rc != 0) +	if (rc != X86EMUL_CONTINUE)  		return rc;  	register_address_increment(c, &c->regs[VCPU_REGS_RSP], len);  	return rc;  } +static int emulate_popf(struct x86_emulate_ctxt *ctxt, +		       struct x86_emulate_ops *ops, +		       void *dest, int len) +{ +	int rc; +	unsigned long val, change_mask; +	int iopl = (ctxt->eflags & X86_EFLAGS_IOPL) >> IOPL_SHIFT; +	int cpl = kvm_x86_ops->get_cpl(ctxt->vcpu); + +	rc = emulate_pop(ctxt, ops, &val, len); +	if (rc != X86EMUL_CONTINUE) +		return rc; + +	change_mask = EFLG_CF | EFLG_PF | EFLG_AF | EFLG_ZF | EFLG_SF | EFLG_OF +		| EFLG_TF | EFLG_DF | EFLG_NT | EFLG_RF | EFLG_AC | EFLG_ID; + +	switch(ctxt->mode) { +	case X86EMUL_MODE_PROT64: +	case X86EMUL_MODE_PROT32: +	case X86EMUL_MODE_PROT16: +		if (cpl == 0) +			change_mask |= EFLG_IOPL; +		if (cpl <= iopl) +			change_mask |= EFLG_IF; +		break; +	case X86EMUL_MODE_VM86: +		if (iopl < 3) { +			kvm_inject_gp(ctxt->vcpu, 0); +			return X86EMUL_PROPAGATE_FAULT; +		} +		change_mask |= EFLG_IF; +		break; +	default: /* real mode */ +		change_mask |= (EFLG_IOPL | EFLG_IF); +		break; +	} + +	*(unsigned long *)dest = +		(ctxt->eflags & ~change_mask) | (val & change_mask); + +	return rc; +} +  static void emulate_push_sreg(struct x86_emulate_ctxt *ctxt, int seg)  {  	struct decode_cache *c = &ctxt->decode; @@ -1225,7 +1309,7 @@ static int emulate_pop_sreg(struct x86_emulate_ctxt *ctxt,  	if (rc != 0)  		return rc; -	rc = kvm_load_segment_descriptor(ctxt->vcpu, (u16)selector, 1, seg); +	rc = kvm_load_segment_descriptor(ctxt->vcpu, (u16)selector, seg);  	return rc;  } @@ -1370,7 +1454,7 @@ static inline int emulate_grp9(struct x86_emulate_ctxt *ctxt,  	int rc;  	rc = ops->read_emulated(memop, &old, 8, ctxt->vcpu); -	if 
(rc != 0) +	if (rc != X86EMUL_CONTINUE)  		return rc;  	if (((u32) (old >> 0) != (u32) c->regs[VCPU_REGS_RAX]) || @@ -1385,7 +1469,7 @@ static inline int emulate_grp9(struct x86_emulate_ctxt *ctxt,  		       (u32) c->regs[VCPU_REGS_RBX];  		rc = ops->cmpxchg_emulated(memop, &old, &new, 8, ctxt->vcpu); -		if (rc != 0) +		if (rc != X86EMUL_CONTINUE)  			return rc;  		ctxt->eflags |= EFLG_ZF;  	} @@ -1407,7 +1491,7 @@ static int emulate_ret_far(struct x86_emulate_ctxt *ctxt,  	rc = emulate_pop(ctxt, ops, &cs, c->op_bytes);  	if (rc)  		return rc; -	rc = kvm_load_segment_descriptor(ctxt->vcpu, (u16)cs, 1, VCPU_SREG_CS); +	rc = kvm_load_segment_descriptor(ctxt->vcpu, (u16)cs, VCPU_SREG_CS);  	return rc;  } @@ -1451,7 +1535,7 @@ static inline int writeback(struct x86_emulate_ctxt *ctxt,  					&c->dst.val,  					c->dst.bytes,  					ctxt->vcpu); -		if (rc != 0) +		if (rc != X86EMUL_CONTINUE)  			return rc;  		break;  	case OP_NONE: @@ -1514,9 +1598,8 @@ emulate_syscall(struct x86_emulate_ctxt *ctxt)  	u64 msr_data;  	/* syscall is not available in real mode */ -	if (c->lock_prefix || ctxt->mode == X86EMUL_MODE_REAL -		|| !(ctxt->vcpu->arch.cr0 & X86_CR0_PE)) -		return -1; +	if (ctxt->mode == X86EMUL_MODE_REAL || ctxt->mode == X86EMUL_MODE_VM86) +		return X86EMUL_UNHANDLEABLE;  	setup_syscalls_segments(ctxt, &cs, &ss); @@ -1553,7 +1636,7 @@ emulate_syscall(struct x86_emulate_ctxt *ctxt)  		ctxt->eflags &= ~(EFLG_VM | EFLG_IF | EFLG_RF);  	} -	return 0; +	return X86EMUL_CONTINUE;  }  static int @@ -1563,22 +1646,17 @@ emulate_sysenter(struct x86_emulate_ctxt *ctxt)  	struct kvm_segment cs, ss;  	u64 msr_data; -	/* inject #UD if LOCK prefix is used */ -	if (c->lock_prefix) -		return -1; - -	/* inject #GP if in real mode or paging is disabled */ -	if (ctxt->mode == X86EMUL_MODE_REAL || -		!(ctxt->vcpu->arch.cr0 & X86_CR0_PE)) { +	/* inject #GP if in real mode */ +	if (ctxt->mode == X86EMUL_MODE_REAL) {  		kvm_inject_gp(ctxt->vcpu, 0); -		return -1; +		return X86EMUL_UNHANDLEABLE;  	}  	/* XXX sysenter/sysexit have not been tested in 64bit mode.  	* Therefore, we inject an #UD.  	
*/  	if (ctxt->mode == X86EMUL_MODE_PROT64) -		return -1; +		return X86EMUL_UNHANDLEABLE;  	setup_syscalls_segments(ctxt, &cs, &ss); @@ -1587,13 +1665,13 @@ emulate_sysenter(struct x86_emulate_ctxt *ctxt)  	case X86EMUL_MODE_PROT32:  		if ((msr_data & 0xfffc) == 0x0) {  			kvm_inject_gp(ctxt->vcpu, 0); -			return -1; +			return X86EMUL_PROPAGATE_FAULT;  		}  		break;  	case X86EMUL_MODE_PROT64:  		if (msr_data == 0x0) {  			kvm_inject_gp(ctxt->vcpu, 0); -			return -1; +			return X86EMUL_PROPAGATE_FAULT;  		}  		break;  	} @@ -1618,7 +1696,7 @@ emulate_sysenter(struct x86_emulate_ctxt *ctxt)  	kvm_x86_ops->get_msr(ctxt->vcpu, MSR_IA32_SYSENTER_ESP, &msr_data);  	c->regs[VCPU_REGS_RSP] = msr_data; -	return 0; +	return X86EMUL_CONTINUE;  }  static int @@ -1629,21 +1707,11 @@ emulate_sysexit(struct x86_emulate_ctxt *ctxt)  	u64 msr_data;  	int usermode; -	/* inject #UD if LOCK prefix is used */ -	if (c->lock_prefix) -		return -1; - -	/* inject #GP if in real mode or paging is disabled */ -	if (ctxt->mode == X86EMUL_MODE_REAL -		|| !(ctxt->vcpu->arch.cr0 & X86_CR0_PE)) { -		kvm_inject_gp(ctxt->vcpu, 0); -		return -1; -	} - -	/* sysexit must be called from CPL 0 */ -	if (kvm_x86_ops->get_cpl(ctxt->vcpu) != 0) { +	/* inject #GP if in real mode or Virtual 8086 mode */ +	if (ctxt->mode == X86EMUL_MODE_REAL || +	    ctxt->mode == X86EMUL_MODE_VM86) {  		kvm_inject_gp(ctxt->vcpu, 0); -		return -1; +		return X86EMUL_UNHANDLEABLE;  	}  	setup_syscalls_segments(ctxt, &cs, &ss); @@ -1661,7 +1729,7 @@ emulate_sysexit(struct x86_emulate_ctxt *ctxt)  		cs.selector = (u16)(msr_data + 16);  		if ((msr_data & 0xfffc) == 0x0) {  			kvm_inject_gp(ctxt->vcpu, 0); -			return -1; +			return X86EMUL_PROPAGATE_FAULT;  		}  		ss.selector = (u16)(msr_data + 24);  		break; @@ -1669,7 +1737,7 @@ emulate_sysexit(struct x86_emulate_ctxt *ctxt)  		cs.selector = (u16)(msr_data + 32);  		if (msr_data == 0x0) {  			kvm_inject_gp(ctxt->vcpu, 0); -			return -1; +			return X86EMUL_PROPAGATE_FAULT;  		}  		ss.selector = cs.selector + 8;  		cs.db = 0; @@ -1685,7 +1753,58 @@ emulate_sysexit(struct x86_emulate_ctxt *ctxt)  	c->eip = ctxt->vcpu->arch.regs[VCPU_REGS_RDX];  	c->regs[VCPU_REGS_RSP] = ctxt->vcpu->arch.regs[VCPU_REGS_RCX]; -	return 0; +	return X86EMUL_CONTINUE; +} + +static bool emulator_bad_iopl(struct x86_emulate_ctxt *ctxt) +{ +	int iopl; +	if (ctxt->mode == X86EMUL_MODE_REAL) +		return false; +	if (ctxt->mode == X86EMUL_MODE_VM86) +		return true; +	iopl = (ctxt->eflags & X86_EFLAGS_IOPL) >> IOPL_SHIFT; +	return kvm_x86_ops->get_cpl(ctxt->vcpu) > iopl; +} + +static bool emulator_io_port_access_allowed(struct x86_emulate_ctxt *ctxt, +					    struct x86_emulate_ops *ops, +					    u16 port, u16 len) +{ +	struct kvm_segment tr_seg; +	int r; +	u16 io_bitmap_ptr; +	u8 perm, bit_idx = port & 0x7; +	unsigned mask = (1 << len) - 1; + +	kvm_get_segment(ctxt->vcpu, &tr_seg, VCPU_SREG_TR); +	if (tr_seg.unusable) +		return false; +	if (tr_seg.limit < 103) +		return false; +	r = ops->read_std(tr_seg.base + 102, &io_bitmap_ptr, 2, ctxt->vcpu, +			  NULL); +	if (r != X86EMUL_CONTINUE) +		return false; +	if (io_bitmap_ptr + port/8 > tr_seg.limit) +		return false; +	r = ops->read_std(tr_seg.base + io_bitmap_ptr + port/8, &perm, 1, +			  ctxt->vcpu, NULL); +	if (r != X86EMUL_CONTINUE) +		return false; +	if ((perm >> bit_idx) & mask) +		return false; +	return true; +} + +static bool emulator_io_permited(struct x86_emulate_ctxt *ctxt, +				 struct x86_emulate_ops *ops, +				 u16 port, u16 len) +{ +	if (emulator_bad_iopl(ctxt)) +		if 
(!emulator_io_port_access_allowed(ctxt, ops, port, len)) +			return false; +	return true;  }  int @@ -1709,6 +1828,18 @@ x86_emulate_insn(struct x86_emulate_ctxt *ctxt, struct x86_emulate_ops *ops)  	memcpy(c->regs, ctxt->vcpu->arch.regs, sizeof c->regs);  	saved_eip = c->eip; +	/* LOCK prefix is allowed only with some instructions */ +	if (c->lock_prefix && !(c->d & Lock)) { +		kvm_queue_exception(ctxt->vcpu, UD_VECTOR); +		goto done; +	} + +	/* Privileged instruction can be executed only in CPL=0 */ +	if ((c->d & Priv) && kvm_x86_ops->get_cpl(ctxt->vcpu)) { +		kvm_inject_gp(ctxt->vcpu, 0); +		goto done; +	} +  	if (((c->d & ModRM) && (c->modrm_mod != 3)) || (c->d & MemAbs))  		memop = c->modrm_ea; @@ -1749,7 +1880,7 @@ x86_emulate_insn(struct x86_emulate_ctxt *ctxt, struct x86_emulate_ops *ops)  					&c->src.val,  					c->src.bytes,  					ctxt->vcpu); -		if (rc != 0) +		if (rc != X86EMUL_CONTINUE)  			goto done;  		c->src.orig_val = c->src.val;  	} @@ -1768,12 +1899,15 @@ x86_emulate_insn(struct x86_emulate_ctxt *ctxt, struct x86_emulate_ops *ops)  			c->dst.ptr = (void *)c->dst.ptr +  						   (c->src.val & mask) / 8;  		} -		if (!(c->d & Mov) && -				   /* optimisation - avoid slow emulated read */ -		    ((rc = ops->read_emulated((unsigned long)c->dst.ptr, -					   &c->dst.val, -					  c->dst.bytes, ctxt->vcpu)) != 0)) -			goto done; +		if (!(c->d & Mov)) { +			/* optimisation - avoid slow emulated read */ +			rc = ops->read_emulated((unsigned long)c->dst.ptr, +						&c->dst.val, +						c->dst.bytes, +						ctxt->vcpu); +			if (rc != X86EMUL_CONTINUE) +				goto done; +		}  	}  	c->dst.orig_val = c->dst.val; @@ -1876,7 +2010,12 @@ special_insn:  		break;  	case 0x6c:		/* insb */  	case 0x6d:		/* insw/insd */ -		 if (kvm_emulate_pio_string(ctxt->vcpu, +		if (!emulator_io_permited(ctxt, ops, c->regs[VCPU_REGS_RDX], +					  (c->d & ByteOp) ? 1 : c->op_bytes)) { +			kvm_inject_gp(ctxt->vcpu, 0); +			goto done; +		} +		if (kvm_emulate_pio_string(ctxt->vcpu,  				1,  				(c->d & ByteOp) ? 1 : c->op_bytes,  				c->rep_prefix ? @@ -1892,6 +2031,11 @@ special_insn:  		return 0;  	case 0x6e:		/* outsb */  	case 0x6f:		/* outsw/outsd */ +		if (!emulator_io_permited(ctxt, ops, c->regs[VCPU_REGS_RDX], +					  (c->d & ByteOp) ? 1 : c->op_bytes)) { +			kvm_inject_gp(ctxt->vcpu, 0); +			goto done; +		}  		if (kvm_emulate_pio_string(ctxt->vcpu,  				0,  				(c->d & ByteOp) ? 1 : c->op_bytes, @@ -1978,25 +2122,19 @@ special_insn:  		break;  	case 0x8e: { /* mov seg, r/m16 */  		uint16_t sel; -		int type_bits; -		int err;  		sel = c->src.val; -		if (c->modrm_reg == VCPU_SREG_SS) -			toggle_interruptibility(ctxt, X86_SHADOW_INT_MOV_SS); -		if (c->modrm_reg <= 5) { -			type_bits = (c->modrm_reg == 1) ? 9 : 1; -			err = kvm_load_segment_descriptor(ctxt->vcpu, sel, -							  type_bits, c->modrm_reg); -		} else { -			printk(KERN_INFO "Invalid segreg in modrm byte 0x%02x\n", -					c->modrm); -			goto cannot_emulate; +		if (c->modrm_reg == VCPU_SREG_CS || +		    c->modrm_reg > VCPU_SREG_GS) { +			kvm_queue_exception(ctxt->vcpu, UD_VECTOR); +			goto done;  		} -		if (err < 0) -			goto cannot_emulate; +		if (c->modrm_reg == VCPU_SREG_SS) +			toggle_interruptibility(ctxt, X86_SHADOW_INT_MOV_SS); + +		rc = kvm_load_segment_descriptor(ctxt->vcpu, sel, c->modrm_reg);  		c->dst.type = OP_NONE;  /* Disable writeback. 
*/  		break; @@ -2025,7 +2163,10 @@ special_insn:  		c->dst.type = OP_REG;  		c->dst.ptr = (unsigned long *) &ctxt->eflags;  		c->dst.bytes = c->op_bytes; -		goto pop_instruction; +		rc = emulate_popf(ctxt, ops, &c->dst.val, c->op_bytes); +		if (rc != X86EMUL_CONTINUE) +			goto done; +		break;  	case 0xa0 ... 0xa1:	/* mov */  		c->dst.ptr = (unsigned long *)&c->regs[VCPU_REGS_RAX];  		c->dst.val = c->src.val; @@ -2039,11 +2180,12 @@ special_insn:  		c->dst.ptr = (unsigned long *)register_address(c,  						   es_base(ctxt),  						   c->regs[VCPU_REGS_RDI]); -		if ((rc = ops->read_emulated(register_address(c, -					   seg_override_base(ctxt, c), -					c->regs[VCPU_REGS_RSI]), +		rc = ops->read_emulated(register_address(c, +						seg_override_base(ctxt, c), +						c->regs[VCPU_REGS_RSI]),  					&c->dst.val, -					c->dst.bytes, ctxt->vcpu)) != 0) +					c->dst.bytes, ctxt->vcpu); +		if (rc != X86EMUL_CONTINUE)  			goto done;  		register_address_increment(c, &c->regs[VCPU_REGS_RSI],  				       (ctxt->eflags & EFLG_DF) ? -c->dst.bytes @@ -2058,10 +2200,11 @@ special_insn:  		c->src.ptr = (unsigned long *)register_address(c,  				       seg_override_base(ctxt, c),  						   c->regs[VCPU_REGS_RSI]); -		if ((rc = ops->read_emulated((unsigned long)c->src.ptr, -						&c->src.val, -						c->src.bytes, -						ctxt->vcpu)) != 0) +		rc = ops->read_emulated((unsigned long)c->src.ptr, +					&c->src.val, +					c->src.bytes, +					ctxt->vcpu); +		if (rc != X86EMUL_CONTINUE)  			goto done;  		c->dst.type = OP_NONE; /* Disable writeback. */ @@ -2069,10 +2212,11 @@ special_insn:  		c->dst.ptr = (unsigned long *)register_address(c,  						   es_base(ctxt),  						   c->regs[VCPU_REGS_RDI]); -		if ((rc = ops->read_emulated((unsigned long)c->dst.ptr, -						&c->dst.val, -						c->dst.bytes, -						ctxt->vcpu)) != 0) +		rc = ops->read_emulated((unsigned long)c->dst.ptr, +					&c->dst.val, +					c->dst.bytes, +					ctxt->vcpu); +		if (rc != X86EMUL_CONTINUE)  			goto done;  		DPRINTF("cmps: mem1=0x%p mem2=0x%p\n", c->src.ptr, c->dst.ptr); @@ -2102,12 +2246,13 @@ special_insn:  		c->dst.type = OP_REG;  		c->dst.bytes = (c->d & ByteOp) ? 1 : c->op_bytes;  		c->dst.ptr = (unsigned long *)&c->regs[VCPU_REGS_RAX]; -		if ((rc = ops->read_emulated(register_address(c, -						 seg_override_base(ctxt, c), -						 c->regs[VCPU_REGS_RSI]), -						 &c->dst.val, -						 c->dst.bytes, -						 ctxt->vcpu)) != 0) +		rc = ops->read_emulated(register_address(c, +						seg_override_base(ctxt, c), +						c->regs[VCPU_REGS_RSI]), +					&c->dst.val, +					c->dst.bytes, +					ctxt->vcpu); +		if (rc != X86EMUL_CONTINUE)  			goto done;  		register_address_increment(c, &c->regs[VCPU_REGS_RSI],  				       (ctxt->eflags & EFLG_DF) ? -c->dst.bytes @@ -2163,11 +2308,9 @@ special_insn:  	case 0xe9: /* jmp rel */  		goto jmp;  	case 0xea: /* jmp far */ -		if (kvm_load_segment_descriptor(ctxt->vcpu, c->src2.val, 9, -					VCPU_SREG_CS) < 0) { -			DPRINTF("jmp far: Failed to load CS descriptor\n"); -			goto cannot_emulate; -		} +		if (kvm_load_segment_descriptor(ctxt->vcpu, c->src2.val, +						VCPU_SREG_CS)) +			goto done;  		c->eip = c->src.val;  		break; @@ -2185,7 +2328,13 @@ special_insn:  	case 0xef: /* out (e/r)ax,dx */  		port = c->regs[VCPU_REGS_RDX];  		io_dir_in = 0; -	do_io:	if (kvm_emulate_pio(ctxt->vcpu, io_dir_in, +	do_io: +		if (!emulator_io_permited(ctxt, ops, port, +					  (c->d & ByteOp) ? 
1 : c->op_bytes)) { +			kvm_inject_gp(ctxt->vcpu, 0); +			goto done; +		} +		if (kvm_emulate_pio(ctxt->vcpu, io_dir_in,  				   (c->d & ByteOp) ? 1 : c->op_bytes,  				   port) != 0) {  			c->eip = saved_eip; @@ -2210,13 +2359,21 @@ special_insn:  		c->dst.type = OP_NONE;	/* Disable writeback. */  		break;  	case 0xfa: /* cli */ -		ctxt->eflags &= ~X86_EFLAGS_IF; -		c->dst.type = OP_NONE;	/* Disable writeback. */ +		if (emulator_bad_iopl(ctxt)) +			kvm_inject_gp(ctxt->vcpu, 0); +		else { +			ctxt->eflags &= ~X86_EFLAGS_IF; +			c->dst.type = OP_NONE;	/* Disable writeback. */ +		}  		break;  	case 0xfb: /* sti */ -		toggle_interruptibility(ctxt, X86_SHADOW_INT_STI); -		ctxt->eflags |= X86_EFLAGS_IF; -		c->dst.type = OP_NONE;	/* Disable writeback. */ +		if (emulator_bad_iopl(ctxt)) +			kvm_inject_gp(ctxt->vcpu, 0); +		else { +			toggle_interruptibility(ctxt, X86_SHADOW_INT_STI); +			ctxt->eflags |= X86_EFLAGS_IF; +			c->dst.type = OP_NONE;	/* Disable writeback. */ +		}  		break;  	case 0xfc: /* cld */  		ctxt->eflags &= ~EFLG_DF; @@ -2319,8 +2476,9 @@ twobyte_insn:  		}  		break;  	case 0x05: 		/* syscall */ -		if (emulate_syscall(ctxt) == -1) -			goto cannot_emulate; +		rc = emulate_syscall(ctxt); +		if (rc != X86EMUL_CONTINUE) +			goto done;  		else  			goto writeback;  		break; @@ -2391,14 +2549,16 @@ twobyte_insn:  		c->dst.type = OP_NONE;  		break;  	case 0x34:		/* sysenter */ -		if (emulate_sysenter(ctxt) == -1) -			goto cannot_emulate; +		rc = emulate_sysenter(ctxt); +		if (rc != X86EMUL_CONTINUE) +			goto done;  		else  			goto writeback;  		break;  	case 0x35:		/* sysexit */ -		if (emulate_sysexit(ctxt) == -1) -			goto cannot_emulate; +		rc = emulate_sysexit(ctxt); +		if (rc != X86EMUL_CONTINUE) +			goto done;  		else  			goto writeback;  		break; diff --git a/arch/x86/kvm/i8254.c b/arch/x86/kvm/i8254.c index 15578f180e59..294698b6daff 100644 --- a/arch/x86/kvm/i8254.c +++ b/arch/x86/kvm/i8254.c @@ -242,11 +242,11 @@ static void kvm_pit_ack_irq(struct kvm_irq_ack_notifier *kian)  {  	struct kvm_kpit_state *ps = container_of(kian, struct kvm_kpit_state,  						 irq_ack_notifier); -	spin_lock(&ps->inject_lock); +	raw_spin_lock(&ps->inject_lock);  	if (atomic_dec_return(&ps->pit_timer.pending) < 0)  		atomic_inc(&ps->pit_timer.pending);  	ps->irq_ack = 1; -	spin_unlock(&ps->inject_lock); +	raw_spin_unlock(&ps->inject_lock);  }  void __kvm_migrate_pit_timer(struct kvm_vcpu *vcpu) @@ -605,7 +605,7 @@ static const struct kvm_io_device_ops speaker_dev_ops = {  	.write    = speaker_ioport_write,  }; -/* Caller must have writers lock on slots_lock */ +/* Caller must hold slots_lock */  struct kvm_pit *kvm_create_pit(struct kvm *kvm, u32 flags)  {  	struct kvm_pit *pit; @@ -624,7 +624,7 @@ struct kvm_pit *kvm_create_pit(struct kvm *kvm, u32 flags)  	mutex_init(&pit->pit_state.lock);  	mutex_lock(&pit->pit_state.lock); -	spin_lock_init(&pit->pit_state.inject_lock); +	raw_spin_lock_init(&pit->pit_state.inject_lock);  	kvm->arch.vpit = pit;  	pit->kvm = kvm; @@ -645,13 +645,13 @@ struct kvm_pit *kvm_create_pit(struct kvm *kvm, u32 flags)  	kvm_register_irq_mask_notifier(kvm, 0, &pit->mask_notifier);  	kvm_iodevice_init(&pit->dev, &pit_dev_ops); -	ret = __kvm_io_bus_register_dev(&kvm->pio_bus, &pit->dev); +	ret = kvm_io_bus_register_dev(kvm, KVM_PIO_BUS, &pit->dev);  	if (ret < 0)  		goto fail;  	if (flags & KVM_PIT_SPEAKER_DUMMY) {  		kvm_iodevice_init(&pit->speaker_dev, &speaker_dev_ops); -		ret = __kvm_io_bus_register_dev(&kvm->pio_bus, +		ret = kvm_io_bus_register_dev(kvm, KVM_PIO_BUS,  						
&pit->speaker_dev);  		if (ret < 0)  			goto fail_unregister; @@ -660,11 +660,12 @@ struct kvm_pit *kvm_create_pit(struct kvm *kvm, u32 flags)  	return pit;  fail_unregister: -	__kvm_io_bus_unregister_dev(&kvm->pio_bus, &pit->dev); +	kvm_io_bus_unregister_dev(kvm, KVM_PIO_BUS, &pit->dev);  fail: -	if (pit->irq_source_id >= 0) -		kvm_free_irq_source_id(kvm, pit->irq_source_id); +	kvm_unregister_irq_mask_notifier(kvm, 0, &pit->mask_notifier); +	kvm_unregister_irq_ack_notifier(kvm, &pit_state->irq_ack_notifier); +	kvm_free_irq_source_id(kvm, pit->irq_source_id);  	kfree(pit);  	return NULL; @@ -723,12 +724,12 @@ void kvm_inject_pit_timer_irqs(struct kvm_vcpu *vcpu)  		/* Try to inject pending interrupts when  		 * last one has been acked.  		 */ -		spin_lock(&ps->inject_lock); +		raw_spin_lock(&ps->inject_lock);  		if (atomic_read(&ps->pit_timer.pending) && ps->irq_ack) {  			ps->irq_ack = 0;  			inject = 1;  		} -		spin_unlock(&ps->inject_lock); +		raw_spin_unlock(&ps->inject_lock);  		if (inject)  			__inject_pit_timer_intr(kvm);  	} diff --git a/arch/x86/kvm/i8254.h b/arch/x86/kvm/i8254.h index d4c1c7ffdc09..900d6b0ba7c2 100644 --- a/arch/x86/kvm/i8254.h +++ b/arch/x86/kvm/i8254.h @@ -27,7 +27,7 @@ struct kvm_kpit_state {  	u32    speaker_data_on;  	struct mutex lock;  	struct kvm_pit *pit; -	spinlock_t inject_lock; +	raw_spinlock_t inject_lock;  	unsigned long irq_ack;  	struct kvm_irq_ack_notifier irq_ack_notifier;  }; diff --git a/arch/x86/kvm/i8259.c b/arch/x86/kvm/i8259.c index d057c0cbd245..07771da85de5 100644 --- a/arch/x86/kvm/i8259.c +++ b/arch/x86/kvm/i8259.c @@ -44,18 +44,19 @@ static void pic_clear_isr(struct kvm_kpic_state *s, int irq)  	 * Other interrupt may be delivered to PIC while lock is dropped but  	 * it should be safe since PIC state is already updated at this stage.  	 
*/ -	spin_unlock(&s->pics_state->lock); +	raw_spin_unlock(&s->pics_state->lock);  	kvm_notify_acked_irq(s->pics_state->kvm, SELECT_PIC(irq), irq); -	spin_lock(&s->pics_state->lock); +	raw_spin_lock(&s->pics_state->lock);  }  void kvm_pic_clear_isr_ack(struct kvm *kvm)  {  	struct kvm_pic *s = pic_irqchip(kvm); -	spin_lock(&s->lock); + +	raw_spin_lock(&s->lock);  	s->pics[0].isr_ack = 0xff;  	s->pics[1].isr_ack = 0xff; -	spin_unlock(&s->lock); +	raw_spin_unlock(&s->lock);  }  /* @@ -156,9 +157,9 @@ static void pic_update_irq(struct kvm_pic *s)  void kvm_pic_update_irq(struct kvm_pic *s)  { -	spin_lock(&s->lock); +	raw_spin_lock(&s->lock);  	pic_update_irq(s); -	spin_unlock(&s->lock); +	raw_spin_unlock(&s->lock);  }  int kvm_pic_set_irq(void *opaque, int irq, int level) @@ -166,14 +167,14 @@ int kvm_pic_set_irq(void *opaque, int irq, int level)  	struct kvm_pic *s = opaque;  	int ret = -1; -	spin_lock(&s->lock); +	raw_spin_lock(&s->lock);  	if (irq >= 0 && irq < PIC_NUM_PINS) {  		ret = pic_set_irq1(&s->pics[irq >> 3], irq & 7, level);  		pic_update_irq(s);  		trace_kvm_pic_set_irq(irq >> 3, irq & 7, s->pics[irq >> 3].elcr,  				      s->pics[irq >> 3].imr, ret == 0);  	} -	spin_unlock(&s->lock); +	raw_spin_unlock(&s->lock);  	return ret;  } @@ -203,7 +204,7 @@ int kvm_pic_read_irq(struct kvm *kvm)  	int irq, irq2, intno;  	struct kvm_pic *s = pic_irqchip(kvm); -	spin_lock(&s->lock); +	raw_spin_lock(&s->lock);  	irq = pic_get_irq(&s->pics[0]);  	if (irq >= 0) {  		pic_intack(&s->pics[0], irq); @@ -228,7 +229,7 @@ int kvm_pic_read_irq(struct kvm *kvm)  		intno = s->pics[0].irq_base + irq;  	}  	pic_update_irq(s); -	spin_unlock(&s->lock); +	raw_spin_unlock(&s->lock);  	return intno;  } @@ -442,7 +443,7 @@ static int picdev_write(struct kvm_io_device *this,  			printk(KERN_ERR "PIC: non byte write\n");  		return 0;  	} -	spin_lock(&s->lock); +	raw_spin_lock(&s->lock);  	switch (addr) {  	case 0x20:  	case 0x21: @@ -455,7 +456,7 @@ static int picdev_write(struct kvm_io_device *this,  		elcr_ioport_write(&s->pics[addr & 1], addr, data);  		break;  	} -	spin_unlock(&s->lock); +	raw_spin_unlock(&s->lock);  	return 0;  } @@ -472,7 +473,7 @@ static int picdev_read(struct kvm_io_device *this,  			printk(KERN_ERR "PIC: non byte read\n");  		return 0;  	} -	spin_lock(&s->lock); +	raw_spin_lock(&s->lock);  	switch (addr) {  	case 0x20:  	case 0x21: @@ -486,7 +487,7 @@ static int picdev_read(struct kvm_io_device *this,  		break;  	}  	*(unsigned char *)val = data; -	spin_unlock(&s->lock); +	raw_spin_unlock(&s->lock);  	return 0;  } @@ -520,7 +521,7 @@ struct kvm_pic *kvm_create_pic(struct kvm *kvm)  	s = kzalloc(sizeof(struct kvm_pic), GFP_KERNEL);  	if (!s)  		return NULL; -	spin_lock_init(&s->lock); +	raw_spin_lock_init(&s->lock);  	s->kvm = kvm;  	s->pics[0].elcr_mask = 0xf8;  	s->pics[1].elcr_mask = 0xde; @@ -533,7 +534,9 @@ struct kvm_pic *kvm_create_pic(struct kvm *kvm)  	 * Initialize PIO device  	 */  	kvm_iodevice_init(&s->dev, &picdev_ops); -	ret = kvm_io_bus_register_dev(kvm, &kvm->pio_bus, &s->dev); +	mutex_lock(&kvm->slots_lock); +	ret = kvm_io_bus_register_dev(kvm, KVM_PIO_BUS, &s->dev); +	mutex_unlock(&kvm->slots_lock);  	if (ret < 0) {  		kfree(s);  		return NULL; @@ -541,3 +544,14 @@ struct kvm_pic *kvm_create_pic(struct kvm *kvm)  	return s;  } + +void kvm_destroy_pic(struct kvm *kvm) +{ +	struct kvm_pic *vpic = kvm->arch.vpic; + +	if (vpic) { +		kvm_io_bus_unregister_dev(kvm, KVM_PIO_BUS, &vpic->dev); +		kvm->arch.vpic = NULL; +		kfree(vpic); +	} +} diff --git a/arch/x86/kvm/irq.h 
b/arch/x86/kvm/irq.h index be399e207d57..34b15915754d 100644 --- a/arch/x86/kvm/irq.h +++ b/arch/x86/kvm/irq.h @@ -62,7 +62,7 @@ struct kvm_kpic_state {  };  struct kvm_pic { -	spinlock_t lock; +	raw_spinlock_t lock;  	unsigned pending_acks;  	struct kvm *kvm;  	struct kvm_kpic_state pics[2]; /* 0 is master pic, 1 is slave pic */ @@ -75,6 +75,7 @@ struct kvm_pic {  };  struct kvm_pic *kvm_create_pic(struct kvm *kvm); +void kvm_destroy_pic(struct kvm *kvm);  int kvm_pic_read_irq(struct kvm *kvm);  void kvm_pic_update_irq(struct kvm_pic *s);  void kvm_pic_clear_isr_ack(struct kvm *kvm); diff --git a/arch/x86/kvm/kvm_cache_regs.h b/arch/x86/kvm/kvm_cache_regs.h index 7bcc5b6a4403..cff851cf5322 100644 --- a/arch/x86/kvm/kvm_cache_regs.h +++ b/arch/x86/kvm/kvm_cache_regs.h @@ -1,6 +1,11 @@  #ifndef ASM_KVM_CACHE_REGS_H  #define ASM_KVM_CACHE_REGS_H +#define KVM_POSSIBLE_CR0_GUEST_BITS X86_CR0_TS +#define KVM_POSSIBLE_CR4_GUEST_BITS				  \ +	(X86_CR4_PVI | X86_CR4_DE | X86_CR4_PCE | X86_CR4_OSFXSR  \ +	 | X86_CR4_OSXMMEXCPT | X86_CR4_PGE) +  static inline unsigned long kvm_register_read(struct kvm_vcpu *vcpu,  					      enum kvm_reg reg)  { @@ -38,4 +43,30 @@ static inline u64 kvm_pdptr_read(struct kvm_vcpu *vcpu, int index)  	return vcpu->arch.pdptrs[index];  } +static inline ulong kvm_read_cr0_bits(struct kvm_vcpu *vcpu, ulong mask) +{ +	ulong tmask = mask & KVM_POSSIBLE_CR0_GUEST_BITS; +	if (tmask & vcpu->arch.cr0_guest_owned_bits) +		kvm_x86_ops->decache_cr0_guest_bits(vcpu); +	return vcpu->arch.cr0 & mask; +} + +static inline ulong kvm_read_cr0(struct kvm_vcpu *vcpu) +{ +	return kvm_read_cr0_bits(vcpu, ~0UL); +} + +static inline ulong kvm_read_cr4_bits(struct kvm_vcpu *vcpu, ulong mask) +{ +	ulong tmask = mask & KVM_POSSIBLE_CR4_GUEST_BITS; +	if (tmask & vcpu->arch.cr4_guest_owned_bits) +		kvm_x86_ops->decache_cr4_guest_bits(vcpu); +	return vcpu->arch.cr4 & mask; +} + +static inline ulong kvm_read_cr4(struct kvm_vcpu *vcpu) +{ +	return kvm_read_cr4_bits(vcpu, ~0UL); +} +  #endif diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c index ba8c045da782..4b224f90087b 100644 --- a/arch/x86/kvm/lapic.c +++ b/arch/x86/kvm/lapic.c @@ -1246,3 +1246,34 @@ int kvm_x2apic_msr_read(struct kvm_vcpu *vcpu, u32 msr, u64 *data)  	return 0;  } + +int kvm_hv_vapic_msr_write(struct kvm_vcpu *vcpu, u32 reg, u64 data) +{ +	struct kvm_lapic *apic = vcpu->arch.apic; + +	if (!irqchip_in_kernel(vcpu->kvm)) +		return 1; + +	/* if this is ICR write vector before command */ +	if (reg == APIC_ICR) +		apic_reg_write(apic, APIC_ICR2, (u32)(data >> 32)); +	return apic_reg_write(apic, reg, (u32)data); +} + +int kvm_hv_vapic_msr_read(struct kvm_vcpu *vcpu, u32 reg, u64 *data) +{ +	struct kvm_lapic *apic = vcpu->arch.apic; +	u32 low, high = 0; + +	if (!irqchip_in_kernel(vcpu->kvm)) +		return 1; + +	if (apic_reg_read(apic, reg, 4, &low)) +		return 1; +	if (reg == APIC_ICR) +		apic_reg_read(apic, APIC_ICR2, 4, &high); + +	*data = (((u64)high) << 32) | low; + +	return 0; +} diff --git a/arch/x86/kvm/lapic.h b/arch/x86/kvm/lapic.h index 40010b09c4aa..f5fe32c5edad 100644 --- a/arch/x86/kvm/lapic.h +++ b/arch/x86/kvm/lapic.h @@ -48,4 +48,12 @@ void kvm_lapic_sync_to_vapic(struct kvm_vcpu *vcpu);  int kvm_x2apic_msr_write(struct kvm_vcpu *vcpu, u32 msr, u64 data);  int kvm_x2apic_msr_read(struct kvm_vcpu *vcpu, u32 msr, u64 *data); + +int kvm_hv_vapic_msr_write(struct kvm_vcpu *vcpu, u32 msr, u64 data); +int kvm_hv_vapic_msr_read(struct kvm_vcpu *vcpu, u32 msr, u64 *data); + +static inline bool 
kvm_hv_vapic_assist_page_enabled(struct kvm_vcpu *vcpu) +{ +	return vcpu->arch.hv_vapic & HV_X64_MSR_APIC_ASSIST_PAGE_ENABLE; +}  #endif diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index 89a49fb46a27..741373e8ca77 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -18,6 +18,7 @@   */  #include "mmu.h" +#include "x86.h"  #include "kvm_cache_regs.h"  #include <linux/kvm_host.h> @@ -29,6 +30,7 @@  #include <linux/swap.h>  #include <linux/hugetlb.h>  #include <linux/compiler.h> +#include <linux/srcu.h>  #include <asm/page.h>  #include <asm/cmpxchg.h> @@ -136,16 +138,6 @@ module_param(oos_shadow, bool, 0644);  #define PT64_PERM_MASK (PT_PRESENT_MASK | PT_WRITABLE_MASK | PT_USER_MASK \  			| PT64_NX_MASK) -#define PFERR_PRESENT_MASK (1U << 0) -#define PFERR_WRITE_MASK (1U << 1) -#define PFERR_USER_MASK (1U << 2) -#define PFERR_RSVD_MASK (1U << 3) -#define PFERR_FETCH_MASK (1U << 4) - -#define PT_PDPE_LEVEL 3 -#define PT_DIRECTORY_LEVEL 2 -#define PT_PAGE_TABLE_LEVEL 1 -  #define RMAP_EXT 4  #define ACC_EXEC_MASK    1 @@ -153,6 +145,9 @@ module_param(oos_shadow, bool, 0644);  #define ACC_USER_MASK    PT_USER_MASK  #define ACC_ALL          (ACC_EXEC_MASK | ACC_WRITE_MASK | ACC_USER_MASK) +#include <trace/events/kvm.h> + +#undef TRACE_INCLUDE_FILE  #define CREATE_TRACE_POINTS  #include "mmutrace.h" @@ -229,7 +224,7 @@ EXPORT_SYMBOL_GPL(kvm_mmu_set_mask_ptes);  static int is_write_protection(struct kvm_vcpu *vcpu)  { -	return vcpu->arch.cr0 & X86_CR0_WP; +	return kvm_read_cr0_bits(vcpu, X86_CR0_WP);  }  static int is_cpuid_PSE36(void) @@ -239,7 +234,7 @@ static int is_cpuid_PSE36(void)  static int is_nx(struct kvm_vcpu *vcpu)  { -	return vcpu->arch.shadow_efer & EFER_NX; +	return vcpu->arch.efer & EFER_NX;  }  static int is_shadow_present_pte(u64 pte) @@ -253,7 +248,7 @@ static int is_large_pte(u64 pte)  	return pte & PT_PAGE_SIZE_MASK;  } -static int is_writeble_pte(unsigned long pte) +static int is_writable_pte(unsigned long pte)  {  	return pte & PT_WRITABLE_MASK;  } @@ -470,24 +465,10 @@ static int has_wrprotected_page(struct kvm *kvm,  static int host_mapping_level(struct kvm *kvm, gfn_t gfn)  { -	unsigned long page_size = PAGE_SIZE; -	struct vm_area_struct *vma; -	unsigned long addr; +	unsigned long page_size;  	int i, ret = 0; -	addr = gfn_to_hva(kvm, gfn); -	if (kvm_is_error_hva(addr)) -		return PT_PAGE_TABLE_LEVEL; - -	down_read(¤t->mm->mmap_sem); -	vma = find_vma(current->mm, addr); -	if (!vma) -		goto out; - -	page_size = vma_kernel_pagesize(vma); - -out: -	up_read(¤t->mm->mmap_sem); +	page_size = kvm_host_page_size(kvm, gfn);  	for (i = PT_PAGE_TABLE_LEVEL;  	     i < (PT_PAGE_TABLE_LEVEL + KVM_NR_PAGE_SIZES); ++i) { @@ -503,8 +484,7 @@ out:  static int mapping_level(struct kvm_vcpu *vcpu, gfn_t large_gfn)  {  	struct kvm_memory_slot *slot; -	int host_level; -	int level = PT_PAGE_TABLE_LEVEL; +	int host_level, level, max_level;  	slot = gfn_to_memslot(vcpu->kvm, large_gfn);  	if (slot && slot->dirty_bitmap) @@ -515,7 +495,10 @@ static int mapping_level(struct kvm_vcpu *vcpu, gfn_t large_gfn)  	if (host_level == PT_PAGE_TABLE_LEVEL)  		return host_level; -	for (level = PT_DIRECTORY_LEVEL; level <= host_level; ++level) +	max_level = kvm_x86_ops->get_lpage_level() < host_level ? 
+		kvm_x86_ops->get_lpage_level() : host_level; + +	for (level = PT_DIRECTORY_LEVEL; level <= max_level; ++level)  		if (has_wrprotected_page(vcpu->kvm, large_gfn, level))  			break; @@ -633,7 +616,7 @@ static void rmap_remove(struct kvm *kvm, u64 *spte)  	pfn = spte_to_pfn(*spte);  	if (*spte & shadow_accessed_mask)  		kvm_set_pfn_accessed(pfn); -	if (is_writeble_pte(*spte)) +	if (is_writable_pte(*spte))  		kvm_set_pfn_dirty(pfn);  	rmapp = gfn_to_rmap(kvm, sp->gfns[spte - sp->spt], sp->role.level);  	if (!*rmapp) { @@ -662,6 +645,7 @@ static void rmap_remove(struct kvm *kvm, u64 *spte)  			prev_desc = desc;  			desc = desc->more;  		} +		pr_err("rmap_remove: %p %llx many->many\n", spte, *spte);  		BUG();  	}  } @@ -708,7 +692,7 @@ static int rmap_write_protect(struct kvm *kvm, u64 gfn)  		BUG_ON(!spte);  		BUG_ON(!(*spte & PT_PRESENT_MASK));  		rmap_printk("rmap_write_protect: spte %p %llx\n", spte, *spte); -		if (is_writeble_pte(*spte)) { +		if (is_writable_pte(*spte)) {  			__set_spte(spte, *spte & ~PT_WRITABLE_MASK);  			write_protected = 1;  		} @@ -732,7 +716,7 @@ static int rmap_write_protect(struct kvm *kvm, u64 gfn)  			BUG_ON(!(*spte & PT_PRESENT_MASK));  			BUG_ON((*spte & (PT_PAGE_SIZE_MASK|PT_PRESENT_MASK)) != (PT_PAGE_SIZE_MASK|PT_PRESENT_MASK));  			pgprintk("rmap_write_protect(large): spte %p %llx %lld\n", spte, *spte, gfn); -			if (is_writeble_pte(*spte)) { +			if (is_writable_pte(*spte)) {  				rmap_remove(kvm, spte);  				--kvm->stat.lpages;  				__set_spte(spte, shadow_trap_nonpresent_pte); @@ -787,7 +771,7 @@ static int kvm_set_pte_rmapp(struct kvm *kvm, unsigned long *rmapp,  			new_spte &= ~PT_WRITABLE_MASK;  			new_spte &= ~SPTE_HOST_WRITEABLE; -			if (is_writeble_pte(*spte)) +			if (is_writable_pte(*spte))  				kvm_set_pfn_dirty(spte_to_pfn(*spte));  			__set_spte(spte, new_spte);  			spte = rmap_next(kvm, rmapp, spte); @@ -805,35 +789,32 @@ static int kvm_handle_hva(struct kvm *kvm, unsigned long hva,  					 unsigned long data))  {  	int i, j; +	int ret;  	int retval = 0; +	struct kvm_memslots *slots; -	/* -	 * If mmap_sem isn't taken, we can look the memslots with only -	 * the mmu_lock by skipping over the slots with userspace_addr == 0. -	 */ -	for (i = 0; i < kvm->nmemslots; i++) { -		struct kvm_memory_slot *memslot = &kvm->memslots[i]; +	slots = rcu_dereference(kvm->memslots); + +	for (i = 0; i < slots->nmemslots; i++) { +		struct kvm_memory_slot *memslot = &slots->memslots[i];  		unsigned long start = memslot->userspace_addr;  		unsigned long end; -		/* mmu_lock protects userspace_addr */ -		if (!start) -			continue; -  		end = start + (memslot->npages << PAGE_SHIFT);  		if (hva >= start && hva < end) {  			gfn_t gfn_offset = (hva - start) >> PAGE_SHIFT; -			retval |= handler(kvm, &memslot->rmap[gfn_offset], -					  data); +			ret = handler(kvm, &memslot->rmap[gfn_offset], data);  			for (j = 0; j < KVM_NR_PAGE_SIZES - 1; ++j) {  				int idx = gfn_offset;  				idx /= KVM_PAGES_PER_HPAGE(PT_DIRECTORY_LEVEL + j); -				retval |= handler(kvm, +				ret |= handler(kvm,  					&memslot->lpage_info[j][idx].rmap_pde,  					data);  			} +			trace_kvm_age_page(hva, memslot, ret); +			retval |= ret;  		}  	} @@ -856,9 +837,15 @@ static int kvm_age_rmapp(struct kvm *kvm, unsigned long *rmapp,  	u64 *spte;  	int young = 0; -	/* always return old for EPT */ +	/* +	 * Emulate the accessed bit for EPT, by checking if this page has +	 * an EPT mapping, and clearing it if it does. On the next access, +	 * a new EPT mapping will be established. 
+	 * This has some overhead, but not as much as the cost of swapping +	 * out actively used pages or breaking up actively used hugepages. +	 */  	if (!shadow_accessed_mask) -		return 0; +		return kvm_unmap_rmapp(kvm, rmapp, data);  	spte = rmap_next(kvm, rmapp, NULL);  	while (spte) { @@ -1615,7 +1602,7 @@ static void mmu_unshadow(struct kvm *kvm, gfn_t gfn)  static void page_header_update_slot(struct kvm *kvm, void *pte, gfn_t gfn)  { -	int slot = memslot_id(kvm, gfn_to_memslot(kvm, gfn)); +	int slot = memslot_id(kvm, gfn);  	struct kvm_mmu_page *sp = page_header(__pa(pte));  	__set_bit(slot, sp->slot_bitmap); @@ -1639,7 +1626,7 @@ struct page *gva_to_page(struct kvm_vcpu *vcpu, gva_t gva)  {  	struct page *page; -	gpa_t gpa = vcpu->arch.mmu.gva_to_gpa(vcpu, gva); +	gpa_t gpa = kvm_mmu_gva_to_gpa_read(vcpu, gva, NULL);  	if (gpa == UNMAPPED_GVA)  		return NULL; @@ -1852,7 +1839,7 @@ static int set_spte(struct kvm_vcpu *vcpu, u64 *sptep,  		 * is responsibility of mmu_get_page / kvm_sync_page.  		 * Same reasoning can be applied to dirty page accounting.  		 */ -		if (!can_unsync && is_writeble_pte(*sptep)) +		if (!can_unsync && is_writable_pte(*sptep))  			goto set_pte;  		if (mmu_need_write_protect(vcpu, gfn, can_unsync)) { @@ -1860,7 +1847,7 @@ static int set_spte(struct kvm_vcpu *vcpu, u64 *sptep,  				 __func__, gfn);  			ret = 1;  			pte_access &= ~ACC_WRITE_MASK; -			if (is_writeble_pte(spte)) +			if (is_writable_pte(spte))  				spte &= ~PT_WRITABLE_MASK;  		}  	} @@ -1881,7 +1868,7 @@ static void mmu_set_spte(struct kvm_vcpu *vcpu, u64 *sptep,  			 bool reset_host_protection)  {  	int was_rmapped = 0; -	int was_writeble = is_writeble_pte(*sptep); +	int was_writable = is_writable_pte(*sptep);  	int rmap_count;  	pgprintk("%s: spte %llx access %x write_fault %d" @@ -1932,7 +1919,7 @@ static void mmu_set_spte(struct kvm_vcpu *vcpu, u64 *sptep,  		if (rmap_count > RMAP_RECYCLE_THRESHOLD)  			rmap_recycle(vcpu, sptep, gfn);  	} else { -		if (was_writeble) +		if (was_writable)  			kvm_release_pfn_dirty(pfn);  		else  			kvm_release_pfn_clean(pfn); @@ -2162,8 +2149,11 @@ void kvm_mmu_sync_roots(struct kvm_vcpu *vcpu)  	spin_unlock(&vcpu->kvm->mmu_lock);  } -static gpa_t nonpaging_gva_to_gpa(struct kvm_vcpu *vcpu, gva_t vaddr) +static gpa_t nonpaging_gva_to_gpa(struct kvm_vcpu *vcpu, gva_t vaddr, +				  u32 access, u32 *error)  { +	if (error) +		*error = 0;  	return vaddr;  } @@ -2747,7 +2737,7 @@ int kvm_mmu_unprotect_page_virt(struct kvm_vcpu *vcpu, gva_t gva)  	if (tdp_enabled)  		return 0; -	gpa = vcpu->arch.mmu.gva_to_gpa(vcpu, gva); +	gpa = kvm_mmu_gva_to_gpa_read(vcpu, gva, NULL);  	spin_lock(&vcpu->kvm->mmu_lock);  	r = kvm_mmu_unprotect_page(vcpu->kvm, gpa >> PAGE_SHIFT); @@ -2847,16 +2837,13 @@ static int alloc_mmu_pages(struct kvm_vcpu *vcpu)  	 */  	page = alloc_page(GFP_KERNEL | __GFP_DMA32);  	if (!page) -		goto error_1; +		return -ENOMEM; +  	vcpu->arch.mmu.pae_root = page_address(page);  	for (i = 0; i < 4; ++i)  		vcpu->arch.mmu.pae_root[i] = INVALID_PAGE;  	return 0; - -error_1: -	free_mmu_pages(vcpu); -	return -ENOMEM;  }  int kvm_mmu_create(struct kvm_vcpu *vcpu) @@ -2936,10 +2923,9 @@ static int mmu_shrink(int nr_to_scan, gfp_t gfp_mask)  	spin_lock(&kvm_lock);  	list_for_each_entry(kvm, &vm_list, vm_list) { -		int npages; +		int npages, idx; -		if (!down_read_trylock(&kvm->slots_lock)) -			continue; +		idx = srcu_read_lock(&kvm->srcu);  		spin_lock(&kvm->mmu_lock);  		npages = kvm->arch.n_alloc_mmu_pages -  			 kvm->arch.n_free_mmu_pages; @@ -2952,7 +2938,7 @@ static 
int mmu_shrink(int nr_to_scan, gfp_t gfp_mask)  		nr_to_scan--;  		spin_unlock(&kvm->mmu_lock); -		up_read(&kvm->slots_lock); +		srcu_read_unlock(&kvm->srcu, idx);  	}  	if (kvm_freed)  		list_move_tail(&kvm_freed->vm_list, &vm_list); @@ -3019,9 +3005,11 @@ unsigned int kvm_mmu_calculate_mmu_pages(struct kvm *kvm)  	int i;  	unsigned int nr_mmu_pages;  	unsigned int  nr_pages = 0; +	struct kvm_memslots *slots; -	for (i = 0; i < kvm->nmemslots; i++) -		nr_pages += kvm->memslots[i].npages; +	slots = rcu_dereference(kvm->memslots); +	for (i = 0; i < slots->nmemslots; i++) +		nr_pages += slots->memslots[i].npages;  	nr_mmu_pages = nr_pages * KVM_PERMILLE_MMU_PAGES / 1000;  	nr_mmu_pages = max(nr_mmu_pages, @@ -3246,7 +3234,7 @@ static void audit_mappings_page(struct kvm_vcpu *vcpu, u64 page_pte,  		if (is_shadow_present_pte(ent) && !is_last_spte(ent, level))  			audit_mappings_page(vcpu, ent, va, level - 1);  		else { -			gpa_t gpa = vcpu->arch.mmu.gva_to_gpa(vcpu, va); +			gpa_t gpa = kvm_mmu_gva_to_gpa_read(vcpu, va, NULL);  			gfn_t gfn = gpa >> PAGE_SHIFT;  			pfn_t pfn = gfn_to_pfn(vcpu->kvm, gfn);  			hpa_t hpa = (hpa_t)pfn << PAGE_SHIFT; @@ -3291,10 +3279,12 @@ static void audit_mappings(struct kvm_vcpu *vcpu)  static int count_rmaps(struct kvm_vcpu *vcpu)  {  	int nmaps = 0; -	int i, j, k; +	int i, j, k, idx; +	idx = srcu_read_lock(&kvm->srcu); +	slots = rcu_dereference(kvm->memslots);  	for (i = 0; i < KVM_MEMORY_SLOTS; ++i) { -		struct kvm_memory_slot *m = &vcpu->kvm->memslots[i]; +		struct kvm_memory_slot *m = &slots->memslots[i];  		struct kvm_rmap_desc *d;  		for (j = 0; j < m->npages; ++j) { @@ -3317,6 +3307,7 @@ static int count_rmaps(struct kvm_vcpu *vcpu)  			}  		}  	} +	srcu_read_unlock(&kvm->srcu, idx);  	return nmaps;  } diff --git a/arch/x86/kvm/mmu.h b/arch/x86/kvm/mmu.h index 61a1b3884b49..be66759321a5 100644 --- a/arch/x86/kvm/mmu.h +++ b/arch/x86/kvm/mmu.h @@ -2,6 +2,7 @@  #define __KVM_X86_MMU_H  #include <linux/kvm_host.h> +#include "kvm_cache_regs.h"  #define PT64_PT_BITS 9  #define PT64_ENT_PER_PAGE (1 << PT64_PT_BITS) @@ -37,6 +38,16 @@  #define PT32_ROOT_LEVEL 2  #define PT32E_ROOT_LEVEL 3 +#define PT_PDPE_LEVEL 3 +#define PT_DIRECTORY_LEVEL 2 +#define PT_PAGE_TABLE_LEVEL 1 + +#define PFERR_PRESENT_MASK (1U << 0) +#define PFERR_WRITE_MASK (1U << 1) +#define PFERR_USER_MASK (1U << 2) +#define PFERR_RSVD_MASK (1U << 3) +#define PFERR_FETCH_MASK (1U << 4) +  int kvm_mmu_get_spte_hierarchy(struct kvm_vcpu *vcpu, u64 addr, u64 sptes[4]);  static inline void kvm_mmu_free_some_pages(struct kvm_vcpu *vcpu) @@ -53,30 +64,6 @@ static inline int kvm_mmu_reload(struct kvm_vcpu *vcpu)  	return kvm_mmu_load(vcpu);  } -static inline int is_long_mode(struct kvm_vcpu *vcpu) -{ -#ifdef CONFIG_X86_64 -	return vcpu->arch.shadow_efer & EFER_LMA; -#else -	return 0; -#endif -} - -static inline int is_pae(struct kvm_vcpu *vcpu) -{ -	return vcpu->arch.cr4 & X86_CR4_PAE; -} - -static inline int is_pse(struct kvm_vcpu *vcpu) -{ -	return vcpu->arch.cr4 & X86_CR4_PSE; -} - -static inline int is_paging(struct kvm_vcpu *vcpu) -{ -	return vcpu->arch.cr0 & X86_CR0_PG; -} -  static inline int is_present_gpte(unsigned long pte)  {  	return pte & PT_PRESENT_MASK; diff --git a/arch/x86/kvm/paging_tmpl.h b/arch/x86/kvm/paging_tmpl.h index ede2131a9225..81eab9a50e6a 100644 --- a/arch/x86/kvm/paging_tmpl.h +++ b/arch/x86/kvm/paging_tmpl.h @@ -162,7 +162,7 @@ walk:  		if (rsvd_fault)  			goto access_error; -		if (write_fault && !is_writeble_pte(pte)) +		if (write_fault && !is_writable_pte(pte))  			if 
(user_fault || is_write_protection(vcpu))  				goto access_error; @@ -490,18 +490,23 @@ static void FNAME(invlpg)(struct kvm_vcpu *vcpu, gva_t gva)  	spin_unlock(&vcpu->kvm->mmu_lock);  } -static gpa_t FNAME(gva_to_gpa)(struct kvm_vcpu *vcpu, gva_t vaddr) +static gpa_t FNAME(gva_to_gpa)(struct kvm_vcpu *vcpu, gva_t vaddr, u32 access, +			       u32 *error)  {  	struct guest_walker walker;  	gpa_t gpa = UNMAPPED_GVA;  	int r; -	r = FNAME(walk_addr)(&walker, vcpu, vaddr, 0, 0, 0); +	r = FNAME(walk_addr)(&walker, vcpu, vaddr, +			     !!(access & PFERR_WRITE_MASK), +			     !!(access & PFERR_USER_MASK), +			     !!(access & PFERR_FETCH_MASK));  	if (r) {  		gpa = gfn_to_gpa(walker.gfn);  		gpa |= vaddr & ~PAGE_MASK; -	} +	} else if (error) +		*error = walker.error_code;  	return gpa;  } diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c index 1d9b33843c80..52f78dd03010 100644 --- a/arch/x86/kvm/svm.c +++ b/arch/x86/kvm/svm.c @@ -231,7 +231,7 @@ static void svm_set_efer(struct kvm_vcpu *vcpu, u64 efer)  		efer &= ~EFER_LME;  	to_svm(vcpu)->vmcb->save.efer = efer | EFER_SVME; -	vcpu->arch.shadow_efer = efer; +	vcpu->arch.efer = efer;  }  static void svm_queue_exception(struct kvm_vcpu *vcpu, unsigned nr, @@ -540,6 +540,8 @@ static void init_vmcb(struct vcpu_svm *svm)  	struct vmcb_control_area *control = &svm->vmcb->control;  	struct vmcb_save_area *save = &svm->vmcb->save; +	svm->vcpu.fpu_active = 1; +  	control->intercept_cr_read = 	INTERCEPT_CR0_MASK |  					INTERCEPT_CR3_MASK |  					INTERCEPT_CR4_MASK; @@ -552,13 +554,19 @@ static void init_vmcb(struct vcpu_svm *svm)  	control->intercept_dr_read = 	INTERCEPT_DR0_MASK |  					INTERCEPT_DR1_MASK |  					INTERCEPT_DR2_MASK | -					INTERCEPT_DR3_MASK; +					INTERCEPT_DR3_MASK | +					INTERCEPT_DR4_MASK | +					INTERCEPT_DR5_MASK | +					INTERCEPT_DR6_MASK | +					INTERCEPT_DR7_MASK;  	control->intercept_dr_write = 	INTERCEPT_DR0_MASK |  					INTERCEPT_DR1_MASK |  					INTERCEPT_DR2_MASK |  					INTERCEPT_DR3_MASK | +					INTERCEPT_DR4_MASK |  					INTERCEPT_DR5_MASK | +					INTERCEPT_DR6_MASK |  					INTERCEPT_DR7_MASK;  	control->intercept_exceptions = (1 << PF_VECTOR) | @@ -569,6 +577,7 @@ static void init_vmcb(struct vcpu_svm *svm)  	control->intercept = 	(1ULL << INTERCEPT_INTR) |  				(1ULL << INTERCEPT_NMI) |  				(1ULL << INTERCEPT_SMI) | +				(1ULL << INTERCEPT_SELECTIVE_CR0) |  				(1ULL << INTERCEPT_CPUID) |  				(1ULL << INTERCEPT_INVD) |  				(1ULL << INTERCEPT_HLT) | @@ -641,10 +650,8 @@ static void init_vmcb(struct vcpu_svm *svm)  		control->intercept &= ~((1ULL << INTERCEPT_TASK_SWITCH) |  					(1ULL << INTERCEPT_INVLPG));  		control->intercept_exceptions &= ~(1 << PF_VECTOR); -		control->intercept_cr_read &= ~(INTERCEPT_CR0_MASK| -						INTERCEPT_CR3_MASK); -		control->intercept_cr_write &= ~(INTERCEPT_CR0_MASK| -						 INTERCEPT_CR3_MASK); +		control->intercept_cr_read &= ~INTERCEPT_CR3_MASK; +		control->intercept_cr_write &= ~INTERCEPT_CR3_MASK;  		save->g_pat = 0x0007040600070406ULL;  		save->cr3 = 0;  		save->cr4 = 0; @@ -730,7 +737,6 @@ static struct kvm_vcpu *svm_create_vcpu(struct kvm *kvm, unsigned int id)  	init_vmcb(svm);  	fx_init(&svm->vcpu); -	svm->vcpu.fpu_active = 1;  	svm->vcpu.arch.apic_base = 0xfee00000 | MSR_IA32_APICBASE_ENABLE;  	if (kvm_vcpu_is_bsp(&svm->vcpu))  		svm->vcpu.arch.apic_base |= MSR_IA32_APICBASE_BSP; @@ -765,14 +771,16 @@ static void svm_vcpu_load(struct kvm_vcpu *vcpu, int cpu)  	if (unlikely(cpu != vcpu->cpu)) {  		u64 delta; -		/* -		 * Make sure that the guest sees a monotonically -		 * 
increasing TSC. -		 */ -		delta = vcpu->arch.host_tsc - native_read_tsc(); -		svm->vmcb->control.tsc_offset += delta; -		if (is_nested(svm)) -			svm->nested.hsave->control.tsc_offset += delta; +		if (check_tsc_unstable()) { +			/* +			 * Make sure that the guest sees a monotonically +			 * increasing TSC. +			 */ +			delta = vcpu->arch.host_tsc - native_read_tsc(); +			svm->vmcb->control.tsc_offset += delta; +			if (is_nested(svm)) +				svm->nested.hsave->control.tsc_offset += delta; +		}  		vcpu->cpu = cpu;  		kvm_migrate_timers(vcpu);  		svm->asid_generation = 0; @@ -954,42 +962,59 @@ static void svm_set_gdt(struct kvm_vcpu *vcpu, struct descriptor_table *dt)  	svm->vmcb->save.gdtr.base = dt->base ;  } +static void svm_decache_cr0_guest_bits(struct kvm_vcpu *vcpu) +{ +} +  static void svm_decache_cr4_guest_bits(struct kvm_vcpu *vcpu)  {  } +static void update_cr0_intercept(struct vcpu_svm *svm) +{ +	ulong gcr0 = svm->vcpu.arch.cr0; +	u64 *hcr0 = &svm->vmcb->save.cr0; + +	if (!svm->vcpu.fpu_active) +		*hcr0 |= SVM_CR0_SELECTIVE_MASK; +	else +		*hcr0 = (*hcr0 & ~SVM_CR0_SELECTIVE_MASK) +			| (gcr0 & SVM_CR0_SELECTIVE_MASK); + + +	if (gcr0 == *hcr0 && svm->vcpu.fpu_active) { +		svm->vmcb->control.intercept_cr_read &= ~INTERCEPT_CR0_MASK; +		svm->vmcb->control.intercept_cr_write &= ~INTERCEPT_CR0_MASK; +	} else { +		svm->vmcb->control.intercept_cr_read |= INTERCEPT_CR0_MASK; +		svm->vmcb->control.intercept_cr_write |= INTERCEPT_CR0_MASK; +	} +} +  static void svm_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0)  {  	struct vcpu_svm *svm = to_svm(vcpu);  #ifdef CONFIG_X86_64 -	if (vcpu->arch.shadow_efer & EFER_LME) { +	if (vcpu->arch.efer & EFER_LME) {  		if (!is_paging(vcpu) && (cr0 & X86_CR0_PG)) { -			vcpu->arch.shadow_efer |= EFER_LMA; +			vcpu->arch.efer |= EFER_LMA;  			svm->vmcb->save.efer |= EFER_LMA | EFER_LME;  		}  		if (is_paging(vcpu) && !(cr0 & X86_CR0_PG)) { -			vcpu->arch.shadow_efer &= ~EFER_LMA; +			vcpu->arch.efer &= ~EFER_LMA;  			svm->vmcb->save.efer &= ~(EFER_LMA | EFER_LME);  		}  	}  #endif -	if (npt_enabled) -		goto set; +	vcpu->arch.cr0 = cr0; -	if ((vcpu->arch.cr0 & X86_CR0_TS) && !(cr0 & X86_CR0_TS)) { -		svm->vmcb->control.intercept_exceptions &= ~(1 << NM_VECTOR); -		vcpu->fpu_active = 1; -	} +	if (!npt_enabled) +		cr0 |= X86_CR0_PG | X86_CR0_WP; -	vcpu->arch.cr0 = cr0; -	cr0 |= X86_CR0_PG | X86_CR0_WP; -	if (!vcpu->fpu_active) { -		svm->vmcb->control.intercept_exceptions |= (1 << NM_VECTOR); +	if (!vcpu->fpu_active)  		cr0 |= X86_CR0_TS; -	} -set:  	/*  	 * re-enable caching here because the QEMU bios  	 * does not do it - this results in some delay at @@ -997,6 +1022,7 @@ set:  	 */  	cr0 &= ~(X86_CR0_CD | X86_CR0_NW);  	svm->vmcb->save.cr0 = cr0; +	update_cr0_intercept(svm);  }  static void svm_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4) @@ -1102,76 +1128,70 @@ static void new_asid(struct vcpu_svm *svm, struct svm_cpu_data *sd)  	svm->vmcb->control.asid = sd->next_asid++;  } -static unsigned long svm_get_dr(struct kvm_vcpu *vcpu, int dr) +static int svm_get_dr(struct kvm_vcpu *vcpu, int dr, unsigned long *dest)  {  	struct vcpu_svm *svm = to_svm(vcpu); -	unsigned long val;  	switch (dr) {  	case 0 ... 
3: -		val = vcpu->arch.db[dr]; +		*dest = vcpu->arch.db[dr];  		break; +	case 4: +		if (kvm_read_cr4_bits(vcpu, X86_CR4_DE)) +			return EMULATE_FAIL; /* will re-inject UD */ +		/* fall through */  	case 6:  		if (vcpu->guest_debug & KVM_GUESTDBG_USE_HW_BP) -			val = vcpu->arch.dr6; +			*dest = vcpu->arch.dr6;  		else -			val = svm->vmcb->save.dr6; +			*dest = svm->vmcb->save.dr6;  		break; +	case 5: +		if (kvm_read_cr4_bits(vcpu, X86_CR4_DE)) +			return EMULATE_FAIL; /* will re-inject UD */ +		/* fall through */  	case 7:  		if (vcpu->guest_debug & KVM_GUESTDBG_USE_HW_BP) -			val = vcpu->arch.dr7; +			*dest = vcpu->arch.dr7;  		else -			val = svm->vmcb->save.dr7; +			*dest = svm->vmcb->save.dr7;  		break; -	default: -		val = 0;  	} -	return val; +	return EMULATE_DONE;  } -static void svm_set_dr(struct kvm_vcpu *vcpu, int dr, unsigned long value, -		       int *exception) +static int svm_set_dr(struct kvm_vcpu *vcpu, int dr, unsigned long value)  {  	struct vcpu_svm *svm = to_svm(vcpu); -	*exception = 0; -  	switch (dr) {  	case 0 ... 3:  		vcpu->arch.db[dr] = value;  		if (!(vcpu->guest_debug & KVM_GUESTDBG_USE_HW_BP))  			vcpu->arch.eff_db[dr] = value; -		return; -	case 4 ... 5: -		if (vcpu->arch.cr4 & X86_CR4_DE) -			*exception = UD_VECTOR; -		return; +		break; +	case 4: +		if (kvm_read_cr4_bits(vcpu, X86_CR4_DE)) +			return EMULATE_FAIL; /* will re-inject UD */ +		/* fall through */  	case 6: -		if (value & 0xffffffff00000000ULL) { -			*exception = GP_VECTOR; -			return; -		}  		vcpu->arch.dr6 = (value & DR6_VOLATILE) | DR6_FIXED_1; -		return; +		break; +	case 5: +		if (kvm_read_cr4_bits(vcpu, X86_CR4_DE)) +			return EMULATE_FAIL; /* will re-inject UD */ +		/* fall through */  	case 7: -		if (value & 0xffffffff00000000ULL) { -			*exception = GP_VECTOR; -			return; -		}  		vcpu->arch.dr7 = (value & DR7_VOLATILE) | DR7_FIXED_1;  		if (!(vcpu->guest_debug & KVM_GUESTDBG_USE_HW_BP)) {  			svm->vmcb->save.dr7 = vcpu->arch.dr7;  			vcpu->arch.switch_db_regs = (value & DR7_BP_EN_MASK);  		} -		return; -	default: -		/* FIXME: Possible case? 
*/ -		printk(KERN_DEBUG "%s: unexpected dr %u\n", -		       __func__, dr); -		*exception = UD_VECTOR; -		return; +		break;  	} + +	return EMULATE_DONE;  }  static int pf_interception(struct vcpu_svm *svm) @@ -1239,13 +1259,17 @@ static int ud_interception(struct vcpu_svm *svm)  	return 1;  } -static int nm_interception(struct vcpu_svm *svm) +static void svm_fpu_activate(struct kvm_vcpu *vcpu)  { +	struct vcpu_svm *svm = to_svm(vcpu);  	svm->vmcb->control.intercept_exceptions &= ~(1 << NM_VECTOR); -	if (!(svm->vcpu.arch.cr0 & X86_CR0_TS)) -		svm->vmcb->save.cr0 &= ~X86_CR0_TS;  	svm->vcpu.fpu_active = 1; +	update_cr0_intercept(svm); +} +static int nm_interception(struct vcpu_svm *svm) +{ +	svm_fpu_activate(&svm->vcpu);  	return 1;  } @@ -1337,7 +1361,7 @@ static int vmmcall_interception(struct vcpu_svm *svm)  static int nested_svm_check_permissions(struct vcpu_svm *svm)  { -	if (!(svm->vcpu.arch.shadow_efer & EFER_SVME) +	if (!(svm->vcpu.arch.efer & EFER_SVME)  	    || !is_paging(&svm->vcpu)) {  		kvm_queue_exception(&svm->vcpu, UD_VECTOR);  		return 1; @@ -1740,8 +1764,8 @@ static bool nested_svm_vmrun(struct vcpu_svm *svm)  	hsave->save.ds     = vmcb->save.ds;  	hsave->save.gdtr   = vmcb->save.gdtr;  	hsave->save.idtr   = vmcb->save.idtr; -	hsave->save.efer   = svm->vcpu.arch.shadow_efer; -	hsave->save.cr0    = svm->vcpu.arch.cr0; +	hsave->save.efer   = svm->vcpu.arch.efer; +	hsave->save.cr0    = kvm_read_cr0(&svm->vcpu);  	hsave->save.cr4    = svm->vcpu.arch.cr4;  	hsave->save.rflags = vmcb->save.rflags;  	hsave->save.rip    = svm->next_rip; @@ -2153,9 +2177,10 @@ static int rdmsr_interception(struct vcpu_svm *svm)  	u32 ecx = svm->vcpu.arch.regs[VCPU_REGS_RCX];  	u64 data; -	if (svm_get_msr(&svm->vcpu, ecx, &data)) +	if (svm_get_msr(&svm->vcpu, ecx, &data)) { +		trace_kvm_msr_read_ex(ecx);  		kvm_inject_gp(&svm->vcpu, 0); -	else { +	} else {  		trace_kvm_msr_read(ecx, data);  		svm->vcpu.arch.regs[VCPU_REGS_RAX] = data & 0xffffffff; @@ -2247,13 +2272,15 @@ static int wrmsr_interception(struct vcpu_svm *svm)  	u64 data = (svm->vcpu.arch.regs[VCPU_REGS_RAX] & -1u)  		| ((u64)(svm->vcpu.arch.regs[VCPU_REGS_RDX] & -1u) << 32); -	trace_kvm_msr_write(ecx, data);  	svm->next_rip = kvm_rip_read(&svm->vcpu) + 2; -	if (svm_set_msr(&svm->vcpu, ecx, data)) +	if (svm_set_msr(&svm->vcpu, ecx, data)) { +		trace_kvm_msr_write_ex(ecx, data);  		kvm_inject_gp(&svm->vcpu, 0); -	else +	} else { +		trace_kvm_msr_write(ecx, data);  		skip_emulated_instruction(&svm->vcpu); +	}  	return 1;  } @@ -2297,7 +2324,7 @@ static int (*svm_exit_handlers[])(struct vcpu_svm *svm) = {  	[SVM_EXIT_READ_CR3]           		= emulate_on_interception,  	[SVM_EXIT_READ_CR4]           		= emulate_on_interception,  	[SVM_EXIT_READ_CR8]           		= emulate_on_interception, -	/* for now: */ +	[SVM_EXIT_CR0_SEL_WRITE]		= emulate_on_interception,  	[SVM_EXIT_WRITE_CR0]          		= emulate_on_interception,  	[SVM_EXIT_WRITE_CR3]          		= emulate_on_interception,  	[SVM_EXIT_WRITE_CR4]          		= emulate_on_interception, @@ -2306,11 +2333,17 @@ static int (*svm_exit_handlers[])(struct vcpu_svm *svm) = {  	[SVM_EXIT_READ_DR1]			= emulate_on_interception,  	[SVM_EXIT_READ_DR2]			= emulate_on_interception,  	[SVM_EXIT_READ_DR3]			= emulate_on_interception, +	[SVM_EXIT_READ_DR4]			= emulate_on_interception, +	[SVM_EXIT_READ_DR5]			= emulate_on_interception, +	[SVM_EXIT_READ_DR6]			= emulate_on_interception, +	[SVM_EXIT_READ_DR7]			= emulate_on_interception,  	[SVM_EXIT_WRITE_DR0]			= emulate_on_interception,  	[SVM_EXIT_WRITE_DR1]			
= emulate_on_interception,  	[SVM_EXIT_WRITE_DR2]			= emulate_on_interception,  	[SVM_EXIT_WRITE_DR3]			= emulate_on_interception, +	[SVM_EXIT_WRITE_DR4]			= emulate_on_interception,  	[SVM_EXIT_WRITE_DR5]			= emulate_on_interception, +	[SVM_EXIT_WRITE_DR6]			= emulate_on_interception,  	[SVM_EXIT_WRITE_DR7]			= emulate_on_interception,  	[SVM_EXIT_EXCP_BASE + DB_VECTOR]	= db_interception,  	[SVM_EXIT_EXCP_BASE + BP_VECTOR]	= bp_interception, @@ -2383,20 +2416,10 @@ static int handle_exit(struct kvm_vcpu *vcpu)  	svm_complete_interrupts(svm); -	if (npt_enabled) { -		int mmu_reload = 0; -		if ((vcpu->arch.cr0 ^ svm->vmcb->save.cr0) & X86_CR0_PG) { -			svm_set_cr0(vcpu, svm->vmcb->save.cr0); -			mmu_reload = 1; -		} +	if (!(svm->vmcb->control.intercept_cr_write & INTERCEPT_CR0_MASK))  		vcpu->arch.cr0 = svm->vmcb->save.cr0; +	if (npt_enabled)  		vcpu->arch.cr3 = svm->vmcb->save.cr3; -		if (mmu_reload) { -			kvm_mmu_reset_context(vcpu); -			kvm_mmu_load(vcpu); -		} -	} -  	if (svm->vmcb->control.exit_code == SVM_EXIT_ERR) {  		kvm_run->exit_reason = KVM_EXIT_FAIL_ENTRY; @@ -2798,12 +2821,6 @@ static void svm_set_cr3(struct kvm_vcpu *vcpu, unsigned long root)  	svm->vmcb->save.cr3 = root;  	force_new_asid(vcpu); - -	if (vcpu->fpu_active) { -		svm->vmcb->control.intercept_exceptions |= (1 << NM_VECTOR); -		svm->vmcb->save.cr0 |= X86_CR0_TS; -		vcpu->fpu_active = 0; -	}  }  static int is_disabled(void) @@ -2852,6 +2869,10 @@ static u64 svm_get_mt_mask(struct kvm_vcpu *vcpu, gfn_t gfn, bool is_mmio)  	return 0;  } +static void svm_cpuid_update(struct kvm_vcpu *vcpu) +{ +} +  static const struct trace_print_flags svm_exit_reasons_str[] = {  	{ SVM_EXIT_READ_CR0,           		"read_cr0" },  	{ SVM_EXIT_READ_CR3,	      		"read_cr3" }, @@ -2905,9 +2926,22 @@ static const struct trace_print_flags svm_exit_reasons_str[] = {  	{ -1, NULL }  }; -static bool svm_gb_page_enable(void) +static int svm_get_lpage_level(void)  { -	return true; +	return PT_PDPE_LEVEL; +} + +static bool svm_rdtscp_supported(void) +{ +	return false; +} + +static void svm_fpu_deactivate(struct kvm_vcpu *vcpu) +{ +	struct vcpu_svm *svm = to_svm(vcpu); + +	update_cr0_intercept(svm); +	svm->vmcb->control.intercept_exceptions |= 1 << NM_VECTOR;  }  static struct kvm_x86_ops svm_x86_ops = { @@ -2936,6 +2970,7 @@ static struct kvm_x86_ops svm_x86_ops = {  	.set_segment = svm_set_segment,  	.get_cpl = svm_get_cpl,  	.get_cs_db_l_bits = kvm_get_cs_db_l_bits, +	.decache_cr0_guest_bits = svm_decache_cr0_guest_bits,  	.decache_cr4_guest_bits = svm_decache_cr4_guest_bits,  	.set_cr0 = svm_set_cr0,  	.set_cr3 = svm_set_cr3, @@ -2950,6 +2985,8 @@ static struct kvm_x86_ops svm_x86_ops = {  	.cache_reg = svm_cache_reg,  	.get_rflags = svm_get_rflags,  	.set_rflags = svm_set_rflags, +	.fpu_activate = svm_fpu_activate, +	.fpu_deactivate = svm_fpu_deactivate,  	.tlb_flush = svm_flush_tlb, @@ -2975,7 +3012,11 @@ static struct kvm_x86_ops svm_x86_ops = {  	.get_mt_mask = svm_get_mt_mask,  	.exit_reasons_str = svm_exit_reasons_str, -	.gb_page_enable = svm_gb_page_enable, +	.get_lpage_level = svm_get_lpage_level, + +	.cpuid_update = svm_cpuid_update, + +	.rdtscp_supported = svm_rdtscp_supported,  };  static int __init svm_init(void) diff --git a/arch/x86/kvm/trace.h b/arch/x86/kvm/trace.h index 816e0449db0b..6ad30a29f044 100644 --- a/arch/x86/kvm/trace.h +++ b/arch/x86/kvm/trace.h @@ -56,6 +56,38 @@ TRACE_EVENT(kvm_hypercall,  );  /* + * Tracepoint for hypercall. 
+ */ +TRACE_EVENT(kvm_hv_hypercall, +	TP_PROTO(__u16 code, bool fast, __u16 rep_cnt, __u16 rep_idx, +		 __u64 ingpa, __u64 outgpa), +	TP_ARGS(code, fast, rep_cnt, rep_idx, ingpa, outgpa), + +	TP_STRUCT__entry( +		__field(	__u16, 		code		) +		__field(	bool,		fast		) +		__field(	__u16,		rep_cnt		) +		__field(	__u16,		rep_idx		) +		__field(	__u64,		ingpa		) +		__field(	__u64,		outgpa		) +	), + +	TP_fast_assign( +		__entry->code		= code; +		__entry->fast		= fast; +		__entry->rep_cnt	= rep_cnt; +		__entry->rep_idx	= rep_idx; +		__entry->ingpa		= ingpa; +		__entry->outgpa		= outgpa; +	), + +	TP_printk("code 0x%x %s cnt 0x%x idx 0x%x in 0x%llx out 0x%llx", +		  __entry->code, __entry->fast ? "fast" : "slow", +		  __entry->rep_cnt, __entry->rep_idx,  __entry->ingpa, +		  __entry->outgpa) +); + +/*   * Tracepoint for PIO.   */  TRACE_EVENT(kvm_pio, @@ -214,28 +246,33 @@ TRACE_EVENT(kvm_page_fault,   * Tracepoint for guest MSR access.   */  TRACE_EVENT(kvm_msr, -	TP_PROTO(unsigned int rw, unsigned int ecx, unsigned long data), -	TP_ARGS(rw, ecx, data), +	TP_PROTO(unsigned write, u32 ecx, u64 data, bool exception), +	TP_ARGS(write, ecx, data, exception),  	TP_STRUCT__entry( -		__field(	unsigned int,	rw		) -		__field(	unsigned int,	ecx		) -		__field(	unsigned long,	data		) +		__field(	unsigned,	write		) +		__field(	u32,		ecx		) +		__field(	u64,		data		) +		__field(	u8,		exception	)  	),  	TP_fast_assign( -		__entry->rw		= rw; +		__entry->write		= write;  		__entry->ecx		= ecx;  		__entry->data		= data; +		__entry->exception	= exception;  	), -	TP_printk("msr_%s %x = 0x%lx", -		  __entry->rw ? "write" : "read", -		  __entry->ecx, __entry->data) +	TP_printk("msr_%s %x = 0x%llx%s", +		  __entry->write ? "write" : "read", +		  __entry->ecx, __entry->data, +		  __entry->exception ? " (#GP)" : "")  ); -#define trace_kvm_msr_read(ecx, data)		trace_kvm_msr(0, ecx, data) -#define trace_kvm_msr_write(ecx, data)		trace_kvm_msr(1, ecx, data) +#define trace_kvm_msr_read(ecx, data)      trace_kvm_msr(0, ecx, data, false) +#define trace_kvm_msr_write(ecx, data)     trace_kvm_msr(1, ecx, data, false) +#define trace_kvm_msr_read_ex(ecx)         trace_kvm_msr(0, ecx, 0, true) +#define trace_kvm_msr_write_ex(ecx, data)  trace_kvm_msr(1, ecx, data, true)  /*   * Tracepoint for guest CR access. 
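The reworked kvm_msr tracepoint above now carries an explicit exception flag, so MSR accesses that fault with #GP are still traced (via trace_kvm_msr_read_ex/trace_kvm_msr_write_ex) instead of being logged before the fault outcome is known. Below is a minimal userspace sketch of the resulting trace line format; format_kvm_msr() and the sample MSR values are illustrative assumptions, and only the "msr_%s %x = 0x%llx%s" shape mirrors the TP_printk in the hunk above.

/*
 * Illustrative userspace sketch (not kernel code): mirrors how the reworked
 * kvm_msr tracepoint encodes an MSR access as (write, ecx, data, exception)
 * and what its TP_printk format prints.
 */
#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

static void format_kvm_msr(unsigned int write, uint32_t ecx, uint64_t data,
			   bool exception)
{
	/* Same shape as TP_printk("msr_%s %x = 0x%llx%s", ...) */
	printf("msr_%s %x = 0x%llx%s\n",
	       write ? "write" : "read",
	       (unsigned int)ecx,
	       (unsigned long long)data,
	       exception ? " (#GP)" : "");
}

int main(void)
{
	/* trace_kvm_msr_read(ecx, data)     -> (0, ecx, data, false) */
	format_kvm_msr(0, 0xc0000080, 0xd01, false);	/* hypothetical EFER read */
	/* trace_kvm_msr_write_ex(ecx, data) -> (1, ecx, data, true)  */
	format_kvm_msr(1, 0x12345678, 0, true);		/* hypothetical faulting write */
	return 0;
}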
diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index d4918d6fc924..14873b9f8430 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -61,6 +61,21 @@ module_param_named(unrestricted_guest,  static int __read_mostly emulate_invalid_guest_state = 0;  module_param(emulate_invalid_guest_state, bool, S_IRUGO); +#define KVM_GUEST_CR0_MASK_UNRESTRICTED_GUEST				\ +	(X86_CR0_WP | X86_CR0_NE | X86_CR0_NW | X86_CR0_CD) +#define KVM_GUEST_CR0_MASK						\ +	(KVM_GUEST_CR0_MASK_UNRESTRICTED_GUEST | X86_CR0_PG | X86_CR0_PE) +#define KVM_VM_CR0_ALWAYS_ON_UNRESTRICTED_GUEST				\ +	(X86_CR0_WP | X86_CR0_NE) +#define KVM_VM_CR0_ALWAYS_ON						\ +	(KVM_VM_CR0_ALWAYS_ON_UNRESTRICTED_GUEST | X86_CR0_PG | X86_CR0_PE) +#define KVM_CR4_GUEST_OWNED_BITS				      \ +	(X86_CR4_PVI | X86_CR4_DE | X86_CR4_PCE | X86_CR4_OSFXSR      \ +	 | X86_CR4_OSXMMEXCPT) + +#define KVM_PMODE_VM_CR4_ALWAYS_ON (X86_CR4_PAE | X86_CR4_VMXE) +#define KVM_RMODE_VM_CR4_ALWAYS_ON (X86_CR4_VME | X86_CR4_PAE | X86_CR4_VMXE) +  /*   * These 2 parameters are used to config the controls for Pause-Loop Exiting:   * ple_gap:    upper bound on the amount of time between two successive @@ -136,6 +151,8 @@ struct vcpu_vmx {  	ktime_t entry_time;  	s64 vnmi_blocked_time;  	u32 exit_reason; + +	bool rdtscp_enabled;  };  static inline struct vcpu_vmx *to_vmx(struct kvm_vcpu *vcpu) @@ -210,7 +227,7 @@ static const u32 vmx_msr_index[] = {  #ifdef CONFIG_X86_64  	MSR_SYSCALL_MASK, MSR_LSTAR, MSR_CSTAR,  #endif -	MSR_EFER, MSR_K6_STAR, +	MSR_EFER, MSR_TSC_AUX, MSR_K6_STAR,  };  #define NR_VMX_MSR ARRAY_SIZE(vmx_msr_index) @@ -301,6 +318,11 @@ static inline bool cpu_has_vmx_ept_2m_page(void)  	return !!(vmx_capability.ept & VMX_EPT_2MB_PAGE_BIT);  } +static inline bool cpu_has_vmx_ept_1g_page(void) +{ +	return !!(vmx_capability.ept & VMX_EPT_1GB_PAGE_BIT); +} +  static inline int cpu_has_vmx_invept_individual_addr(void)  {  	return !!(vmx_capability.ept & VMX_EPT_EXTENT_INDIVIDUAL_BIT); @@ -336,9 +358,7 @@ static inline int cpu_has_vmx_ple(void)  static inline int vm_need_virtualize_apic_accesses(struct kvm *kvm)  { -	return flexpriority_enabled && -		(cpu_has_vmx_virtualize_apic_accesses()) && -		(irqchip_in_kernel(kvm)); +	return flexpriority_enabled && irqchip_in_kernel(kvm);  }  static inline int cpu_has_vmx_vpid(void) @@ -347,6 +367,12 @@ static inline int cpu_has_vmx_vpid(void)  		SECONDARY_EXEC_ENABLE_VPID;  } +static inline int cpu_has_vmx_rdtscp(void) +{ +	return vmcs_config.cpu_based_2nd_exec_ctrl & +		SECONDARY_EXEC_RDTSCP; +} +  static inline int cpu_has_virtual_nmis(void)  {  	return vmcs_config.pin_based_exec_ctrl & PIN_BASED_VIRTUAL_NMIS; @@ -551,22 +577,18 @@ static void update_exception_bitmap(struct kvm_vcpu *vcpu)  {  	u32 eb; -	eb = (1u << PF_VECTOR) | (1u << UD_VECTOR) | (1u << MC_VECTOR); -	if (!vcpu->fpu_active) -		eb |= 1u << NM_VECTOR; -	/* -	 * Unconditionally intercept #DB so we can maintain dr6 without -	 * reading it every exit. 
-	 */ -	eb |= 1u << DB_VECTOR; -	if (vcpu->guest_debug & KVM_GUESTDBG_ENABLE) { -		if (vcpu->guest_debug & KVM_GUESTDBG_USE_SW_BP) -			eb |= 1u << BP_VECTOR; -	} +	eb = (1u << PF_VECTOR) | (1u << UD_VECTOR) | (1u << MC_VECTOR) | +	     (1u << NM_VECTOR) | (1u << DB_VECTOR); +	if ((vcpu->guest_debug & +	     (KVM_GUESTDBG_ENABLE | KVM_GUESTDBG_USE_SW_BP)) == +	    (KVM_GUESTDBG_ENABLE | KVM_GUESTDBG_USE_SW_BP)) +		eb |= 1u << BP_VECTOR;  	if (to_vmx(vcpu)->rmode.vm86_active)  		eb = ~0;  	if (enable_ept)  		eb &= ~(1u << PF_VECTOR); /* bypass_guest_pf = 0 */ +	if (vcpu->fpu_active) +		eb &= ~(1u << NM_VECTOR);  	vmcs_write32(EXCEPTION_BITMAP, eb);  } @@ -589,7 +611,7 @@ static bool update_transition_efer(struct vcpu_vmx *vmx, int efer_offset)  	u64 guest_efer;  	u64 ignore_bits; -	guest_efer = vmx->vcpu.arch.shadow_efer; +	guest_efer = vmx->vcpu.arch.efer;  	/*  	 * NX is emulated; LMA and LME handled by hardware; SCE meaninless @@ -767,22 +789,30 @@ static void vmx_vcpu_put(struct kvm_vcpu *vcpu)  static void vmx_fpu_activate(struct kvm_vcpu *vcpu)  { +	ulong cr0; +  	if (vcpu->fpu_active)  		return;  	vcpu->fpu_active = 1; -	vmcs_clear_bits(GUEST_CR0, X86_CR0_TS); -	if (vcpu->arch.cr0 & X86_CR0_TS) -		vmcs_set_bits(GUEST_CR0, X86_CR0_TS); +	cr0 = vmcs_readl(GUEST_CR0); +	cr0 &= ~(X86_CR0_TS | X86_CR0_MP); +	cr0 |= kvm_read_cr0_bits(vcpu, X86_CR0_TS | X86_CR0_MP); +	vmcs_writel(GUEST_CR0, cr0);  	update_exception_bitmap(vcpu); +	vcpu->arch.cr0_guest_owned_bits = X86_CR0_TS; +	vmcs_writel(CR0_GUEST_HOST_MASK, ~vcpu->arch.cr0_guest_owned_bits);  } +static void vmx_decache_cr0_guest_bits(struct kvm_vcpu *vcpu); +  static void vmx_fpu_deactivate(struct kvm_vcpu *vcpu)  { -	if (!vcpu->fpu_active) -		return; -	vcpu->fpu_active = 0; -	vmcs_set_bits(GUEST_CR0, X86_CR0_TS); +	vmx_decache_cr0_guest_bits(vcpu); +	vmcs_set_bits(GUEST_CR0, X86_CR0_TS | X86_CR0_MP);  	update_exception_bitmap(vcpu); +	vcpu->arch.cr0_guest_owned_bits = 0; +	vmcs_writel(CR0_GUEST_HOST_MASK, ~vcpu->arch.cr0_guest_owned_bits); +	vmcs_writel(CR0_READ_SHADOW, vcpu->arch.cr0);  }  static unsigned long vmx_get_rflags(struct kvm_vcpu *vcpu) @@ -878,6 +908,11 @@ static void vmx_queue_exception(struct kvm_vcpu *vcpu, unsigned nr,  	vmcs_write32(VM_ENTRY_INTR_INFO_FIELD, intr_info);  } +static bool vmx_rdtscp_supported(void) +{ +	return cpu_has_vmx_rdtscp(); +} +  /*   * Swap MSR entry in host/guest MSR entry array.   */ @@ -913,12 +948,15 @@ static void setup_msrs(struct vcpu_vmx *vmx)  		index = __find_msr_index(vmx, MSR_CSTAR);  		if (index >= 0)  			move_msr_up(vmx, index, save_nmsrs++); +		index = __find_msr_index(vmx, MSR_TSC_AUX); +		if (index >= 0 && vmx->rdtscp_enabled) +			move_msr_up(vmx, index, save_nmsrs++);  		/*  		 * MSR_K6_STAR is only needed on long mode guests, and only  		 * if efer.sce is enabled.  		 
*/  		index = __find_msr_index(vmx, MSR_K6_STAR); -		if ((index >= 0) && (vmx->vcpu.arch.shadow_efer & EFER_SCE)) +		if ((index >= 0) && (vmx->vcpu.arch.efer & EFER_SCE))  			move_msr_up(vmx, index, save_nmsrs++);  	}  #endif @@ -1002,6 +1040,10 @@ static int vmx_get_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 *pdata)  	case MSR_IA32_SYSENTER_ESP:  		data = vmcs_readl(GUEST_SYSENTER_ESP);  		break; +	case MSR_TSC_AUX: +		if (!to_vmx(vcpu)->rdtscp_enabled) +			return 1; +		/* Otherwise falls through */  	default:  		vmx_load_host_state(to_vmx(vcpu));  		msr = find_msr_entry(to_vmx(vcpu), msr_index); @@ -1065,7 +1107,15 @@ static int vmx_set_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 data)  			vcpu->arch.pat = data;  			break;  		} -		/* Otherwise falls through to kvm_set_msr_common */ +		ret = kvm_set_msr_common(vcpu, msr_index, data); +		break; +	case MSR_TSC_AUX: +		if (!vmx->rdtscp_enabled) +			return 1; +		/* Check reserved bit, higher 32 bits should be zero */ +		if ((data >> 32) != 0) +			return 1; +		/* Otherwise falls through */  	default:  		msr = find_msr_entry(vmx, msr_index);  		if (msr) { @@ -1224,6 +1274,8 @@ static __init int setup_vmcs_config(struct vmcs_config *vmcs_conf)  	      CPU_BASED_USE_IO_BITMAPS |  	      CPU_BASED_MOV_DR_EXITING |  	      CPU_BASED_USE_TSC_OFFSETING | +	      CPU_BASED_MWAIT_EXITING | +	      CPU_BASED_MONITOR_EXITING |  	      CPU_BASED_INVLPG_EXITING;  	opt = CPU_BASED_TPR_SHADOW |  	      CPU_BASED_USE_MSR_BITMAPS | @@ -1243,7 +1295,8 @@ static __init int setup_vmcs_config(struct vmcs_config *vmcs_conf)  			SECONDARY_EXEC_ENABLE_VPID |  			SECONDARY_EXEC_ENABLE_EPT |  			SECONDARY_EXEC_UNRESTRICTED_GUEST | -			SECONDARY_EXEC_PAUSE_LOOP_EXITING; +			SECONDARY_EXEC_PAUSE_LOOP_EXITING | +			SECONDARY_EXEC_RDTSCP;  		if (adjust_vmx_controls(min2, opt2,  					MSR_IA32_VMX_PROCBASED_CTLS2,  					&_cpu_based_2nd_exec_control) < 0) @@ -1457,8 +1510,12 @@ static void enter_pmode(struct kvm_vcpu *vcpu)  static gva_t rmode_tss_base(struct kvm *kvm)  {  	if (!kvm->arch.tss_addr) { -		gfn_t base_gfn = kvm->memslots[0].base_gfn + -				 kvm->memslots[0].npages - 3; +		struct kvm_memslots *slots; +		gfn_t base_gfn; + +		slots = rcu_dereference(kvm->memslots); +		base_gfn = kvm->memslots->memslots[0].base_gfn + +				 kvm->memslots->memslots[0].npages - 3;  		return base_gfn << PAGE_SHIFT;  	}  	return kvm->arch.tss_addr; @@ -1544,9 +1601,7 @@ static void vmx_set_efer(struct kvm_vcpu *vcpu, u64 efer)  	 * of this msr depends on is_long_mode().  	 
*/  	vmx_load_host_state(to_vmx(vcpu)); -	vcpu->arch.shadow_efer = efer; -	if (!msr) -		return; +	vcpu->arch.efer = efer;  	if (efer & EFER_LMA) {  		vmcs_write32(VM_ENTRY_CONTROLS,  			     vmcs_read32(VM_ENTRY_CONTROLS) | @@ -1576,13 +1631,13 @@ static void enter_lmode(struct kvm_vcpu *vcpu)  			     (guest_tr_ar & ~AR_TYPE_MASK)  			     | AR_TYPE_BUSY_64_TSS);  	} -	vcpu->arch.shadow_efer |= EFER_LMA; -	vmx_set_efer(vcpu, vcpu->arch.shadow_efer); +	vcpu->arch.efer |= EFER_LMA; +	vmx_set_efer(vcpu, vcpu->arch.efer);  }  static void exit_lmode(struct kvm_vcpu *vcpu)  { -	vcpu->arch.shadow_efer &= ~EFER_LMA; +	vcpu->arch.efer &= ~EFER_LMA;  	vmcs_write32(VM_ENTRY_CONTROLS,  		     vmcs_read32(VM_ENTRY_CONTROLS) @@ -1598,10 +1653,20 @@ static void vmx_flush_tlb(struct kvm_vcpu *vcpu)  		ept_sync_context(construct_eptp(vcpu->arch.mmu.root_hpa));  } +static void vmx_decache_cr0_guest_bits(struct kvm_vcpu *vcpu) +{ +	ulong cr0_guest_owned_bits = vcpu->arch.cr0_guest_owned_bits; + +	vcpu->arch.cr0 &= ~cr0_guest_owned_bits; +	vcpu->arch.cr0 |= vmcs_readl(GUEST_CR0) & cr0_guest_owned_bits; +} +  static void vmx_decache_cr4_guest_bits(struct kvm_vcpu *vcpu)  { -	vcpu->arch.cr4 &= KVM_GUEST_CR4_MASK; -	vcpu->arch.cr4 |= vmcs_readl(GUEST_CR4) & ~KVM_GUEST_CR4_MASK; +	ulong cr4_guest_owned_bits = vcpu->arch.cr4_guest_owned_bits; + +	vcpu->arch.cr4 &= ~cr4_guest_owned_bits; +	vcpu->arch.cr4 |= vmcs_readl(GUEST_CR4) & cr4_guest_owned_bits;  }  static void ept_load_pdptrs(struct kvm_vcpu *vcpu) @@ -1646,7 +1711,7 @@ static void ept_update_paging_mode_cr0(unsigned long *hw_cr0,  			     (CPU_BASED_CR3_LOAD_EXITING |  			      CPU_BASED_CR3_STORE_EXITING));  		vcpu->arch.cr0 = cr0; -		vmx_set_cr4(vcpu, vcpu->arch.cr4); +		vmx_set_cr4(vcpu, kvm_read_cr4(vcpu));  	} else if (!is_paging(vcpu)) {  		/* From nonpaging to paging */  		vmcs_write32(CPU_BASED_VM_EXEC_CONTROL, @@ -1654,23 +1719,13 @@ static void ept_update_paging_mode_cr0(unsigned long *hw_cr0,  			     ~(CPU_BASED_CR3_LOAD_EXITING |  			       CPU_BASED_CR3_STORE_EXITING));  		vcpu->arch.cr0 = cr0; -		vmx_set_cr4(vcpu, vcpu->arch.cr4); +		vmx_set_cr4(vcpu, kvm_read_cr4(vcpu));  	}  	if (!(cr0 & X86_CR0_WP))  		*hw_cr0 &= ~X86_CR0_WP;  } -static void ept_update_paging_mode_cr4(unsigned long *hw_cr4, -					struct kvm_vcpu *vcpu) -{ -	if (!is_paging(vcpu)) { -		*hw_cr4 &= ~X86_CR4_PAE; -		*hw_cr4 |= X86_CR4_PSE; -	} else if (!(vcpu->arch.cr4 & X86_CR4_PAE)) -		*hw_cr4 &= ~X86_CR4_PAE; -} -  static void vmx_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0)  {  	struct vcpu_vmx *vmx = to_vmx(vcpu); @@ -1682,8 +1737,6 @@ static void vmx_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0)  	else  		hw_cr0 = (cr0 & ~KVM_GUEST_CR0_MASK) | KVM_VM_CR0_ALWAYS_ON; -	vmx_fpu_deactivate(vcpu); -  	if (vmx->rmode.vm86_active && (cr0 & X86_CR0_PE))  		enter_pmode(vcpu); @@ -1691,7 +1744,7 @@ static void vmx_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0)  		enter_rmode(vcpu);  #ifdef CONFIG_X86_64 -	if (vcpu->arch.shadow_efer & EFER_LME) { +	if (vcpu->arch.efer & EFER_LME) {  		if (!is_paging(vcpu) && (cr0 & X86_CR0_PG))  			enter_lmode(vcpu);  		if (is_paging(vcpu) && !(cr0 & X86_CR0_PG)) @@ -1702,12 +1755,12 @@ static void vmx_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0)  	if (enable_ept)  		ept_update_paging_mode_cr0(&hw_cr0, cr0, vcpu); +	if (!vcpu->fpu_active) +		hw_cr0 |= X86_CR0_TS | X86_CR0_MP; +  	vmcs_writel(CR0_READ_SHADOW, cr0);  	vmcs_writel(GUEST_CR0, hw_cr0);  	vcpu->arch.cr0 = cr0; - -	if (!(cr0 & X86_CR0_TS) || !(cr0 & X86_CR0_PE)) -		
vmx_fpu_activate(vcpu);  }  static u64 construct_eptp(unsigned long root_hpa) @@ -1738,8 +1791,6 @@ static void vmx_set_cr3(struct kvm_vcpu *vcpu, unsigned long cr3)  	vmx_flush_tlb(vcpu);  	vmcs_writel(GUEST_CR3, guest_cr3); -	if (vcpu->arch.cr0 & X86_CR0_PE) -		vmx_fpu_deactivate(vcpu);  }  static void vmx_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4) @@ -1748,8 +1799,14 @@ static void vmx_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4)  		    KVM_RMODE_VM_CR4_ALWAYS_ON : KVM_PMODE_VM_CR4_ALWAYS_ON);  	vcpu->arch.cr4 = cr4; -	if (enable_ept) -		ept_update_paging_mode_cr4(&hw_cr4, vcpu); +	if (enable_ept) { +		if (!is_paging(vcpu)) { +			hw_cr4 &= ~X86_CR4_PAE; +			hw_cr4 |= X86_CR4_PSE; +		} else if (!(cr4 & X86_CR4_PAE)) { +			hw_cr4 &= ~X86_CR4_PAE; +		} +	}  	vmcs_writel(CR4_READ_SHADOW, cr4);  	vmcs_writel(GUEST_CR4, hw_cr4); @@ -1787,7 +1844,7 @@ static void vmx_get_segment(struct kvm_vcpu *vcpu,  static int vmx_get_cpl(struct kvm_vcpu *vcpu)  { -	if (!(vcpu->arch.cr0 & X86_CR0_PE)) /* if real mode */ +	if (!is_protmode(vcpu))  		return 0;  	if (vmx_get_rflags(vcpu) & X86_EFLAGS_VM) /* if virtual 8086 */ @@ -2042,7 +2099,7 @@ static bool cs_ss_rpl_check(struct kvm_vcpu *vcpu)  static bool guest_state_valid(struct kvm_vcpu *vcpu)  {  	/* real mode guest state checks */ -	if (!(vcpu->arch.cr0 & X86_CR0_PE)) { +	if (!is_protmode(vcpu)) {  		if (!rmode_segment_valid(vcpu, VCPU_SREG_CS))  			return false;  		if (!rmode_segment_valid(vcpu, VCPU_SREG_SS)) @@ -2175,7 +2232,7 @@ static int alloc_apic_access_page(struct kvm *kvm)  	struct kvm_userspace_memory_region kvm_userspace_mem;  	int r = 0; -	down_write(&kvm->slots_lock); +	mutex_lock(&kvm->slots_lock);  	if (kvm->arch.apic_access_page)  		goto out;  	kvm_userspace_mem.slot = APIC_ACCESS_PAGE_PRIVATE_MEMSLOT; @@ -2188,7 +2245,7 @@ static int alloc_apic_access_page(struct kvm *kvm)  	kvm->arch.apic_access_page = gfn_to_page(kvm, 0xfee00);  out: -	up_write(&kvm->slots_lock); +	mutex_unlock(&kvm->slots_lock);  	return r;  } @@ -2197,7 +2254,7 @@ static int alloc_identity_pagetable(struct kvm *kvm)  	struct kvm_userspace_memory_region kvm_userspace_mem;  	int r = 0; -	down_write(&kvm->slots_lock); +	mutex_lock(&kvm->slots_lock);  	if (kvm->arch.ept_identity_pagetable)  		goto out;  	kvm_userspace_mem.slot = IDENTITY_PAGETABLE_PRIVATE_MEMSLOT; @@ -2212,7 +2269,7 @@ static int alloc_identity_pagetable(struct kvm *kvm)  	kvm->arch.ept_identity_pagetable = gfn_to_page(kvm,  			kvm->arch.ept_identity_map_addr >> PAGE_SHIFT);  out: -	up_write(&kvm->slots_lock); +	mutex_unlock(&kvm->slots_lock);  	return r;  } @@ -2384,14 +2441,12 @@ static int vmx_vcpu_setup(struct vcpu_vmx *vmx)  	for (i = 0; i < NR_VMX_MSR; ++i) {  		u32 index = vmx_msr_index[i];  		u32 data_low, data_high; -		u64 data;  		int j = vmx->nmsrs;  		if (rdmsr_safe(index, &data_low, &data_high) < 0)  			continue;  		if (wrmsr_safe(index, data_low, data_high) < 0)  			continue; -		data = data_low | ((u64)data_high << 32);  		vmx->guest_msrs[j].index = i;  		vmx->guest_msrs[j].data = 0;  		vmx->guest_msrs[j].mask = -1ull; @@ -2404,7 +2459,10 @@ static int vmx_vcpu_setup(struct vcpu_vmx *vmx)  	vmcs_write32(VM_ENTRY_CONTROLS, vmcs_config.vmentry_ctrl);  	vmcs_writel(CR0_GUEST_HOST_MASK, ~0UL); -	vmcs_writel(CR4_GUEST_HOST_MASK, KVM_GUEST_CR4_MASK); +	vmx->vcpu.arch.cr4_guest_owned_bits = KVM_CR4_GUEST_OWNED_BITS; +	if (enable_ept) +		vmx->vcpu.arch.cr4_guest_owned_bits |= X86_CR4_PGE; +	vmcs_writel(CR4_GUEST_HOST_MASK, ~vmx->vcpu.arch.cr4_guest_owned_bits);  	tsc_base = 
vmx->vcpu.kvm->arch.vm_init_tsc;  	rdtscll(tsc_this); @@ -2429,10 +2487,10 @@ static int vmx_vcpu_reset(struct kvm_vcpu *vcpu)  {  	struct vcpu_vmx *vmx = to_vmx(vcpu);  	u64 msr; -	int ret; +	int ret, idx;  	vcpu->arch.regs_avail = ~((1 << VCPU_REGS_RIP) | (1 << VCPU_REGS_RSP)); -	down_read(&vcpu->kvm->slots_lock); +	idx = srcu_read_lock(&vcpu->kvm->srcu);  	if (!init_rmode(vmx->vcpu.kvm)) {  		ret = -ENOMEM;  		goto out; @@ -2526,7 +2584,7 @@ static int vmx_vcpu_reset(struct kvm_vcpu *vcpu)  		vmcs_write16(VIRTUAL_PROCESSOR_ID, vmx->vpid);  	vmx->vcpu.arch.cr0 = X86_CR0_NW | X86_CR0_CD | X86_CR0_ET; -	vmx_set_cr0(&vmx->vcpu, vmx->vcpu.arch.cr0); /* enter rmode */ +	vmx_set_cr0(&vmx->vcpu, kvm_read_cr0(vcpu)); /* enter rmode */  	vmx_set_cr4(&vmx->vcpu, 0);  	vmx_set_efer(&vmx->vcpu, 0);  	vmx_fpu_activate(&vmx->vcpu); @@ -2540,7 +2598,7 @@ static int vmx_vcpu_reset(struct kvm_vcpu *vcpu)  	vmx->emulation_required = 0;  out: -	up_read(&vcpu->kvm->slots_lock); +	srcu_read_unlock(&vcpu->kvm->srcu, idx);  	return ret;  } @@ -2717,6 +2775,12 @@ static int handle_rmode_exception(struct kvm_vcpu *vcpu,  		kvm_queue_exception(vcpu, vec);  		return 1;  	case BP_VECTOR: +		/* +		 * Update instruction length as we may reinject the exception +		 * from user space while in guest debugging mode. +		 */ +		to_vmx(vcpu)->vcpu.arch.event_exit_inst_len = +			vmcs_read32(VM_EXIT_INSTRUCTION_LEN);  		if (vcpu->guest_debug & KVM_GUESTDBG_USE_SW_BP)  			return 0;  		/* fall through */ @@ -2839,6 +2903,13 @@ static int handle_exception(struct kvm_vcpu *vcpu)  		kvm_run->debug.arch.dr7 = vmcs_readl(GUEST_DR7);  		/* fall through */  	case BP_VECTOR: +		/* +		 * Update instruction length as we may reinject #BP from +		 * user space while in guest debugging mode. Reading it for +		 * #DB as well causes no harm, it is not used in that case. +		 */ +		vmx->vcpu.arch.event_exit_inst_len = +			vmcs_read32(VM_EXIT_INSTRUCTION_LEN);  		kvm_run->exit_reason = KVM_EXIT_DEBUG;  		kvm_run->debug.arch.pc = vmcs_readl(GUEST_CS_BASE) + rip;  		kvm_run->debug.arch.exception = ex_no; @@ -2940,11 +3011,10 @@ static int handle_cr(struct kvm_vcpu *vcpu)  		};  		break;  	case 2: /* clts */ -		vmx_fpu_deactivate(vcpu); -		vcpu->arch.cr0 &= ~X86_CR0_TS; -		vmcs_writel(CR0_READ_SHADOW, vcpu->arch.cr0); -		vmx_fpu_activate(vcpu); +		vmx_set_cr0(vcpu, kvm_read_cr0_bits(vcpu, ~X86_CR0_TS)); +		trace_kvm_cr_write(0, kvm_read_cr0(vcpu));  		skip_emulated_instruction(vcpu); +		vmx_fpu_activate(vcpu);  		return 1;  	case 1: /*mov from cr*/  		switch (cr) { @@ -2962,7 +3032,9 @@ static int handle_cr(struct kvm_vcpu *vcpu)  		}  		break;  	case 3: /* lmsw */ -		kvm_lmsw(vcpu, (exit_qualification >> LMSW_SOURCE_DATA_SHIFT) & 0x0f); +		val = (exit_qualification >> LMSW_SOURCE_DATA_SHIFT) & 0x0f; +		trace_kvm_cr_write(0, (kvm_read_cr0(vcpu) & ~0xful) | val); +		kvm_lmsw(vcpu, val);  		skip_emulated_instruction(vcpu);  		return 1; @@ -2975,12 +3047,22 @@ static int handle_cr(struct kvm_vcpu *vcpu)  	return 0;  } +static int check_dr_alias(struct kvm_vcpu *vcpu) +{ +	if (kvm_read_cr4_bits(vcpu, X86_CR4_DE)) { +		kvm_queue_exception(vcpu, UD_VECTOR); +		return -1; +	} +	return 0; +} +  static int handle_dr(struct kvm_vcpu *vcpu)  {  	unsigned long exit_qualification;  	unsigned long val;  	int dr, reg; +	/* Do not handle if the CPL > 0, will trigger GP on re-entry */  	if (!kvm_require_cpl(vcpu, 0))  		return 1;  	dr = vmcs_readl(GUEST_DR7); @@ -3016,14 +3098,20 @@ static int handle_dr(struct kvm_vcpu *vcpu)  		case 0 ... 
3:  			val = vcpu->arch.db[dr];  			break; +		case 4: +			if (check_dr_alias(vcpu) < 0) +				return 1; +			/* fall through */  		case 6:  			val = vcpu->arch.dr6;  			break; -		case 7: +		case 5: +			if (check_dr_alias(vcpu) < 0) +				return 1; +			/* fall through */ +		default: /* 7 */  			val = vcpu->arch.dr7;  			break; -		default: -			val = 0;  		}  		kvm_register_write(vcpu, reg, val);  	} else { @@ -3034,21 +3122,25 @@ static int handle_dr(struct kvm_vcpu *vcpu)  			if (!(vcpu->guest_debug & KVM_GUESTDBG_USE_HW_BP))  				vcpu->arch.eff_db[dr] = val;  			break; -		case 4 ... 5: -			if (vcpu->arch.cr4 & X86_CR4_DE) -				kvm_queue_exception(vcpu, UD_VECTOR); -			break; +		case 4: +			if (check_dr_alias(vcpu) < 0) +				return 1; +			/* fall through */  		case 6:  			if (val & 0xffffffff00000000ULL) { -				kvm_queue_exception(vcpu, GP_VECTOR); -				break; +				kvm_inject_gp(vcpu, 0); +				return 1;  			}  			vcpu->arch.dr6 = (val & DR6_VOLATILE) | DR6_FIXED_1;  			break; -		case 7: +		case 5: +			if (check_dr_alias(vcpu) < 0) +				return 1; +			/* fall through */ +		default: /* 7 */  			if (val & 0xffffffff00000000ULL) { -				kvm_queue_exception(vcpu, GP_VECTOR); -				break; +				kvm_inject_gp(vcpu, 0); +				return 1;  			}  			vcpu->arch.dr7 = (val & DR7_VOLATILE) | DR7_FIXED_1;  			if (!(vcpu->guest_debug & KVM_GUESTDBG_USE_HW_BP)) { @@ -3075,6 +3167,7 @@ static int handle_rdmsr(struct kvm_vcpu *vcpu)  	u64 data;  	if (vmx_get_msr(vcpu, ecx, &data)) { +		trace_kvm_msr_read_ex(ecx);  		kvm_inject_gp(vcpu, 0);  		return 1;  	} @@ -3094,13 +3187,13 @@ static int handle_wrmsr(struct kvm_vcpu *vcpu)  	u64 data = (vcpu->arch.regs[VCPU_REGS_RAX] & -1u)  		| ((u64)(vcpu->arch.regs[VCPU_REGS_RDX] & -1u) << 32); -	trace_kvm_msr_write(ecx, data); -  	if (vmx_set_msr(vcpu, ecx, data) != 0) { +		trace_kvm_msr_write_ex(ecx, data);  		kvm_inject_gp(vcpu, 0);  		return 1;  	} +	trace_kvm_msr_write(ecx, data);  	skip_emulated_instruction(vcpu);  	return 1;  } @@ -3385,7 +3478,6 @@ static int handle_invalid_guest_state(struct kvm_vcpu *vcpu)  		}  		if (err != EMULATE_DONE) { -			kvm_report_emulation_failure(vcpu, "emulation failure");  			vcpu->run->exit_reason = KVM_EXIT_INTERNAL_ERROR;  			vcpu->run->internal.suberror = KVM_INTERNAL_ERROR_EMULATION;  			vcpu->run->internal.ndata = 0; @@ -3416,6 +3508,12 @@ static int handle_pause(struct kvm_vcpu *vcpu)  	return 1;  } +static int handle_invalid_op(struct kvm_vcpu *vcpu) +{ +	kvm_queue_exception(vcpu, UD_VECTOR); +	return 1; +} +  /*   * The exit handlers return 1 if the exit was handled fully and guest execution   * may resume.  
Otherwise they set the kvm_run parameter to indicate what needs @@ -3453,6 +3551,8 @@ static int (*kvm_vmx_exit_handlers[])(struct kvm_vcpu *vcpu) = {  	[EXIT_REASON_EPT_VIOLATION]	      = handle_ept_violation,  	[EXIT_REASON_EPT_MISCONFIG]           = handle_ept_misconfig,  	[EXIT_REASON_PAUSE_INSTRUCTION]       = handle_pause, +	[EXIT_REASON_MWAIT_INSTRUCTION]	      = handle_invalid_op, +	[EXIT_REASON_MONITOR_INSTRUCTION]     = handle_invalid_op,  };  static const int kvm_vmx_max_exit_handlers = @@ -3686,9 +3786,6 @@ static void vmx_vcpu_run(struct kvm_vcpu *vcpu)  	 */  	vmcs_writel(HOST_CR0, read_cr0()); -	if (vcpu->arch.switch_db_regs) -		set_debugreg(vcpu->arch.dr6, 6); -  	asm(  		/* Store host registers */  		"push %%"R"dx; push %%"R"bp;" @@ -3789,9 +3886,6 @@ static void vmx_vcpu_run(struct kvm_vcpu *vcpu)  				  | (1 << VCPU_EXREG_PDPTR));  	vcpu->arch.regs_dirty = 0; -	if (vcpu->arch.switch_db_regs) -		get_debugreg(vcpu->arch.dr6, 6); -  	vmx->idt_vectoring_info = vmcs_read32(IDT_VECTORING_INFO_FIELD);  	if (vmx->rmode.irq.pending)  		fixup_rmode_irq(vmx); @@ -3920,7 +4014,7 @@ static u64 vmx_get_mt_mask(struct kvm_vcpu *vcpu, gfn_t gfn, bool is_mmio)  	 *   b. VT-d with snooping control feature: snooping control feature of  	 *	VT-d engine can guarantee the cache correctness. Just set it  	 *	to WB to keep consistent with host. So the same as item 3. -	 * 3. EPT without VT-d: always map as WB and set IGMT=1 to keep +	 * 3. EPT without VT-d: always map as WB and set IPAT=1 to keep  	 *    consistent with host MTRR  	 */  	if (is_mmio) @@ -3931,37 +4025,88 @@ static u64 vmx_get_mt_mask(struct kvm_vcpu *vcpu, gfn_t gfn, bool is_mmio)  		      VMX_EPT_MT_EPTE_SHIFT;  	else  		ret = (MTRR_TYPE_WRBACK << VMX_EPT_MT_EPTE_SHIFT) -			| VMX_EPT_IGMT_BIT; +			| VMX_EPT_IPAT_BIT;  	return ret;  } +#define _ER(x) { EXIT_REASON_##x, #x } +  static const struct trace_print_flags vmx_exit_reasons_str[] = { -	{ EXIT_REASON_EXCEPTION_NMI,           "exception" }, -	{ EXIT_REASON_EXTERNAL_INTERRUPT,      "ext_irq" }, -	{ EXIT_REASON_TRIPLE_FAULT,            "triple_fault" }, -	{ EXIT_REASON_NMI_WINDOW,              "nmi_window" }, -	{ EXIT_REASON_IO_INSTRUCTION,          "io_instruction" }, -	{ EXIT_REASON_CR_ACCESS,               "cr_access" }, -	{ EXIT_REASON_DR_ACCESS,               "dr_access" }, -	{ EXIT_REASON_CPUID,                   "cpuid" }, -	{ EXIT_REASON_MSR_READ,                "rdmsr" }, -	{ EXIT_REASON_MSR_WRITE,               "wrmsr" }, -	{ EXIT_REASON_PENDING_INTERRUPT,       "interrupt_window" }, -	{ EXIT_REASON_HLT,                     "halt" }, -	{ EXIT_REASON_INVLPG,                  "invlpg" }, -	{ EXIT_REASON_VMCALL,                  "hypercall" }, -	{ EXIT_REASON_TPR_BELOW_THRESHOLD,     "tpr_below_thres" }, -	{ EXIT_REASON_APIC_ACCESS,             "apic_access" }, -	{ EXIT_REASON_WBINVD,                  "wbinvd" }, -	{ EXIT_REASON_TASK_SWITCH,             "task_switch" }, -	{ EXIT_REASON_EPT_VIOLATION,           "ept_violation" }, +	_ER(EXCEPTION_NMI), +	_ER(EXTERNAL_INTERRUPT), +	_ER(TRIPLE_FAULT), +	_ER(PENDING_INTERRUPT), +	_ER(NMI_WINDOW), +	_ER(TASK_SWITCH), +	_ER(CPUID), +	_ER(HLT), +	_ER(INVLPG), +	_ER(RDPMC), +	_ER(RDTSC), +	_ER(VMCALL), +	_ER(VMCLEAR), +	_ER(VMLAUNCH), +	_ER(VMPTRLD), +	_ER(VMPTRST), +	_ER(VMREAD), +	_ER(VMRESUME), +	_ER(VMWRITE), +	_ER(VMOFF), +	_ER(VMON), +	_ER(CR_ACCESS), +	_ER(DR_ACCESS), +	_ER(IO_INSTRUCTION), +	_ER(MSR_READ), +	_ER(MSR_WRITE), +	_ER(MWAIT_INSTRUCTION), +	_ER(MONITOR_INSTRUCTION), +	_ER(PAUSE_INSTRUCTION), +	
_ER(MCE_DURING_VMENTRY), +	_ER(TPR_BELOW_THRESHOLD), +	_ER(APIC_ACCESS), +	_ER(EPT_VIOLATION), +	_ER(EPT_MISCONFIG), +	_ER(WBINVD),  	{ -1, NULL }  }; -static bool vmx_gb_page_enable(void) +#undef _ER + +static int vmx_get_lpage_level(void) +{ +	if (enable_ept && !cpu_has_vmx_ept_1g_page()) +		return PT_DIRECTORY_LEVEL; +	else +		/* For shadow and EPT supported 1GB page */ +		return PT_PDPE_LEVEL; +} + +static inline u32 bit(int bitno) +{ +	return 1 << (bitno & 31); +} + +static void vmx_cpuid_update(struct kvm_vcpu *vcpu)  { -	return false; +	struct kvm_cpuid_entry2 *best; +	struct vcpu_vmx *vmx = to_vmx(vcpu); +	u32 exec_control; + +	vmx->rdtscp_enabled = false; +	if (vmx_rdtscp_supported()) { +		exec_control = vmcs_read32(SECONDARY_VM_EXEC_CONTROL); +		if (exec_control & SECONDARY_EXEC_RDTSCP) { +			best = kvm_find_cpuid_entry(vcpu, 0x80000001, 0); +			if (best && (best->edx & bit(X86_FEATURE_RDTSCP))) +				vmx->rdtscp_enabled = true; +			else { +				exec_control &= ~SECONDARY_EXEC_RDTSCP; +				vmcs_write32(SECONDARY_VM_EXEC_CONTROL, +						exec_control); +			} +		} +	}  }  static struct kvm_x86_ops vmx_x86_ops = { @@ -3990,6 +4135,7 @@ static struct kvm_x86_ops vmx_x86_ops = {  	.set_segment = vmx_set_segment,  	.get_cpl = vmx_get_cpl,  	.get_cs_db_l_bits = vmx_get_cs_db_l_bits, +	.decache_cr0_guest_bits = vmx_decache_cr0_guest_bits,  	.decache_cr4_guest_bits = vmx_decache_cr4_guest_bits,  	.set_cr0 = vmx_set_cr0,  	.set_cr3 = vmx_set_cr3, @@ -4002,6 +4148,8 @@ static struct kvm_x86_ops vmx_x86_ops = {  	.cache_reg = vmx_cache_reg,  	.get_rflags = vmx_get_rflags,  	.set_rflags = vmx_set_rflags, +	.fpu_activate = vmx_fpu_activate, +	.fpu_deactivate = vmx_fpu_deactivate,  	.tlb_flush = vmx_flush_tlb, @@ -4027,7 +4175,11 @@ static struct kvm_x86_ops vmx_x86_ops = {  	.get_mt_mask = vmx_get_mt_mask,  	.exit_reasons_str = vmx_exit_reasons_str, -	.gb_page_enable = vmx_gb_page_enable, +	.get_lpage_level = vmx_get_lpage_level, + +	.cpuid_update = vmx_cpuid_update, + +	.rdtscp_supported = vmx_rdtscp_supported,  };  static int __init vmx_init(void) diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index a1e1bc9d412d..e46282a56565 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -38,6 +38,7 @@  #include <linux/intel-iommu.h>  #include <linux/cpufreq.h>  #include <linux/user-return-notifier.h> +#include <linux/srcu.h>  #include <trace/events/kvm.h>  #undef TRACE_INCLUDE_FILE  #define CREATE_TRACE_POINTS @@ -93,16 +94,16 @@ module_param_named(ignore_msrs, ignore_msrs, bool, S_IRUGO | S_IWUSR);  struct kvm_shared_msrs_global {  	int nr; -	struct kvm_shared_msr { -		u32 msr; -		u64 value; -	} msrs[KVM_NR_SHARED_MSRS]; +	u32 msrs[KVM_NR_SHARED_MSRS];  };  struct kvm_shared_msrs {  	struct user_return_notifier urn;  	bool registered; -	u64 current_value[KVM_NR_SHARED_MSRS]; +	struct kvm_shared_msr_values { +		u64 host; +		u64 curr; +	} values[KVM_NR_SHARED_MSRS];  };  static struct kvm_shared_msrs_global __read_mostly shared_msrs_global; @@ -147,53 +148,64 @@ struct kvm_stats_debugfs_item debugfs_entries[] = {  static void kvm_on_user_return(struct user_return_notifier *urn)  {  	unsigned slot; -	struct kvm_shared_msr *global;  	struct kvm_shared_msrs *locals  		= container_of(urn, struct kvm_shared_msrs, urn); +	struct kvm_shared_msr_values *values;  	for (slot = 0; slot < shared_msrs_global.nr; ++slot) { -		global = &shared_msrs_global.msrs[slot]; -		if (global->value != locals->current_value[slot]) { -			wrmsrl(global->msr, global->value); -			locals->current_value[slot] = 
global->value; +		values = &locals->values[slot]; +		if (values->host != values->curr) { +			wrmsrl(shared_msrs_global.msrs[slot], values->host); +			values->curr = values->host;  		}  	}  	locals->registered = false;  	user_return_notifier_unregister(urn);  } -void kvm_define_shared_msr(unsigned slot, u32 msr) +static void shared_msr_update(unsigned slot, u32 msr)  { -	int cpu; +	struct kvm_shared_msrs *smsr;  	u64 value; +	smsr = &__get_cpu_var(shared_msrs); +	/* only read, and nobody should modify it at this time, +	 * so don't need lock */ +	if (slot >= shared_msrs_global.nr) { +		printk(KERN_ERR "kvm: invalid MSR slot!"); +		return; +	} +	rdmsrl_safe(msr, &value); +	smsr->values[slot].host = value; +	smsr->values[slot].curr = value; +} + +void kvm_define_shared_msr(unsigned slot, u32 msr) +{  	if (slot >= shared_msrs_global.nr)  		shared_msrs_global.nr = slot + 1; -	shared_msrs_global.msrs[slot].msr = msr; -	rdmsrl_safe(msr, &value); -	shared_msrs_global.msrs[slot].value = value; -	for_each_online_cpu(cpu) -		per_cpu(shared_msrs, cpu).current_value[slot] = value; +	shared_msrs_global.msrs[slot] = msr; +	/* we need ensured the shared_msr_global have been updated */ +	smp_wmb();  }  EXPORT_SYMBOL_GPL(kvm_define_shared_msr);  static void kvm_shared_msr_cpu_online(void)  {  	unsigned i; -	struct kvm_shared_msrs *locals = &__get_cpu_var(shared_msrs);  	for (i = 0; i < shared_msrs_global.nr; ++i) -		locals->current_value[i] = shared_msrs_global.msrs[i].value; +		shared_msr_update(i, shared_msrs_global.msrs[i]);  }  void kvm_set_shared_msr(unsigned slot, u64 value, u64 mask)  {  	struct kvm_shared_msrs *smsr = &__get_cpu_var(shared_msrs); -	if (((value ^ smsr->current_value[slot]) & mask) == 0) +	if (((value ^ smsr->values[slot].curr) & mask) == 0)  		return; -	smsr->current_value[slot] = value; -	wrmsrl(shared_msrs_global.msrs[slot].msr, value); +	smsr->values[slot].curr = value; +	wrmsrl(shared_msrs_global.msrs[slot], value);  	if (!smsr->registered) {  		smsr->urn.on_user_return = kvm_on_user_return;  		user_return_notifier_register(&smsr->urn); @@ -257,12 +269,68 @@ void kvm_set_apic_base(struct kvm_vcpu *vcpu, u64 data)  }  EXPORT_SYMBOL_GPL(kvm_set_apic_base); +#define EXCPT_BENIGN		0 +#define EXCPT_CONTRIBUTORY	1 +#define EXCPT_PF		2 + +static int exception_class(int vector) +{ +	switch (vector) { +	case PF_VECTOR: +		return EXCPT_PF; +	case DE_VECTOR: +	case TS_VECTOR: +	case NP_VECTOR: +	case SS_VECTOR: +	case GP_VECTOR: +		return EXCPT_CONTRIBUTORY; +	default: +		break; +	} +	return EXCPT_BENIGN; +} + +static void kvm_multiple_exception(struct kvm_vcpu *vcpu, +		unsigned nr, bool has_error, u32 error_code) +{ +	u32 prev_nr; +	int class1, class2; + +	if (!vcpu->arch.exception.pending) { +	queue: +		vcpu->arch.exception.pending = true; +		vcpu->arch.exception.has_error_code = has_error; +		vcpu->arch.exception.nr = nr; +		vcpu->arch.exception.error_code = error_code; +		return; +	} + +	/* to check exception */ +	prev_nr = vcpu->arch.exception.nr; +	if (prev_nr == DF_VECTOR) { +		/* triple fault -> shutdown */ +		set_bit(KVM_REQ_TRIPLE_FAULT, &vcpu->requests); +		return; +	} +	class1 = exception_class(prev_nr); +	class2 = exception_class(nr); +	if ((class1 == EXCPT_CONTRIBUTORY && class2 == EXCPT_CONTRIBUTORY) +		|| (class1 == EXCPT_PF && class2 != EXCPT_BENIGN)) { +		/* generate double fault per SDM Table 5-5 */ +		vcpu->arch.exception.pending = true; +		vcpu->arch.exception.has_error_code = true; +		vcpu->arch.exception.nr = DF_VECTOR; +		vcpu->arch.exception.error_code = 0; +	} 
else +		/* replace previous exception with a new one in a hope +		   that instruction re-execution will regenerate lost +		   exception */ +		goto queue; +} +  void kvm_queue_exception(struct kvm_vcpu *vcpu, unsigned nr)  { -	WARN_ON(vcpu->arch.exception.pending); -	vcpu->arch.exception.pending = true; -	vcpu->arch.exception.has_error_code = false; -	vcpu->arch.exception.nr = nr; +	kvm_multiple_exception(vcpu, nr, false, 0);  }  EXPORT_SYMBOL_GPL(kvm_queue_exception); @@ -270,25 +338,6 @@ void kvm_inject_page_fault(struct kvm_vcpu *vcpu, unsigned long addr,  			   u32 error_code)  {  	++vcpu->stat.pf_guest; - -	if (vcpu->arch.exception.pending) { -		switch(vcpu->arch.exception.nr) { -		case DF_VECTOR: -			/* triple fault -> shutdown */ -			set_bit(KVM_REQ_TRIPLE_FAULT, &vcpu->requests); -			return; -		case PF_VECTOR: -			vcpu->arch.exception.nr = DF_VECTOR; -			vcpu->arch.exception.error_code = 0; -			return; -		default: -			/* replace previous exception with a new one in a hope -			   that instruction re-execution will regenerate lost -			   exception */ -			vcpu->arch.exception.pending = false; -			break; -		} -	}  	vcpu->arch.cr2 = addr;  	kvm_queue_exception_e(vcpu, PF_VECTOR, error_code);  } @@ -301,11 +350,7 @@ EXPORT_SYMBOL_GPL(kvm_inject_nmi);  void kvm_queue_exception_e(struct kvm_vcpu *vcpu, unsigned nr, u32 error_code)  { -	WARN_ON(vcpu->arch.exception.pending); -	vcpu->arch.exception.pending = true; -	vcpu->arch.exception.has_error_code = true; -	vcpu->arch.exception.nr = nr; -	vcpu->arch.exception.error_code = error_code; +	kvm_multiple_exception(vcpu, nr, true, error_code);  }  EXPORT_SYMBOL_GPL(kvm_queue_exception_e); @@ -383,12 +428,18 @@ out:  void kvm_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0)  { -	if (cr0 & CR0_RESERVED_BITS) { +	cr0 |= X86_CR0_ET; + +#ifdef CONFIG_X86_64 +	if (cr0 & 0xffffffff00000000UL) {  		printk(KERN_DEBUG "set_cr0: 0x%lx #GP, reserved bits 0x%lx\n", -		       cr0, vcpu->arch.cr0); +		       cr0, kvm_read_cr0(vcpu));  		kvm_inject_gp(vcpu, 0);  		return;  	} +#endif + +	cr0 &= ~CR0_RESERVED_BITS;  	if ((cr0 & X86_CR0_NW) && !(cr0 & X86_CR0_CD)) {  		printk(KERN_DEBUG "set_cr0: #GP, CD == 0 && NW == 1\n"); @@ -405,7 +456,7 @@ void kvm_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0)  	if (!is_paging(vcpu) && (cr0 & X86_CR0_PG)) {  #ifdef CONFIG_X86_64 -		if ((vcpu->arch.shadow_efer & EFER_LME)) { +		if ((vcpu->arch.efer & EFER_LME)) {  			int cs_db, cs_l;  			if (!is_pae(vcpu)) { @@ -443,13 +494,13 @@ EXPORT_SYMBOL_GPL(kvm_set_cr0);  void kvm_lmsw(struct kvm_vcpu *vcpu, unsigned long msw)  { -	kvm_set_cr0(vcpu, (vcpu->arch.cr0 & ~0x0ful) | (msw & 0x0f)); +	kvm_set_cr0(vcpu, kvm_read_cr0_bits(vcpu, ~0x0ful) | (msw & 0x0f));  }  EXPORT_SYMBOL_GPL(kvm_lmsw);  void kvm_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4)  { -	unsigned long old_cr4 = vcpu->arch.cr4; +	unsigned long old_cr4 = kvm_read_cr4(vcpu);  	unsigned long pdptr_bits = X86_CR4_PGE | X86_CR4_PSE | X86_CR4_PAE;  	if (cr4 & CR4_RESERVED_BITS) { @@ -575,9 +626,11 @@ static inline u32 bit(int bitno)   * kvm-specific. Those are put in the beginning of the list.   
*/ -#define KVM_SAVE_MSRS_BEGIN	2 +#define KVM_SAVE_MSRS_BEGIN	5  static u32 msrs_to_save[] = {  	MSR_KVM_SYSTEM_TIME, MSR_KVM_WALL_CLOCK, +	HV_X64_MSR_GUEST_OS_ID, HV_X64_MSR_HYPERCALL, +	HV_X64_MSR_APIC_ASSIST_PAGE,  	MSR_IA32_SYSENTER_CS, MSR_IA32_SYSENTER_ESP, MSR_IA32_SYSENTER_EIP,  	MSR_K6_STAR,  #ifdef CONFIG_X86_64 @@ -602,7 +655,7 @@ static void set_efer(struct kvm_vcpu *vcpu, u64 efer)  	}  	if (is_paging(vcpu) -	    && (vcpu->arch.shadow_efer & EFER_LME) != (efer & EFER_LME)) { +	    && (vcpu->arch.efer & EFER_LME) != (efer & EFER_LME)) {  		printk(KERN_DEBUG "set_efer: #GP, change LME while paging\n");  		kvm_inject_gp(vcpu, 0);  		return; @@ -633,9 +686,9 @@ static void set_efer(struct kvm_vcpu *vcpu, u64 efer)  	kvm_x86_ops->set_efer(vcpu, efer);  	efer &= ~EFER_LMA; -	efer |= vcpu->arch.shadow_efer & EFER_LMA; +	efer |= vcpu->arch.efer & EFER_LMA; -	vcpu->arch.shadow_efer = efer; +	vcpu->arch.efer = efer;  	vcpu->arch.mmu.base_role.nxe = (efer & EFER_NX) && !tdp_enabled;  	kvm_mmu_reset_context(vcpu); @@ -957,6 +1010,100 @@ out:  	return r;  } +static bool kvm_hv_hypercall_enabled(struct kvm *kvm) +{ +	return kvm->arch.hv_hypercall & HV_X64_MSR_HYPERCALL_ENABLE; +} + +static bool kvm_hv_msr_partition_wide(u32 msr) +{ +	bool r = false; +	switch (msr) { +	case HV_X64_MSR_GUEST_OS_ID: +	case HV_X64_MSR_HYPERCALL: +		r = true; +		break; +	} + +	return r; +} + +static int set_msr_hyperv_pw(struct kvm_vcpu *vcpu, u32 msr, u64 data) +{ +	struct kvm *kvm = vcpu->kvm; + +	switch (msr) { +	case HV_X64_MSR_GUEST_OS_ID: +		kvm->arch.hv_guest_os_id = data; +		/* setting guest os id to zero disables hypercall page */ +		if (!kvm->arch.hv_guest_os_id) +			kvm->arch.hv_hypercall &= ~HV_X64_MSR_HYPERCALL_ENABLE; +		break; +	case HV_X64_MSR_HYPERCALL: { +		u64 gfn; +		unsigned long addr; +		u8 instructions[4]; + +		/* if guest os id is not set hypercall should remain disabled */ +		if (!kvm->arch.hv_guest_os_id) +			break; +		if (!(data & HV_X64_MSR_HYPERCALL_ENABLE)) { +			kvm->arch.hv_hypercall = data; +			break; +		} +		gfn = data >> HV_X64_MSR_HYPERCALL_PAGE_ADDRESS_SHIFT; +		addr = gfn_to_hva(kvm, gfn); +		if (kvm_is_error_hva(addr)) +			return 1; +		kvm_x86_ops->patch_hypercall(vcpu, instructions); +		((unsigned char *)instructions)[3] = 0xc3; /* ret */ +		if (copy_to_user((void __user *)addr, instructions, 4)) +			return 1; +		kvm->arch.hv_hypercall = data; +		break; +	} +	default: +		pr_unimpl(vcpu, "HYPER-V unimplemented wrmsr: 0x%x " +			  "data 0x%llx\n", msr, data); +		return 1; +	} +	return 0; +} + +static int set_msr_hyperv(struct kvm_vcpu *vcpu, u32 msr, u64 data) +{ +	switch (msr) { +	case HV_X64_MSR_APIC_ASSIST_PAGE: { +		unsigned long addr; + +		if (!(data & HV_X64_MSR_APIC_ASSIST_PAGE_ENABLE)) { +			vcpu->arch.hv_vapic = data; +			break; +		} +		addr = gfn_to_hva(vcpu->kvm, data >> +				  HV_X64_MSR_APIC_ASSIST_PAGE_ADDRESS_SHIFT); +		if (kvm_is_error_hva(addr)) +			return 1; +		if (clear_user((void __user *)addr, PAGE_SIZE)) +			return 1; +		vcpu->arch.hv_vapic = data; +		break; +	} +	case HV_X64_MSR_EOI: +		return kvm_hv_vapic_msr_write(vcpu, APIC_EOI, data); +	case HV_X64_MSR_ICR: +		return kvm_hv_vapic_msr_write(vcpu, APIC_ICR, data); +	case HV_X64_MSR_TPR: +		return kvm_hv_vapic_msr_write(vcpu, APIC_TASKPRI, data); +	default: +		pr_unimpl(vcpu, "HYPER-V unimplemented wrmsr: 0x%x " +			  "data 0x%llx\n", msr, data); +		return 1; +	} + +	return 0; +} +  int kvm_set_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 data)  {  	switch (msr) { @@ -1071,6 +1218,16 @@ int 
kvm_set_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 data)  		pr_unimpl(vcpu, "unimplemented perfctr wrmsr: "  			"0x%x data 0x%llx\n", msr, data);  		break; +	case HV_X64_MSR_GUEST_OS_ID ... HV_X64_MSR_SINT15: +		if (kvm_hv_msr_partition_wide(msr)) { +			int r; +			mutex_lock(&vcpu->kvm->lock); +			r = set_msr_hyperv_pw(vcpu, msr, data); +			mutex_unlock(&vcpu->kvm->lock); +			return r; +		} else +			return set_msr_hyperv(vcpu, msr, data); +		break;  	default:  		if (msr && (msr == vcpu->kvm->arch.xen_hvm_config.msr))  			return xen_hvm_config(vcpu, data); @@ -1170,6 +1327,54 @@ static int get_msr_mce(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata)  	return 0;  } +static int get_msr_hyperv_pw(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata) +{ +	u64 data = 0; +	struct kvm *kvm = vcpu->kvm; + +	switch (msr) { +	case HV_X64_MSR_GUEST_OS_ID: +		data = kvm->arch.hv_guest_os_id; +		break; +	case HV_X64_MSR_HYPERCALL: +		data = kvm->arch.hv_hypercall; +		break; +	default: +		pr_unimpl(vcpu, "Hyper-V unhandled rdmsr: 0x%x\n", msr); +		return 1; +	} + +	*pdata = data; +	return 0; +} + +static int get_msr_hyperv(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata) +{ +	u64 data = 0; + +	switch (msr) { +	case HV_X64_MSR_VP_INDEX: { +		int r; +		struct kvm_vcpu *v; +		kvm_for_each_vcpu(r, v, vcpu->kvm) +			if (v == vcpu) +				data = r; +		break; +	} +	case HV_X64_MSR_EOI: +		return kvm_hv_vapic_msr_read(vcpu, APIC_EOI, pdata); +	case HV_X64_MSR_ICR: +		return kvm_hv_vapic_msr_read(vcpu, APIC_ICR, pdata); +	case HV_X64_MSR_TPR: +		return kvm_hv_vapic_msr_read(vcpu, APIC_TASKPRI, pdata); +	default: +		pr_unimpl(vcpu, "Hyper-V unhandled rdmsr: 0x%x\n", msr); +		return 1; +	} +	*pdata = data; +	return 0; +} +  int kvm_get_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata)  {  	u64 data; @@ -1221,7 +1426,7 @@ int kvm_get_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata)  		data |= (((uint64_t)4ULL) << 40);  		break;  	case MSR_EFER: -		data = vcpu->arch.shadow_efer; +		data = vcpu->arch.efer;  		break;  	case MSR_KVM_WALL_CLOCK:  		data = vcpu->kvm->arch.wall_clock; @@ -1236,6 +1441,16 @@ int kvm_get_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata)  	case MSR_IA32_MCG_STATUS:  	case MSR_IA32_MC0_CTL ... MSR_IA32_MC0_CTL + 4 * KVM_MAX_MCE_BANKS - 1:  		return get_msr_mce(vcpu, msr, pdata); +	case HV_X64_MSR_GUEST_OS_ID ... 
HV_X64_MSR_SINT15: +		if (kvm_hv_msr_partition_wide(msr)) { +			int r; +			mutex_lock(&vcpu->kvm->lock); +			r = get_msr_hyperv_pw(vcpu, msr, pdata); +			mutex_unlock(&vcpu->kvm->lock); +			return r; +		} else +			return get_msr_hyperv(vcpu, msr, pdata); +		break;  	default:  		if (!ignore_msrs) {  			pr_unimpl(vcpu, "unhandled rdmsr: 0x%x\n", msr); @@ -1261,15 +1476,15 @@ static int __msr_io(struct kvm_vcpu *vcpu, struct kvm_msrs *msrs,  		    int (*do_msr)(struct kvm_vcpu *vcpu,  				  unsigned index, u64 *data))  { -	int i; +	int i, idx;  	vcpu_load(vcpu); -	down_read(&vcpu->kvm->slots_lock); +	idx = srcu_read_lock(&vcpu->kvm->srcu);  	for (i = 0; i < msrs->nmsrs; ++i)  		if (do_msr(vcpu, entries[i].index, &entries[i].data))  			break; -	up_read(&vcpu->kvm->slots_lock); +	srcu_read_unlock(&vcpu->kvm->srcu, idx);  	vcpu_put(vcpu); @@ -1351,6 +1566,11 @@ int kvm_dev_ioctl_check_extension(long ext)  	case KVM_CAP_XEN_HVM:  	case KVM_CAP_ADJUST_CLOCK:  	case KVM_CAP_VCPU_EVENTS: +	case KVM_CAP_HYPERV: +	case KVM_CAP_HYPERV_VAPIC: +	case KVM_CAP_HYPERV_SPIN: +	case KVM_CAP_PCI_SEGMENT: +	case KVM_CAP_X86_ROBUST_SINGLESTEP:  		r = 1;  		break;  	case KVM_CAP_COALESCED_MMIO: @@ -1464,8 +1684,8 @@ void kvm_arch_vcpu_load(struct kvm_vcpu *vcpu, int cpu)  void kvm_arch_vcpu_put(struct kvm_vcpu *vcpu)  { -	kvm_x86_ops->vcpu_put(vcpu);  	kvm_put_guest_fpu(vcpu); +	kvm_x86_ops->vcpu_put(vcpu);  }  static int is_efer_nx(void) @@ -1530,6 +1750,7 @@ static int kvm_vcpu_ioctl_set_cpuid(struct kvm_vcpu *vcpu,  	cpuid_fix_nx_cap(vcpu);  	r = 0;  	kvm_apic_set_version(vcpu); +	kvm_x86_ops->cpuid_update(vcpu);  out_free:  	vfree(cpuid_entries); @@ -1552,6 +1773,7 @@ static int kvm_vcpu_ioctl_set_cpuid2(struct kvm_vcpu *vcpu,  		goto out;  	vcpu->arch.cpuid_nent = cpuid->nent;  	kvm_apic_set_version(vcpu); +	kvm_x86_ops->cpuid_update(vcpu);  	return 0;  out: @@ -1594,12 +1816,15 @@ static void do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function,  			 u32 index, int *nent, int maxnent)  {  	unsigned f_nx = is_efer_nx() ? F(NX) : 0; -	unsigned f_gbpages = kvm_x86_ops->gb_page_enable() ? F(GBPAGES) : 0;  #ifdef CONFIG_X86_64 +	unsigned f_gbpages = (kvm_x86_ops->get_lpage_level() == PT_PDPE_LEVEL) +				? F(GBPAGES) : 0;  	unsigned f_lm = F(LM);  #else +	unsigned f_gbpages = 0;  	unsigned f_lm = 0;  #endif +	unsigned f_rdtscp = kvm_x86_ops->rdtscp_supported() ? 
F(RDTSCP) : 0;  	/* cpuid 1.edx */  	const u32 kvm_supported_word0_x86_features = @@ -1619,7 +1844,7 @@ static void do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function,  		F(MTRR) | F(PGE) | F(MCA) | F(CMOV) |  		F(PAT) | F(PSE36) | 0 /* Reserved */ |  		f_nx | 0 /* Reserved */ | F(MMXEXT) | F(MMX) | -		F(FXSR) | F(FXSR_OPT) | f_gbpages | 0 /* RDTSCP */ | +		F(FXSR) | F(FXSR_OPT) | f_gbpages | f_rdtscp |  		0 /* Reserved */ | f_lm | F(3DNOWEXT) | F(3DNOW);  	/* cpuid 1.ecx */  	const u32 kvm_supported_word4_x86_features = @@ -1866,7 +2091,7 @@ static int kvm_vcpu_ioctl_x86_set_mce(struct kvm_vcpu *vcpu,  		return 0;  	if (mce->status & MCI_STATUS_UC) {  		if ((vcpu->arch.mcg_status & MCG_STATUS_MCIP) || -		    !(vcpu->arch.cr4 & X86_CR4_MCE)) { +		    !kvm_read_cr4_bits(vcpu, X86_CR4_MCE)) {  			printk(KERN_DEBUG "kvm: set_mce: "  			       "injects mce exception while "  			       "previous one is in progress!\n"); @@ -2160,14 +2385,14 @@ static int kvm_vm_ioctl_set_nr_mmu_pages(struct kvm *kvm,  	if (kvm_nr_mmu_pages < KVM_MIN_ALLOC_MMU_PAGES)  		return -EINVAL; -	down_write(&kvm->slots_lock); +	mutex_lock(&kvm->slots_lock);  	spin_lock(&kvm->mmu_lock);  	kvm_mmu_change_mmu_pages(kvm, kvm_nr_mmu_pages);  	kvm->arch.n_requested_mmu_pages = kvm_nr_mmu_pages;  	spin_unlock(&kvm->mmu_lock); -	up_write(&kvm->slots_lock); +	mutex_unlock(&kvm->slots_lock);  	return 0;  } @@ -2176,13 +2401,35 @@ static int kvm_vm_ioctl_get_nr_mmu_pages(struct kvm *kvm)  	return kvm->arch.n_alloc_mmu_pages;  } +gfn_t unalias_gfn_instantiation(struct kvm *kvm, gfn_t gfn) +{ +	int i; +	struct kvm_mem_alias *alias; +	struct kvm_mem_aliases *aliases; + +	aliases = rcu_dereference(kvm->arch.aliases); + +	for (i = 0; i < aliases->naliases; ++i) { +		alias = &aliases->aliases[i]; +		if (alias->flags & KVM_ALIAS_INVALID) +			continue; +		if (gfn >= alias->base_gfn +		    && gfn < alias->base_gfn + alias->npages) +			return alias->target_gfn + gfn - alias->base_gfn; +	} +	return gfn; +} +  gfn_t unalias_gfn(struct kvm *kvm, gfn_t gfn)  {  	int i;  	struct kvm_mem_alias *alias; +	struct kvm_mem_aliases *aliases; -	for (i = 0; i < kvm->arch.naliases; ++i) { -		alias = &kvm->arch.aliases[i]; +	aliases = rcu_dereference(kvm->arch.aliases); + +	for (i = 0; i < aliases->naliases; ++i) { +		alias = &aliases->aliases[i];  		if (gfn >= alias->base_gfn  		    && gfn < alias->base_gfn + alias->npages)  			return alias->target_gfn + gfn - alias->base_gfn; @@ -2200,6 +2447,7 @@ static int kvm_vm_ioctl_set_memory_alias(struct kvm *kvm,  {  	int r, n;  	struct kvm_mem_alias *p; +	struct kvm_mem_aliases *aliases, *old_aliases;  	r = -EINVAL;  	/* General sanity checks */ @@ -2216,26 +2464,48 @@ static int kvm_vm_ioctl_set_memory_alias(struct kvm *kvm,  	    < alias->target_phys_addr)  		goto out; -	down_write(&kvm->slots_lock); -	spin_lock(&kvm->mmu_lock); +	r = -ENOMEM; +	aliases = kzalloc(sizeof(struct kvm_mem_aliases), GFP_KERNEL); +	if (!aliases) +		goto out; + +	mutex_lock(&kvm->slots_lock); -	p = &kvm->arch.aliases[alias->slot]; +	/* invalidate any gfn reference in case of deletion/shrinking */ +	memcpy(aliases, kvm->arch.aliases, sizeof(struct kvm_mem_aliases)); +	aliases->aliases[alias->slot].flags |= KVM_ALIAS_INVALID; +	old_aliases = kvm->arch.aliases; +	rcu_assign_pointer(kvm->arch.aliases, aliases); +	synchronize_srcu_expedited(&kvm->srcu); +	kvm_mmu_zap_all(kvm); +	kfree(old_aliases); + +	r = -ENOMEM; +	aliases = kzalloc(sizeof(struct kvm_mem_aliases), GFP_KERNEL); +	if (!aliases) +		goto out_unlock; + +	
memcpy(aliases, kvm->arch.aliases, sizeof(struct kvm_mem_aliases)); + +	p = &aliases->aliases[alias->slot];  	p->base_gfn = alias->guest_phys_addr >> PAGE_SHIFT;  	p->npages = alias->memory_size >> PAGE_SHIFT;  	p->target_gfn = alias->target_phys_addr >> PAGE_SHIFT; +	p->flags &= ~(KVM_ALIAS_INVALID);  	for (n = KVM_ALIAS_SLOTS; n > 0; --n) -		if (kvm->arch.aliases[n - 1].npages) +		if (aliases->aliases[n - 1].npages)  			break; -	kvm->arch.naliases = n; +	aliases->naliases = n; -	spin_unlock(&kvm->mmu_lock); -	kvm_mmu_zap_all(kvm); - -	up_write(&kvm->slots_lock); - -	return 0; +	old_aliases = kvm->arch.aliases; +	rcu_assign_pointer(kvm->arch.aliases, aliases); +	synchronize_srcu_expedited(&kvm->srcu); +	kfree(old_aliases); +	r = 0; +out_unlock: +	mutex_unlock(&kvm->slots_lock);  out:  	return r;  } @@ -2273,18 +2543,18 @@ static int kvm_vm_ioctl_set_irqchip(struct kvm *kvm, struct kvm_irqchip *chip)  	r = 0;  	switch (chip->chip_id) {  	case KVM_IRQCHIP_PIC_MASTER: -		spin_lock(&pic_irqchip(kvm)->lock); +		raw_spin_lock(&pic_irqchip(kvm)->lock);  		memcpy(&pic_irqchip(kvm)->pics[0],  			&chip->chip.pic,  			sizeof(struct kvm_pic_state)); -		spin_unlock(&pic_irqchip(kvm)->lock); +		raw_spin_unlock(&pic_irqchip(kvm)->lock);  		break;  	case KVM_IRQCHIP_PIC_SLAVE: -		spin_lock(&pic_irqchip(kvm)->lock); +		raw_spin_lock(&pic_irqchip(kvm)->lock);  		memcpy(&pic_irqchip(kvm)->pics[1],  			&chip->chip.pic,  			sizeof(struct kvm_pic_state)); -		spin_unlock(&pic_irqchip(kvm)->lock); +		raw_spin_unlock(&pic_irqchip(kvm)->lock);  		break;  	case KVM_IRQCHIP_IOAPIC:  		r = kvm_set_ioapic(kvm, &chip->chip.ioapic); @@ -2364,29 +2634,62 @@ static int kvm_vm_ioctl_reinject(struct kvm *kvm,  int kvm_vm_ioctl_get_dirty_log(struct kvm *kvm,  				      struct kvm_dirty_log *log)  { -	int r; -	int n; +	int r, n, i;  	struct kvm_memory_slot *memslot; -	int is_dirty = 0; +	unsigned long is_dirty = 0; +	unsigned long *dirty_bitmap = NULL; -	down_write(&kvm->slots_lock); +	mutex_lock(&kvm->slots_lock); -	r = kvm_get_dirty_log(kvm, log, &is_dirty); -	if (r) +	r = -EINVAL; +	if (log->slot >= KVM_MEMORY_SLOTS) +		goto out; + +	memslot = &kvm->memslots->memslots[log->slot]; +	r = -ENOENT; +	if (!memslot->dirty_bitmap) +		goto out; + +	n = ALIGN(memslot->npages, BITS_PER_LONG) / 8; + +	r = -ENOMEM; +	dirty_bitmap = vmalloc(n); +	if (!dirty_bitmap)  		goto out; +	memset(dirty_bitmap, 0, n); + +	for (i = 0; !is_dirty && i < n/sizeof(long); i++) +		is_dirty = memslot->dirty_bitmap[i];  	/* If nothing is dirty, don't bother messing with page tables. 
*/  	if (is_dirty) { +		struct kvm_memslots *slots, *old_slots; +  		spin_lock(&kvm->mmu_lock);  		kvm_mmu_slot_remove_write_access(kvm, log->slot);  		spin_unlock(&kvm->mmu_lock); -		memslot = &kvm->memslots[log->slot]; -		n = ALIGN(memslot->npages, BITS_PER_LONG) / 8; -		memset(memslot->dirty_bitmap, 0, n); + +		slots = kzalloc(sizeof(struct kvm_memslots), GFP_KERNEL); +		if (!slots) +			goto out_free; + +		memcpy(slots, kvm->memslots, sizeof(struct kvm_memslots)); +		slots->memslots[log->slot].dirty_bitmap = dirty_bitmap; + +		old_slots = kvm->memslots; +		rcu_assign_pointer(kvm->memslots, slots); +		synchronize_srcu_expedited(&kvm->srcu); +		dirty_bitmap = old_slots->memslots[log->slot].dirty_bitmap; +		kfree(old_slots);  	} +  	r = 0; +	if (copy_to_user(log->dirty_bitmap, dirty_bitmap, n)) +		r = -EFAULT; +out_free: +	vfree(dirty_bitmap);  out: -	up_write(&kvm->slots_lock); +	mutex_unlock(&kvm->slots_lock);  	return r;  } @@ -2469,6 +2772,8 @@ long kvm_arch_vm_ioctl(struct file *filp,  		if (vpic) {  			r = kvm_ioapic_init(kvm);  			if (r) { +				kvm_io_bus_unregister_dev(kvm, KVM_PIO_BUS, +							  &vpic->dev);  				kfree(vpic);  				goto create_irqchip_unlock;  			} @@ -2480,10 +2785,8 @@ long kvm_arch_vm_ioctl(struct file *filp,  		r = kvm_setup_default_irq_routing(kvm);  		if (r) {  			mutex_lock(&kvm->irq_lock); -			kfree(kvm->arch.vpic); -			kfree(kvm->arch.vioapic); -			kvm->arch.vpic = NULL; -			kvm->arch.vioapic = NULL; +			kvm_ioapic_destroy(kvm); +			kvm_destroy_pic(kvm);  			mutex_unlock(&kvm->irq_lock);  		}  	create_irqchip_unlock: @@ -2499,7 +2802,7 @@ long kvm_arch_vm_ioctl(struct file *filp,  				   sizeof(struct kvm_pit_config)))  			goto out;  	create_pit: -		down_write(&kvm->slots_lock); +		mutex_lock(&kvm->slots_lock);  		r = -EEXIST;  		if (kvm->arch.vpit)  			goto create_pit_unlock; @@ -2508,7 +2811,7 @@ long kvm_arch_vm_ioctl(struct file *filp,  		if (kvm->arch.vpit)  			r = 0;  	create_pit_unlock: -		up_write(&kvm->slots_lock); +		mutex_unlock(&kvm->slots_lock);  		break;  	case KVM_IRQ_LINE_STATUS:  	case KVM_IRQ_LINE: { @@ -2725,7 +3028,7 @@ static int vcpu_mmio_write(struct kvm_vcpu *vcpu, gpa_t addr, int len,  	    !kvm_iodevice_write(&vcpu->arch.apic->dev, addr, len, v))  		return 0; -	return kvm_io_bus_write(&vcpu->kvm->mmio_bus, addr, len, v); +	return kvm_io_bus_write(vcpu->kvm, KVM_MMIO_BUS, addr, len, v);  }  static int vcpu_mmio_read(struct kvm_vcpu *vcpu, gpa_t addr, int len, void *v) @@ -2734,17 +3037,44 @@ static int vcpu_mmio_read(struct kvm_vcpu *vcpu, gpa_t addr, int len, void *v)  	    !kvm_iodevice_read(&vcpu->arch.apic->dev, addr, len, v))  		return 0; -	return kvm_io_bus_read(&vcpu->kvm->mmio_bus, addr, len, v); +	return kvm_io_bus_read(vcpu->kvm, KVM_MMIO_BUS, addr, len, v);  } -static int kvm_read_guest_virt(gva_t addr, void *val, unsigned int bytes, -			       struct kvm_vcpu *vcpu) +gpa_t kvm_mmu_gva_to_gpa_read(struct kvm_vcpu *vcpu, gva_t gva, u32 *error) +{ +	u32 access = (kvm_x86_ops->get_cpl(vcpu) == 3) ? PFERR_USER_MASK : 0; +	return vcpu->arch.mmu.gva_to_gpa(vcpu, gva, access, error); +} + + gpa_t kvm_mmu_gva_to_gpa_fetch(struct kvm_vcpu *vcpu, gva_t gva, u32 *error) +{ +	u32 access = (kvm_x86_ops->get_cpl(vcpu) == 3) ? PFERR_USER_MASK : 0; +	access |= PFERR_FETCH_MASK; +	return vcpu->arch.mmu.gva_to_gpa(vcpu, gva, access, error); +} + +gpa_t kvm_mmu_gva_to_gpa_write(struct kvm_vcpu *vcpu, gva_t gva, u32 *error) +{ +	u32 access = (kvm_x86_ops->get_cpl(vcpu) == 3) ? 
PFERR_USER_MASK : 0; +	access |= PFERR_WRITE_MASK; +	return vcpu->arch.mmu.gva_to_gpa(vcpu, gva, access, error); +} + +/* uses this to access any guest's mapped memory without checking CPL */ +gpa_t kvm_mmu_gva_to_gpa_system(struct kvm_vcpu *vcpu, gva_t gva, u32 *error) +{ +	return vcpu->arch.mmu.gva_to_gpa(vcpu, gva, 0, error); +} + +static int kvm_read_guest_virt_helper(gva_t addr, void *val, unsigned int bytes, +				      struct kvm_vcpu *vcpu, u32 access, +				      u32 *error)  {  	void *data = val;  	int r = X86EMUL_CONTINUE;  	while (bytes) { -		gpa_t gpa = vcpu->arch.mmu.gva_to_gpa(vcpu, addr); +		gpa_t gpa = vcpu->arch.mmu.gva_to_gpa(vcpu, addr, access, error);  		unsigned offset = addr & (PAGE_SIZE-1);  		unsigned toread = min(bytes, (unsigned)PAGE_SIZE - offset);  		int ret; @@ -2767,14 +3097,37 @@ out:  	return r;  } +/* used for instruction fetching */ +static int kvm_fetch_guest_virt(gva_t addr, void *val, unsigned int bytes, +				struct kvm_vcpu *vcpu, u32 *error) +{ +	u32 access = (kvm_x86_ops->get_cpl(vcpu) == 3) ? PFERR_USER_MASK : 0; +	return kvm_read_guest_virt_helper(addr, val, bytes, vcpu, +					  access | PFERR_FETCH_MASK, error); +} + +static int kvm_read_guest_virt(gva_t addr, void *val, unsigned int bytes, +			       struct kvm_vcpu *vcpu, u32 *error) +{ +	u32 access = (kvm_x86_ops->get_cpl(vcpu) == 3) ? PFERR_USER_MASK : 0; +	return kvm_read_guest_virt_helper(addr, val, bytes, vcpu, access, +					  error); +} + +static int kvm_read_guest_virt_system(gva_t addr, void *val, unsigned int bytes, +			       struct kvm_vcpu *vcpu, u32 *error) +{ +	return kvm_read_guest_virt_helper(addr, val, bytes, vcpu, 0, error); +} +  static int kvm_write_guest_virt(gva_t addr, void *val, unsigned int bytes, -				struct kvm_vcpu *vcpu) +				struct kvm_vcpu *vcpu, u32 *error)  {  	void *data = val;  	int r = X86EMUL_CONTINUE;  	while (bytes) { -		gpa_t gpa = vcpu->arch.mmu.gva_to_gpa(vcpu, addr); +		gpa_t gpa = kvm_mmu_gva_to_gpa_write(vcpu, addr, error);  		unsigned offset = addr & (PAGE_SIZE-1);  		unsigned towrite = min(bytes, (unsigned)PAGE_SIZE - offset);  		int ret; @@ -2804,6 +3157,7 @@ static int emulator_read_emulated(unsigned long addr,  				  struct kvm_vcpu *vcpu)  {  	gpa_t                 gpa; +	u32 error_code;  	if (vcpu->mmio_read_completed) {  		memcpy(val, vcpu->mmio_data, bytes); @@ -2813,17 +3167,20 @@ static int emulator_read_emulated(unsigned long addr,  		return X86EMUL_CONTINUE;  	} -	gpa = vcpu->arch.mmu.gva_to_gpa(vcpu, addr); +	gpa = kvm_mmu_gva_to_gpa_read(vcpu, addr, &error_code); + +	if (gpa == UNMAPPED_GVA) { +		kvm_inject_page_fault(vcpu, addr, error_code); +		return X86EMUL_PROPAGATE_FAULT; +	}  	/* For APIC access vmexit */  	if ((gpa & PAGE_MASK) == APIC_DEFAULT_PHYS_BASE)  		goto mmio; -	if (kvm_read_guest_virt(addr, val, bytes, vcpu) +	if (kvm_read_guest_virt(addr, val, bytes, vcpu, NULL)  				== X86EMUL_CONTINUE)  		return X86EMUL_CONTINUE; -	if (gpa == UNMAPPED_GVA) -		return X86EMUL_PROPAGATE_FAULT;  mmio:  	/* @@ -2862,11 +3219,12 @@ static int emulator_write_emulated_onepage(unsigned long addr,  					   struct kvm_vcpu *vcpu)  {  	gpa_t                 gpa; +	u32 error_code; -	gpa = vcpu->arch.mmu.gva_to_gpa(vcpu, addr); +	gpa = kvm_mmu_gva_to_gpa_write(vcpu, addr, &error_code);  	if (gpa == UNMAPPED_GVA) { -		kvm_inject_page_fault(vcpu, addr, 2); +		kvm_inject_page_fault(vcpu, addr, error_code);  		return X86EMUL_PROPAGATE_FAULT;  	} @@ -2930,7 +3288,7 @@ static int emulator_cmpxchg_emulated(unsigned long addr,  		char *kaddr;  		u64 val; -		
gpa = vcpu->arch.mmu.gva_to_gpa(vcpu, addr); +		gpa = kvm_mmu_gva_to_gpa_write(vcpu, addr, NULL);  		if (gpa == UNMAPPED_GVA ||  		   (gpa & PAGE_MASK) == APIC_DEFAULT_PHYS_BASE) @@ -2967,35 +3325,21 @@ int emulate_invlpg(struct kvm_vcpu *vcpu, gva_t address)  int emulate_clts(struct kvm_vcpu *vcpu)  { -	kvm_x86_ops->set_cr0(vcpu, vcpu->arch.cr0 & ~X86_CR0_TS); +	kvm_x86_ops->set_cr0(vcpu, kvm_read_cr0_bits(vcpu, ~X86_CR0_TS)); +	kvm_x86_ops->fpu_activate(vcpu);  	return X86EMUL_CONTINUE;  }  int emulator_get_dr(struct x86_emulate_ctxt *ctxt, int dr, unsigned long *dest)  { -	struct kvm_vcpu *vcpu = ctxt->vcpu; - -	switch (dr) { -	case 0 ... 3: -		*dest = kvm_x86_ops->get_dr(vcpu, dr); -		return X86EMUL_CONTINUE; -	default: -		pr_unimpl(vcpu, "%s: unexpected dr %u\n", __func__, dr); -		return X86EMUL_UNHANDLEABLE; -	} +	return kvm_x86_ops->get_dr(ctxt->vcpu, dr, dest);  }  int emulator_set_dr(struct x86_emulate_ctxt *ctxt, int dr, unsigned long value)  {  	unsigned long mask = (ctxt->mode == X86EMUL_MODE_PROT64) ? ~0ULL : ~0U; -	int exception; -	kvm_x86_ops->set_dr(ctxt->vcpu, dr, value & mask, &exception); -	if (exception) { -		/* FIXME: better handling */ -		return X86EMUL_UNHANDLEABLE; -	} -	return X86EMUL_CONTINUE; +	return kvm_x86_ops->set_dr(ctxt->vcpu, dr, value & mask);  }  void kvm_report_emulation_failure(struct kvm_vcpu *vcpu, const char *context) @@ -3009,7 +3353,7 @@ void kvm_report_emulation_failure(struct kvm_vcpu *vcpu, const char *context)  	rip_linear = rip + get_segment_base(vcpu, VCPU_SREG_CS); -	kvm_read_guest_virt(rip_linear, (void *)opcodes, 4, vcpu); +	kvm_read_guest_virt(rip_linear, (void *)opcodes, 4, vcpu, NULL);  	printk(KERN_ERR "emulation failed (%s) rip %lx %02x %02x %02x %02x\n",  	       context, rip, opcodes[0], opcodes[1], opcodes[2], opcodes[3]); @@ -3017,7 +3361,8 @@ void kvm_report_emulation_failure(struct kvm_vcpu *vcpu, const char *context)  EXPORT_SYMBOL_GPL(kvm_report_emulation_failure);  static struct x86_emulate_ops emulate_ops = { -	.read_std            = kvm_read_guest_virt, +	.read_std            = kvm_read_guest_virt_system, +	.fetch               = kvm_fetch_guest_virt,  	.read_emulated       = emulator_read_emulated,  	.write_emulated      = emulator_write_emulated,  	.cmpxchg_emulated    = emulator_cmpxchg_emulated, @@ -3060,8 +3405,9 @@ int emulate_instruction(struct kvm_vcpu *vcpu,  		vcpu->arch.emulate_ctxt.vcpu = vcpu;  		vcpu->arch.emulate_ctxt.eflags = kvm_get_rflags(vcpu);  		vcpu->arch.emulate_ctxt.mode = +			(!is_protmode(vcpu)) ? X86EMUL_MODE_REAL :  			(vcpu->arch.emulate_ctxt.eflags & X86_EFLAGS_VM) -			? X86EMUL_MODE_REAL : cs_l +			? X86EMUL_MODE_VM86 : cs_l  			? X86EMUL_MODE_PROT64 :	cs_db  			? 
X86EMUL_MODE_PROT32 : X86EMUL_MODE_PROT16; @@ -3153,12 +3499,17 @@ static int pio_copy_data(struct kvm_vcpu *vcpu)  	gva_t q = vcpu->arch.pio.guest_gva;  	unsigned bytes;  	int ret; +	u32 error_code;  	bytes = vcpu->arch.pio.size * vcpu->arch.pio.cur_count;  	if (vcpu->arch.pio.in) -		ret = kvm_write_guest_virt(q, p, bytes, vcpu); +		ret = kvm_write_guest_virt(q, p, bytes, vcpu, &error_code);  	else -		ret = kvm_read_guest_virt(q, p, bytes, vcpu); +		ret = kvm_read_guest_virt(q, p, bytes, vcpu, &error_code); + +	if (ret == X86EMUL_PROPAGATE_FAULT) +		kvm_inject_page_fault(vcpu, q, error_code); +  	return ret;  } @@ -3179,7 +3530,7 @@ int complete_pio(struct kvm_vcpu *vcpu)  		if (io->in) {  			r = pio_copy_data(vcpu);  			if (r) -				return r; +				goto out;  		}  		delta = 1; @@ -3206,7 +3557,7 @@ int complete_pio(struct kvm_vcpu *vcpu)  			kvm_register_write(vcpu, VCPU_REGS_RSI, val);  		}  	} - +out:  	io->count -= io->cur_count;  	io->cur_count = 0; @@ -3219,11 +3570,12 @@ static int kernel_pio(struct kvm_vcpu *vcpu, void *pd)  	int r;  	if (vcpu->arch.pio.in) -		r = kvm_io_bus_read(&vcpu->kvm->pio_bus, vcpu->arch.pio.port, +		r = kvm_io_bus_read(vcpu->kvm, KVM_PIO_BUS, vcpu->arch.pio.port,  				    vcpu->arch.pio.size, pd);  	else -		r = kvm_io_bus_write(&vcpu->kvm->pio_bus, vcpu->arch.pio.port, -				     vcpu->arch.pio.size, pd); +		r = kvm_io_bus_write(vcpu->kvm, KVM_PIO_BUS, +				     vcpu->arch.pio.port, vcpu->arch.pio.size, +				     pd);  	return r;  } @@ -3234,7 +3586,7 @@ static int pio_string_write(struct kvm_vcpu *vcpu)  	int i, r = 0;  	for (i = 0; i < io->cur_count; i++) { -		if (kvm_io_bus_write(&vcpu->kvm->pio_bus, +		if (kvm_io_bus_write(vcpu->kvm, KVM_PIO_BUS,  				     io->port, io->size, pd)) {  			r = -EOPNOTSUPP;  			break; @@ -3248,6 +3600,8 @@ int kvm_emulate_pio(struct kvm_vcpu *vcpu, int in, int size, unsigned port)  {  	unsigned long val; +	trace_kvm_pio(!in, port, size, 1); +  	vcpu->run->exit_reason = KVM_EXIT_IO;  	vcpu->run->io.direction = in ? KVM_EXIT_IO_IN : KVM_EXIT_IO_OUT;  	vcpu->run->io.size = vcpu->arch.pio.size = size; @@ -3259,11 +3613,10 @@ int kvm_emulate_pio(struct kvm_vcpu *vcpu, int in, int size, unsigned port)  	vcpu->arch.pio.down = 0;  	vcpu->arch.pio.rep = 0; -	trace_kvm_pio(vcpu->run->io.direction == KVM_EXIT_IO_OUT, port, -		      size, 1); - -	val = kvm_register_read(vcpu, VCPU_REGS_RAX); -	memcpy(vcpu->arch.pio_data, &val, 4); +	if (!vcpu->arch.pio.in) { +		val = kvm_register_read(vcpu, VCPU_REGS_RAX); +		memcpy(vcpu->arch.pio_data, &val, 4); +	}  	if (!kernel_pio(vcpu, vcpu->arch.pio_data)) {  		complete_pio(vcpu); @@ -3280,6 +3633,8 @@ int kvm_emulate_pio_string(struct kvm_vcpu *vcpu, int in,  	unsigned now, in_page;  	int ret = 0; +	trace_kvm_pio(!in, port, size, count); +  	vcpu->run->exit_reason = KVM_EXIT_IO;  	vcpu->run->io.direction = in ? 
KVM_EXIT_IO_IN : KVM_EXIT_IO_OUT;  	vcpu->run->io.size = vcpu->arch.pio.size = size; @@ -3291,9 +3646,6 @@ int kvm_emulate_pio_string(struct kvm_vcpu *vcpu, int in,  	vcpu->arch.pio.down = down;  	vcpu->arch.pio.rep = rep; -	trace_kvm_pio(vcpu->run->io.direction == KVM_EXIT_IO_OUT, port, -		      size, count); -  	if (!count) {  		kvm_x86_ops->skip_emulated_instruction(vcpu);  		return 1; @@ -3325,10 +3677,8 @@ int kvm_emulate_pio_string(struct kvm_vcpu *vcpu, int in,  	if (!vcpu->arch.pio.in) {  		/* string PIO write */  		ret = pio_copy_data(vcpu); -		if (ret == X86EMUL_PROPAGATE_FAULT) { -			kvm_inject_gp(vcpu, 0); +		if (ret == X86EMUL_PROPAGATE_FAULT)  			return 1; -		}  		if (ret == 0 && !pio_string_write(vcpu)) {  			complete_pio(vcpu);  			if (vcpu->arch.pio.count == 0) @@ -3487,11 +3837,76 @@ static inline gpa_t hc_gpa(struct kvm_vcpu *vcpu, unsigned long a0,  		return a0 | ((gpa_t)a1 << 32);  } +int kvm_hv_hypercall(struct kvm_vcpu *vcpu) +{ +	u64 param, ingpa, outgpa, ret; +	uint16_t code, rep_idx, rep_cnt, res = HV_STATUS_SUCCESS, rep_done = 0; +	bool fast, longmode; +	int cs_db, cs_l; + +	/* +	 * hypercall generates UD from non zero cpl and real mode +	 * per HYPER-V spec +	 */ +	if (kvm_x86_ops->get_cpl(vcpu) != 0 || !is_protmode(vcpu)) { +		kvm_queue_exception(vcpu, UD_VECTOR); +		return 0; +	} + +	kvm_x86_ops->get_cs_db_l_bits(vcpu, &cs_db, &cs_l); +	longmode = is_long_mode(vcpu) && cs_l == 1; + +	if (!longmode) { +		param = ((u64)kvm_register_read(vcpu, VCPU_REGS_RDX) << 32) | +			(kvm_register_read(vcpu, VCPU_REGS_RAX) & 0xffffffff); +		ingpa = ((u64)kvm_register_read(vcpu, VCPU_REGS_RBX) << 32) | +			(kvm_register_read(vcpu, VCPU_REGS_RCX) & 0xffffffff); +		outgpa = ((u64)kvm_register_read(vcpu, VCPU_REGS_RDI) << 32) | +			(kvm_register_read(vcpu, VCPU_REGS_RSI) & 0xffffffff); +	} +#ifdef CONFIG_X86_64 +	else { +		param = kvm_register_read(vcpu, VCPU_REGS_RCX); +		ingpa = kvm_register_read(vcpu, VCPU_REGS_RDX); +		outgpa = kvm_register_read(vcpu, VCPU_REGS_R8); +	} +#endif + +	code = param & 0xffff; +	fast = (param >> 16) & 0x1; +	rep_cnt = (param >> 32) & 0xfff; +	rep_idx = (param >> 48) & 0xfff; + +	trace_kvm_hv_hypercall(code, fast, rep_cnt, rep_idx, ingpa, outgpa); + +	switch (code) { +	case HV_X64_HV_NOTIFY_LONG_SPIN_WAIT: +		kvm_vcpu_on_spin(vcpu); +		break; +	default: +		res = HV_STATUS_INVALID_HYPERCALL_CODE; +		break; +	} + +	ret = res | (((u64)rep_done & 0xfff) << 32); +	if (longmode) { +		kvm_register_write(vcpu, VCPU_REGS_RAX, ret); +	} else { +		kvm_register_write(vcpu, VCPU_REGS_RDX, ret >> 32); +		kvm_register_write(vcpu, VCPU_REGS_RAX, ret & 0xffffffff); +	} + +	return 1; +} +  int kvm_emulate_hypercall(struct kvm_vcpu *vcpu)  {  	unsigned long nr, a0, a1, a2, a3, ret;  	int r = 1; +	if (kvm_hv_hypercall_enabled(vcpu->kvm)) +		return kvm_hv_hypercall(vcpu); +  	nr = kvm_register_read(vcpu, VCPU_REGS_RAX);  	a0 = kvm_register_read(vcpu, VCPU_REGS_RBX);  	a1 = kvm_register_read(vcpu, VCPU_REGS_RCX); @@ -3534,10 +3949,8 @@ EXPORT_SYMBOL_GPL(kvm_emulate_hypercall);  int kvm_fix_hypercall(struct kvm_vcpu *vcpu)  {  	char instruction[3]; -	int ret = 0;  	unsigned long rip = kvm_rip_read(vcpu); -  	/*  	 * Blow out the MMU to ensure that no other VCPU has an active mapping  	 * to ensure that the updated hypercall appears atomically across all @@ -3546,11 +3959,8 @@ int kvm_fix_hypercall(struct kvm_vcpu *vcpu)  	kvm_mmu_zap_all(vcpu->kvm);  	kvm_x86_ops->patch_hypercall(vcpu, instruction); -	if (emulator_write_emulated(rip, instruction, 3, vcpu) -	    != 
X86EMUL_CONTINUE) -		ret = -EFAULT; -	return ret; +	return emulator_write_emulated(rip, instruction, 3, vcpu);  }  static u64 mk_cr_64(u64 curr_cr, u32 new_val) @@ -3583,10 +3993,9 @@ unsigned long realmode_get_cr(struct kvm_vcpu *vcpu, int cr)  {  	unsigned long value; -	kvm_x86_ops->decache_cr4_guest_bits(vcpu);  	switch (cr) {  	case 0: -		value = vcpu->arch.cr0; +		value = kvm_read_cr0(vcpu);  		break;  	case 2:  		value = vcpu->arch.cr2; @@ -3595,7 +4004,7 @@ unsigned long realmode_get_cr(struct kvm_vcpu *vcpu, int cr)  		value = vcpu->arch.cr3;  		break;  	case 4: -		value = vcpu->arch.cr4; +		value = kvm_read_cr4(vcpu);  		break;  	case 8:  		value = kvm_get_cr8(vcpu); @@ -3613,7 +4022,7 @@ void realmode_set_cr(struct kvm_vcpu *vcpu, int cr, unsigned long val,  {  	switch (cr) {  	case 0: -		kvm_set_cr0(vcpu, mk_cr_64(vcpu->arch.cr0, val)); +		kvm_set_cr0(vcpu, mk_cr_64(kvm_read_cr0(vcpu), val));  		*rflags = kvm_get_rflags(vcpu);  		break;  	case 2: @@ -3623,7 +4032,7 @@ void realmode_set_cr(struct kvm_vcpu *vcpu, int cr, unsigned long val,  		kvm_set_cr3(vcpu, val);  		break;  	case 4: -		kvm_set_cr4(vcpu, mk_cr_64(vcpu->arch.cr4, val)); +		kvm_set_cr4(vcpu, mk_cr_64(kvm_read_cr4(vcpu), val));  		break;  	case 8:  		kvm_set_cr8(vcpu, val & 0xfUL); @@ -3690,6 +4099,7 @@ struct kvm_cpuid_entry2 *kvm_find_cpuid_entry(struct kvm_vcpu *vcpu,  	}  	return best;  } +EXPORT_SYMBOL_GPL(kvm_find_cpuid_entry);  int cpuid_maxphyaddr(struct kvm_vcpu *vcpu)  { @@ -3773,14 +4183,15 @@ static void vapic_enter(struct kvm_vcpu *vcpu)  static void vapic_exit(struct kvm_vcpu *vcpu)  {  	struct kvm_lapic *apic = vcpu->arch.apic; +	int idx;  	if (!apic || !apic->vapic_addr)  		return; -	down_read(&vcpu->kvm->slots_lock); +	idx = srcu_read_lock(&vcpu->kvm->srcu);  	kvm_release_page_dirty(apic->vapic_page);  	mark_page_dirty(vcpu->kvm, apic->vapic_addr >> PAGE_SHIFT); -	up_read(&vcpu->kvm->slots_lock); +	srcu_read_unlock(&vcpu->kvm->srcu, idx);  }  static void update_cr8_intercept(struct kvm_vcpu *vcpu) @@ -3876,12 +4287,17 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu)  			r = 0;  			goto out;  		} +		if (test_and_clear_bit(KVM_REQ_DEACTIVATE_FPU, &vcpu->requests)) { +			vcpu->fpu_active = 0; +			kvm_x86_ops->fpu_deactivate(vcpu); +		}  	}  	preempt_disable();  	kvm_x86_ops->prepare_guest_switch(vcpu); -	kvm_load_guest_fpu(vcpu); +	if (vcpu->fpu_active) +		kvm_load_guest_fpu(vcpu);  	local_irq_disable(); @@ -3909,7 +4325,7 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu)  		kvm_lapic_sync_to_vapic(vcpu);  	} -	up_read(&vcpu->kvm->slots_lock); +	srcu_read_unlock(&vcpu->kvm->srcu, vcpu->srcu_idx);  	kvm_guest_enter(); @@ -3951,7 +4367,7 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu)  	preempt_enable(); -	down_read(&vcpu->kvm->slots_lock); +	vcpu->srcu_idx = srcu_read_lock(&vcpu->kvm->srcu);  	/*  	 * Profile KVM exit RIPs: @@ -3973,6 +4389,7 @@ out:  static int __vcpu_run(struct kvm_vcpu *vcpu)  {  	int r; +	struct kvm *kvm = vcpu->kvm;  	if (unlikely(vcpu->arch.mp_state == KVM_MP_STATE_SIPI_RECEIVED)) {  		pr_debug("vcpu %d received sipi with vector # %x\n", @@ -3984,7 +4401,7 @@ static int __vcpu_run(struct kvm_vcpu *vcpu)  		vcpu->arch.mp_state = KVM_MP_STATE_RUNNABLE;  	} -	down_read(&vcpu->kvm->slots_lock); +	vcpu->srcu_idx = srcu_read_lock(&kvm->srcu);  	vapic_enter(vcpu);  	r = 1; @@ -3992,9 +4409,9 @@ static int __vcpu_run(struct kvm_vcpu *vcpu)  		if (vcpu->arch.mp_state == KVM_MP_STATE_RUNNABLE)  			r = vcpu_enter_guest(vcpu);  		else { -			up_read(&vcpu->kvm->slots_lock); +			
srcu_read_unlock(&kvm->srcu, vcpu->srcu_idx);  			kvm_vcpu_block(vcpu); -			down_read(&vcpu->kvm->slots_lock); +			vcpu->srcu_idx = srcu_read_lock(&kvm->srcu);  			if (test_and_clear_bit(KVM_REQ_UNHALT, &vcpu->requests))  			{  				switch(vcpu->arch.mp_state) { @@ -4029,13 +4446,13 @@ static int __vcpu_run(struct kvm_vcpu *vcpu)  			++vcpu->stat.signal_exits;  		}  		if (need_resched()) { -			up_read(&vcpu->kvm->slots_lock); +			srcu_read_unlock(&kvm->srcu, vcpu->srcu_idx);  			kvm_resched(vcpu); -			down_read(&vcpu->kvm->slots_lock); +			vcpu->srcu_idx = srcu_read_lock(&kvm->srcu);  		}  	} -	up_read(&vcpu->kvm->slots_lock); +	srcu_read_unlock(&kvm->srcu, vcpu->srcu_idx);  	post_kvm_run_save(vcpu);  	vapic_exit(vcpu); @@ -4074,10 +4491,10 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)  		vcpu->mmio_read_completed = 1;  		vcpu->mmio_needed = 0; -		down_read(&vcpu->kvm->slots_lock); +		vcpu->srcu_idx = srcu_read_lock(&vcpu->kvm->srcu);  		r = emulate_instruction(vcpu, vcpu->arch.mmio_fault_cr2, 0,  					EMULTYPE_NO_DECODE); -		up_read(&vcpu->kvm->slots_lock); +		srcu_read_unlock(&vcpu->kvm->srcu, vcpu->srcu_idx);  		if (r == EMULATE_DO_MMIO) {  			/*  			 * Read-modify-write.  Back to userspace. @@ -4204,13 +4621,12 @@ int kvm_arch_vcpu_ioctl_get_sregs(struct kvm_vcpu *vcpu,  	sregs->gdt.limit = dt.limit;  	sregs->gdt.base = dt.base; -	kvm_x86_ops->decache_cr4_guest_bits(vcpu); -	sregs->cr0 = vcpu->arch.cr0; +	sregs->cr0 = kvm_read_cr0(vcpu);  	sregs->cr2 = vcpu->arch.cr2;  	sregs->cr3 = vcpu->arch.cr3; -	sregs->cr4 = vcpu->arch.cr4; +	sregs->cr4 = kvm_read_cr4(vcpu);  	sregs->cr8 = kvm_get_cr8(vcpu); -	sregs->efer = vcpu->arch.shadow_efer; +	sregs->efer = vcpu->arch.efer;  	sregs->apic_base = kvm_get_apic_base(vcpu);  	memset(sregs->interrupt_bitmap, 0, sizeof sregs->interrupt_bitmap); @@ -4298,14 +4714,23 @@ static int load_guest_segment_descriptor(struct kvm_vcpu *vcpu, u16 selector,  {  	struct descriptor_table dtable;  	u16 index = selector >> 3; +	int ret; +	u32 err; +	gva_t addr;  	get_segment_descriptor_dtable(vcpu, selector, &dtable);  	if (dtable.limit < index * 8 + 7) {  		kvm_queue_exception_e(vcpu, GP_VECTOR, selector & 0xfffc); -		return 1; +		return X86EMUL_PROPAGATE_FAULT;  	} -	return kvm_read_guest_virt(dtable.base + index*8, seg_desc, sizeof(*seg_desc), vcpu); +	addr = dtable.base + index * 8; +	ret = kvm_read_guest_virt_system(addr, seg_desc, sizeof(*seg_desc), +					 vcpu,  &err); +	if (ret == X86EMUL_PROPAGATE_FAULT) +		kvm_inject_page_fault(vcpu, addr, err); + +       return ret;  }  /* allowed just for 8 bytes segments */ @@ -4319,15 +4744,23 @@ static int save_guest_segment_descriptor(struct kvm_vcpu *vcpu, u16 selector,  	if (dtable.limit < index * 8 + 7)  		return 1; -	return kvm_write_guest_virt(dtable.base + index*8, seg_desc, sizeof(*seg_desc), vcpu); +	return kvm_write_guest_virt(dtable.base + index*8, seg_desc, sizeof(*seg_desc), vcpu, NULL); +} + +static gpa_t get_tss_base_addr_write(struct kvm_vcpu *vcpu, +			       struct desc_struct *seg_desc) +{ +	u32 base_addr = get_desc_base(seg_desc); + +	return kvm_mmu_gva_to_gpa_write(vcpu, base_addr, NULL);  } -static gpa_t get_tss_base_addr(struct kvm_vcpu *vcpu, +static gpa_t get_tss_base_addr_read(struct kvm_vcpu *vcpu,  			     struct desc_struct *seg_desc)  {  	u32 base_addr = get_desc_base(seg_desc); -	return vcpu->arch.mmu.gva_to_gpa(vcpu, base_addr); +	return kvm_mmu_gva_to_gpa_read(vcpu, base_addr, NULL);  }  static u16 get_segment_selector(struct kvm_vcpu *vcpu, int seg) @@ 
-4338,18 +4771,6 @@ static u16 get_segment_selector(struct kvm_vcpu *vcpu, int seg)  	return kvm_seg.selector;  } -static int load_segment_descriptor_to_kvm_desct(struct kvm_vcpu *vcpu, -						u16 selector, -						struct kvm_segment *kvm_seg) -{ -	struct desc_struct seg_desc; - -	if (load_guest_segment_descriptor(vcpu, selector, &seg_desc)) -		return 1; -	seg_desct_to_kvm_desct(&seg_desc, selector, kvm_seg); -	return 0; -} -  static int kvm_load_realmode_segment(struct kvm_vcpu *vcpu, u16 selector, int seg)  {  	struct kvm_segment segvar = { @@ -4367,7 +4788,7 @@ static int kvm_load_realmode_segment(struct kvm_vcpu *vcpu, u16 selector, int se  		.unusable = 0,  	};  	kvm_x86_ops->set_segment(vcpu, &segvar, seg); -	return 0; +	return X86EMUL_CONTINUE;  }  static int is_vm86_segment(struct kvm_vcpu *vcpu, int seg) @@ -4377,24 +4798,112 @@ static int is_vm86_segment(struct kvm_vcpu *vcpu, int seg)  		(kvm_get_rflags(vcpu) & X86_EFLAGS_VM);  } -int kvm_load_segment_descriptor(struct kvm_vcpu *vcpu, u16 selector, -				int type_bits, int seg) +int kvm_load_segment_descriptor(struct kvm_vcpu *vcpu, u16 selector, int seg)  {  	struct kvm_segment kvm_seg; +	struct desc_struct seg_desc; +	u8 dpl, rpl, cpl; +	unsigned err_vec = GP_VECTOR; +	u32 err_code = 0; +	bool null_selector = !(selector & ~0x3); /* 0000-0003 are null */ +	int ret; -	if (is_vm86_segment(vcpu, seg) || !(vcpu->arch.cr0 & X86_CR0_PE)) +	if (is_vm86_segment(vcpu, seg) || !is_protmode(vcpu))  		return kvm_load_realmode_segment(vcpu, selector, seg); -	if (load_segment_descriptor_to_kvm_desct(vcpu, selector, &kvm_seg)) -		return 1; -	kvm_seg.type |= type_bits; -	if (seg != VCPU_SREG_SS && seg != VCPU_SREG_CS && -	    seg != VCPU_SREG_LDTR) -		if (!kvm_seg.s) -			kvm_seg.unusable = 1; +	/* NULL selector is not valid for TR, CS and SS */ +	if ((seg == VCPU_SREG_CS || seg == VCPU_SREG_SS || seg == VCPU_SREG_TR) +	    && null_selector) +		goto exception; + +	/* TR should be in GDT only */ +	if (seg == VCPU_SREG_TR && (selector & (1 << 2))) +		goto exception; + +	ret = load_guest_segment_descriptor(vcpu, selector, &seg_desc); +	if (ret) +		return ret; + +	seg_desct_to_kvm_desct(&seg_desc, selector, &kvm_seg); + +	if (null_selector) { /* for NULL selector skip all following checks */ +		kvm_seg.unusable = 1; +		goto load; +	} + +	err_code = selector & 0xfffc; +	err_vec = GP_VECTOR; +	/* can't load system descriptor into segment selecor */ +	if (seg <= VCPU_SREG_GS && !kvm_seg.s) +		goto exception; + +	if (!kvm_seg.present) { +		err_vec = (seg == VCPU_SREG_SS) ? 
SS_VECTOR : NP_VECTOR; +		goto exception; +	} + +	rpl = selector & 3; +	dpl = kvm_seg.dpl; +	cpl = kvm_x86_ops->get_cpl(vcpu); + +	switch (seg) { +	case VCPU_SREG_SS: +		/* +		 * segment is not a writable data segment or segment +		 * selector's RPL != CPL or segment selector's RPL != CPL +		 */ +		if (rpl != cpl || (kvm_seg.type & 0xa) != 0x2 || dpl != cpl) +			goto exception; +		break; +	case VCPU_SREG_CS: +		if (!(kvm_seg.type & 8)) +			goto exception; + +		if (kvm_seg.type & 4) { +			/* conforming */ +			if (dpl > cpl) +				goto exception; +		} else { +			/* nonconforming */ +			if (rpl > cpl || dpl != cpl) +				goto exception; +		} +		/* CS(RPL) <- CPL */ +		selector = (selector & 0xfffc) | cpl; +            break; +	case VCPU_SREG_TR: +		if (kvm_seg.s || (kvm_seg.type != 1 && kvm_seg.type != 9)) +			goto exception; +		break; +	case VCPU_SREG_LDTR: +		if (kvm_seg.s || kvm_seg.type != 2) +			goto exception; +		break; +	default: /*  DS, ES, FS, or GS */ +		/* +		 * segment is not a data or readable code segment or +		 * ((segment is a data or nonconforming code segment) +		 * and (both RPL and CPL > DPL)) +		 */ +		if ((kvm_seg.type & 0xa) == 0x8 || +		    (((kvm_seg.type & 0xc) != 0xc) && (rpl > dpl && cpl > dpl))) +			goto exception; +		break; +	} + +	if (!kvm_seg.unusable && kvm_seg.s) { +		/* mark segment as accessed */ +		kvm_seg.type |= 1; +		seg_desc.type |= 1; +		save_guest_segment_descriptor(vcpu, selector, &seg_desc); +	} +load:  	kvm_set_segment(vcpu, &kvm_seg, seg); -	return 0; +	return X86EMUL_CONTINUE; +exception: +	kvm_queue_exception_e(vcpu, err_vec, err_code); +	return X86EMUL_PROPAGATE_FAULT;  }  static void save_state_to_tss32(struct kvm_vcpu *vcpu, @@ -4420,6 +4929,14 @@ static void save_state_to_tss32(struct kvm_vcpu *vcpu,  	tss->ldt_selector = get_segment_selector(vcpu, VCPU_SREG_LDTR);  } +static void kvm_load_segment_selector(struct kvm_vcpu *vcpu, u16 sel, int seg) +{ +	struct kvm_segment kvm_seg; +	kvm_get_segment(vcpu, &kvm_seg, seg); +	kvm_seg.selector = sel; +	kvm_set_segment(vcpu, &kvm_seg, seg); +} +  static int load_state_from_tss32(struct kvm_vcpu *vcpu,  				  struct tss_segment_32 *tss)  { @@ -4437,25 +4954,41 @@ static int load_state_from_tss32(struct kvm_vcpu *vcpu,  	kvm_register_write(vcpu, VCPU_REGS_RSI, tss->esi);  	kvm_register_write(vcpu, VCPU_REGS_RDI, tss->edi); -	if (kvm_load_segment_descriptor(vcpu, tss->ldt_selector, 0, VCPU_SREG_LDTR)) +	/* +	 * SDM says that segment selectors are loaded before segment +	 * descriptors +	 */ +	kvm_load_segment_selector(vcpu, tss->ldt_selector, VCPU_SREG_LDTR); +	kvm_load_segment_selector(vcpu, tss->es, VCPU_SREG_ES); +	kvm_load_segment_selector(vcpu, tss->cs, VCPU_SREG_CS); +	kvm_load_segment_selector(vcpu, tss->ss, VCPU_SREG_SS); +	kvm_load_segment_selector(vcpu, tss->ds, VCPU_SREG_DS); +	kvm_load_segment_selector(vcpu, tss->fs, VCPU_SREG_FS); +	kvm_load_segment_selector(vcpu, tss->gs, VCPU_SREG_GS); + +	/* +	 * Now load segment descriptors. 
If a fault happens at this stage +	 * it is handled in the context of the new task +	 */ +	if (kvm_load_segment_descriptor(vcpu, tss->ldt_selector, VCPU_SREG_LDTR))  		return 1; -	if (kvm_load_segment_descriptor(vcpu, tss->es, 1, VCPU_SREG_ES)) +	if (kvm_load_segment_descriptor(vcpu, tss->es, VCPU_SREG_ES))  		return 1; -	if (kvm_load_segment_descriptor(vcpu, tss->cs, 9, VCPU_SREG_CS)) +	if (kvm_load_segment_descriptor(vcpu, tss->cs, VCPU_SREG_CS))  		return 1; -	if (kvm_load_segment_descriptor(vcpu, tss->ss, 1, VCPU_SREG_SS)) +	if (kvm_load_segment_descriptor(vcpu, tss->ss, VCPU_SREG_SS))  		return 1; -	if (kvm_load_segment_descriptor(vcpu, tss->ds, 1, VCPU_SREG_DS)) +	if (kvm_load_segment_descriptor(vcpu, tss->ds, VCPU_SREG_DS))  		return 1; -	if (kvm_load_segment_descriptor(vcpu, tss->fs, 1, VCPU_SREG_FS)) +	if (kvm_load_segment_descriptor(vcpu, tss->fs, VCPU_SREG_FS))  		return 1; -	if (kvm_load_segment_descriptor(vcpu, tss->gs, 1, VCPU_SREG_GS)) +	if (kvm_load_segment_descriptor(vcpu, tss->gs, VCPU_SREG_GS))  		return 1;  	return 0;  } @@ -4495,19 +5028,33 @@ static int load_state_from_tss16(struct kvm_vcpu *vcpu,  	kvm_register_write(vcpu, VCPU_REGS_RSI, tss->si);  	kvm_register_write(vcpu, VCPU_REGS_RDI, tss->di); -	if (kvm_load_segment_descriptor(vcpu, tss->ldt, 0, VCPU_SREG_LDTR)) +	/* +	 * SDM says that segment selectors are loaded before segment +	 * descriptors +	 */ +	kvm_load_segment_selector(vcpu, tss->ldt, VCPU_SREG_LDTR); +	kvm_load_segment_selector(vcpu, tss->es, VCPU_SREG_ES); +	kvm_load_segment_selector(vcpu, tss->cs, VCPU_SREG_CS); +	kvm_load_segment_selector(vcpu, tss->ss, VCPU_SREG_SS); +	kvm_load_segment_selector(vcpu, tss->ds, VCPU_SREG_DS); + +	/* +	 * Now load segment descriptors. If a fault happens at this stage +	 * it is handled in the context of the new task +	 */ +	if (kvm_load_segment_descriptor(vcpu, tss->ldt, VCPU_SREG_LDTR))  		return 1; -	if (kvm_load_segment_descriptor(vcpu, tss->es, 1, VCPU_SREG_ES)) +	if (kvm_load_segment_descriptor(vcpu, tss->es, VCPU_SREG_ES))  		return 1; -	if (kvm_load_segment_descriptor(vcpu, tss->cs, 9, VCPU_SREG_CS)) +	if (kvm_load_segment_descriptor(vcpu, tss->cs, VCPU_SREG_CS))  		return 1; -	if (kvm_load_segment_descriptor(vcpu, tss->ss, 1, VCPU_SREG_SS)) +	if (kvm_load_segment_descriptor(vcpu, tss->ss, VCPU_SREG_SS))  		return 1; -	if (kvm_load_segment_descriptor(vcpu, tss->ds, 1, VCPU_SREG_DS)) +	if (kvm_load_segment_descriptor(vcpu, tss->ds, VCPU_SREG_DS))  		return 1;  	return 0;  } @@ -4529,7 +5076,7 @@ static int kvm_task_switch_16(struct kvm_vcpu *vcpu, u16 tss_selector,  			    sizeof tss_segment_16))  		goto out; -	if (kvm_read_guest(vcpu->kvm, get_tss_base_addr(vcpu, nseg_desc), +	if (kvm_read_guest(vcpu->kvm, get_tss_base_addr_read(vcpu, nseg_desc),  			   &tss_segment_16, sizeof tss_segment_16))  		goto out; @@ -4537,7 +5084,7 @@ static int kvm_task_switch_16(struct kvm_vcpu *vcpu, u16 tss_selector,  		tss_segment_16.prev_task_link = old_tss_sel;  		if (kvm_write_guest(vcpu->kvm, -				    get_tss_base_addr(vcpu, nseg_desc), +				    get_tss_base_addr_write(vcpu, nseg_desc),  				    &tss_segment_16.prev_task_link,  				    sizeof tss_segment_16.prev_task_link))  			goto out; @@ -4568,7 +5115,7 @@ static int kvm_task_switch_32(struct kvm_vcpu *vcpu, u16 tss_selector,  			    sizeof tss_segment_32))  		goto out; -	if (kvm_read_guest(vcpu->kvm, get_tss_base_addr(vcpu, nseg_desc), +	if (kvm_read_guest(vcpu->kvm, get_tss_base_addr_read(vcpu, nseg_desc),  			   &tss_segment_32, sizeof tss_segment_32))  		goto out; @@ -4576,7 
+5123,7 @@ static int kvm_task_switch_32(struct kvm_vcpu *vcpu, u16 tss_selector,  		tss_segment_32.prev_task_link = old_tss_sel;  		if (kvm_write_guest(vcpu->kvm, -				    get_tss_base_addr(vcpu, nseg_desc), +				    get_tss_base_addr_write(vcpu, nseg_desc),  				    &tss_segment_32.prev_task_link,  				    sizeof tss_segment_32.prev_task_link))  			goto out; @@ -4599,7 +5146,7 @@ int kvm_task_switch(struct kvm_vcpu *vcpu, u16 tss_selector, int reason)  	u32 old_tss_base = get_segment_base(vcpu, VCPU_SREG_TR);  	u16 old_tss_sel = get_segment_selector(vcpu, VCPU_SREG_TR); -	old_tss_base = vcpu->arch.mmu.gva_to_gpa(vcpu, old_tss_base); +	old_tss_base = kvm_mmu_gva_to_gpa_write(vcpu, old_tss_base, NULL);  	/* FIXME: Handle errors. Failure to read either TSS or their  	 * descriptors should generate a pagefault. @@ -4658,7 +5205,7 @@ int kvm_task_switch(struct kvm_vcpu *vcpu, u16 tss_selector, int reason)  					      &nseg_desc);  	} -	kvm_x86_ops->set_cr0(vcpu, vcpu->arch.cr0 | X86_CR0_TS); +	kvm_x86_ops->set_cr0(vcpu, kvm_read_cr0(vcpu) | X86_CR0_TS);  	seg_desct_to_kvm_desct(&nseg_desc, tss_selector, &tr_seg);  	tr_seg.type = 11;  	kvm_set_segment(vcpu, &tr_seg, VCPU_SREG_TR); @@ -4689,17 +5236,15 @@ int kvm_arch_vcpu_ioctl_set_sregs(struct kvm_vcpu *vcpu,  	kvm_set_cr8(vcpu, sregs->cr8); -	mmu_reset_needed |= vcpu->arch.shadow_efer != sregs->efer; +	mmu_reset_needed |= vcpu->arch.efer != sregs->efer;  	kvm_x86_ops->set_efer(vcpu, sregs->efer);  	kvm_set_apic_base(vcpu, sregs->apic_base); -	kvm_x86_ops->decache_cr4_guest_bits(vcpu); - -	mmu_reset_needed |= vcpu->arch.cr0 != sregs->cr0; +	mmu_reset_needed |= kvm_read_cr0(vcpu) != sregs->cr0;  	kvm_x86_ops->set_cr0(vcpu, sregs->cr0);  	vcpu->arch.cr0 = sregs->cr0; -	mmu_reset_needed |= vcpu->arch.cr4 != sregs->cr4; +	mmu_reset_needed |= kvm_read_cr4(vcpu) != sregs->cr4;  	kvm_x86_ops->set_cr4(vcpu, sregs->cr4);  	if (!is_long_mode(vcpu) && is_pae(vcpu)) {  		load_pdptrs(vcpu, vcpu->arch.cr3); @@ -4734,7 +5279,7 @@ int kvm_arch_vcpu_ioctl_set_sregs(struct kvm_vcpu *vcpu,  	/* Older userspace won't unhalt the vcpu on reset. 
*/  	if (kvm_vcpu_is_bsp(vcpu) && kvm_rip_read(vcpu) == 0xfff0 &&  	    sregs->cs.selector == 0xf000 && sregs->cs.base == 0xffff0000 && -	    !(vcpu->arch.cr0 & X86_CR0_PE)) +	    !is_protmode(vcpu))  		vcpu->arch.mp_state = KVM_MP_STATE_RUNNABLE;  	vcpu_put(vcpu); @@ -4832,11 +5377,12 @@ int kvm_arch_vcpu_ioctl_translate(struct kvm_vcpu *vcpu,  {  	unsigned long vaddr = tr->linear_address;  	gpa_t gpa; +	int idx;  	vcpu_load(vcpu); -	down_read(&vcpu->kvm->slots_lock); -	gpa = vcpu->arch.mmu.gva_to_gpa(vcpu, vaddr); -	up_read(&vcpu->kvm->slots_lock); +	idx = srcu_read_lock(&vcpu->kvm->srcu); +	gpa = kvm_mmu_gva_to_gpa_system(vcpu, vaddr, NULL); +	srcu_read_unlock(&vcpu->kvm->srcu, idx);  	tr->physical_address = gpa;  	tr->valid = gpa != UNMAPPED_GVA;  	tr->writeable = 1; @@ -4917,14 +5463,14 @@ EXPORT_SYMBOL_GPL(fx_init);  void kvm_load_guest_fpu(struct kvm_vcpu *vcpu)  { -	if (!vcpu->fpu_active || vcpu->guest_fpu_loaded) +	if (vcpu->guest_fpu_loaded)  		return;  	vcpu->guest_fpu_loaded = 1;  	kvm_fx_save(&vcpu->arch.host_fx_image);  	kvm_fx_restore(&vcpu->arch.guest_fx_image); +	trace_kvm_fpu(1);  } -EXPORT_SYMBOL_GPL(kvm_load_guest_fpu);  void kvm_put_guest_fpu(struct kvm_vcpu *vcpu)  { @@ -4935,8 +5481,9 @@ void kvm_put_guest_fpu(struct kvm_vcpu *vcpu)  	kvm_fx_save(&vcpu->arch.guest_fx_image);  	kvm_fx_restore(&vcpu->arch.host_fx_image);  	++vcpu->stat.fpu_reload; +	set_bit(KVM_REQ_DEACTIVATE_FPU, &vcpu->requests); +	trace_kvm_fpu(0);  } -EXPORT_SYMBOL_GPL(kvm_put_guest_fpu);  void kvm_arch_vcpu_free(struct kvm_vcpu *vcpu)  { @@ -5088,11 +5635,13 @@ fail:  void kvm_arch_vcpu_uninit(struct kvm_vcpu *vcpu)  { +	int idx; +  	kfree(vcpu->arch.mce_banks);  	kvm_free_lapic(vcpu); -	down_read(&vcpu->kvm->slots_lock); +	idx = srcu_read_lock(&vcpu->kvm->srcu);  	kvm_mmu_destroy(vcpu); -	up_read(&vcpu->kvm->slots_lock); +	srcu_read_unlock(&vcpu->kvm->srcu, idx);  	free_page((unsigned long)vcpu->arch.pio_data);  } @@ -5103,6 +5652,12 @@ struct  kvm *kvm_arch_create_vm(void)  	if (!kvm)  		return ERR_PTR(-ENOMEM); +	kvm->arch.aliases = kzalloc(sizeof(struct kvm_mem_aliases), GFP_KERNEL); +	if (!kvm->arch.aliases) { +		kfree(kvm); +		return ERR_PTR(-ENOMEM); +	} +  	INIT_LIST_HEAD(&kvm->arch.active_mmu_pages);  	INIT_LIST_HEAD(&kvm->arch.assigned_dev_head); @@ -5159,16 +5714,18 @@ void kvm_arch_destroy_vm(struct kvm *kvm)  		put_page(kvm->arch.apic_access_page);  	if (kvm->arch.ept_identity_pagetable)  		put_page(kvm->arch.ept_identity_pagetable); +	cleanup_srcu_struct(&kvm->srcu); +	kfree(kvm->arch.aliases);  	kfree(kvm);  } -int kvm_arch_set_memory_region(struct kvm *kvm, -				struct kvm_userspace_memory_region *mem, +int kvm_arch_prepare_memory_region(struct kvm *kvm, +				struct kvm_memory_slot *memslot,  				struct kvm_memory_slot old, +				struct kvm_userspace_memory_region *mem,  				int user_alloc)  { -	int npages = mem->memory_size >> PAGE_SHIFT; -	struct kvm_memory_slot *memslot = &kvm->memslots[mem->slot]; +	int npages = memslot->npages;  	/*To keep backward compatibility with older userspace,  	 *x86 needs to handle the !user_alloc case. 
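
A side note on the locking change visible in the x86.c hunks above: the reader paths that used to take kvm->slots_lock with down_read()/up_read() now open an SRCU read-side critical section and hand the returned index back to srcu_read_unlock(). Below is a minimal, illustrative sketch of that read-side pattern; the demo_state/demo_reader names are invented for the example, and only the srcu_* calls come from the kernel API.

#include <linux/srcu.h>

/* Illustrative container for SRCU-protected state (not from the patch). */
struct demo_state {
	struct srcu_struct srcu;	/* set up elsewhere with init_srcu_struct() */
	void *data;			/* writers publish updates, then call synchronize_srcu() */
};

/* Read side: the idiom that replaces down_read()/up_read(&slots_lock). */
static void demo_reader(struct demo_state *s)
{
	int idx;

	idx = srcu_read_lock(&s->srcu);
	/* ... read s->data here; writers wait in synchronize_srcu() until readers finish ... */
	srcu_read_unlock(&s->srcu, idx);
}
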
@@ -5188,26 +5745,35 @@ int kvm_arch_set_memory_region(struct kvm *kvm,  			if (IS_ERR((void *)userspace_addr))  				return PTR_ERR((void *)userspace_addr); -			/* set userspace_addr atomically for kvm_hva_to_rmapp */ -			spin_lock(&kvm->mmu_lock);  			memslot->userspace_addr = userspace_addr; -			spin_unlock(&kvm->mmu_lock); -		} else { -			if (!old.user_alloc && old.rmap) { -				int ret; - -				down_write(¤t->mm->mmap_sem); -				ret = do_munmap(current->mm, old.userspace_addr, -						old.npages * PAGE_SIZE); -				up_write(¤t->mm->mmap_sem); -				if (ret < 0) -					printk(KERN_WARNING -				       "kvm_vm_ioctl_set_memory_region: " -				       "failed to munmap memory\n"); -			}  		}  	} + +	return 0; +} + +void kvm_arch_commit_memory_region(struct kvm *kvm, +				struct kvm_userspace_memory_region *mem, +				struct kvm_memory_slot old, +				int user_alloc) +{ + +	int npages = mem->memory_size >> PAGE_SHIFT; + +	if (!user_alloc && !old.user_alloc && old.rmap && !npages) { +		int ret; + +		down_write(¤t->mm->mmap_sem); +		ret = do_munmap(current->mm, old.userspace_addr, +				old.npages * PAGE_SIZE); +		up_write(¤t->mm->mmap_sem); +		if (ret < 0) +			printk(KERN_WARNING +			       "kvm_vm_ioctl_set_memory_region: " +			       "failed to munmap memory\n"); +	} +  	spin_lock(&kvm->mmu_lock);  	if (!kvm->arch.n_requested_mmu_pages) {  		unsigned int nr_mmu_pages = kvm_mmu_calculate_mmu_pages(kvm); @@ -5216,8 +5782,6 @@ int kvm_arch_set_memory_region(struct kvm *kvm,  	kvm_mmu_slot_remove_write_access(kvm, mem->slot);  	spin_unlock(&kvm->mmu_lock); - -	return 0;  }  void kvm_arch_flush_shadow(struct kvm *kvm) diff --git a/arch/x86/kvm/x86.h b/arch/x86/kvm/x86.h index 5eadea585d2a..2d101639bd8d 100644 --- a/arch/x86/kvm/x86.h +++ b/arch/x86/kvm/x86.h @@ -2,6 +2,7 @@  #define ARCH_X86_KVM_X86_H  #include <linux/kvm_host.h> +#include "kvm_cache_regs.h"  static inline void kvm_clear_exception_queue(struct kvm_vcpu *vcpu)  { @@ -35,4 +36,33 @@ static inline bool kvm_exception_is_soft(unsigned int nr)  struct kvm_cpuid_entry2 *kvm_find_cpuid_entry(struct kvm_vcpu *vcpu,                                               u32 function, u32 index); +static inline bool is_protmode(struct kvm_vcpu *vcpu) +{ +	return kvm_read_cr0_bits(vcpu, X86_CR0_PE); +} + +static inline int is_long_mode(struct kvm_vcpu *vcpu) +{ +#ifdef CONFIG_X86_64 +	return vcpu->arch.efer & EFER_LMA; +#else +	return 0; +#endif +} + +static inline int is_pae(struct kvm_vcpu *vcpu) +{ +	return kvm_read_cr4_bits(vcpu, X86_CR4_PAE); +} + +static inline int is_pse(struct kvm_vcpu *vcpu) +{ +	return kvm_read_cr4_bits(vcpu, X86_CR4_PSE); +} + +static inline int is_paging(struct kvm_vcpu *vcpu) +{ +	return kvm_read_cr0_bits(vcpu, X86_CR0_PG); +} +  #endif diff --git a/arch/x86/mm/init.c b/arch/x86/mm/init.c index e71c5cbc8f35..452ee5b8f309 100644 --- a/arch/x86/mm/init.c +++ b/arch/x86/mm/init.c @@ -331,11 +331,23 @@ int devmem_is_allowed(unsigned long pagenr)  void free_init_pages(char *what, unsigned long begin, unsigned long end)  { -	unsigned long addr = begin; +	unsigned long addr; +	unsigned long begin_aligned, end_aligned; -	if (addr >= end) +	/* Make sure boundaries are page aligned */ +	begin_aligned = PAGE_ALIGN(begin); +	end_aligned   = end & PAGE_MASK; + +	if (WARN_ON(begin_aligned != begin || end_aligned != end)) { +		begin = begin_aligned; +		end   = end_aligned; +	} + +	if (begin >= end)  		return; +	addr = begin; +  	/*  	 * If debugging page accesses then do not free this memory but  	 * mark them not present - any buggy 
init-section access will @@ -343,7 +355,7 @@ void free_init_pages(char *what, unsigned long begin, unsigned long end)  	 */  #ifdef CONFIG_DEBUG_PAGEALLOC  	printk(KERN_INFO "debug: unmapping init memory %08lx..%08lx\n", -		begin, PAGE_ALIGN(end)); +		begin, end);  	set_memory_np(begin, (end - begin) >> PAGE_SHIFT);  #else  	/* @@ -358,8 +370,7 @@ void free_init_pages(char *what, unsigned long begin, unsigned long end)  	for (; addr < end; addr += PAGE_SIZE) {  		ClearPageReserved(virt_to_page(addr));  		init_page_count(virt_to_page(addr)); -		memset((void *)(addr & ~(PAGE_SIZE-1)), -			POISON_FREE_INITMEM, PAGE_SIZE); +		memset((void *)addr, POISON_FREE_INITMEM, PAGE_SIZE);  		free_page(addr);  		totalram_pages++;  	} @@ -376,6 +387,15 @@ void free_initmem(void)  #ifdef CONFIG_BLK_DEV_INITRD  void free_initrd_mem(unsigned long start, unsigned long end)  { -	free_init_pages("initrd memory", start, end); +	/* +	 * end may not be aligned, and we cannot align it; the decompressor +	 * could be confused by an aligned initrd_end. We already reserve the +	 * trailing partial page in +	 *   - i386_start_kernel() +	 *   - x86_64_start_kernel() +	 *   - relocate_initrd() +	 * so here we can safely PAGE_ALIGN() and free the partial page +	 */ +	free_init_pages("initrd memory", start, PAGE_ALIGN(end));  }  #endif diff --git a/arch/x86/mm/pageattr.c b/arch/x86/mm/pageattr.c index 1d4eb93d333c..cf07c26d9a4a 100644 --- a/arch/x86/mm/pageattr.c +++ b/arch/x86/mm/pageattr.c @@ -291,8 +291,29 @@ static inline pgprot_t static_protections(pgprot_t prot, unsigned long address,  	 */  	if (kernel_set_to_readonly &&  	    within(address, (unsigned long)_text, -		   (unsigned long)__end_rodata_hpage_align)) -		pgprot_val(forbidden) |= _PAGE_RW; +		   (unsigned long)__end_rodata_hpage_align)) { +		unsigned int level; + +		/* +		 * Don't enforce the !RW mapping for the kernel text mapping, +		 * if the current mapping is already using small page mapping. +		 * No need to work hard to preserve large page mappings in this +		 * case. +		 * +		 * This also fixes the Linux Xen paravirt guest boot failure +		 * (because of unexpected read-only mappings for kernel identity +		 * mappings). In this paravirt guest case, the kernel text +		 * mapping and the kernel identity mapping share the same +		 * page-table pages. Thus we can't really use different +		 * protections for the kernel text and identity mappings. Also, +		 * these shared mappings are made of small page mappings. +		 * Thus this "don't enforce !RW mapping for small page kernel +		 * text mapping" logic will help Linux Xen paravirt guest boot +		 * as well. 
+		 */ +		if (lookup_address(address, &level) && (level != PG_LEVEL_4K)) +			pgprot_val(forbidden) |= _PAGE_RW; +	}  #endif  	prot = __pgprot(pgprot_val(prot) & ~pgprot_val(forbidden)); diff --git a/arch/x86/oprofile/op_model_amd.c b/arch/x86/oprofile/op_model_amd.c index 6a58256dce9f..090cbbec7dbd 100644 --- a/arch/x86/oprofile/op_model_amd.c +++ b/arch/x86/oprofile/op_model_amd.c @@ -46,17 +46,6 @@  static unsigned long reset_value[NUM_VIRT_COUNTERS]; -/* IbsFetchCtl bits/masks */ -#define IBS_FETCH_RAND_EN		(1ULL<<57) -#define IBS_FETCH_VAL			(1ULL<<49) -#define IBS_FETCH_ENABLE		(1ULL<<48) -#define IBS_FETCH_CNT_MASK		0xFFFF0000ULL - -/* IbsOpCtl bits */ -#define IBS_OP_CNT_CTL			(1ULL<<19) -#define IBS_OP_VAL			(1ULL<<18) -#define IBS_OP_ENABLE			(1ULL<<17) -  #define IBS_FETCH_SIZE			6  #define IBS_OP_SIZE			12 @@ -182,7 +171,7 @@ static void op_amd_setup_ctrs(struct op_x86_model_spec const *model,  			continue;  		}  		rdmsrl(msrs->controls[i].addr, val); -		if (val & ARCH_PERFMON_EVENTSEL0_ENABLE) +		if (val & ARCH_PERFMON_EVENTSEL_ENABLE)  			op_x86_warn_in_use(i);  		val &= model->reserved;  		wrmsrl(msrs->controls[i].addr, val); @@ -290,7 +279,7 @@ op_amd_handle_ibs(struct pt_regs * const regs,  			oprofile_write_commit(&entry);  			/* reenable the IRQ */ -			ctl &= ~(IBS_FETCH_VAL | IBS_FETCH_CNT_MASK); +			ctl &= ~(IBS_FETCH_VAL | IBS_FETCH_CNT);  			ctl |= IBS_FETCH_ENABLE;  			wrmsrl(MSR_AMD64_IBSFETCHCTL, ctl);  		} @@ -330,7 +319,7 @@ static inline void op_amd_start_ibs(void)  		return;  	if (ibs_config.fetch_enabled) { -		val = (ibs_config.max_cnt_fetch >> 4) & 0xFFFF; +		val = (ibs_config.max_cnt_fetch >> 4) & IBS_FETCH_MAX_CNT;  		val |= ibs_config.rand_en ? IBS_FETCH_RAND_EN : 0;  		val |= IBS_FETCH_ENABLE;  		wrmsrl(MSR_AMD64_IBSFETCHCTL, val); @@ -352,7 +341,7 @@ static inline void op_amd_start_ibs(void)  			 * avoid underflows.  			 
*/  			ibs_op_ctl = min(ibs_op_ctl + IBS_RANDOM_MAXCNT_OFFSET, -					 0xFFFFULL); +					 IBS_OP_MAX_CNT);  		}  		if (ibs_caps & IBS_CAPS_OPCNT && ibs_config.dispatched_ops)  			ibs_op_ctl |= IBS_OP_CNT_CTL; @@ -409,7 +398,7 @@ static void op_amd_start(struct op_msrs const * const msrs)  		if (!reset_value[op_x86_phys_to_virt(i)])  			continue;  		rdmsrl(msrs->controls[i].addr, val); -		val |= ARCH_PERFMON_EVENTSEL0_ENABLE; +		val |= ARCH_PERFMON_EVENTSEL_ENABLE;  		wrmsrl(msrs->controls[i].addr, val);  	} @@ -429,7 +418,7 @@ static void op_amd_stop(struct op_msrs const * const msrs)  		if (!reset_value[op_x86_phys_to_virt(i)])  			continue;  		rdmsrl(msrs->controls[i].addr, val); -		val &= ~ARCH_PERFMON_EVENTSEL0_ENABLE; +		val &= ~ARCH_PERFMON_EVENTSEL_ENABLE;  		wrmsrl(msrs->controls[i].addr, val);  	} diff --git a/arch/x86/oprofile/op_model_ppro.c b/arch/x86/oprofile/op_model_ppro.c index 5d1727ba409e..2bf90fafa7b5 100644 --- a/arch/x86/oprofile/op_model_ppro.c +++ b/arch/x86/oprofile/op_model_ppro.c @@ -88,7 +88,7 @@ static void ppro_setup_ctrs(struct op_x86_model_spec const *model,  			continue;  		}  		rdmsrl(msrs->controls[i].addr, val); -		if (val & ARCH_PERFMON_EVENTSEL0_ENABLE) +		if (val & ARCH_PERFMON_EVENTSEL_ENABLE)  			op_x86_warn_in_use(i);  		val &= model->reserved;  		wrmsrl(msrs->controls[i].addr, val); @@ -166,7 +166,7 @@ static void ppro_start(struct op_msrs const * const msrs)  	for (i = 0; i < num_counters; ++i) {  		if (reset_value[i]) {  			rdmsrl(msrs->controls[i].addr, val); -			val |= ARCH_PERFMON_EVENTSEL0_ENABLE; +			val |= ARCH_PERFMON_EVENTSEL_ENABLE;  			wrmsrl(msrs->controls[i].addr, val);  		}  	} @@ -184,7 +184,7 @@ static void ppro_stop(struct op_msrs const * const msrs)  		if (!reset_value[i])  			continue;  		rdmsrl(msrs->controls[i].addr, val); -		val &= ~ARCH_PERFMON_EVENTSEL0_ENABLE; +		val &= ~ARCH_PERFMON_EVENTSEL_ENABLE;  		wrmsrl(msrs->controls[i].addr, val);  	}  } diff --git a/arch/x86/pci/Makefile b/arch/x86/pci/Makefile index 0b7d3e9593e1..b110d97fb925 100644 --- a/arch/x86/pci/Makefile +++ b/arch/x86/pci/Makefile @@ -13,6 +13,8 @@ obj-$(CONFIG_X86_VISWS)		+= visws.o  obj-$(CONFIG_X86_NUMAQ)		+= numaq_32.o +obj-$(CONFIG_X86_MRST)		+= mrst.o +  obj-y				+= common.o early.o  obj-y				+= amd_bus.o bus_numa.o diff --git a/arch/x86/pci/acpi.c b/arch/x86/pci/acpi.c index 5f11ff6f5389..e31160216efb 100644 --- a/arch/x86/pci/acpi.c +++ b/arch/x86/pci/acpi.c @@ -122,8 +122,8 @@ setup_resource(struct acpi_resource *acpi_res, void *data)  	struct acpi_resource_address64 addr;  	acpi_status status;  	unsigned long flags; -	struct resource *root; -	u64 start, end; +	struct resource *root, *conflict; +	u64 start, end, max_len;  	status = resource_to_addr(acpi_res, &addr);  	if (!ACPI_SUCCESS(status)) @@ -140,6 +140,17 @@ setup_resource(struct acpi_resource *acpi_res, void *data)  	} else  		return AE_OK; +	max_len = addr.maximum - addr.minimum + 1; +	if (addr.address_length > max_len) { +		dev_printk(KERN_DEBUG, &info->bridge->dev, +			   "host bridge window length %#llx doesn't fit in " +			   "%#llx-%#llx, trimming\n", +			   (unsigned long long) addr.address_length, +			   (unsigned long long) addr.minimum, +			   (unsigned long long) addr.maximum); +		addr.address_length = max_len; +	} +  	start = addr.minimum + addr.translation_offset;  	end = start + addr.address_length - 1; @@ -157,9 +168,12 @@ setup_resource(struct acpi_resource *acpi_res, void *data)  		return AE_OK;  	} -	if (insert_resource(root, res)) { +	conflict = 
insert_resource_conflict(root, res); +	if (conflict) {  		dev_err(&info->bridge->dev, -			"can't allocate host bridge window %pR\n", res); +			"address space collision: host bridge window %pR " +			"conflicts with %s %pR\n", +			res, conflict->name, conflict);  	} else {  		pci_bus_add_resource(info->bus, res, 0);  		info->res_num++; @@ -298,17 +312,14 @@ int __init pci_acpi_init(void)  {  	struct pci_dev *dev = NULL; -	if (pcibios_scanned) -		return 0; -  	if (acpi_noirq) -		return 0; +		return -ENODEV;  	printk(KERN_INFO "PCI: Using ACPI for IRQ routing\n");  	acpi_irq_penalty_init(); -	pcibios_scanned++;  	pcibios_enable_irq = acpi_pci_irq_enable;  	pcibios_disable_irq = acpi_pci_irq_disable; +	x86_init.pci.init_irq = x86_init_noop;  	if (pci_routeirq) {  		/* diff --git a/arch/x86/pci/common.c b/arch/x86/pci/common.c index 3736176acaab..294e10cb11e1 100644 --- a/arch/x86/pci/common.c +++ b/arch/x86/pci/common.c @@ -72,12 +72,6 @@ struct pci_ops pci_root_ops = {  };  /* - * legacy, numa, and acpi all want to call pcibios_scan_root - * from their initcalls. This flag prevents that. - */ -int pcibios_scanned; - -/*   * This interrupt-safe spinlock protects all accesses to PCI   * configuration space.   */ diff --git a/arch/x86/pci/i386.c b/arch/x86/pci/i386.c index dece3eb9c906..46fd43f79103 100644 --- a/arch/x86/pci/i386.c +++ b/arch/x86/pci/i386.c @@ -127,9 +127,6 @@ static void __init pcibios_allocate_bus_resources(struct list_head *bus_list)  					continue;  				if (!r->start ||  				    pci_claim_resource(dev, idx) < 0) { -					dev_info(&dev->dev, -						 "can't reserve window %pR\n", -						 r);  					/*  					 * Something is wrong with the region.  					 * Invalidate the resource to prevent @@ -181,8 +178,6 @@ static void __init pcibios_allocate_resources(int pass)  					"BAR %d: reserving %pr (d=%d, p=%d)\n",  					idx, r, disabled, pass);  				if (pci_claim_resource(dev, idx) < 0) { -					dev_info(&dev->dev, -						 "can't reserve %pR\n", r);  					/* We'll assign a new address later */  					r->end -= r->start;  					r->start = 0; diff --git a/arch/x86/pci/init.c b/arch/x86/pci/init.c index 25a1f8efed4a..adb62aaa7ecd 100644 --- a/arch/x86/pci/init.c +++ b/arch/x86/pci/init.c @@ -1,6 +1,7 @@  #include <linux/pci.h>  #include <linux/init.h>  #include <asm/pci_x86.h> +#include <asm/x86_init.h>  /* arch_initcall has too random ordering, so call the initializers     in the right sequence from here. 
*/ @@ -15,10 +16,9 @@ static __init int pci_arch_init(void)  	if (!(pci_probe & PCI_PROBE_NOEARLY))  		pci_mmcfg_early_init(); -#ifdef CONFIG_PCI_OLPC -	if (!pci_olpc_init()) -		return 0;	/* skip additional checks if it's an XO */ -#endif +	if (x86_init.pci.arch_init && !x86_init.pci.arch_init()) +		return 0; +  #ifdef CONFIG_PCI_BIOS  	pci_pcbios_init();  #endif diff --git a/arch/x86/pci/irq.c b/arch/x86/pci/irq.c index b02f6d8ac922..8b107521d24e 100644 --- a/arch/x86/pci/irq.c +++ b/arch/x86/pci/irq.c @@ -53,7 +53,7 @@ struct irq_router_handler {  	int (*probe)(struct irq_router *r, struct pci_dev *router, u16 device);  }; -int (*pcibios_enable_irq)(struct pci_dev *dev) = NULL; +int (*pcibios_enable_irq)(struct pci_dev *dev) = pirq_enable_irq;  void (*pcibios_disable_irq)(struct pci_dev *dev) = NULL;  /* @@ -1018,7 +1018,7 @@ static int pcibios_lookup_irq(struct pci_dev *dev, int assign)  	return 1;  } -static void __init pcibios_fixup_irqs(void) +void __init pcibios_fixup_irqs(void)  {  	struct pci_dev *dev = NULL;  	u8 pin; @@ -1112,12 +1112,12 @@ static struct dmi_system_id __initdata pciirq_dmi_table[] = {  	{ }  }; -int __init pcibios_irq_init(void) +void __init pcibios_irq_init(void)  {  	DBG(KERN_DEBUG "PCI: IRQ init\n"); -	if (pcibios_enable_irq || raw_pci_ops == NULL) -		return 0; +	if (raw_pci_ops == NULL) +		return;  	dmi_check_system(pciirq_dmi_table); @@ -1144,9 +1144,7 @@ int __init pcibios_irq_init(void)  			pirq_table = NULL;  	} -	pcibios_enable_irq = pirq_enable_irq; - -	pcibios_fixup_irqs(); +	x86_init.pci.fixup_irqs();  	if (io_apic_assign_pci_irqs && pci_routeirq) {  		struct pci_dev *dev = NULL; @@ -1159,8 +1157,6 @@ int __init pcibios_irq_init(void)  		for_each_pci_dev(dev)  			pirq_enable_irq(dev);  	} - -	return 0;  }  static void pirq_penalize_isa_irq(int irq, int active) diff --git a/arch/x86/pci/legacy.c b/arch/x86/pci/legacy.c index 4061bb0f267d..0db5eaf54560 100644 --- a/arch/x86/pci/legacy.c +++ b/arch/x86/pci/legacy.c @@ -35,16 +35,13 @@ static void __devinit pcibios_fixup_peer_bridges(void)  	}  } -static int __init pci_legacy_init(void) +int __init pci_legacy_init(void)  {  	if (!raw_pci_ops) {  		printk("PCI: System does not support PCI\n");  		return 0;  	} -	if (pcibios_scanned++) -		return 0; -  	printk("PCI: Probing PCI hardware\n");  	pci_root_bus = pcibios_scan_root(0);  	if (pci_root_bus) @@ -55,18 +52,15 @@ static int __init pci_legacy_init(void)  int __init pci_subsys_init(void)  { -#ifdef CONFIG_X86_NUMAQ -	pci_numaq_init(); -#endif -#ifdef CONFIG_ACPI -	pci_acpi_init(); -#endif -#ifdef CONFIG_X86_VISWS -	pci_visws_init(); -#endif -	pci_legacy_init(); +	/* +	 * The init function returns a non-zero value when +	 * pci_legacy_init should be invoked. 
+	 */ +	if (x86_init.pci.init()) +		pci_legacy_init(); +  	pcibios_fixup_peer_bridges(); -	pcibios_irq_init(); +	x86_init.pci.init_irq();  	pcibios_init();  	return 0; diff --git a/arch/x86/pci/mrst.c b/arch/x86/pci/mrst.c new file mode 100644 index 000000000000..8bf2fcb88d04 --- /dev/null +++ b/arch/x86/pci/mrst.c @@ -0,0 +1,262 @@ +/* + * Moorestown PCI support + *   Copyright (c) 2008 Intel Corporation + *     Jesse Barnes <jesse.barnes@intel.com> + * + * Moorestown has an interesting PCI implementation: + *   - configuration space is memory mapped (as defined by MCFG) + *   - Lincroft devices also have a real, type 1 configuration space + *   - Early Lincroft silicon has a type 1 access bug that will cause + *     a hang if non-existent devices are accessed + *   - some devices have the "fixed BAR" capability, which means + *     they can't be relocated or modified; check for that during + *     BAR sizing + * + * So, we use the MCFG space for all reads and writes, but also send + * Lincroft writes to type 1 space.  But only read/write if the device + * actually exists, otherwise return all 1s for reads and bit bucket + * the writes. + */ + +#include <linux/sched.h> +#include <linux/pci.h> +#include <linux/ioport.h> +#include <linux/init.h> +#include <linux/dmi.h> + +#include <asm/acpi.h> +#include <asm/segment.h> +#include <asm/io.h> +#include <asm/smp.h> +#include <asm/pci_x86.h> +#include <asm/hw_irq.h> +#include <asm/io_apic.h> + +#define PCIE_CAP_OFFSET	0x100 + +/* Fixed BAR fields */ +#define PCIE_VNDR_CAP_ID_FIXED_BAR 0x00	/* Fixed BAR (TBD) */ +#define PCI_FIXED_BAR_0_SIZE	0x04 +#define PCI_FIXED_BAR_1_SIZE	0x08 +#define PCI_FIXED_BAR_2_SIZE	0x0c +#define PCI_FIXED_BAR_3_SIZE	0x10 +#define PCI_FIXED_BAR_4_SIZE	0x14 +#define PCI_FIXED_BAR_5_SIZE	0x1c + +/** + * fixed_bar_cap - return the offset of the fixed BAR cap if found + * @bus: PCI bus + * @devfn: device in question + * + * Look for the fixed BAR cap on @bus and @devfn, returning its offset + * if found or 0 otherwise. + */ +static int fixed_bar_cap(struct pci_bus *bus, unsigned int devfn) +{ +	int pos; +	u32 pcie_cap = 0, cap_data; + +	pos = PCIE_CAP_OFFSET; + +	if (!raw_pci_ext_ops) +		return 0; + +	while (pos) { +		if (raw_pci_ext_ops->read(pci_domain_nr(bus), bus->number, +					  devfn, pos, 4, &pcie_cap)) +			return 0; + +		if (pcie_cap == 0xffffffff) +			return 0; + +		if (PCI_EXT_CAP_ID(pcie_cap) == PCI_EXT_CAP_ID_VNDR) { +			raw_pci_ext_ops->read(pci_domain_nr(bus), bus->number, +					      devfn, pos + 4, 4, &cap_data); +			if ((cap_data & 0xffff) == PCIE_VNDR_CAP_ID_FIXED_BAR) +				return pos; +		} + +		pos = pcie_cap >> 20; +	} + +	return 0; +} + +static int pci_device_update_fixed(struct pci_bus *bus, unsigned int devfn, +				   int reg, int len, u32 val, int offset) +{ +	u32 size; +	unsigned int domain, busnum; +	int bar = (reg - PCI_BASE_ADDRESS_0) >> 2; + +	domain = pci_domain_nr(bus); +	busnum = bus->number; + +	if (val == ~0 && len == 4) { +		unsigned long decode; + +		raw_pci_ext_ops->read(domain, busnum, devfn, +			       offset + 8 + (bar * 4), 4, &size); + +		/* Turn the size into a decode pattern for the sizing code */ +		if (size) { +			decode = size - 1; +			decode |= decode >> 1; +			decode |= decode >> 2; +			decode |= decode >> 4; +			decode |= decode >> 8; +			decode |= decode >> 16; +			decode++; +			decode = ~(decode - 1); +		} else { +			decode = ~0; +		} + +		/* +		 * If val is all ones, the core code is trying to size the reg, +		 * so update the mmconfig space with the real size. 
+		 * +		 * Note: this assumes the fixed size we got is a power of two. +		 */ +		return raw_pci_ext_ops->write(domain, busnum, devfn, reg, 4, +				       decode); +	} + +	/* This is some other kind of BAR write, so just do it. */ +	return raw_pci_ext_ops->write(domain, busnum, devfn, reg, len, val); +} + +/** + * type1_access_ok - check whether to use type 1 + * @bus: bus number + * @devfn: device & function in question + * + * If the bus is on a Lincroft chip and it exists, or is not on a Lincroft at + * all, then we can go ahead with any reads & writes.  If it's on a Lincroft, + * but doesn't exist, avoid the access altogether to keep the chip from + * hanging. + */ +static bool type1_access_ok(unsigned int bus, unsigned int devfn, int reg) +{ +	/* This is a workaround for an A0 LNC bug where the PCI status register +	 * does not have the new CAP bit set; it cannot be written by SW either. +	 * +	 * The PCI header type in real LNC indicates a single-function device, +	 * which would prevent probing other devices under the same function in +	 * the PCI shim. Therefore, use the header type in the shim instead. +	 */ +	if (reg >= 0x100 || reg == PCI_STATUS || reg == PCI_HEADER_TYPE) +		return 0; +	if (bus == 0 && (devfn == PCI_DEVFN(2, 0) || devfn == PCI_DEVFN(0, 0))) +		return 1; +	return 0; /* langwell on others */ +} + +static int pci_read(struct pci_bus *bus, unsigned int devfn, int where, +		    int size, u32 *value) +{ +	if (type1_access_ok(bus->number, devfn, where)) +		return pci_direct_conf1.read(pci_domain_nr(bus), bus->number, +					devfn, where, size, value); +	return raw_pci_ext_ops->read(pci_domain_nr(bus), bus->number, +			      devfn, where, size, value); +} + +static int pci_write(struct pci_bus *bus, unsigned int devfn, int where, +		     int size, u32 value) +{ +	int offset; + +	/* On MRST there is no PCI ROM BAR; this write would cause a subsequent +	 * ROM BAR read to return 0 and then be ignored. +	 */ +	if (where == PCI_ROM_ADDRESS) +		return 0; + +	/* +	 * Devices with fixed BARs need special handling: +	 *   - BAR sizing code will save, write ~0, read size, restore +	 *   - so writes to fixed BARs need special handling +	 *   - other writes to fixed BAR devices should go through mmconfig +	 */ +	offset = fixed_bar_cap(bus, devfn); +	if (offset && +	    (where >= PCI_BASE_ADDRESS_0 && where <= PCI_BASE_ADDRESS_5)) { +		return pci_device_update_fixed(bus, devfn, where, size, value, +					       offset); +	} + +	/* +	 * On Moorestown update both real & mmconfig space +	 * Note: early Lincroft silicon can't handle type 1 accesses to +	 *       non-existent devices, so just eat the write in that case. +	 */ +	if (type1_access_ok(bus->number, devfn, where)) +		return pci_direct_conf1.write(pci_domain_nr(bus), bus->number, +					      devfn, where, size, value); +	return raw_pci_ext_ops->write(pci_domain_nr(bus), bus->number, devfn, +			       where, size, value); +} + +static int mrst_pci_irq_enable(struct pci_dev *dev) +{ +	u8 pin; +	struct io_apic_irq_attr irq_attr; + +	pci_read_config_byte(dev, PCI_INTERRUPT_PIN, &pin); + +	/* MRST only has an IOAPIC; the PCI irq lines are 1:1 mapped to +	 * IOAPIC RTE entries, so we just enable the RTE for the device. 
+	 */ +	irq_attr.ioapic = mp_find_ioapic(dev->irq); +	irq_attr.ioapic_pin = dev->irq; +	irq_attr.trigger = 1; /* level */ +	irq_attr.polarity = 1; /* active low */ +	io_apic_set_pci_routing(&dev->dev, dev->irq, &irq_attr); + +	return 0; +} + +struct pci_ops pci_mrst_ops = { +	.read = pci_read, +	.write = pci_write, +}; + +/** + * pci_mrst_init - installs pci_mrst_ops + * + * Moorestown has an interesting PCI implementation (see above). + * Called when the early platform detection installs it. + */ +int __init pci_mrst_init(void) +{ +	printk(KERN_INFO "Moorestown platform detected, using MRST PCI ops\n"); +	pci_mmcfg_late_init(); +	pcibios_enable_irq = mrst_pci_irq_enable; +	pci_root_ops = pci_mrst_ops; +	/* Continue with standard init */ +	return 1; +} + +/* + * Langwell devices reside at fixed offsets, don't try to move them. + */ +static void __devinit pci_fixed_bar_fixup(struct pci_dev *dev) +{ +	unsigned long offset; +	u32 size; +	int i; + +	/* Fixup the BAR sizes for fixed BAR devices and make them unmoveable */ +	offset = fixed_bar_cap(dev->bus, dev->devfn); +	if (!offset || PCI_DEVFN(2, 0) == dev->devfn || +	    PCI_DEVFN(2, 2) == dev->devfn) +		return; + +	for (i = 0; i < PCI_ROM_RESOURCE; i++) { +		pci_read_config_dword(dev, offset + 8 + (i * 4), &size); +		dev->resource[i].end = dev->resource[i].start + size - 1; +		dev->resource[i].flags |= IORESOURCE_PCI_FIXED; +	} +} +DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, PCI_ANY_ID, pci_fixed_bar_fixup); diff --git a/arch/x86/pci/numaq_32.c b/arch/x86/pci/numaq_32.c index 8884a1c1ada6..8223738ad806 100644 --- a/arch/x86/pci/numaq_32.c +++ b/arch/x86/pci/numaq_32.c @@ -148,14 +148,8 @@ int __init pci_numaq_init(void)  {  	int quad; -	if (!found_numaq) -		return 0; -  	raw_pci_ops = &pci_direct_conf1_mq; -	if (pcibios_scanned++) -		return 0; -  	pci_root_bus = pcibios_scan_root(0);  	if (pci_root_bus)  		pci_bus_add_devices(pci_root_bus); diff --git a/arch/x86/pci/olpc.c b/arch/x86/pci/olpc.c index b889d824f7c6..b34815408f58 100644 --- a/arch/x86/pci/olpc.c +++ b/arch/x86/pci/olpc.c @@ -304,9 +304,6 @@ static struct pci_raw_ops pci_olpc_conf = {  int __init pci_olpc_init(void)  { -	if (!machine_is_olpc() || olpc_has_vsa()) -		return -ENODEV; -  	printk(KERN_INFO "PCI: Using configuration type OLPC\n");  	raw_pci_ops = &pci_olpc_conf;  	is_lx = is_geode_lx(); diff --git a/arch/x86/pci/visws.c b/arch/x86/pci/visws.c index bcead7a46871..03008f72eb04 100644 --- a/arch/x86/pci/visws.c +++ b/arch/x86/pci/visws.c @@ -69,9 +69,6 @@ void __init pcibios_update_irq(struct pci_dev *dev, int irq)  int __init pci_visws_init(void)  { -	if (!is_visws_box()) -		return -1; -  	pcibios_enable_irq = &pci_visws_enable_irq;  	pcibios_disable_irq = &pci_visws_disable_irq; @@ -90,5 +87,6 @@ int __init pci_visws_init(void)  	pci_scan_bus_with_sysdata(pci_bus1);  	pci_fixup_irqs(pci_common_swizzle, visws_map_irq);  	pcibios_resource_survey(); -	return 0; +	/* Request bus scan */ +	return 1;  } diff --git a/arch/x86/xen/smp.c b/arch/x86/xen/smp.c index 563d20504988..deafb65ef44e 100644 --- a/arch/x86/xen/smp.c +++ b/arch/x86/xen/smp.c @@ -361,7 +361,7 @@ static void xen_cpu_die(unsigned int cpu)  		alternatives_smp_switch(0);  } -static void __cpuinit xen_play_dead(void) /* used only with CPU_HOTPLUG */ +static void __cpuinit xen_play_dead(void) /* used only with HOTPLUG_CPU */  {  	play_dead_common();  	HYPERVISOR_vcpu_op(VCPUOP_down, smp_processor_id(), NULL); | 
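
For reference, the protected-mode checks added to kvm_load_segment_descriptor() earlier in this diff follow the SDM loading rules; for SS they require a writable data segment whose DPL and selector RPL both equal the CPL. The following is a standalone sketch of just that SS check, mirroring the bit tests in the hunk above; the function name is invented for the example.

#include <stdbool.h>
#include <stdint.h>

/* type: x86 descriptor type field; bit 3 = code segment, bit 1 = writable for data segments. */
static bool ss_load_ok(uint16_t selector, uint8_t type, uint8_t dpl, uint8_t cpl)
{
	uint8_t rpl = selector & 3;

	/* must be a writable data segment: (type & 0xa) == 0x2 */
	if ((type & 0xa) != 0x2)
		return false;
	/* the selector's RPL and the descriptor's DPL must both equal the CPL */
	return rpl == cpl && dpl == cpl;
}
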

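The pci_device_update_fixed() hunk above turns a fixed BAR's size into the all-ones "decode" pattern that the generic BAR sizing code expects to read back after writing ~0. Below is a standalone sketch of that conversion, using the same bit-smearing trick and, as the patch notes, assuming the reported size is a power of two.

#include <stdint.h>
#include <stdio.h>

/* Round size up to a power of two and return the sizing mask ~(size - 1). */
static uint32_t fixed_bar_decode(uint32_t size)
{
	uint32_t decode;

	if (!size)
		return ~0u;		/* no BAR: decode as all ones */

	decode = size - 1;
	decode |= decode >> 1;		/* smear the highest set bit downwards */
	decode |= decode >> 2;
	decode |= decode >> 4;
	decode |= decode >> 8;
	decode |= decode >> 16;
	decode++;			/* now a power of two >= size */
	return ~(decode - 1);		/* e.g. size 0x1000 -> 0xfffff000 */
}

int main(void)
{
	printf("0x%08x\n", fixed_bar_decode(0x1000));	/* prints 0xfffff000 */
	return 0;
}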