Diffstat (limited to 'arch/x86')
144 files changed, 3286 insertions, 1655 deletions
diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index 46c3bff3ced2..65a872bf72f9 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -69,8 +69,8 @@ config X86 select HAVE_PERF_USER_STACK_DUMP select HAVE_DEBUG_KMEMLEAK select ANON_INODES - select HAVE_ALIGNED_STRUCT_PAGE if SLUB && !M386 - select HAVE_CMPXCHG_LOCAL if !M386 + select HAVE_ALIGNED_STRUCT_PAGE if SLUB + select HAVE_CMPXCHG_LOCAL select HAVE_CMPXCHG_DOUBLE select HAVE_ARCH_KMEMCHECK select HAVE_USER_RETURN_NOTIFIER @@ -106,12 +106,13 @@ config X86 select KTIME_SCALAR if X86_32 select GENERIC_STRNCPY_FROM_USER select GENERIC_STRNLEN_USER - select HAVE_RCU_USER_QS if X86_64 + select HAVE_CONTEXT_TRACKING if X86_64 select HAVE_IRQ_TIME_ACCOUNTING select GENERIC_KERNEL_THREAD select GENERIC_KERNEL_EXECVE select MODULES_USE_ELF_REL if X86_32 select MODULES_USE_ELF_RELA if X86_64 + select CLONE_BACKWARDS if X86_32 config INSTRUCTION_DECODER def_bool y @@ -171,13 +172,8 @@ config ARCH_MAY_HAVE_PC_FDC def_bool y depends on ISA_DMA_API -config RWSEM_GENERIC_SPINLOCK - def_bool y - depends on !X86_XADD - config RWSEM_XCHGADD_ALGORITHM def_bool y - depends on X86_XADD config GENERIC_CALIBRATE_DELAY def_bool y @@ -310,7 +306,7 @@ config X86_X2APIC If you don't know what to do here, say N. config X86_MPPARSE - bool "Enable MPS table" if ACPI + bool "Enable MPS table" if ACPI || SFI default y depends on X86_LOCAL_APIC ---help--- @@ -374,6 +370,7 @@ config X86_NUMACHIP depends on NUMA depends on SMP depends on X86_X2APIC + depends on PCI_MMCONFIG ---help--- Adds support for Numascale NumaChip large-SMP systems. Needed to enable more than ~168 cores. @@ -1100,7 +1097,7 @@ config HIGHMEM4G config HIGHMEM64G bool "64GB" - depends on !M386 && !M486 + depends on !M486 select X86_PAE ---help--- Select this if you have a 32-bit processor and more than 4 @@ -1698,6 +1695,50 @@ config HOTPLUG_CPU automatically on SMP systems. ) Say N if you want to disable CPU hotplug. +config BOOTPARAM_HOTPLUG_CPU0 + bool "Set default setting of cpu0_hotpluggable" + default n + depends on HOTPLUG_CPU && EXPERIMENTAL + ---help--- + Set whether default state of cpu0_hotpluggable is on or off. + + Say Y here to enable CPU0 hotplug by default. If this switch + is turned on, there is no need to give cpu0_hotplug kernel + parameter and the CPU0 hotplug feature is enabled by default. + + Please note: there are two known CPU0 dependencies if you want + to enable the CPU0 hotplug feature either by this switch or by + cpu0_hotplug kernel parameter. + + First, resume from hibernate or suspend always starts from CPU0. + So hibernate and suspend are prevented if CPU0 is offline. + + Second dependency is PIC interrupts always go to CPU0. CPU0 can not + offline if any interrupt can not migrate out of CPU0. There may + be other CPU0 dependencies. + + Please make sure the dependencies are under your control before + you enable this feature. + + Say N if you don't want to enable CPU0 hotplug feature by default. + You still can enable the CPU0 hotplug feature at boot by kernel + parameter cpu0_hotplug. + +config DEBUG_HOTPLUG_CPU0 + def_bool n + prompt "Debug CPU0 hotplug" + depends on HOTPLUG_CPU && EXPERIMENTAL + ---help--- + Enabling this option offlines CPU0 (if CPU0 can be offlined) as + soon as possible and boots up userspace with CPU0 offlined. User + can online CPU0 back after boot time. 
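(As a quick illustration of that last step — a minimal user-space sketch, not part of this patch, assuming the standard sysfs CPU-hotplug attribute /sys/devices/system/cpu/cpu0/online and root privileges:)

#include <fcntl.h>
#include <stdio.h>
#include <unistd.h>

int main(void)
{
	/* Standard sysfs CPU-hotplug attribute; writing "1" onlines the CPU,
	 * writing "0" offlines it (when the kernel permits CPU0 hotplug). */
	const char *path = "/sys/devices/system/cpu/cpu0/online";
	int fd = open(path, O_WRONLY);

	if (fd < 0) {
		perror("open cpu0/online");
		return 1;
	}
	if (write(fd, "1", 1) != 1) {
		perror("write cpu0/online");
		close(fd);
		return 1;
	}
	close(fd);
	return 0;
}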
+ + To debug CPU0 hotplug, you need to enable CPU0 offline/online + feature by either turning on CONFIG_BOOTPARAM_HOTPLUG_CPU0 during + compilation or giving cpu0_hotplug kernel parameter at boot. + + If unsure, say N. + config COMPAT_VDSO def_bool y prompt "Compat VDSO support" diff --git a/arch/x86/Kconfig.cpu b/arch/x86/Kconfig.cpu index f3b86d0df44e..c026cca5602c 100644 --- a/arch/x86/Kconfig.cpu +++ b/arch/x86/Kconfig.cpu @@ -4,23 +4,24 @@ choice default M686 if X86_32 default GENERIC_CPU if X86_64 -config M386 - bool "386" - depends on X86_32 && !UML +config M486 + bool "486" + depends on X86_32 ---help--- - This is the processor type of your CPU. This information is used for - optimizing purposes. In order to compile a kernel that can run on - all x86 CPU types (albeit not optimally fast), you can specify - "386" here. + This is the processor type of your CPU. This information is + used for optimizing purposes. In order to compile a kernel + that can run on all supported x86 CPU types (albeit not + optimally fast), you can specify "486" here. + + Note that the 386 is no longer supported, this includes + AMD/Cyrix/Intel 386DX/DXL/SL/SLC/SX, Cyrix/TI 486DLC/DLC2, + UMC 486SX-S and the NexGen Nx586. The kernel will not necessarily run on earlier architectures than the one you have chosen, e.g. a Pentium optimized kernel will run on a PPro, but not necessarily on a i486. Here are the settings recommended for greatest speed: - - "386" for the AMD/Cyrix/Intel 386DX/DXL/SL/SLC/SX, Cyrix/TI - 486DLC/DLC2, and UMC 486SX-S. Only "386" kernels will run on a 386 - class machine. - "486" for the AMD/Cyrix/IBM/Intel 486DX/DX2/DX4 or SL/SLC/SLC2/SLC3/SX/SX2 and UMC U5D or U5S. - "586" for generic Pentium CPUs lacking the TSC @@ -43,16 +44,7 @@ config M386 - "VIA C3-2" for VIA C3-2 "Nehemiah" (model 9 and above). - "VIA C7" for VIA C7. - If you don't know what to do, choose "386". - -config M486 - bool "486" - depends on X86_32 - ---help--- - Select this for a 486 series processor, either Intel or one of the - compatible processors from AMD, Cyrix, IBM, or Intel. Includes DX, - DX2, and DX4 variants; also SL/SLC/SLC2/SLC3/SX/SX2 and UMC U5D or - U5S. + If you don't know what to do, choose "486". config M586 bool "586/K5/5x86/6x86/6x86MX" @@ -305,24 +297,16 @@ config X86_INTERNODE_CACHE_SHIFT default "12" if X86_VSMP default X86_L1_CACHE_SHIFT -config X86_CMPXCHG - def_bool y - depends on X86_64 || (X86_32 && !M386) - config X86_L1_CACHE_SHIFT int default "7" if MPENTIUM4 || MPSC default "6" if MK7 || MK8 || MPENTIUMM || MCORE2 || MATOM || MVIAC7 || X86_GENERIC || GENERIC_CPU - default "4" if MELAN || M486 || M386 || MGEODEGX1 + default "4" if MELAN || M486 || MGEODEGX1 default "5" if MWINCHIP3D || MWINCHIPC6 || MCRUSOE || MEFFICEON || MCYRIXIII || MK6 || MPENTIUMIII || MPENTIUMII || M686 || M586MMX || M586TSC || M586 || MVIAC3_2 || MGEODE_LX -config X86_XADD - def_bool y - depends on !M386 - config X86_PPRO_FENCE bool "PentiumPro memory ordering errata workaround" - depends on M686 || M586MMX || M586TSC || M586 || M486 || M386 || MGEODEGX1 + depends on M686 || M586MMX || M586TSC || M586 || M486 || MGEODEGX1 ---help--- Old PentiumPro multiprocessor systems had errata that could cause memory operations to violate the x86 ordering standard in rare cases. 
@@ -335,27 +319,11 @@ config X86_PPRO_FENCE config X86_F00F_BUG def_bool y - depends on M586MMX || M586TSC || M586 || M486 || M386 + depends on M586MMX || M586TSC || M586 || M486 config X86_INVD_BUG def_bool y - depends on M486 || M386 - -config X86_WP_WORKS_OK - def_bool y - depends on !M386 - -config X86_INVLPG - def_bool y - depends on X86_32 && !M386 - -config X86_BSWAP - def_bool y - depends on X86_32 && !M386 - -config X86_POPAD_OK - def_bool y - depends on X86_32 && !M386 + depends on M486 config X86_ALIGNMENT_16 def_bool y @@ -412,12 +380,11 @@ config X86_MINIMUM_CPU_FAMILY default "64" if X86_64 default "6" if X86_32 && X86_P6_NOP default "5" if X86_32 && X86_CMPXCHG64 - default "4" if X86_32 && (X86_XADD || X86_CMPXCHG || X86_BSWAP || X86_WP_WORKS_OK) - default "3" + default "4" config X86_DEBUGCTLMSR def_bool y - depends on !(MK6 || MWINCHIPC6 || MWINCHIP3D || MCYRIXIII || M586MMX || M586TSC || M586 || M486 || M386) && !UML + depends on !(MK6 || MWINCHIPC6 || MWINCHIP3D || MCYRIXIII || M586MMX || M586TSC || M586 || M486) && !UML menuconfig PROCESSOR_SELECT bool "Supported processor vendors" if EXPERT @@ -441,7 +408,7 @@ config CPU_SUP_INTEL config CPU_SUP_CYRIX_32 default y bool "Support Cyrix processors" if PROCESSOR_SELECT - depends on M386 || M486 || M586 || M586TSC || M586MMX || (EXPERT && !64BIT) + depends on M486 || M586 || M586TSC || M586MMX || (EXPERT && !64BIT) ---help--- This enables detection, tunings and quirks for Cyrix processors @@ -495,7 +462,7 @@ config CPU_SUP_TRANSMETA_32 config CPU_SUP_UMC_32 default y bool "Support UMC processors" if PROCESSOR_SELECT - depends on M386 || M486 || (EXPERT && !64BIT) + depends on M486 || (EXPERT && !64BIT) ---help--- This enables detection, tunings and quirks for UMC processors diff --git a/arch/x86/Makefile_32.cpu b/arch/x86/Makefile_32.cpu index 86cee7b749e1..6647ed49c66c 100644 --- a/arch/x86/Makefile_32.cpu +++ b/arch/x86/Makefile_32.cpu @@ -10,7 +10,6 @@ tune = $(call cc-option,-mcpu=$(1),$(2)) endif align := $(cc-option-align) -cflags-$(CONFIG_M386) += -march=i386 cflags-$(CONFIG_M486) += -march=i486 cflags-$(CONFIG_M586) += -march=i586 cflags-$(CONFIG_M586TSC) += -march=i586 diff --git a/arch/x86/boot/.gitignore b/arch/x86/boot/.gitignore index 851fe936d242..e3cf9f682be5 100644 --- a/arch/x86/boot/.gitignore +++ b/arch/x86/boot/.gitignore @@ -2,7 +2,6 @@ bootsect bzImage cpustr.h mkcpustr -offsets.h voffset.h zoffset.h setup diff --git a/arch/x86/boot/compressed/eboot.c b/arch/x86/boot/compressed/eboot.c index c760e073963e..b1942e222768 100644 --- a/arch/x86/boot/compressed/eboot.c +++ b/arch/x86/boot/compressed/eboot.c @@ -8,10 +8,13 @@ * ----------------------------------------------------------------------- */ #include <linux/efi.h> +#include <linux/pci.h> #include <asm/efi.h> #include <asm/setup.h> #include <asm/desc.h> +#undef memcpy /* Use memcpy from misc.c */ + #include "eboot.h" static efi_system_table_t *sys_table; @@ -243,6 +246,121 @@ static void find_bits(unsigned long mask, u8 *pos, u8 *size) *size = len; } +static efi_status_t setup_efi_pci(struct boot_params *params) +{ + efi_pci_io_protocol *pci; + efi_status_t status; + void **pci_handle; + efi_guid_t pci_proto = EFI_PCI_IO_PROTOCOL_GUID; + unsigned long nr_pci, size = 0; + int i; + struct setup_data *data; + + data = (struct setup_data *)params->hdr.setup_data; + + while (data && data->next) + data = (struct setup_data *)data->next; + + status = efi_call_phys5(sys_table->boottime->locate_handle, + EFI_LOCATE_BY_PROTOCOL, &pci_proto, + NULL, &size, 
pci_handle); + + if (status == EFI_BUFFER_TOO_SMALL) { + status = efi_call_phys3(sys_table->boottime->allocate_pool, + EFI_LOADER_DATA, size, &pci_handle); + + if (status != EFI_SUCCESS) + return status; + + status = efi_call_phys5(sys_table->boottime->locate_handle, + EFI_LOCATE_BY_PROTOCOL, &pci_proto, + NULL, &size, pci_handle); + } + + if (status != EFI_SUCCESS) + goto free_handle; + + nr_pci = size / sizeof(void *); + for (i = 0; i < nr_pci; i++) { + void *h = pci_handle[i]; + uint64_t attributes; + struct pci_setup_rom *rom; + + status = efi_call_phys3(sys_table->boottime->handle_protocol, + h, &pci_proto, &pci); + + if (status != EFI_SUCCESS) + continue; + + if (!pci) + continue; + + status = efi_call_phys4(pci->attributes, pci, + EfiPciIoAttributeOperationGet, 0, + &attributes); + + if (status != EFI_SUCCESS) + continue; + + if (!attributes & EFI_PCI_IO_ATTRIBUTE_EMBEDDED_ROM) + continue; + + if (!pci->romimage || !pci->romsize) + continue; + + size = pci->romsize + sizeof(*rom); + + status = efi_call_phys3(sys_table->boottime->allocate_pool, + EFI_LOADER_DATA, size, &rom); + + if (status != EFI_SUCCESS) + continue; + + rom->data.type = SETUP_PCI; + rom->data.len = size - sizeof(struct setup_data); + rom->data.next = 0; + rom->pcilen = pci->romsize; + + status = efi_call_phys5(pci->pci.read, pci, + EfiPciIoWidthUint16, PCI_VENDOR_ID, + 1, &(rom->vendor)); + + if (status != EFI_SUCCESS) + goto free_struct; + + status = efi_call_phys5(pci->pci.read, pci, + EfiPciIoWidthUint16, PCI_DEVICE_ID, + 1, &(rom->devid)); + + if (status != EFI_SUCCESS) + goto free_struct; + + status = efi_call_phys5(pci->get_location, pci, + &(rom->segment), &(rom->bus), + &(rom->device), &(rom->function)); + + if (status != EFI_SUCCESS) + goto free_struct; + + memcpy(rom->romdata, pci->romimage, pci->romsize); + + if (data) + data->next = (uint64_t)rom; + else + params->hdr.setup_data = (uint64_t)rom; + + data = (struct setup_data *)rom; + + continue; + free_struct: + efi_call_phys1(sys_table->boottime->free_pool, rom); + } + +free_handle: + efi_call_phys1(sys_table->boottime->free_pool, pci_handle); + return status; +} + /* * See if we have Graphics Output Protocol */ @@ -1026,6 +1144,8 @@ struct boot_params *efi_main(void *handle, efi_system_table_t *_table, setup_graphics(boot_params); + setup_efi_pci(boot_params); + status = efi_call_phys3(sys_table->boottime->allocate_pool, EFI_LOADER_DATA, sizeof(*gdt), (void **)&gdt); diff --git a/arch/x86/boot/header.S b/arch/x86/boot/header.S index 2a017441b8b2..8c132a625b94 100644 --- a/arch/x86/boot/header.S +++ b/arch/x86/boot/header.S @@ -476,6 +476,3 @@ die: setup_corrupt: .byte 7 .string "No setup signature found...\n" - - .data -dummy: .long 0 diff --git a/arch/x86/ia32/ia32_aout.c b/arch/x86/ia32/ia32_aout.c index 07b3a68d2d29..a703af19c281 100644 --- a/arch/x86/ia32/ia32_aout.c +++ b/arch/x86/ia32/ia32_aout.c @@ -35,7 +35,7 @@ #undef WARN_OLD #undef CORE_DUMP /* definitely broken */ -static int load_aout_binary(struct linux_binprm *, struct pt_regs *regs); +static int load_aout_binary(struct linux_binprm *); static int load_aout_library(struct file *); #ifdef CORE_DUMP @@ -260,9 +260,10 @@ static u32 __user *create_aout_tables(char __user *p, struct linux_binprm *bprm) * These are the functions used to load a.out style executables and shared * libraries. There is no binary dependent code anywhere else. 
*/ -static int load_aout_binary(struct linux_binprm *bprm, struct pt_regs *regs) +static int load_aout_binary(struct linux_binprm *bprm) { unsigned long error, fd_offset, rlim; + struct pt_regs *regs = current_pt_regs(); struct exec ex; int retval; diff --git a/arch/x86/ia32/ia32entry.S b/arch/x86/ia32/ia32entry.S index 076745fc8045..32e6f05ddaaa 100644 --- a/arch/x86/ia32/ia32entry.S +++ b/arch/x86/ia32/ia32entry.S @@ -467,11 +467,16 @@ GLOBAL(\label) PTREGSCALL stub32_sigaltstack, sys32_sigaltstack, %rdx PTREGSCALL stub32_execve, compat_sys_execve, %rcx PTREGSCALL stub32_fork, sys_fork, %rdi - PTREGSCALL stub32_clone, sys32_clone, %rdx PTREGSCALL stub32_vfork, sys_vfork, %rdi PTREGSCALL stub32_iopl, sys_iopl, %rsi ALIGN +GLOBAL(stub32_clone) + leaq sys_clone(%rip),%rax + mov %r8, %rcx + jmp ia32_ptregs_common + + ALIGN ia32_ptregs_common: popq %r11 CFI_ENDPROC diff --git a/arch/x86/ia32/sys_ia32.c b/arch/x86/ia32/sys_ia32.c index 86d68d1c8806..d0b689ba7be2 100644 --- a/arch/x86/ia32/sys_ia32.c +++ b/arch/x86/ia32/sys_ia32.c @@ -385,17 +385,6 @@ asmlinkage long sys32_sendfile(int out_fd, int in_fd, return ret; } -asmlinkage long sys32_clone(unsigned int clone_flags, unsigned int newsp, - struct pt_regs *regs) -{ - void __user *parent_tid = (void __user *)regs->dx; - void __user *child_tid = (void __user *)regs->di; - - if (!newsp) - newsp = regs->sp; - return do_fork(clone_flags, newsp, regs, 0, parent_tid, child_tid); -} - /* * Some system calls that need sign extended arguments. This could be * done by a generic wrapper. diff --git a/arch/x86/include/asm/Kbuild b/arch/x86/include/asm/Kbuild index 66e5f0ef0523..79fd8a3418f9 100644 --- a/arch/x86/include/asm/Kbuild +++ b/arch/x86/include/asm/Kbuild @@ -12,6 +12,7 @@ header-y += mce.h header-y += msr-index.h header-y += msr.h header-y += mtrr.h +header-y += perf_regs.h header-y += posix_types_32.h header-y += posix_types_64.h header-y += posix_types_x32.h @@ -19,8 +20,10 @@ header-y += prctl.h header-y += processor-flags.h header-y += ptrace-abi.h header-y += sigcontext32.h +header-y += svm.h header-y += ucontext.h header-y += vm86.h +header-y += vmx.h header-y += vsyscall.h genhdr-y += unistd_32.h diff --git a/arch/x86/include/asm/atomic.h b/arch/x86/include/asm/atomic.h index b6c3b821acf6..722aa3b04624 100644 --- a/arch/x86/include/asm/atomic.h +++ b/arch/x86/include/asm/atomic.h @@ -172,23 +172,7 @@ static inline int atomic_add_negative(int i, atomic_t *v) */ static inline int atomic_add_return(int i, atomic_t *v) { -#ifdef CONFIG_M386 - int __i; - unsigned long flags; - if (unlikely(boot_cpu_data.x86 <= 3)) - goto no_xadd; -#endif - /* Modern 486+ processor */ return i + xadd(&v->counter, i); - -#ifdef CONFIG_M386 -no_xadd: /* Legacy 386 processor */ - raw_local_irq_save(flags); - __i = atomic_read(v); - atomic_set(v, i + __i); - raw_local_irq_restore(flags); - return i + __i; -#endif } /** diff --git a/arch/x86/include/asm/bootparam.h b/arch/x86/include/asm/bootparam.h index 2ad874cb661c..92862cd90201 100644 --- a/arch/x86/include/asm/bootparam.h +++ b/arch/x86/include/asm/bootparam.h @@ -13,6 +13,7 @@ #define SETUP_NONE 0 #define SETUP_E820_EXT 1 #define SETUP_DTB 2 +#define SETUP_PCI 3 /* extensible setup data list node */ struct setup_data { diff --git a/arch/x86/include/asm/clocksource.h b/arch/x86/include/asm/clocksource.h index 0bdbbb3b9ce7..16a57f4ed64d 100644 --- a/arch/x86/include/asm/clocksource.h +++ b/arch/x86/include/asm/clocksource.h @@ -8,6 +8,7 @@ #define VCLOCK_NONE 0 /* No vDSO clock available. 
*/ #define VCLOCK_TSC 1 /* vDSO should use vread_tsc. */ #define VCLOCK_HPET 2 /* vDSO should use vread_hpet. */ +#define VCLOCK_PVCLOCK 3 /* vDSO should use vread_pvclock. */ struct arch_clocksource_data { int vclock_mode; diff --git a/arch/x86/include/asm/cmpxchg_32.h b/arch/x86/include/asm/cmpxchg_32.h index 53f4b219336b..f8bf2eecab86 100644 --- a/arch/x86/include/asm/cmpxchg_32.h +++ b/arch/x86/include/asm/cmpxchg_32.h @@ -34,9 +34,7 @@ static inline void set_64bit(volatile u64 *ptr, u64 value) : "memory"); } -#ifdef CONFIG_X86_CMPXCHG #define __HAVE_ARCH_CMPXCHG 1 -#endif #ifdef CONFIG_X86_CMPXCHG64 #define cmpxchg64(ptr, o, n) \ @@ -73,59 +71,6 @@ static inline u64 __cmpxchg64_local(volatile u64 *ptr, u64 old, u64 new) return prev; } -#ifndef CONFIG_X86_CMPXCHG -/* - * Building a kernel capable running on 80386. It may be necessary to - * simulate the cmpxchg on the 80386 CPU. For that purpose we define - * a function for each of the sizes we support. - */ - -extern unsigned long cmpxchg_386_u8(volatile void *, u8, u8); -extern unsigned long cmpxchg_386_u16(volatile void *, u16, u16); -extern unsigned long cmpxchg_386_u32(volatile void *, u32, u32); - -static inline unsigned long cmpxchg_386(volatile void *ptr, unsigned long old, - unsigned long new, int size) -{ - switch (size) { - case 1: - return cmpxchg_386_u8(ptr, old, new); - case 2: - return cmpxchg_386_u16(ptr, old, new); - case 4: - return cmpxchg_386_u32(ptr, old, new); - } - return old; -} - -#define cmpxchg(ptr, o, n) \ -({ \ - __typeof__(*(ptr)) __ret; \ - if (likely(boot_cpu_data.x86 > 3)) \ - __ret = (__typeof__(*(ptr)))__cmpxchg((ptr), \ - (unsigned long)(o), (unsigned long)(n), \ - sizeof(*(ptr))); \ - else \ - __ret = (__typeof__(*(ptr)))cmpxchg_386((ptr), \ - (unsigned long)(o), (unsigned long)(n), \ - sizeof(*(ptr))); \ - __ret; \ -}) -#define cmpxchg_local(ptr, o, n) \ -({ \ - __typeof__(*(ptr)) __ret; \ - if (likely(boot_cpu_data.x86 > 3)) \ - __ret = (__typeof__(*(ptr)))__cmpxchg_local((ptr), \ - (unsigned long)(o), (unsigned long)(n), \ - sizeof(*(ptr))); \ - else \ - __ret = (__typeof__(*(ptr)))cmpxchg_386((ptr), \ - (unsigned long)(o), (unsigned long)(n), \ - sizeof(*(ptr))); \ - __ret; \ -}) -#endif - #ifndef CONFIG_X86_CMPXCHG64 /* * Building a kernel capable running on 80386 and 80486. 
It may be necessary diff --git a/arch/x86/include/asm/rcu.h b/arch/x86/include/asm/context_tracking.h index d1ac07a23979..1616562683e9 100644 --- a/arch/x86/include/asm/rcu.h +++ b/arch/x86/include/asm/context_tracking.h @@ -1,27 +1,26 @@ -#ifndef _ASM_X86_RCU_H -#define _ASM_X86_RCU_H +#ifndef _ASM_X86_CONTEXT_TRACKING_H +#define _ASM_X86_CONTEXT_TRACKING_H #ifndef __ASSEMBLY__ - -#include <linux/rcupdate.h> +#include <linux/context_tracking.h> #include <asm/ptrace.h> static inline void exception_enter(struct pt_regs *regs) { - rcu_user_exit(); + user_exit(); } static inline void exception_exit(struct pt_regs *regs) { -#ifdef CONFIG_RCU_USER_QS +#ifdef CONFIG_CONTEXT_TRACKING if (user_mode(regs)) - rcu_user_enter(); + user_enter(); #endif } #else /* __ASSEMBLY__ */ -#ifdef CONFIG_RCU_USER_QS +#ifdef CONFIG_CONTEXT_TRACKING # define SCHEDULE_USER call schedule_user #else # define SCHEDULE_USER call schedule diff --git a/arch/x86/include/asm/cpu.h b/arch/x86/include/asm/cpu.h index 4564c8e28a33..5f9a1243190e 100644 --- a/arch/x86/include/asm/cpu.h +++ b/arch/x86/include/asm/cpu.h @@ -28,6 +28,10 @@ struct x86_cpu { #ifdef CONFIG_HOTPLUG_CPU extern int arch_register_cpu(int num); extern void arch_unregister_cpu(int); +extern void __cpuinit start_cpu0(void); +#ifdef CONFIG_DEBUG_HOTPLUG_CPU0 +extern int _debug_hotplug_cpu(int cpu, int action); +#endif #endif DECLARE_PER_CPU(int, cpu_state); diff --git a/arch/x86/include/asm/cpufeature.h b/arch/x86/include/asm/cpufeature.h index 8c297aa53eef..2d9075e863a0 100644 --- a/arch/x86/include/asm/cpufeature.h +++ b/arch/x86/include/asm/cpufeature.h @@ -202,6 +202,7 @@ /* Intel-defined CPU features, CPUID level 0x00000007:0 (ebx), word 9 */ #define X86_FEATURE_FSGSBASE (9*32+ 0) /* {RD/WR}{FS/GS}BASE instructions*/ +#define X86_FEATURE_TSC_ADJUST (9*32+ 1) /* TSC adjustment MSR 0x3b */ #define X86_FEATURE_BMI1 (9*32+ 3) /* 1st group bit manipulation extensions */ #define X86_FEATURE_HLE (9*32+ 4) /* Hardware Lock Elision */ #define X86_FEATURE_AVX2 (9*32+ 5) /* AVX2 instructions */ @@ -311,12 +312,7 @@ extern const char * const x86_power_flags[32]; #define cpu_has_cx8 boot_cpu_has(X86_FEATURE_CX8) #define cpu_has_cx16 boot_cpu_has(X86_FEATURE_CX16) #define cpu_has_eager_fpu boot_cpu_has(X86_FEATURE_EAGER_FPU) - -#if defined(CONFIG_X86_INVLPG) || defined(CONFIG_X86_64) -# define cpu_has_invlpg 1 -#else -# define cpu_has_invlpg (boot_cpu_data.x86 > 3) -#endif +#define cpu_has_topoext boot_cpu_has(X86_FEATURE_TOPOEXT) #ifdef CONFIG_X86_64 diff --git a/arch/x86/include/asm/device.h b/arch/x86/include/asm/device.h index 93e1c55f14ab..03dd72957d2f 100644 --- a/arch/x86/include/asm/device.h +++ b/arch/x86/include/asm/device.h @@ -2,9 +2,6 @@ #define _ASM_X86_DEVICE_H struct dev_archdata { -#ifdef CONFIG_ACPI - void *acpi_handle; -#endif #ifdef CONFIG_X86_DEV_DMA_OPS struct dma_map_ops *dma_ops; #endif diff --git a/arch/x86/include/asm/efi.h b/arch/x86/include/asm/efi.h index c9dcc181d4d1..6e8fdf5ad113 100644 --- a/arch/x86/include/asm/efi.h +++ b/arch/x86/include/asm/efi.h @@ -35,7 +35,7 @@ extern unsigned long asmlinkage efi_call_phys(void *, ...); #define efi_call_virt6(f, a1, a2, a3, a4, a5, a6) \ efi_call_virt(f, a1, a2, a3, a4, a5, a6) -#define efi_ioremap(addr, size, type) ioremap_cache(addr, size) +#define efi_ioremap(addr, size, type, attr) ioremap_cache(addr, size) #else /* !CONFIG_X86_32 */ @@ -89,7 +89,7 @@ extern u64 efi_call6(void *fp, u64 arg1, u64 arg2, u64 arg3, (u64)(a3), (u64)(a4), (u64)(a5), (u64)(a6)) extern void __iomem 
*efi_ioremap(unsigned long addr, unsigned long size, - u32 type); + u32 type, u64 attribute); #endif /* CONFIG_X86_32 */ @@ -98,6 +98,8 @@ extern void efi_set_executable(efi_memory_desc_t *md, bool executable); extern int efi_memblock_x86_reserve_range(void); extern void efi_call_phys_prelog(void); extern void efi_call_phys_epilog(void); +extern void efi_unmap_memmap(void); +extern void efi_memory_uc(u64 addr, unsigned long size); #ifndef CONFIG_EFI /* diff --git a/arch/x86/include/asm/elf.h b/arch/x86/include/asm/elf.h index 5939f44fe0c0..9c999c1674fa 100644 --- a/arch/x86/include/asm/elf.h +++ b/arch/x86/include/asm/elf.h @@ -354,12 +354,10 @@ static inline int mmap_is_ia32(void) return 0; } -/* The first two values are special, do not change. See align_addr() */ +/* Do not change the values. See get_align_mask() */ enum align_flags { ALIGN_VA_32 = BIT(0), ALIGN_VA_64 = BIT(1), - ALIGN_VDSO = BIT(2), - ALIGN_TOPDOWN = BIT(3), }; struct va_alignment { @@ -368,5 +366,5 @@ struct va_alignment { } ____cacheline_aligned; extern struct va_alignment va_align; -extern unsigned long align_addr(unsigned long, struct file *, enum align_flags); +extern unsigned long align_vdso_addr(unsigned long); #endif /* _ASM_X86_ELF_H */ diff --git a/arch/x86/include/asm/fixmap.h b/arch/x86/include/asm/fixmap.h index 4da3c0c4c974..a09c28571064 100644 --- a/arch/x86/include/asm/fixmap.h +++ b/arch/x86/include/asm/fixmap.h @@ -19,6 +19,7 @@ #include <asm/acpi.h> #include <asm/apicdef.h> #include <asm/page.h> +#include <asm/pvclock.h> #ifdef CONFIG_X86_32 #include <linux/threads.h> #include <asm/kmap_types.h> @@ -81,6 +82,10 @@ enum fixed_addresses { VVAR_PAGE, VSYSCALL_HPET, #endif +#ifdef CONFIG_PARAVIRT_CLOCK + PVCLOCK_FIXMAP_BEGIN, + PVCLOCK_FIXMAP_END = PVCLOCK_FIXMAP_BEGIN+PVCLOCK_VSYSCALL_NR_PAGES-1, +#endif FIX_DBGP_BASE, FIX_EARLYCON_MEM_BASE, #ifdef CONFIG_PROVIDE_OHCI1394_DMA_INIT diff --git a/arch/x86/include/asm/fpu-internal.h b/arch/x86/include/asm/fpu-internal.h index 831dbb9c6c02..41ab26ea6564 100644 --- a/arch/x86/include/asm/fpu-internal.h +++ b/arch/x86/include/asm/fpu-internal.h @@ -399,14 +399,17 @@ static inline void drop_init_fpu(struct task_struct *tsk) typedef struct { int preload; } fpu_switch_t; /* - * FIXME! We could do a totally lazy restore, but we need to - * add a per-cpu "this was the task that last touched the FPU - * on this CPU" variable, and the task needs to have a "I last - * touched the FPU on this CPU" and check them. + * Must be run with preemption disabled: this clears the fpu_owner_task, + * on this CPU. * - * We don't do that yet, so "fpu_lazy_restore()" always returns - * false, but some day.. + * This will disable any lazy FPU state restore of the current FPU state, + * but if the current thread owns the FPU, it will still be saved by. 
*/ +static inline void __cpu_disable_lazy_restore(unsigned int cpu) +{ + per_cpu(fpu_owner_task, cpu) = NULL; +} + static inline int fpu_lazy_restore(struct task_struct *new, unsigned int cpu) { return new == this_cpu_read_stable(fpu_owner_task) && diff --git a/arch/x86/include/asm/futex.h b/arch/x86/include/asm/futex.h index f373046e63ec..be27ba1e947a 100644 --- a/arch/x86/include/asm/futex.h +++ b/arch/x86/include/asm/futex.h @@ -55,12 +55,6 @@ static inline int futex_atomic_op_inuser(int encoded_op, u32 __user *uaddr) if (!access_ok(VERIFY_WRITE, uaddr, sizeof(u32))) return -EFAULT; -#if defined(CONFIG_X86_32) && !defined(CONFIG_X86_BSWAP) - /* Real i386 machines can only support FUTEX_OP_SET */ - if (op != FUTEX_OP_SET && boot_cpu_data.x86 == 3) - return -ENOSYS; -#endif - pagefault_disable(); switch (op) { @@ -118,12 +112,6 @@ static inline int futex_atomic_cmpxchg_inatomic(u32 *uval, u32 __user *uaddr, { int ret = 0; -#if defined(CONFIG_X86_32) && !defined(CONFIG_X86_BSWAP) - /* Real i386 machines have no cmpxchg instruction */ - if (boot_cpu_data.x86 == 3) - return -ENOSYS; -#endif - if (!access_ok(VERIFY_WRITE, uaddr, sizeof(u32))) return -EFAULT; diff --git a/arch/x86/include/asm/kexec.h b/arch/x86/include/asm/kexec.h index 317ff1703d0b..6080d2694bad 100644 --- a/arch/x86/include/asm/kexec.h +++ b/arch/x86/include/asm/kexec.h @@ -163,6 +163,9 @@ struct kimage_arch { }; #endif +typedef void crash_vmclear_fn(void); +extern crash_vmclear_fn __rcu *crash_vmclear_loaded_vmcss; + #endif /* __ASSEMBLY__ */ #endif /* _ASM_X86_KEXEC_H */ diff --git a/arch/x86/include/asm/kvm_guest.h b/arch/x86/include/asm/kvm_guest.h new file mode 100644 index 000000000000..a92b1763c419 --- /dev/null +++ b/arch/x86/include/asm/kvm_guest.h @@ -0,0 +1,6 @@ +#ifndef _ASM_X86_KVM_GUEST_H +#define _ASM_X86_KVM_GUEST_H + +int kvm_setup_vsyscall_timeinfo(void); + +#endif /* _ASM_X86_KVM_GUEST_H */ diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index b2e11f452435..dc87b65e9c3a 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -22,6 +22,8 @@ #include <linux/kvm_para.h> #include <linux/kvm_types.h> #include <linux/perf_event.h> +#include <linux/pvclock_gtod.h> +#include <linux/clocksource.h> #include <asm/pvclock-abi.h> #include <asm/desc.h> @@ -442,6 +444,7 @@ struct kvm_vcpu_arch { s8 virtual_tsc_shift; u32 virtual_tsc_mult; u32 virtual_tsc_khz; + s64 ia32_tsc_adjust_msr; atomic_t nmi_queued; /* unprocessed asynchronous NMIs */ unsigned nmi_pending; /* NMI queued after currently running handler */ @@ -559,6 +562,12 @@ struct kvm_arch { u64 cur_tsc_write; u64 cur_tsc_offset; u8 cur_tsc_generation; + int nr_vcpus_matched_tsc; + + spinlock_t pvclock_gtod_sync_lock; + bool use_master_clock; + u64 master_kernel_ns; + cycle_t master_cycle_now; struct kvm_xen_hvm_config xen_hvm_config; @@ -612,6 +621,12 @@ struct kvm_vcpu_stat { struct x86_instruction_info; +struct msr_data { + bool host_initiated; + u32 index; + u64 data; +}; + struct kvm_x86_ops { int (*cpu_has_kvm_support)(void); /* __init */ int (*disabled_by_bios)(void); /* __init */ @@ -634,7 +649,7 @@ struct kvm_x86_ops { void (*update_db_bp_intercept)(struct kvm_vcpu *vcpu); int (*get_msr)(struct kvm_vcpu *vcpu, u32 msr_index, u64 *pdata); - int (*set_msr)(struct kvm_vcpu *vcpu, u32 msr_index, u64 data); + int (*set_msr)(struct kvm_vcpu *vcpu, struct msr_data *msr); u64 (*get_segment_base)(struct kvm_vcpu *vcpu, int seg); void (*get_segment)(struct kvm_vcpu *vcpu, struct kvm_segment *var, int 
seg); @@ -697,10 +712,11 @@ struct kvm_x86_ops { bool (*has_wbinvd_exit)(void); void (*set_tsc_khz)(struct kvm_vcpu *vcpu, u32 user_tsc_khz, bool scale); + u64 (*read_tsc_offset)(struct kvm_vcpu *vcpu); void (*write_tsc_offset)(struct kvm_vcpu *vcpu, u64 offset); u64 (*compute_tsc_offset)(struct kvm_vcpu *vcpu, u64 target_tsc); - u64 (*read_l1_tsc)(struct kvm_vcpu *vcpu); + u64 (*read_l1_tsc)(struct kvm_vcpu *vcpu, u64 host_tsc); void (*get_exit_info)(struct kvm_vcpu *vcpu, u64 *info1, u64 *info2); @@ -785,7 +801,7 @@ static inline int emulate_instruction(struct kvm_vcpu *vcpu, void kvm_enable_efer_bits(u64); int kvm_get_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 *data); -int kvm_set_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 data); +int kvm_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr); struct x86_emulate_ctxt; @@ -812,7 +828,7 @@ void kvm_get_cs_db_l_bits(struct kvm_vcpu *vcpu, int *db, int *l); int kvm_set_xcr(struct kvm_vcpu *vcpu, u32 index, u64 xcr); int kvm_get_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata); -int kvm_set_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 data); +int kvm_set_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr); unsigned long kvm_get_rflags(struct kvm_vcpu *vcpu); void kvm_set_rflags(struct kvm_vcpu *vcpu, unsigned long rflags); diff --git a/arch/x86/include/asm/local.h b/arch/x86/include/asm/local.h index c8bed0da434a..2d89e3980cbd 100644 --- a/arch/x86/include/asm/local.h +++ b/arch/x86/include/asm/local.h @@ -124,27 +124,11 @@ static inline int local_add_negative(long i, local_t *l) */ static inline long local_add_return(long i, local_t *l) { - long __i; -#ifdef CONFIG_M386 - unsigned long flags; - if (unlikely(boot_cpu_data.x86 <= 3)) - goto no_xadd; -#endif - /* Modern 486+ processor */ - __i = i; + long __i = i; asm volatile(_ASM_XADD "%0, %1;" : "+r" (i), "+m" (l->a.counter) : : "memory"); return i + __i; - -#ifdef CONFIG_M386 -no_xadd: /* Legacy 386 processor */ - local_irq_save(flags); - __i = local_read(l); - local_set(l, i + __i); - local_irq_restore(flags); - return i + __i; -#endif } static inline long local_sub_return(long i, local_t *l) diff --git a/arch/x86/include/asm/mman.h b/arch/x86/include/asm/mman.h index 593e51d4643f..513b05f15bb4 100644 --- a/arch/x86/include/asm/mman.h +++ b/arch/x86/include/asm/mman.h @@ -3,6 +3,9 @@ #define MAP_32BIT 0x40 /* only give out 32bit addresses */ +#define MAP_HUGE_2MB (21 << MAP_HUGE_SHIFT) +#define MAP_HUGE_1GB (30 << MAP_HUGE_SHIFT) + #include <asm-generic/mman.h> #endif /* _ASM_X86_MMAN_H */ diff --git a/arch/x86/include/asm/module.h b/arch/x86/include/asm/module.h index 9eae7752ae9b..e3b7819caeef 100644 --- a/arch/x86/include/asm/module.h +++ b/arch/x86/include/asm/module.h @@ -5,8 +5,6 @@ #ifdef CONFIG_X86_64 /* X86_64 does not define MODULE_PROC_FAMILY */ -#elif defined CONFIG_M386 -#define MODULE_PROC_FAMILY "386 " #elif defined CONFIG_M486 #define MODULE_PROC_FAMILY "486 " #elif defined CONFIG_M586 diff --git a/arch/x86/include/asm/msr-index.h b/arch/x86/include/asm/msr-index.h index 7f0edceb7563..6e930b218724 100644 --- a/arch/x86/include/asm/msr-index.h +++ b/arch/x86/include/asm/msr-index.h @@ -236,6 +236,7 @@ #define MSR_IA32_EBL_CR_POWERON 0x0000002a #define MSR_EBC_FREQUENCY_ID 0x0000002c #define MSR_IA32_FEATURE_CONTROL 0x0000003a +#define MSR_IA32_TSC_ADJUST 0x0000003b #define FEATURE_CONTROL_LOCKED (1<<0) #define FEATURE_CONTROL_VMXON_ENABLED_INSIDE_SMX (1<<1) @@ -337,6 +338,8 @@ #define MSR_IA32_MISC_ENABLE_TURBO_DISABLE (1ULL << 38) #define 
MSR_IA32_MISC_ENABLE_IP_PREF_DISABLE (1ULL << 39) +#define MSR_IA32_TSC_DEADLINE 0x000006E0 + /* P4/Xeon+ specific */ #define MSR_IA32_MCG_EAX 0x00000180 #define MSR_IA32_MCG_EBX 0x00000181 diff --git a/arch/x86/include/asm/numachip/numachip.h b/arch/x86/include/asm/numachip/numachip.h new file mode 100644 index 000000000000..1c6f7f6212c1 --- /dev/null +++ b/arch/x86/include/asm/numachip/numachip.h @@ -0,0 +1,19 @@ +/* + * This file is subject to the terms and conditions of the GNU General Public + * License. See the file "COPYING" in the main directory of this archive + * for more details. + * + * Numascale NumaConnect-specific header file + * + * Copyright (C) 2012 Numascale AS. All rights reserved. + * + * Send feedback to <support@numascale.com> + * + */ + +#ifndef _ASM_X86_NUMACHIP_NUMACHIP_H +#define _ASM_X86_NUMACHIP_NUMACHIP_H + +extern int __init pci_numachip_init(void); + +#endif /* _ASM_X86_NUMACHIP_NUMACHIP_H */ diff --git a/arch/x86/include/asm/pci.h b/arch/x86/include/asm/pci.h index 6e41b9343928..dba7805176bf 100644 --- a/arch/x86/include/asm/pci.h +++ b/arch/x86/include/asm/pci.h @@ -171,4 +171,16 @@ cpumask_of_pcibus(const struct pci_bus *bus) } #endif +struct pci_setup_rom { + struct setup_data data; + uint16_t vendor; + uint16_t devid; + uint64_t pcilen; + unsigned long segment; + unsigned long bus; + unsigned long device; + unsigned long function; + uint8_t romdata[0]; +}; + #endif /* _ASM_X86_PCI_H */ diff --git a/arch/x86/include/asm/percpu.h b/arch/x86/include/asm/percpu.h index 1104afaba52b..0da5200ee79d 100644 --- a/arch/x86/include/asm/percpu.h +++ b/arch/x86/include/asm/percpu.h @@ -406,7 +406,6 @@ do { \ #define this_cpu_xchg_2(pcp, nval) percpu_xchg_op(pcp, nval) #define this_cpu_xchg_4(pcp, nval) percpu_xchg_op(pcp, nval) -#ifndef CONFIG_M386 #define __this_cpu_add_return_1(pcp, val) percpu_add_return_op(pcp, val) #define __this_cpu_add_return_2(pcp, val) percpu_add_return_op(pcp, val) #define __this_cpu_add_return_4(pcp, val) percpu_add_return_op(pcp, val) @@ -421,8 +420,6 @@ do { \ #define this_cpu_cmpxchg_2(pcp, oval, nval) percpu_cmpxchg_op(pcp, oval, nval) #define this_cpu_cmpxchg_4(pcp, oval, nval) percpu_cmpxchg_op(pcp, oval, nval) -#endif /* !CONFIG_M386 */ - #ifdef CONFIG_X86_CMPXCHG64 #define percpu_cmpxchg8b_double(pcp1, pcp2, o1, o2, n1, n2) \ ({ \ diff --git a/arch/x86/include/asm/processor.h b/arch/x86/include/asm/processor.h index ad1fc8511674..888184b2fc85 100644 --- a/arch/x86/include/asm/processor.h +++ b/arch/x86/include/asm/processor.h @@ -178,8 +178,6 @@ static inline int hlt_works(int cpu) extern void cpu_detect(struct cpuinfo_x86 *c); -extern struct pt_regs *idle_regs(struct pt_regs *); - extern void early_cpu_init(void); extern void identify_boot_cpu(void); extern void identify_secondary_cpu(struct cpuinfo_x86 *); @@ -187,7 +185,7 @@ extern void print_cpu_info(struct cpuinfo_x86 *); void print_cpu_msr(struct cpuinfo_x86 *); extern void init_scattered_cpuid_features(struct cpuinfo_x86 *c); extern unsigned int init_intel_cacheinfo(struct cpuinfo_x86 *c); -extern unsigned short num_cache_leaves; +extern void init_amd_cacheinfo(struct cpuinfo_x86 *c); extern void detect_extended_topology(struct cpuinfo_x86 *c); extern void detect_ht(struct cpuinfo_x86 *c); @@ -672,18 +670,29 @@ static inline void sync_core(void) { int tmp; -#if defined(CONFIG_M386) || defined(CONFIG_M486) - if (boot_cpu_data.x86 < 5) - /* There is no speculative execution. - * jmp is a barrier to prefetching. 
*/ - asm volatile("jmp 1f\n1:\n" ::: "memory"); - else +#ifdef CONFIG_M486 + /* + * Do a CPUID if available, otherwise do a jump. The jump + * can conveniently enough be the jump around CPUID. + */ + asm volatile("cmpl %2,%1\n\t" + "jl 1f\n\t" + "cpuid\n" + "1:" + : "=a" (tmp) + : "rm" (boot_cpu_data.cpuid_level), "ri" (0), "0" (1) + : "ebx", "ecx", "edx", "memory"); +#else + /* + * CPUID is a barrier to speculative execution. + * Prefetched instructions are automatically + * invalidated when modified. + */ + asm volatile("cpuid" + : "=a" (tmp) + : "0" (1) + : "ebx", "ecx", "edx", "memory"); #endif - /* cpuid is a barrier to speculative execution. - * Prefetched instructions are automatically - * invalidated when modified. */ - asm volatile("cpuid" : "=a" (tmp) : "0" (1) - : "ebx", "ecx", "edx", "memory"); } static inline void __monitor(const void *eax, unsigned long ecx, diff --git a/arch/x86/include/asm/ptrace.h b/arch/x86/include/asm/ptrace.h index dcfde52979c3..54d80fddb739 100644 --- a/arch/x86/include/asm/ptrace.h +++ b/arch/x86/include/asm/ptrace.h @@ -205,21 +205,14 @@ static inline bool user_64bit_mode(struct pt_regs *regs) } #endif -/* - * X86_32 CPUs don't save ss and esp if the CPU is already in kernel mode - * when it traps. The previous stack will be directly underneath the saved - * registers, and 'sp/ss' won't even have been saved. Thus the '®s->sp'. - * - * This is valid only for kernel mode traps. - */ -static inline unsigned long kernel_stack_pointer(struct pt_regs *regs) -{ #ifdef CONFIG_X86_32 - return (unsigned long)(®s->sp); +extern unsigned long kernel_stack_pointer(struct pt_regs *regs); #else +static inline unsigned long kernel_stack_pointer(struct pt_regs *regs) +{ return regs->sp; -#endif } +#endif #define GET_IP(regs) ((regs)->ip) #define GET_FP(regs) ((regs)->bp) @@ -246,6 +239,15 @@ static inline unsigned long regs_get_register(struct pt_regs *regs, { if (unlikely(offset > MAX_REG_OFFSET)) return 0; +#ifdef CONFIG_X86_32 + /* + * Traps from the kernel do not save sp and ss. + * Use the helper function to retrieve sp. + */ + if (offset == offsetof(struct pt_regs, sp) && + regs->cs == __KERNEL_CS) + return kernel_stack_pointer(regs); +#endif return *(unsigned long *)((unsigned long)regs + offset); } diff --git a/arch/x86/include/asm/pvclock.h b/arch/x86/include/asm/pvclock.h index c59cc97fe6c1..109a9dd5d454 100644 --- a/arch/x86/include/asm/pvclock.h +++ b/arch/x86/include/asm/pvclock.h @@ -6,6 +6,7 @@ /* some helper functions for xen and kvm pv clock sources */ cycle_t pvclock_clocksource_read(struct pvclock_vcpu_time_info *src); +u8 pvclock_read_flags(struct pvclock_vcpu_time_info *src); void pvclock_set_flags(u8 flags); unsigned long pvclock_tsc_khz(struct pvclock_vcpu_time_info *src); void pvclock_read_wallclock(struct pvclock_wall_clock *wall, @@ -56,4 +57,50 @@ static inline u64 pvclock_scale_delta(u64 delta, u32 mul_frac, int shift) return product; } +static __always_inline +u64 pvclock_get_nsec_offset(const struct pvclock_vcpu_time_info *src) +{ + u64 delta = __native_read_tsc() - src->tsc_timestamp; + return pvclock_scale_delta(delta, src->tsc_to_system_mul, + src->tsc_shift); +} + +static __always_inline +unsigned __pvclock_read_cycles(const struct pvclock_vcpu_time_info *src, + cycle_t *cycles, u8 *flags) +{ + unsigned version; + cycle_t ret, offset; + u8 ret_flags; + + version = src->version; + /* Note: emulated platforms which do not advertise SSE2 support + * result in kvmclock not using the necessary RDTSC barriers. 
+ * Without barriers, it is possible that RDTSC instruction reads from + * the time stamp counter outside rdtsc_barrier protected section + * below, resulting in violation of monotonicity. + */ + rdtsc_barrier(); + offset = pvclock_get_nsec_offset(src); + ret = src->system_time + offset; + ret_flags = src->flags; + rdtsc_barrier(); + + *cycles = ret; + *flags = ret_flags; + return version; +} + +struct pvclock_vsyscall_time_info { + struct pvclock_vcpu_time_info pvti; + u32 migrate_count; +} __attribute__((__aligned__(SMP_CACHE_BYTES))); + +#define PVTI_SIZE sizeof(struct pvclock_vsyscall_time_info) +#define PVCLOCK_VSYSCALL_NR_PAGES (((NR_CPUS-1)/(PAGE_SIZE/PVTI_SIZE))+1) + +int __init pvclock_init_vsyscall(struct pvclock_vsyscall_time_info *i, + int size); +struct pvclock_vcpu_time_info *pvclock_get_vsyscall_time_info(int cpu); + #endif /* _ASM_X86_PVCLOCK_H */ diff --git a/arch/x86/include/asm/signal.h b/arch/x86/include/asm/signal.h index 323973f4abf1..0dba8b7a6ac7 100644 --- a/arch/x86/include/asm/signal.h +++ b/arch/x86/include/asm/signal.h @@ -260,8 +260,6 @@ struct pt_regs; #endif /* !__i386__ */ -#define ptrace_signal_deliver(regs, cookie) do { } while (0) - #endif /* __KERNEL__ */ #endif /* __ASSEMBLY__ */ diff --git a/arch/x86/include/asm/smp.h b/arch/x86/include/asm/smp.h index 4f19a1526037..b073aaea747c 100644 --- a/arch/x86/include/asm/smp.h +++ b/arch/x86/include/asm/smp.h @@ -166,6 +166,7 @@ void native_send_call_func_ipi(const struct cpumask *mask); void native_send_call_func_single_ipi(int cpu); void x86_idle_thread_init(unsigned int cpu, struct task_struct *idle); +void smp_store_boot_cpu_info(void); void smp_store_cpu_info(int id); #define cpu_physical_id(cpu) per_cpu(x86_cpu_to_apicid, cpu) diff --git a/arch/x86/include/asm/swab.h b/arch/x86/include/asm/swab.h index 557cd9f00661..7f235c7105c1 100644 --- a/arch/x86/include/asm/swab.h +++ b/arch/x86/include/asm/swab.h @@ -6,22 +6,7 @@ static inline __attribute_const__ __u32 __arch_swab32(__u32 val) { -#ifdef __i386__ -# ifdef CONFIG_X86_BSWAP - asm("bswap %0" : "=r" (val) : "0" (val)); -# else - asm("xchgb %b0,%h0\n\t" /* swap lower bytes */ - "rorl $16,%0\n\t" /* swap words */ - "xchgb %b0,%h0" /* swap higher bytes */ - : "=q" (val) - : "0" (val)); -# endif - -#else /* __i386__ */ - asm("bswapl %0" - : "=r" (val) - : "0" (val)); -#endif + asm("bswapl %0" : "=r" (val) : "0" (val)); return val; } #define __arch_swab32 __arch_swab32 @@ -37,22 +22,12 @@ static inline __attribute_const__ __u64 __arch_swab64(__u64 val) __u64 u; } v; v.u = val; -# ifdef CONFIG_X86_BSWAP asm("bswapl %0 ; bswapl %1 ; xchgl %0,%1" : "=r" (v.s.a), "=r" (v.s.b) : "0" (v.s.a), "1" (v.s.b)); -# else - v.s.a = __arch_swab32(v.s.a); - v.s.b = __arch_swab32(v.s.b); - asm("xchgl %0,%1" - : "=r" (v.s.a), "=r" (v.s.b) - : "0" (v.s.a), "1" (v.s.b)); -# endif return v.u; #else /* __i386__ */ - asm("bswapq %0" - : "=r" (val) - : "0" (val)); + asm("bswapq %0" : "=r" (val) : "0" (val)); return val; #endif } diff --git a/arch/x86/include/asm/sys_ia32.h b/arch/x86/include/asm/sys_ia32.h index a9a8cf3da49d..c76fae4d90be 100644 --- a/arch/x86/include/asm/sys_ia32.h +++ b/arch/x86/include/asm/sys_ia32.h @@ -54,8 +54,6 @@ asmlinkage long sys32_pwrite(unsigned int, const char __user *, u32, u32, u32); asmlinkage long sys32_personality(unsigned long); asmlinkage long sys32_sendfile(int, int, compat_off_t __user *, s32); -asmlinkage long sys32_clone(unsigned int, unsigned int, struct pt_regs *); - long sys32_lseek(unsigned int, int, unsigned int); long sys32_kill(int, 
int); long sys32_fadvise64_64(int, __u32, __u32, __u32, __u32, int); diff --git a/arch/x86/include/asm/syscalls.h b/arch/x86/include/asm/syscalls.h index 2be0b880417e..2f8374718aa3 100644 --- a/arch/x86/include/asm/syscalls.h +++ b/arch/x86/include/asm/syscalls.h @@ -20,15 +20,6 @@ asmlinkage long sys_ioperm(unsigned long, unsigned long, int); long sys_iopl(unsigned int, struct pt_regs *); -/* kernel/process.c */ -int sys_fork(struct pt_regs *); -int sys_vfork(struct pt_regs *); -long sys_execve(const char __user *, - const char __user *const __user *, - const char __user *const __user *); -long sys_clone(unsigned long, unsigned long, void __user *, - void __user *, struct pt_regs *); - /* kernel/ldt.c */ asmlinkage int sys_modify_ldt(int, void __user *, unsigned long); diff --git a/arch/x86/include/asm/tlbflush.h b/arch/x86/include/asm/tlbflush.h index 74a44333545a..0fee48e279cc 100644 --- a/arch/x86/include/asm/tlbflush.h +++ b/arch/x86/include/asm/tlbflush.h @@ -56,10 +56,7 @@ static inline void __flush_tlb_all(void) static inline void __flush_tlb_one(unsigned long addr) { - if (cpu_has_invlpg) __flush_tlb_single(addr); - else - __flush_tlb(); } #define TLB_FLUSH_ALL -1UL diff --git a/arch/x86/include/asm/trace_clock.h b/arch/x86/include/asm/trace_clock.h new file mode 100644 index 000000000000..beab86cc282d --- /dev/null +++ b/arch/x86/include/asm/trace_clock.h @@ -0,0 +1,20 @@ +#ifndef _ASM_X86_TRACE_CLOCK_H +#define _ASM_X86_TRACE_CLOCK_H + +#include <linux/compiler.h> +#include <linux/types.h> + +#ifdef CONFIG_X86_TSC + +extern u64 notrace trace_clock_x86_tsc(void); + +# define ARCH_TRACE_CLOCKS \ + { trace_clock_x86_tsc, "x86-tsc", .in_ns = 0 }, + +#else /* !CONFIG_X86_TSC */ + +#define ARCH_TRACE_CLOCKS + +#endif + +#endif /* _ASM_X86_TRACE_CLOCK_H */ diff --git a/arch/x86/include/asm/uaccess.h b/arch/x86/include/asm/uaccess.h index 7ccf8d131535..1709801d18ec 100644 --- a/arch/x86/include/asm/uaccess.h +++ b/arch/x86/include/asm/uaccess.h @@ -237,8 +237,6 @@ extern void __put_user_2(void); extern void __put_user_4(void); extern void __put_user_8(void); -#ifdef CONFIG_X86_WP_WORKS_OK - /** * put_user: - Write a simple value into user space. * @x: Value to copy to user space. 
@@ -326,29 +324,6 @@ do { \ } \ } while (0) -#else - -#define __put_user_size(x, ptr, size, retval, errret) \ -do { \ - __typeof__(*(ptr))__pus_tmp = x; \ - retval = 0; \ - \ - if (unlikely(__copy_to_user_ll(ptr, &__pus_tmp, size) != 0)) \ - retval = errret; \ -} while (0) - -#define put_user(x, ptr) \ -({ \ - int __ret_pu; \ - __typeof__(*(ptr))__pus_tmp = x; \ - __ret_pu = 0; \ - if (unlikely(__copy_to_user_ll(ptr, &__pus_tmp, \ - sizeof(*(ptr))) != 0)) \ - __ret_pu = -EFAULT; \ - __ret_pu; \ -}) -#endif - #ifdef CONFIG_X86_32 #define __get_user_asm_u64(x, ptr, retval, errret) (x) = __get_user_bad() #define __get_user_asm_ex_u64(x, ptr) (x) = __get_user_bad() @@ -543,29 +518,12 @@ struct __large_struct { unsigned long buf[100]; }; (x) = (__force __typeof__(*(ptr)))__gue_val; \ } while (0) -#ifdef CONFIG_X86_WP_WORKS_OK - #define put_user_try uaccess_try #define put_user_catch(err) uaccess_catch(err) #define put_user_ex(x, ptr) \ __put_user_size_ex((__typeof__(*(ptr)))(x), (ptr), sizeof(*(ptr))) -#else /* !CONFIG_X86_WP_WORKS_OK */ - -#define put_user_try do { \ - int __uaccess_err = 0; - -#define put_user_catch(err) \ - (err) |= __uaccess_err; \ -} while (0) - -#define put_user_ex(x, ptr) do { \ - __uaccess_err |= __put_user(x, ptr); \ -} while (0) - -#endif /* CONFIG_X86_WP_WORKS_OK */ - extern unsigned long copy_from_user_nmi(void *to, const void __user *from, unsigned long n); extern __must_check long diff --git a/arch/x86/include/asm/unistd.h b/arch/x86/include/asm/unistd.h index 16f3fc6ebf2e..0e7dea7d3669 100644 --- a/arch/x86/include/asm/unistd.h +++ b/arch/x86/include/asm/unistd.h @@ -51,6 +51,9 @@ # define __ARCH_WANT_SYS_UTIME # define __ARCH_WANT_SYS_WAITPID # define __ARCH_WANT_SYS_EXECVE +# define __ARCH_WANT_SYS_FORK +# define __ARCH_WANT_SYS_VFORK +# define __ARCH_WANT_SYS_CLONE /* * "Conditional" syscalls diff --git a/arch/x86/include/asm/vmx.h b/arch/x86/include/asm/vmx.h index 36ec21c36d68..c2d56b34830d 100644 --- a/arch/x86/include/asm/vmx.h +++ b/arch/x86/include/asm/vmx.h @@ -445,8 +445,7 @@ enum vmcs_field { #define VMX_EPTP_WB_BIT (1ull << 14) #define VMX_EPT_2MB_PAGE_BIT (1ull << 16) #define VMX_EPT_1GB_PAGE_BIT (1ull << 17) -#define VMX_EPT_AD_BIT (1ull << 21) -#define VMX_EPT_EXTENT_INDIVIDUAL_BIT (1ull << 24) +#define VMX_EPT_AD_BIT (1ull << 21) #define VMX_EPT_EXTENT_CONTEXT_BIT (1ull << 25) #define VMX_EPT_EXTENT_GLOBAL_BIT (1ull << 26) diff --git a/arch/x86/include/asm/vsyscall.h b/arch/x86/include/asm/vsyscall.h index eaea1d31f753..80f80955cfd8 100644 --- a/arch/x86/include/asm/vsyscall.h +++ b/arch/x86/include/asm/vsyscall.h @@ -33,6 +33,26 @@ extern void map_vsyscall(void); */ extern bool emulate_vsyscall(struct pt_regs *regs, unsigned long address); +#ifdef CONFIG_X86_64 + +#define VGETCPU_CPU_MASK 0xfff + +static inline unsigned int __getcpu(void) +{ + unsigned int p; + + if (VVAR(vgetcpu_mode) == VGETCPU_RDTSCP) { + /* Load per CPU data from RDTSCP */ + native_read_tscp(&p); + } else { + /* Load per CPU data from GDT */ + asm("lsl %1,%0" : "=r" (p) : "r" (__PER_CPU_SEG)); + } + + return p; +} +#endif /* CONFIG_X86_64 */ + #endif /* __KERNEL__ */ #endif /* _ASM_X86_VSYSCALL_H */ diff --git a/arch/x86/include/asm/xen/hypercall.h b/arch/x86/include/asm/xen/hypercall.h index 59c226d120cd..c20d1ce62dc6 100644 --- a/arch/x86/include/asm/xen/hypercall.h +++ b/arch/x86/include/asm/xen/hypercall.h @@ -359,18 +359,14 @@ HYPERVISOR_update_va_mapping(unsigned long va, pte_t new_val, return _hypercall4(int, update_va_mapping, va, new_val.pte, new_val.pte >> 32, 
flags); } +extern int __must_check xen_event_channel_op_compat(int, void *); static inline int HYPERVISOR_event_channel_op(int cmd, void *arg) { int rc = _hypercall2(int, event_channel_op, cmd, arg); - if (unlikely(rc == -ENOSYS)) { - struct evtchn_op op; - op.cmd = cmd; - memcpy(&op.u, arg, sizeof(op.u)); - rc = _hypercall1(int, event_channel_op_compat, &op); - memcpy(arg, &op.u, sizeof(op.u)); - } + if (unlikely(rc == -ENOSYS)) + rc = xen_event_channel_op_compat(cmd, arg); return rc; } @@ -386,17 +382,14 @@ HYPERVISOR_console_io(int cmd, int count, char *str) return _hypercall3(int, console_io, cmd, count, str); } +extern int __must_check HYPERVISOR_physdev_op_compat(int, void *); + static inline int HYPERVISOR_physdev_op(int cmd, void *arg) { int rc = _hypercall2(int, physdev_op, cmd, arg); - if (unlikely(rc == -ENOSYS)) { - struct physdev_op op; - op.cmd = cmd; - memcpy(&op.u, arg, sizeof(op.u)); - rc = _hypercall1(int, physdev_op_compat, &op); - memcpy(arg, &op.u, sizeof(op.u)); - } + if (unlikely(rc == -ENOSYS)) + rc = HYPERVISOR_physdev_op_compat(cmd, arg); return rc; } diff --git a/arch/x86/include/asm/xen/hypervisor.h b/arch/x86/include/asm/xen/hypervisor.h index 66d0fff1ee84..125f344f06a9 100644 --- a/arch/x86/include/asm/xen/hypervisor.h +++ b/arch/x86/include/asm/xen/hypervisor.h @@ -33,7 +33,6 @@ #ifndef _ASM_X86_XEN_HYPERVISOR_H #define _ASM_X86_XEN_HYPERVISOR_H -/* arch/i386/kernel/setup.c */ extern struct shared_info *HYPERVISOR_shared_info; extern struct start_info *xen_start_info; diff --git a/arch/x86/include/asm/xen/interface.h b/arch/x86/include/asm/xen/interface.h index 54d52ff1304a..fd9cb7695b5f 100644 --- a/arch/x86/include/asm/xen/interface.h +++ b/arch/x86/include/asm/xen/interface.h @@ -63,6 +63,7 @@ DEFINE_GUEST_HANDLE(void); DEFINE_GUEST_HANDLE(uint64_t); DEFINE_GUEST_HANDLE(uint32_t); DEFINE_GUEST_HANDLE(xen_pfn_t); +DEFINE_GUEST_HANDLE(xen_ulong_t); #endif #ifndef HYPERVISOR_VIRT_START diff --git a/arch/x86/kernel/Makefile b/arch/x86/kernel/Makefile index 91ce48f05f9f..34e923a53762 100644 --- a/arch/x86/kernel/Makefile +++ b/arch/x86/kernel/Makefile @@ -9,7 +9,6 @@ CPPFLAGS_vmlinux.lds += -U$(UTS_MACHINE) ifdef CONFIG_FUNCTION_TRACER # Do not profile debug and lowlevel utilities CFLAGS_REMOVE_tsc.o = -pg -CFLAGS_REMOVE_rtc.o = -pg CFLAGS_REMOVE_paravirt-spinlocks.o = -pg CFLAGS_REMOVE_pvclock.o = -pg CFLAGS_REMOVE_kvmclock.o = -pg @@ -62,6 +61,7 @@ obj-$(CONFIG_X86_REBOOTFIXUPS) += reboot_fixups_32.o obj-$(CONFIG_DYNAMIC_FTRACE) += ftrace.o obj-$(CONFIG_FUNCTION_GRAPH_TRACER) += ftrace.o obj-$(CONFIG_FTRACE_SYSCALLS) += ftrace.o +obj-$(CONFIG_X86_TSC) += trace_clock.o obj-$(CONFIG_KEXEC) += machine_kexec_$(BITS).o obj-$(CONFIG_KEXEC) += relocate_kernel_$(BITS).o crash.o obj-$(CONFIG_CRASH_DUMP) += crash_dump_$(BITS).o diff --git a/arch/x86/kernel/acpi/boot.c b/arch/x86/kernel/acpi/boot.c index e651f7a589ac..e48cafcf92ae 100644 --- a/arch/x86/kernel/acpi/boot.c +++ b/arch/x86/kernel/acpi/boot.c @@ -574,6 +574,12 @@ int acpi_register_gsi(struct device *dev, u32 gsi, int trigger, int polarity) return irq; } +EXPORT_SYMBOL_GPL(acpi_register_gsi); + +void acpi_unregister_gsi(u32 gsi) +{ +} +EXPORT_SYMBOL_GPL(acpi_unregister_gsi); void __init acpi_set_irq_model_pic(void) { diff --git a/arch/x86/kernel/acpi/sleep.c b/arch/x86/kernel/acpi/sleep.c index 11676cf65aee..d5e0d717005a 100644 --- a/arch/x86/kernel/acpi/sleep.c +++ b/arch/x86/kernel/acpi/sleep.c @@ -101,6 +101,8 @@ static int __init acpi_sleep_setup(char *str) #endif if (strncmp(str, "nonvs", 5) == 0) 
acpi_nvs_nosave(); + if (strncmp(str, "nonvs_s3", 8) == 0) + acpi_nvs_nosave_s3(); if (strncmp(str, "old_ordering", 12) == 0) acpi_old_suspend_ordering(); str = strchr(str, ','); diff --git a/arch/x86/kernel/apic/apic.c b/arch/x86/kernel/apic/apic.c index b17416e72fbd..b994cc84aa7e 100644 --- a/arch/x86/kernel/apic/apic.c +++ b/arch/x86/kernel/apic/apic.c @@ -90,21 +90,6 @@ EXPORT_EARLY_PER_CPU_SYMBOL(x86_bios_cpu_apicid); */ DEFINE_EARLY_PER_CPU_READ_MOSTLY(int, x86_cpu_to_logical_apicid, BAD_APICID); -/* - * Knob to control our willingness to enable the local APIC. - * - * +1=force-enable - */ -static int force_enable_local_apic __initdata; -/* - * APIC command line parameters - */ -static int __init parse_lapic(char *arg) -{ - force_enable_local_apic = 1; - return 0; -} -early_param("lapic", parse_lapic); /* Local APIC was disabled by the BIOS and enabled by the kernel */ static int enabled_via_apicbase; @@ -133,6 +118,25 @@ static inline void imcr_apic_to_pic(void) } #endif +/* + * Knob to control our willingness to enable the local APIC. + * + * +1=force-enable + */ +static int force_enable_local_apic __initdata; +/* + * APIC command line parameters + */ +static int __init parse_lapic(char *arg) +{ + if (config_enabled(CONFIG_X86_32) && !arg) + force_enable_local_apic = 1; + else if (!strncmp(arg, "notscdeadline", 13)) + setup_clear_cpu_cap(X86_FEATURE_TSC_DEADLINE_TIMER); + return 0; +} +early_param("lapic", parse_lapic); + #ifdef CONFIG_X86_64 static int apic_calibrate_pmtmr __initdata; static __init int setup_apicpmtimer(char *s) @@ -315,6 +319,7 @@ int lapic_get_maxlvt(void) /* Clock divisor */ #define APIC_DIVISOR 16 +#define TSC_DIVISOR 32 /* * This function sets up the local APIC timer, with a timeout of @@ -333,6 +338,9 @@ static void __setup_APIC_LVTT(unsigned int clocks, int oneshot, int irqen) lvtt_value = LOCAL_TIMER_VECTOR; if (!oneshot) lvtt_value |= APIC_LVT_TIMER_PERIODIC; + else if (boot_cpu_has(X86_FEATURE_TSC_DEADLINE_TIMER)) + lvtt_value |= APIC_LVT_TIMER_TSCDEADLINE; + if (!lapic_is_integrated()) lvtt_value |= SET_APIC_TIMER_BASE(APIC_TIMER_BASE_DIV); @@ -341,6 +349,11 @@ static void __setup_APIC_LVTT(unsigned int clocks, int oneshot, int irqen) apic_write(APIC_LVTT, lvtt_value); + if (lvtt_value & APIC_LVT_TIMER_TSCDEADLINE) { + printk_once(KERN_DEBUG "TSC deadline timer enabled\n"); + return; + } + /* * Divide PICLK by 16 */ @@ -453,6 +466,16 @@ static int lapic_next_event(unsigned long delta, return 0; } +static int lapic_next_deadline(unsigned long delta, + struct clock_event_device *evt) +{ + u64 tsc; + + rdtscll(tsc); + wrmsrl(MSR_IA32_TSC_DEADLINE, tsc + (((u64) delta) * TSC_DIVISOR)); + return 0; +} + /* * Setup the lapic timer in periodic or oneshot mode */ @@ -533,7 +556,15 @@ static void __cpuinit setup_APIC_timer(void) memcpy(levt, &lapic_clockevent, sizeof(*levt)); levt->cpumask = cpumask_of(smp_processor_id()); - clockevents_register_device(levt); + if (this_cpu_has(X86_FEATURE_TSC_DEADLINE_TIMER)) { + levt->features &= ~(CLOCK_EVT_FEAT_PERIODIC | + CLOCK_EVT_FEAT_DUMMY); + levt->set_next_event = lapic_next_deadline; + clockevents_config_and_register(levt, + (tsc_khz / TSC_DIVISOR) * 1000, + 0xF, ~0UL); + } else + clockevents_register_device(levt); } /* @@ -661,7 +692,9 @@ static int __init calibrate_APIC_clock(void) * in the clockevent structure and return. 
*/ - if (lapic_timer_frequency) { + if (boot_cpu_has(X86_FEATURE_TSC_DEADLINE_TIMER)) { + return 0; + } else if (lapic_timer_frequency) { apic_printk(APIC_VERBOSE, "lapic timer already calibrated %d\n", lapic_timer_frequency); lapic_clockevent.mult = div_sc(lapic_timer_frequency/APIC_DIVISOR, @@ -674,6 +707,9 @@ static int __init calibrate_APIC_clock(void) return 0; } + apic_printk(APIC_VERBOSE, "Using local APIC timer interrupts.\n" + "calibrating APIC timer ...\n"); + local_irq_disable(); /* Replace the global interrupt handler */ @@ -811,9 +847,6 @@ void __init setup_boot_APIC_clock(void) return; } - apic_printk(APIC_VERBOSE, "Using local APIC timer interrupts.\n" - "calibrating APIC timer ...\n"); - if (calibrate_APIC_clock()) { /* No broadcast on UP ! */ if (num_possible_cpus() > 1) diff --git a/arch/x86/kernel/apic/apic_numachip.c b/arch/x86/kernel/apic/apic_numachip.c index a65829ac2b9a..9c2aa89a11cb 100644 --- a/arch/x86/kernel/apic/apic_numachip.c +++ b/arch/x86/kernel/apic/apic_numachip.c @@ -22,6 +22,7 @@ #include <linux/hardirq.h> #include <linux/delay.h> +#include <asm/numachip/numachip.h> #include <asm/numachip/numachip_csr.h> #include <asm/smp.h> #include <asm/apic.h> @@ -179,6 +180,7 @@ static int __init numachip_system_init(void) return 0; x86_cpuinit.fixup_cpu_id = fixup_cpu_id; + x86_init.pci.arch_init = pci_numachip_init; map_csrs(); diff --git a/arch/x86/kernel/apic/io_apic.c b/arch/x86/kernel/apic/io_apic.c index c265593ec2cd..b739d398bb29 100644 --- a/arch/x86/kernel/apic/io_apic.c +++ b/arch/x86/kernel/apic/io_apic.c @@ -234,11 +234,11 @@ int __init arch_early_irq_init(void) zalloc_cpumask_var_node(&cfg[i].old_domain, GFP_KERNEL, node); /* * For legacy IRQ's, start with assigning irq0 to irq15 to - * IRQ0_VECTOR to IRQ15_VECTOR on cpu 0. + * IRQ0_VECTOR to IRQ15_VECTOR for all cpu's. */ if (i < legacy_pic->nr_legacy_irqs) { cfg[i].vector = IRQ0_VECTOR + i; - cpumask_set_cpu(0, cfg[i].domain); + cpumask_setall(cfg[i].domain); } } @@ -1141,7 +1141,8 @@ __assign_irq_vector(int irq, struct irq_cfg *cfg, const struct cpumask *mask) * allocation for the members that are not used anymore. */ cpumask_andnot(cfg->old_domain, cfg->domain, tmp_mask); - cfg->move_in_progress = 1; + cfg->move_in_progress = + cpumask_intersects(cfg->old_domain, cpu_online_mask); cpumask_and(cfg->domain, cfg->domain, tmp_mask); break; } @@ -1172,8 +1173,9 @@ next: current_vector = vector; current_offset = offset; if (cfg->vector) { - cfg->move_in_progress = 1; cpumask_copy(cfg->old_domain, cfg->domain); + cfg->move_in_progress = + cpumask_intersects(cfg->old_domain, cpu_online_mask); } for_each_cpu_and(new_cpu, tmp_mask, cpu_online_mask) per_cpu(vector_irq, new_cpu)[vector] = irq; @@ -1241,12 +1243,6 @@ void __setup_vector_irq(int cpu) cfg = irq_get_chip_data(irq); if (!cfg) continue; - /* - * If it is a legacy IRQ handled by the legacy PIC, this cpu - * will be part of the irq_cfg's domain. - */ - if (irq < legacy_pic->nr_legacy_irqs && !IO_APIC_IRQ(irq)) - cpumask_set_cpu(cpu, cfg->domain); if (!cpumask_test_cpu(cpu, cfg->domain)) continue; @@ -1356,16 +1352,6 @@ static void setup_ioapic_irq(unsigned int irq, struct irq_cfg *cfg, if (!IO_APIC_IRQ(irq)) return; - /* - * For legacy irqs, cfg->domain starts with cpu 0. Now that IO-APIC - * can handle this irq and the apic driver is finialized at this point, - * update the cfg->domain. 
- */ - if (irq < legacy_pic->nr_legacy_irqs && - cpumask_equal(cfg->domain, cpumask_of(0))) - apic->vector_allocation_domain(0, cfg->domain, - apic->target_cpus()); - if (assign_irq_vector(irq, cfg, apic->target_cpus())) return; @@ -2199,9 +2185,11 @@ static int ioapic_retrigger_irq(struct irq_data *data) { struct irq_cfg *cfg = data->chip_data; unsigned long flags; + int cpu; raw_spin_lock_irqsave(&vector_lock, flags); - apic->send_IPI_mask(cpumask_of(cpumask_first(cfg->domain)), cfg->vector); + cpu = cpumask_first_and(cfg->domain, cpu_online_mask); + apic->send_IPI_mask(cpumask_of(cpu), cfg->vector); raw_spin_unlock_irqrestore(&vector_lock, flags); return 1; @@ -2257,6 +2245,9 @@ asmlinkage void smp_irq_move_cleanup_interrupt(void) continue; cfg = irq_cfg(irq); + if (!cfg) + continue; + raw_spin_lock(&desc->lock); /* @@ -3314,8 +3305,9 @@ int arch_setup_hpet_msi(unsigned int irq, unsigned int id) int ret; if (irq_remapping_enabled) { - if (!setup_hpet_msi_remapped(irq, id)) - return -1; + ret = setup_hpet_msi_remapped(irq, id); + if (ret) + return ret; } ret = msi_compose_msg(NULL, irq, &msg, id); diff --git a/arch/x86/kernel/cpu/amd.c b/arch/x86/kernel/cpu/amd.c index f7e98a2c0d12..15239fffd6fe 100644 --- a/arch/x86/kernel/cpu/amd.c +++ b/arch/x86/kernel/cpu/amd.c @@ -304,7 +304,7 @@ static void __cpuinit amd_get_topology(struct cpuinfo_x86 *c) int cpu = smp_processor_id(); /* get information required for multi-node processors */ - if (cpu_has(c, X86_FEATURE_TOPOEXT)) { + if (cpu_has_topoext) { u32 eax, ebx, ecx, edx; cpuid(0x8000001e, &eax, &ebx, &ecx, &edx); @@ -631,6 +631,20 @@ static void __cpuinit init_amd(struct cpuinfo_x86 *c) } } + /* + * The way access filter has a performance penalty on some workloads. + * Disable it on the affected CPUs. + */ + if ((c->x86 == 0x15) && + (c->x86_model >= 0x02) && (c->x86_model < 0x20)) { + u64 val; + + if (!rdmsrl_safe(0xc0011021, &val) && !(val & 0x1E)) { + val |= 0x1E; + wrmsrl_safe(0xc0011021, val); + } + } + cpu_detect_cache_sizes(c); /* Multi core CPU? */ @@ -643,12 +657,7 @@ static void __cpuinit init_amd(struct cpuinfo_x86 *c) detect_ht(c); #endif - if (c->extended_cpuid_level >= 0x80000006) { - if (cpuid_edx(0x80000006) & 0xf000) - num_cache_leaves = 4; - else - num_cache_leaves = 3; - } + init_amd_cacheinfo(c); if (c->x86 >= 0xf) set_cpu_cap(c, X86_FEATURE_K8); @@ -739,9 +748,6 @@ static unsigned int __cpuinit amd_size_cache(struct cpuinfo_x86 *c, static void __cpuinit cpu_set_tlb_flushall_shift(struct cpuinfo_x86 *c) { - if (!cpu_has_invlpg) - return; - tlb_flushall_shift = 5; if (c->x86 <= 0x11) diff --git a/arch/x86/kernel/cpu/bugs.c b/arch/x86/kernel/cpu/bugs.c index d0e910da16c5..92dfec986a48 100644 --- a/arch/x86/kernel/cpu/bugs.c +++ b/arch/x86/kernel/cpu/bugs.c @@ -107,53 +107,17 @@ static void __init check_hlt(void) } /* - * Most 386 processors have a bug where a POPAD can lock the - * machine even from user space. - */ - -static void __init check_popad(void) -{ -#ifndef CONFIG_X86_POPAD_OK - int res, inp = (int) &res; - - pr_info("Checking for popad bug... "); - __asm__ __volatile__( - "movl $12345678,%%eax; movl $0,%%edi; pusha; popa; movl (%%edx,%%edi),%%ecx " - : "=&a" (res) - : "d" (inp) - : "ecx", "edi"); - /* - * If this fails, it means that any user program may lock the - * CPU hard. Too bad. - */ - if (res != 12345678) - pr_cont("Buggy\n"); - else - pr_cont("OK\n"); -#endif -} - -/* * Check whether we are able to run this kernel safely on SMP. 
* - * - In order to run on a i386, we need to be compiled for i386 - * (for due to lack of "invlpg" and working WP on a i386) + * - i386 is no longer supported. * - In order to run on anything without a TSC, we need to be * compiled for a i486. */ static void __init check_config(void) { -/* - * We'd better not be a i386 if we're configured to use some - * i486+ only features! (WP works in supervisor mode and the - * new "invlpg" and "bswap" instructions) - */ -#if defined(CONFIG_X86_WP_WORKS_OK) || defined(CONFIG_X86_INVLPG) || \ - defined(CONFIG_X86_BSWAP) - if (boot_cpu_data.x86 == 3) + if (boot_cpu_data.x86 < 4) panic("Kernel requires i486+ for 'invlpg' and other features"); -#endif } @@ -166,7 +130,6 @@ void __init check_bugs(void) #endif check_config(); check_hlt(); - check_popad(); init_utsname()->machine[1] = '0' + (boot_cpu_data.x86 > 6 ? 6 : boot_cpu_data.x86); alternative_instructions(); diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c index 7505f7b13e71..9c3ab43a6954 100644 --- a/arch/x86/kernel/cpu/common.c +++ b/arch/x86/kernel/cpu/common.c @@ -1173,15 +1173,6 @@ DEFINE_PER_CPU(struct task_struct *, fpu_owner_task); DEFINE_PER_CPU_ALIGNED(struct stack_canary, stack_canary); #endif -/* Make sure %fs and %gs are initialized properly in idle threads */ -struct pt_regs * __cpuinit idle_regs(struct pt_regs *regs) -{ - memset(regs, 0, sizeof(struct pt_regs)); - regs->fs = __KERNEL_PERCPU; - regs->gs = __KERNEL_STACK_CANARY; - - return regs; -} #endif /* CONFIG_X86_64 */ /* @@ -1237,7 +1228,7 @@ void __cpuinit cpu_init(void) oist = &per_cpu(orig_ist, cpu); #ifdef CONFIG_NUMA - if (cpu != 0 && this_cpu_read(numa_node) == 0 && + if (this_cpu_read(numa_node) == 0 && early_cpu_to_node(cpu) != NUMA_NO_NODE) set_numa_node(early_cpu_to_node(cpu)); #endif @@ -1269,8 +1260,7 @@ void __cpuinit cpu_init(void) barrier(); x86_configure_nx(); - if (cpu != 0) - enable_x2apic(); + enable_x2apic(); /* * set up and load the per-CPU TSS diff --git a/arch/x86/kernel/cpu/intel.c b/arch/x86/kernel/cpu/intel.c index 198e019a531a..fcaabd0432c5 100644 --- a/arch/x86/kernel/cpu/intel.c +++ b/arch/x86/kernel/cpu/intel.c @@ -612,10 +612,6 @@ static void __cpuinit intel_tlb_lookup(const unsigned char desc) static void __cpuinit intel_tlb_flushall_shift_set(struct cpuinfo_x86 *c) { - if (!cpu_has_invlpg) { - tlb_flushall_shift = -1; - return; - } switch ((c->x86 << 8) + c->x86_model) { case 0x60f: /* original 65 nm celeron/pentium/core2/xeon, "Merom"/"Conroe" */ case 0x616: /* single-core 65 nm celeron/core2solo "Merom-L"/"Conroe-L" */ diff --git a/arch/x86/kernel/cpu/intel_cacheinfo.c b/arch/x86/kernel/cpu/intel_cacheinfo.c index 93c5451bdd52..fe9edec6698a 100644 --- a/arch/x86/kernel/cpu/intel_cacheinfo.c +++ b/arch/x86/kernel/cpu/intel_cacheinfo.c @@ -538,7 +538,11 @@ __cpuinit cpuid4_cache_lookup_regs(int index, unsigned edx; if (boot_cpu_data.x86_vendor == X86_VENDOR_AMD) { - amd_cpuid4(index, &eax, &ebx, &ecx); + if (cpu_has_topoext) + cpuid_count(0x8000001d, index, &eax.full, + &ebx.full, &ecx.full, &edx); + else + amd_cpuid4(index, &eax, &ebx, &ecx); amd_init_l3_cache(this_leaf, index); } else { cpuid_count(4, index, &eax.full, &ebx.full, &ecx.full, &edx); @@ -557,21 +561,39 @@ __cpuinit cpuid4_cache_lookup_regs(int index, return 0; } -static int __cpuinit find_num_cache_leaves(void) +static int __cpuinit find_num_cache_leaves(struct cpuinfo_x86 *c) { - unsigned int eax, ebx, ecx, edx; + unsigned int eax, ebx, ecx, edx, op; union _cpuid4_leaf_eax cache_eax; int i = -1; + if 
(c->x86_vendor == X86_VENDOR_AMD) + op = 0x8000001d; + else + op = 4; + do { ++i; - /* Do cpuid(4) loop to find out num_cache_leaves */ - cpuid_count(4, i, &eax, &ebx, &ecx, &edx); + /* Do cpuid(op) loop to find out num_cache_leaves */ + cpuid_count(op, i, &eax, &ebx, &ecx, &edx); cache_eax.full = eax; } while (cache_eax.split.type != CACHE_TYPE_NULL); return i; } +void __cpuinit init_amd_cacheinfo(struct cpuinfo_x86 *c) +{ + + if (cpu_has_topoext) { + num_cache_leaves = find_num_cache_leaves(c); + } else if (c->extended_cpuid_level >= 0x80000006) { + if (cpuid_edx(0x80000006) & 0xf000) + num_cache_leaves = 4; + else + num_cache_leaves = 3; + } +} + unsigned int __cpuinit init_intel_cacheinfo(struct cpuinfo_x86 *c) { /* Cache sizes */ @@ -588,7 +610,7 @@ unsigned int __cpuinit init_intel_cacheinfo(struct cpuinfo_x86 *c) if (is_initialized == 0) { /* Init num_cache_leaves from boot CPU */ - num_cache_leaves = find_num_cache_leaves(); + num_cache_leaves = find_num_cache_leaves(c); is_initialized++; } @@ -728,37 +750,50 @@ static DEFINE_PER_CPU(struct _cpuid4_info *, ici_cpuid4_info); static int __cpuinit cache_shared_amd_cpu_map_setup(unsigned int cpu, int index) { struct _cpuid4_info *this_leaf; - int ret, i, sibling; - struct cpuinfo_x86 *c = &cpu_data(cpu); + int i, sibling; - ret = 0; - if (index == 3) { - ret = 1; - for_each_cpu(i, cpu_llc_shared_mask(cpu)) { + if (cpu_has_topoext) { + unsigned int apicid, nshared, first, last; + + if (!per_cpu(ici_cpuid4_info, cpu)) + return 0; + + this_leaf = CPUID4_INFO_IDX(cpu, index); + nshared = this_leaf->base.eax.split.num_threads_sharing + 1; + apicid = cpu_data(cpu).apicid; + first = apicid - (apicid % nshared); + last = first + nshared - 1; + + for_each_online_cpu(i) { + apicid = cpu_data(i).apicid; + if ((apicid < first) || (apicid > last)) + continue; if (!per_cpu(ici_cpuid4_info, i)) continue; this_leaf = CPUID4_INFO_IDX(i, index); - for_each_cpu(sibling, cpu_llc_shared_mask(cpu)) { - if (!cpu_online(sibling)) + + for_each_online_cpu(sibling) { + apicid = cpu_data(sibling).apicid; + if ((apicid < first) || (apicid > last)) continue; set_bit(sibling, this_leaf->shared_cpu_map); } } - } else if ((c->x86 == 0x15) && ((index == 1) || (index == 2))) { - ret = 1; - for_each_cpu(i, cpu_sibling_mask(cpu)) { + } else if (index == 3) { + for_each_cpu(i, cpu_llc_shared_mask(cpu)) { if (!per_cpu(ici_cpuid4_info, i)) continue; this_leaf = CPUID4_INFO_IDX(i, index); - for_each_cpu(sibling, cpu_sibling_mask(cpu)) { + for_each_cpu(sibling, cpu_llc_shared_mask(cpu)) { if (!cpu_online(sibling)) continue; set_bit(sibling, this_leaf->shared_cpu_map); } } - } + } else + return 0; - return ret; + return 1; } static void __cpuinit cache_shared_cpu_map_setup(unsigned int cpu, int index) diff --git a/arch/x86/kernel/cpu/mcheck/mce_amd.c b/arch/x86/kernel/cpu/mcheck/mce_amd.c index 698b6ec12e0f..1ac581f38dfa 100644 --- a/arch/x86/kernel/cpu/mcheck/mce_amd.c +++ b/arch/x86/kernel/cpu/mcheck/mce_amd.c @@ -6,7 +6,7 @@ * * Written by Jacob Shin - AMD, Inc. 
* - * Support: borislav.petkov@amd.com + * Maintained by: Borislav Petkov <bp@alien8.de> * * April 2006 * - added support for AMD Family 0x10 processors diff --git a/arch/x86/kernel/cpu/mcheck/mce_intel.c b/arch/x86/kernel/cpu/mcheck/mce_intel.c index 79b2b6b6e613..402c454fbff0 100644 --- a/arch/x86/kernel/cpu/mcheck/mce_intel.c +++ b/arch/x86/kernel/cpu/mcheck/mce_intel.c @@ -285,34 +285,39 @@ void cmci_clear(void) raw_spin_unlock_irqrestore(&cmci_discover_lock, flags); } +static long cmci_rediscover_work_func(void *arg) +{ + int banks; + + /* Recheck banks in case CPUs don't all have the same */ + if (cmci_supported(&banks)) + cmci_discover(banks); + + return 0; +} + /* * After a CPU went down cycle through all the others and rediscover * Must run in process context. */ void cmci_rediscover(int dying) { - int banks; - int cpu; - cpumask_var_t old; + int cpu, banks; if (!cmci_supported(&banks)) return; - if (!alloc_cpumask_var(&old, GFP_KERNEL)) - return; - cpumask_copy(old, &current->cpus_allowed); for_each_online_cpu(cpu) { if (cpu == dying) continue; - if (set_cpus_allowed_ptr(current, cpumask_of(cpu))) + + if (cpu == smp_processor_id()) { + cmci_rediscover_work_func(NULL); continue; - /* Recheck banks in case CPUs don't all have the same */ - if (cmci_supported(&banks)) - cmci_discover(banks); - } + } - set_cpus_allowed_ptr(current, old); - free_cpumask_var(old); + work_on_cpu(cpu, cmci_rediscover_work_func, NULL); + } } /* diff --git a/arch/x86/kernel/cpu/mtrr/main.c b/arch/x86/kernel/cpu/mtrr/main.c index 6b96110bb0c3..726bf963c227 100644 --- a/arch/x86/kernel/cpu/mtrr/main.c +++ b/arch/x86/kernel/cpu/mtrr/main.c @@ -606,7 +606,7 @@ void __init mtrr_bp_init(void) /* * This is an AMD specific MSR, but we assume(hope?) that - * Intel will implement it to when they extend the address + * Intel will implement it too when they extend the address * bus of the Xeon. */ if (cpuid_eax(0x80000000) >= 0x80000008) { @@ -695,11 +695,16 @@ void mtrr_ap_init(void) } /** - * Save current fixed-range MTRR state of the BSP + * Save current fixed-range MTRR state of the first cpu in cpu_online_mask. */ void mtrr_save_state(void) { - smp_call_function_single(0, mtrr_save_fixed_ranges, NULL, 1); + int first_cpu; + + get_online_cpus(); + first_cpu = cpumask_first(cpu_online_mask); + smp_call_function_single(first_cpu, mtrr_save_fixed_ranges, NULL, 1); + put_online_cpus(); } void set_mtrr_aps_delayed_init(void) diff --git a/arch/x86/kernel/cpu/perf_event.c b/arch/x86/kernel/cpu/perf_event.c index 3373f84d1397..4428fd178bce 100644 --- a/arch/x86/kernel/cpu/perf_event.c +++ b/arch/x86/kernel/cpu/perf_event.c @@ -208,12 +208,14 @@ static bool check_hw_exists(void) } /* - * Now write a value and read it back to see if it matches, - * this is needed to detect certain hardware emulators (qemu/kvm) - * that don't trap on the MSR access and always return 0s. + * Read the current value, change it and read it back to see if it + * matches, this is needed to detect certain hardware emulators + * (qemu/kvm) that don't trap on the MSR access and always return 0s. 
*/ - val = 0xabcdUL; reg = x86_pmu_event_addr(0); + if (rdmsrl_safe(reg, &val)) + goto msr_fail; + val ^= 0xffffUL; ret = wrmsrl_safe(reg, val); ret |= rdmsrl_safe(reg, &val_new); if (ret || val != val_new) @@ -1314,6 +1316,121 @@ static struct attribute_group x86_pmu_format_group = { .attrs = NULL, }; +struct perf_pmu_events_attr { + struct device_attribute attr; + u64 id; +}; + +/* + * Remove all undefined events (x86_pmu.event_map(id) == 0) + * out of events_attr attributes. + */ +static void __init filter_events(struct attribute **attrs) +{ + int i, j; + + for (i = 0; attrs[i]; i++) { + if (x86_pmu.event_map(i)) + continue; + + for (j = i; attrs[j]; j++) + attrs[j] = attrs[j + 1]; + + /* Check the shifted attr. */ + i--; + } +} + +static ssize_t events_sysfs_show(struct device *dev, struct device_attribute *attr, + char *page) +{ + struct perf_pmu_events_attr *pmu_attr = \ + container_of(attr, struct perf_pmu_events_attr, attr); + + u64 config = x86_pmu.event_map(pmu_attr->id); + return x86_pmu.events_sysfs_show(page, config); +} + +#define EVENT_VAR(_id) event_attr_##_id +#define EVENT_PTR(_id) &event_attr_##_id.attr.attr + +#define EVENT_ATTR(_name, _id) \ +static struct perf_pmu_events_attr EVENT_VAR(_id) = { \ + .attr = __ATTR(_name, 0444, events_sysfs_show, NULL), \ + .id = PERF_COUNT_HW_##_id, \ +}; + +EVENT_ATTR(cpu-cycles, CPU_CYCLES ); +EVENT_ATTR(instructions, INSTRUCTIONS ); +EVENT_ATTR(cache-references, CACHE_REFERENCES ); +EVENT_ATTR(cache-misses, CACHE_MISSES ); +EVENT_ATTR(branch-instructions, BRANCH_INSTRUCTIONS ); +EVENT_ATTR(branch-misses, BRANCH_MISSES ); +EVENT_ATTR(bus-cycles, BUS_CYCLES ); +EVENT_ATTR(stalled-cycles-frontend, STALLED_CYCLES_FRONTEND ); +EVENT_ATTR(stalled-cycles-backend, STALLED_CYCLES_BACKEND ); +EVENT_ATTR(ref-cycles, REF_CPU_CYCLES ); + +static struct attribute *empty_attrs; + +static struct attribute *events_attr[] = { + EVENT_PTR(CPU_CYCLES), + EVENT_PTR(INSTRUCTIONS), + EVENT_PTR(CACHE_REFERENCES), + EVENT_PTR(CACHE_MISSES), + EVENT_PTR(BRANCH_INSTRUCTIONS), + EVENT_PTR(BRANCH_MISSES), + EVENT_PTR(BUS_CYCLES), + EVENT_PTR(STALLED_CYCLES_FRONTEND), + EVENT_PTR(STALLED_CYCLES_BACKEND), + EVENT_PTR(REF_CPU_CYCLES), + NULL, +}; + +static struct attribute_group x86_pmu_events_group = { + .name = "events", + .attrs = events_attr, +}; + +ssize_t x86_event_sysfs_show(char *page, u64 config, u64 event) +{ + u64 umask = (config & ARCH_PERFMON_EVENTSEL_UMASK) >> 8; + u64 cmask = (config & ARCH_PERFMON_EVENTSEL_CMASK) >> 24; + bool edge = (config & ARCH_PERFMON_EVENTSEL_EDGE); + bool pc = (config & ARCH_PERFMON_EVENTSEL_PIN_CONTROL); + bool any = (config & ARCH_PERFMON_EVENTSEL_ANY); + bool inv = (config & ARCH_PERFMON_EVENTSEL_INV); + ssize_t ret; + + /* + * We have whole page size to spend and just little data + * to write, so we can safely use sprintf. 
+ */ + ret = sprintf(page, "event=0x%02llx", event); + + if (umask) + ret += sprintf(page + ret, ",umask=0x%02llx", umask); + + if (edge) + ret += sprintf(page + ret, ",edge"); + + if (pc) + ret += sprintf(page + ret, ",pc"); + + if (any) + ret += sprintf(page + ret, ",any"); + + if (inv) + ret += sprintf(page + ret, ",inv"); + + if (cmask) + ret += sprintf(page + ret, ",cmask=0x%02llx", cmask); + + ret += sprintf(page + ret, "\n"); + + return ret; +} + static int __init init_hw_perf_events(void) { struct x86_pmu_quirk *quirk; @@ -1360,6 +1477,11 @@ static int __init init_hw_perf_events(void) x86_pmu.attr_rdpmc = 1; /* enable userspace RDPMC usage by default */ x86_pmu_format_group.attrs = x86_pmu.format_attrs; + if (!x86_pmu.events_sysfs_show) + x86_pmu_events_group.attrs = &empty_attrs; + else + filter_events(x86_pmu_events_group.attrs); + pr_info("... version: %d\n", x86_pmu.version); pr_info("... bit width: %d\n", x86_pmu.cntval_bits); pr_info("... generic registers: %d\n", x86_pmu.num_counters); @@ -1649,6 +1771,7 @@ static struct attribute_group x86_pmu_attr_group = { static const struct attribute_group *x86_pmu_attr_groups[] = { &x86_pmu_attr_group, &x86_pmu_format_group, + &x86_pmu_events_group, NULL, }; diff --git a/arch/x86/kernel/cpu/perf_event.h b/arch/x86/kernel/cpu/perf_event.h index 271d25700297..115c1ea97746 100644 --- a/arch/x86/kernel/cpu/perf_event.h +++ b/arch/x86/kernel/cpu/perf_event.h @@ -354,6 +354,8 @@ struct x86_pmu { int attr_rdpmc; struct attribute **format_attrs; + ssize_t (*events_sysfs_show)(char *page, u64 config); + /* * CPU Hotplug hooks */ @@ -536,6 +538,9 @@ static inline void set_linear_ip(struct pt_regs *regs, unsigned long ip) regs->ip = ip; } +ssize_t x86_event_sysfs_show(char *page, u64 config, u64 event); +ssize_t intel_event_sysfs_show(char *page, u64 config); + #ifdef CONFIG_CPU_SUP_AMD int amd_pmu_init(void); diff --git a/arch/x86/kernel/cpu/perf_event_amd.c b/arch/x86/kernel/cpu/perf_event_amd.c index 4528ae7b6ec4..c93bc4e813a0 100644 --- a/arch/x86/kernel/cpu/perf_event_amd.c +++ b/arch/x86/kernel/cpu/perf_event_amd.c @@ -568,6 +568,14 @@ amd_get_event_constraints_f15h(struct cpu_hw_events *cpuc, struct perf_event *ev } } +static ssize_t amd_event_sysfs_show(char *page, u64 config) +{ + u64 event = (config & ARCH_PERFMON_EVENTSEL_EVENT) | + (config & AMD64_EVENTSEL_EVENT) >> 24; + + return x86_event_sysfs_show(page, config, event); +} + static __initconst const struct x86_pmu amd_pmu = { .name = "AMD", .handle_irq = x86_pmu_handle_irq, @@ -591,6 +599,7 @@ static __initconst const struct x86_pmu amd_pmu = { .put_event_constraints = amd_put_event_constraints, .format_attrs = amd_format_attr, + .events_sysfs_show = amd_event_sysfs_show, .cpu_prepare = amd_pmu_cpu_prepare, .cpu_starting = amd_pmu_cpu_starting, diff --git a/arch/x86/kernel/cpu/perf_event_intel.c b/arch/x86/kernel/cpu/perf_event_intel.c index 324bb523d9d9..93b9e1181f83 100644 --- a/arch/x86/kernel/cpu/perf_event_intel.c +++ b/arch/x86/kernel/cpu/perf_event_intel.c @@ -1603,6 +1603,13 @@ static struct attribute *intel_arch_formats_attr[] = { NULL, }; +ssize_t intel_event_sysfs_show(char *page, u64 config) +{ + u64 event = (config & ARCH_PERFMON_EVENTSEL_EVENT); + + return x86_event_sysfs_show(page, config, event); +} + static __initconst const struct x86_pmu core_pmu = { .name = "core", .handle_irq = x86_pmu_handle_irq, @@ -1628,6 +1635,7 @@ static __initconst const struct x86_pmu core_pmu = { .event_constraints = intel_core_event_constraints, .guest_get_msrs = core_guest_get_msrs, 
.format_attrs = intel_arch_formats_attr, + .events_sysfs_show = intel_event_sysfs_show, }; struct intel_shared_regs *allocate_shared_regs(int cpu) @@ -1766,6 +1774,7 @@ static __initconst const struct x86_pmu intel_pmu = { .pebs_aliases = intel_pebs_aliases_core2, .format_attrs = intel_arch3_formats_attr, + .events_sysfs_show = intel_event_sysfs_show, .cpu_prepare = intel_pmu_cpu_prepare, .cpu_starting = intel_pmu_cpu_starting, diff --git a/arch/x86/kernel/cpu/perf_event_intel_uncore.c b/arch/x86/kernel/cpu/perf_event_intel_uncore.c index 5df8d32ba91e..3cf3d97cce3a 100644 --- a/arch/x86/kernel/cpu/perf_event_intel_uncore.c +++ b/arch/x86/kernel/cpu/perf_event_intel_uncore.c @@ -118,22 +118,24 @@ static void snbep_uncore_pci_disable_box(struct intel_uncore_box *box) { struct pci_dev *pdev = box->pci_dev; int box_ctl = uncore_pci_box_ctl(box); - u32 config; + u32 config = 0; - pci_read_config_dword(pdev, box_ctl, &config); - config |= SNBEP_PMON_BOX_CTL_FRZ; - pci_write_config_dword(pdev, box_ctl, config); + if (!pci_read_config_dword(pdev, box_ctl, &config)) { + config |= SNBEP_PMON_BOX_CTL_FRZ; + pci_write_config_dword(pdev, box_ctl, config); + } } static void snbep_uncore_pci_enable_box(struct intel_uncore_box *box) { struct pci_dev *pdev = box->pci_dev; int box_ctl = uncore_pci_box_ctl(box); - u32 config; + u32 config = 0; - pci_read_config_dword(pdev, box_ctl, &config); - config &= ~SNBEP_PMON_BOX_CTL_FRZ; - pci_write_config_dword(pdev, box_ctl, config); + if (!pci_read_config_dword(pdev, box_ctl, &config)) { + config &= ~SNBEP_PMON_BOX_CTL_FRZ; + pci_write_config_dword(pdev, box_ctl, config); + } } static void snbep_uncore_pci_enable_event(struct intel_uncore_box *box, struct perf_event *event) @@ -156,7 +158,7 @@ static u64 snbep_uncore_pci_read_counter(struct intel_uncore_box *box, struct pe { struct pci_dev *pdev = box->pci_dev; struct hw_perf_event *hwc = &event->hw; - u64 count; + u64 count = 0; pci_read_config_dword(pdev, hwc->event_base, (u32 *)&count); pci_read_config_dword(pdev, hwc->event_base + 4, (u32 *)&count + 1); @@ -603,11 +605,12 @@ static struct pci_driver snbep_uncore_pci_driver = { /* * build pci bus to socket mapping */ -static void snbep_pci2phy_map_init(void) +static int snbep_pci2phy_map_init(void) { struct pci_dev *ubox_dev = NULL; int i, bus, nodeid; - u32 config; + int err = 0; + u32 config = 0; while (1) { /* find the UBOX device */ @@ -618,10 +621,14 @@ static void snbep_pci2phy_map_init(void) break; bus = ubox_dev->bus->number; /* get the Node ID of the local register */ - pci_read_config_dword(ubox_dev, 0x40, &config); + err = pci_read_config_dword(ubox_dev, 0x40, &config); + if (err) + break; nodeid = config; /* get the Node ID mapping */ - pci_read_config_dword(ubox_dev, 0x54, &config); + err = pci_read_config_dword(ubox_dev, 0x54, &config); + if (err) + break; /* * every three bits in the Node ID mapping register maps * to a particular node. @@ -633,7 +640,11 @@ static void snbep_pci2phy_map_init(void) } } }; - return; + + if (ubox_dev) + pci_dev_put(ubox_dev); + + return err ? 
pcibios_err_to_errno(err) : 0; } /* end of Sandy Bridge-EP uncore support */ @@ -1547,7 +1558,6 @@ void nhmex_rbox_alter_er(struct intel_uncore_box *box, struct perf_event *event) { struct hw_perf_event *hwc = &event->hw; struct hw_perf_event_extra *reg1 = &hwc->extra_reg; - int port; /* adjust the main event selector and extra register index */ if (reg1->idx % 2) { @@ -1559,7 +1569,6 @@ void nhmex_rbox_alter_er(struct intel_uncore_box *box, struct perf_event *event) } /* adjust extra register config */ - port = reg1->idx / 6 + box->pmu->pmu_idx * 4; switch (reg1->idx % 6) { case 2: /* shift the 8~15 bits to the 0~7 bits */ @@ -2578,9 +2587,11 @@ static int __init uncore_pci_init(void) switch (boot_cpu_data.x86_model) { case 45: /* Sandy Bridge-EP */ + ret = snbep_pci2phy_map_init(); + if (ret) + return ret; pci_uncores = snbep_pci_uncores; uncore_pci_driver = &snbep_uncore_pci_driver; - snbep_pci2phy_map_init(); break; default: return 0; diff --git a/arch/x86/kernel/cpu/perf_event_knc.c b/arch/x86/kernel/cpu/perf_event_knc.c index 7c46bfdbc373..4b7731bf23a8 100644 --- a/arch/x86/kernel/cpu/perf_event_knc.c +++ b/arch/x86/kernel/cpu/perf_event_knc.c @@ -3,6 +3,8 @@ #include <linux/perf_event.h> #include <linux/types.h> +#include <asm/hardirq.h> + #include "perf_event.h" static const u64 knc_perfmon_event_map[] = @@ -173,30 +175,100 @@ static void knc_pmu_enable_all(int added) static inline void knc_pmu_disable_event(struct perf_event *event) { - struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events); struct hw_perf_event *hwc = &event->hw; u64 val; val = hwc->config; - if (cpuc->enabled) - val &= ~ARCH_PERFMON_EVENTSEL_ENABLE; + val &= ~ARCH_PERFMON_EVENTSEL_ENABLE; (void)wrmsrl_safe(hwc->config_base + hwc->idx, val); } static void knc_pmu_enable_event(struct perf_event *event) { - struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events); struct hw_perf_event *hwc = &event->hw; u64 val; val = hwc->config; - if (cpuc->enabled) - val |= ARCH_PERFMON_EVENTSEL_ENABLE; + val |= ARCH_PERFMON_EVENTSEL_ENABLE; (void)wrmsrl_safe(hwc->config_base + hwc->idx, val); } +static inline u64 knc_pmu_get_status(void) +{ + u64 status; + + rdmsrl(MSR_KNC_IA32_PERF_GLOBAL_STATUS, status); + + return status; +} + +static inline void knc_pmu_ack_status(u64 ack) +{ + wrmsrl(MSR_KNC_IA32_PERF_GLOBAL_OVF_CONTROL, ack); +} + +static int knc_pmu_handle_irq(struct pt_regs *regs) +{ + struct perf_sample_data data; + struct cpu_hw_events *cpuc; + int handled = 0; + int bit, loops; + u64 status; + + cpuc = &__get_cpu_var(cpu_hw_events); + + knc_pmu_disable_all(); + + status = knc_pmu_get_status(); + if (!status) { + knc_pmu_enable_all(0); + return handled; + } + + loops = 0; +again: + knc_pmu_ack_status(status); + if (++loops > 100) { + WARN_ONCE(1, "perf: irq loop stuck!\n"); + perf_event_print_debug(); + goto done; + } + + inc_irq_stat(apic_perf_irqs); + + for_each_set_bit(bit, (unsigned long *)&status, X86_PMC_IDX_MAX) { + struct perf_event *event = cpuc->events[bit]; + + handled++; + + if (!test_bit(bit, cpuc->active_mask)) + continue; + + if (!intel_pmu_save_and_restart(event)) + continue; + + perf_sample_data_init(&data, 0, event->hw.last_period); + + if (perf_event_overflow(event, &data, regs)) + x86_pmu_stop(event, 0); + } + + /* + * Repeat if there is more work to be done: + */ + status = knc_pmu_get_status(); + if (status) + goto again; + +done: + knc_pmu_enable_all(0); + + return handled; +} + + PMU_FORMAT_ATTR(event, "config:0-7" ); PMU_FORMAT_ATTR(umask, "config:8-15" ); PMU_FORMAT_ATTR(edge, 
"config:18" ); @@ -214,7 +286,7 @@ static struct attribute *intel_knc_formats_attr[] = { static __initconst struct x86_pmu knc_pmu = { .name = "knc", - .handle_irq = x86_pmu_handle_irq, + .handle_irq = knc_pmu_handle_irq, .disable_all = knc_pmu_disable_all, .enable_all = knc_pmu_enable_all, .enable = knc_pmu_enable_event, @@ -226,12 +298,11 @@ static __initconst struct x86_pmu knc_pmu = { .event_map = knc_pmu_event_map, .max_events = ARRAY_SIZE(knc_perfmon_event_map), .apic = 1, - .max_period = (1ULL << 31) - 1, + .max_period = (1ULL << 39) - 1, .version = 0, .num_counters = 2, - /* in theory 40 bits, early silicon is buggy though */ - .cntval_bits = 32, - .cntval_mask = (1ULL << 32) - 1, + .cntval_bits = 40, + .cntval_mask = (1ULL << 40) - 1, .get_event_constraints = x86_get_event_constraints, .event_constraints = knc_event_constraints, .format_attrs = intel_knc_formats_attr, diff --git a/arch/x86/kernel/cpu/perf_event_p6.c b/arch/x86/kernel/cpu/perf_event_p6.c index e4dd0f7a0453..f2af39f5dc3d 100644 --- a/arch/x86/kernel/cpu/perf_event_p6.c +++ b/arch/x86/kernel/cpu/perf_event_p6.c @@ -8,13 +8,106 @@ */ static const u64 p6_perfmon_event_map[] = { - [PERF_COUNT_HW_CPU_CYCLES] = 0x0079, - [PERF_COUNT_HW_INSTRUCTIONS] = 0x00c0, - [PERF_COUNT_HW_CACHE_REFERENCES] = 0x0f2e, - [PERF_COUNT_HW_CACHE_MISSES] = 0x012e, - [PERF_COUNT_HW_BRANCH_INSTRUCTIONS] = 0x00c4, - [PERF_COUNT_HW_BRANCH_MISSES] = 0x00c5, - [PERF_COUNT_HW_BUS_CYCLES] = 0x0062, + [PERF_COUNT_HW_CPU_CYCLES] = 0x0079, /* CPU_CLK_UNHALTED */ + [PERF_COUNT_HW_INSTRUCTIONS] = 0x00c0, /* INST_RETIRED */ + [PERF_COUNT_HW_CACHE_REFERENCES] = 0x0f2e, /* L2_RQSTS:M:E:S:I */ + [PERF_COUNT_HW_CACHE_MISSES] = 0x012e, /* L2_RQSTS:I */ + [PERF_COUNT_HW_BRANCH_INSTRUCTIONS] = 0x00c4, /* BR_INST_RETIRED */ + [PERF_COUNT_HW_BRANCH_MISSES] = 0x00c5, /* BR_MISS_PRED_RETIRED */ + [PERF_COUNT_HW_BUS_CYCLES] = 0x0062, /* BUS_DRDY_CLOCKS */ + [PERF_COUNT_HW_STALLED_CYCLES_FRONTEND] = 0x00a2, /* RESOURCE_STALLS */ + +}; + +static __initconst u64 p6_hw_cache_event_ids + [PERF_COUNT_HW_CACHE_MAX] + [PERF_COUNT_HW_CACHE_OP_MAX] + [PERF_COUNT_HW_CACHE_RESULT_MAX] = +{ + [ C(L1D) ] = { + [ C(OP_READ) ] = { + [ C(RESULT_ACCESS) ] = 0x0043, /* DATA_MEM_REFS */ + [ C(RESULT_MISS) ] = 0x0045, /* DCU_LINES_IN */ + }, + [ C(OP_WRITE) ] = { + [ C(RESULT_ACCESS) ] = 0, + [ C(RESULT_MISS) ] = 0x0f29, /* L2_LD:M:E:S:I */ + }, + [ C(OP_PREFETCH) ] = { + [ C(RESULT_ACCESS) ] = 0, + [ C(RESULT_MISS) ] = 0, + }, + }, + [ C(L1I ) ] = { + [ C(OP_READ) ] = { + [ C(RESULT_ACCESS) ] = 0x0080, /* IFU_IFETCH */ + [ C(RESULT_MISS) ] = 0x0f28, /* L2_IFETCH:M:E:S:I */ + }, + [ C(OP_WRITE) ] = { + [ C(RESULT_ACCESS) ] = -1, + [ C(RESULT_MISS) ] = -1, + }, + [ C(OP_PREFETCH) ] = { + [ C(RESULT_ACCESS) ] = 0, + [ C(RESULT_MISS) ] = 0, + }, + }, + [ C(LL ) ] = { + [ C(OP_READ) ] = { + [ C(RESULT_ACCESS) ] = 0, + [ C(RESULT_MISS) ] = 0, + }, + [ C(OP_WRITE) ] = { + [ C(RESULT_ACCESS) ] = 0, + [ C(RESULT_MISS) ] = 0x0025, /* L2_M_LINES_INM */ + }, + [ C(OP_PREFETCH) ] = { + [ C(RESULT_ACCESS) ] = 0, + [ C(RESULT_MISS) ] = 0, + }, + }, + [ C(DTLB) ] = { + [ C(OP_READ) ] = { + [ C(RESULT_ACCESS) ] = 0x0043, /* DATA_MEM_REFS */ + [ C(RESULT_MISS) ] = 0, + }, + [ C(OP_WRITE) ] = { + [ C(RESULT_ACCESS) ] = 0, + [ C(RESULT_MISS) ] = 0, + }, + [ C(OP_PREFETCH) ] = { + [ C(RESULT_ACCESS) ] = 0, + [ C(RESULT_MISS) ] = 0, + }, + }, + [ C(ITLB) ] = { + [ C(OP_READ) ] = { + [ C(RESULT_ACCESS) ] = 0x0080, /* IFU_IFETCH */ + [ C(RESULT_MISS) ] = 0x0085, /* ITLB_MISS */ + }, + [ C(OP_WRITE) ] = { + [ 
C(RESULT_ACCESS) ] = -1, + [ C(RESULT_MISS) ] = -1, + }, + [ C(OP_PREFETCH) ] = { + [ C(RESULT_ACCESS) ] = -1, + [ C(RESULT_MISS) ] = -1, + }, + }, + [ C(BPU ) ] = { + [ C(OP_READ) ] = { + [ C(RESULT_ACCESS) ] = 0x00c4, /* BR_INST_RETIRED */ + [ C(RESULT_MISS) ] = 0x00c5, /* BR_MISS_PRED_RETIRED */ + }, + [ C(OP_WRITE) ] = { + [ C(RESULT_ACCESS) ] = -1, + [ C(RESULT_MISS) ] = -1, + }, + [ C(OP_PREFETCH) ] = { + [ C(RESULT_ACCESS) ] = -1, + [ C(RESULT_MISS) ] = -1, + }, + }, }; static u64 p6_pmu_event_map(int hw_event) @@ -34,7 +127,7 @@ static struct event_constraint p6_event_constraints[] = { INTEL_EVENT_CONSTRAINT(0xc1, 0x1), /* FLOPS */ INTEL_EVENT_CONSTRAINT(0x10, 0x1), /* FP_COMP_OPS_EXE */ - INTEL_EVENT_CONSTRAINT(0x11, 0x1), /* FP_ASSIST */ + INTEL_EVENT_CONSTRAINT(0x11, 0x2), /* FP_ASSIST */ INTEL_EVENT_CONSTRAINT(0x12, 0x2), /* MUL */ INTEL_EVENT_CONSTRAINT(0x13, 0x2), /* DIV */ INTEL_EVENT_CONSTRAINT(0x14, 0x1), /* CYCLES_DIV_BUSY */ @@ -64,25 +157,25 @@ static void p6_pmu_enable_all(int added) static inline void p6_pmu_disable_event(struct perf_event *event) { - struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events); struct hw_perf_event *hwc = &event->hw; u64 val = P6_NOP_EVENT; - if (cpuc->enabled) - val |= ARCH_PERFMON_EVENTSEL_ENABLE; - (void)wrmsrl_safe(hwc->config_base, val); } static void p6_pmu_enable_event(struct perf_event *event) { - struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events); struct hw_perf_event *hwc = &event->hw; u64 val; val = hwc->config; - if (cpuc->enabled) - val |= ARCH_PERFMON_EVENTSEL_ENABLE; + + /* + * p6 only has a global event enable, set on PerfEvtSel0 + * We "disable" events by programming P6_NOP_EVENT + * and we rely on p6_pmu_enable_all() being called + * to actually enable the events. + */ (void)wrmsrl_safe(hwc->config_base, val); } @@ -134,6 +227,8 @@ static __initconst const struct x86_pmu p6_pmu = { .event_constraints = p6_event_constraints, .format_attrs = intel_p6_formats_attr, + .events_sysfs_show = intel_event_sysfs_show, + }; __init int p6_pmu_init(void) @@ -158,5 +253,9 @@ __init int p6_pmu_init(void) x86_pmu = p6_pmu; + memcpy(hw_cache_event_ids, p6_hw_cache_event_ids, + sizeof(hw_cache_event_ids)); + + return 0; } diff --git a/arch/x86/kernel/crash.c b/arch/x86/kernel/crash.c index 13ad89971d47..74467feb4dc5 100644 --- a/arch/x86/kernel/crash.c +++ b/arch/x86/kernel/crash.c @@ -16,6 +16,7 @@ #include <linux/delay.h> #include <linux/elf.h> #include <linux/elfcore.h> +#include <linux/module.h> #include <asm/processor.h> #include <asm/hardirq.h> @@ -30,6 +31,27 @@ int in_crash_kexec; +/* + * This is used to VMCLEAR all VMCSs loaded on the + * processor. And when loading kvm_intel module, the + * callback function pointer will be assigned. + * + * protected by rcu. + */ +crash_vmclear_fn __rcu *crash_vmclear_loaded_vmcss = NULL; +EXPORT_SYMBOL_GPL(crash_vmclear_loaded_vmcss); + +static inline void cpu_crash_vmclear_loaded_vmcss(void) +{ + crash_vmclear_fn *do_vmclear_operation = NULL; + + rcu_read_lock(); + do_vmclear_operation = rcu_dereference(crash_vmclear_loaded_vmcss); + if (do_vmclear_operation) + do_vmclear_operation(); + rcu_read_unlock(); +} + #if defined(CONFIG_SMP) && defined(CONFIG_X86_LOCAL_APIC) static void kdump_nmi_callback(int cpu, struct pt_regs *regs) @@ -46,6 +68,11 @@ static void kdump_nmi_callback(int cpu, struct pt_regs *regs) #endif crash_save_cpu(regs, cpu); + /* + * VMCLEAR VMCSs loaded on all cpus if needed. + */ + cpu_crash_vmclear_loaded_vmcss(); + /* Disable VMX or SVM if needed. 
* * We need to disable virtualization on all CPUs. @@ -88,6 +115,11 @@ void native_machine_crash_shutdown(struct pt_regs *regs) kdump_nmi_shootdown_cpus(); + /* + * VMCLEAR VMCSs loaded on this cpu if needed. + */ + cpu_crash_vmclear_loaded_vmcss(); + /* Booting kdump kernel with VMX or SVM enabled won't work, * because (among other limitations) we can't disable paging * with the virt flags. diff --git a/arch/x86/kernel/e820.c b/arch/x86/kernel/e820.c index ed858e9e9a74..df06ade26bef 100644 --- a/arch/x86/kernel/e820.c +++ b/arch/x86/kernel/e820.c @@ -1077,6 +1077,9 @@ void __init memblock_x86_fill(void) memblock_add(ei->addr, ei->size); } + /* throw away partial pages */ + memblock_trim_memory(PAGE_SIZE); + memblock_dump_all(); } diff --git a/arch/x86/kernel/entry_32.S b/arch/x86/kernel/entry_32.S index 88b725aa1d52..c763116c5359 100644 --- a/arch/x86/kernel/entry_32.S +++ b/arch/x86/kernel/entry_32.S @@ -739,30 +739,12 @@ ENTRY(ptregs_##name) ; \ ENDPROC(ptregs_##name) PTREGSCALL1(iopl) -PTREGSCALL0(fork) -PTREGSCALL0(vfork) PTREGSCALL2(sigaltstack) PTREGSCALL0(sigreturn) PTREGSCALL0(rt_sigreturn) PTREGSCALL2(vm86) PTREGSCALL1(vm86old) -/* Clone is an oddball. The 4th arg is in %edi */ -ENTRY(ptregs_clone) - CFI_STARTPROC - leal 4(%esp),%eax - pushl_cfi %eax - pushl_cfi PT_EDI(%eax) - movl PT_EDX(%eax),%ecx - movl PT_ECX(%eax),%edx - movl PT_EBX(%eax),%eax - call sys_clone - addl $8,%esp - CFI_ADJUST_CFA_OFFSET -8 - ret - CFI_ENDPROC -ENDPROC(ptregs_clone) - .macro FIXUP_ESPFIX_STACK /* * Switch back for ESPFIX stack to the normal zerobased stack diff --git a/arch/x86/kernel/entry_64.S b/arch/x86/kernel/entry_64.S index b51b2c7ee51f..70641aff0c25 100644 --- a/arch/x86/kernel/entry_64.S +++ b/arch/x86/kernel/entry_64.S @@ -56,7 +56,7 @@ #include <asm/ftrace.h> #include <asm/percpu.h> #include <asm/asm.h> -#include <asm/rcu.h> +#include <asm/context_tracking.h> #include <asm/smap.h> #include <linux/err.h> @@ -845,9 +845,25 @@ ENTRY(\label) END(\label) .endm - PTREGSCALL stub_clone, sys_clone, %r8 - PTREGSCALL stub_fork, sys_fork, %rdi - PTREGSCALL stub_vfork, sys_vfork, %rdi + .macro FORK_LIKE func +ENTRY(stub_\func) + CFI_STARTPROC + popq %r11 /* save return address */ + PARTIAL_FRAME 0 + SAVE_REST + pushq %r11 /* put it back on stack */ + FIXUP_TOP_OF_STACK %r11, 8 + DEFAULT_FRAME 0 8 /* offset 8: return address */ + call sys_\func + RESTORE_TOP_OF_STACK %r11, 8 + ret $REST_SKIP /* pop extended registers */ + CFI_ENDPROC +END(stub_\func) + .endm + + FORK_LIKE clone + FORK_LIKE fork + FORK_LIKE vfork PTREGSCALL stub_sigaltstack, sys_sigaltstack, %rdx PTREGSCALL stub_iopl, sys_iopl, %rsi @@ -995,8 +1011,8 @@ END(interrupt) */ .p2align CONFIG_X86_L1_CACHE_SHIFT common_interrupt: - ASM_CLAC XCPT_FRAME + ASM_CLAC addq $-0x80,(%rsp) /* Adjust vector to [-256,-1] range */ interrupt do_IRQ /* 0(%rsp): old_rsp-ARGOFFSET */ @@ -1135,8 +1151,8 @@ END(common_interrupt) */ .macro apicinterrupt num sym do_sym ENTRY(\sym) - ASM_CLAC INTR_FRAME + ASM_CLAC pushq_cfi $~(\num) .Lcommon_\sym: interrupt \do_sym @@ -1190,8 +1206,8 @@ apicinterrupt IRQ_WORK_VECTOR \ */ .macro zeroentry sym do_sym ENTRY(\sym) - ASM_CLAC INTR_FRAME + ASM_CLAC PARAVIRT_ADJUST_EXCEPTION_FRAME pushq_cfi $-1 /* ORIG_RAX: no syscall to restart */ subq $ORIG_RAX-R15, %rsp @@ -1208,8 +1224,8 @@ END(\sym) .macro paranoidzeroentry sym do_sym ENTRY(\sym) - ASM_CLAC INTR_FRAME + ASM_CLAC PARAVIRT_ADJUST_EXCEPTION_FRAME pushq_cfi $-1 /* ORIG_RAX: no syscall to restart */ subq $ORIG_RAX-R15, %rsp @@ -1227,8 +1243,8 @@ END(\sym) #define 
INIT_TSS_IST(x) PER_CPU_VAR(init_tss) + (TSS_ist + ((x) - 1) * 8) .macro paranoidzeroentry_ist sym do_sym ist ENTRY(\sym) - ASM_CLAC INTR_FRAME + ASM_CLAC PARAVIRT_ADJUST_EXCEPTION_FRAME pushq_cfi $-1 /* ORIG_RAX: no syscall to restart */ subq $ORIG_RAX-R15, %rsp @@ -1247,8 +1263,8 @@ END(\sym) .macro errorentry sym do_sym ENTRY(\sym) - ASM_CLAC XCPT_FRAME + ASM_CLAC PARAVIRT_ADJUST_EXCEPTION_FRAME subq $ORIG_RAX-R15, %rsp CFI_ADJUST_CFA_OFFSET ORIG_RAX-R15 @@ -1266,8 +1282,8 @@ END(\sym) /* error code is on the stack already */ .macro paranoiderrorentry sym do_sym ENTRY(\sym) - ASM_CLAC XCPT_FRAME + ASM_CLAC PARAVIRT_ADJUST_EXCEPTION_FRAME subq $ORIG_RAX-R15, %rsp CFI_ADJUST_CFA_OFFSET ORIG_RAX-R15 @@ -1699,9 +1715,10 @@ nested_nmi: 1: /* Set up the interrupted NMIs stack to jump to repeat_nmi */ - leaq -6*8(%rsp), %rdx + leaq -1*8(%rsp), %rdx movq %rdx, %rsp - CFI_ADJUST_CFA_OFFSET 6*8 + CFI_ADJUST_CFA_OFFSET 1*8 + leaq -10*8(%rsp), %rdx pushq_cfi $__KERNEL_DS pushq_cfi %rdx pushfq_cfi @@ -1709,8 +1726,8 @@ nested_nmi: pushq_cfi $repeat_nmi /* Put stack back */ - addq $(11*8), %rsp - CFI_ADJUST_CFA_OFFSET -11*8 + addq $(6*8), %rsp + CFI_ADJUST_CFA_OFFSET -6*8 nested_nmi_out: popq_cfi %rdx @@ -1736,18 +1753,18 @@ first_nmi: * +-------------------------+ * | NMI executing variable | * +-------------------------+ - * | Saved SS | - * | Saved Return RSP | - * | Saved RFLAGS | - * | Saved CS | - * | Saved RIP | - * +-------------------------+ * | copied SS | * | copied Return RSP | * | copied RFLAGS | * | copied CS | * | copied RIP | * +-------------------------+ + * | Saved SS | + * | Saved Return RSP | + * | Saved RFLAGS | + * | Saved CS | + * | Saved RIP | + * +-------------------------+ * | pt_regs | * +-------------------------+ * @@ -1763,9 +1780,14 @@ first_nmi: /* Set the NMI executing variable on the stack. */ pushq_cfi $1 + /* + * Leave room for the "copied" frame + */ + subq $(5*8), %rsp + /* Copy the stack frame to the Saved frame */ .rept 5 - pushq_cfi 6*8(%rsp) + pushq_cfi 11*8(%rsp) .endr CFI_DEF_CFA_OFFSET SS+8-RIP @@ -1786,12 +1808,15 @@ repeat_nmi: * is benign for the non-repeat case, where 1 was pushed just above * to this very stack slot). */ - movq $1, 5*8(%rsp) + movq $1, 10*8(%rsp) /* Make another copy, this one may be modified by nested NMIs */ + addq $(10*8), %rsp + CFI_ADJUST_CFA_OFFSET -10*8 .rept 5 - pushq_cfi 4*8(%rsp) + pushq_cfi -6*8(%rsp) .endr + subq $(5*8), %rsp CFI_DEF_CFA_OFFSET SS+8-RIP end_repeat_nmi: @@ -1842,8 +1867,12 @@ nmi_swapgs: SWAPGS_UNSAFE_STACK nmi_restore: RESTORE_ALL 8 + + /* Pop the extra iret frame */ + addq $(5*8), %rsp + /* Clear the NMI executing stack variable */ - movq $0, 10*8(%rsp) + movq $0, 5*8(%rsp) jmp irq_return CFI_ENDPROC END(nmi) diff --git a/arch/x86/kernel/head_32.S b/arch/x86/kernel/head_32.S index 957a47aec64e..8e7f6556028f 100644 --- a/arch/x86/kernel/head_32.S +++ b/arch/x86/kernel/head_32.S @@ -266,6 +266,19 @@ num_subarch_entries = (. - subarch_entries) / 4 jmp default_entry #endif /* CONFIG_PARAVIRT */ +#ifdef CONFIG_HOTPLUG_CPU +/* + * Boot CPU0 entry point. It's called from play_dead(). Everything has been set + * up already except stack. We just set up stack here. Then call + * start_secondary(). + */ +ENTRY(start_cpu0) + movl stack_start, %ecx + movl %ecx, %esp + jmp *(initial_code) +ENDPROC(start_cpu0) +#endif + /* * Non-boot CPU entry point; entered from trampoline.S * We can't lgdt here, because lgdt itself uses a data segment, but @@ -292,8 +305,8 @@ default_entry: * be using the global pages. * * NOTE! 
If we are on a 486 we may have no cr4 at all! - * Specifically, cr4 exists if and only if CPUID exists, - * which in turn exists if and only if EFLAGS.ID exists. + * Specifically, cr4 exists if and only if CPUID exists + * and has flags other than the FPU flag set. */ movl $X86_EFLAGS_ID,%ecx pushl %ecx @@ -308,6 +321,11 @@ default_entry: testl %ecx,%eax jz 6f # No ID flag = no CPUID = no CR4 + movl $1,%eax + cpuid + andl $~1,%edx # Ignore CPUID.FPU + jz 6f # No flags or only CPUID.FPU = no CR4 + movl pa(mmu_cr4_features),%eax movl %eax,%cr4 diff --git a/arch/x86/kernel/head_64.S b/arch/x86/kernel/head_64.S index 94bf9cc2c7ee..980053c4b9cc 100644 --- a/arch/x86/kernel/head_64.S +++ b/arch/x86/kernel/head_64.S @@ -252,6 +252,22 @@ ENTRY(secondary_startup_64) pushq %rax # target address in negative space lretq +#ifdef CONFIG_HOTPLUG_CPU +/* + * Boot CPU0 entry point. It's called from play_dead(). Everything has been set + * up already except stack. We just set up stack here. Then call + * start_secondary(). + */ +ENTRY(start_cpu0) + movq stack_start(%rip),%rsp + movq initial_code(%rip),%rax + pushq $0 # fake return address to stop unwinder + pushq $__KERNEL_CS # set correct cs + pushq %rax # target address in negative space + lretq +ENDPROC(start_cpu0) +#endif + /* SMP bootup changes these two */ __REFDATA .align 8 diff --git a/arch/x86/kernel/hpet.c b/arch/x86/kernel/hpet.c index 1460a5df92f7..e28670f9a589 100644 --- a/arch/x86/kernel/hpet.c +++ b/arch/x86/kernel/hpet.c @@ -434,7 +434,7 @@ void hpet_msi_unmask(struct irq_data *data) /* unmask it */ cfg = hpet_readl(HPET_Tn_CFG(hdev->num)); - cfg |= HPET_TN_FSB; + cfg |= HPET_TN_ENABLE | HPET_TN_FSB; hpet_writel(cfg, HPET_Tn_CFG(hdev->num)); } @@ -445,7 +445,7 @@ void hpet_msi_mask(struct irq_data *data) /* mask it */ cfg = hpet_readl(HPET_Tn_CFG(hdev->num)); - cfg &= ~HPET_TN_FSB; + cfg &= ~(HPET_TN_ENABLE | HPET_TN_FSB); hpet_writel(cfg, HPET_Tn_CFG(hdev->num)); } diff --git a/arch/x86/kernel/i387.c b/arch/x86/kernel/i387.c index 675a05012449..245a71db401a 100644 --- a/arch/x86/kernel/i387.c +++ b/arch/x86/kernel/i387.c @@ -175,7 +175,11 @@ void __cpuinit fpu_init(void) cr0 |= X86_CR0_EM; write_cr0(cr0); - if (!smp_processor_id()) + /* + * init_thread_xstate is only called once to avoid overriding + * xstate_size during boot time or during CPU hotplug. 
+ */ + if (xstate_size == 0) init_thread_xstate(); mxcsr_feature_mask_init(); diff --git a/arch/x86/kernel/kvm.c b/arch/x86/kernel/kvm.c index 4180a874c764..08b973f64032 100644 --- a/arch/x86/kernel/kvm.c +++ b/arch/x86/kernel/kvm.c @@ -42,6 +42,7 @@ #include <asm/apic.h> #include <asm/apicdef.h> #include <asm/hypervisor.h> +#include <asm/kvm_guest.h> static int kvmapf = 1; @@ -62,6 +63,15 @@ static int parse_no_stealacc(char *arg) early_param("no-steal-acc", parse_no_stealacc); +static int kvmclock_vsyscall = 1; +static int parse_no_kvmclock_vsyscall(char *arg) +{ + kvmclock_vsyscall = 0; + return 0; +} + +early_param("no-kvmclock-vsyscall", parse_no_kvmclock_vsyscall); + static DEFINE_PER_CPU(struct kvm_vcpu_pv_apf_data, apf_reason) __aligned(64); static DEFINE_PER_CPU(struct kvm_steal_time, steal_time) __aligned(64); static int has_steal_clock = 0; @@ -110,11 +120,6 @@ void kvm_async_pf_task_wait(u32 token) struct kvm_task_sleep_head *b = &async_pf_sleepers[key]; struct kvm_task_sleep_node n, *e; DEFINE_WAIT(wait); - int cpu, idle; - - cpu = get_cpu(); - idle = idle_cpu(cpu); - put_cpu(); spin_lock(&b->lock); e = _find_apf_task(b, token); @@ -128,7 +133,7 @@ void kvm_async_pf_task_wait(u32 token) n.token = token; n.cpu = smp_processor_id(); - n.halted = idle || preempt_count() > 1; + n.halted = is_idle_task(current) || preempt_count() > 1; init_waitqueue_head(&n.wq); hlist_add_head(&n.link, &b->list); spin_unlock(&b->lock); @@ -471,6 +476,9 @@ void __init kvm_guest_init(void) if (kvm_para_has_feature(KVM_FEATURE_PV_EOI)) apic_set_eoi_write(kvm_guest_apic_eoi_write); + if (kvmclock_vsyscall) + kvm_setup_vsyscall_timeinfo(); + #ifdef CONFIG_SMP smp_ops.smp_prepare_boot_cpu = kvm_smp_prepare_boot_cpu; register_cpu_notifier(&kvm_cpu_notifier); diff --git a/arch/x86/kernel/kvmclock.c b/arch/x86/kernel/kvmclock.c index f1b42b3a186c..220a360010f8 100644 --- a/arch/x86/kernel/kvmclock.c +++ b/arch/x86/kernel/kvmclock.c @@ -23,6 +23,7 @@ #include <asm/apic.h> #include <linux/percpu.h> #include <linux/hardirq.h> +#include <linux/memblock.h> #include <asm/x86_init.h> #include <asm/reboot.h> @@ -39,7 +40,7 @@ static int parse_no_kvmclock(char *arg) early_param("no-kvmclock", parse_no_kvmclock); /* The hypervisor will put information about time periodically here */ -static DEFINE_PER_CPU_SHARED_ALIGNED(struct pvclock_vcpu_time_info, hv_clock); +static struct pvclock_vsyscall_time_info *hv_clock; static struct pvclock_wall_clock wall_clock; /* @@ -52,15 +53,20 @@ static unsigned long kvm_get_wallclock(void) struct pvclock_vcpu_time_info *vcpu_time; struct timespec ts; int low, high; + int cpu; low = (int)__pa_symbol(&wall_clock); high = ((u64)__pa_symbol(&wall_clock) >> 32); native_write_msr(msr_kvm_wall_clock, low, high); - vcpu_time = &get_cpu_var(hv_clock); + preempt_disable(); + cpu = smp_processor_id(); + + vcpu_time = &hv_clock[cpu].pvti; pvclock_read_wallclock(&wall_clock, vcpu_time, &ts); - put_cpu_var(hv_clock); + + preempt_enable(); return ts.tv_sec; } @@ -74,9 +80,11 @@ static cycle_t kvm_clock_read(void) { struct pvclock_vcpu_time_info *src; cycle_t ret; + int cpu; preempt_disable_notrace(); - src = &__get_cpu_var(hv_clock); + cpu = smp_processor_id(); + src = &hv_clock[cpu].pvti; ret = pvclock_clocksource_read(src); preempt_enable_notrace(); return ret; @@ -99,8 +107,15 @@ static cycle_t kvm_clock_get_cycles(struct clocksource *cs) static unsigned long kvm_get_tsc_khz(void) { struct pvclock_vcpu_time_info *src; - src = &per_cpu(hv_clock, 0); - return pvclock_tsc_khz(src); + int cpu; + 
unsigned long tsc_khz; + + preempt_disable(); + cpu = smp_processor_id(); + src = &hv_clock[cpu].pvti; + tsc_khz = pvclock_tsc_khz(src); + preempt_enable(); + return tsc_khz; } static void kvm_get_preset_lpj(void) @@ -119,10 +134,14 @@ bool kvm_check_and_clear_guest_paused(void) { bool ret = false; struct pvclock_vcpu_time_info *src; + int cpu = smp_processor_id(); - src = &__get_cpu_var(hv_clock); + if (!hv_clock) + return ret; + + src = &hv_clock[cpu].pvti; if ((src->flags & PVCLOCK_GUEST_STOPPED) != 0) { - __this_cpu_and(hv_clock.flags, ~PVCLOCK_GUEST_STOPPED); + src->flags &= ~PVCLOCK_GUEST_STOPPED; ret = true; } @@ -141,9 +160,10 @@ int kvm_register_clock(char *txt) { int cpu = smp_processor_id(); int low, high, ret; + struct pvclock_vcpu_time_info *src = &hv_clock[cpu].pvti; - low = (int)__pa(&per_cpu(hv_clock, cpu)) | 1; - high = ((u64)__pa(&per_cpu(hv_clock, cpu)) >> 32); + low = (int)__pa(src) | 1; + high = ((u64)__pa(src) >> 32); ret = native_write_msr_safe(msr_kvm_system_time, low, high); printk(KERN_INFO "kvm-clock: cpu %d, msr %x:%x, %s\n", cpu, high, low, txt); @@ -197,6 +217,8 @@ static void kvm_shutdown(void) void __init kvmclock_init(void) { + unsigned long mem; + if (!kvm_para_available()) return; @@ -209,8 +231,18 @@ void __init kvmclock_init(void) printk(KERN_INFO "kvm-clock: Using msrs %x and %x", msr_kvm_system_time, msr_kvm_wall_clock); - if (kvm_register_clock("boot clock")) + mem = memblock_alloc(sizeof(struct pvclock_vsyscall_time_info)*NR_CPUS, + PAGE_SIZE); + if (!mem) + return; + hv_clock = __va(mem); + + if (kvm_register_clock("boot clock")) { + hv_clock = NULL; + memblock_free(mem, + sizeof(struct pvclock_vsyscall_time_info)*NR_CPUS); return; + } pv_time_ops.sched_clock = kvm_clock_read; x86_platform.calibrate_tsc = kvm_get_tsc_khz; x86_platform.get_wallclock = kvm_get_wallclock; @@ -233,3 +265,37 @@ void __init kvmclock_init(void) if (kvm_para_has_feature(KVM_FEATURE_CLOCKSOURCE_STABLE_BIT)) pvclock_set_flags(PVCLOCK_TSC_STABLE_BIT); } + +int __init kvm_setup_vsyscall_timeinfo(void) +{ +#ifdef CONFIG_X86_64 + int cpu; + int ret; + u8 flags; + struct pvclock_vcpu_time_info *vcpu_time; + unsigned int size; + + size = sizeof(struct pvclock_vsyscall_time_info)*NR_CPUS; + + preempt_disable(); + cpu = smp_processor_id(); + + vcpu_time = &hv_clock[cpu].pvti; + flags = pvclock_read_flags(vcpu_time); + + if (!(flags & PVCLOCK_TSC_STABLE_BIT)) { + preempt_enable(); + return 1; + } + + if ((ret = pvclock_init_vsyscall(hv_clock, size))) { + preempt_enable(); + return ret; + } + + preempt_enable(); + + kvm_clock.archdata.vclock_mode = VCLOCK_PVCLOCK; +#endif + return 0; +} diff --git a/arch/x86/kernel/microcode_amd.c b/arch/x86/kernel/microcode_amd.c index 7720ff5a9ee2..efdec7cd8e01 100644 --- a/arch/x86/kernel/microcode_amd.c +++ b/arch/x86/kernel/microcode_amd.c @@ -8,8 +8,8 @@ * Tigran Aivazian <tigran@aivazian.fsnet.co.uk> * * Maintainers: - * Andreas Herrmann <andreas.herrmann3@amd.com> - * Borislav Petkov <borislav.petkov@amd.com> + * Andreas Herrmann <herrmann.der.user@googlemail.com> + * Borislav Petkov <bp@alien8.de> * * This driver allows to upgrade microcode on F10h AMD * CPUs and later. 
@@ -190,6 +190,7 @@ static unsigned int verify_patch_size(int cpu, u32 patch_size, #define F1XH_MPB_MAX_SIZE 2048 #define F14H_MPB_MAX_SIZE 1824 #define F15H_MPB_MAX_SIZE 4096 +#define F16H_MPB_MAX_SIZE 3458 switch (c->x86) { case 0x14: @@ -198,6 +199,9 @@ static unsigned int verify_patch_size(int cpu, u32 patch_size, case 0x15: max_size = F15H_MPB_MAX_SIZE; break; + case 0x16: + max_size = F16H_MPB_MAX_SIZE; + break; default: max_size = F1XH_MPB_MAX_SIZE; break; diff --git a/arch/x86/kernel/process.c b/arch/x86/kernel/process.c index b644e1c765dc..2ed787f15bf0 100644 --- a/arch/x86/kernel/process.c +++ b/arch/x86/kernel/process.c @@ -262,36 +262,6 @@ void __switch_to_xtra(struct task_struct *prev_p, struct task_struct *next_p, propagate_user_return_notify(prev_p, next_p); } -int sys_fork(struct pt_regs *regs) -{ - return do_fork(SIGCHLD, regs->sp, regs, 0, NULL, NULL); -} - -/* - * This is trivial, and on the face of it looks like it - * could equally well be done in user mode. - * - * Not so, for quite unobvious reasons - register pressure. - * In user mode vfork() cannot have a stack frame, and if - * done by calling the "clone()" system call directly, you - * do not have enough call-clobbered registers to hold all - * the information you need. - */ -int sys_vfork(struct pt_regs *regs) -{ - return do_fork(CLONE_VFORK | CLONE_VM | SIGCHLD, regs->sp, regs, 0, - NULL, NULL); -} - -long -sys_clone(unsigned long clone_flags, unsigned long newsp, - void __user *parent_tid, void __user *child_tid, struct pt_regs *regs) -{ - if (!newsp) - newsp = regs->sp; - return do_fork(clone_flags, newsp, regs, 0, parent_tid, child_tid); -} - /* * Idle related variables and functions */ @@ -306,11 +276,6 @@ void (*pm_idle)(void); EXPORT_SYMBOL(pm_idle); #endif -static inline int hlt_use_halt(void) -{ - return 1; -} - #ifndef CONFIG_SMP static inline void play_dead(void) { @@ -410,28 +375,22 @@ void cpu_idle(void) */ void default_idle(void) { - if (hlt_use_halt()) { - trace_power_start_rcuidle(POWER_CSTATE, 1, smp_processor_id()); - trace_cpu_idle_rcuidle(1, smp_processor_id()); - current_thread_info()->status &= ~TS_POLLING; - /* - * TS_POLLING-cleared state must be visible before we - * test NEED_RESCHED: - */ - smp_mb(); + trace_power_start_rcuidle(POWER_CSTATE, 1, smp_processor_id()); + trace_cpu_idle_rcuidle(1, smp_processor_id()); + current_thread_info()->status &= ~TS_POLLING; + /* + * TS_POLLING-cleared state must be visible before we + * test NEED_RESCHED: + */ + smp_mb(); - if (!need_resched()) - safe_halt(); /* enables interrupts racelessly */ - else - local_irq_enable(); - current_thread_info()->status |= TS_POLLING; - trace_power_end_rcuidle(smp_processor_id()); - trace_cpu_idle_rcuidle(PWR_EVENT_EXIT, smp_processor_id()); - } else { + if (!need_resched()) + safe_halt(); /* enables interrupts racelessly */ + else local_irq_enable(); - /* loop is done by the caller */ - cpu_relax(); - } + current_thread_info()->status |= TS_POLLING; + trace_power_end_rcuidle(smp_processor_id()); + trace_cpu_idle_rcuidle(PWR_EVENT_EXIT, smp_processor_id()); } #ifdef CONFIG_APM_MODULE EXPORT_SYMBOL(default_idle); diff --git a/arch/x86/kernel/process_32.c b/arch/x86/kernel/process_32.c index 44e0bff38e72..b5a8905785e6 100644 --- a/arch/x86/kernel/process_32.c +++ b/arch/x86/kernel/process_32.c @@ -128,8 +128,7 @@ void release_thread(struct task_struct *dead_task) } int copy_thread(unsigned long clone_flags, unsigned long sp, - unsigned long arg, - struct task_struct *p, struct pt_regs *regs) + unsigned long arg, 
struct task_struct *p) { struct pt_regs *childregs = task_pt_regs(p); struct task_struct *tsk; @@ -138,7 +137,7 @@ int copy_thread(unsigned long clone_flags, unsigned long sp, p->thread.sp = (unsigned long) childregs; p->thread.sp0 = (unsigned long) (childregs+1); - if (unlikely(!regs)) { + if (unlikely(p->flags & PF_KTHREAD)) { /* kernel thread */ memset(childregs, 0, sizeof(struct pt_regs)); p->thread.ip = (unsigned long) ret_from_kernel_thread; @@ -156,12 +155,13 @@ int copy_thread(unsigned long clone_flags, unsigned long sp, memset(p->thread.ptrace_bps, 0, sizeof(p->thread.ptrace_bps)); return 0; } - *childregs = *regs; + *childregs = *current_pt_regs(); childregs->ax = 0; - childregs->sp = sp; + if (sp) + childregs->sp = sp; p->thread.ip = (unsigned long) ret_from_fork; - task_user_gs(p) = get_user_gs(regs); + task_user_gs(p) = get_user_gs(current_pt_regs()); p->fpu_counter = 0; p->thread.io_bitmap_ptr = NULL; diff --git a/arch/x86/kernel/process_64.c b/arch/x86/kernel/process_64.c index 16c6365e2b86..6e68a6194965 100644 --- a/arch/x86/kernel/process_64.c +++ b/arch/x86/kernel/process_64.c @@ -146,8 +146,7 @@ static inline u32 read_32bit_tls(struct task_struct *t, int tls) } int copy_thread(unsigned long clone_flags, unsigned long sp, - unsigned long arg, - struct task_struct *p, struct pt_regs *regs) + unsigned long arg, struct task_struct *p) { int err; struct pt_regs *childregs; @@ -169,7 +168,7 @@ int copy_thread(unsigned long clone_flags, unsigned long sp, savesegment(ds, p->thread.ds); memset(p->thread.ptrace_bps, 0, sizeof(p->thread.ptrace_bps)); - if (unlikely(!regs)) { + if (unlikely(p->flags & PF_KTHREAD)) { /* kernel thread */ memset(childregs, 0, sizeof(struct pt_regs)); childregs->sp = (unsigned long)childregs; @@ -181,10 +180,11 @@ int copy_thread(unsigned long clone_flags, unsigned long sp, childregs->flags = X86_EFLAGS_IF | X86_EFLAGS_BIT1; return 0; } - *childregs = *regs; + *childregs = *current_pt_regs(); childregs->ax = 0; - childregs->sp = sp; + if (sp) + childregs->sp = sp; err = -ENOMEM; memset(p->thread.ptrace_bps, 0, sizeof(p->thread.ptrace_bps)); diff --git a/arch/x86/kernel/ptrace.c b/arch/x86/kernel/ptrace.c index b00b33a18390..b629bbe0d9bd 100644 --- a/arch/x86/kernel/ptrace.c +++ b/arch/x86/kernel/ptrace.c @@ -22,6 +22,8 @@ #include <linux/perf_event.h> #include <linux/hw_breakpoint.h> #include <linux/rcupdate.h> +#include <linux/module.h> +#include <linux/context_tracking.h> #include <asm/uaccess.h> #include <asm/pgtable.h> @@ -166,6 +168,35 @@ static inline bool invalid_selector(u16 value) #define FLAG_MASK FLAG_MASK_32 +/* + * X86_32 CPUs don't save ss and esp if the CPU is already in kernel mode + * when it traps. The previous stack will be directly underneath the saved + * registers, and 'sp/ss' won't even have been saved. Thus the '&regs->sp'. + * + * Now, if the stack is empty, '&regs->sp' is out of range. In this + * case we try to take the previous stack. To always return a non-null + * stack pointer we fall back to regs as stack if no previous stack + * exists. + * + * This is valid only for kernel mode traps. 
+ */ +unsigned long kernel_stack_pointer(struct pt_regs *regs) +{ + unsigned long context = (unsigned long)regs & ~(THREAD_SIZE - 1); + unsigned long sp = (unsigned long)&regs->sp; + struct thread_info *tinfo; + + if (context == (sp & ~(THREAD_SIZE - 1))) + return sp; + + tinfo = (struct thread_info *)context; + if (tinfo->previous_esp) + return tinfo->previous_esp; + + return (unsigned long)regs; +} +EXPORT_SYMBOL_GPL(kernel_stack_pointer); + static unsigned long *pt_regs_access(struct pt_regs *regs, unsigned long regno) { BUILD_BUG_ON(offsetof(struct pt_regs, bx) != 0); @@ -1461,7 +1492,7 @@ long syscall_trace_enter(struct pt_regs *regs) { long ret = 0; - rcu_user_exit(); + user_exit(); /* * If we stepped into a sysenter/syscall insn, it trapped in @@ -1511,6 +1542,13 @@ void syscall_trace_leave(struct pt_regs *regs) { bool step; + /* + * We may come here right after calling schedule_user() + * or do_notify_resume(), in which case we can be in RCU + * user mode. + */ + user_exit(); + audit_syscall_exit(regs); if (unlikely(test_thread_flag(TIF_SYSCALL_TRACEPOINT))) @@ -1527,5 +1565,5 @@ void syscall_trace_leave(struct pt_regs *regs) if (step || test_thread_flag(TIF_SYSCALL_TRACE)) tracehook_report_syscall_exit(regs, step); - rcu_user_enter(); + user_enter(); } diff --git a/arch/x86/kernel/pvclock.c b/arch/x86/kernel/pvclock.c index 42eb3300dfc6..85c39590c1a4 100644 --- a/arch/x86/kernel/pvclock.c +++ b/arch/x86/kernel/pvclock.c @@ -17,23 +17,13 @@ #include <linux/kernel.h> #include <linux/percpu.h> +#include <linux/notifier.h> +#include <linux/sched.h> +#include <linux/gfp.h> +#include <linux/bootmem.h> +#include <asm/fixmap.h> #include <asm/pvclock.h> -/* - * These are perodically updated - * xen: magic shared_info page - * kvm: gpa registered via msr - * and then copied here. - */ -struct pvclock_shadow_time { - u64 tsc_timestamp; /* TSC at last update of time vals. */ - u64 system_timestamp; /* Time, in nanosecs, since boot. */ - u32 tsc_to_nsec_mul; - int tsc_shift; - u32 version; - u8 flags; -}; - static u8 valid_flags __read_mostly = 0; void pvclock_set_flags(u8 flags) @@ -41,34 +31,6 @@ void pvclock_set_flags(u8 flags) valid_flags = flags; } -static u64 pvclock_get_nsec_offset(struct pvclock_shadow_time *shadow) -{ - u64 delta = native_read_tsc() - shadow->tsc_timestamp; - return pvclock_scale_delta(delta, shadow->tsc_to_nsec_mul, - shadow->tsc_shift); -} - -/* - * Reads a consistent set of time-base values from hypervisor, - * into a shadow data area. 
- */ -static unsigned pvclock_get_time_values(struct pvclock_shadow_time *dst, - struct pvclock_vcpu_time_info *src) -{ - do { - dst->version = src->version; - rmb(); /* fetch version before data */ - dst->tsc_timestamp = src->tsc_timestamp; - dst->system_timestamp = src->system_time; - dst->tsc_to_nsec_mul = src->tsc_to_system_mul; - dst->tsc_shift = src->tsc_shift; - dst->flags = src->flags; - rmb(); /* test version after fetching data */ - } while ((src->version & 1) || (dst->version != src->version)); - - return dst->version; -} - unsigned long pvclock_tsc_khz(struct pvclock_vcpu_time_info *src) { u64 pv_tsc_khz = 1000000ULL << 32; @@ -88,23 +50,32 @@ void pvclock_resume(void) atomic64_set(&last_value, 0); } +u8 pvclock_read_flags(struct pvclock_vcpu_time_info *src) +{ + unsigned version; + cycle_t ret; + u8 flags; + + do { + version = __pvclock_read_cycles(src, &ret, &flags); + } while ((src->version & 1) || version != src->version); + + return flags & valid_flags; +} + cycle_t pvclock_clocksource_read(struct pvclock_vcpu_time_info *src) { - struct pvclock_shadow_time shadow; unsigned version; - cycle_t ret, offset; + cycle_t ret; u64 last; + u8 flags; do { - version = pvclock_get_time_values(&shadow, src); - barrier(); - offset = pvclock_get_nsec_offset(&shadow); - ret = shadow.system_timestamp + offset; - barrier(); - } while (version != src->version); + version = __pvclock_read_cycles(src, &ret, &flags); + } while ((src->version & 1) || version != src->version); if ((valid_flags & PVCLOCK_TSC_STABLE_BIT) && - (shadow.flags & PVCLOCK_TSC_STABLE_BIT)) + (flags & PVCLOCK_TSC_STABLE_BIT)) return ret; /* @@ -156,3 +127,71 @@ void pvclock_read_wallclock(struct pvclock_wall_clock *wall_clock, set_normalized_timespec(ts, now.tv_sec, now.tv_nsec); } + +static struct pvclock_vsyscall_time_info *pvclock_vdso_info; + +static struct pvclock_vsyscall_time_info * +pvclock_get_vsyscall_user_time_info(int cpu) +{ + if (!pvclock_vdso_info) { + BUG(); + return NULL; + } + + return &pvclock_vdso_info[cpu]; +} + +struct pvclock_vcpu_time_info *pvclock_get_vsyscall_time_info(int cpu) +{ + return &pvclock_get_vsyscall_user_time_info(cpu)->pvti; +} + +#ifdef CONFIG_X86_64 +static int pvclock_task_migrate(struct notifier_block *nb, unsigned long l, + void *v) +{ + struct task_migration_notifier *mn = v; + struct pvclock_vsyscall_time_info *pvti; + + pvti = pvclock_get_vsyscall_user_time_info(mn->from_cpu); + + /* this is NULL when pvclock vsyscall is not initialized */ + if (unlikely(pvti == NULL)) + return NOTIFY_DONE; + + pvti->migrate_count++; + + return NOTIFY_DONE; +} + +static struct notifier_block pvclock_migrate = { + .notifier_call = pvclock_task_migrate, +}; + +/* + * Initialize the generic pvclock vsyscall state. 
This will allocate + * a/some page(s) for the per-vcpu pvclock information, set up a + * fixmap mapping for the page(s) + */ + +int __init pvclock_init_vsyscall(struct pvclock_vsyscall_time_info *i, + int size) +{ + int idx; + + WARN_ON (size != PVCLOCK_VSYSCALL_NR_PAGES*PAGE_SIZE); + + pvclock_vdso_info = i; + + for (idx = 0; idx <= (PVCLOCK_FIXMAP_END-PVCLOCK_FIXMAP_BEGIN); idx++) { + __set_fixmap(PVCLOCK_FIXMAP_BEGIN + idx, + __pa_symbol(i) + (idx*PAGE_SIZE), + PAGE_KERNEL_VVAR); + } + + + register_task_migration_notifier(&pvclock_migrate); + + return 0; +} +#endif diff --git a/arch/x86/kernel/rtc.c b/arch/x86/kernel/rtc.c index 4929c1be0ac0..801602b5d745 100644 --- a/arch/x86/kernel/rtc.c +++ b/arch/x86/kernel/rtc.c @@ -195,12 +195,6 @@ void read_persistent_clock(struct timespec *ts) ts->tv_nsec = 0; } -unsigned long long native_read_tsc(void) -{ - return __native_read_tsc(); -} -EXPORT_SYMBOL(native_read_tsc); - static struct resource rtc_resources[] = { [0] = { diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c index 468e98dfd44e..c228322ca180 100644 --- a/arch/x86/kernel/setup.c +++ b/arch/x86/kernel/setup.c @@ -143,11 +143,7 @@ int default_check_phys_apicid_present(int phys_apicid) } #endif -#ifndef CONFIG_DEBUG_BOOT_PARAMS -struct boot_params __initdata boot_params; -#else struct boot_params boot_params; -#endif /* * Machine setup.. @@ -921,18 +917,19 @@ void __init setup_arch(char **cmdline_p) #ifdef CONFIG_X86_64 if (max_pfn > max_low_pfn) { int i; - for (i = 0; i < e820.nr_map; i++) { - struct e820entry *ei = &e820.map[i]; + unsigned long start, end; + unsigned long start_pfn, end_pfn; - if (ei->addr + ei->size <= 1UL << 32) - continue; + for_each_mem_pfn_range(i, MAX_NUMNODES, &start_pfn, &end_pfn, + NULL) { - if (ei->type == E820_RESERVED) + end = PFN_PHYS(end_pfn); + if (end <= (1UL<<32)) continue; + start = PFN_PHYS(start_pfn); max_pfn_mapped = init_memory_mapping( - ei->addr < 1UL << 32 ? 1UL << 32 : ei->addr, - ei->addr + ei->size); + max((1UL<<32), start), end); } /* can we preseve max_low_pfn ?*/ @@ -1048,6 +1045,18 @@ void __init setup_arch(char **cmdline_p) arch_init_ideal_nops(); register_refined_jiffies(CLOCK_TICK_RATE); + +#ifdef CONFIG_EFI + /* Once setup is done above, disable efi_enabled on mismatched + * firmware/kernel archtectures since there is no support for + * runtime services. 
+ */ + if (efi_enabled && IS_ENABLED(CONFIG_X86_64) != efi_64bit) { + pr_info("efi: Setup done, disabling due to 32/64-bit mismatch\n"); + efi_unmap_memmap(); + efi_enabled = 0; + } +#endif } #ifdef CONFIG_X86_32 diff --git a/arch/x86/kernel/signal.c b/arch/x86/kernel/signal.c index 70b27ee6118e..fbbb604313a2 100644 --- a/arch/x86/kernel/signal.c +++ b/arch/x86/kernel/signal.c @@ -22,6 +22,7 @@ #include <linux/uaccess.h> #include <linux/user-return-notifier.h> #include <linux/uprobes.h> +#include <linux/context_tracking.h> #include <asm/processor.h> #include <asm/ucontext.h> @@ -816,7 +817,7 @@ static void do_signal(struct pt_regs *regs) void do_notify_resume(struct pt_regs *regs, void *unused, __u32 thread_info_flags) { - rcu_user_exit(); + user_exit(); #ifdef CONFIG_X86_MCE /* notify userspace of pending MCEs */ @@ -838,7 +839,7 @@ do_notify_resume(struct pt_regs *regs, void *unused, __u32 thread_info_flags) if (thread_info_flags & _TIF_USER_RETURN_NOTIFY) fire_user_return_notifiers(); - rcu_user_enter(); + user_enter(); } void signal_fault(struct pt_regs *regs, void __user *frame, char *where) diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c index c80a33bc528b..ed0fe385289d 100644 --- a/arch/x86/kernel/smpboot.c +++ b/arch/x86/kernel/smpboot.c @@ -68,6 +68,8 @@ #include <asm/mwait.h> #include <asm/apic.h> #include <asm/io_apic.h> +#include <asm/i387.h> +#include <asm/fpu-internal.h> #include <asm/setup.h> #include <asm/uv/uv.h> #include <linux/mc146818rtc.h> @@ -125,8 +127,8 @@ EXPORT_PER_CPU_SYMBOL(cpu_info); atomic_t init_deasserted; /* - * Report back to the Boot Processor. - * Running on AP. + * Report back to the Boot Processor during boot time or to the caller processor + * during CPU online. */ static void __cpuinit smp_callin(void) { @@ -138,15 +140,17 @@ static void __cpuinit smp_callin(void) * we may get here before an INIT-deassert IPI reaches * our local APIC. We have to wait for the IPI or we'll * lock up on an APIC access. + * + * Since CPU0 is not wakened up by INIT, it doesn't wait for the IPI. */ - if (apic->wait_for_init_deassert) + cpuid = smp_processor_id(); + if (apic->wait_for_init_deassert && cpuid != 0) apic->wait_for_init_deassert(&init_deasserted); /* * (This works even if the APIC is not enabled.) */ phys_id = read_apic_id(); - cpuid = smp_processor_id(); if (cpumask_test_cpu(cpuid, cpu_callin_mask)) { panic("%s: phys CPU#%d, CPU#%d already present??\n", __func__, phys_id, cpuid); @@ -228,6 +232,8 @@ static void __cpuinit smp_callin(void) cpumask_set_cpu(cpuid, cpu_callin_mask); } +static int cpu0_logical_apicid; +static int enable_start_cpu0; /* * Activate a secondary processor. */ @@ -243,6 +249,8 @@ notrace static void __cpuinit start_secondary(void *unused) preempt_disable(); smp_callin(); + enable_start_cpu0 = 0; + #ifdef CONFIG_X86_32 /* switch away from the initial page table */ load_cr3(swapper_pg_dir); @@ -279,19 +287,30 @@ notrace static void __cpuinit start_secondary(void *unused) cpu_idle(); } +void __init smp_store_boot_cpu_info(void) +{ + int id = 0; /* CPU 0 */ + struct cpuinfo_x86 *c = &cpu_data(id); + + *c = boot_cpu_data; + c->cpu_index = id; +} + /* * The bootstrap kernel entry code has set these up. Save them for * a given CPU */ - void __cpuinit smp_store_cpu_info(int id) { struct cpuinfo_x86 *c = &cpu_data(id); *c = boot_cpu_data; c->cpu_index = id; - if (id != 0) - identify_secondary_cpu(c); + /* + * During boot time, CPU0 has this setup already. Save the info when + * bringing up AP or offlined CPU0. 
+ */ + identify_secondary_cpu(c); } static bool __cpuinit @@ -313,7 +332,7 @@ do { \ static bool __cpuinit match_smt(struct cpuinfo_x86 *c, struct cpuinfo_x86 *o) { - if (cpu_has(c, X86_FEATURE_TOPOEXT)) { + if (cpu_has_topoext) { int cpu1 = c->cpu_index, cpu2 = o->cpu_index; if (c->phys_proc_id == o->phys_proc_id && @@ -481,7 +500,7 @@ void __inquire_remote_apic(int apicid) * won't ... remember to clear down the APIC, etc later. */ int __cpuinit -wakeup_secondary_cpu_via_nmi(int logical_apicid, unsigned long start_eip) +wakeup_secondary_cpu_via_nmi(int apicid, unsigned long start_eip) { unsigned long send_status, accept_status = 0; int maxlvt; @@ -489,7 +508,7 @@ wakeup_secondary_cpu_via_nmi(int logical_apicid, unsigned long start_eip) /* Target chip */ /* Boot on the stack */ /* Kick the second */ - apic_icr_write(APIC_DM_NMI | apic->dest_logical, logical_apicid); + apic_icr_write(APIC_DM_NMI | apic->dest_logical, apicid); pr_debug("Waiting for send to finish...\n"); send_status = safe_apic_wait_icr_idle(); @@ -649,6 +668,63 @@ static void __cpuinit announce_cpu(int cpu, int apicid) node, cpu, apicid); } +static int wakeup_cpu0_nmi(unsigned int cmd, struct pt_regs *regs) +{ + int cpu; + + cpu = smp_processor_id(); + if (cpu == 0 && !cpu_online(cpu) && enable_start_cpu0) + return NMI_HANDLED; + + return NMI_DONE; +} + +/* + * Wake up AP by INIT, INIT, STARTUP sequence. + * + * Instead of waiting for STARTUP after INITs, BSP will execute the BIOS + * boot-strap code which is not a desired behavior for waking up BSP. To + * void the boot-strap code, wake up CPU0 by NMI instead. + * + * This works to wake up soft offlined CPU0 only. If CPU0 is hard offlined + * (i.e. physically hot removed and then hot added), NMI won't wake it up. + * We'll change this code in the future to wake up hard offlined CPU0 if + * real platform and request are available. + */ +static int __cpuinit +wakeup_cpu_via_init_nmi(int cpu, unsigned long start_ip, int apicid, + int *cpu0_nmi_registered) +{ + int id; + int boot_error; + + /* + * Wake up AP by INIT, INIT, STARTUP sequence. + */ + if (cpu) + return wakeup_secondary_cpu_via_init(apicid, start_ip); + + /* + * Wake up BSP by nmi. + * + * Register a NMI handler to help wake up CPU0. + */ + boot_error = register_nmi_handler(NMI_LOCAL, + wakeup_cpu0_nmi, 0, "wake_cpu0"); + + if (!boot_error) { + enable_start_cpu0 = 1; + *cpu0_nmi_registered = 1; + if (apic->dest_logical == APIC_DEST_LOGICAL) + id = cpu0_logical_apicid; + else + id = apicid; + boot_error = wakeup_secondary_cpu_via_nmi(id, start_ip); + } + + return boot_error; +} + /* * NOTE - on most systems this is a PHYSICAL apic ID, but on multiquad * (ie clustered apic addressing mode), this is a LOGICAL apic ID. @@ -664,6 +740,7 @@ static int __cpuinit do_boot_cpu(int apicid, int cpu, struct task_struct *idle) unsigned long boot_error = 0; int timeout; + int cpu0_nmi_registered = 0; /* Just in case we booted with a single CPU. */ alternatives_enable_smp(); @@ -711,13 +788,16 @@ static int __cpuinit do_boot_cpu(int apicid, int cpu, struct task_struct *idle) } /* - * Kick the secondary CPU. Use the method in the APIC driver - * if it's defined - or use an INIT boot APIC message otherwise: + * Wake up a CPU in difference cases: + * - Use the method in the APIC driver if it's defined + * Otherwise, + * - Use an INIT boot APIC message for APs or NMI for BSP. 
*/ if (apic->wakeup_secondary_cpu) boot_error = apic->wakeup_secondary_cpu(apicid, start_ip); else - boot_error = wakeup_secondary_cpu_via_init(apicid, start_ip); + boot_error = wakeup_cpu_via_init_nmi(cpu, start_ip, apicid, + &cpu0_nmi_registered); if (!boot_error) { /* @@ -782,6 +862,13 @@ static int __cpuinit do_boot_cpu(int apicid, int cpu, struct task_struct *idle) */ smpboot_restore_warm_reset_vector(); } + /* + * Clean up the nmi handler. Do this after the callin and callout sync + * to avoid impact of possible long unregister time. + */ + if (cpu0_nmi_registered) + unregister_nmi_handler(NMI_LOCAL, "wake_cpu0"); + return boot_error; } @@ -795,7 +882,7 @@ int __cpuinit native_cpu_up(unsigned int cpu, struct task_struct *tidle) pr_debug("++++++++++++++++++++=_---CPU UP %u\n", cpu); - if (apicid == BAD_APICID || apicid == boot_cpu_physical_apicid || + if (apicid == BAD_APICID || !physid_isset(apicid, phys_cpu_present_map) || !apic->apic_id_valid(apicid)) { pr_err("%s: bad cpu %d\n", __func__, cpu); @@ -818,6 +905,9 @@ int __cpuinit native_cpu_up(unsigned int cpu, struct task_struct *tidle) per_cpu(cpu_state, cpu) = CPU_UP_PREPARE; + /* the FPU context is blank, nobody can own it */ + __cpu_disable_lazy_restore(cpu); + err = do_boot_cpu(apicid, cpu, tidle); if (err) { pr_debug("do_boot_cpu failed %d\n", err); @@ -990,7 +1080,7 @@ void __init native_smp_prepare_cpus(unsigned int max_cpus) /* * Setup boot CPU information */ - smp_store_cpu_info(0); /* Final full version of the data */ + smp_store_boot_cpu_info(); /* Final full version of the data */ cpumask_copy(cpu_callin_mask, cpumask_of(0)); mb(); @@ -1026,6 +1116,11 @@ void __init native_smp_prepare_cpus(unsigned int max_cpus) */ setup_local_APIC(); + if (x2apic_mode) + cpu0_logical_apicid = apic_read(APIC_LDR); + else + cpu0_logical_apicid = GET_APIC_LOGICAL_ID(apic_read(APIC_LDR)); + /* * Enable IO APIC before setting up error vector */ @@ -1214,19 +1309,6 @@ void cpu_disable_common(void) int native_cpu_disable(void) { - int cpu = smp_processor_id(); - - /* - * Perhaps use cpufreq to drop frequency, but that could go - * into generic code. - * - * We won't take down the boot processor on i386 due to some - * interrupts only being able to be serviced by the BSP. - * Especially so if we're not using an IOAPIC -zwane - */ - if (cpu == 0) - return -EBUSY; - clear_local_APIC(); cpu_disable_common(); @@ -1266,6 +1348,14 @@ void play_dead_common(void) local_irq_disable(); } +static bool wakeup_cpu0(void) +{ + if (smp_processor_id() == 0 && enable_start_cpu0) + return true; + + return false; +} + /* * We need to flush the caches before going to sleep, lest we have * dirty data in our caches when we come back up. @@ -1329,6 +1419,11 @@ static inline void mwait_play_dead(void) __monitor(mwait_ptr, 0, 0); mb(); __mwait(eax, 0); + /* + * If NMI wants to wake up CPU0, start CPU0. + */ + if (wakeup_cpu0()) + start_cpu0(); } } @@ -1339,6 +1434,11 @@ static inline void hlt_play_dead(void) while (1) { native_halt(); + /* + * If NMI wants to wake up CPU0, start CPU0. + */ + if (wakeup_cpu0()) + start_cpu0(); } } diff --git a/arch/x86/kernel/sys_x86_64.c b/arch/x86/kernel/sys_x86_64.c index b4d3c3927dd8..97ef74b88e0f 100644 --- a/arch/x86/kernel/sys_x86_64.c +++ b/arch/x86/kernel/sys_x86_64.c @@ -21,37 +21,23 @@ /* * Align a virtual address to avoid aliasing in the I$ on AMD F15h. - * - * @flags denotes the allocation direction - bottomup or topdown - - * or vDSO; see call sites below. 
*/ -unsigned long align_addr(unsigned long addr, struct file *filp, - enum align_flags flags) +static unsigned long get_align_mask(void) { - unsigned long tmp_addr; - /* handle 32- and 64-bit case with a single conditional */ if (va_align.flags < 0 || !(va_align.flags & (2 - mmap_is_ia32()))) - return addr; + return 0; if (!(current->flags & PF_RANDOMIZE)) - return addr; - - if (!((flags & ALIGN_VDSO) || filp)) - return addr; - - tmp_addr = addr; - - /* - * We need an address which is <= than the original - * one only when in topdown direction. - */ - if (!(flags & ALIGN_TOPDOWN)) - tmp_addr += va_align.mask; + return 0; - tmp_addr &= ~va_align.mask; + return va_align.mask; +} - return tmp_addr; +unsigned long align_vdso_addr(unsigned long addr) +{ + unsigned long align_mask = get_align_mask(); + return (addr + align_mask) & ~align_mask; } static int __init control_va_addr_alignment(char *str) @@ -126,7 +112,7 @@ arch_get_unmapped_area(struct file *filp, unsigned long addr, { struct mm_struct *mm = current->mm; struct vm_area_struct *vma; - unsigned long start_addr; + struct vm_unmapped_area_info info; unsigned long begin, end; if (flags & MAP_FIXED) @@ -144,50 +130,16 @@ arch_get_unmapped_area(struct file *filp, unsigned long addr, (!vma || addr + len <= vma->vm_start)) return addr; } - if (((flags & MAP_32BIT) || test_thread_flag(TIF_ADDR32)) - && len <= mm->cached_hole_size) { - mm->cached_hole_size = 0; - mm->free_area_cache = begin; - } - addr = mm->free_area_cache; - if (addr < begin) - addr = begin; - start_addr = addr; - -full_search: - - addr = align_addr(addr, filp, 0); - - for (vma = find_vma(mm, addr); ; vma = vma->vm_next) { - /* At this point: (!vma || addr < vma->vm_end). */ - if (end - len < addr) { - /* - * Start a new search - just in case we missed - * some holes. - */ - if (start_addr != begin) { - start_addr = addr = begin; - mm->cached_hole_size = 0; - goto full_search; - } - return -ENOMEM; - } - if (!vma || addr + len <= vma->vm_start) { - /* - * Remember the place where we stopped the search: - */ - mm->free_area_cache = addr + len; - return addr; - } - if (addr + mm->cached_hole_size < vma->vm_start) - mm->cached_hole_size = vma->vm_start - addr; - addr = vma->vm_end; - addr = align_addr(addr, filp, 0); - } + info.flags = 0; + info.length = len; + info.low_limit = begin; + info.high_limit = end; + info.align_mask = filp ? 
get_align_mask() : 0; + info.align_offset = pgoff << PAGE_SHIFT; + return vm_unmapped_area(&info); } - unsigned long arch_get_unmapped_area_topdown(struct file *filp, const unsigned long addr0, const unsigned long len, const unsigned long pgoff, @@ -195,7 +147,8 @@ arch_get_unmapped_area_topdown(struct file *filp, const unsigned long addr0, { struct vm_area_struct *vma; struct mm_struct *mm = current->mm; - unsigned long addr = addr0, start_addr; + unsigned long addr = addr0; + struct vm_unmapped_area_info info; /* requested length too big for entire address space */ if (len > TASK_SIZE) @@ -217,51 +170,16 @@ arch_get_unmapped_area_topdown(struct file *filp, const unsigned long addr0, return addr; } - /* check if free_area_cache is useful for us */ - if (len <= mm->cached_hole_size) { - mm->cached_hole_size = 0; - mm->free_area_cache = mm->mmap_base; - } - -try_again: - /* either no address requested or can't fit in requested address hole */ - start_addr = addr = mm->free_area_cache; - - if (addr < len) - goto fail; - - addr -= len; - do { - addr = align_addr(addr, filp, ALIGN_TOPDOWN); - - /* - * Lookup failure means no vma is above this address, - * else if new region fits below vma->vm_start, - * return with success: - */ - vma = find_vma(mm, addr); - if (!vma || addr+len <= vma->vm_start) - /* remember the address as a hint for next time */ - return mm->free_area_cache = addr; - - /* remember the largest hole we saw so far */ - if (addr + mm->cached_hole_size < vma->vm_start) - mm->cached_hole_size = vma->vm_start - addr; - - /* try just below the current vma->vm_start */ - addr = vma->vm_start-len; - } while (len < vma->vm_start); - -fail: - /* - * if hint left us with no space for the requested - * mapping then try again: - */ - if (start_addr != mm->mmap_base) { - mm->free_area_cache = mm->mmap_base; - mm->cached_hole_size = 0; - goto try_again; - } + info.flags = VM_UNMAPPED_AREA_TOPDOWN; + info.length = len; + info.low_limit = PAGE_SIZE; + info.high_limit = mm->mmap_base; + info.align_mask = filp ? get_align_mask() : 0; + info.align_offset = pgoff << PAGE_SHIFT; + addr = vm_unmapped_area(&info); + if (!(addr & ~PAGE_MASK)) + return addr; + VM_BUG_ON(addr != -ENOMEM); bottomup: /* @@ -270,14 +188,5 @@ bottomup: * can happen with large stack limits and large mmap() * allocations. */ - mm->cached_hole_size = ~0UL; - mm->free_area_cache = TASK_UNMAPPED_BASE; - addr = arch_get_unmapped_area(filp, addr0, len, pgoff, flags); - /* - * Restore the topdown base: - */ - mm->free_area_cache = mm->mmap_base; - mm->cached_hole_size = ~0UL; - - return addr; + return arch_get_unmapped_area(filp, addr0, len, pgoff, flags); } diff --git a/arch/x86/kernel/topology.c b/arch/x86/kernel/topology.c index 76ee97709a00..6e60b5fe2244 100644 --- a/arch/x86/kernel/topology.c +++ b/arch/x86/kernel/topology.c @@ -30,23 +30,110 @@ #include <linux/mmzone.h> #include <linux/init.h> #include <linux/smp.h> +#include <linux/irq.h> #include <asm/cpu.h> static DEFINE_PER_CPU(struct x86_cpu, cpu_devices); #ifdef CONFIG_HOTPLUG_CPU + +#ifdef CONFIG_BOOTPARAM_HOTPLUG_CPU0 +static int cpu0_hotpluggable = 1; +#else +static int cpu0_hotpluggable; +static int __init enable_cpu0_hotplug(char *str) +{ + cpu0_hotpluggable = 1; + return 1; +} + +__setup("cpu0_hotplug", enable_cpu0_hotplug); +#endif + +#ifdef CONFIG_DEBUG_HOTPLUG_CPU0 +/* + * This function offlines a CPU as early as possible and allows userspace to + * boot up without the CPU. The CPU can be onlined back by user after boot. 
+ * + * This is only called for debugging CPU offline/online feature. + */ +int __ref _debug_hotplug_cpu(int cpu, int action) +{ + struct device *dev = get_cpu_device(cpu); + int ret; + + if (!cpu_is_hotpluggable(cpu)) + return -EINVAL; + + cpu_hotplug_driver_lock(); + + switch (action) { + case 0: + ret = cpu_down(cpu); + if (!ret) { + pr_info("CPU %u is now offline\n", cpu); + kobject_uevent(&dev->kobj, KOBJ_OFFLINE); + } else + pr_debug("Can't offline CPU%d.\n", cpu); + break; + case 1: + ret = cpu_up(cpu); + if (!ret) + kobject_uevent(&dev->kobj, KOBJ_ONLINE); + else + pr_debug("Can't online CPU%d.\n", cpu); + break; + default: + ret = -EINVAL; + } + + cpu_hotplug_driver_unlock(); + + return ret; +} + +static int __init debug_hotplug_cpu(void) +{ + _debug_hotplug_cpu(0, 0); + return 0; +} + +late_initcall_sync(debug_hotplug_cpu); +#endif /* CONFIG_DEBUG_HOTPLUG_CPU0 */ + int __ref arch_register_cpu(int num) { + struct cpuinfo_x86 *c = &cpu_data(num); + + /* + * Currently CPU0 is only hotpluggable on Intel platforms. Other + * vendors can add hotplug support later. + */ + if (c->x86_vendor != X86_VENDOR_INTEL) + cpu0_hotpluggable = 0; + /* - * CPU0 cannot be offlined due to several - * restrictions and assumptions in kernel. This basically - * doesn't add a control file, one cannot attempt to offline - * BSP. + * Two known BSP/CPU0 dependencies: Resume from suspend/hibernate + * depends on BSP. PIC interrupts depend on BSP. * - * Also certain PCI quirks require not to enable hotplug control - * for all CPU's. + * If the BSP depencies are under control, one can tell kernel to + * enable BSP hotplug. This basically adds a control file and + * one can attempt to offline BSP. */ - if (num) + if (num == 0 && cpu0_hotpluggable) { + unsigned int irq; + /* + * We won't take down the boot processor on i386 if some + * interrupts only are able to be serviced by the BSP in PIC. + */ + for_each_active_irq(irq) { + if (!IO_APIC_IRQ(irq) && irq_has_action(irq)) { + cpu0_hotpluggable = 0; + break; + } + } + } + if (num || cpu0_hotpluggable) per_cpu(cpu_devices, num).cpu.hotpluggable = 1; return register_cpu(&per_cpu(cpu_devices, num).cpu, num); diff --git a/arch/x86/kernel/trace_clock.c b/arch/x86/kernel/trace_clock.c new file mode 100644 index 000000000000..25b993729f9b --- /dev/null +++ b/arch/x86/kernel/trace_clock.c @@ -0,0 +1,21 @@ +/* + * X86 trace clocks + */ +#include <asm/trace_clock.h> +#include <asm/barrier.h> +#include <asm/msr.h> + +/* + * trace_clock_x86_tsc(): A clock that is just the cycle counter. + * + * Unlike the other clocks, this is not in nanoseconds. 
+ */ +u64 notrace trace_clock_x86_tsc(void) +{ + u64 ret; + + rdtsc_barrier(); + rdtscll(ret); + + return ret; +} diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c index 8276dc6794cc..eb8586693e0b 100644 --- a/arch/x86/kernel/traps.c +++ b/arch/x86/kernel/traps.c @@ -55,7 +55,7 @@ #include <asm/i387.h> #include <asm/fpu-internal.h> #include <asm/mce.h> -#include <asm/rcu.h> +#include <asm/context_tracking.h> #include <asm/mach_traps.h> diff --git a/arch/x86/kernel/tsc.c b/arch/x86/kernel/tsc.c index cfa5d4f7ca56..06ccb5073a3f 100644 --- a/arch/x86/kernel/tsc.c +++ b/arch/x86/kernel/tsc.c @@ -77,6 +77,12 @@ unsigned long long sched_clock(void) __attribute__((alias("native_sched_clock"))); #endif +unsigned long long native_read_tsc(void) +{ + return __native_read_tsc(); +} +EXPORT_SYMBOL(native_read_tsc); + int check_tsc_unstable(void) { return tsc_unstable; diff --git a/arch/x86/kernel/uprobes.c b/arch/x86/kernel/uprobes.c index aafa5557b396..c71025b67462 100644 --- a/arch/x86/kernel/uprobes.c +++ b/arch/x86/kernel/uprobes.c @@ -478,6 +478,11 @@ int arch_uprobe_pre_xol(struct arch_uprobe *auprobe, struct pt_regs *regs) regs->ip = current->utask->xol_vaddr; pre_xol_rip_insn(auprobe, regs, autask); + autask->saved_tf = !!(regs->flags & X86_EFLAGS_TF); + regs->flags |= X86_EFLAGS_TF; + if (test_tsk_thread_flag(current, TIF_BLOCKSTEP)) + set_task_blockstep(current, false); + return 0; } @@ -603,6 +608,16 @@ int arch_uprobe_post_xol(struct arch_uprobe *auprobe, struct pt_regs *regs) if (auprobe->fixups & UPROBE_FIX_CALL) result = adjust_ret_addr(regs->sp, correction); + /* + * arch_uprobe_pre_xol() doesn't save the state of TIF_BLOCKSTEP + * so we can get an extra SIGTRAP if we do not clear TF. We need + * to examine the opcode to make it right. + */ + if (utask->autask.saved_tf) + send_sig(SIGTRAP, current, 0); + else if (!(auprobe->fixups & UPROBE_FIX_SETF)) + regs->flags &= ~X86_EFLAGS_TF; + return result; } @@ -647,6 +662,10 @@ void arch_uprobe_abort_xol(struct arch_uprobe *auprobe, struct pt_regs *regs) current->thread.trap_nr = utask->autask.saved_trap_nr; handle_riprel_post_xol(auprobe, regs, NULL); instruction_pointer_set(regs, utask->vaddr); + + /* clear TF if it was set by us in arch_uprobe_pre_xol() */ + if (!utask->autask.saved_tf) + regs->flags &= ~X86_EFLAGS_TF; } /* @@ -676,38 +695,3 @@ bool arch_uprobe_skip_sstep(struct arch_uprobe *auprobe, struct pt_regs *regs) send_sig(SIGTRAP, current, 0); return ret; } - -void arch_uprobe_enable_step(struct arch_uprobe *auprobe) -{ - struct task_struct *task = current; - struct arch_uprobe_task *autask = &task->utask->autask; - struct pt_regs *regs = task_pt_regs(task); - - autask->saved_tf = !!(regs->flags & X86_EFLAGS_TF); - - regs->flags |= X86_EFLAGS_TF; - if (test_tsk_thread_flag(task, TIF_BLOCKSTEP)) - set_task_blockstep(task, false); -} - -void arch_uprobe_disable_step(struct arch_uprobe *auprobe) -{ - struct task_struct *task = current; - struct arch_uprobe_task *autask = &task->utask->autask; - bool trapped = (task->utask->state == UTASK_SSTEP_TRAPPED); - struct pt_regs *regs = task_pt_regs(task); - /* - * The state of TIF_BLOCKSTEP was not saved so we can get an extra - * SIGTRAP if we do not clear TF. We need to examine the opcode to - * make it right. 
- */ - if (unlikely(trapped)) { - if (!autask->saved_tf) - regs->flags &= ~X86_EFLAGS_TF; - } else { - if (autask->saved_tf) - send_sig(SIGTRAP, task, 0); - else if (!(auprobe->fixups & UPROBE_FIX_SETF)) - regs->flags &= ~X86_EFLAGS_TF; - } -} diff --git a/arch/x86/kernel/vm86_32.c b/arch/x86/kernel/vm86_32.c index 5c9687b1bde6..1dfe69cc78a8 100644 --- a/arch/x86/kernel/vm86_32.c +++ b/arch/x86/kernel/vm86_32.c @@ -182,7 +182,7 @@ static void mark_screen_rdonly(struct mm_struct *mm) if (pud_none_or_clear_bad(pud)) goto out; pmd = pmd_offset(pud, 0xA0000); - split_huge_page_pmd(mm, pmd); + split_huge_page_pmd_mm(mm, 0xA0000, pmd); if (pmd_none_or_clear_bad(pmd)) goto out; pte = pte_offset_map_lock(mm, pmd, 0xA0000, &ptl); diff --git a/arch/x86/kvm/cpuid.c b/arch/x86/kvm/cpuid.c index ec79e773342e..a20ecb5b6cbf 100644 --- a/arch/x86/kvm/cpuid.c +++ b/arch/x86/kvm/cpuid.c @@ -320,6 +320,8 @@ static int do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function, if (index == 0) { entry->ebx &= kvm_supported_word9_x86_features; cpuid_mask(&entry->ebx, 9); + // TSC_ADJUST is emulated + entry->ebx |= F(TSC_ADJUST); } else entry->ebx = 0; entry->eax = 0; @@ -659,6 +661,7 @@ void kvm_cpuid(struct kvm_vcpu *vcpu, u32 *eax, u32 *ebx, u32 *ecx, u32 *edx) } else *eax = *ebx = *ecx = *edx = 0; } +EXPORT_SYMBOL_GPL(kvm_cpuid); void kvm_emulate_cpuid(struct kvm_vcpu *vcpu) { diff --git a/arch/x86/kvm/cpuid.h b/arch/x86/kvm/cpuid.h index a10e46016851..b7fd07984888 100644 --- a/arch/x86/kvm/cpuid.h +++ b/arch/x86/kvm/cpuid.h @@ -24,10 +24,21 @@ static inline bool guest_cpuid_has_xsave(struct kvm_vcpu *vcpu) { struct kvm_cpuid_entry2 *best; + if (!static_cpu_has(X86_FEATURE_XSAVE)) + return 0; + best = kvm_find_cpuid_entry(vcpu, 1, 0); return best && (best->ecx & bit(X86_FEATURE_XSAVE)); } +static inline bool guest_cpuid_has_tsc_adjust(struct kvm_vcpu *vcpu) +{ + struct kvm_cpuid_entry2 *best; + + best = kvm_find_cpuid_entry(vcpu, 7, 0); + return best && (best->ebx & bit(X86_FEATURE_TSC_ADJUST)); +} + static inline bool guest_cpuid_has_smep(struct kvm_vcpu *vcpu) { struct kvm_cpuid_entry2 *best; diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c index 39171cb307ea..a27e76371108 100644 --- a/arch/x86/kvm/emulate.c +++ b/arch/x86/kvm/emulate.c @@ -426,8 +426,7 @@ static void invalidate_registers(struct x86_emulate_ctxt *ctxt) _ASM_EXTABLE(1b, 3b) \ : "=m" ((ctxt)->eflags), "=&r" (_tmp), \ "+a" (*rax), "+d" (*rdx), "+qm"(_ex) \ - : "i" (EFLAGS_MASK), "m" ((ctxt)->src.val), \ - "a" (*rax), "d" (*rdx)); \ + : "i" (EFLAGS_MASK), "m" ((ctxt)->src.val)); \ } while (0) /* instruction has only one source operand, destination is implicit (e.g. 
mul, div, imul, idiv) */ @@ -677,8 +676,9 @@ static int __linearize(struct x86_emulate_ctxt *ctxt, addr.seg); if (!usable) goto bad; - /* code segment or read-only data segment */ - if (((desc.type & 8) || !(desc.type & 2)) && write) + /* code segment in protected mode or read-only data segment */ + if ((((ctxt->mode != X86EMUL_MODE_REAL) && (desc.type & 8)) + || !(desc.type & 2)) && write) goto bad; /* unreadable code segment */ if (!fetch && (desc.type & 8) && !(desc.type & 2)) diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c index 43e9fadca5d0..9392f527f107 100644 --- a/arch/x86/kvm/lapic.c +++ b/arch/x86/kvm/lapic.c @@ -1011,7 +1011,7 @@ static void start_apic_timer(struct kvm_lapic *apic) local_irq_save(flags); now = apic->lapic_timer.timer.base->get_time(); - guest_tsc = kvm_x86_ops->read_l1_tsc(vcpu); + guest_tsc = kvm_x86_ops->read_l1_tsc(vcpu, native_read_tsc()); if (likely(tscdeadline > guest_tsc)) { ns = (tscdeadline - guest_tsc) * 1000000ULL; do_div(ns, this_tsc_khz); diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index 6f85fe0bf958..01d7c2ad05f5 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -2382,12 +2382,20 @@ static int set_spte(struct kvm_vcpu *vcpu, u64 *sptep, || (!vcpu->arch.mmu.direct_map && write_fault && !is_write_protection(vcpu) && !user_fault)) { + /* + * There are two cases: + * - the one is other vcpu creates new sp in the window + * between mapping_level() and acquiring mmu-lock. + * - the another case is the new sp is created by itself + * (page-fault path) when guest uses the target gfn as + * its page table. + * Both of these cases can be fixed by allowing guest to + * retry the access, it will refault, then we can establish + * the mapping by using small page. + */ if (level > PT_PAGE_TABLE_LEVEL && - has_wrprotected_page(vcpu->kvm, gfn, level)) { - ret = 1; - drop_spte(vcpu->kvm, sptep); + has_wrprotected_page(vcpu->kvm, gfn, level)) goto done; - } spte |= PT_WRITABLE_MASK | SPTE_MMU_WRITEABLE; @@ -2505,6 +2513,14 @@ static void nonpaging_new_cr3(struct kvm_vcpu *vcpu) mmu_free_roots(vcpu); } +static bool is_rsvd_bits_set(struct kvm_mmu *mmu, u64 gpte, int level) +{ + int bit7; + + bit7 = (gpte >> 7) & 1; + return (gpte & mmu->rsvd_bits_mask[bit7][level-1]) != 0; +} + static pfn_t pte_prefetch_gfn_to_pfn(struct kvm_vcpu *vcpu, gfn_t gfn, bool no_dirty_log) { @@ -2517,6 +2533,26 @@ static pfn_t pte_prefetch_gfn_to_pfn(struct kvm_vcpu *vcpu, gfn_t gfn, return gfn_to_pfn_memslot_atomic(slot, gfn); } +static bool prefetch_invalid_gpte(struct kvm_vcpu *vcpu, + struct kvm_mmu_page *sp, u64 *spte, + u64 gpte) +{ + if (is_rsvd_bits_set(&vcpu->arch.mmu, gpte, PT_PAGE_TABLE_LEVEL)) + goto no_present; + + if (!is_present_gpte(gpte)) + goto no_present; + + if (!(gpte & PT_ACCESSED_MASK)) + goto no_present; + + return false; + +no_present: + drop_spte(vcpu->kvm, spte); + return true; +} + static int direct_pte_prefetch_many(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp, u64 *start, u64 *end) @@ -2671,7 +2707,7 @@ static void transparent_hugepage_adjust(struct kvm_vcpu *vcpu, * PT_PAGE_TABLE_LEVEL and there would be no adjustment done * here. 
*/ - if (!is_error_pfn(pfn) && !kvm_is_mmio_pfn(pfn) && + if (!is_error_noslot_pfn(pfn) && !kvm_is_mmio_pfn(pfn) && level == PT_PAGE_TABLE_LEVEL && PageTransCompound(pfn_to_page(pfn)) && !has_wrprotected_page(vcpu->kvm, gfn, PT_DIRECTORY_LEVEL)) { @@ -2699,18 +2735,13 @@ static void transparent_hugepage_adjust(struct kvm_vcpu *vcpu, } } -static bool mmu_invalid_pfn(pfn_t pfn) -{ - return unlikely(is_invalid_pfn(pfn)); -} - static bool handle_abnormal_pfn(struct kvm_vcpu *vcpu, gva_t gva, gfn_t gfn, pfn_t pfn, unsigned access, int *ret_val) { bool ret = true; /* The pfn is invalid, report the error! */ - if (unlikely(is_invalid_pfn(pfn))) { + if (unlikely(is_error_pfn(pfn))) { *ret_val = kvm_handle_bad_page(vcpu, gfn, pfn); goto exit; } @@ -2862,7 +2893,7 @@ static int nonpaging_map(struct kvm_vcpu *vcpu, gva_t v, u32 error_code, return r; spin_lock(&vcpu->kvm->mmu_lock); - if (mmu_notifier_retry(vcpu, mmu_seq)) + if (mmu_notifier_retry(vcpu->kvm, mmu_seq)) goto out_unlock; kvm_mmu_free_some_pages(vcpu); if (likely(!force_pt_level)) @@ -3331,7 +3362,7 @@ static int tdp_page_fault(struct kvm_vcpu *vcpu, gva_t gpa, u32 error_code, return r; spin_lock(&vcpu->kvm->mmu_lock); - if (mmu_notifier_retry(vcpu, mmu_seq)) + if (mmu_notifier_retry(vcpu->kvm, mmu_seq)) goto out_unlock; kvm_mmu_free_some_pages(vcpu); if (likely(!force_pt_level)) @@ -3399,14 +3430,6 @@ static void paging_free(struct kvm_vcpu *vcpu) nonpaging_free(vcpu); } -static bool is_rsvd_bits_set(struct kvm_mmu *mmu, u64 gpte, int level) -{ - int bit7; - - bit7 = (gpte >> 7) & 1; - return (gpte & mmu->rsvd_bits_mask[bit7][level-1]) != 0; -} - static inline void protect_clean_gpte(unsigned *access, unsigned gpte) { unsigned mask; diff --git a/arch/x86/kvm/paging_tmpl.h b/arch/x86/kvm/paging_tmpl.h index 714e2c01a6fe..891eb6d93b8b 100644 --- a/arch/x86/kvm/paging_tmpl.h +++ b/arch/x86/kvm/paging_tmpl.h @@ -305,51 +305,43 @@ static int FNAME(walk_addr_nested)(struct guest_walker *walker, addr, access); } -static bool FNAME(prefetch_invalid_gpte)(struct kvm_vcpu *vcpu, - struct kvm_mmu_page *sp, u64 *spte, - pt_element_t gpte) +static bool +FNAME(prefetch_gpte)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp, + u64 *spte, pt_element_t gpte, bool no_dirty_log) { - if (is_rsvd_bits_set(&vcpu->arch.mmu, gpte, PT_PAGE_TABLE_LEVEL)) - goto no_present; - - if (!is_present_gpte(gpte)) - goto no_present; - - if (!(gpte & PT_ACCESSED_MASK)) - goto no_present; - - return false; - -no_present: - drop_spte(vcpu->kvm, spte); - return true; -} - -static void FNAME(update_pte)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp, - u64 *spte, const void *pte) -{ - pt_element_t gpte; unsigned pte_access; + gfn_t gfn; pfn_t pfn; - gpte = *(const pt_element_t *)pte; - if (FNAME(prefetch_invalid_gpte)(vcpu, sp, spte, gpte)) - return; + if (prefetch_invalid_gpte(vcpu, sp, spte, gpte)) + return false; pgprintk("%s: gpte %llx spte %p\n", __func__, (u64)gpte, spte); + + gfn = gpte_to_gfn(gpte); pte_access = sp->role.access & gpte_access(vcpu, gpte); protect_clean_gpte(&pte_access, gpte); - pfn = gfn_to_pfn_atomic(vcpu->kvm, gpte_to_gfn(gpte)); - if (mmu_invalid_pfn(pfn)) - return; + pfn = pte_prefetch_gfn_to_pfn(vcpu, gfn, + no_dirty_log && (pte_access & ACC_WRITE_MASK)); + if (is_error_pfn(pfn)) + return false; /* - * we call mmu_set_spte() with host_writable = true because that - * vcpu->arch.update_pte.pfn was fetched from get_user_pages(write = 1). + * we call mmu_set_spte() with host_writable = true because + * pte_prefetch_gfn_to_pfn always gets a writable pfn. 
*/ mmu_set_spte(vcpu, spte, sp->role.access, pte_access, 0, 0, - NULL, PT_PAGE_TABLE_LEVEL, - gpte_to_gfn(gpte), pfn, true, true); + NULL, PT_PAGE_TABLE_LEVEL, gfn, pfn, true, true); + + return true; +} + +static void FNAME(update_pte)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp, + u64 *spte, const void *pte) +{ + pt_element_t gpte = *(const pt_element_t *)pte; + + FNAME(prefetch_gpte)(vcpu, sp, spte, gpte, false); } static bool FNAME(gpte_changed)(struct kvm_vcpu *vcpu, @@ -395,53 +387,34 @@ static void FNAME(pte_prefetch)(struct kvm_vcpu *vcpu, struct guest_walker *gw, spte = sp->spt + i; for (i = 0; i < PTE_PREFETCH_NUM; i++, spte++) { - pt_element_t gpte; - unsigned pte_access; - gfn_t gfn; - pfn_t pfn; - if (spte == sptep) continue; if (is_shadow_present_pte(*spte)) continue; - gpte = gptep[i]; - - if (FNAME(prefetch_invalid_gpte)(vcpu, sp, spte, gpte)) - continue; - - pte_access = sp->role.access & gpte_access(vcpu, gpte); - protect_clean_gpte(&pte_access, gpte); - gfn = gpte_to_gfn(gpte); - pfn = pte_prefetch_gfn_to_pfn(vcpu, gfn, - pte_access & ACC_WRITE_MASK); - if (mmu_invalid_pfn(pfn)) + if (!FNAME(prefetch_gpte)(vcpu, sp, spte, gptep[i], true)) break; - - mmu_set_spte(vcpu, spte, sp->role.access, pte_access, 0, 0, - NULL, PT_PAGE_TABLE_LEVEL, gfn, - pfn, true, true); } } /* * Fetch a shadow pte for a specific level in the paging hierarchy. + * If the guest tries to write a write-protected page, we need to + * emulate this operation, return 1 to indicate this case. */ -static u64 *FNAME(fetch)(struct kvm_vcpu *vcpu, gva_t addr, +static int FNAME(fetch)(struct kvm_vcpu *vcpu, gva_t addr, struct guest_walker *gw, int user_fault, int write_fault, int hlevel, - int *emulate, pfn_t pfn, bool map_writable, - bool prefault) + pfn_t pfn, bool map_writable, bool prefault) { - unsigned access = gw->pt_access; struct kvm_mmu_page *sp = NULL; - int top_level; - unsigned direct_access; struct kvm_shadow_walk_iterator it; + unsigned direct_access, access = gw->pt_access; + int top_level, emulate = 0; if (!is_present_gpte(gw->ptes[gw->level - 1])) - return NULL; + return 0; direct_access = gw->pte_access; @@ -505,17 +478,17 @@ static u64 *FNAME(fetch)(struct kvm_vcpu *vcpu, gva_t addr, clear_sp_write_flooding_count(it.sptep); mmu_set_spte(vcpu, it.sptep, access, gw->pte_access, - user_fault, write_fault, emulate, it.level, + user_fault, write_fault, &emulate, it.level, gw->gfn, pfn, prefault, map_writable); FNAME(pte_prefetch)(vcpu, gw, it.sptep); - return it.sptep; + return emulate; out_gpte_changed: if (sp) kvm_mmu_put_page(sp, it.sptep); kvm_release_pfn_clean(pfn); - return NULL; + return 0; } /* @@ -538,8 +511,6 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gva_t addr, u32 error_code, int write_fault = error_code & PFERR_WRITE_MASK; int user_fault = error_code & PFERR_USER_MASK; struct guest_walker walker; - u64 *sptep; - int emulate = 0; int r; pfn_t pfn; int level = PT_PAGE_TABLE_LEVEL; @@ -594,24 +565,20 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gva_t addr, u32 error_code, return r; spin_lock(&vcpu->kvm->mmu_lock); - if (mmu_notifier_retry(vcpu, mmu_seq)) + if (mmu_notifier_retry(vcpu->kvm, mmu_seq)) goto out_unlock; kvm_mmu_audit(vcpu, AUDIT_PRE_PAGE_FAULT); kvm_mmu_free_some_pages(vcpu); if (!force_pt_level) transparent_hugepage_adjust(vcpu, &walker.gfn, &pfn, &level); - sptep = FNAME(fetch)(vcpu, addr, &walker, user_fault, write_fault, - level, &emulate, pfn, map_writable, prefault); - (void)sptep; - pgprintk("%s: shadow pte %p %llx emulate %d\n", __func__, - 
sptep, *sptep, emulate); - + r = FNAME(fetch)(vcpu, addr, &walker, user_fault, write_fault, + level, pfn, map_writable, prefault); ++vcpu->stat.pf_fixed; kvm_mmu_audit(vcpu, AUDIT_POST_PAGE_FAULT); spin_unlock(&vcpu->kvm->mmu_lock); - return emulate; + return r; out_unlock: spin_unlock(&vcpu->kvm->mmu_lock); @@ -757,7 +724,7 @@ static int FNAME(sync_page)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp) sizeof(pt_element_t))) return -EINVAL; - if (FNAME(prefetch_invalid_gpte)(vcpu, sp, &sp->spt[i], gpte)) { + if (prefetch_invalid_gpte(vcpu, sp, &sp->spt[i], gpte)) { vcpu->kvm->tlbs_dirty++; continue; } diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c index d017df3899ef..d29d3cd1c156 100644 --- a/arch/x86/kvm/svm.c +++ b/arch/x86/kvm/svm.c @@ -20,6 +20,7 @@ #include "mmu.h" #include "kvm_cache_regs.h" #include "x86.h" +#include "cpuid.h" #include <linux/module.h> #include <linux/mod_devicetable.h> @@ -630,15 +631,12 @@ static int svm_hardware_enable(void *garbage) return -EBUSY; if (!has_svm()) { - printk(KERN_ERR "svm_hardware_enable: err EOPNOTSUPP on %d\n", - me); + pr_err("%s: err EOPNOTSUPP on %d\n", __func__, me); return -EINVAL; } sd = per_cpu(svm_data, me); - if (!sd) { - printk(KERN_ERR "svm_hardware_enable: svm_data is NULL on %d\n", - me); + pr_err("%s: svm_data is NULL on %d\n", __func__, me); return -EINVAL; } @@ -1012,6 +1010,13 @@ static void svm_set_tsc_khz(struct kvm_vcpu *vcpu, u32 user_tsc_khz, bool scale) svm->tsc_ratio = ratio; } +static u64 svm_read_tsc_offset(struct kvm_vcpu *vcpu) +{ + struct vcpu_svm *svm = to_svm(vcpu); + + return svm->vmcb->control.tsc_offset; +} + static void svm_write_tsc_offset(struct kvm_vcpu *vcpu, u64 offset) { struct vcpu_svm *svm = to_svm(vcpu); @@ -1189,6 +1194,8 @@ static void init_vmcb(struct vcpu_svm *svm) static int svm_vcpu_reset(struct kvm_vcpu *vcpu) { struct vcpu_svm *svm = to_svm(vcpu); + u32 dummy; + u32 eax = 1; init_vmcb(svm); @@ -1197,8 +1204,9 @@ static int svm_vcpu_reset(struct kvm_vcpu *vcpu) svm->vmcb->save.cs.base = svm->vcpu.arch.sipi_vector << 12; svm->vmcb->save.cs.selector = svm->vcpu.arch.sipi_vector << 8; } - vcpu->arch.regs_avail = ~0; - vcpu->arch.regs_dirty = ~0; + + kvm_cpuid(vcpu, &eax, &dummy, &dummy, &dummy); + kvm_register_write(vcpu, VCPU_REGS_RDX, eax); return 0; } @@ -1254,11 +1262,6 @@ static struct kvm_vcpu *svm_create_vcpu(struct kvm *kvm, unsigned int id) svm->vmcb_pa = page_to_pfn(page) << PAGE_SHIFT; svm->asid_generation = 0; init_vmcb(svm); - kvm_write_tsc(&svm->vcpu, 0); - - err = fx_init(&svm->vcpu); - if (err) - goto free_page4; svm->vcpu.arch.apic_base = 0xfee00000 | MSR_IA32_APICBASE_ENABLE; if (kvm_vcpu_is_bsp(&svm->vcpu)) @@ -1268,8 +1271,6 @@ static struct kvm_vcpu *svm_create_vcpu(struct kvm *kvm, unsigned int id) return &svm->vcpu; -free_page4: - __free_page(hsave_page); free_page3: __free_pages(nested_msrpm_pages, MSRPM_ALLOC_ORDER); free_page2: @@ -3008,11 +3009,11 @@ static int cr8_write_interception(struct vcpu_svm *svm) return 0; } -u64 svm_read_l1_tsc(struct kvm_vcpu *vcpu) +u64 svm_read_l1_tsc(struct kvm_vcpu *vcpu, u64 host_tsc) { struct vmcb *vmcb = get_host_vmcb(to_svm(vcpu)); return vmcb->control.tsc_offset + - svm_scale_tsc(vcpu, native_read_tsc()); + svm_scale_tsc(vcpu, host_tsc); } static int svm_get_msr(struct kvm_vcpu *vcpu, unsigned ecx, u64 *data) @@ -3131,13 +3132,15 @@ static int svm_set_vm_cr(struct kvm_vcpu *vcpu, u64 data) return 0; } -static int svm_set_msr(struct kvm_vcpu *vcpu, unsigned ecx, u64 data) +static int svm_set_msr(struct kvm_vcpu *vcpu, struct 
msr_data *msr) { struct vcpu_svm *svm = to_svm(vcpu); + u32 ecx = msr->index; + u64 data = msr->data; switch (ecx) { case MSR_IA32_TSC: - kvm_write_tsc(vcpu, data); + kvm_write_tsc(vcpu, msr); break; case MSR_STAR: svm->vmcb->save.star = data; @@ -3192,20 +3195,24 @@ static int svm_set_msr(struct kvm_vcpu *vcpu, unsigned ecx, u64 data) vcpu_unimpl(vcpu, "unimplemented wrmsr: 0x%x data 0x%llx\n", ecx, data); break; default: - return kvm_set_msr_common(vcpu, ecx, data); + return kvm_set_msr_common(vcpu, msr); } return 0; } static int wrmsr_interception(struct vcpu_svm *svm) { + struct msr_data msr; u32 ecx = svm->vcpu.arch.regs[VCPU_REGS_RCX]; u64 data = (svm->vcpu.arch.regs[VCPU_REGS_RAX] & -1u) | ((u64)(svm->vcpu.arch.regs[VCPU_REGS_RDX] & -1u) << 32); + msr.data = data; + msr.index = ecx; + msr.host_initiated = false; svm->next_rip = kvm_rip_read(&svm->vcpu) + 2; - if (svm_set_msr(&svm->vcpu, ecx, data)) { + if (svm_set_msr(&svm->vcpu, &msr)) { trace_kvm_msr_write_ex(ecx, data); kvm_inject_gp(&svm->vcpu, 0); } else { @@ -4302,6 +4309,7 @@ static struct kvm_x86_ops svm_x86_ops = { .has_wbinvd_exit = svm_has_wbinvd_exit, .set_tsc_khz = svm_set_tsc_khz, + .read_tsc_offset = svm_read_tsc_offset, .write_tsc_offset = svm_write_tsc_offset, .adjust_tsc_offset = svm_adjust_tsc_offset, .compute_tsc_offset = svm_compute_tsc_offset, diff --git a/arch/x86/kvm/trace.h b/arch/x86/kvm/trace.h index bca63f04dccb..fe5e00ed7036 100644 --- a/arch/x86/kvm/trace.h +++ b/arch/x86/kvm/trace.h @@ -4,6 +4,7 @@ #include <linux/tracepoint.h> #include <asm/vmx.h> #include <asm/svm.h> +#include <asm/clocksource.h> #undef TRACE_SYSTEM #define TRACE_SYSTEM kvm @@ -754,6 +755,68 @@ TRACE_EVENT( __entry->write ? "Write" : "Read", __entry->gpa_match ? "GPA" : "GVA") ); + +#ifdef CONFIG_X86_64 + +#define host_clocks \ + {VCLOCK_NONE, "none"}, \ + {VCLOCK_TSC, "tsc"}, \ + {VCLOCK_HPET, "hpet"} \ + +TRACE_EVENT(kvm_update_master_clock, + TP_PROTO(bool use_master_clock, unsigned int host_clock, bool offset_matched), + TP_ARGS(use_master_clock, host_clock, offset_matched), + + TP_STRUCT__entry( + __field( bool, use_master_clock ) + __field( unsigned int, host_clock ) + __field( bool, offset_matched ) + ), + + TP_fast_assign( + __entry->use_master_clock = use_master_clock; + __entry->host_clock = host_clock; + __entry->offset_matched = offset_matched; + ), + + TP_printk("masterclock %d hostclock %s offsetmatched %u", + __entry->use_master_clock, + __print_symbolic(__entry->host_clock, host_clocks), + __entry->offset_matched) +); + +TRACE_EVENT(kvm_track_tsc, + TP_PROTO(unsigned int vcpu_id, unsigned int nr_matched, + unsigned int online_vcpus, bool use_master_clock, + unsigned int host_clock), + TP_ARGS(vcpu_id, nr_matched, online_vcpus, use_master_clock, + host_clock), + + TP_STRUCT__entry( + __field( unsigned int, vcpu_id ) + __field( unsigned int, nr_vcpus_matched_tsc ) + __field( unsigned int, online_vcpus ) + __field( bool, use_master_clock ) + __field( unsigned int, host_clock ) + ), + + TP_fast_assign( + __entry->vcpu_id = vcpu_id; + __entry->nr_vcpus_matched_tsc = nr_matched; + __entry->online_vcpus = online_vcpus; + __entry->use_master_clock = use_master_clock; + __entry->host_clock = host_clock; + ), + + TP_printk("vcpu_id %u masterclock %u offsetmatched %u nr_online %u" + " hostclock %s", + __entry->vcpu_id, __entry->use_master_clock, + __entry->nr_vcpus_matched_tsc, __entry->online_vcpus, + __print_symbolic(__entry->host_clock, host_clocks)) +); + +#endif /* CONFIG_X86_64 */ + #endif /* _TRACE_KVM_H */ #undef 
TRACE_INCLUDE_PATH diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index ad6b1dd06f8b..9120ae1901e4 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -42,6 +42,7 @@ #include <asm/i387.h> #include <asm/xcr.h> #include <asm/perf_event.h> +#include <asm/kexec.h> #include "trace.h" @@ -802,11 +803,6 @@ static inline bool cpu_has_vmx_ept_ad_bits(void) return vmx_capability.ept & VMX_EPT_AD_BIT; } -static inline bool cpu_has_vmx_invept_individual_addr(void) -{ - return vmx_capability.ept & VMX_EPT_EXTENT_INDIVIDUAL_BIT; -} - static inline bool cpu_has_vmx_invept_context(void) { return vmx_capability.ept & VMX_EPT_EXTENT_CONTEXT_BIT; @@ -992,6 +988,46 @@ static void vmcs_load(struct vmcs *vmcs) vmcs, phys_addr); } +#ifdef CONFIG_KEXEC +/* + * This bitmap is used to indicate whether the vmclear + * operation is enabled on all cpus. All disabled by + * default. + */ +static cpumask_t crash_vmclear_enabled_bitmap = CPU_MASK_NONE; + +static inline void crash_enable_local_vmclear(int cpu) +{ + cpumask_set_cpu(cpu, &crash_vmclear_enabled_bitmap); +} + +static inline void crash_disable_local_vmclear(int cpu) +{ + cpumask_clear_cpu(cpu, &crash_vmclear_enabled_bitmap); +} + +static inline int crash_local_vmclear_enabled(int cpu) +{ + return cpumask_test_cpu(cpu, &crash_vmclear_enabled_bitmap); +} + +static void crash_vmclear_local_loaded_vmcss(void) +{ + int cpu = raw_smp_processor_id(); + struct loaded_vmcs *v; + + if (!crash_local_vmclear_enabled(cpu)) + return; + + list_for_each_entry(v, &per_cpu(loaded_vmcss_on_cpu, cpu), + loaded_vmcss_on_cpu_link) + vmcs_clear(v->vmcs); +} +#else +static inline void crash_enable_local_vmclear(int cpu) { } +static inline void crash_disable_local_vmclear(int cpu) { } +#endif /* CONFIG_KEXEC */ + static void __loaded_vmcs_clear(void *arg) { struct loaded_vmcs *loaded_vmcs = arg; @@ -1001,15 +1037,28 @@ static void __loaded_vmcs_clear(void *arg) return; /* vcpu migration can race with cpu offline */ if (per_cpu(current_vmcs, cpu) == loaded_vmcs->vmcs) per_cpu(current_vmcs, cpu) = NULL; + crash_disable_local_vmclear(cpu); list_del(&loaded_vmcs->loaded_vmcss_on_cpu_link); + + /* + * we should ensure updating loaded_vmcs->loaded_vmcss_on_cpu_link + * is before setting loaded_vmcs->vcpu to -1 which is done in + * loaded_vmcs_init. Otherwise, other cpu can see vcpu = -1 fist + * then adds the vmcs into percpu list before it is deleted. + */ + smp_wmb(); + loaded_vmcs_init(loaded_vmcs); + crash_enable_local_vmclear(cpu); } static void loaded_vmcs_clear(struct loaded_vmcs *loaded_vmcs) { - if (loaded_vmcs->cpu != -1) - smp_call_function_single( - loaded_vmcs->cpu, __loaded_vmcs_clear, loaded_vmcs, 1); + int cpu = loaded_vmcs->cpu; + + if (cpu != -1) + smp_call_function_single(cpu, + __loaded_vmcs_clear, loaded_vmcs, 1); } static inline void vpid_sync_vcpu_single(struct vcpu_vmx *vmx) @@ -1051,17 +1100,6 @@ static inline void ept_sync_context(u64 eptp) } } -static inline void ept_sync_individual_addr(u64 eptp, gpa_t gpa) -{ - if (enable_ept) { - if (cpu_has_vmx_invept_individual_addr()) - __invept(VMX_EPT_EXTENT_INDIVIDUAL_ADDR, - eptp, gpa); - else - ept_sync_context(eptp); - } -} - static __always_inline unsigned long vmcs_readl(unsigned long field) { unsigned long value; @@ -1535,8 +1573,18 @@ static void vmx_vcpu_load(struct kvm_vcpu *vcpu, int cpu) kvm_make_request(KVM_REQ_TLB_FLUSH, vcpu); local_irq_disable(); + crash_disable_local_vmclear(cpu); + + /* + * Read loaded_vmcs->cpu should be before fetching + * loaded_vmcs->loaded_vmcss_on_cpu_link. 
+ * See the comments in __loaded_vmcs_clear(). + */ + smp_rmb(); + list_add(&vmx->loaded_vmcs->loaded_vmcss_on_cpu_link, &per_cpu(loaded_vmcss_on_cpu, cpu)); + crash_enable_local_vmclear(cpu); local_irq_enable(); /* @@ -1839,11 +1887,10 @@ static u64 guest_read_tsc(void) * Like guest_read_tsc, but always returns L1's notion of the timestamp * counter, even if a nested guest (L2) is currently running. */ -u64 vmx_read_l1_tsc(struct kvm_vcpu *vcpu) +u64 vmx_read_l1_tsc(struct kvm_vcpu *vcpu, u64 host_tsc) { - u64 host_tsc, tsc_offset; + u64 tsc_offset; - rdtscll(host_tsc); tsc_offset = is_guest_mode(vcpu) ? to_vmx(vcpu)->nested.vmcs01_tsc_offset : vmcs_read64(TSC_OFFSET); @@ -1866,6 +1913,11 @@ static void vmx_set_tsc_khz(struct kvm_vcpu *vcpu, u32 user_tsc_khz, bool scale) WARN(1, "user requested TSC rate below hardware speed\n"); } +static u64 vmx_read_tsc_offset(struct kvm_vcpu *vcpu) +{ + return vmcs_read64(TSC_OFFSET); +} + /* * writes 'offset' into guest's timestamp counter offset register */ @@ -2202,15 +2254,17 @@ static int vmx_get_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 *pdata) * Returns 0 on success, non-0 otherwise. * Assumes vcpu_load() was already called. */ -static int vmx_set_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 data) +static int vmx_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info) { struct vcpu_vmx *vmx = to_vmx(vcpu); struct shared_msr_entry *msr; int ret = 0; + u32 msr_index = msr_info->index; + u64 data = msr_info->data; switch (msr_index) { case MSR_EFER: - ret = kvm_set_msr_common(vcpu, msr_index, data); + ret = kvm_set_msr_common(vcpu, msr_info); break; #ifdef CONFIG_X86_64 case MSR_FS_BASE: @@ -2236,7 +2290,7 @@ static int vmx_set_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 data) vmcs_writel(GUEST_SYSENTER_ESP, data); break; case MSR_IA32_TSC: - kvm_write_tsc(vcpu, data); + kvm_write_tsc(vcpu, msr_info); break; case MSR_IA32_CR_PAT: if (vmcs_config.vmentry_ctrl & VM_ENTRY_LOAD_IA32_PAT) { @@ -2244,7 +2298,10 @@ static int vmx_set_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 data) vcpu->arch.pat = data; break; } - ret = kvm_set_msr_common(vcpu, msr_index, data); + ret = kvm_set_msr_common(vcpu, msr_info); + break; + case MSR_IA32_TSC_ADJUST: + ret = kvm_set_msr_common(vcpu, msr_info); break; case MSR_TSC_AUX: if (!vmx->rdtscp_enabled) @@ -2267,7 +2324,7 @@ static int vmx_set_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 data) } break; } - ret = kvm_set_msr_common(vcpu, msr_index, data); + ret = kvm_set_msr_common(vcpu, msr_info); } return ret; @@ -2341,6 +2398,18 @@ static int hardware_enable(void *garbage) return -EBUSY; INIT_LIST_HEAD(&per_cpu(loaded_vmcss_on_cpu, cpu)); + + /* + * Now we can enable the vmclear operation in kdump + * since the loaded_vmcss_on_cpu list on this cpu + * has been initialized. + * + * Though the cpu is not in VMX operation now, there + * is no problem to enable the vmclear operation + * for the loaded_vmcss_on_cpu list is empty! 
+ */ + crash_enable_local_vmclear(cpu); + rdmsrl(MSR_IA32_FEATURE_CONTROL, old); test_bits = FEATURE_CONTROL_LOCKED; @@ -2697,6 +2766,7 @@ static void fix_pmode_dataseg(struct kvm_vcpu *vcpu, int seg, struct kvm_segment if (!(vmcs_readl(sf->base) == tmp.base && tmp.s)) { tmp.base = vmcs_readl(sf->base); tmp.selector = vmcs_read16(sf->selector); + tmp.dpl = tmp.selector & SELECTOR_RPL_MASK; tmp.s = 1; } vmx_set_segment(vcpu, &tmp, seg); @@ -3246,7 +3316,7 @@ static void vmx_set_segment(struct kvm_vcpu *vcpu, * unrestricted guest like Westmere to older host that don't have * unrestricted guest like Nehelem. */ - if (!enable_unrestricted_guest && vmx->rmode.vm86_active) { + if (vmx->rmode.vm86_active) { switch (seg) { case VCPU_SREG_CS: vmcs_write32(GUEST_CS_AR_BYTES, 0xf3); @@ -3897,8 +3967,6 @@ static int vmx_vcpu_setup(struct vcpu_vmx *vmx) vmcs_writel(CR0_GUEST_HOST_MASK, ~0UL); set_cr4_guest_host_mask(vmx); - kvm_write_tsc(&vmx->vcpu, 0); - return 0; } @@ -3908,8 +3976,6 @@ static int vmx_vcpu_reset(struct kvm_vcpu *vcpu) u64 msr; int ret; - vcpu->arch.regs_avail = ~((1 << VCPU_REGS_RIP) | (1 << VCPU_REGS_RSP)); - vmx->rmode.vm86_active = 0; vmx->soft_vnmi_blocked = 0; @@ -3921,10 +3987,6 @@ static int vmx_vcpu_reset(struct kvm_vcpu *vcpu) msr |= MSR_IA32_APICBASE_BSP; kvm_set_apic_base(&vmx->vcpu, msr); - ret = fx_init(&vmx->vcpu); - if (ret != 0) - goto out; - vmx_segment_cache_clear(vmx); seg_setup(VCPU_SREG_CS); @@ -3965,7 +4027,6 @@ static int vmx_vcpu_reset(struct kvm_vcpu *vcpu) kvm_rip_write(vcpu, 0xfff0); else kvm_rip_write(vcpu, 0); - kvm_register_write(vcpu, VCPU_REGS_RSP, 0); vmcs_writel(GUEST_GDTR_BASE, 0); vmcs_write32(GUEST_GDTR_LIMIT, 0xffff); @@ -4015,7 +4076,6 @@ static int vmx_vcpu_reset(struct kvm_vcpu *vcpu) /* HACK: Don't enable emulation on guest boot/reset */ vmx->emulation_required = 0; -out: return ret; } @@ -4287,16 +4347,6 @@ static int handle_exception(struct kvm_vcpu *vcpu) if (is_machine_check(intr_info)) return handle_machine_check(vcpu); - if ((vect_info & VECTORING_INFO_VALID_MASK) && - !is_page_fault(intr_info)) { - vcpu->run->exit_reason = KVM_EXIT_INTERNAL_ERROR; - vcpu->run->internal.suberror = KVM_INTERNAL_ERROR_SIMUL_EX; - vcpu->run->internal.ndata = 2; - vcpu->run->internal.data[0] = vect_info; - vcpu->run->internal.data[1] = intr_info; - return 0; - } - if ((intr_info & INTR_INFO_INTR_TYPE_MASK) == INTR_TYPE_NMI_INTR) return 1; /* already handled by vmx_vcpu_run() */ @@ -4315,6 +4365,22 @@ static int handle_exception(struct kvm_vcpu *vcpu) error_code = 0; if (intr_info & INTR_INFO_DELIVER_CODE_MASK) error_code = vmcs_read32(VM_EXIT_INTR_ERROR_CODE); + + /* + * The #PF with PFEC.RSVD = 1 indicates the guest is accessing + * MMIO, it is better to report an internal error. + * See the comments in vmx_handle_exit. 
+ */ + if ((vect_info & VECTORING_INFO_VALID_MASK) && + !(is_page_fault(intr_info) && !(error_code & PFERR_RSVD_MASK))) { + vcpu->run->exit_reason = KVM_EXIT_INTERNAL_ERROR; + vcpu->run->internal.suberror = KVM_INTERNAL_ERROR_SIMUL_EX; + vcpu->run->internal.ndata = 2; + vcpu->run->internal.data[0] = vect_info; + vcpu->run->internal.data[1] = intr_info; + return 0; + } + if (is_page_fault(intr_info)) { /* EPT won't cause page fault directly */ BUG_ON(enable_ept); @@ -4626,11 +4692,15 @@ static int handle_rdmsr(struct kvm_vcpu *vcpu) static int handle_wrmsr(struct kvm_vcpu *vcpu) { + struct msr_data msr; u32 ecx = vcpu->arch.regs[VCPU_REGS_RCX]; u64 data = (vcpu->arch.regs[VCPU_REGS_RAX] & -1u) | ((u64)(vcpu->arch.regs[VCPU_REGS_RDX] & -1u) << 32); - if (vmx_set_msr(vcpu, ecx, data) != 0) { + msr.data = data; + msr.index = ecx; + msr.host_initiated = false; + if (vmx_set_msr(vcpu, &msr) != 0) { trace_kvm_msr_write_ex(ecx, data); kvm_inject_gp(vcpu, 0); return 1; @@ -4827,11 +4897,6 @@ static int handle_ept_violation(struct kvm_vcpu *vcpu) exit_qualification = vmcs_readl(EXIT_QUALIFICATION); - if (exit_qualification & (1 << 6)) { - printk(KERN_ERR "EPT: GPA exceeds GAW!\n"); - return -EINVAL; - } - gla_validity = (exit_qualification >> 7) & 0x3; if (gla_validity != 0x3 && gla_validity != 0x1 && gla_validity != 0) { printk(KERN_ERR "EPT: Handling EPT violation failed!\n"); @@ -5979,13 +6044,24 @@ static int vmx_handle_exit(struct kvm_vcpu *vcpu) return 0; } + /* + * Note: + * Do not try to fix EXIT_REASON_EPT_MISCONFIG if it caused by + * delivery event since it indicates guest is accessing MMIO. + * The vm-exit can be triggered again after return to guest that + * will cause infinite loop. + */ if ((vectoring_info & VECTORING_INFO_VALID_MASK) && (exit_reason != EXIT_REASON_EXCEPTION_NMI && exit_reason != EXIT_REASON_EPT_VIOLATION && - exit_reason != EXIT_REASON_TASK_SWITCH)) - printk(KERN_WARNING "%s: unexpected, valid vectoring info " - "(0x%x) and exit reason is 0x%x\n", - __func__, vectoring_info, exit_reason); + exit_reason != EXIT_REASON_TASK_SWITCH)) { + vcpu->run->exit_reason = KVM_EXIT_INTERNAL_ERROR; + vcpu->run->internal.suberror = KVM_INTERNAL_ERROR_DELIVERY_EV; + vcpu->run->internal.ndata = 2; + vcpu->run->internal.data[0] = vectoring_info; + vcpu->run->internal.data[1] = exit_reason; + return 0; + } if (unlikely(!cpu_has_virtual_nmis() && vmx->soft_vnmi_blocked && !(is_guest_mode(vcpu) && nested_cpu_has_virtual_nmis( @@ -6549,19 +6625,22 @@ static void vmx_cpuid_update(struct kvm_vcpu *vcpu) } } - exec_control = vmcs_read32(SECONDARY_VM_EXEC_CONTROL); /* Exposing INVPCID only when PCID is exposed */ best = kvm_find_cpuid_entry(vcpu, 0x7, 0); if (vmx_invpcid_supported() && best && (best->ebx & bit(X86_FEATURE_INVPCID)) && guest_cpuid_has_pcid(vcpu)) { + exec_control = vmcs_read32(SECONDARY_VM_EXEC_CONTROL); exec_control |= SECONDARY_EXEC_ENABLE_INVPCID; vmcs_write32(SECONDARY_VM_EXEC_CONTROL, exec_control); } else { - exec_control &= ~SECONDARY_EXEC_ENABLE_INVPCID; - vmcs_write32(SECONDARY_VM_EXEC_CONTROL, - exec_control); + if (cpu_has_secondary_exec_ctrls()) { + exec_control = vmcs_read32(SECONDARY_VM_EXEC_CONTROL); + exec_control &= ~SECONDARY_EXEC_ENABLE_INVPCID; + vmcs_write32(SECONDARY_VM_EXEC_CONTROL, + exec_control); + } if (best) best->ebx &= ~bit(X86_FEATURE_INVPCID); } @@ -7306,6 +7385,7 @@ static struct kvm_x86_ops vmx_x86_ops = { .has_wbinvd_exit = cpu_has_vmx_wbinvd_exit, .set_tsc_khz = vmx_set_tsc_khz, + .read_tsc_offset = vmx_read_tsc_offset, .write_tsc_offset = 
vmx_write_tsc_offset, .adjust_tsc_offset = vmx_adjust_tsc_offset, .compute_tsc_offset = vmx_compute_tsc_offset, @@ -7364,6 +7444,11 @@ static int __init vmx_init(void) if (r) goto out3; +#ifdef CONFIG_KEXEC + rcu_assign_pointer(crash_vmclear_loaded_vmcss, + crash_vmclear_local_loaded_vmcss); +#endif + vmx_disable_intercept_for_msr(MSR_FS_BASE, false); vmx_disable_intercept_for_msr(MSR_GS_BASE, false); vmx_disable_intercept_for_msr(MSR_KERNEL_GS_BASE, true); @@ -7401,6 +7486,11 @@ static void __exit vmx_exit(void) free_page((unsigned long)vmx_io_bitmap_b); free_page((unsigned long)vmx_io_bitmap_a); +#ifdef CONFIG_KEXEC + rcu_assign_pointer(crash_vmclear_loaded_vmcss, NULL); + synchronize_rcu(); +#endif + kvm_exit(); } diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 1eefebe5d727..76f54461f7cb 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -46,6 +46,8 @@ #include <linux/uaccess.h> #include <linux/hash.h> #include <linux/pci.h> +#include <linux/timekeeper_internal.h> +#include <linux/pvclock_gtod.h> #include <trace/events/kvm.h> #define CREATE_TRACE_POINTS @@ -158,7 +160,9 @@ struct kvm_stats_debugfs_item debugfs_entries[] = { u64 __read_mostly host_xcr0; -int emulator_fix_hypercall(struct x86_emulate_ctxt *ctxt); +static int emulator_fix_hypercall(struct x86_emulate_ctxt *ctxt); + +static int kvm_vcpu_reset(struct kvm_vcpu *vcpu); static inline void kvm_async_pf_hash_reset(struct kvm_vcpu *vcpu) { @@ -633,7 +637,7 @@ int kvm_set_cr3(struct kvm_vcpu *vcpu, unsigned long cr3) } if (is_long_mode(vcpu)) { - if (kvm_read_cr4(vcpu) & X86_CR4_PCIDE) { + if (kvm_read_cr4_bits(vcpu, X86_CR4_PCIDE)) { if (cr3 & CR3_PCID_ENABLED_RESERVED_BITS) return 1; } else @@ -827,6 +831,7 @@ static u32 msrs_to_save[] = { static unsigned num_msrs_to_save; static const u32 emulated_msrs[] = { + MSR_IA32_TSC_ADJUST, MSR_IA32_TSCDEADLINE, MSR_IA32_MISC_ENABLE, MSR_IA32_MCG_STATUS, @@ -886,9 +891,9 @@ EXPORT_SYMBOL_GPL(kvm_enable_efer_bits); * Returns 0 on success, non-0 otherwise. * Assumes vcpu_load() was already called. 
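Several hunks in this series replace the bare (index, data) pair with a struct msr_data so that every MSR write carries a host_initiated flag; that flag is what lets the MSR_IA32_TSC_ADJUST case further down in kvm_set_msr_common() tell a guest WRMSR (which must also move the TSC offset) apart from a userspace restore of saved state (which must not). The following is a self-contained model of that distinction, with the vcpu state reduced to two fields purely for illustration; the real fields live in kvm_vcpu_arch.

#include <stdbool.h>
#include <stdint.h>

struct msr_data {
        bool     host_initiated;   /* true for writes coming from the ioctl path */
        uint32_t index;
        uint64_t data;
};

struct vcpu_model {
        int64_t  tsc_offset;       /* what the hardware adds to the host TSC */
        uint64_t ia32_tsc_adjust;  /* architectural MSR value the guest reads back */
};

static void set_tsc_adjust(struct vcpu_model *v, const struct msr_data *msr)
{
        if (!msr->host_initiated) {
                /* Guest WRMSR: the architectural effect is to shift the guest
                 * TSC by the delta, so fold the delta into the offset too. */
                int64_t delta = (int64_t)(msr->data - v->ia32_tsc_adjust);
                v->tsc_offset += delta;
        }
        /* A host-initiated write (live migration restore) only re-installs the
         * saved MSR value, leaving the already-restored offset alone. */
        v->ia32_tsc_adjust = msr->data;
}

handle_wrmsr() above and emulator_set_msr() later in the patch both build the structure with host_initiated = false, while do_set_msr(), the ioctl path, sets it to true.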
*/ -int kvm_set_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 data) +int kvm_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr) { - return kvm_x86_ops->set_msr(vcpu, msr_index, data); + return kvm_x86_ops->set_msr(vcpu, msr); } /* @@ -896,8 +901,62 @@ int kvm_set_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 data) */ static int do_set_msr(struct kvm_vcpu *vcpu, unsigned index, u64 *data) { - return kvm_set_msr(vcpu, index, *data); + struct msr_data msr; + + msr.data = *data; + msr.index = index; + msr.host_initiated = true; + return kvm_set_msr(vcpu, &msr); +} + +#ifdef CONFIG_X86_64 +struct pvclock_gtod_data { + seqcount_t seq; + + struct { /* extract of a clocksource struct */ + int vclock_mode; + cycle_t cycle_last; + cycle_t mask; + u32 mult; + u32 shift; + } clock; + + /* open coded 'struct timespec' */ + u64 monotonic_time_snsec; + time_t monotonic_time_sec; +}; + +static struct pvclock_gtod_data pvclock_gtod_data; + +static void update_pvclock_gtod(struct timekeeper *tk) +{ + struct pvclock_gtod_data *vdata = &pvclock_gtod_data; + + write_seqcount_begin(&vdata->seq); + + /* copy pvclock gtod data */ + vdata->clock.vclock_mode = tk->clock->archdata.vclock_mode; + vdata->clock.cycle_last = tk->clock->cycle_last; + vdata->clock.mask = tk->clock->mask; + vdata->clock.mult = tk->mult; + vdata->clock.shift = tk->shift; + + vdata->monotonic_time_sec = tk->xtime_sec + + tk->wall_to_monotonic.tv_sec; + vdata->monotonic_time_snsec = tk->xtime_nsec + + (tk->wall_to_monotonic.tv_nsec + << tk->shift); + while (vdata->monotonic_time_snsec >= + (((u64)NSEC_PER_SEC) << tk->shift)) { + vdata->monotonic_time_snsec -= + ((u64)NSEC_PER_SEC) << tk->shift; + vdata->monotonic_time_sec++; + } + + write_seqcount_end(&vdata->seq); } +#endif + static void kvm_write_wall_clock(struct kvm *kvm, gpa_t wall_clock) { @@ -995,6 +1054,10 @@ static inline u64 get_kernel_ns(void) return timespec_to_ns(&ts); } +#ifdef CONFIG_X86_64 +static atomic_t kvm_guest_has_master_clock = ATOMIC_INIT(0); +#endif + static DEFINE_PER_CPU(unsigned long, cpu_tsc_khz); unsigned long max_tsc_khz; @@ -1046,12 +1109,47 @@ static u64 compute_guest_tsc(struct kvm_vcpu *vcpu, s64 kernel_ns) return tsc; } -void kvm_write_tsc(struct kvm_vcpu *vcpu, u64 data) +void kvm_track_tsc_matching(struct kvm_vcpu *vcpu) +{ +#ifdef CONFIG_X86_64 + bool vcpus_matched; + bool do_request = false; + struct kvm_arch *ka = &vcpu->kvm->arch; + struct pvclock_gtod_data *gtod = &pvclock_gtod_data; + + vcpus_matched = (ka->nr_vcpus_matched_tsc + 1 == + atomic_read(&vcpu->kvm->online_vcpus)); + + if (vcpus_matched && gtod->clock.vclock_mode == VCLOCK_TSC) + if (!ka->use_master_clock) + do_request = 1; + + if (!vcpus_matched && ka->use_master_clock) + do_request = 1; + + if (do_request) + kvm_make_request(KVM_REQ_MASTERCLOCK_UPDATE, vcpu); + + trace_kvm_track_tsc(vcpu->vcpu_id, ka->nr_vcpus_matched_tsc, + atomic_read(&vcpu->kvm->online_vcpus), + ka->use_master_clock, gtod->clock.vclock_mode); +#endif +} + +static void update_ia32_tsc_adjust_msr(struct kvm_vcpu *vcpu, s64 offset) +{ + u64 curr_offset = kvm_x86_ops->read_tsc_offset(vcpu); + vcpu->arch.ia32_tsc_adjust_msr += offset - curr_offset; +} + +void kvm_write_tsc(struct kvm_vcpu *vcpu, struct msr_data *msr) { struct kvm *kvm = vcpu->kvm; u64 offset, ns, elapsed; unsigned long flags; s64 usdiff; + bool matched; + u64 data = msr->data; raw_spin_lock_irqsave(&kvm->arch.tsc_write_lock, flags); offset = kvm_x86_ops->compute_tsc_offset(vcpu, data); @@ -1094,6 +1192,7 @@ void kvm_write_tsc(struct kvm_vcpu *vcpu, u64 
data) offset = kvm_x86_ops->compute_tsc_offset(vcpu, data); pr_debug("kvm: adjusted tsc offset by %llu\n", delta); } + matched = true; } else { /* * We split periods of matched TSC writes into generations. @@ -1108,6 +1207,7 @@ void kvm_write_tsc(struct kvm_vcpu *vcpu, u64 data) kvm->arch.cur_tsc_nsec = ns; kvm->arch.cur_tsc_write = data; kvm->arch.cur_tsc_offset = offset; + matched = false; pr_debug("kvm: new tsc generation %u, clock %llu\n", kvm->arch.cur_tsc_generation, data); } @@ -1129,26 +1229,195 @@ void kvm_write_tsc(struct kvm_vcpu *vcpu, u64 data) vcpu->arch.this_tsc_nsec = kvm->arch.cur_tsc_nsec; vcpu->arch.this_tsc_write = kvm->arch.cur_tsc_write; + if (guest_cpuid_has_tsc_adjust(vcpu) && !msr->host_initiated) + update_ia32_tsc_adjust_msr(vcpu, offset); kvm_x86_ops->write_tsc_offset(vcpu, offset); raw_spin_unlock_irqrestore(&kvm->arch.tsc_write_lock, flags); + + spin_lock(&kvm->arch.pvclock_gtod_sync_lock); + if (matched) + kvm->arch.nr_vcpus_matched_tsc++; + else + kvm->arch.nr_vcpus_matched_tsc = 0; + + kvm_track_tsc_matching(vcpu); + spin_unlock(&kvm->arch.pvclock_gtod_sync_lock); } EXPORT_SYMBOL_GPL(kvm_write_tsc); +#ifdef CONFIG_X86_64 + +static cycle_t read_tsc(void) +{ + cycle_t ret; + u64 last; + + /* + * Empirically, a fence (of type that depends on the CPU) + * before rdtsc is enough to ensure that rdtsc is ordered + * with respect to loads. The various CPU manuals are unclear + * as to whether rdtsc can be reordered with later loads, + * but no one has ever seen it happen. + */ + rdtsc_barrier(); + ret = (cycle_t)vget_cycles(); + + last = pvclock_gtod_data.clock.cycle_last; + + if (likely(ret >= last)) + return ret; + + /* + * GCC likes to generate cmov here, but this branch is extremely + * predictable (it's just a funciton of time and the likely is + * very likely) and there's a data dependence, so force GCC + * to generate a branch instead. I don't barrier() because + * we don't actually need a barrier, and if this function + * ever gets inlined it will generate worse code. + */ + asm volatile (""); + return last; +} + +static inline u64 vgettsc(cycle_t *cycle_now) +{ + long v; + struct pvclock_gtod_data *gtod = &pvclock_gtod_data; + + *cycle_now = read_tsc(); + + v = (*cycle_now - gtod->clock.cycle_last) & gtod->clock.mask; + return v * gtod->clock.mult; +} + +static int do_monotonic(struct timespec *ts, cycle_t *cycle_now) +{ + unsigned long seq; + u64 ns; + int mode; + struct pvclock_gtod_data *gtod = &pvclock_gtod_data; + + ts->tv_nsec = 0; + do { + seq = read_seqcount_begin(>od->seq); + mode = gtod->clock.vclock_mode; + ts->tv_sec = gtod->monotonic_time_sec; + ns = gtod->monotonic_time_snsec; + ns += vgettsc(cycle_now); + ns >>= gtod->clock.shift; + } while (unlikely(read_seqcount_retry(>od->seq, seq))); + timespec_add_ns(ts, ns); + + return mode; +} + +/* returns true if host is using tsc clocksource */ +static bool kvm_get_time_and_clockread(s64 *kernel_ns, cycle_t *cycle_now) +{ + struct timespec ts; + + /* checked again under seqlock below */ + if (pvclock_gtod_data.clock.vclock_mode != VCLOCK_TSC) + return false; + + if (do_monotonic(&ts, cycle_now) != VCLOCK_TSC) + return false; + + monotonic_to_bootbased(&ts); + *kernel_ns = timespec_to_ns(&ts); + + return true; +} +#endif + +/* + * + * Assuming a stable TSC across physical CPUS, and a stable TSC + * across virtual CPUs, the following condition is possible. + * Each numbered line represents an event visible to both + * CPUs at the next numbered event. + * + * "timespecX" represents host monotonic time. 
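do_monotonic() above converts the elapsed TSC into nanoseconds with the clocksource's mult/shift pair, added to the shifted-nanosecond snapshot that update_pvclock_gtod() copies out of the timekeeper; read_tsc() additionally clamps against cycle_last so a TSC read that appears to go backwards yields the snapshot value rather than a huge wrapped delta. The arithmetic, redone as a standalone check with made-up numbers and without the read_seqcount_begin()/read_seqcount_retry() loop that protects it in the kernel:

#include <stdint.h>
#include <stdio.h>

int main(void)
{
        uint64_t cycle_last = 1000000;         /* clocksource snapshot */
        uint32_t mult = 1U << 22, shift = 22;  /* exactly 1 ns/cycle, i.e. a 1 GHz TSC */
        uint64_t snap_sec = 100, snap_snsec = 0;

        uint64_t tsc_now = cycle_last + 2500000000ULL;   /* 2.5e9 cycles later */
        /* shifted-ns accumulator = snapshot snsec + delta * mult, then >> shift */
        uint64_t ns = (snap_snsec + (tsc_now - cycle_last) * (uint64_t)mult) >> shift;

        printf("monotonic = %llu s + %llu ns\n",
               (unsigned long long)(snap_sec + ns / 1000000000ULL),
               (unsigned long long)(ns % 1000000000ULL));  /* 102 s + 500000000 ns */
        return 0;
}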
"tscX" represents + * RDTSC value. + * + * VCPU0 on CPU0 | VCPU1 on CPU1 + * + * 1. read timespec0,tsc0 + * 2. | timespec1 = timespec0 + N + * | tsc1 = tsc0 + M + * 3. transition to guest | transition to guest + * 4. ret0 = timespec0 + (rdtsc - tsc0) | + * 5. | ret1 = timespec1 + (rdtsc - tsc1) + * | ret1 = timespec0 + N + (rdtsc - (tsc0 + M)) + * + * Since ret0 update is visible to VCPU1 at time 5, to obey monotonicity: + * + * - ret0 < ret1 + * - timespec0 + (rdtsc - tsc0) < timespec0 + N + (rdtsc - (tsc0 + M)) + * ... + * - 0 < N - M => M < N + * + * That is, when timespec0 != timespec1, M < N. Unfortunately that is not + * always the case (the difference between two distinct xtime instances + * might be smaller then the difference between corresponding TSC reads, + * when updating guest vcpus pvclock areas). + * + * To avoid that problem, do not allow visibility of distinct + * system_timestamp/tsc_timestamp values simultaneously: use a master + * copy of host monotonic time values. Update that master copy + * in lockstep. + * + * Rely on synchronization of host TSCs and guest TSCs for monotonicity. + * + */ + +static void pvclock_update_vm_gtod_copy(struct kvm *kvm) +{ +#ifdef CONFIG_X86_64 + struct kvm_arch *ka = &kvm->arch; + int vclock_mode; + bool host_tsc_clocksource, vcpus_matched; + + vcpus_matched = (ka->nr_vcpus_matched_tsc + 1 == + atomic_read(&kvm->online_vcpus)); + + /* + * If the host uses TSC clock, then passthrough TSC as stable + * to the guest. + */ + host_tsc_clocksource = kvm_get_time_and_clockread( + &ka->master_kernel_ns, + &ka->master_cycle_now); + + ka->use_master_clock = host_tsc_clocksource & vcpus_matched; + + if (ka->use_master_clock) + atomic_set(&kvm_guest_has_master_clock, 1); + + vclock_mode = pvclock_gtod_data.clock.vclock_mode; + trace_kvm_update_master_clock(ka->use_master_clock, vclock_mode, + vcpus_matched); +#endif +} + static int kvm_guest_time_update(struct kvm_vcpu *v) { - unsigned long flags; + unsigned long flags, this_tsc_khz; struct kvm_vcpu_arch *vcpu = &v->arch; + struct kvm_arch *ka = &v->kvm->arch; void *shared_kaddr; - unsigned long this_tsc_khz; s64 kernel_ns, max_kernel_ns; - u64 tsc_timestamp; + u64 tsc_timestamp, host_tsc; + struct pvclock_vcpu_time_info *guest_hv_clock; u8 pvclock_flags; + bool use_master_clock; + + kernel_ns = 0; + host_tsc = 0; /* Keep irq disabled to prevent changes to the clock */ local_irq_save(flags); - tsc_timestamp = kvm_x86_ops->read_l1_tsc(v); - kernel_ns = get_kernel_ns(); this_tsc_khz = __get_cpu_var(cpu_tsc_khz); if (unlikely(this_tsc_khz == 0)) { local_irq_restore(flags); @@ -1157,6 +1426,24 @@ static int kvm_guest_time_update(struct kvm_vcpu *v) } /* + * If the host uses TSC clock, then passthrough TSC as stable + * to the guest. + */ + spin_lock(&ka->pvclock_gtod_sync_lock); + use_master_clock = ka->use_master_clock; + if (use_master_clock) { + host_tsc = ka->master_cycle_now; + kernel_ns = ka->master_kernel_ns; + } + spin_unlock(&ka->pvclock_gtod_sync_lock); + if (!use_master_clock) { + host_tsc = native_read_tsc(); + kernel_ns = get_kernel_ns(); + } + + tsc_timestamp = kvm_x86_ops->read_l1_tsc(v, host_tsc); + + /* * We may have to catch up the TSC to match elapsed wall clock * time for two reasons, even if kvmclock is used. 
* 1) CPU could have been running below the maximum TSC rate @@ -1217,23 +1504,20 @@ static int kvm_guest_time_update(struct kvm_vcpu *v) vcpu->hw_tsc_khz = this_tsc_khz; } - if (max_kernel_ns > kernel_ns) - kernel_ns = max_kernel_ns; - + /* with a master <monotonic time, tsc value> tuple, + * pvclock clock reads always increase at the (scaled) rate + * of guest TSC - no need to deal with sampling errors. + */ + if (!use_master_clock) { + if (max_kernel_ns > kernel_ns) + kernel_ns = max_kernel_ns; + } /* With all the info we got, fill in the values */ vcpu->hv_clock.tsc_timestamp = tsc_timestamp; vcpu->hv_clock.system_time = kernel_ns + v->kvm->arch.kvmclock_offset; vcpu->last_kernel_ns = kernel_ns; vcpu->last_guest_tsc = tsc_timestamp; - pvclock_flags = 0; - if (vcpu->pvclock_set_guest_stopped_request) { - pvclock_flags |= PVCLOCK_GUEST_STOPPED; - vcpu->pvclock_set_guest_stopped_request = false; - } - - vcpu->hv_clock.flags = pvclock_flags; - /* * The interface expects us to write an even number signaling that the * update is finished. Since the guest won't see the intermediate @@ -1243,6 +1527,22 @@ static int kvm_guest_time_update(struct kvm_vcpu *v) shared_kaddr = kmap_atomic(vcpu->time_page); + guest_hv_clock = shared_kaddr + vcpu->time_offset; + + /* retain PVCLOCK_GUEST_STOPPED if set in guest copy */ + pvclock_flags = (guest_hv_clock->flags & PVCLOCK_GUEST_STOPPED); + + if (vcpu->pvclock_set_guest_stopped_request) { + pvclock_flags |= PVCLOCK_GUEST_STOPPED; + vcpu->pvclock_set_guest_stopped_request = false; + } + + /* If the host uses TSC clocksource, then it is stable */ + if (use_master_clock) + pvclock_flags |= PVCLOCK_TSC_STABLE_BIT; + + vcpu->hv_clock.flags = pvclock_flags; + memcpy(shared_kaddr + vcpu->time_offset, &vcpu->hv_clock, sizeof(vcpu->hv_clock)); @@ -1572,9 +1872,11 @@ static void record_steal_time(struct kvm_vcpu *vcpu) &vcpu->arch.st.steal, sizeof(struct kvm_steal_time)); } -int kvm_set_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 data) +int kvm_set_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info) { bool pr = false; + u32 msr = msr_info->index; + u64 data = msr_info->data; switch (msr) { case MSR_EFER: @@ -1625,6 +1927,15 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 data) case MSR_IA32_TSCDEADLINE: kvm_set_lapic_tscdeadline_msr(vcpu, data); break; + case MSR_IA32_TSC_ADJUST: + if (guest_cpuid_has_tsc_adjust(vcpu)) { + if (!msr_info->host_initiated) { + u64 adj = data - vcpu->arch.ia32_tsc_adjust_msr; + kvm_x86_ops->adjust_tsc_offset(vcpu, adj, true); + } + vcpu->arch.ia32_tsc_adjust_msr = data; + } + break; case MSR_IA32_MISC_ENABLE: vcpu->arch.ia32_misc_enable_msr = data; break; @@ -1984,6 +2295,9 @@ int kvm_get_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata) case MSR_IA32_TSCDEADLINE: data = kvm_get_lapic_tscdeadline_msr(vcpu); break; + case MSR_IA32_TSC_ADJUST: + data = (u64)vcpu->arch.ia32_tsc_adjust_msr; + break; case MSR_IA32_MISC_ENABLE: data = vcpu->arch.ia32_misc_enable_msr; break; @@ -2342,7 +2656,12 @@ void kvm_arch_vcpu_load(struct kvm_vcpu *vcpu, int cpu) kvm_x86_ops->write_tsc_offset(vcpu, offset); vcpu->arch.tsc_catchup = 1; } - kvm_make_request(KVM_REQ_CLOCK_UPDATE, vcpu); + /* + * On a host with synchronized TSC, there is no need to update + * kvmclock on vcpu->cpu migration + */ + if (!vcpu->kvm->arch.use_master_clock || vcpu->cpu == -1) + kvm_make_request(KVM_REQ_CLOCK_UPDATE, vcpu); if (vcpu->cpu != cpu) kvm_migrate_timers(vcpu); vcpu->cpu = cpu; @@ -2691,15 +3010,10 @@ long kvm_arch_vcpu_ioctl(struct file 
*filp, if (!vcpu->arch.apic) goto out; u.lapic = memdup_user(argp, sizeof(*u.lapic)); - if (IS_ERR(u.lapic)) { - r = PTR_ERR(u.lapic); - goto out; - } + if (IS_ERR(u.lapic)) + return PTR_ERR(u.lapic); r = kvm_vcpu_ioctl_set_lapic(vcpu, u.lapic); - if (r) - goto out; - r = 0; break; } case KVM_INTERRUPT: { @@ -2709,16 +3023,10 @@ long kvm_arch_vcpu_ioctl(struct file *filp, if (copy_from_user(&irq, argp, sizeof irq)) goto out; r = kvm_vcpu_ioctl_interrupt(vcpu, &irq); - if (r) - goto out; - r = 0; break; } case KVM_NMI: { r = kvm_vcpu_ioctl_nmi(vcpu); - if (r) - goto out; - r = 0; break; } case KVM_SET_CPUID: { @@ -2729,8 +3037,6 @@ long kvm_arch_vcpu_ioctl(struct file *filp, if (copy_from_user(&cpuid, cpuid_arg, sizeof cpuid)) goto out; r = kvm_vcpu_ioctl_set_cpuid(vcpu, &cpuid, cpuid_arg->entries); - if (r) - goto out; break; } case KVM_SET_CPUID2: { @@ -2742,8 +3048,6 @@ long kvm_arch_vcpu_ioctl(struct file *filp, goto out; r = kvm_vcpu_ioctl_set_cpuid2(vcpu, &cpuid, cpuid_arg->entries); - if (r) - goto out; break; } case KVM_GET_CPUID2: { @@ -2875,10 +3179,8 @@ long kvm_arch_vcpu_ioctl(struct file *filp, } case KVM_SET_XSAVE: { u.xsave = memdup_user(argp, sizeof(*u.xsave)); - if (IS_ERR(u.xsave)) { - r = PTR_ERR(u.xsave); - goto out; - } + if (IS_ERR(u.xsave)) + return PTR_ERR(u.xsave); r = kvm_vcpu_ioctl_x86_set_xsave(vcpu, u.xsave); break; @@ -2900,10 +3202,8 @@ long kvm_arch_vcpu_ioctl(struct file *filp, } case KVM_SET_XCRS: { u.xcrs = memdup_user(argp, sizeof(*u.xcrs)); - if (IS_ERR(u.xcrs)) { - r = PTR_ERR(u.xcrs); - goto out; - } + if (IS_ERR(u.xcrs)) + return PTR_ERR(u.xcrs); r = kvm_vcpu_ioctl_x86_set_xcrs(vcpu, u.xcrs); break; @@ -2951,7 +3251,7 @@ static int kvm_vm_ioctl_set_tss_addr(struct kvm *kvm, unsigned long addr) int ret; if (addr > (unsigned int)(-3 * PAGE_SIZE)) - return -1; + return -EINVAL; ret = kvm_x86_ops->set_tss_addr(kvm, addr); return ret; } @@ -3212,8 +3512,6 @@ long kvm_arch_vm_ioctl(struct file *filp, switch (ioctl) { case KVM_SET_TSS_ADDR: r = kvm_vm_ioctl_set_tss_addr(kvm, arg); - if (r < 0) - goto out; break; case KVM_SET_IDENTITY_MAP_ADDR: { u64 ident_addr; @@ -3222,14 +3520,10 @@ long kvm_arch_vm_ioctl(struct file *filp, if (copy_from_user(&ident_addr, argp, sizeof ident_addr)) goto out; r = kvm_vm_ioctl_set_identity_map_addr(kvm, ident_addr); - if (r < 0) - goto out; break; } case KVM_SET_NR_MMU_PAGES: r = kvm_vm_ioctl_set_nr_mmu_pages(kvm, arg); - if (r) - goto out; break; case KVM_GET_NR_MMU_PAGES: r = kvm_vm_ioctl_get_nr_mmu_pages(kvm); @@ -3320,8 +3614,6 @@ long kvm_arch_vm_ioctl(struct file *filp, r = 0; get_irqchip_out: kfree(chip); - if (r) - goto out; break; } case KVM_SET_IRQCHIP: { @@ -3343,8 +3635,6 @@ long kvm_arch_vm_ioctl(struct file *filp, r = 0; set_irqchip_out: kfree(chip); - if (r) - goto out; break; } case KVM_GET_PIT: { @@ -3371,9 +3661,6 @@ long kvm_arch_vm_ioctl(struct file *filp, if (!kvm->arch.vpit) goto out; r = kvm_vm_ioctl_set_pit(kvm, &u.ps); - if (r) - goto out; - r = 0; break; } case KVM_GET_PIT2: { @@ -3397,9 +3684,6 @@ long kvm_arch_vm_ioctl(struct file *filp, if (!kvm->arch.vpit) goto out; r = kvm_vm_ioctl_set_pit2(kvm, &u.ps2); - if (r) - goto out; - r = 0; break; } case KVM_REINJECT_CONTROL: { @@ -3408,9 +3692,6 @@ long kvm_arch_vm_ioctl(struct file *filp, if (copy_from_user(&control, argp, sizeof(control))) goto out; r = kvm_vm_ioctl_reinject(kvm, &control); - if (r) - goto out; - r = 0; break; } case KVM_XEN_HVM_CONFIG: { @@ -3779,7 +4060,7 @@ static int write_exit_mmio(struct kvm_vcpu *vcpu, gpa_t gpa, { struct 
kvm_mmio_fragment *frag = &vcpu->mmio_fragments[0]; - memcpy(vcpu->run->mmio.data, frag->data, frag->len); + memcpy(vcpu->run->mmio.data, frag->data, min(8u, frag->len)); return X86EMUL_CONTINUE; } @@ -3832,18 +4113,11 @@ mmio: bytes -= handled; val += handled; - while (bytes) { - unsigned now = min(bytes, 8U); - - frag = &vcpu->mmio_fragments[vcpu->mmio_nr_fragments++]; - frag->gpa = gpa; - frag->data = val; - frag->len = now; - - gpa += now; - val += now; - bytes -= now; - } + WARN_ON(vcpu->mmio_nr_fragments >= KVM_MAX_MMIO_FRAGMENTS); + frag = &vcpu->mmio_fragments[vcpu->mmio_nr_fragments++]; + frag->gpa = gpa; + frag->data = val; + frag->len = bytes; return X86EMUL_CONTINUE; } @@ -3890,7 +4164,7 @@ int emulator_read_write(struct x86_emulate_ctxt *ctxt, unsigned long addr, vcpu->mmio_needed = 1; vcpu->mmio_cur_fragment = 0; - vcpu->run->mmio.len = vcpu->mmio_fragments[0].len; + vcpu->run->mmio.len = min(8u, vcpu->mmio_fragments[0].len); vcpu->run->mmio.is_write = vcpu->mmio_is_write = ops->write; vcpu->run->exit_reason = KVM_EXIT_MMIO; vcpu->run->mmio.phys_addr = gpa; @@ -4280,7 +4554,12 @@ static int emulator_get_msr(struct x86_emulate_ctxt *ctxt, static int emulator_set_msr(struct x86_emulate_ctxt *ctxt, u32 msr_index, u64 data) { - return kvm_set_msr(emul_to_vcpu(ctxt), msr_index, data); + struct msr_data msr; + + msr.data = data; + msr.index = msr_index; + msr.host_initiated = false; + return kvm_set_msr(emul_to_vcpu(ctxt), &msr); } static int emulator_read_pmc(struct x86_emulate_ctxt *ctxt, @@ -4502,7 +4781,7 @@ static bool reexecute_instruction(struct kvm_vcpu *vcpu, gva_t gva) * instruction -> ... */ pfn = gfn_to_pfn(vcpu->kvm, gpa_to_gfn(gpa)); - if (!is_error_pfn(pfn)) { + if (!is_error_noslot_pfn(pfn)) { kvm_release_pfn_clean(pfn); return true; } @@ -4888,6 +5167,50 @@ static void kvm_set_mmio_spte_mask(void) kvm_mmu_set_mmio_spte_mask(mask); } +#ifdef CONFIG_X86_64 +static void pvclock_gtod_update_fn(struct work_struct *work) +{ + struct kvm *kvm; + + struct kvm_vcpu *vcpu; + int i; + + raw_spin_lock(&kvm_lock); + list_for_each_entry(kvm, &vm_list, vm_list) + kvm_for_each_vcpu(i, vcpu, kvm) + set_bit(KVM_REQ_MASTERCLOCK_UPDATE, &vcpu->requests); + atomic_set(&kvm_guest_has_master_clock, 0); + raw_spin_unlock(&kvm_lock); +} + +static DECLARE_WORK(pvclock_gtod_work, pvclock_gtod_update_fn); + +/* + * Notification about pvclock gtod data update. 
+ */ +static int pvclock_gtod_notify(struct notifier_block *nb, unsigned long unused, + void *priv) +{ + struct pvclock_gtod_data *gtod = &pvclock_gtod_data; + struct timekeeper *tk = priv; + + update_pvclock_gtod(tk); + + /* disable master clock if host does not trust, or does not + * use, TSC clocksource + */ + if (gtod->clock.vclock_mode != VCLOCK_TSC && + atomic_read(&kvm_guest_has_master_clock) != 0) + queue_work(system_long_wq, &pvclock_gtod_work); + + return 0; +} + +static struct notifier_block pvclock_gtod_notifier = { + .notifier_call = pvclock_gtod_notify, +}; +#endif + int kvm_arch_init(void *opaque) { int r; @@ -4929,6 +5252,10 @@ int kvm_arch_init(void *opaque) host_xcr0 = xgetbv(XCR_XFEATURE_ENABLED_MASK); kvm_lapic_init(); +#ifdef CONFIG_X86_64 + pvclock_gtod_register_notifier(&pvclock_gtod_notifier); +#endif + return 0; out: @@ -4943,6 +5270,9 @@ void kvm_arch_exit(void) cpufreq_unregister_notifier(&kvmclock_cpufreq_notifier_block, CPUFREQ_TRANSITION_NOTIFIER); unregister_hotcpu_notifier(&kvmclock_cpu_notifier_block); +#ifdef CONFIG_X86_64 + pvclock_gtod_unregister_notifier(&pvclock_gtod_notifier); +#endif kvm_x86_ops = NULL; kvm_mmu_module_exit(); } @@ -5066,7 +5396,7 @@ out: } EXPORT_SYMBOL_GPL(kvm_emulate_hypercall); -int emulator_fix_hypercall(struct x86_emulate_ctxt *ctxt) +static int emulator_fix_hypercall(struct x86_emulate_ctxt *ctxt) { struct kvm_vcpu *vcpu = emul_to_vcpu(ctxt); char instruction[3]; @@ -5242,6 +5572,29 @@ static void process_nmi(struct kvm_vcpu *vcpu) kvm_make_request(KVM_REQ_EVENT, vcpu); } +static void kvm_gen_update_masterclock(struct kvm *kvm) +{ +#ifdef CONFIG_X86_64 + int i; + struct kvm_vcpu *vcpu; + struct kvm_arch *ka = &kvm->arch; + + spin_lock(&ka->pvclock_gtod_sync_lock); + kvm_make_mclock_inprogress_request(kvm); + /* no guest entries from this point */ + pvclock_update_vm_gtod_copy(kvm); + + kvm_for_each_vcpu(i, vcpu, kvm) + set_bit(KVM_REQ_CLOCK_UPDATE, &vcpu->requests); + + /* guest entries allowed */ + kvm_for_each_vcpu(i, vcpu, kvm) + clear_bit(KVM_REQ_MCLOCK_INPROGRESS, &vcpu->requests); + + spin_unlock(&ka->pvclock_gtod_sync_lock); +#endif +} + static int vcpu_enter_guest(struct kvm_vcpu *vcpu) { int r; @@ -5254,6 +5607,8 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu) kvm_mmu_unload(vcpu); if (kvm_check_request(KVM_REQ_MIGRATE_TIMER, vcpu)) __kvm_migrate_timers(vcpu); + if (kvm_check_request(KVM_REQ_MASTERCLOCK_UPDATE, vcpu)) + kvm_gen_update_masterclock(vcpu->kvm); if (kvm_check_request(KVM_REQ_CLOCK_UPDATE, vcpu)) { r = kvm_guest_time_update(vcpu); if (unlikely(r)) @@ -5369,7 +5724,8 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu) if (hw_breakpoint_active()) hw_breakpoint_restore(); - vcpu->arch.last_guest_tsc = kvm_x86_ops->read_l1_tsc(vcpu); + vcpu->arch.last_guest_tsc = kvm_x86_ops->read_l1_tsc(vcpu, + native_read_tsc()); vcpu->mode = OUTSIDE_GUEST_MODE; smp_wmb(); @@ -5426,7 +5782,7 @@ static int __vcpu_run(struct kvm_vcpu *vcpu) pr_debug("vcpu %d received sipi with vector # %x\n", vcpu->vcpu_id, vcpu->arch.sipi_vector); kvm_lapic_reset(vcpu); - r = kvm_arch_vcpu_reset(vcpu); + r = kvm_vcpu_reset(vcpu); if (r) return r; vcpu->arch.mp_state = KVM_MP_STATE_RUNNABLE; @@ -5522,28 +5878,44 @@ static int complete_emulated_pio(struct kvm_vcpu *vcpu) * * read: * for each fragment - * write gpa, len - * exit - * copy data + * for each mmio piece in the fragment + * write gpa, len + * exit + * copy data * execute insn * * write: * for each fragment - * write gpa, len - * copy data - * exit + * for each mmio piece in 
the fragment + * write gpa, len + * copy data + * exit */ static int complete_emulated_mmio(struct kvm_vcpu *vcpu) { struct kvm_run *run = vcpu->run; struct kvm_mmio_fragment *frag; + unsigned len; BUG_ON(!vcpu->mmio_needed); /* Complete previous fragment */ - frag = &vcpu->mmio_fragments[vcpu->mmio_cur_fragment++]; + frag = &vcpu->mmio_fragments[vcpu->mmio_cur_fragment]; + len = min(8u, frag->len); if (!vcpu->mmio_is_write) - memcpy(frag->data, run->mmio.data, frag->len); + memcpy(frag->data, run->mmio.data, len); + + if (frag->len <= 8) { + /* Switch to the next fragment. */ + frag++; + vcpu->mmio_cur_fragment++; + } else { + /* Go forward to the next mmio piece. */ + frag->data += len; + frag->gpa += len; + frag->len -= len; + } + if (vcpu->mmio_cur_fragment == vcpu->mmio_nr_fragments) { vcpu->mmio_needed = 0; if (vcpu->mmio_is_write) @@ -5551,13 +5923,12 @@ static int complete_emulated_mmio(struct kvm_vcpu *vcpu) vcpu->mmio_read_completed = 1; return complete_emulated_io(vcpu); } - /* Initiate next fragment */ - ++frag; + run->exit_reason = KVM_EXIT_MMIO; run->mmio.phys_addr = frag->gpa; if (vcpu->mmio_is_write) - memcpy(run->mmio.data, frag->data, frag->len); - run->mmio.len = frag->len; + memcpy(run->mmio.data, frag->data, min(8u, frag->len)); + run->mmio.len = min(8u, frag->len); run->mmio.is_write = vcpu->mmio_is_write; vcpu->arch.complete_userspace_io = complete_emulated_mmio; return 0; @@ -5773,6 +6144,9 @@ int kvm_arch_vcpu_ioctl_set_sregs(struct kvm_vcpu *vcpu, int pending_vec, max_bits, idx; struct desc_ptr dt; + if (!guest_cpuid_has_xsave(vcpu) && (sregs->cr4 & X86_CR4_OSXSAVE)) + return -EINVAL; + dt.size = sregs->idt.limit; dt.address = sregs->idt.base; kvm_x86_ops->set_idt(vcpu, &dt); @@ -6036,7 +6410,7 @@ int kvm_arch_vcpu_setup(struct kvm_vcpu *vcpu) r = vcpu_load(vcpu); if (r) return r; - r = kvm_arch_vcpu_reset(vcpu); + r = kvm_vcpu_reset(vcpu); if (r == 0) r = kvm_mmu_setup(vcpu); vcpu_put(vcpu); @@ -6044,6 +6418,23 @@ int kvm_arch_vcpu_setup(struct kvm_vcpu *vcpu) return r; } +int kvm_arch_vcpu_postcreate(struct kvm_vcpu *vcpu) +{ + int r; + struct msr_data msr; + + r = vcpu_load(vcpu); + if (r) + return r; + msr.data = 0x0; + msr.index = MSR_IA32_TSC; + msr.host_initiated = true; + kvm_write_tsc(vcpu, &msr); + vcpu_put(vcpu); + + return r; +} + void kvm_arch_vcpu_destroy(struct kvm_vcpu *vcpu) { int r; @@ -6058,7 +6449,7 @@ void kvm_arch_vcpu_destroy(struct kvm_vcpu *vcpu) kvm_x86_ops->vcpu_free(vcpu); } -int kvm_arch_vcpu_reset(struct kvm_vcpu *vcpu) +static int kvm_vcpu_reset(struct kvm_vcpu *vcpu) { atomic_set(&vcpu->arch.nmi_queued, 0); vcpu->arch.nmi_pending = 0; @@ -6081,6 +6472,10 @@ int kvm_arch_vcpu_reset(struct kvm_vcpu *vcpu) kvm_pmu_reset(vcpu); + memset(vcpu->arch.regs, 0, sizeof(vcpu->arch.regs)); + vcpu->arch.regs_avail = ~0; + vcpu->arch.regs_dirty = ~0; + return kvm_x86_ops->vcpu_reset(vcpu); } @@ -6157,6 +6552,8 @@ int kvm_arch_hardware_enable(void *garbage) kvm_for_each_vcpu(i, vcpu, kvm) { vcpu->arch.tsc_offset_adjustment += delta_cyc; vcpu->arch.last_host_tsc = local_tsc; + set_bit(KVM_REQ_MASTERCLOCK_UPDATE, + &vcpu->requests); } /* @@ -6247,10 +6644,17 @@ int kvm_arch_vcpu_init(struct kvm_vcpu *vcpu) if (!zalloc_cpumask_var(&vcpu->arch.wbinvd_dirty_mask, GFP_KERNEL)) goto fail_free_mce_banks; + r = fx_init(vcpu); + if (r) + goto fail_free_wbinvd_dirty_mask; + + vcpu->arch.ia32_tsc_adjust_msr = 0x0; kvm_async_pf_hash_reset(vcpu); kvm_pmu_init(vcpu); return 0; +fail_free_wbinvd_dirty_mask: + free_cpumask_var(vcpu->arch.wbinvd_dirty_mask); 
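The MMIO rework in the hunks above stops pre-splitting accesses into 8-byte fragments at emulation time: emulator_read_write_onepage() now stores one fragment per contiguous piece (at most two, for a page-crossing access), and complete_emulated_mmio() slices each fragment into 8-byte KVM_EXIT_MMIO round trips. A self-contained model of that slicing loop, with an invented buffer and address:

#include <stdint.h>
#include <stdio.h>

struct mmio_frag {              /* stand-in for struct kvm_mmio_fragment */
        uint64_t gpa;
        uint8_t *data;
        unsigned len;
};

/* Drain one fragment the way complete_emulated_mmio() now does: each pass
 * hands at most 8 bytes to "userspace" (modelled here as a printf). */
static void drain_fragment(struct mmio_frag *frag)
{
        while (frag->len) {
                unsigned len = frag->len < 8 ? frag->len : 8;

                printf("exit: KVM_EXIT_MMIO gpa=0x%llx len=%u\n",
                       (unsigned long long)frag->gpa, len);

                if (frag->len <= 8) {
                        frag->len = 0;          /* fragment finished */
                } else {
                        frag->data += len;      /* advance to the next 8-byte piece */
                        frag->gpa  += len;
                        frag->len  -= len;
                }
        }
}

int main(void)
{
        uint8_t buf[20] = { 0 };
        struct mmio_frag frag = { .gpa = 0xd0000000, .data = buf, .len = 20 };

        drain_fragment(&frag);
        return 0;
}

For a 20-byte fragment this produces three exits of 8, 8 and 4 bytes, matching the min(8u, frag->len) copies in the run->mmio setup above.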
fail_free_mce_banks: kfree(vcpu->arch.mce_banks); fail_free_lapic: @@ -6294,6 +6698,9 @@ int kvm_arch_init_vm(struct kvm *kvm, unsigned long type) raw_spin_lock_init(&kvm->arch.tsc_write_lock); mutex_init(&kvm->arch.apic_map_lock); + spin_lock_init(&kvm->arch.pvclock_gtod_sync_lock); + + pvclock_update_vm_gtod_copy(kvm); return 0; } diff --git a/arch/x86/kvm/x86.h b/arch/x86/kvm/x86.h index 2b5219c12ac8..e224f7a671b6 100644 --- a/arch/x86/kvm/x86.h +++ b/arch/x86/kvm/x86.h @@ -112,7 +112,7 @@ void kvm_before_handle_nmi(struct kvm_vcpu *vcpu); void kvm_after_handle_nmi(struct kvm_vcpu *vcpu); int kvm_inject_realmode_interrupt(struct kvm_vcpu *vcpu, int irq, int inc_eip); -void kvm_write_tsc(struct kvm_vcpu *vcpu, u64 data); +void kvm_write_tsc(struct kvm_vcpu *vcpu, struct msr_data *msr); int kvm_read_guest_virt(struct x86_emulate_ctxt *ctxt, gva_t addr, void *val, unsigned int bytes, diff --git a/arch/x86/lib/Makefile b/arch/x86/lib/Makefile index b00f6785da74..96b2c6697c9d 100644 --- a/arch/x86/lib/Makefile +++ b/arch/x86/lib/Makefile @@ -32,7 +32,6 @@ ifeq ($(CONFIG_X86_32),y) lib-y += checksum_32.o lib-y += strstr_32.o lib-y += string_32.o - lib-y += cmpxchg.o ifneq ($(CONFIG_X86_CMPXCHG64),y) lib-y += cmpxchg8b_emu.o atomic64_386_32.o endif diff --git a/arch/x86/lib/cmpxchg.c b/arch/x86/lib/cmpxchg.c deleted file mode 100644 index 5d619f6df3ee..000000000000 --- a/arch/x86/lib/cmpxchg.c +++ /dev/null @@ -1,54 +0,0 @@ -/* - * cmpxchg*() fallbacks for CPU not supporting these instructions - */ - -#include <linux/kernel.h> -#include <linux/smp.h> -#include <linux/module.h> - -#ifndef CONFIG_X86_CMPXCHG -unsigned long cmpxchg_386_u8(volatile void *ptr, u8 old, u8 new) -{ - u8 prev; - unsigned long flags; - - /* Poor man's cmpxchg for 386. Unsuitable for SMP */ - local_irq_save(flags); - prev = *(u8 *)ptr; - if (prev == old) - *(u8 *)ptr = new; - local_irq_restore(flags); - return prev; -} -EXPORT_SYMBOL(cmpxchg_386_u8); - -unsigned long cmpxchg_386_u16(volatile void *ptr, u16 old, u16 new) -{ - u16 prev; - unsigned long flags; - - /* Poor man's cmpxchg for 386. Unsuitable for SMP */ - local_irq_save(flags); - prev = *(u16 *)ptr; - if (prev == old) - *(u16 *)ptr = new; - local_irq_restore(flags); - return prev; -} -EXPORT_SYMBOL(cmpxchg_386_u16); - -unsigned long cmpxchg_386_u32(volatile void *ptr, u32 old, u32 new) -{ - u32 prev; - unsigned long flags; - - /* Poor man's cmpxchg for 386. Unsuitable for SMP */ - local_irq_save(flags); - prev = *(u32 *)ptr; - if (prev == old) - *(u32 *)ptr = new; - local_irq_restore(flags); - return prev; -} -EXPORT_SYMBOL(cmpxchg_386_u32); -#endif diff --git a/arch/x86/lib/copy_page_64.S b/arch/x86/lib/copy_page_64.S index 6b34d04d096a..176cca67212b 100644 --- a/arch/x86/lib/copy_page_64.S +++ b/arch/x86/lib/copy_page_64.S @@ -5,91 +5,89 @@ #include <asm/alternative-asm.h> ALIGN -copy_page_c: +copy_page_rep: CFI_STARTPROC - movl $4096/8,%ecx - rep movsq + movl $4096/8, %ecx + rep movsq ret CFI_ENDPROC -ENDPROC(copy_page_c) +ENDPROC(copy_page_rep) -/* Don't use streaming store because it's better when the target - ends up in cache. */ - -/* Could vary the prefetch distance based on SMP/UP */ +/* + * Don't use streaming copy unless the CPU indicates X86_FEATURE_REP_GOOD. + * Could vary the prefetch distance based on SMP/UP. 
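The rewritten ENTRY(copy_page) below copies the 4096-byte page as 64 iterations of eight quadword loads and stores, prefetching five cache lines ahead for the first 59 iterations and dropping the prefetch for the final five so it never reads past the end of the source page. A rough C rendering of that structure, for readability only; the copy_page_rep path patched in for X86_FEATURE_REP_GOOD CPUs (a plain rep movsq) is not modelled:

#include <stddef.h>
#include <stdint.h>

#define PAGE_SIZE 4096

static void copy_page_model(void *to, const void *from)
{
        uint64_t *dst = to;
        const uint64_t *src = from;
        size_t i, q;

        for (i = 0; i < PAGE_SIZE / 64; i++) {
                if (i < PAGE_SIZE / 64 - 5)               /* first 59 iterations */
                        __builtin_prefetch((const char *)src + 5 * 64);
                for (q = 0; q < 8; q++)                   /* eight quadwords = 64 bytes */
                        dst[q] = src[q];
                src += 8;
                dst += 8;
        }
}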
+*/ ENTRY(copy_page) CFI_STARTPROC - subq $2*8,%rsp + subq $2*8, %rsp CFI_ADJUST_CFA_OFFSET 2*8 - movq %rbx,(%rsp) + movq %rbx, (%rsp) CFI_REL_OFFSET rbx, 0 - movq %r12,1*8(%rsp) + movq %r12, 1*8(%rsp) CFI_REL_OFFSET r12, 1*8 - movl $(4096/64)-5,%ecx + movl $(4096/64)-5, %ecx .p2align 4 .Loop64: - dec %rcx - - movq (%rsi), %rax - movq 8 (%rsi), %rbx - movq 16 (%rsi), %rdx - movq 24 (%rsi), %r8 - movq 32 (%rsi), %r9 - movq 40 (%rsi), %r10 - movq 48 (%rsi), %r11 - movq 56 (%rsi), %r12 + dec %rcx + movq 0x8*0(%rsi), %rax + movq 0x8*1(%rsi), %rbx + movq 0x8*2(%rsi), %rdx + movq 0x8*3(%rsi), %r8 + movq 0x8*4(%rsi), %r9 + movq 0x8*5(%rsi), %r10 + movq 0x8*6(%rsi), %r11 + movq 0x8*7(%rsi), %r12 prefetcht0 5*64(%rsi) - movq %rax, (%rdi) - movq %rbx, 8 (%rdi) - movq %rdx, 16 (%rdi) - movq %r8, 24 (%rdi) - movq %r9, 32 (%rdi) - movq %r10, 40 (%rdi) - movq %r11, 48 (%rdi) - movq %r12, 56 (%rdi) + movq %rax, 0x8*0(%rdi) + movq %rbx, 0x8*1(%rdi) + movq %rdx, 0x8*2(%rdi) + movq %r8, 0x8*3(%rdi) + movq %r9, 0x8*4(%rdi) + movq %r10, 0x8*5(%rdi) + movq %r11, 0x8*6(%rdi) + movq %r12, 0x8*7(%rdi) - leaq 64 (%rsi), %rsi - leaq 64 (%rdi), %rdi + leaq 64 (%rsi), %rsi + leaq 64 (%rdi), %rdi - jnz .Loop64 + jnz .Loop64 - movl $5,%ecx + movl $5, %ecx .p2align 4 .Loop2: - decl %ecx - - movq (%rsi), %rax - movq 8 (%rsi), %rbx - movq 16 (%rsi), %rdx - movq 24 (%rsi), %r8 - movq 32 (%rsi), %r9 - movq 40 (%rsi), %r10 - movq 48 (%rsi), %r11 - movq 56 (%rsi), %r12 - - movq %rax, (%rdi) - movq %rbx, 8 (%rdi) - movq %rdx, 16 (%rdi) - movq %r8, 24 (%rdi) - movq %r9, 32 (%rdi) - movq %r10, 40 (%rdi) - movq %r11, 48 (%rdi) - movq %r12, 56 (%rdi) - - leaq 64(%rdi),%rdi - leaq 64(%rsi),%rsi - + decl %ecx + + movq 0x8*0(%rsi), %rax + movq 0x8*1(%rsi), %rbx + movq 0x8*2(%rsi), %rdx + movq 0x8*3(%rsi), %r8 + movq 0x8*4(%rsi), %r9 + movq 0x8*5(%rsi), %r10 + movq 0x8*6(%rsi), %r11 + movq 0x8*7(%rsi), %r12 + + movq %rax, 0x8*0(%rdi) + movq %rbx, 0x8*1(%rdi) + movq %rdx, 0x8*2(%rdi) + movq %r8, 0x8*3(%rdi) + movq %r9, 0x8*4(%rdi) + movq %r10, 0x8*5(%rdi) + movq %r11, 0x8*6(%rdi) + movq %r12, 0x8*7(%rdi) + + leaq 64(%rdi), %rdi + leaq 64(%rsi), %rsi jnz .Loop2 - movq (%rsp),%rbx + movq (%rsp), %rbx CFI_RESTORE rbx - movq 1*8(%rsp),%r12 + movq 1*8(%rsp), %r12 CFI_RESTORE r12 - addq $2*8,%rsp + addq $2*8, %rsp CFI_ADJUST_CFA_OFFSET -2*8 ret .Lcopy_page_end: @@ -103,7 +101,7 @@ ENDPROC(copy_page) .section .altinstr_replacement,"ax" 1: .byte 0xeb /* jmp <disp8> */ - .byte (copy_page_c - copy_page) - (2f - 1b) /* offset */ + .byte (copy_page_rep - copy_page) - (2f - 1b) /* offset */ 2: .previous .section .altinstructions,"a" diff --git a/arch/x86/lib/usercopy_32.c b/arch/x86/lib/usercopy_32.c index 98f6d6b68f5a..f0312d746402 100644 --- a/arch/x86/lib/usercopy_32.c +++ b/arch/x86/lib/usercopy_32.c @@ -570,63 +570,6 @@ do { \ unsigned long __copy_to_user_ll(void __user *to, const void *from, unsigned long n) { -#ifndef CONFIG_X86_WP_WORKS_OK - if (unlikely(boot_cpu_data.wp_works_ok == 0) && - ((unsigned long)to) < TASK_SIZE) { - /* - * When we are in an atomic section (see - * mm/filemap.c:file_read_actor), return the full - * length to take the slow path. - */ - if (in_atomic()) - return n; - - /* - * CPU does not honor the WP bit when writing - * from supervisory mode, and due to preemption or SMP, - * the page tables can change at any time. - * Do it manually. 
Manfred <manfred@colorfullife.com> - */ - while (n) { - unsigned long offset = ((unsigned long)to)%PAGE_SIZE; - unsigned long len = PAGE_SIZE - offset; - int retval; - struct page *pg; - void *maddr; - - if (len > n) - len = n; - -survive: - down_read(¤t->mm->mmap_sem); - retval = get_user_pages(current, current->mm, - (unsigned long)to, 1, 1, 0, &pg, NULL); - - if (retval == -ENOMEM && is_global_init(current)) { - up_read(¤t->mm->mmap_sem); - congestion_wait(BLK_RW_ASYNC, HZ/50); - goto survive; - } - - if (retval != 1) { - up_read(¤t->mm->mmap_sem); - break; - } - - maddr = kmap_atomic(pg); - memcpy(maddr + offset, from, len); - kunmap_atomic(maddr); - set_page_dirty_lock(pg); - put_page(pg); - up_read(¤t->mm->mmap_sem); - - from += len; - to += len; - n -= len; - } - return n; - } -#endif stac(); if (movsl_is_ok(to, from, n)) __copy_user(to, from, n); diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c index 8e13ecb41bee..027088f2f7dd 100644 --- a/arch/x86/mm/fault.c +++ b/arch/x86/mm/fault.c @@ -18,7 +18,7 @@ #include <asm/pgalloc.h> /* pgd_*(), ... */ #include <asm/kmemcheck.h> /* kmemcheck_*(), ... */ #include <asm/fixmap.h> /* VSYSCALL_START */ -#include <asm/rcu.h> /* exception_enter(), ... */ +#include <asm/context_tracking.h> /* exception_enter(), ... */ /* * Page fault error code bits: @@ -803,20 +803,6 @@ bad_area_access_error(struct pt_regs *regs, unsigned long error_code, __bad_area(regs, error_code, address, SEGV_ACCERR); } -/* TODO: fixup for "mm-invoke-oom-killer-from-page-fault.patch" */ -static void -out_of_memory(struct pt_regs *regs, unsigned long error_code, - unsigned long address) -{ - /* - * We ran out of memory, call the OOM killer, and return the userspace - * (which will retry the fault, or kill us if we got oom-killed): - */ - up_read(¤t->mm->mmap_sem); - - pagefault_out_of_memory(); -} - static void do_sigbus(struct pt_regs *regs, unsigned long error_code, unsigned long address, unsigned int fault) @@ -879,7 +865,14 @@ mm_fault_error(struct pt_regs *regs, unsigned long error_code, return 1; } - out_of_memory(regs, error_code, address); + up_read(¤t->mm->mmap_sem); + + /* + * We ran out of memory, call the OOM killer, and return the + * userspace (which will retry the fault, or kill us if we got + * oom-killed): + */ + pagefault_out_of_memory(); } else { if (fault & (VM_FAULT_SIGBUS|VM_FAULT_HWPOISON| VM_FAULT_HWPOISON_LARGE)) diff --git a/arch/x86/mm/hugetlbpage.c b/arch/x86/mm/hugetlbpage.c index 937bff5cdaa7..ae1aa71d0115 100644 --- a/arch/x86/mm/hugetlbpage.c +++ b/arch/x86/mm/hugetlbpage.c @@ -274,42 +274,15 @@ static unsigned long hugetlb_get_unmapped_area_bottomup(struct file *file, unsigned long pgoff, unsigned long flags) { struct hstate *h = hstate_file(file); - struct mm_struct *mm = current->mm; - struct vm_area_struct *vma; - unsigned long start_addr; - - if (len > mm->cached_hole_size) { - start_addr = mm->free_area_cache; - } else { - start_addr = TASK_UNMAPPED_BASE; - mm->cached_hole_size = 0; - } - -full_search: - addr = ALIGN(start_addr, huge_page_size(h)); - - for (vma = find_vma(mm, addr); ; vma = vma->vm_next) { - /* At this point: (!vma || addr < vma->vm_end). */ - if (TASK_SIZE - len < addr) { - /* - * Start a new search - just in case we missed - * some holes. 
- */ - if (start_addr != TASK_UNMAPPED_BASE) { - start_addr = TASK_UNMAPPED_BASE; - mm->cached_hole_size = 0; - goto full_search; - } - return -ENOMEM; - } - if (!vma || addr + len <= vma->vm_start) { - mm->free_area_cache = addr + len; - return addr; - } - if (addr + mm->cached_hole_size < vma->vm_start) - mm->cached_hole_size = vma->vm_start - addr; - addr = ALIGN(vma->vm_end, huge_page_size(h)); - } + struct vm_unmapped_area_info info; + + info.flags = 0; + info.length = len; + info.low_limit = TASK_UNMAPPED_BASE; + info.high_limit = TASK_SIZE; + info.align_mask = PAGE_MASK & ~huge_page_mask(h); + info.align_offset = 0; + return vm_unmapped_area(&info); } static unsigned long hugetlb_get_unmapped_area_topdown(struct file *file, @@ -317,83 +290,30 @@ static unsigned long hugetlb_get_unmapped_area_topdown(struct file *file, unsigned long pgoff, unsigned long flags) { struct hstate *h = hstate_file(file); - struct mm_struct *mm = current->mm; - struct vm_area_struct *vma; - unsigned long base = mm->mmap_base; - unsigned long addr = addr0; - unsigned long largest_hole = mm->cached_hole_size; - unsigned long start_addr; - - /* don't allow allocations above current base */ - if (mm->free_area_cache > base) - mm->free_area_cache = base; - - if (len <= largest_hole) { - largest_hole = 0; - mm->free_area_cache = base; - } -try_again: - start_addr = mm->free_area_cache; - - /* make sure it can fit in the remaining address space */ - if (mm->free_area_cache < len) - goto fail; - - /* either no address requested or can't fit in requested address hole */ - addr = (mm->free_area_cache - len) & huge_page_mask(h); - do { - /* - * Lookup failure means no vma is above this address, - * i.e. return with success: - */ - vma = find_vma(mm, addr); - if (!vma) - return addr; + struct vm_unmapped_area_info info; + unsigned long addr; - if (addr + len <= vma->vm_start) { - /* remember the address as a hint for next time */ - mm->cached_hole_size = largest_hole; - return (mm->free_area_cache = addr); - } else if (mm->free_area_cache == vma->vm_end) { - /* pull free_area_cache down to the first hole */ - mm->free_area_cache = vma->vm_start; - mm->cached_hole_size = largest_hole; - } + info.flags = VM_UNMAPPED_AREA_TOPDOWN; + info.length = len; + info.low_limit = PAGE_SIZE; + info.high_limit = current->mm->mmap_base; + info.align_mask = PAGE_MASK & ~huge_page_mask(h); + info.align_offset = 0; + addr = vm_unmapped_area(&info); - /* remember the largest hole we saw so far */ - if (addr + largest_hole < vma->vm_start) - largest_hole = vma->vm_start - addr; - - /* try just below the current vma->vm_start */ - addr = (vma->vm_start - len) & huge_page_mask(h); - } while (len <= vma->vm_start); - -fail: - /* - * if hint left us with no space for the requested - * mapping then try again: - */ - if (start_addr != base) { - mm->free_area_cache = base; - largest_hole = 0; - goto try_again; - } /* * A failed mmap() very likely causes application failure, * so fall back to the bottom-up function here. This scenario * can happen with large stack limits and large mmap() * allocations. 
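Both hugetlb helpers above are converted to the new vm_unmapped_area() interface; the only non-obvious field is align_mask, which is expressed in page-masked form. For 2 MiB huge pages (an assumed size, just for illustration) huge_page_mask(h) is ~0x1fffff, so align_mask = PAGE_MASK & ~huge_page_mask(h) = 0x1ff000: bits 20..12 of the returned address must be clear, which together with page granularity yields 2 MiB alignment. A two-line check of that arithmetic:

#include <stdint.h>
#include <stdio.h>

int main(void)
{
        uint64_t page_mask = ~0xfffULL;                 /* PAGE_MASK, 4 KiB pages */
        uint64_t huge_page_mask = ~((2ULL << 20) - 1);  /* 2 MiB huge pages (example) */
        uint64_t align_mask = page_mask & ~huge_page_mask;

        printf("align_mask = 0x%llx\n", (unsigned long long)align_mask); /* 0x1ff000 */
        return 0;
}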
*/ - mm->free_area_cache = TASK_UNMAPPED_BASE; - mm->cached_hole_size = ~0UL; - addr = hugetlb_get_unmapped_area_bottomup(file, addr0, - len, pgoff, flags); - - /* - * Restore the topdown base: - */ - mm->free_area_cache = base; - mm->cached_hole_size = ~0UL; + if (addr & ~PAGE_MASK) { + VM_BUG_ON(addr != -ENOMEM); + info.flags = 0; + info.low_limit = TASK_UNMAPPED_BASE; + info.high_limit = TASK_SIZE; + addr = vm_unmapped_area(&info); + } return addr; } diff --git a/arch/x86/mm/init.c b/arch/x86/mm/init.c index ab1f6a93b527..d7aea41563b3 100644 --- a/arch/x86/mm/init.c +++ b/arch/x86/mm/init.c @@ -35,40 +35,44 @@ struct map_range { unsigned page_size_mask; }; -static void __init find_early_table_space(struct map_range *mr, unsigned long end, - int use_pse, int use_gbpages) +/* + * First calculate space needed for kernel direct mapping page tables to cover + * mr[0].start to mr[nr_range - 1].end, while accounting for possible 2M and 1GB + * pages. Then find enough contiguous space for those page tables. + */ +static void __init find_early_table_space(struct map_range *mr, int nr_range) { - unsigned long puds, pmds, ptes, tables, start = 0, good_end = end; + int i; + unsigned long puds = 0, pmds = 0, ptes = 0, tables; + unsigned long start = 0, good_end; phys_addr_t base; - puds = (end + PUD_SIZE - 1) >> PUD_SHIFT; - tables = roundup(puds * sizeof(pud_t), PAGE_SIZE); - - if (use_gbpages) { - unsigned long extra; - - extra = end - ((end>>PUD_SHIFT) << PUD_SHIFT); - pmds = (extra + PMD_SIZE - 1) >> PMD_SHIFT; - } else - pmds = (end + PMD_SIZE - 1) >> PMD_SHIFT; + for (i = 0; i < nr_range; i++) { + unsigned long range, extra; - tables += roundup(pmds * sizeof(pmd_t), PAGE_SIZE); + range = mr[i].end - mr[i].start; + puds += (range + PUD_SIZE - 1) >> PUD_SHIFT; - if (use_pse) { - unsigned long extra; + if (mr[i].page_size_mask & (1 << PG_LEVEL_1G)) { + extra = range - ((range >> PUD_SHIFT) << PUD_SHIFT); + pmds += (extra + PMD_SIZE - 1) >> PMD_SHIFT; + } else { + pmds += (range + PMD_SIZE - 1) >> PMD_SHIFT; + } - extra = end - ((end>>PMD_SHIFT) << PMD_SHIFT); + if (mr[i].page_size_mask & (1 << PG_LEVEL_2M)) { + extra = range - ((range >> PMD_SHIFT) << PMD_SHIFT); #ifdef CONFIG_X86_32 - extra += PMD_SIZE; + extra += PMD_SIZE; #endif - /* The first 2/4M doesn't use large pages. */ - if (mr->start < PMD_SIZE) - extra += mr->end - mr->start; - - ptes = (extra + PAGE_SIZE - 1) >> PAGE_SHIFT; - } else - ptes = (end + PAGE_SIZE - 1) >> PAGE_SHIFT; + ptes += (extra + PAGE_SIZE - 1) >> PAGE_SHIFT; + } else { + ptes += (range + PAGE_SIZE - 1) >> PAGE_SHIFT; + } + } + tables = roundup(puds * sizeof(pud_t), PAGE_SIZE); + tables += roundup(pmds * sizeof(pmd_t), PAGE_SIZE); tables += roundup(ptes * sizeof(pte_t), PAGE_SIZE); #ifdef CONFIG_X86_32 @@ -86,7 +90,7 @@ static void __init find_early_table_space(struct map_range *mr, unsigned long en pgt_buf_top = pgt_buf_start + (tables >> PAGE_SHIFT); printk(KERN_DEBUG "kernel direct mapping tables up to %#lx @ [mem %#010lx-%#010lx]\n", - end - 1, pgt_buf_start << PAGE_SHIFT, + mr[nr_range - 1].end - 1, pgt_buf_start << PAGE_SHIFT, (pgt_buf_top << PAGE_SHIFT) - 1); } @@ -267,7 +271,7 @@ unsigned long __init_refok init_memory_mapping(unsigned long start, * nodes are discovered. 
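find_early_table_space() now walks every map_range and sums the PUD, PMD and PTE pages each range needs, instead of sizing everything from a single end address. As a concrete, invented example: one 1 GiB range mapped with 2 MiB pages needs 1 PUD entry, 512 PMD entries and no extra PTEs (the range is 2 MiB aligned, so the "extra" tail is zero), which after rounding each table to a page comes to 8 KiB. The same arithmetic as a standalone check, assuming 64-bit with 8-byte table entries:

#include <stdint.h>
#include <stdio.h>

#define PAGE_SHIFT 12
#define PMD_SHIFT  21
#define PUD_SHIFT  30
#define ROUNDUP(x, a) (((x) + (a) - 1) / (a) * (a))

int main(void)
{
        uint64_t range = 1ULL << 30;                                    /* 1 GiB */
        uint64_t puds  = (range + (1ULL << PUD_SHIFT) - 1) >> PUD_SHIFT;
        uint64_t pmds  = (range + (1ULL << PMD_SHIFT) - 1) >> PMD_SHIFT;
        uint64_t extra = range - ((range >> PMD_SHIFT) << PMD_SHIFT);   /* 2M tail */
        uint64_t ptes  = (extra + (1ULL << PAGE_SHIFT) - 1) >> PAGE_SHIFT;

        uint64_t tables = ROUNDUP(puds * 8, 4096) +
                          ROUNDUP(pmds * 8, 4096) +
                          ROUNDUP(ptes * 8, 4096);

        printf("puds=%llu pmds=%llu ptes=%llu tables=%llu bytes\n",
               (unsigned long long)puds, (unsigned long long)pmds,
               (unsigned long long)ptes, (unsigned long long)tables);
        /* prints: puds=1 pmds=512 ptes=0 tables=8192 bytes */
        return 0;
}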
*/ if (!after_bootmem) - find_early_table_space(&mr[0], end, use_pse, use_gbpages); + find_early_table_space(mr, nr_range); for (i = 0; i < nr_range; i++) ret = kernel_physical_mapping_init(mr[i].start, mr[i].end, diff --git a/arch/x86/mm/init_32.c b/arch/x86/mm/init_32.c index 11a58001b4ce..745d66b843c8 100644 --- a/arch/x86/mm/init_32.c +++ b/arch/x86/mm/init_32.c @@ -715,10 +715,7 @@ static void __init test_wp_bit(void) if (!boot_cpu_data.wp_works_ok) { printk(KERN_CONT "No.\n"); -#ifdef CONFIG_X86_WP_WORKS_OK - panic( - "This kernel doesn't support CPU's with broken WP. Recompile it for a 386!"); -#endif + panic("Linux doesn't support CPUs with broken WP."); } else { printk(KERN_CONT "Ok.\n"); } diff --git a/arch/x86/mm/init_64.c b/arch/x86/mm/init_64.c index 2b6b4a3c8beb..2ead3c8a4c84 100644 --- a/arch/x86/mm/init_64.c +++ b/arch/x86/mm/init_64.c @@ -386,7 +386,8 @@ phys_pte_init(pte_t *pte_page, unsigned long addr, unsigned long end, * these mappings are more intelligent. */ if (pte_val(*pte)) { - pages++; + if (!after_bootmem) + pages++; continue; } @@ -451,6 +452,8 @@ phys_pmd_init(pmd_t *pmd_page, unsigned long address, unsigned long end, * attributes. */ if (page_size_mask & (1 << PG_LEVEL_2M)) { + if (!after_bootmem) + pages++; last_map_addr = next; continue; } @@ -526,6 +529,8 @@ phys_pud_init(pud_t *pud_page, unsigned long addr, unsigned long end, * attributes. */ if (page_size_mask & (1 << PG_LEVEL_1G)) { + if (!after_bootmem) + pages++; last_map_addr = next; continue; } @@ -625,7 +630,9 @@ void __init paging_init(void) * numa support is not compiled in, and later node_set_state * will not set it back. */ - node_clear_state(0, N_NORMAL_MEMORY); + node_clear_state(0, N_MEMORY); + if (N_MEMORY != N_NORMAL_MEMORY) + node_clear_state(0, N_NORMAL_MEMORY); zone_sizes_init(); } diff --git a/arch/x86/mm/pgtable.c b/arch/x86/mm/pgtable.c index 8573b83a63d0..217eb705fac0 100644 --- a/arch/x86/mm/pgtable.c +++ b/arch/x86/mm/pgtable.c @@ -137,7 +137,7 @@ static void pgd_dtor(pgd_t *pgd) * against pageattr.c; it is the unique case in which a valid change * of kernel pagetables can't be lazily synchronized by vmalloc faults. * vmalloc faults work because attached pagetables are never freed. 
- * -- wli + * -- nyc */ #ifdef CONFIG_X86_PAE diff --git a/arch/x86/mm/tlb.c b/arch/x86/mm/tlb.c index 0777f042e400..13a6b29e2e5d 100644 --- a/arch/x86/mm/tlb.c +++ b/arch/x86/mm/tlb.c @@ -104,7 +104,7 @@ static void flush_tlb_func(void *info) return; if (this_cpu_read(cpu_tlbstate.state) == TLBSTATE_OK) { - if (f->flush_end == TLB_FLUSH_ALL || !cpu_has_invlpg) + if (f->flush_end == TLB_FLUSH_ALL) local_flush_tlb(); else if (!f->flush_end) __flush_tlb_single(f->flush_start); @@ -197,7 +197,7 @@ void flush_tlb_mm_range(struct mm_struct *mm, unsigned long start, } if (end == TLB_FLUSH_ALL || tlb_flushall_shift == -1 - || vmflag == VM_HUGETLB) { + || vmflag & VM_HUGETLB) { local_flush_tlb(); goto flush_all; } @@ -337,10 +337,8 @@ static const struct file_operations fops_tlbflush = { static int __cpuinit create_tlb_flushall_shift(void) { - if (cpu_has_invlpg) { - debugfs_create_file("tlb_flushall_shift", S_IRUSR | S_IWUSR, - arch_debugfs_dir, NULL, &fops_tlbflush); - } + debugfs_create_file("tlb_flushall_shift", S_IRUSR | S_IWUSR, + arch_debugfs_dir, NULL, &fops_tlbflush); return 0; } late_initcall(create_tlb_flushall_shift); diff --git a/arch/x86/net/bpf_jit_comp.c b/arch/x86/net/bpf_jit_comp.c index 520d2bd0b9c5..d11a47099d33 100644 --- a/arch/x86/net/bpf_jit_comp.c +++ b/arch/x86/net/bpf_jit_comp.c @@ -11,6 +11,7 @@ #include <asm/cacheflush.h> #include <linux/netdevice.h> #include <linux/filter.h> +#include <linux/if_vlan.h> /* * Conventions : @@ -212,6 +213,8 @@ void bpf_jit_compile(struct sk_filter *fp) case BPF_S_ANC_MARK: case BPF_S_ANC_RXHASH: case BPF_S_ANC_CPU: + case BPF_S_ANC_VLAN_TAG: + case BPF_S_ANC_VLAN_TAG_PRESENT: case BPF_S_ANC_QUEUE: case BPF_S_LD_W_ABS: case BPF_S_LD_H_ABS: @@ -515,6 +518,24 @@ void bpf_jit_compile(struct sk_filter *fp) CLEAR_A(); #endif break; + case BPF_S_ANC_VLAN_TAG: + case BPF_S_ANC_VLAN_TAG_PRESENT: + BUILD_BUG_ON(FIELD_SIZEOF(struct sk_buff, vlan_tci) != 2); + if (is_imm8(offsetof(struct sk_buff, vlan_tci))) { + /* movzwl off8(%rdi),%eax */ + EMIT4(0x0f, 0xb7, 0x47, offsetof(struct sk_buff, vlan_tci)); + } else { + EMIT3(0x0f, 0xb7, 0x87); /* movzwl off32(%rdi),%eax */ + EMIT(offsetof(struct sk_buff, vlan_tci), 4); + } + BUILD_BUG_ON(VLAN_TAG_PRESENT != 0x1000); + if (filter[i].code == BPF_S_ANC_VLAN_TAG) { + EMIT3(0x80, 0xe4, 0xef); /* and $0xef,%ah */ + } else { + EMIT3(0xc1, 0xe8, 0x0c); /* shr $0xc,%eax */ + EMIT3(0x83, 0xe0, 0x01); /* and $0x1,%eax */ + } + break; case BPF_S_LD_W_ABS: func = CHOOSE_LOAD_FUNC(K, sk_load_word); common_load: seen |= SEEN_DATAREF; diff --git a/arch/x86/pci/Makefile b/arch/x86/pci/Makefile index 3af5a1e79c9c..ee0af58ca5bd 100644 --- a/arch/x86/pci/Makefile +++ b/arch/x86/pci/Makefile @@ -16,6 +16,7 @@ obj-$(CONFIG_STA2X11) += sta2x11-fixup.o obj-$(CONFIG_X86_VISWS) += visws.o obj-$(CONFIG_X86_NUMAQ) += numaq_32.o +obj-$(CONFIG_X86_NUMACHIP) += numachip.o obj-$(CONFIG_X86_INTEL_MID) += mrst.o diff --git a/arch/x86/pci/acpi.c b/arch/x86/pci/acpi.c index 192397c98606..0c01261fe5a8 100644 --- a/arch/x86/pci/acpi.c +++ b/arch/x86/pci/acpi.c @@ -12,6 +12,7 @@ struct pci_root_info { char name[16]; unsigned int res_num; struct resource *res; + resource_size_t *res_offset; struct pci_sysdata sd; #ifdef CONFIG_PCI_MMCONFIG bool mcfg_added; @@ -22,6 +23,7 @@ struct pci_root_info { }; static bool pci_use_crs = true; +static bool pci_ignore_seg = false; static int __init set_use_crs(const struct dmi_system_id *id) { @@ -35,7 +37,14 @@ static int __init set_nouse_crs(const struct dmi_system_id *id) return 0; } -static const 
struct dmi_system_id pci_use_crs_table[] __initconst = { +static int __init set_ignore_seg(const struct dmi_system_id *id) +{ + printk(KERN_INFO "PCI: %s detected: ignoring ACPI _SEG\n", id->ident); + pci_ignore_seg = true; + return 0; +} + +static const struct dmi_system_id pci_crs_quirks[] __initconst = { /* http://bugzilla.kernel.org/show_bug.cgi?id=14183 */ { .callback = set_use_crs, @@ -98,6 +107,16 @@ static const struct dmi_system_id pci_use_crs_table[] __initconst = { DMI_MATCH(DMI_BIOS_VERSION, "6JET85WW (1.43 )"), }, }, + + /* https://bugzilla.kernel.org/show_bug.cgi?id=15362 */ + { + .callback = set_ignore_seg, + .ident = "HP xw9300", + .matches = { + DMI_MATCH(DMI_SYS_VENDOR, "Hewlett-Packard"), + DMI_MATCH(DMI_PRODUCT_NAME, "HP xw9300 Workstation"), + }, + }, {} }; @@ -108,7 +127,7 @@ void __init pci_acpi_crs_quirks(void) if (dmi_get_date(DMI_BIOS_DATE, &year, NULL, NULL) && year < 2008) pci_use_crs = false; - dmi_check_system(pci_use_crs_table); + dmi_check_system(pci_crs_quirks); /* * If the user specifies "pci=use_crs" or "pci=nocrs" explicitly, that @@ -305,6 +324,7 @@ setup_resource(struct acpi_resource *acpi_res, void *data) res->flags = flags; res->start = start; res->end = end; + info->res_offset[info->res_num] = addr.translation_offset; if (!pci_use_crs) { dev_printk(KERN_DEBUG, &info->bridge->dev, @@ -374,7 +394,8 @@ static void add_resources(struct pci_root_info *info, "ignoring host bridge window %pR (conflicts with %s %pR)\n", res, conflict->name, conflict); else - pci_add_resource(resources, res); + pci_add_resource_offset(resources, res, + info->res_offset[i]); } } @@ -382,6 +403,8 @@ static void free_pci_root_info_res(struct pci_root_info *info) { kfree(info->res); info->res = NULL; + kfree(info->res_offset); + info->res_offset = NULL; info->res_num = 0; } @@ -432,10 +455,20 @@ probe_pci_root_info(struct pci_root_info *info, struct acpi_device *device, return; size = sizeof(*info->res) * info->res_num; - info->res_num = 0; info->res = kzalloc(size, GFP_KERNEL); - if (!info->res) + if (!info->res) { + info->res_num = 0; + return; + } + + size = sizeof(*info->res_offset) * info->res_num; + info->res_num = 0; + info->res_offset = kzalloc(size, GFP_KERNEL); + if (!info->res_offset) { + kfree(info->res); + info->res = NULL; return; + } acpi_walk_resources(device->handle, METHOD_NAME__CRS, setup_resource, info); @@ -455,6 +488,9 @@ struct pci_bus * __devinit pci_acpi_scan_root(struct acpi_pci_root *root) int pxm; #endif + if (pci_ignore_seg) + domain = 0; + if (domain && !pci_domains_supported) { printk(KERN_WARNING "pci_bus %04x:%02x: " "ignored (multiple domains not supported)\n", diff --git a/arch/x86/pci/ce4100.c b/arch/x86/pci/ce4100.c index 41bd2a2d2c50..b914e20b5a00 100644 --- a/arch/x86/pci/ce4100.c +++ b/arch/x86/pci/ce4100.c @@ -115,6 +115,16 @@ static void sata_revid_read(struct sim_dev_reg *reg, u32 *value) reg_read(reg, value); } +static void reg_noirq_read(struct sim_dev_reg *reg, u32 *value) +{ + unsigned long flags; + + raw_spin_lock_irqsave(&pci_config_lock, flags); + /* force interrupt pin value to 0 */ + *value = reg->sim_reg.value & 0xfff00ff; + raw_spin_unlock_irqrestore(&pci_config_lock, flags); +} + static struct sim_dev_reg bus1_fixups[] = { DEFINE_REG(2, 0, 0x10, (16*MB), reg_init, reg_read, reg_write) DEFINE_REG(2, 0, 0x14, (256), reg_init, reg_read, reg_write) @@ -144,6 +154,7 @@ static struct sim_dev_reg bus1_fixups[] = { DEFINE_REG(11, 5, 0x10, (64*KB), reg_init, reg_read, reg_write) DEFINE_REG(11, 6, 0x10, (256), reg_init, reg_read, 
reg_write) DEFINE_REG(11, 7, 0x10, (64*KB), reg_init, reg_read, reg_write) + DEFINE_REG(11, 7, 0x3c, 256, reg_init, reg_noirq_read, reg_write) DEFINE_REG(12, 0, 0x10, (128*KB), reg_init, reg_read, reg_write) DEFINE_REG(12, 0, 0x14, (256), reg_init, reg_read, reg_write) DEFINE_REG(12, 1, 0x10, (1024), reg_init, reg_read, reg_write) @@ -161,8 +172,10 @@ static struct sim_dev_reg bus1_fixups[] = { DEFINE_REG(16, 0, 0x10, (64*KB), reg_init, reg_read, reg_write) DEFINE_REG(16, 0, 0x14, (64*MB), reg_init, reg_read, reg_write) DEFINE_REG(16, 0, 0x18, (64*MB), reg_init, reg_read, reg_write) + DEFINE_REG(16, 0, 0x3c, 256, reg_init, reg_noirq_read, reg_write) DEFINE_REG(17, 0, 0x10, (128*KB), reg_init, reg_read, reg_write) DEFINE_REG(18, 0, 0x10, (1*KB), reg_init, reg_read, reg_write) + DEFINE_REG(18, 0, 0x3c, 256, reg_init, reg_noirq_read, reg_write) }; static void __init init_sim_regs(void) diff --git a/arch/x86/pci/common.c b/arch/x86/pci/common.c index 720e973fc34a..1b1dda90a945 100644 --- a/arch/x86/pci/common.c +++ b/arch/x86/pci/common.c @@ -17,6 +17,7 @@ #include <asm/io.h> #include <asm/smp.h> #include <asm/pci_x86.h> +#include <asm/setup.h> unsigned int pci_probe = PCI_PROBE_BIOS | PCI_PROBE_CONF1 | PCI_PROBE_CONF2 | PCI_PROBE_MMCONF; @@ -608,6 +609,35 @@ unsigned int pcibios_assign_all_busses(void) return (pci_probe & PCI_ASSIGN_ALL_BUSSES) ? 1 : 0; } +int pcibios_add_device(struct pci_dev *dev) +{ + struct setup_data *data; + struct pci_setup_rom *rom; + u64 pa_data; + + pa_data = boot_params.hdr.setup_data; + while (pa_data) { + data = phys_to_virt(pa_data); + + if (data->type == SETUP_PCI) { + rom = (struct pci_setup_rom *)data; + + if ((pci_domain_nr(dev->bus) == rom->segment) && + (dev->bus->number == rom->bus) && + (PCI_SLOT(dev->devfn) == rom->device) && + (PCI_FUNC(dev->devfn) == rom->function) && + (dev->vendor == rom->vendor) && + (dev->device == rom->devid)) { + dev->rom = pa_data + + offsetof(struct pci_setup_rom, romdata); + dev->romlen = rom->pcilen; + } + } + pa_data = data->next; + } + return 0; +} + int pcibios_enable_device(struct pci_dev *dev, int mask) { int err; @@ -626,7 +656,7 @@ void pcibios_disable_device (struct pci_dev *dev) pcibios_disable_irq(dev); } -int pci_ext_cfg_avail(struct pci_dev *dev) +int pci_ext_cfg_avail(void) { if (raw_pci_ext_ops) return 1; diff --git a/arch/x86/pci/numachip.c b/arch/x86/pci/numachip.c new file mode 100644 index 000000000000..7307d9d12d15 --- /dev/null +++ b/arch/x86/pci/numachip.c @@ -0,0 +1,129 @@ +/* + * This file is subject to the terms and conditions of the GNU General Public + * License. See the file "COPYING" in the main directory of this archive + * for more details. + * + * Numascale NumaConnect-specific PCI code + * + * Copyright (C) 2012 Numascale AS. All rights reserved. + * + * Send feedback to <support@numascale.com> + * + * PCI accessor functions derived from mmconfig_64.c + * + */ + +#include <linux/pci.h> +#include <asm/pci_x86.h> + +static u8 limit __read_mostly; + +static inline char __iomem *pci_dev_base(unsigned int seg, unsigned int bus, unsigned int devfn) +{ + struct pci_mmcfg_region *cfg = pci_mmconfig_lookup(seg, bus); + + if (cfg && cfg->virt) + return cfg->virt + (PCI_MMCFG_BUS_OFFSET(bus) | (devfn << 12)); + return NULL; +} + +static int pci_mmcfg_read_numachip(unsigned int seg, unsigned int bus, + unsigned int devfn, int reg, int len, u32 *value) +{ + char __iomem *addr; + + /* Why do we have this when nobody checks it. How about a BUG()!? 
-AK */ + if (unlikely((bus > 255) || (devfn > 255) || (reg > 4095))) { +err: *value = -1; + return -EINVAL; + } + + /* Ensure AMD Northbridges don't decode reads to other devices */ + if (unlikely(bus == 0 && devfn >= limit)) { + *value = -1; + return 0; + } + + rcu_read_lock(); + addr = pci_dev_base(seg, bus, devfn); + if (!addr) { + rcu_read_unlock(); + goto err; + } + + switch (len) { + case 1: + *value = mmio_config_readb(addr + reg); + break; + case 2: + *value = mmio_config_readw(addr + reg); + break; + case 4: + *value = mmio_config_readl(addr + reg); + break; + } + rcu_read_unlock(); + + return 0; +} + +static int pci_mmcfg_write_numachip(unsigned int seg, unsigned int bus, + unsigned int devfn, int reg, int len, u32 value) +{ + char __iomem *addr; + + /* Why do we have this when nobody checks it. How about a BUG()!? -AK */ + if (unlikely((bus > 255) || (devfn > 255) || (reg > 4095))) + return -EINVAL; + + /* Ensure AMD Northbridges don't decode writes to other devices */ + if (unlikely(bus == 0 && devfn >= limit)) + return 0; + + rcu_read_lock(); + addr = pci_dev_base(seg, bus, devfn); + if (!addr) { + rcu_read_unlock(); + return -EINVAL; + } + + switch (len) { + case 1: + mmio_config_writeb(addr + reg, value); + break; + case 2: + mmio_config_writew(addr + reg, value); + break; + case 4: + mmio_config_writel(addr + reg, value); + break; + } + rcu_read_unlock(); + + return 0; +} + +const struct pci_raw_ops pci_mmcfg_numachip = { + .read = pci_mmcfg_read_numachip, + .write = pci_mmcfg_write_numachip, +}; + +int __init pci_numachip_init(void) +{ + int ret = 0; + u32 val; + + /* For remote I/O, restrict bus 0 access to the actual number of AMD + Northbridges, which starts at device number 0x18 */ + ret = raw_pci_read(0, 0, PCI_DEVFN(0x18, 0), 0x60, sizeof(val), &val); + if (ret) + goto out; + + /* HyperTransport fabric size in bits 6:4 */ + limit = PCI_DEVFN(0x18 + ((val >> 4) & 7) + 1, 0); + + /* Use NumaChip PCI accessors for non-extended and extended access */ + raw_pci_ops = raw_pci_ext_ops = &pci_mmcfg_numachip; +out: + return ret; +} diff --git a/arch/x86/platform/ce4100/ce4100.c b/arch/x86/platform/ce4100/ce4100.c index 4c61b52191eb..f8ab4945892e 100644 --- a/arch/x86/platform/ce4100/ce4100.c +++ b/arch/x86/platform/ce4100/ce4100.c @@ -21,12 +21,25 @@ #include <asm/i8259.h> #include <asm/io.h> #include <asm/io_apic.h> +#include <asm/emergency-restart.h> static int ce4100_i8042_detect(void) { return 0; } +/* + * The CE4100 platform has an internal 8051 Microcontroller which is + * responsible for signaling to the external Power Management Unit the + * intention to reset, reboot or power off the system. This 8051 device has + * its command register mapped at I/O port 0xcf9 and the value 0x4 is used + * to power off the system. 
+ */ +static void ce4100_power_off(void) +{ + outb(0x4, 0xcf9); +} + #ifdef CONFIG_SERIAL_8250 static unsigned int mem_serial_in(struct uart_port *p, int offset) @@ -92,8 +105,11 @@ static void ce4100_serial_fixup(int port, struct uart_port *up, up->membase = (void __iomem *)__fix_to_virt(FIX_EARLYCON_MEM_BASE); up->membase += up->mapbase & ~PAGE_MASK; + up->mapbase += port * 0x100; + up->membase += port * 0x100; up->iotype = UPIO_MEM32; up->regshift = 2; + up->irq = 4; } #endif up->iobase = 0; @@ -139,8 +155,19 @@ void __init x86_ce4100_early_setup(void) x86_init.mpparse.find_smp_config = x86_init_noop; x86_init.pci.init = ce4100_pci_init; + /* + * By default, the reboot method is ACPI which is supported by the + * CE4100 bootloader CEFDK using FADT.ResetReg Address and ResetValue + * the bootloader will however issue a system power off instead of + * reboot. By using BOOT_KBD we ensure proper system reboot as + * expected. + */ + reboot_type = BOOT_KBD; + #ifdef CONFIG_X86_IO_APIC x86_init.pci.init_irq = sdv_pci_init; x86_init.mpparse.setup_ioapic_ids = setup_ioapic_ids_from_mpc_nocheck; #endif + + pm_power_off = ce4100_power_off; } diff --git a/arch/x86/platform/efi/efi.c b/arch/x86/platform/efi/efi.c index aded2a91162a..ad4439145f85 100644 --- a/arch/x86/platform/efi/efi.c +++ b/arch/x86/platform/efi/efi.c @@ -70,11 +70,15 @@ EXPORT_SYMBOL(efi); struct efi_memory_map memmap; bool efi_64bit; -static bool efi_native; static struct efi efi_phys __initdata; static efi_system_table_t efi_systab __initdata; +static inline bool efi_is_native(void) +{ + return IS_ENABLED(CONFIG_X86_64) == efi_64bit; +} + static int __init setup_noefi(char *arg) { efi_enabled = 0; @@ -420,7 +424,7 @@ void __init efi_reserve_boot_services(void) } } -static void __init efi_unmap_memmap(void) +void __init efi_unmap_memmap(void) { if (memmap.map) { early_iounmap(memmap.map, memmap.nr_map * memmap.desc_size); @@ -432,7 +436,7 @@ void __init efi_free_boot_services(void) { void *p; - if (!efi_native) + if (!efi_is_native()) return; for (p = memmap.map; p < memmap.map_end; p += memmap.desc_size) { @@ -684,12 +688,10 @@ void __init efi_init(void) return; } efi_phys.systab = (efi_system_table_t *)boot_params.efi_info.efi_systab; - efi_native = !efi_64bit; #else efi_phys.systab = (efi_system_table_t *) (boot_params.efi_info.efi_systab | ((__u64)boot_params.efi_info.efi_systab_hi<<32)); - efi_native = efi_64bit; #endif if (efi_systab_init(efi_phys.systab)) { @@ -723,7 +725,7 @@ void __init efi_init(void) * that doesn't match the kernel 32/64-bit mode. */ - if (!efi_native) + if (!efi_is_native()) pr_info("No EFI runtime due to 32/64-bit mismatch with kernel\n"); else if (efi_runtime_init()) { efi_enabled = 0; @@ -735,7 +737,7 @@ void __init efi_init(void) return; } #ifdef CONFIG_X86_32 - if (efi_native) { + if (efi_is_native()) { x86_platform.get_wallclock = efi_get_time; x86_platform.set_wallclock = efi_set_rtc_mmss; } @@ -810,6 +812,16 @@ void __iomem *efi_lookup_mapped_addr(u64 phys_addr) return NULL; } +void efi_memory_uc(u64 addr, unsigned long size) +{ + unsigned long page_shift = 1UL << EFI_PAGE_SHIFT; + u64 npages; + + npages = round_up(size, page_shift) / page_shift; + memrange_efi_to_native(&addr, &npages); + set_memory_uc(addr, npages); +} + /* * This function will switch the EFI runtime services to virtual mode. 
* Essentially, look through the EFI memmap and map every region that @@ -823,7 +835,7 @@ void __init efi_enter_virtual_mode(void) efi_memory_desc_t *md, *prev_md = NULL; efi_status_t status; unsigned long size; - u64 end, systab, addr, npages, end_pfn; + u64 end, systab, end_pfn; void *p, *va, *new_memmap = NULL; int count = 0; @@ -834,7 +846,7 @@ void __init efi_enter_virtual_mode(void) * non-native EFI */ - if (!efi_native) { + if (!efi_is_native()) { efi_unmap_memmap(); return; } @@ -879,10 +891,14 @@ void __init efi_enter_virtual_mode(void) end_pfn = PFN_UP(end); if (end_pfn <= max_low_pfn_mapped || (end_pfn > (1UL << (32 - PAGE_SHIFT)) - && end_pfn <= max_pfn_mapped)) + && end_pfn <= max_pfn_mapped)) { va = __va(md->phys_addr); - else - va = efi_ioremap(md->phys_addr, size, md->type); + + if (!(md->attribute & EFI_MEMORY_WB)) + efi_memory_uc((u64)(unsigned long)va, size); + } else + va = efi_ioremap(md->phys_addr, size, + md->type, md->attribute); md->virt_addr = (u64) (unsigned long) va; @@ -892,13 +908,6 @@ void __init efi_enter_virtual_mode(void) continue; } - if (!(md->attribute & EFI_MEMORY_WB)) { - addr = md->virt_addr; - npages = md->num_pages; - memrange_efi_to_native(&addr, &npages); - set_memory_uc(addr, npages); - } - systab = (u64) (unsigned long) efi_phys.systab; if (md->phys_addr <= systab && systab < end) { systab += md->virt_addr - md->phys_addr; diff --git a/arch/x86/platform/efi/efi_64.c b/arch/x86/platform/efi/efi_64.c index ac3aa54e2654..95fd505dfeb6 100644 --- a/arch/x86/platform/efi/efi_64.c +++ b/arch/x86/platform/efi/efi_64.c @@ -82,7 +82,7 @@ void __init efi_call_phys_epilog(void) } void __iomem *__init efi_ioremap(unsigned long phys_addr, unsigned long size, - u32 type) + u32 type, u64 attribute) { unsigned long last_map_pfn; @@ -92,8 +92,11 @@ void __iomem *__init efi_ioremap(unsigned long phys_addr, unsigned long size, last_map_pfn = init_memory_mapping(phys_addr, phys_addr + size); if ((last_map_pfn << PAGE_SHIFT) < phys_addr + size) { unsigned long top = last_map_pfn << PAGE_SHIFT; - efi_ioremap(top, size - (top - phys_addr), type); + efi_ioremap(top, size - (top - phys_addr), type, attribute); } + if (!(attribute & EFI_MEMORY_WB)) + efi_memory_uc((u64)(unsigned long)__va(phys_addr), size); + return (void __iomem *)__va(phys_addr); } diff --git a/arch/x86/power/cpu.c b/arch/x86/power/cpu.c index 218cdb16163c..120cee1c3f8d 100644 --- a/arch/x86/power/cpu.c +++ b/arch/x86/power/cpu.c @@ -21,6 +21,7 @@ #include <asm/suspend.h> #include <asm/debugreg.h> #include <asm/fpu-internal.h> /* pcntxt_mask */ +#include <asm/cpu.h> #ifdef CONFIG_X86_32 static struct saved_context saved_context; @@ -237,3 +238,84 @@ void restore_processor_state(void) #ifdef CONFIG_X86_32 EXPORT_SYMBOL(restore_processor_state); #endif + +/* + * When bsp_check() is called in hibernate and suspend, cpu hotplug + * is disabled already. So it's unnessary to handle race condition between + * cpumask query and cpu hotplug. + */ +static int bsp_check(void) +{ + if (cpumask_first(cpu_online_mask) != 0) { + pr_warn("CPU0 is offline.\n"); + return -ENODEV; + } + + return 0; +} + +static int bsp_pm_callback(struct notifier_block *nb, unsigned long action, + void *ptr) +{ + int ret = 0; + + switch (action) { + case PM_SUSPEND_PREPARE: + case PM_HIBERNATION_PREPARE: + ret = bsp_check(); + break; +#ifdef CONFIG_DEBUG_HOTPLUG_CPU0 + case PM_RESTORE_PREPARE: + /* + * When system resumes from hibernation, online CPU0 because + * 1. it's required for resume and + * 2. 
the CPU was online before hibernation + */ + if (!cpu_online(0)) + _debug_hotplug_cpu(0, 1); + break; + case PM_POST_RESTORE: + /* + * When a resume really happens, this code won't be called. + * + * This code is called only when user space hibernation software + * prepares for snapshot device during boot time. So we just + * call _debug_hotplug_cpu() to restore to CPU0's state prior to + * preparing the snapshot device. + * + * This works for normal boot case in our CPU0 hotplug debug + * mode, i.e. CPU0 is offline and user mode hibernation + * software initializes during boot time. + * + * If CPU0 is online and user application accesses snapshot + * device after boot time, this will offline CPU0 and user may + * see different CPU0 state before and after accessing + * the snapshot device. But hopefully this is not a case when + * user debugging CPU0 hotplug. Even if users hit this case, + * they can easily online CPU0 back. + * + * To simplify this debug code, we only consider normal boot + * case. Otherwise we need to remember CPU0's state and restore + * to that state and resolve racy conditions etc. + */ + _debug_hotplug_cpu(0, 0); + break; +#endif + default: + break; + } + return notifier_from_errno(ret); +} + +static int __init bsp_pm_check_init(void) +{ + /* + * Set this bsp_pm_callback as lower priority than + * cpu_hotplug_pm_callback. So cpu_hotplug_pm_callback will be called + * earlier to disable cpu hotplug before bsp online check. + */ + pm_notifier(bsp_pm_callback, -INT_MAX); + return 0; +} + +core_initcall(bsp_pm_check_init); diff --git a/arch/x86/syscalls/syscall_32.tbl b/arch/x86/syscalls/syscall_32.tbl index a47103fbc692..ee3c220ee500 100644 --- a/arch/x86/syscalls/syscall_32.tbl +++ b/arch/x86/syscalls/syscall_32.tbl @@ -8,7 +8,7 @@ # 0 i386 restart_syscall sys_restart_syscall 1 i386 exit sys_exit -2 i386 fork ptregs_fork stub32_fork +2 i386 fork sys_fork stub32_fork 3 i386 read sys_read 4 i386 write sys_write 5 i386 open sys_open compat_sys_open @@ -126,7 +126,7 @@ 117 i386 ipc sys_ipc sys32_ipc 118 i386 fsync sys_fsync 119 i386 sigreturn ptregs_sigreturn stub32_sigreturn -120 i386 clone ptregs_clone stub32_clone +120 i386 clone sys_clone stub32_clone 121 i386 setdomainname sys_setdomainname 122 i386 uname sys_newuname 123 i386 modify_ldt sys_modify_ldt @@ -196,7 +196,7 @@ 187 i386 sendfile sys_sendfile sys32_sendfile 188 i386 getpmsg 189 i386 putpmsg -190 i386 vfork ptregs_vfork stub32_vfork +190 i386 vfork sys_vfork stub32_vfork 191 i386 ugetrlimit sys_getrlimit compat_sys_getrlimit 192 i386 mmap2 sys_mmap_pgoff 193 i386 truncate64 sys_truncate64 sys32_truncate64 diff --git a/arch/x86/tools/gen-insn-attr-x86.awk b/arch/x86/tools/gen-insn-attr-x86.awk index ddcf39b1a18d..e6773dc8ac41 100644 --- a/arch/x86/tools/gen-insn-attr-x86.awk +++ b/arch/x86/tools/gen-insn-attr-x86.awk @@ -356,7 +356,7 @@ END { exit 1 # print escape opcode map's array print "/* Escape opcode map array */" - print "const insn_attr_t const *inat_escape_tables[INAT_ESC_MAX + 1]" \ + print "const insn_attr_t * const inat_escape_tables[INAT_ESC_MAX + 1]" \ "[INAT_LSTPFX_MAX + 1] = {" for (i = 0; i < geid; i++) for (j = 0; j < max_lprefix; j++) @@ -365,7 +365,7 @@ END { print "};\n" # print group opcode map's array print "/* Group opcode map array */" - print "const insn_attr_t const *inat_group_tables[INAT_GRP_MAX + 1]"\ + print "const insn_attr_t * const inat_group_tables[INAT_GRP_MAX + 1]"\ "[INAT_LSTPFX_MAX + 1] = {" for (i = 0; i < ggid; i++) for (j = 0; j < max_lprefix; j++) @@ -374,7 +374,7 @@ 
END { print "};\n" # print AVX opcode map's array print "/* AVX opcode map array */" - print "const insn_attr_t const *inat_avx_tables[X86_VEX_M_MAX + 1]"\ + print "const insn_attr_t * const inat_avx_tables[X86_VEX_M_MAX + 1]"\ "[INAT_LSTPFX_MAX + 1] = {" for (i = 0; i < gaid; i++) for (j = 0; j < max_lprefix; j++) diff --git a/arch/x86/um/Kconfig b/arch/x86/um/Kconfig index 07611759ce35..983997041963 100644 --- a/arch/x86/um/Kconfig +++ b/arch/x86/um/Kconfig @@ -25,13 +25,14 @@ config X86_32 select HAVE_AOUT select ARCH_WANT_IPC_PARSE_VERSION select MODULES_USE_ELF_REL + select CLONE_BACKWARDS config X86_64 def_bool 64BIT select MODULES_USE_ELF_RELA config RWSEM_XCHGADD_ALGORITHM - def_bool X86_XADD && 64BIT + def_bool 64BIT config RWSEM_GENERIC_SPINLOCK def_bool !RWSEM_XCHGADD_ALGORITHM diff --git a/arch/x86/um/shared/sysdep/syscalls.h b/arch/x86/um/shared/sysdep/syscalls.h index ca255a805ed9..bd9a89b67e41 100644 --- a/arch/x86/um/shared/sysdep/syscalls.h +++ b/arch/x86/um/shared/sysdep/syscalls.h @@ -1,5 +1,3 @@ -extern long sys_clone(unsigned long clone_flags, unsigned long newsp, - void __user *parent_tid, void __user *child_tid); #ifdef __i386__ #include "syscalls_32.h" #else diff --git a/arch/x86/um/sys_call_table_32.c b/arch/x86/um/sys_call_table_32.c index 232e60504b3a..812e98c098e4 100644 --- a/arch/x86/um/sys_call_table_32.c +++ b/arch/x86/um/sys_call_table_32.c @@ -24,13 +24,10 @@ #define old_mmap sys_old_mmap -#define ptregs_fork sys_fork #define ptregs_iopl sys_iopl #define ptregs_vm86old sys_vm86old -#define ptregs_clone i386_clone #define ptregs_vm86 sys_vm86 #define ptregs_sigaltstack sys_sigaltstack -#define ptregs_vfork sys_vfork #define __SYSCALL_I386(nr, sym, compat) extern asmlinkage void sym(void) ; #include <asm/syscalls_32.h> diff --git a/arch/x86/um/syscalls_32.c b/arch/x86/um/syscalls_32.c index db444c7218fe..e8bcea99acdb 100644 --- a/arch/x86/um/syscalls_32.c +++ b/arch/x86/um/syscalls_32.c @@ -6,21 +6,6 @@ #include <linux/syscalls.h> #include <sysdep/syscalls.h> -/* - * The prototype on i386 is: - * - * int clone(int flags, void * child_stack, int * parent_tidptr, struct user_desc * newtls - * - * and the "newtls" arg. on i386 is read by copy_thread directly from the - * register saved on the stack. 
- */ -long i386_clone(unsigned long clone_flags, unsigned long newsp, - int __user *parent_tid, void *newtls, int __user *child_tid) -{ - return sys_clone(clone_flags, newsp, parent_tid, child_tid); -} - - long sys_sigaction(int sig, const struct old_sigaction __user *act, struct old_sigaction __user *oact) { diff --git a/arch/x86/vdso/vclock_gettime.c b/arch/x86/vdso/vclock_gettime.c index 4df6c373421a..205ad328aa52 100644 --- a/arch/x86/vdso/vclock_gettime.c +++ b/arch/x86/vdso/vclock_gettime.c @@ -22,6 +22,7 @@ #include <asm/hpet.h> #include <asm/unistd.h> #include <asm/io.h> +#include <asm/pvclock.h> #define gtod (&VVAR(vsyscall_gtod_data)) @@ -62,6 +63,76 @@ static notrace cycle_t vread_hpet(void) return readl((const void __iomem *)fix_to_virt(VSYSCALL_HPET) + 0xf0); } +#ifdef CONFIG_PARAVIRT_CLOCK + +static notrace const struct pvclock_vsyscall_time_info *get_pvti(int cpu) +{ + const struct pvclock_vsyscall_time_info *pvti_base; + int idx = cpu / (PAGE_SIZE/PVTI_SIZE); + int offset = cpu % (PAGE_SIZE/PVTI_SIZE); + + BUG_ON(PVCLOCK_FIXMAP_BEGIN + idx > PVCLOCK_FIXMAP_END); + + pvti_base = (struct pvclock_vsyscall_time_info *) + __fix_to_virt(PVCLOCK_FIXMAP_BEGIN+idx); + + return &pvti_base[offset]; +} + +static notrace cycle_t vread_pvclock(int *mode) +{ + const struct pvclock_vsyscall_time_info *pvti; + cycle_t ret; + u64 last; + u32 version; + u32 migrate_count; + u8 flags; + unsigned cpu, cpu1; + + + /* + * When looping to get a consistent (time-info, tsc) pair, we + * also need to deal with the possibility we can switch vcpus, + * so make sure we always re-fetch time-info for the current vcpu. + */ + do { + cpu = __getcpu() & VGETCPU_CPU_MASK; + /* TODO: We can put vcpu id into higher bits of pvti.version. + * This will save a couple of cycles by getting rid of + * __getcpu() calls (Gleb). + */ + + pvti = get_pvti(cpu); + + migrate_count = pvti->migrate_count; + + version = __pvclock_read_cycles(&pvti->pvti, &ret, &flags); + + /* + * Test we're still on the cpu as well as the version. + * We could have been migrated just after the first + * vgetcpu but before fetching the version, so we + * wouldn't notice a version change. 
+ */ + cpu1 = __getcpu() & VGETCPU_CPU_MASK; + } while (unlikely(cpu != cpu1 || + (pvti->pvti.version & 1) || + pvti->pvti.version != version || + pvti->migrate_count != migrate_count)); + + if (unlikely(!(flags & PVCLOCK_TSC_STABLE_BIT))) + *mode = VCLOCK_NONE; + + /* refer to tsc.c read_tsc() comment for rationale */ + last = VVAR(vsyscall_gtod_data).clock.cycle_last; + + if (likely(ret >= last)) + return ret; + + return last; +} +#endif + notrace static long vdso_fallback_gettime(long clock, struct timespec *ts) { long ret; @@ -80,7 +151,7 @@ notrace static long vdso_fallback_gtod(struct timeval *tv, struct timezone *tz) } -notrace static inline u64 vgetsns(void) +notrace static inline u64 vgetsns(int *mode) { long v; cycles_t cycles; @@ -88,6 +159,10 @@ notrace static inline u64 vgetsns(void) cycles = vread_tsc(); else if (gtod->clock.vclock_mode == VCLOCK_HPET) cycles = vread_hpet(); +#ifdef CONFIG_PARAVIRT_CLOCK + else if (gtod->clock.vclock_mode == VCLOCK_PVCLOCK) + cycles = vread_pvclock(mode); +#endif else return 0; v = (cycles - gtod->clock.cycle_last) & gtod->clock.mask; @@ -107,7 +182,7 @@ notrace static int __always_inline do_realtime(struct timespec *ts) mode = gtod->clock.vclock_mode; ts->tv_sec = gtod->wall_time_sec; ns = gtod->wall_time_snsec; - ns += vgetsns(); + ns += vgetsns(&mode); ns >>= gtod->clock.shift; } while (unlikely(read_seqcount_retry(>od->seq, seq))); @@ -127,7 +202,7 @@ notrace static int do_monotonic(struct timespec *ts) mode = gtod->clock.vclock_mode; ts->tv_sec = gtod->monotonic_time_sec; ns = gtod->monotonic_time_snsec; - ns += vgetsns(); + ns += vgetsns(&mode); ns >>= gtod->clock.shift; } while (unlikely(read_seqcount_retry(>od->seq, seq))); timespec_add_ns(ts, ns); diff --git a/arch/x86/vdso/vgetcpu.c b/arch/x86/vdso/vgetcpu.c index 5463ad558573..2f94b039e55b 100644 --- a/arch/x86/vdso/vgetcpu.c +++ b/arch/x86/vdso/vgetcpu.c @@ -17,15 +17,10 @@ __vdso_getcpu(unsigned *cpu, unsigned *node, struct getcpu_cache *unused) { unsigned int p; - if (VVAR(vgetcpu_mode) == VGETCPU_RDTSCP) { - /* Load per CPU data from RDTSCP */ - native_read_tscp(&p); - } else { - /* Load per CPU data from GDT */ - asm("lsl %1,%0" : "=r" (p) : "r" (__PER_CPU_SEG)); - } + p = __getcpu(); + if (cpu) - *cpu = p & 0xfff; + *cpu = p & VGETCPU_CPU_MASK; if (node) *node = p >> 12; return 0; diff --git a/arch/x86/vdso/vma.c b/arch/x86/vdso/vma.c index 00aaf047b39f..431e87544411 100644 --- a/arch/x86/vdso/vma.c +++ b/arch/x86/vdso/vma.c @@ -141,7 +141,7 @@ static unsigned long vdso_addr(unsigned long start, unsigned len) * unaligned here as a result of stack start randomization. */ addr = PAGE_ALIGN(addr); - addr = align_addr(addr, NULL, ALIGN_VDSO); + addr = align_vdso_addr(addr); return addr; } diff --git a/arch/x86/xen/Kconfig b/arch/x86/xen/Kconfig index fdce49c7aff6..131dacd2748a 100644 --- a/arch/x86/xen/Kconfig +++ b/arch/x86/xen/Kconfig @@ -6,8 +6,9 @@ config XEN bool "Xen guest support" select PARAVIRT select PARAVIRT_CLOCK + select XEN_HAVE_PVMMU depends on X86_64 || (X86_32 && X86_PAE && !X86_VISWS) - depends on X86_CMPXCHG && X86_TSC + depends on X86_TSC help This is the Linux Xen port. 
Enabling this will allow the kernel to boot in a paravirtualized environment under the diff --git a/arch/x86/xen/enlighten.c b/arch/x86/xen/enlighten.c index 586d83812b67..3aeaa933b527 100644 --- a/arch/x86/xen/enlighten.c +++ b/arch/x86/xen/enlighten.c @@ -223,6 +223,21 @@ static void __init xen_banner(void) version >> 16, version & 0xffff, extra.extraversion, xen_feature(XENFEAT_mmu_pt_update_preserve_ad) ? " (preserve-AD)" : ""); } +/* Check if running on Xen version (major, minor) or later */ +bool +xen_running_on_version_or_later(unsigned int major, unsigned int minor) +{ + unsigned int version; + + if (!xen_domain()) + return false; + + version = HYPERVISOR_xen_version(XENVER_version, NULL); + if ((((version >> 16) == major) && ((version & 0xffff) >= minor)) || + ((version >> 16) > major)) + return true; + return false; +} #define CPUID_THERM_POWER_LEAF 6 #define APERFMPERF_PRESENT 0 @@ -287,8 +302,7 @@ static void xen_cpuid(unsigned int *ax, unsigned int *bx, static bool __init xen_check_mwait(void) { -#if defined(CONFIG_ACPI) && !defined(CONFIG_ACPI_PROCESSOR_AGGREGATOR) && \ - !defined(CONFIG_ACPI_PROCESSOR_AGGREGATOR_MODULE) +#ifdef CONFIG_ACPI struct xen_platform_op op = { .cmd = XENPF_set_processor_pminfo, .u.set_pminfo.id = -1, @@ -309,6 +323,13 @@ static bool __init xen_check_mwait(void) if (!xen_initial_domain()) return false; + /* + * When running under platform earlier than Xen4.2, do not expose + * mwait, to avoid the risk of loading native acpi pad driver + */ + if (!xen_running_on_version_or_later(4, 2)) + return false; + ax = 1; cx = 0; @@ -1495,51 +1516,72 @@ asmlinkage void __init xen_start_kernel(void) #endif } -void __ref xen_hvm_init_shared_info(void) +#ifdef CONFIG_XEN_PVHVM +#define HVM_SHARED_INFO_ADDR 0xFE700000UL +static struct shared_info *xen_hvm_shared_info; +static unsigned long xen_hvm_sip_phys; +static int xen_major, xen_minor; + +static void xen_hvm_connect_shared_info(unsigned long pfn) { - int cpu; struct xen_add_to_physmap xatp; - static struct shared_info *shared_info_page = 0; - if (!shared_info_page) - shared_info_page = (struct shared_info *) - extend_brk(PAGE_SIZE, PAGE_SIZE); xatp.domid = DOMID_SELF; xatp.idx = 0; xatp.space = XENMAPSPACE_shared_info; - xatp.gpfn = __pa(shared_info_page) >> PAGE_SHIFT; + xatp.gpfn = pfn; if (HYPERVISOR_memory_op(XENMEM_add_to_physmap, &xatp)) BUG(); - HYPERVISOR_shared_info = (struct shared_info *)shared_info_page; +} +static void __init xen_hvm_set_shared_info(struct shared_info *sip) +{ + int cpu; + + HYPERVISOR_shared_info = sip; /* xen_vcpu is a pointer to the vcpu_info struct in the shared_info * page, we use it in the event channel upcall and in some pvclock * related functions. We don't need the vcpu_info placement * optimizations because we don't use any pv_mmu or pv_irq op on - * HVM. - * When xen_hvm_init_shared_info is run at boot time only vcpu 0 is - * online but xen_hvm_init_shared_info is run at resume time too and - * in that case multiple vcpus might be online. */ - for_each_online_cpu(cpu) { + * HVM. */ + for_each_online_cpu(cpu) per_cpu(xen_vcpu, cpu) = &HYPERVISOR_shared_info->vcpu_info[cpu]; +} + +/* Reconnect the shared_info pfn to a (new) mfn */ +void xen_hvm_resume_shared_info(void) +{ + xen_hvm_connect_shared_info(xen_hvm_sip_phys >> PAGE_SHIFT); +} + +/* Xen tools prior to Xen 4 do not provide a E820_Reserved area for guest usage. + * On these old tools the shared info page will be placed in E820_Ram. 
+ * Xen 4 provides a E820_Reserved area at 0xFC000000, and this code expects + * that nothing is mapped up to HVM_SHARED_INFO_ADDR. + * Xen 4.3+ provides an explicit 1MB area at HVM_SHARED_INFO_ADDR which is used + * here for the shared info page. */ +static void __init xen_hvm_init_shared_info(void) +{ + if (xen_major < 4) { + xen_hvm_shared_info = extend_brk(PAGE_SIZE, PAGE_SIZE); + xen_hvm_sip_phys = __pa(xen_hvm_shared_info); + } else { + xen_hvm_sip_phys = HVM_SHARED_INFO_ADDR; + set_fixmap(FIX_PARAVIRT_BOOTMAP, xen_hvm_sip_phys); + xen_hvm_shared_info = + (struct shared_info *)fix_to_virt(FIX_PARAVIRT_BOOTMAP); } + xen_hvm_connect_shared_info(xen_hvm_sip_phys >> PAGE_SHIFT); + xen_hvm_set_shared_info(xen_hvm_shared_info); } -#ifdef CONFIG_XEN_PVHVM static void __init init_hvm_pv_info(void) { - int major, minor; - uint32_t eax, ebx, ecx, edx, pages, msr, base; + uint32_t ecx, edx, pages, msr, base; u64 pfn; base = xen_cpuid_base(); - cpuid(base + 1, &eax, &ebx, &ecx, &edx); - - major = eax >> 16; - minor = eax & 0xffff; - printk(KERN_INFO "Xen version %d.%d.\n", major, minor); - cpuid(base + 2, &pages, &msr, &ecx, &edx); pfn = __pa(hypercall_page); @@ -1590,12 +1632,22 @@ static void __init xen_hvm_guest_init(void) static bool __init xen_hvm_platform(void) { + uint32_t eax, ebx, ecx, edx, base; + if (xen_pv_domain()) return false; - if (!xen_cpuid_base()) + base = xen_cpuid_base(); + if (!base) return false; + cpuid(base + 1, &eax, &ebx, &ecx, &edx); + + xen_major = eax >> 16; + xen_minor = eax & 0xffff; + + printk(KERN_INFO "Xen version %d.%d.\n", xen_major, xen_minor); + return true; } diff --git a/arch/x86/xen/mmu.c b/arch/x86/xen/mmu.c index 6226c99729b9..01de35c77221 100644 --- a/arch/x86/xen/mmu.c +++ b/arch/x86/xen/mmu.c @@ -1288,6 +1288,25 @@ unsigned long xen_read_cr2_direct(void) return this_cpu_read(xen_vcpu_info.arch.cr2); } +void xen_flush_tlb_all(void) +{ + struct mmuext_op *op; + struct multicall_space mcs; + + trace_xen_mmu_flush_tlb_all(0); + + preempt_disable(); + + mcs = xen_mc_entry(sizeof(*op)); + + op = mcs.args; + op->cmd = MMUEXT_TLB_FLUSH_ALL; + MULTI_mmuext_op(mcs.mc, op, 1, NULL, DOMID_SELF); + + xen_mc_issue(PARAVIRT_LAZY_MMU); + + preempt_enable(); +} static void xen_flush_tlb(void) { struct mmuext_op *op; @@ -2478,8 +2497,10 @@ static int remap_area_mfn_pte_fn(pte_t *ptep, pgtable_t token, int xen_remap_domain_mfn_range(struct vm_area_struct *vma, unsigned long addr, - unsigned long mfn, int nr, - pgprot_t prot, unsigned domid) + xen_pfn_t mfn, int nr, + pgprot_t prot, unsigned domid, + struct page **pages) + { struct remap_data rmd; struct mmu_update mmu_update[REMAP_BATCH_SIZE]; @@ -2518,8 +2539,19 @@ int xen_remap_domain_mfn_range(struct vm_area_struct *vma, err = 0; out: - flush_tlb_all(); + xen_flush_tlb_all(); return err; } EXPORT_SYMBOL_GPL(xen_remap_domain_mfn_range); + +/* Returns: 0 success */ +int xen_unmap_domain_mfn_range(struct vm_area_struct *vma, + int numpgs, struct page **pages) +{ + if (!pages || !xen_feature(XENFEAT_auto_translated_physmap)) + return 0; + + return -EINVAL; +} +EXPORT_SYMBOL_GPL(xen_unmap_domain_mfn_range); diff --git a/arch/x86/xen/suspend.c b/arch/x86/xen/suspend.c index 45329c8c226e..ae8a00c39de4 100644 --- a/arch/x86/xen/suspend.c +++ b/arch/x86/xen/suspend.c @@ -30,7 +30,7 @@ void xen_arch_hvm_post_suspend(int suspend_cancelled) { #ifdef CONFIG_XEN_PVHVM int cpu; - xen_hvm_init_shared_info(); + xen_hvm_resume_shared_info(); xen_callback_vector(); xen_unplug_emulated_devices(); if 
(xen_feature(XENFEAT_hvm_safe_pvclock)) { diff --git a/arch/x86/xen/xen-ops.h b/arch/x86/xen/xen-ops.h index a95b41744ad0..d2e73d19d366 100644 --- a/arch/x86/xen/xen-ops.h +++ b/arch/x86/xen/xen-ops.h @@ -40,7 +40,7 @@ void xen_enable_syscall(void); void xen_vcpu_restore(void); void xen_callback_vector(void); -void xen_hvm_init_shared_info(void); +void xen_hvm_resume_shared_info(void); void xen_unplug_emulated_devices(void); void __init xen_build_dynamic_phys_to_machine(void);
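
The set_ignore_seg()/pci_crs_quirks[] hunk in arch/x86/pci/acpi.c follows the usual DMI quirk pattern: a table of dmi_system_id entries, each with a callback that fires when every DMI_MATCH string matches the firmware's DMI data. A minimal, hedged sketch of that pattern (all identifiers below are invented; only the <linux/dmi.h> API is assumed):

#include <linux/dmi.h>
#include <linux/init.h>
#include <linux/printk.h>

static bool example_workaround;

static int __init enable_example_workaround(const struct dmi_system_id *id)
{
        printk(KERN_INFO "example: %s detected, enabling workaround\n", id->ident);
        example_workaround = true;
        return 0;
}

static const struct dmi_system_id example_quirks[] __initconst = {
        {
                .callback = enable_example_workaround,
                .ident = "Example Workstation",
                .matches = {
                        DMI_MATCH(DMI_SYS_VENDOR, "Example Vendor"),
                        DMI_MATCH(DMI_PRODUCT_NAME, "Example Workstation"),
                },
        },
        { }     /* zeroed terminator, required */
};

static int __init example_quirk_setup(void)
{
        /* Runs the callback of each entry whose .matches strings all
         * match the firmware DMI data (stopping early if a callback
         * returns non-zero) and returns the number of matches. */
        dmi_check_system(example_quirks);
        return 0;
}
arch_initcall(example_quirk_setup);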
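
The setup_resource()/add_resources() changes record each host bridge window's ACPI translation_offset and hand it to pci_add_resource_offset(), so the PCI core can convert between CPU and bus addresses on hosts where the two differ. A plain-C sketch of the address arithmetic this enables (struct and helper names are invented; the sign convention shown assumes offset = CPU address minus bus address, matching how the offset is consumed here):

#include <stdint.h>

struct bridge_window {
        uint64_t cpu_start;     /* window start as the CPU sees it */
        uint64_t size;
        uint64_t offset;        /* cpu_addr - bus_addr, often 0    */
};

/* value to program into a device BAR for a given CPU address */
static uint64_t cpu_to_bus(const struct bridge_window *w, uint64_t cpu_addr)
{
        return cpu_addr - w->offset;
}

/* CPU address corresponding to a bus address decoded by the bridge */
static uint64_t bus_to_cpu(const struct bridge_window *w, uint64_t bus_addr)
{
        return bus_addr + w->offset;
}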
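
pci_numachip_init() derives a bus-0 devfn cutoff from the HyperTransport fabric size in bits 6:4 of northbridge register 0x60; config reads at or beyond it return all ones and writes are dropped, so an AMD northbridge never decodes cycles meant for remote I/O. The arithmetic in isolation, with PCI_DEVFN expanded from its kernel definition (the helper name is made up):

#include <stdint.h>

#define PCI_DEVFN(slot, func)   ((((slot) & 0x1f) << 3) | ((func) & 0x07))

/* reg60: value read from bus 0, device 0x18, function 0, offset 0x60 */
static uint8_t northbridge_devfn_limit(uint32_t reg60)
{
        unsigned int nodes = ((reg60 >> 4) & 7) + 1;    /* fabric size field + 1 */

        /* AMD northbridges occupy devices 0x18 .. 0x18 + nodes - 1 on bus 0;
         * accesses at or above this devfn are short-circuited by the
         * accessors above. */
        return PCI_DEVFN(0x18 + nodes, 0);
}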
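
The arch/x86/platform/ce4100/ce4100.c hunk wires up two global hooks during early setup: pm_power_off (invoked from machine_power_off()) and reboot_type, because the board's firmware turns the ACPI reset path into a power-off. A stripped-down sketch of the same pattern for a hypothetical platform (the port and command value come from the comment in the hunk; everything else is illustrative):

#include <linux/init.h>
#include <linux/pm.h>
#include <asm/io.h>
#include <asm/emergency-restart.h>

#define EXAMPLE_PMU_CMD_PORT    0xcf9   /* 8051 command register on CE4100 */
#define EXAMPLE_PMU_POWER_OFF   0x4

static void example_power_off(void)
{
        outb(EXAMPLE_PMU_POWER_OFF, EXAMPLE_PMU_CMD_PORT);
}

void __init example_platform_early_setup(void)
{
        /* The bootloader's FADT reset register powers off instead of
         * rebooting, so force the keyboard-controller reboot method. */
        reboot_type = BOOT_KBD;
        pm_power_off = example_power_off;
}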
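
bsp_pm_check_init() in arch/x86/power/cpu.c depends on notifier priorities for ordering: registering at -INT_MAX means cpu_hotplug_pm_callback (default priority 0) has already disabled CPU hotplug before the CPU0 check runs, so no race with hotplug needs handling. A reduced sketch of that pattern (the callback name is invented; pm_notifier() and notifier_from_errno() are the standard helpers):

#include <linux/suspend.h>
#include <linux/notifier.h>
#include <linux/cpumask.h>
#include <linux/errno.h>
#include <linux/kernel.h>
#include <linux/init.h>

static int example_bsp_pm_callback(struct notifier_block *nb,
                                   unsigned long action, void *ptr)
{
        int ret = 0;

        switch (action) {
        case PM_SUSPEND_PREPARE:
        case PM_HIBERNATION_PREPARE:
                /* Resume always starts on the boot CPU, so veto the
                 * transition while CPU0 is offline. */
                if (cpumask_first(cpu_online_mask) != 0)
                        ret = -ENODEV;
                break;
        default:
                break;
        }
        /* 0 becomes NOTIFY_OK; an errno sets NOTIFY_STOP_MASK and aborts
         * the PM transition. */
        return notifier_from_errno(ret);
}

static int __init example_bsp_pm_init(void)
{
        /* Lower priority than the CPU hotplug PM notifier, so this runs
         * only after hotplug has been disabled. */
        pm_notifier(example_bsp_pm_callback, -INT_MAX);
        return 0;
}
core_initcall(example_bsp_pm_init);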
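
vread_pvclock() cannot take locks in the vDSO, so it loops until it observes a stable (cpu, version, migrate_count) triple around the time read. The same lock-free reader pattern reduced to plain userspace C, assuming a per-CPU record whose version field is odd while the writer updates it (current_cpu() and per_cpu_time_info() are placeholder helpers, and real code additionally needs the appropriate compiler/CPU barriers):

#include <stdint.h>

struct pv_time_info {
        volatile uint32_t version;      /* odd while the host is updating  */
        volatile uint32_t migrate_count;
        volatile uint64_t payload;      /* stands in for tsc/scale fields  */
};

unsigned int current_cpu(void);                 /* assumed helper */
struct pv_time_info *per_cpu_time_info(unsigned int cpu);

static uint64_t read_stable_payload(void)
{
        struct pv_time_info *ti;
        unsigned int cpu, cpu_after;
        uint32_t version, migrate;
        uint64_t val;

        do {
                cpu = current_cpu();
                ti = per_cpu_time_info(cpu);
                migrate = ti->migrate_count;
                version = ti->version;
                val = ti->payload;
                cpu_after = current_cpu();
        } while ((version & 1) ||                /* writer was mid-update   */
                 version != ti->version ||       /* record changed under us */
                 migrate != ti->migrate_count || /* vCPU data was migrated  */
                 cpu != cpu_after);              /* moved to another CPU    */

        return val;
}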
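
xen_running_on_version_or_later() and the new xen_major/xen_minor variables both use the packed version word returned by CPUID leaf base+1: major in the upper 16 bits, minor in the lower 16. The comparison on its own (function name is illustrative):

#include <stdbool.h>
#include <stdint.h>

static bool xen_version_at_least(uint32_t packed, unsigned int major,
                                 unsigned int minor)
{
        unsigned int running_major = packed >> 16;
        unsigned int running_minor = packed & 0xffff;

        /* e.g. xen_version_at_least(v, 4, 2) mirrors the check that gates
         * MWAIT exposure in xen_check_mwait() above */
        return running_major > major ||
               (running_major == major && running_minor >= minor);
}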