summaryrefslogtreecommitdiff
path: root/arch/x86
diff options
context:
space:
mode:
Diffstat (limited to 'arch/x86')
-rw-r--r--arch/x86/Kconfig63
-rw-r--r--arch/x86/boot/boot.h18
-rw-r--r--arch/x86/boot/cmdline.c12
-rw-r--r--arch/x86/boot/compressed/cmdline.c12
-rw-r--r--arch/x86/boot/compressed/head_64.S48
-rw-r--r--arch/x86/boot/header.S10
-rw-r--r--arch/x86/crypto/Makefile2
-rw-r--r--arch/x86/crypto/aes-i586-asm_32.S15
-rw-r--r--arch/x86/crypto/aes-x86_64-asm_64.S30
-rw-r--r--arch/x86/crypto/aesni-intel_asm.S23
-rw-r--r--arch/x86/crypto/aesni-intel_glue.c37
-rw-r--r--arch/x86/crypto/blowfish-x86_64-asm_64.S39
-rw-r--r--arch/x86/crypto/camellia-aesni-avx-asm_64.S38
-rw-r--r--arch/x86/crypto/camellia-x86_64-asm_64.S50
-rw-r--r--arch/x86/crypto/cast5-avx-x86_64-asm_64.S48
-rw-r--r--arch/x86/crypto/cast6-avx-x86_64-asm_64.S35
-rw-r--r--arch/x86/crypto/crc32-pclmul_asm.S246
-rw-r--r--arch/x86/crypto/crc32-pclmul_glue.c201
-rw-r--r--arch/x86/crypto/crc32c-pcl-intel-asm_64.S8
-rw-r--r--arch/x86/crypto/ghash-clmulni-intel_asm.S4
-rw-r--r--arch/x86/crypto/salsa20-i586-asm_32.S28
-rw-r--r--arch/x86/crypto/salsa20-x86_64-asm_64.S28
-rw-r--r--arch/x86/crypto/salsa20_glue.c5
-rw-r--r--arch/x86/crypto/serpent-avx-x86_64-asm_64.S35
-rw-r--r--arch/x86/crypto/serpent-sse2-i586-asm_32.S20
-rw-r--r--arch/x86/crypto/serpent-sse2-x86_64-asm_64.S20
-rw-r--r--arch/x86/crypto/sha1_ssse3_asm.S10
-rw-r--r--arch/x86/crypto/twofish-avx-x86_64-asm_64.S35
-rw-r--r--arch/x86/crypto/twofish-i586-asm_32.S11
-rw-r--r--arch/x86/crypto/twofish-x86_64-asm_64-3way.S20
-rw-r--r--arch/x86/crypto/twofish-x86_64-asm_64.S11
-rw-r--r--arch/x86/ia32/ia32_signal.c50
-rw-r--r--arch/x86/ia32/ia32entry.S12
-rw-r--r--arch/x86/ia32/sys_ia32.c171
-rw-r--r--arch/x86/include/asm/acpi.h4
-rw-r--r--arch/x86/include/asm/fpu-internal.h5
-rw-r--r--arch/x86/include/asm/ftrace.h24
-rw-r--r--arch/x86/include/asm/ia32.h15
-rw-r--r--arch/x86/include/asm/init.h28
-rw-r--r--arch/x86/include/asm/kexec.h6
-rw-r--r--arch/x86/include/asm/kvm_host.h26
-rw-r--r--arch/x86/include/asm/kvm_para.h2
-rw-r--r--arch/x86/include/asm/microcode.h14
-rw-r--r--arch/x86/include/asm/microcode_intel.h85
-rw-r--r--arch/x86/include/asm/mmzone_32.h6
-rw-r--r--arch/x86/include/asm/mwait.h3
-rw-r--r--arch/x86/include/asm/numa.h6
-rw-r--r--arch/x86/include/asm/numa_64.h6
-rw-r--r--arch/x86/include/asm/page.h7
-rw-r--r--arch/x86/include/asm/page_32.h1
-rw-r--r--arch/x86/include/asm/page_64.h36
-rw-r--r--arch/x86/include/asm/page_64_types.h22
-rw-r--r--arch/x86/include/asm/page_types.h2
-rw-r--r--arch/x86/include/asm/pci.h3
-rw-r--r--arch/x86/include/asm/pci_x86.h1
-rw-r--r--arch/x86/include/asm/pgtable.h17
-rw-r--r--arch/x86/include/asm/pgtable_64.h5
-rw-r--r--arch/x86/include/asm/pgtable_64_types.h4
-rw-r--r--arch/x86/include/asm/pgtable_types.h5
-rw-r--r--arch/x86/include/asm/processor.h27
-rw-r--r--arch/x86/include/asm/proto.h2
-rw-r--r--arch/x86/include/asm/realmode.h3
-rw-r--r--arch/x86/include/asm/signal.h22
-rw-r--r--arch/x86/include/asm/sys_ia32.h16
-rw-r--r--arch/x86/include/asm/syscalls.h13
-rw-r--r--arch/x86/include/asm/thread_info.h1
-rw-r--r--arch/x86/include/asm/tlbflush.h18
-rw-r--r--arch/x86/include/asm/uaccess.h55
-rw-r--r--arch/x86/include/asm/unistd.h2
-rw-r--r--arch/x86/include/asm/vmx.h18
-rw-r--r--arch/x86/include/asm/x86_init.h12
-rw-r--r--arch/x86/include/asm/xen/events.h3
-rw-r--r--arch/x86/include/asm/xen/page.h2
-rw-r--r--arch/x86/include/uapi/asm/msr-index.h4
-rw-r--r--arch/x86/include/uapi/asm/signal.h8
-rw-r--r--arch/x86/include/uapi/asm/vmx.h9
-rw-r--r--arch/x86/kernel/Makefile3
-rw-r--r--arch/x86/kernel/acpi/boot.c5
-rw-r--r--arch/x86/kernel/acpi/sleep.c2
-rw-r--r--arch/x86/kernel/amd_gart_64.c5
-rw-r--r--arch/x86/kernel/apic/apic_numachip.c1
-rw-r--r--arch/x86/kernel/apm_32.c57
-rw-r--r--arch/x86/kernel/cpu/amd.c64
-rw-r--r--arch/x86/kernel/cpu/bugs.c27
-rw-r--r--arch/x86/kernel/cpu/common.c17
-rw-r--r--arch/x86/kernel/cpu/intel.c3
-rw-r--r--arch/x86/kernel/cpu/mcheck/mce.c2
-rw-r--r--arch/x86/kernel/cpu/mcheck/p5.c2
-rw-r--r--arch/x86/kernel/cpu/mcheck/winchip.c2
-rw-r--r--arch/x86/kernel/cpu/mtrr/generic.c2
-rw-r--r--arch/x86/kernel/cpu/perf_event_amd_ibs.c2
-rw-r--r--arch/x86/kernel/cpu/perf_event_intel.c23
-rw-r--r--arch/x86/kernel/cpu/proc.c2
-rw-r--r--arch/x86/kernel/dumpstack.c2
-rw-r--r--arch/x86/kernel/e820.c16
-rw-r--r--arch/x86/kernel/entry_32.S45
-rw-r--r--arch/x86/kernel/entry_64.S34
-rw-r--r--arch/x86/kernel/ftrace.c4
-rw-r--r--arch/x86/kernel/head32.c20
-rw-r--r--arch/x86/kernel/head64.c144
-rw-r--r--arch/x86/kernel/head_32.S11
-rw-r--r--arch/x86/kernel/head_64.S212
-rw-r--r--arch/x86/kernel/i386_ksyms_32.c1
-rw-r--r--arch/x86/kernel/ioport.c3
-rw-r--r--arch/x86/kernel/kvm.c11
-rw-r--r--arch/x86/kernel/kvmclock.c15
-rw-r--r--arch/x86/kernel/machine_kexec_64.c171
-rw-r--r--arch/x86/kernel/microcode_core.c7
-rw-r--r--arch/x86/kernel/microcode_core_early.c76
-rw-r--r--arch/x86/kernel/microcode_intel.c198
-rw-r--r--arch/x86/kernel/microcode_intel_early.c796
-rw-r--r--arch/x86/kernel/microcode_intel_lib.c174
-rw-r--r--arch/x86/kernel/process.c122
-rw-r--r--arch/x86/kernel/process_64.c2
-rw-r--r--arch/x86/kernel/setup.c273
-rw-r--r--arch/x86/kernel/signal.c184
-rw-r--r--arch/x86/kernel/smpboot.c2
-rw-r--r--arch/x86/kernel/traps.c9
-rw-r--r--arch/x86/kernel/vm86_32.c8
-rw-r--r--arch/x86/kernel/x8664_ksyms_64.c3
-rw-r--r--arch/x86/kernel/x86_init.c4
-rw-r--r--arch/x86/kvm/emulate.c673
-rw-r--r--arch/x86/kvm/i8254.c1
-rw-r--r--arch/x86/kvm/i8259.c2
-rw-r--r--arch/x86/kvm/irq.c74
-rw-r--r--arch/x86/kvm/lapic.c140
-rw-r--r--arch/x86/kvm/lapic.h34
-rw-r--r--arch/x86/kvm/mmu.c168
-rw-r--r--arch/x86/kvm/mmutrace.h6
-rw-r--r--arch/x86/kvm/paging_tmpl.h106
-rw-r--r--arch/x86/kvm/svm.c24
-rw-r--r--arch/x86/kvm/vmx.c714
-rw-r--r--arch/x86/kvm/x86.c184
-rw-r--r--arch/x86/lguest/Kconfig1
-rw-r--r--arch/x86/lguest/boot.c3
-rw-r--r--arch/x86/lib/getuser.S43
-rw-r--r--arch/x86/mm/init.c469
-rw-r--r--arch/x86/mm/init_32.c118
-rw-r--r--arch/x86/mm/init_64.c652
-rw-r--r--arch/x86/mm/mm_internal.h19
-rw-r--r--arch/x86/mm/numa.c43
-rw-r--r--arch/x86/mm/numa_32.c161
-rw-r--r--arch/x86/mm/numa_64.c13
-rw-r--r--arch/x86/mm/numa_internal.h6
-rw-r--r--arch/x86/mm/pageattr.c113
-rw-r--r--arch/x86/mm/pat.c4
-rw-r--r--arch/x86/mm/pgtable.c7
-rw-r--r--arch/x86/mm/physaddr.c60
-rw-r--r--arch/x86/mm/srat.c125
-rw-r--r--arch/x86/net/bpf_jit_comp.c40
-rw-r--r--arch/x86/pci/acpi.c9
-rw-r--r--arch/x86/pci/common.c1
-rw-r--r--arch/x86/pci/i386.c185
-rw-r--r--arch/x86/pci/legacy.c2
-rw-r--r--arch/x86/pci/numaq_32.c2
-rw-r--r--arch/x86/platform/efi/efi.c11
-rw-r--r--arch/x86/platform/olpc/olpc-xo15-sci.c2
-rw-r--r--arch/x86/power/hibernate_32.c2
-rw-r--r--arch/x86/power/hibernate_64.c66
-rw-r--r--arch/x86/realmode/init.c49
-rw-r--r--arch/x86/syscalls/syscall_32.tbl22
-rw-r--r--arch/x86/syscalls/syscall_64.tbl6
-rw-r--r--arch/x86/um/Kconfig6
-rw-r--r--arch/x86/um/Makefile4
-rw-r--r--arch/x86/um/shared/sysdep/syscalls_32.h5
-rw-r--r--arch/x86/um/signal.c15
-rw-r--r--arch/x86/um/sys_call_table_32.c4
-rw-r--r--arch/x86/um/syscalls_32.c38
-rw-r--r--arch/x86/xen/mmu.c72
-rw-r--r--arch/x86/xen/setup.c5
-rw-r--r--arch/x86/xen/smp.c42
-rw-r--r--arch/x86/xen/spinlock.c1
172 files changed, 5869 insertions, 3392 deletions
diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
index 260857a53b87..6a9383370311 100644
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -115,8 +115,10 @@ config X86
select MODULES_USE_ELF_REL if X86_32
select MODULES_USE_ELF_RELA if X86_64
select CLONE_BACKWARDS if X86_32
- select GENERIC_SIGALTSTACK
select ARCH_USE_BUILTIN_BSWAP
+ select OLD_SIGSUSPEND3 if X86_32 || IA32_EMULATION
+ select OLD_SIGACTION if X86_32
+ select COMPAT_OLD_SIGACTION if IA32_EMULATION
config INSTRUCTION_DECODER
def_bool y
@@ -225,7 +227,7 @@ config ARCH_SUPPORTS_DEBUG_PAGEALLOC
config HAVE_INTEL_TXT
def_bool y
- depends on EXPERIMENTAL && INTEL_IOMMU && ACPI
+ depends on INTEL_IOMMU && ACPI
config X86_32_SMP
def_bool y
@@ -469,6 +471,16 @@ config X86_MDFLD
endif
+config X86_INTEL_LPSS
+ bool "Intel Low Power Subsystem Support"
+ depends on ACPI
+ select COMMON_CLK
+ ---help---
+ Select to build support for Intel Low Power Subsystem such as
+ found on Intel Lynxpoint PCH. Selecting this option enables
+ things like clock tree (common clock framework) which are needed
+ by the LPSS peripheral drivers.
+
config X86_RDC321X
bool "RDC R-321x SoC"
depends on X86_32
@@ -632,7 +644,7 @@ config PARAVIRT
config PARAVIRT_SPINLOCKS
bool "Paravirtualization layer for spinlocks"
- depends on PARAVIRT && SMP && EXPERIMENTAL
+ depends on PARAVIRT && SMP
---help---
Paravirtualized spinlocks allow a pvops backend to replace the
spinlock implementation with something virtualization-friendly
@@ -744,7 +756,7 @@ config GART_IOMMU
config CALGARY_IOMMU
bool "IBM Calgary IOMMU support"
select SWIOTLB
- depends on X86_64 && PCI && EXPERIMENTAL
+ depends on X86_64 && PCI
---help---
Support for hardware IOMMUs in IBM's xSeries x366 and x460
systems. Needed to run systems with more than 3GB of memory
@@ -786,7 +798,7 @@ config IOMMU_HELPER
config MAXSMP
bool "Enable Maximum number of SMP Processors and NUMA Nodes"
- depends on X86_64 && SMP && DEBUG_KERNEL && EXPERIMENTAL
+ depends on X86_64 && SMP && DEBUG_KERNEL
select CPUMASK_OFFSTACK
---help---
Enable maximum number of CPUS and NUMA Nodes for this architecture.
@@ -1044,6 +1056,24 @@ config MICROCODE_OLD_INTERFACE
def_bool y
depends on MICROCODE
+config MICROCODE_INTEL_LIB
+ def_bool y
+ depends on MICROCODE_INTEL
+
+config MICROCODE_INTEL_EARLY
+ bool "Early load microcode"
+ depends on MICROCODE_INTEL && BLK_DEV_INITRD
+ default y
+ help
+ This option provides functionality to read additional microcode data
+ at the beginning of initrd image. The data tells kernel to load
+ microcode to CPU's as early as possible. No functional change if no
+ microcode data is glued to the initrd, therefore it's safe to say Y.
+
+config MICROCODE_EARLY
+ def_bool y
+ depends on MICROCODE_INTEL_EARLY
+
config X86_MSR
tristate "/dev/cpu/*/msr - Model-specific register support"
---help---
@@ -1122,7 +1152,6 @@ config HIGHMEM64G
endchoice
choice
- depends on EXPERIMENTAL
prompt "Memory split" if EXPERT
default VMSPLIT_3G
depends on X86_32
@@ -1199,7 +1228,7 @@ config DIRECT_GBPAGES
config NUMA
bool "Numa Memory Allocation and Scheduler Support"
depends on SMP
- depends on X86_64 || (X86_32 && HIGHMEM64G && (X86_NUMAQ || X86_BIGSMP || X86_SUMMIT && ACPI) && EXPERIMENTAL)
+ depends on X86_64 || (X86_32 && HIGHMEM64G && (X86_NUMAQ || X86_BIGSMP || X86_SUMMIT && ACPI))
default y if (X86_NUMAQ || X86_SUMMIT || X86_BIGSMP)
---help---
Enable NUMA (Non Uniform Memory Access) support.
@@ -1268,10 +1297,6 @@ config NODES_SHIFT
Specify the maximum number of NUMA Nodes available on the target
system. Increases memory reserved to accommodate various tables.
-config HAVE_ARCH_ALLOC_REMAP
- def_bool y
- depends on X86_32 && NUMA
-
config ARCH_HAVE_MEMORY_PRESENT
def_bool y
depends on X86_32 && DISCONTIGMEM
@@ -1294,7 +1319,7 @@ config ARCH_DISCONTIGMEM_DEFAULT
config ARCH_SPARSEMEM_ENABLE
def_bool y
- depends on X86_64 || NUMA || (EXPERIMENTAL && X86_32) || X86_32_NON_STANDARD
+ depends on X86_64 || NUMA || X86_32 || X86_32_NON_STANDARD
select SPARSEMEM_STATIC if X86_32
select SPARSEMEM_VMEMMAP_ENABLE if X86_64
@@ -1608,8 +1633,7 @@ config CRASH_DUMP
For more details see Documentation/kdump/kdump.txt
config KEXEC_JUMP
- bool "kexec jump (EXPERIMENTAL)"
- depends on EXPERIMENTAL
+ bool "kexec jump"
depends on KEXEC && HIBERNATION
---help---
Jump between original kernel and kexeced kernel and invoke
@@ -1714,7 +1738,7 @@ config HOTPLUG_CPU
config BOOTPARAM_HOTPLUG_CPU0
bool "Set default setting of cpu0_hotpluggable"
default n
- depends on HOTPLUG_CPU && EXPERIMENTAL
+ depends on HOTPLUG_CPU
---help---
Set whether default state of cpu0_hotpluggable is on or off.
@@ -1743,7 +1767,7 @@ config BOOTPARAM_HOTPLUG_CPU0
config DEBUG_HOTPLUG_CPU0
def_bool n
prompt "Debug CPU0 hotplug"
- depends on HOTPLUG_CPU && EXPERIMENTAL
+ depends on HOTPLUG_CPU
---help---
Enabling this option offlines CPU0 (if CPU0 can be offlined) as
soon as possible and boots up userspace with CPU0 offlined. User
@@ -1927,6 +1951,7 @@ config APM_DO_ENABLE
this feature.
config APM_CPU_IDLE
+ depends on CPU_IDLE
bool "Make CPU Idle calls when idle"
---help---
Enable calls to APM CPU Idle/CPU Busy inside the kernel's idle loop.
@@ -2052,7 +2077,7 @@ config PCI_MMCONFIG
config PCI_CNB20LE_QUIRK
bool "Read CNB20LE Host Bridge Windows" if EXPERT
- depends on PCI && EXPERIMENTAL
+ depends on PCI
help
Read the PCI windows out of the CNB20LE host bridge. This allows
PCI hotplug to work on systems with the CNB20LE chipset which do
@@ -2256,8 +2281,8 @@ config IA32_AOUT
Support old a.out binaries in the 32bit emulation.
config X86_X32
- bool "x32 ABI for 64-bit mode (EXPERIMENTAL)"
- depends on X86_64 && IA32_EMULATION && EXPERIMENTAL
+ bool "x32 ABI for 64-bit mode"
+ depends on X86_64 && IA32_EMULATION
---help---
Include code to run binaries for the x32 native 32-bit ABI
for 64-bit processors. An x32 process gets access to the
diff --git a/arch/x86/boot/boot.h b/arch/x86/boot/boot.h
index 18997e5a1053..5b7531966b84 100644
--- a/arch/x86/boot/boot.h
+++ b/arch/x86/boot/boot.h
@@ -285,16 +285,26 @@ struct biosregs {
void intcall(u8 int_no, const struct biosregs *ireg, struct biosregs *oreg);
/* cmdline.c */
-int __cmdline_find_option(u32 cmdline_ptr, const char *option, char *buffer, int bufsize);
-int __cmdline_find_option_bool(u32 cmdline_ptr, const char *option);
+int __cmdline_find_option(unsigned long cmdline_ptr, const char *option, char *buffer, int bufsize);
+int __cmdline_find_option_bool(unsigned long cmdline_ptr, const char *option);
static inline int cmdline_find_option(const char *option, char *buffer, int bufsize)
{
- return __cmdline_find_option(boot_params.hdr.cmd_line_ptr, option, buffer, bufsize);
+ unsigned long cmd_line_ptr = boot_params.hdr.cmd_line_ptr;
+
+ if (cmd_line_ptr >= 0x100000)
+ return -1; /* inaccessible */
+
+ return __cmdline_find_option(cmd_line_ptr, option, buffer, bufsize);
}
static inline int cmdline_find_option_bool(const char *option)
{
- return __cmdline_find_option_bool(boot_params.hdr.cmd_line_ptr, option);
+ unsigned long cmd_line_ptr = boot_params.hdr.cmd_line_ptr;
+
+ if (cmd_line_ptr >= 0x100000)
+ return -1; /* inaccessible */
+
+ return __cmdline_find_option_bool(cmd_line_ptr, option);
}
diff --git a/arch/x86/boot/cmdline.c b/arch/x86/boot/cmdline.c
index 6b3b6f708c04..625d21b0cd3f 100644
--- a/arch/x86/boot/cmdline.c
+++ b/arch/x86/boot/cmdline.c
@@ -27,7 +27,7 @@ static inline int myisspace(u8 c)
* Returns the length of the argument (regardless of if it was
* truncated to fit in the buffer), or -1 on not found.
*/
-int __cmdline_find_option(u32 cmdline_ptr, const char *option, char *buffer, int bufsize)
+int __cmdline_find_option(unsigned long cmdline_ptr, const char *option, char *buffer, int bufsize)
{
addr_t cptr;
char c;
@@ -41,8 +41,8 @@ int __cmdline_find_option(u32 cmdline_ptr, const char *option, char *buffer, int
st_bufcpy /* Copying this to buffer */
} state = st_wordstart;
- if (!cmdline_ptr || cmdline_ptr >= 0x100000)
- return -1; /* No command line, or inaccessible */
+ if (!cmdline_ptr)
+ return -1; /* No command line */
cptr = cmdline_ptr & 0xf;
set_fs(cmdline_ptr >> 4);
@@ -99,7 +99,7 @@ int __cmdline_find_option(u32 cmdline_ptr, const char *option, char *buffer, int
* Returns the position of that option (starts counting with 1)
* or 0 on not found
*/
-int __cmdline_find_option_bool(u32 cmdline_ptr, const char *option)
+int __cmdline_find_option_bool(unsigned long cmdline_ptr, const char *option)
{
addr_t cptr;
char c;
@@ -111,8 +111,8 @@ int __cmdline_find_option_bool(u32 cmdline_ptr, const char *option)
st_wordskip, /* Miscompare, skip */
} state = st_wordstart;
- if (!cmdline_ptr || cmdline_ptr >= 0x100000)
- return -1; /* No command line, or inaccessible */
+ if (!cmdline_ptr)
+ return -1; /* No command line */
cptr = cmdline_ptr & 0xf;
set_fs(cmdline_ptr >> 4);
diff --git a/arch/x86/boot/compressed/cmdline.c b/arch/x86/boot/compressed/cmdline.c
index 10f6b1178c68..bffd73b45b1f 100644
--- a/arch/x86/boot/compressed/cmdline.c
+++ b/arch/x86/boot/compressed/cmdline.c
@@ -13,13 +13,21 @@ static inline char rdfs8(addr_t addr)
return *((char *)(fs + addr));
}
#include "../cmdline.c"
+static unsigned long get_cmd_line_ptr(void)
+{
+ unsigned long cmd_line_ptr = real_mode->hdr.cmd_line_ptr;
+
+ cmd_line_ptr |= (u64)real_mode->ext_cmd_line_ptr << 32;
+
+ return cmd_line_ptr;
+}
int cmdline_find_option(const char *option, char *buffer, int bufsize)
{
- return __cmdline_find_option(real_mode->hdr.cmd_line_ptr, option, buffer, bufsize);
+ return __cmdline_find_option(get_cmd_line_ptr(), option, buffer, bufsize);
}
int cmdline_find_option_bool(const char *option)
{
- return __cmdline_find_option_bool(real_mode->hdr.cmd_line_ptr, option);
+ return __cmdline_find_option_bool(get_cmd_line_ptr(), option);
}
#endif
diff --git a/arch/x86/boot/compressed/head_64.S b/arch/x86/boot/compressed/head_64.S
index f5d1aaa0dec8..c1d383d1fb7e 100644
--- a/arch/x86/boot/compressed/head_64.S
+++ b/arch/x86/boot/compressed/head_64.S
@@ -37,6 +37,12 @@
__HEAD
.code32
ENTRY(startup_32)
+ /*
+ * 32bit entry is 0 and it is ABI so immutable!
+ * If we come here directly from a bootloader,
+ * kernel(text+data+bss+brk) ramdisk, zero_page, command line
+ * all need to be under the 4G limit.
+ */
cld
/*
* Test KEEP_SEGMENTS flag to see if the bootloader is asking
@@ -154,6 +160,12 @@ ENTRY(startup_32)
btsl $_EFER_LME, %eax
wrmsr
+ /* After gdt is loaded */
+ xorl %eax, %eax
+ lldt %ax
+ movl $0x20, %eax
+ ltr %ax
+
/*
* Setup for the jump to 64bit mode
*
@@ -176,28 +188,18 @@ ENTRY(startup_32)
lret
ENDPROC(startup_32)
-no_longmode:
- /* This isn't an x86-64 CPU so hang */
-1:
- hlt
- jmp 1b
-
-#include "../../kernel/verify_cpu.S"
-
- /*
- * Be careful here startup_64 needs to be at a predictable
- * address so I can export it in an ELF header. Bootloaders
- * should look at the ELF header to find this address, as
- * it may change in the future.
- */
.code64
.org 0x200
ENTRY(startup_64)
/*
+ * 64bit entry is 0x200 and it is ABI so immutable!
* We come here either from startup_32 or directly from a
- * 64bit bootloader. If we come here from a bootloader we depend on
- * an identity mapped page table being provied that maps our
- * entire text+data+bss and hopefully all of memory.
+ * 64bit bootloader.
+ * If we come here from a bootloader, kernel(text+data+bss+brk),
+ * ramdisk, zero_page, command line could be above 4G.
+ * We depend on an identity mapped page table being provided
+ * that maps our entire kernel(text+data+bss+brk), zero page
+ * and command line.
*/
#ifdef CONFIG_EFI_STUB
/*
@@ -247,9 +249,6 @@ preferred_addr:
movl %eax, %ss
movl %eax, %fs
movl %eax, %gs
- lldt %ax
- movl $0x20, %eax
- ltr %ax
/*
* Compute the decompressed kernel start address. It is where
@@ -349,6 +348,15 @@ relocated:
*/
jmp *%rbp
+ .code32
+no_longmode:
+ /* This isn't an x86-64 CPU so hang */
+1:
+ hlt
+ jmp 1b
+
+#include "../../kernel/verify_cpu.S"
+
.data
gdt:
.word gdt_end - gdt
diff --git a/arch/x86/boot/header.S b/arch/x86/boot/header.S
index 944ce595f767..9ec06a1f6d61 100644
--- a/arch/x86/boot/header.S
+++ b/arch/x86/boot/header.S
@@ -374,6 +374,14 @@ xloadflags:
#else
# define XLF0 0
#endif
+
+#if defined(CONFIG_RELOCATABLE) && defined(CONFIG_X86_64)
+ /* kernel/boot_param/ramdisk could be loaded above 4g */
+# define XLF1 XLF_CAN_BE_LOADED_ABOVE_4G
+#else
+# define XLF1 0
+#endif
+
#ifdef CONFIG_EFI_STUB
# ifdef CONFIG_X86_64
# define XLF23 XLF_EFI_HANDOVER_64 /* 64-bit EFI handover ok */
@@ -383,7 +391,7 @@ xloadflags:
#else
# define XLF23 0
#endif
- .word XLF0 | XLF23
+ .word XLF0 | XLF1 | XLF23
cmdline_size: .long COMMAND_LINE_SIZE-1 #length of the command line,
#added with boot protocol
diff --git a/arch/x86/crypto/Makefile b/arch/x86/crypto/Makefile
index e0ca7c9ac383..63947a8f9f0f 100644
--- a/arch/x86/crypto/Makefile
+++ b/arch/x86/crypto/Makefile
@@ -27,6 +27,7 @@ obj-$(CONFIG_CRYPTO_GHASH_CLMUL_NI_INTEL) += ghash-clmulni-intel.o
obj-$(CONFIG_CRYPTO_CRC32C_INTEL) += crc32c-intel.o
obj-$(CONFIG_CRYPTO_SHA1_SSSE3) += sha1-ssse3.o
+obj-$(CONFIG_CRYPTO_CRC32_PCLMUL) += crc32-pclmul.o
aes-i586-y := aes-i586-asm_32.o aes_glue.o
twofish-i586-y := twofish-i586-asm_32.o twofish_glue.o
@@ -52,3 +53,4 @@ ghash-clmulni-intel-y := ghash-clmulni-intel_asm.o ghash-clmulni-intel_glue.o
sha1-ssse3-y := sha1_ssse3_asm.o sha1_ssse3_glue.o
crc32c-intel-y := crc32c-intel_glue.o
crc32c-intel-$(CONFIG_CRYPTO_CRC32C_X86_64) += crc32c-pcl-intel-asm_64.o
+crc32-pclmul-y := crc32-pclmul_asm.o crc32-pclmul_glue.o
diff --git a/arch/x86/crypto/aes-i586-asm_32.S b/arch/x86/crypto/aes-i586-asm_32.S
index b949ec2f9af4..2849dbc59e11 100644
--- a/arch/x86/crypto/aes-i586-asm_32.S
+++ b/arch/x86/crypto/aes-i586-asm_32.S
@@ -36,6 +36,7 @@
.file "aes-i586-asm.S"
.text
+#include <linux/linkage.h>
#include <asm/asm-offsets.h>
#define tlen 1024 // length of each of 4 'xor' arrays (256 32-bit words)
@@ -219,14 +220,10 @@
// AES (Rijndael) Encryption Subroutine
/* void aes_enc_blk(struct crypto_aes_ctx *ctx, u8 *out_blk, const u8 *in_blk) */
-.global aes_enc_blk
-
.extern crypto_ft_tab
.extern crypto_fl_tab
-.align 4
-
-aes_enc_blk:
+ENTRY(aes_enc_blk)
push %ebp
mov ctx(%esp),%ebp
@@ -290,18 +287,15 @@ aes_enc_blk:
mov %r0,(%ebp)
pop %ebp
ret
+ENDPROC(aes_enc_blk)
// AES (Rijndael) Decryption Subroutine
/* void aes_dec_blk(struct crypto_aes_ctx *ctx, u8 *out_blk, const u8 *in_blk) */
-.global aes_dec_blk
-
.extern crypto_it_tab
.extern crypto_il_tab
-.align 4
-
-aes_dec_blk:
+ENTRY(aes_dec_blk)
push %ebp
mov ctx(%esp),%ebp
@@ -365,3 +359,4 @@ aes_dec_blk:
mov %r0,(%ebp)
pop %ebp
ret
+ENDPROC(aes_dec_blk)
diff --git a/arch/x86/crypto/aes-x86_64-asm_64.S b/arch/x86/crypto/aes-x86_64-asm_64.S
index 5b577d5a059b..910565547163 100644
--- a/arch/x86/crypto/aes-x86_64-asm_64.S
+++ b/arch/x86/crypto/aes-x86_64-asm_64.S
@@ -15,6 +15,7 @@
.text
+#include <linux/linkage.h>
#include <asm/asm-offsets.h>
#define R1 %rax
@@ -49,10 +50,8 @@
#define R11 %r11
#define prologue(FUNC,KEY,B128,B192,r1,r2,r3,r4,r5,r6,r7,r8,r9,r10,r11) \
- .global FUNC; \
- .type FUNC,@function; \
- .align 8; \
-FUNC: movq r1,r2; \
+ ENTRY(FUNC); \
+ movq r1,r2; \
movq r3,r4; \
leaq KEY+48(r8),r9; \
movq r10,r11; \
@@ -71,14 +70,15 @@ FUNC: movq r1,r2; \
je B192; \
leaq 32(r9),r9;
-#define epilogue(r1,r2,r3,r4,r5,r6,r7,r8,r9) \
+#define epilogue(FUNC,r1,r2,r3,r4,r5,r6,r7,r8,r9) \
movq r1,r2; \
movq r3,r4; \
movl r5 ## E,(r9); \
movl r6 ## E,4(r9); \
movl r7 ## E,8(r9); \
movl r8 ## E,12(r9); \
- ret;
+ ret; \
+ ENDPROC(FUNC);
#define round(TAB,OFFSET,r1,r2,r3,r4,r5,r6,r7,r8,ra,rb,rc,rd) \
movzbl r2 ## H,r5 ## E; \
@@ -133,7 +133,7 @@ FUNC: movq r1,r2; \
#define entry(FUNC,KEY,B128,B192) \
prologue(FUNC,KEY,B128,B192,R2,R8,R7,R9,R1,R3,R4,R6,R10,R5,R11)
-#define return epilogue(R8,R2,R9,R7,R5,R6,R3,R4,R11)
+#define return(FUNC) epilogue(FUNC,R8,R2,R9,R7,R5,R6,R3,R4,R11)
#define encrypt_round(TAB,OFFSET) \
round(TAB,OFFSET,R1,R2,R3,R4,R5,R6,R7,R10,R5,R6,R3,R4) \
@@ -151,12 +151,12 @@ FUNC: movq r1,r2; \
/* void aes_enc_blk(stuct crypto_tfm *tfm, u8 *out, const u8 *in) */
- entry(aes_enc_blk,0,enc128,enc192)
+ entry(aes_enc_blk,0,.Le128,.Le192)
encrypt_round(crypto_ft_tab,-96)
encrypt_round(crypto_ft_tab,-80)
-enc192: encrypt_round(crypto_ft_tab,-64)
+.Le192: encrypt_round(crypto_ft_tab,-64)
encrypt_round(crypto_ft_tab,-48)
-enc128: encrypt_round(crypto_ft_tab,-32)
+.Le128: encrypt_round(crypto_ft_tab,-32)
encrypt_round(crypto_ft_tab,-16)
encrypt_round(crypto_ft_tab, 0)
encrypt_round(crypto_ft_tab, 16)
@@ -166,16 +166,16 @@ enc128: encrypt_round(crypto_ft_tab,-32)
encrypt_round(crypto_ft_tab, 80)
encrypt_round(crypto_ft_tab, 96)
encrypt_final(crypto_fl_tab,112)
- return
+ return(aes_enc_blk)
/* void aes_dec_blk(struct crypto_tfm *tfm, u8 *out, const u8 *in) */
- entry(aes_dec_blk,240,dec128,dec192)
+ entry(aes_dec_blk,240,.Ld128,.Ld192)
decrypt_round(crypto_it_tab,-96)
decrypt_round(crypto_it_tab,-80)
-dec192: decrypt_round(crypto_it_tab,-64)
+.Ld192: decrypt_round(crypto_it_tab,-64)
decrypt_round(crypto_it_tab,-48)
-dec128: decrypt_round(crypto_it_tab,-32)
+.Ld128: decrypt_round(crypto_it_tab,-32)
decrypt_round(crypto_it_tab,-16)
decrypt_round(crypto_it_tab, 0)
decrypt_round(crypto_it_tab, 16)
@@ -185,4 +185,4 @@ dec128: decrypt_round(crypto_it_tab,-32)
decrypt_round(crypto_it_tab, 80)
decrypt_round(crypto_it_tab, 96)
decrypt_final(crypto_il_tab,112)
- return
+ return(aes_dec_blk)
diff --git a/arch/x86/crypto/aesni-intel_asm.S b/arch/x86/crypto/aesni-intel_asm.S
index 3470624d7835..04b797767b9e 100644
--- a/arch/x86/crypto/aesni-intel_asm.S
+++ b/arch/x86/crypto/aesni-intel_asm.S
@@ -1262,7 +1262,6 @@ TMP7 XMM1 XMM2 XMM3 XMM4 XMMDst
* poly = x^128 + x^127 + x^126 + x^121 + 1
*
*****************************************************************************/
-
ENTRY(aesni_gcm_dec)
push %r12
push %r13
@@ -1437,6 +1436,7 @@ _return_T_done_decrypt:
pop %r13
pop %r12
ret
+ENDPROC(aesni_gcm_dec)
/*****************************************************************************
@@ -1700,10 +1700,12 @@ _return_T_done_encrypt:
pop %r13
pop %r12
ret
+ENDPROC(aesni_gcm_enc)
#endif
+.align 4
_key_expansion_128:
_key_expansion_256a:
pshufd $0b11111111, %xmm1, %xmm1
@@ -1715,6 +1717,8 @@ _key_expansion_256a:
movaps %xmm0, (TKEYP)
add $0x10, TKEYP
ret
+ENDPROC(_key_expansion_128)
+ENDPROC(_key_expansion_256a)
.align 4
_key_expansion_192a:
@@ -1739,6 +1743,7 @@ _key_expansion_192a:
movaps %xmm1, 0x10(TKEYP)
add $0x20, TKEYP
ret
+ENDPROC(_key_expansion_192a)
.align 4
_key_expansion_192b:
@@ -1758,6 +1763,7 @@ _key_expansion_192b:
movaps %xmm0, (TKEYP)
add $0x10, TKEYP
ret
+ENDPROC(_key_expansion_192b)
.align 4
_key_expansion_256b:
@@ -1770,6 +1776,7 @@ _key_expansion_256b:
movaps %xmm2, (TKEYP)
add $0x10, TKEYP
ret
+ENDPROC(_key_expansion_256b)
/*
* int aesni_set_key(struct crypto_aes_ctx *ctx, const u8 *in_key,
@@ -1882,6 +1889,7 @@ ENTRY(aesni_set_key)
popl KEYP
#endif
ret
+ENDPROC(aesni_set_key)
/*
* void aesni_enc(struct crypto_aes_ctx *ctx, u8 *dst, const u8 *src)
@@ -1903,6 +1911,7 @@ ENTRY(aesni_enc)
popl KEYP
#endif
ret
+ENDPROC(aesni_enc)
/*
* _aesni_enc1: internal ABI
@@ -1960,6 +1969,7 @@ _aesni_enc1:
movaps 0x70(TKEYP), KEY
AESENCLAST KEY STATE
ret
+ENDPROC(_aesni_enc1)
/*
* _aesni_enc4: internal ABI
@@ -2068,6 +2078,7 @@ _aesni_enc4:
AESENCLAST KEY STATE3
AESENCLAST KEY STATE4
ret
+ENDPROC(_aesni_enc4)
/*
* void aesni_dec (struct crypto_aes_ctx *ctx, u8 *dst, const u8 *src)
@@ -2090,6 +2101,7 @@ ENTRY(aesni_dec)
popl KEYP
#endif
ret
+ENDPROC(aesni_dec)
/*
* _aesni_dec1: internal ABI
@@ -2147,6 +2159,7 @@ _aesni_dec1:
movaps 0x70(TKEYP), KEY
AESDECLAST KEY STATE
ret
+ENDPROC(_aesni_dec1)
/*
* _aesni_dec4: internal ABI
@@ -2255,6 +2268,7 @@ _aesni_dec4:
AESDECLAST KEY STATE3
AESDECLAST KEY STATE4
ret
+ENDPROC(_aesni_dec4)
/*
* void aesni_ecb_enc(struct crypto_aes_ctx *ctx, const u8 *dst, u8 *src,
@@ -2312,6 +2326,7 @@ ENTRY(aesni_ecb_enc)
popl LEN
#endif
ret
+ENDPROC(aesni_ecb_enc)
/*
* void aesni_ecb_dec(struct crypto_aes_ctx *ctx, const u8 *dst, u8 *src,
@@ -2370,6 +2385,7 @@ ENTRY(aesni_ecb_dec)
popl LEN
#endif
ret
+ENDPROC(aesni_ecb_dec)
/*
* void aesni_cbc_enc(struct crypto_aes_ctx *ctx, const u8 *dst, u8 *src,
@@ -2411,6 +2427,7 @@ ENTRY(aesni_cbc_enc)
popl IVP
#endif
ret
+ENDPROC(aesni_cbc_enc)
/*
* void aesni_cbc_dec(struct crypto_aes_ctx *ctx, const u8 *dst, u8 *src,
@@ -2501,6 +2518,7 @@ ENTRY(aesni_cbc_dec)
popl IVP
#endif
ret
+ENDPROC(aesni_cbc_dec)
#ifdef __x86_64__
.align 16
@@ -2527,6 +2545,7 @@ _aesni_inc_init:
MOVQ_R64_XMM TCTR_LOW INC
MOVQ_R64_XMM CTR TCTR_LOW
ret
+ENDPROC(_aesni_inc_init)
/*
* _aesni_inc: internal ABI
@@ -2555,6 +2574,7 @@ _aesni_inc:
movaps CTR, IV
PSHUFB_XMM BSWAP_MASK IV
ret
+ENDPROC(_aesni_inc)
/*
* void aesni_ctr_enc(struct crypto_aes_ctx *ctx, const u8 *dst, u8 *src,
@@ -2615,4 +2635,5 @@ ENTRY(aesni_ctr_enc)
movups IV, (IVP)
.Lctr_enc_just_ret:
ret
+ENDPROC(aesni_ctr_enc)
#endif
diff --git a/arch/x86/crypto/aesni-intel_glue.c b/arch/x86/crypto/aesni-intel_glue.c
index 1b9c22bea8a7..a0795da22c02 100644
--- a/arch/x86/crypto/aesni-intel_glue.c
+++ b/arch/x86/crypto/aesni-intel_glue.c
@@ -40,10 +40,6 @@
#include <linux/workqueue.h>
#include <linux/spinlock.h>
-#if defined(CONFIG_CRYPTO_CTR) || defined(CONFIG_CRYPTO_CTR_MODULE)
-#define HAS_CTR
-#endif
-
#if defined(CONFIG_CRYPTO_PCBC) || defined(CONFIG_CRYPTO_PCBC_MODULE)
#define HAS_PCBC
#endif
@@ -395,12 +391,6 @@ static int ablk_ctr_init(struct crypto_tfm *tfm)
return ablk_init_common(tfm, "__driver-ctr-aes-aesni");
}
-#ifdef HAS_CTR
-static int ablk_rfc3686_ctr_init(struct crypto_tfm *tfm)
-{
- return ablk_init_common(tfm, "rfc3686(__driver-ctr-aes-aesni)");
-}
-#endif
#endif
#ifdef HAS_PCBC
@@ -1158,33 +1148,6 @@ static struct crypto_alg aesni_algs[] = { {
.maxauthsize = 16,
},
},
-#ifdef HAS_CTR
-}, {
- .cra_name = "rfc3686(ctr(aes))",
- .cra_driver_name = "rfc3686-ctr-aes-aesni",
- .cra_priority = 400,
- .cra_flags = CRYPTO_ALG_TYPE_ABLKCIPHER | CRYPTO_ALG_ASYNC,
- .cra_blocksize = 1,
- .cra_ctxsize = sizeof(struct async_helper_ctx),
- .cra_alignmask = 0,
- .cra_type = &crypto_ablkcipher_type,
- .cra_module = THIS_MODULE,
- .cra_init = ablk_rfc3686_ctr_init,
- .cra_exit = ablk_exit,
- .cra_u = {
- .ablkcipher = {
- .min_keysize = AES_MIN_KEY_SIZE +
- CTR_RFC3686_NONCE_SIZE,
- .max_keysize = AES_MAX_KEY_SIZE +
- CTR_RFC3686_NONCE_SIZE,
- .ivsize = CTR_RFC3686_IV_SIZE,
- .setkey = ablk_set_key,
- .encrypt = ablk_encrypt,
- .decrypt = ablk_decrypt,
- .geniv = "seqiv",
- },
- },
-#endif
#endif
#ifdef HAS_PCBC
}, {
diff --git a/arch/x86/crypto/blowfish-x86_64-asm_64.S b/arch/x86/crypto/blowfish-x86_64-asm_64.S
index 391d245dc086..246c67006ed0 100644
--- a/arch/x86/crypto/blowfish-x86_64-asm_64.S
+++ b/arch/x86/crypto/blowfish-x86_64-asm_64.S
@@ -20,6 +20,8 @@
*
*/
+#include <linux/linkage.h>
+
.file "blowfish-x86_64-asm.S"
.text
@@ -116,11 +118,7 @@
bswapq RX0; \
xorq RX0, (RIO);
-.align 8
-.global __blowfish_enc_blk
-.type __blowfish_enc_blk,@function;
-
-__blowfish_enc_blk:
+ENTRY(__blowfish_enc_blk)
/* input:
* %rdi: ctx, CTX
* %rsi: dst
@@ -148,19 +146,16 @@ __blowfish_enc_blk:
movq %r10, RIO;
test %cl, %cl;
- jnz __enc_xor;
+ jnz .L__enc_xor;
write_block();
ret;
-__enc_xor:
+.L__enc_xor:
xor_block();
ret;
+ENDPROC(__blowfish_enc_blk)
-.align 8
-.global blowfish_dec_blk
-.type blowfish_dec_blk,@function;
-
-blowfish_dec_blk:
+ENTRY(blowfish_dec_blk)
/* input:
* %rdi: ctx, CTX
* %rsi: dst
@@ -189,6 +184,7 @@ blowfish_dec_blk:
movq %r11, %rbp;
ret;
+ENDPROC(blowfish_dec_blk)
/**********************************************************************
4-way blowfish, four blocks parallel
@@ -300,11 +296,7 @@ blowfish_dec_blk:
bswapq RX3; \
xorq RX3, 24(RIO);
-.align 8
-.global __blowfish_enc_blk_4way
-.type __blowfish_enc_blk_4way,@function;
-
-__blowfish_enc_blk_4way:
+ENTRY(__blowfish_enc_blk_4way)
/* input:
* %rdi: ctx, CTX
* %rsi: dst
@@ -336,7 +328,7 @@ __blowfish_enc_blk_4way:
movq %r11, RIO;
test %bpl, %bpl;
- jnz __enc_xor4;
+ jnz .L__enc_xor4;
write_block4();
@@ -344,18 +336,15 @@ __blowfish_enc_blk_4way:
popq %rbp;
ret;
-__enc_xor4:
+.L__enc_xor4:
xor_block4();
popq %rbx;
popq %rbp;
ret;
+ENDPROC(__blowfish_enc_blk_4way)
-.align 8
-.global blowfish_dec_blk_4way
-.type blowfish_dec_blk_4way,@function;
-
-blowfish_dec_blk_4way:
+ENTRY(blowfish_dec_blk_4way)
/* input:
* %rdi: ctx, CTX
* %rsi: dst
@@ -387,4 +376,4 @@ blowfish_dec_blk_4way:
popq %rbp;
ret;
-
+ENDPROC(blowfish_dec_blk_4way)
diff --git a/arch/x86/crypto/camellia-aesni-avx-asm_64.S b/arch/x86/crypto/camellia-aesni-avx-asm_64.S
index 2306d2e4816f..cfc163469c71 100644
--- a/arch/x86/crypto/camellia-aesni-avx-asm_64.S
+++ b/arch/x86/crypto/camellia-aesni-avx-asm_64.S
@@ -15,6 +15,8 @@
* http://koti.mbnet.fi/axh/crypto/camellia-BSD-1.2.0-aesni1.tar.xz
*/
+#include <linux/linkage.h>
+
#define CAMELLIA_TABLE_BYTE_LEN 272
/* struct camellia_ctx: */
@@ -190,6 +192,7 @@ roundsm16_x0_x1_x2_x3_x4_x5_x6_x7_y0_y1_y2_y3_y4_y5_y6_y7_cd:
%xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14, %xmm15,
%rcx, (%r9));
ret;
+ENDPROC(roundsm16_x0_x1_x2_x3_x4_x5_x6_x7_y0_y1_y2_y3_y4_y5_y6_y7_cd)
.align 8
roundsm16_x4_x5_x6_x7_x0_x1_x2_x3_y4_y5_y6_y7_y0_y1_y2_y3_ab:
@@ -197,6 +200,7 @@ roundsm16_x4_x5_x6_x7_x0_x1_x2_x3_y4_y5_y6_y7_y0_y1_y2_y3_ab:
%xmm12, %xmm13, %xmm14, %xmm15, %xmm8, %xmm9, %xmm10, %xmm11,
%rax, (%r9));
ret;
+ENDPROC(roundsm16_x4_x5_x6_x7_x0_x1_x2_x3_y4_y5_y6_y7_y0_y1_y2_y3_ab)
/*
* IN/OUT:
@@ -709,8 +713,6 @@ roundsm16_x4_x5_x6_x7_x0_x1_x2_x3_y4_y5_y6_y7_y0_y1_y2_y3_ab:
.text
.align 8
-.type __camellia_enc_blk16,@function;
-
__camellia_enc_blk16:
/* input:
* %rdi: ctx, CTX
@@ -793,10 +795,9 @@ __camellia_enc_blk16:
%xmm15, %rax, %rcx, 24);
jmp .Lenc_done;
+ENDPROC(__camellia_enc_blk16)
.align 8
-.type __camellia_dec_blk16,@function;
-
__camellia_dec_blk16:
/* input:
* %rdi: ctx, CTX
@@ -877,12 +878,9 @@ __camellia_dec_blk16:
((key_table + (24) * 8) + 4)(CTX));
jmp .Ldec_max24;
+ENDPROC(__camellia_dec_blk16)
-.align 8
-.global camellia_ecb_enc_16way
-.type camellia_ecb_enc_16way,@function;
-
-camellia_ecb_enc_16way:
+ENTRY(camellia_ecb_enc_16way)
/* input:
* %rdi: ctx, CTX
* %rsi: dst (16 blocks)
@@ -903,12 +901,9 @@ camellia_ecb_enc_16way:
%xmm8, %rsi);
ret;
+ENDPROC(camellia_ecb_enc_16way)
-.align 8
-.global camellia_ecb_dec_16way
-.type camellia_ecb_dec_16way,@function;
-
-camellia_ecb_dec_16way:
+ENTRY(camellia_ecb_dec_16way)
/* input:
* %rdi: ctx, CTX
* %rsi: dst (16 blocks)
@@ -934,12 +929,9 @@ camellia_ecb_dec_16way:
%xmm8, %rsi);
ret;
+ENDPROC(camellia_ecb_dec_16way)
-.align 8
-.global camellia_cbc_dec_16way
-.type camellia_cbc_dec_16way,@function;
-
-camellia_cbc_dec_16way:
+ENTRY(camellia_cbc_dec_16way)
/* input:
* %rdi: ctx, CTX
* %rsi: dst (16 blocks)
@@ -986,6 +978,7 @@ camellia_cbc_dec_16way:
%xmm8, %rsi);
ret;
+ENDPROC(camellia_cbc_dec_16way)
#define inc_le128(x, minus_one, tmp) \
vpcmpeqq minus_one, x, tmp; \
@@ -993,11 +986,7 @@ camellia_cbc_dec_16way:
vpslldq $8, tmp, tmp; \
vpsubq tmp, x, x;
-.align 8
-.global camellia_ctr_16way
-.type camellia_ctr_16way,@function;
-
-camellia_ctr_16way:
+ENTRY(camellia_ctr_16way)
/* input:
* %rdi: ctx, CTX
* %rsi: dst (16 blocks)
@@ -1100,3 +1089,4 @@ camellia_ctr_16way:
%xmm8, %rsi);
ret;
+ENDPROC(camellia_ctr_16way)
diff --git a/arch/x86/crypto/camellia-x86_64-asm_64.S b/arch/x86/crypto/camellia-x86_64-asm_64.S
index 0b3374335fdc..310319c601ed 100644
--- a/arch/x86/crypto/camellia-x86_64-asm_64.S
+++ b/arch/x86/crypto/camellia-x86_64-asm_64.S
@@ -20,6 +20,8 @@
*
*/
+#include <linux/linkage.h>
+
.file "camellia-x86_64-asm_64.S"
.text
@@ -188,10 +190,7 @@
bswapq RAB0; \
movq RAB0, 4*2(RIO);
-.global __camellia_enc_blk;
-.type __camellia_enc_blk,@function;
-
-__camellia_enc_blk:
+ENTRY(__camellia_enc_blk)
/* input:
* %rdi: ctx, CTX
* %rsi: dst
@@ -214,33 +213,31 @@ __camellia_enc_blk:
movl $24, RT1d; /* max */
cmpb $16, key_length(CTX);
- je __enc_done;
+ je .L__enc_done;
enc_fls(24);
enc_rounds(24);
movl $32, RT1d; /* max */
-__enc_done:
+.L__enc_done:
testb RXORbl, RXORbl;
movq RDST, RIO;
- jnz __enc_xor;
+ jnz .L__enc_xor;
enc_outunpack(mov, RT1);
movq RRBP, %rbp;
ret;
-__enc_xor:
+.L__enc_xor:
enc_outunpack(xor, RT1);
movq RRBP, %rbp;
ret;
+ENDPROC(__camellia_enc_blk)
-.global camellia_dec_blk;
-.type camellia_dec_blk,@function;
-
-camellia_dec_blk:
+ENTRY(camellia_dec_blk)
/* input:
* %rdi: ctx, CTX
* %rsi: dst
@@ -258,12 +255,12 @@ camellia_dec_blk:
dec_inpack(RT2);
cmpb $24, RT2bl;
- je __dec_rounds16;
+ je .L__dec_rounds16;
dec_rounds(24);
dec_fls(24);
-__dec_rounds16:
+.L__dec_rounds16:
dec_rounds(16);
dec_fls(16);
dec_rounds(8);
@@ -276,6 +273,7 @@ __dec_rounds16:
movq RRBP, %rbp;
ret;
+ENDPROC(camellia_dec_blk)
/**********************************************************************
2-way camellia
@@ -426,10 +424,7 @@ __dec_rounds16:
bswapq RAB1; \
movq RAB1, 12*2(RIO);
-.global __camellia_enc_blk_2way;
-.type __camellia_enc_blk_2way,@function;
-
-__camellia_enc_blk_2way:
+ENTRY(__camellia_enc_blk_2way)
/* input:
* %rdi: ctx, CTX
* %rsi: dst
@@ -453,16 +448,16 @@ __camellia_enc_blk_2way:
movl $24, RT2d; /* max */
cmpb $16, key_length(CTX);
- je __enc2_done;
+ je .L__enc2_done;
enc_fls2(24);
enc_rounds2(24);
movl $32, RT2d; /* max */
-__enc2_done:
+.L__enc2_done:
test RXORbl, RXORbl;
movq RDST, RIO;
- jnz __enc2_xor;
+ jnz .L__enc2_xor;
enc_outunpack2(mov, RT2);
@@ -470,17 +465,15 @@ __enc2_done:
popq %rbx;
ret;
-__enc2_xor:
+.L__enc2_xor:
enc_outunpack2(xor, RT2);
movq RRBP, %rbp;
popq %rbx;
ret;
+ENDPROC(__camellia_enc_blk_2way)
-.global camellia_dec_blk_2way;
-.type camellia_dec_blk_2way,@function;
-
-camellia_dec_blk_2way:
+ENTRY(camellia_dec_blk_2way)
/* input:
* %rdi: ctx, CTX
* %rsi: dst
@@ -499,12 +492,12 @@ camellia_dec_blk_2way:
dec_inpack2(RT2);
cmpb $24, RT2bl;
- je __dec2_rounds16;
+ je .L__dec2_rounds16;
dec_rounds2(24);
dec_fls2(24);
-__dec2_rounds16:
+.L__dec2_rounds16:
dec_rounds2(16);
dec_fls2(16);
dec_rounds2(8);
@@ -518,3 +511,4 @@ __dec2_rounds16:
movq RRBP, %rbp;
movq RXOR, %rbx;
ret;
+ENDPROC(camellia_dec_blk_2way)
diff --git a/arch/x86/crypto/cast5-avx-x86_64-asm_64.S b/arch/x86/crypto/cast5-avx-x86_64-asm_64.S
index 15b00ac7cbd3..c35fd5d6ecd2 100644
--- a/arch/x86/crypto/cast5-avx-x86_64-asm_64.S
+++ b/arch/x86/crypto/cast5-avx-x86_64-asm_64.S
@@ -23,6 +23,8 @@
*
*/
+#include <linux/linkage.h>
+
.file "cast5-avx-x86_64-asm_64.S"
.extern cast_s1
@@ -211,8 +213,6 @@
.text
.align 16
-.type __cast5_enc_blk16,@function;
-
__cast5_enc_blk16:
/* input:
* %rdi: ctx, CTX
@@ -263,14 +263,14 @@ __cast5_enc_blk16:
movzbl rr(CTX), %eax;
testl %eax, %eax;
- jnz __skip_enc;
+ jnz .L__skip_enc;
round(RL, RR, 12, 1);
round(RR, RL, 13, 2);
round(RL, RR, 14, 3);
round(RR, RL, 15, 1);
-__skip_enc:
+.L__skip_enc:
popq %rbx;
popq %rbp;
@@ -282,10 +282,9 @@ __skip_enc:
outunpack_blocks(RR4, RL4, RTMP, RX, RKM);
ret;
+ENDPROC(__cast5_enc_blk16)
.align 16
-.type __cast5_dec_blk16,@function;
-
__cast5_dec_blk16:
/* input:
* %rdi: ctx, CTX
@@ -323,14 +322,14 @@ __cast5_dec_blk16:
movzbl rr(CTX), %eax;
testl %eax, %eax;
- jnz __skip_dec;
+ jnz .L__skip_dec;
round(RL, RR, 15, 1);
round(RR, RL, 14, 3);
round(RL, RR, 13, 2);
round(RR, RL, 12, 1);
-__dec_tail:
+.L__dec_tail:
round(RL, RR, 11, 3);
round(RR, RL, 10, 2);
round(RL, RR, 9, 1);
@@ -355,15 +354,12 @@ __dec_tail:
ret;
-__skip_dec:
+.L__skip_dec:
vpsrldq $4, RKR, RKR;
- jmp __dec_tail;
+ jmp .L__dec_tail;
+ENDPROC(__cast5_dec_blk16)
-.align 16
-.global cast5_ecb_enc_16way
-.type cast5_ecb_enc_16way,@function;
-
-cast5_ecb_enc_16way:
+ENTRY(cast5_ecb_enc_16way)
/* input:
* %rdi: ctx, CTX
* %rsi: dst
@@ -393,12 +389,9 @@ cast5_ecb_enc_16way:
vmovdqu RL4, (7*4*4)(%r11);
ret;
+ENDPROC(cast5_ecb_enc_16way)
-.align 16
-.global cast5_ecb_dec_16way
-.type cast5_ecb_dec_16way,@function;
-
-cast5_ecb_dec_16way:
+ENTRY(cast5_ecb_dec_16way)
/* input:
* %rdi: ctx, CTX
* %rsi: dst
@@ -428,12 +421,9 @@ cast5_ecb_dec_16way:
vmovdqu RL4, (7*4*4)(%r11);
ret;
+ENDPROC(cast5_ecb_dec_16way)
-.align 16
-.global cast5_cbc_dec_16way
-.type cast5_cbc_dec_16way,@function;
-
-cast5_cbc_dec_16way:
+ENTRY(cast5_cbc_dec_16way)
/* input:
* %rdi: ctx, CTX
* %rsi: dst
@@ -480,12 +470,9 @@ cast5_cbc_dec_16way:
popq %r12;
ret;
+ENDPROC(cast5_cbc_dec_16way)
-.align 16
-.global cast5_ctr_16way
-.type cast5_ctr_16way,@function;
-
-cast5_ctr_16way:
+ENTRY(cast5_ctr_16way)
/* input:
* %rdi: ctx, CTX
* %rsi: dst
@@ -556,3 +543,4 @@ cast5_ctr_16way:
popq %r12;
ret;
+ENDPROC(cast5_ctr_16way)
diff --git a/arch/x86/crypto/cast6-avx-x86_64-asm_64.S b/arch/x86/crypto/cast6-avx-x86_64-asm_64.S
index 2569d0da841f..f93b6105a0ce 100644
--- a/arch/x86/crypto/cast6-avx-x86_64-asm_64.S
+++ b/arch/x86/crypto/cast6-avx-x86_64-asm_64.S
@@ -23,6 +23,7 @@
*
*/
+#include <linux/linkage.h>
#include "glue_helper-asm-avx.S"
.file "cast6-avx-x86_64-asm_64.S"
@@ -250,8 +251,6 @@
.text
.align 8
-.type __cast6_enc_blk8,@function;
-
__cast6_enc_blk8:
/* input:
* %rdi: ctx, CTX
@@ -295,10 +294,9 @@ __cast6_enc_blk8:
outunpack_blocks(RA2, RB2, RC2, RD2, RTMP, RX, RKRF, RKM);
ret;
+ENDPROC(__cast6_enc_blk8)
.align 8
-.type __cast6_dec_blk8,@function;
-
__cast6_dec_blk8:
/* input:
* %rdi: ctx, CTX
@@ -341,12 +339,9 @@ __cast6_dec_blk8:
outunpack_blocks(RA2, RB2, RC2, RD2, RTMP, RX, RKRF, RKM);
ret;
+ENDPROC(__cast6_dec_blk8)
-.align 8
-.global cast6_ecb_enc_8way
-.type cast6_ecb_enc_8way,@function;
-
-cast6_ecb_enc_8way:
+ENTRY(cast6_ecb_enc_8way)
/* input:
* %rdi: ctx, CTX
* %rsi: dst
@@ -362,12 +357,9 @@ cast6_ecb_enc_8way:
store_8way(%r11, RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2);
ret;
+ENDPROC(cast6_ecb_enc_8way)
-.align 8
-.global cast6_ecb_dec_8way
-.type cast6_ecb_dec_8way,@function;
-
-cast6_ecb_dec_8way:
+ENTRY(cast6_ecb_dec_8way)
/* input:
* %rdi: ctx, CTX
* %rsi: dst
@@ -383,12 +375,9 @@ cast6_ecb_dec_8way:
store_8way(%r11, RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2);
ret;
+ENDPROC(cast6_ecb_dec_8way)
-.align 8
-.global cast6_cbc_dec_8way
-.type cast6_cbc_dec_8way,@function;
-
-cast6_cbc_dec_8way:
+ENTRY(cast6_cbc_dec_8way)
/* input:
* %rdi: ctx, CTX
* %rsi: dst
@@ -409,12 +398,9 @@ cast6_cbc_dec_8way:
popq %r12;
ret;
+ENDPROC(cast6_cbc_dec_8way)
-.align 8
-.global cast6_ctr_8way
-.type cast6_ctr_8way,@function;
-
-cast6_ctr_8way:
+ENTRY(cast6_ctr_8way)
/* input:
* %rdi: ctx, CTX
* %rsi: dst
@@ -437,3 +423,4 @@ cast6_ctr_8way:
popq %r12;
ret;
+ENDPROC(cast6_ctr_8way)
diff --git a/arch/x86/crypto/crc32-pclmul_asm.S b/arch/x86/crypto/crc32-pclmul_asm.S
new file mode 100644
index 000000000000..c8335014a044
--- /dev/null
+++ b/arch/x86/crypto/crc32-pclmul_asm.S
@@ -0,0 +1,246 @@
+/* GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see http://www.gnu.org/licenses
+ *
+ * Please visit http://www.xyratex.com/contact if you need additional
+ * information or have any questions.
+ *
+ * GPL HEADER END
+ */
+
+/*
+ * Copyright 2012 Xyratex Technology Limited
+ *
+ * Using hardware provided PCLMULQDQ instruction to accelerate the CRC32
+ * calculation.
+ * CRC32 polynomial:0x04c11db7(BE)/0xEDB88320(LE)
+ * PCLMULQDQ is a new instruction in Intel SSE4.2, the reference can be found
+ * at:
+ * http://www.intel.com/products/processor/manuals/
+ * Intel(R) 64 and IA-32 Architectures Software Developer's Manual
+ * Volume 2B: Instruction Set Reference, N-Z
+ *
+ * Authors: Gregory Prestas <Gregory_Prestas@us.xyratex.com>
+ * Alexander Boyko <Alexander_Boyko@xyratex.com>
+ */
+
+#include <linux/linkage.h>
+#include <asm/inst.h>
+
+
+.align 16
+/*
+ * [x4*128+32 mod P(x) << 32)]' << 1 = 0x154442bd4
+ * #define CONSTANT_R1 0x154442bd4LL
+ *
+ * [(x4*128-32 mod P(x) << 32)]' << 1 = 0x1c6e41596
+ * #define CONSTANT_R2 0x1c6e41596LL
+ */
+.Lconstant_R2R1:
+ .octa 0x00000001c6e415960000000154442bd4
+/*
+ * [(x128+32 mod P(x) << 32)]' << 1 = 0x1751997d0
+ * #define CONSTANT_R3 0x1751997d0LL
+ *
+ * [(x128-32 mod P(x) << 32)]' << 1 = 0x0ccaa009e
+ * #define CONSTANT_R4 0x0ccaa009eLL
+ */
+.Lconstant_R4R3:
+ .octa 0x00000000ccaa009e00000001751997d0
+/*
+ * [(x64 mod P(x) << 32)]' << 1 = 0x163cd6124
+ * #define CONSTANT_R5 0x163cd6124LL
+ */
+.Lconstant_R5:
+ .octa 0x00000000000000000000000163cd6124
+.Lconstant_mask32:
+ .octa 0x000000000000000000000000FFFFFFFF
+/*
+ * #define CRCPOLY_TRUE_LE_FULL 0x1DB710641LL
+ *
+ * Barrett Reduction constant (u64`) = u` = (x**64 / P(x))` = 0x1F7011641LL
+ * #define CONSTANT_RU 0x1F7011641LL
+ */
+.Lconstant_RUpoly:
+ .octa 0x00000001F701164100000001DB710641
+
+#define CONSTANT %xmm0
+
+#ifdef __x86_64__
+#define BUF %rdi
+#define LEN %rsi
+#define CRC %edx
+#else
+#define BUF %eax
+#define LEN %edx
+#define CRC %ecx
+#endif
+
+
+
+.text
+/**
+ * Calculate crc32
+ * BUF - buffer (16 bytes aligned)
+ * LEN - sizeof buffer (16 bytes aligned), LEN should be grater than 63
+ * CRC - initial crc32
+ * return %eax crc32
+ * uint crc32_pclmul_le_16(unsigned char const *buffer,
+ * size_t len, uint crc32)
+ */
+.globl crc32_pclmul_le_16
+.align 4, 0x90
+crc32_pclmul_le_16:/* buffer and buffer size are 16 bytes aligned */
+ movdqa (BUF), %xmm1
+ movdqa 0x10(BUF), %xmm2
+ movdqa 0x20(BUF), %xmm3
+ movdqa 0x30(BUF), %xmm4
+ movd CRC, CONSTANT
+ pxor CONSTANT, %xmm1
+ sub $0x40, LEN
+ add $0x40, BUF
+#ifndef __x86_64__
+ /* This is for position independent code(-fPIC) support for 32bit */
+ call delta
+delta:
+ pop %ecx
+#endif
+ cmp $0x40, LEN
+ jb less_64
+
+#ifdef __x86_64__
+ movdqa .Lconstant_R2R1(%rip), CONSTANT
+#else
+ movdqa .Lconstant_R2R1 - delta(%ecx), CONSTANT
+#endif
+
+loop_64:/* 64 bytes Full cache line folding */
+ prefetchnta 0x40(BUF)
+ movdqa %xmm1, %xmm5
+ movdqa %xmm2, %xmm6
+ movdqa %xmm3, %xmm7
+#ifdef __x86_64__
+ movdqa %xmm4, %xmm8
+#endif
+ PCLMULQDQ 00, CONSTANT, %xmm1
+ PCLMULQDQ 00, CONSTANT, %xmm2
+ PCLMULQDQ 00, CONSTANT, %xmm3
+#ifdef __x86_64__
+ PCLMULQDQ 00, CONSTANT, %xmm4
+#endif
+ PCLMULQDQ 0x11, CONSTANT, %xmm5
+ PCLMULQDQ 0x11, CONSTANT, %xmm6
+ PCLMULQDQ 0x11, CONSTANT, %xmm7
+#ifdef __x86_64__
+ PCLMULQDQ 0x11, CONSTANT, %xmm8
+#endif
+ pxor %xmm5, %xmm1
+ pxor %xmm6, %xmm2
+ pxor %xmm7, %xmm3
+#ifdef __x86_64__
+ pxor %xmm8, %xmm4
+#else
+ /* xmm8 unsupported for x32 */
+ movdqa %xmm4, %xmm5
+ PCLMULQDQ 00, CONSTANT, %xmm4
+ PCLMULQDQ 0x11, CONSTANT, %xmm5
+ pxor %xmm5, %xmm4
+#endif
+
+ pxor (BUF), %xmm1
+ pxor 0x10(BUF), %xmm2
+ pxor 0x20(BUF), %xmm3
+ pxor 0x30(BUF), %xmm4
+
+ sub $0x40, LEN
+ add $0x40, BUF
+ cmp $0x40, LEN
+ jge loop_64
+less_64:/* Folding cache line into 128bit */
+#ifdef __x86_64__
+ movdqa .Lconstant_R4R3(%rip), CONSTANT
+#else
+ movdqa .Lconstant_R4R3 - delta(%ecx), CONSTANT
+#endif
+ prefetchnta (BUF)
+
+ movdqa %xmm1, %xmm5
+ PCLMULQDQ 0x00, CONSTANT, %xmm1
+ PCLMULQDQ 0x11, CONSTANT, %xmm5
+ pxor %xmm5, %xmm1
+ pxor %xmm2, %xmm1
+
+ movdqa %xmm1, %xmm5
+ PCLMULQDQ 0x00, CONSTANT, %xmm1
+ PCLMULQDQ 0x11, CONSTANT, %xmm5
+ pxor %xmm5, %xmm1
+ pxor %xmm3, %xmm1
+
+ movdqa %xmm1, %xmm5
+ PCLMULQDQ 0x00, CONSTANT, %xmm1
+ PCLMULQDQ 0x11, CONSTANT, %xmm5
+ pxor %xmm5, %xmm1
+ pxor %xmm4, %xmm1
+
+ cmp $0x10, LEN
+ jb fold_64
+loop_16:/* Folding rest buffer into 128bit */
+ movdqa %xmm1, %xmm5
+ PCLMULQDQ 0x00, CONSTANT, %xmm1
+ PCLMULQDQ 0x11, CONSTANT, %xmm5
+ pxor %xmm5, %xmm1
+ pxor (BUF), %xmm1
+ sub $0x10, LEN
+ add $0x10, BUF
+ cmp $0x10, LEN
+ jge loop_16
+
+fold_64:
+ /* perform the last 64 bit fold, also adds 32 zeroes
+ * to the input stream */
+ PCLMULQDQ 0x01, %xmm1, CONSTANT /* R4 * xmm1.low */
+ psrldq $0x08, %xmm1
+ pxor CONSTANT, %xmm1
+
+ /* final 32-bit fold */
+ movdqa %xmm1, %xmm2
+#ifdef __x86_64__
+ movdqa .Lconstant_R5(%rip), CONSTANT
+ movdqa .Lconstant_mask32(%rip), %xmm3
+#else
+ movdqa .Lconstant_R5 - delta(%ecx), CONSTANT
+ movdqa .Lconstant_mask32 - delta(%ecx), %xmm3
+#endif
+ psrldq $0x04, %xmm2
+ pand %xmm3, %xmm1
+ PCLMULQDQ 0x00, CONSTANT, %xmm1
+ pxor %xmm2, %xmm1
+
+ /* Finish up with the bit-reversed barrett reduction 64 ==> 32 bits */
+#ifdef __x86_64__
+ movdqa .Lconstant_RUpoly(%rip), CONSTANT
+#else
+ movdqa .Lconstant_RUpoly - delta(%ecx), CONSTANT
+#endif
+ movdqa %xmm1, %xmm2
+ pand %xmm3, %xmm1
+ PCLMULQDQ 0x10, CONSTANT, %xmm1
+ pand %xmm3, %xmm1
+ PCLMULQDQ 0x00, CONSTANT, %xmm1
+ pxor %xmm2, %xmm1
+ pextrd $0x01, %xmm1, %eax
+
+ ret
diff --git a/arch/x86/crypto/crc32-pclmul_glue.c b/arch/x86/crypto/crc32-pclmul_glue.c
new file mode 100644
index 000000000000..9d014a74ef96
--- /dev/null
+++ b/arch/x86/crypto/crc32-pclmul_glue.c
@@ -0,0 +1,201 @@
+/* GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see http://www.gnu.org/licenses
+ *
+ * Please visit http://www.xyratex.com/contact if you need additional
+ * information or have any questions.
+ *
+ * GPL HEADER END
+ */
+
+/*
+ * Copyright 2012 Xyratex Technology Limited
+ *
+ * Wrappers for kernel crypto shash api to pclmulqdq crc32 imlementation.
+ */
+#include <linux/init.h>
+#include <linux/module.h>
+#include <linux/string.h>
+#include <linux/kernel.h>
+#include <linux/crc32.h>
+#include <crypto/internal/hash.h>
+
+#include <asm/cpufeature.h>
+#include <asm/cpu_device_id.h>
+#include <asm/i387.h>
+
+#define CHKSUM_BLOCK_SIZE 1
+#define CHKSUM_DIGEST_SIZE 4
+
+#define PCLMUL_MIN_LEN 64L /* minimum size of buffer
+ * for crc32_pclmul_le_16 */
+#define SCALE_F 16L /* size of xmm register */
+#define SCALE_F_MASK (SCALE_F - 1)
+
+u32 crc32_pclmul_le_16(unsigned char const *buffer, size_t len, u32 crc32);
+
+static u32 __attribute__((pure))
+ crc32_pclmul_le(u32 crc, unsigned char const *p, size_t len)
+{
+ unsigned int iquotient;
+ unsigned int iremainder;
+ unsigned int prealign;
+
+ if (len < PCLMUL_MIN_LEN + SCALE_F_MASK || !irq_fpu_usable())
+ return crc32_le(crc, p, len);
+
+ if ((long)p & SCALE_F_MASK) {
+ /* align p to 16 byte */
+ prealign = SCALE_F - ((long)p & SCALE_F_MASK);
+
+ crc = crc32_le(crc, p, prealign);
+ len -= prealign;
+ p = (unsigned char *)(((unsigned long)p + SCALE_F_MASK) &
+ ~SCALE_F_MASK);
+ }
+ iquotient = len & (~SCALE_F_MASK);
+ iremainder = len & SCALE_F_MASK;
+
+ kernel_fpu_begin();
+ crc = crc32_pclmul_le_16(p, iquotient, crc);
+ kernel_fpu_end();
+
+ if (iremainder)
+ crc = crc32_le(crc, p + iquotient, iremainder);
+
+ return crc;
+}
+
+static int crc32_pclmul_cra_init(struct crypto_tfm *tfm)
+{
+ u32 *key = crypto_tfm_ctx(tfm);
+
+ *key = 0;
+
+ return 0;
+}
+
+static int crc32_pclmul_setkey(struct crypto_shash *hash, const u8 *key,
+ unsigned int keylen)
+{
+ u32 *mctx = crypto_shash_ctx(hash);
+
+ if (keylen != sizeof(u32)) {
+ crypto_shash_set_flags(hash, CRYPTO_TFM_RES_BAD_KEY_LEN);
+ return -EINVAL;
+ }
+ *mctx = le32_to_cpup((__le32 *)key);
+ return 0;
+}
+
+static int crc32_pclmul_init(struct shash_desc *desc)
+{
+ u32 *mctx = crypto_shash_ctx(desc->tfm);
+ u32 *crcp = shash_desc_ctx(desc);
+
+ *crcp = *mctx;
+
+ return 0;
+}
+
+static int crc32_pclmul_update(struct shash_desc *desc, const u8 *data,
+ unsigned int len)
+{
+ u32 *crcp = shash_desc_ctx(desc);
+
+ *crcp = crc32_pclmul_le(*crcp, data, len);
+ return 0;
+}
+
+/* No final XOR 0xFFFFFFFF, like crc32_le */
+static int __crc32_pclmul_finup(u32 *crcp, const u8 *data, unsigned int len,
+ u8 *out)
+{
+ *(__le32 *)out = cpu_to_le32(crc32_pclmul_le(*crcp, data, len));
+ return 0;
+}
+
+static int crc32_pclmul_finup(struct shash_desc *desc, const u8 *data,
+ unsigned int len, u8 *out)
+{
+ return __crc32_pclmul_finup(shash_desc_ctx(desc), data, len, out);
+}
+
+static int crc32_pclmul_final(struct shash_desc *desc, u8 *out)
+{
+ u32 *crcp = shash_desc_ctx(desc);
+
+ *(__le32 *)out = cpu_to_le32p(crcp);
+ return 0;
+}
+
+static int crc32_pclmul_digest(struct shash_desc *desc, const u8 *data,
+ unsigned int len, u8 *out)
+{
+ return __crc32_pclmul_finup(crypto_shash_ctx(desc->tfm), data, len,
+ out);
+}
+
+static struct shash_alg alg = {
+ .setkey = crc32_pclmul_setkey,
+ .init = crc32_pclmul_init,
+ .update = crc32_pclmul_update,
+ .final = crc32_pclmul_final,
+ .finup = crc32_pclmul_finup,
+ .digest = crc32_pclmul_digest,
+ .descsize = sizeof(u32),
+ .digestsize = CHKSUM_DIGEST_SIZE,
+ .base = {
+ .cra_name = "crc32",
+ .cra_driver_name = "crc32-pclmul",
+ .cra_priority = 200,
+ .cra_blocksize = CHKSUM_BLOCK_SIZE,
+ .cra_ctxsize = sizeof(u32),
+ .cra_module = THIS_MODULE,
+ .cra_init = crc32_pclmul_cra_init,
+ }
+};
+
+static const struct x86_cpu_id crc32pclmul_cpu_id[] = {
+ X86_FEATURE_MATCH(X86_FEATURE_PCLMULQDQ),
+ {}
+};
+MODULE_DEVICE_TABLE(x86cpu, crc32pclmul_cpu_id);
+
+
+static int __init crc32_pclmul_mod_init(void)
+{
+
+ if (!x86_match_cpu(crc32pclmul_cpu_id)) {
+ pr_info("PCLMULQDQ-NI instructions are not detected.\n");
+ return -ENODEV;
+ }
+ return crypto_register_shash(&alg);
+}
+
+static void __exit crc32_pclmul_mod_fini(void)
+{
+ crypto_unregister_shash(&alg);
+}
+
+module_init(crc32_pclmul_mod_init);
+module_exit(crc32_pclmul_mod_fini);
+
+MODULE_AUTHOR("Alexander Boyko <alexander_boyko@xyratex.com>");
+MODULE_LICENSE("GPL");
+
+MODULE_ALIAS("crc32");
+MODULE_ALIAS("crc32-pclmul");
diff --git a/arch/x86/crypto/crc32c-pcl-intel-asm_64.S b/arch/x86/crypto/crc32c-pcl-intel-asm_64.S
index 93c6d39237ac..cf1a7ec4cc3a 100644
--- a/arch/x86/crypto/crc32c-pcl-intel-asm_64.S
+++ b/arch/x86/crypto/crc32c-pcl-intel-asm_64.S
@@ -42,6 +42,8 @@
* SOFTWARE.
*/
+#include <linux/linkage.h>
+
## ISCSI CRC 32 Implementation with crc32 and pclmulqdq Instruction
.macro LABEL prefix n
@@ -68,8 +70,7 @@
# unsigned int crc_pcl(u8 *buffer, int len, unsigned int crc_init);
-.global crc_pcl
-crc_pcl:
+ENTRY(crc_pcl)
#define bufp %rdi
#define bufp_dw %edi
#define bufp_w %di
@@ -323,6 +324,9 @@ JMPTBL_ENTRY %i
.noaltmacro
i=i+1
.endr
+
+ENDPROC(crc_pcl)
+
################################################################
## PCLMULQDQ tables
## Table is 128 entries x 2 quad words each
diff --git a/arch/x86/crypto/ghash-clmulni-intel_asm.S b/arch/x86/crypto/ghash-clmulni-intel_asm.S
index 1eb7f90cb7b9..586f41aac361 100644
--- a/arch/x86/crypto/ghash-clmulni-intel_asm.S
+++ b/arch/x86/crypto/ghash-clmulni-intel_asm.S
@@ -94,6 +94,7 @@ __clmul_gf128mul_ble:
pxor T2, T1
pxor T1, DATA
ret
+ENDPROC(__clmul_gf128mul_ble)
/* void clmul_ghash_mul(char *dst, const be128 *shash) */
ENTRY(clmul_ghash_mul)
@@ -105,6 +106,7 @@ ENTRY(clmul_ghash_mul)
PSHUFB_XMM BSWAP DATA
movups DATA, (%rdi)
ret
+ENDPROC(clmul_ghash_mul)
/*
* void clmul_ghash_update(char *dst, const char *src, unsigned int srclen,
@@ -131,6 +133,7 @@ ENTRY(clmul_ghash_update)
movups DATA, (%rdi)
.Lupdate_just_ret:
ret
+ENDPROC(clmul_ghash_update)
/*
* void clmul_ghash_setkey(be128 *shash, const u8 *key);
@@ -155,3 +158,4 @@ ENTRY(clmul_ghash_setkey)
pxor %xmm1, %xmm0
movups %xmm0, (%rdi)
ret
+ENDPROC(clmul_ghash_setkey)
diff --git a/arch/x86/crypto/salsa20-i586-asm_32.S b/arch/x86/crypto/salsa20-i586-asm_32.S
index 72eb306680b2..329452b8f794 100644
--- a/arch/x86/crypto/salsa20-i586-asm_32.S
+++ b/arch/x86/crypto/salsa20-i586-asm_32.S
@@ -2,11 +2,12 @@
# D. J. Bernstein
# Public domain.
-# enter ECRYPT_encrypt_bytes
+#include <linux/linkage.h>
+
.text
-.p2align 5
-.globl ECRYPT_encrypt_bytes
-ECRYPT_encrypt_bytes:
+
+# enter salsa20_encrypt_bytes
+ENTRY(salsa20_encrypt_bytes)
mov %esp,%eax
and $31,%eax
add $256,%eax
@@ -933,11 +934,10 @@ ECRYPT_encrypt_bytes:
add $64,%esi
# goto bytesatleast1
jmp ._bytesatleast1
-# enter ECRYPT_keysetup
-.text
-.p2align 5
-.globl ECRYPT_keysetup
-ECRYPT_keysetup:
+ENDPROC(salsa20_encrypt_bytes)
+
+# enter salsa20_keysetup
+ENTRY(salsa20_keysetup)
mov %esp,%eax
and $31,%eax
add $256,%eax
@@ -1060,11 +1060,10 @@ ECRYPT_keysetup:
# leave
add %eax,%esp
ret
-# enter ECRYPT_ivsetup
-.text
-.p2align 5
-.globl ECRYPT_ivsetup
-ECRYPT_ivsetup:
+ENDPROC(salsa20_keysetup)
+
+# enter salsa20_ivsetup
+ENTRY(salsa20_ivsetup)
mov %esp,%eax
and $31,%eax
add $256,%eax
@@ -1112,3 +1111,4 @@ ECRYPT_ivsetup:
# leave
add %eax,%esp
ret
+ENDPROC(salsa20_ivsetup)
diff --git a/arch/x86/crypto/salsa20-x86_64-asm_64.S b/arch/x86/crypto/salsa20-x86_64-asm_64.S
index 6214a9b09706..9279e0b2d60e 100644
--- a/arch/x86/crypto/salsa20-x86_64-asm_64.S
+++ b/arch/x86/crypto/salsa20-x86_64-asm_64.S
@@ -1,8 +1,7 @@
-# enter ECRYPT_encrypt_bytes
-.text
-.p2align 5
-.globl ECRYPT_encrypt_bytes
-ECRYPT_encrypt_bytes:
+#include <linux/linkage.h>
+
+# enter salsa20_encrypt_bytes
+ENTRY(salsa20_encrypt_bytes)
mov %rsp,%r11
and $31,%r11
add $256,%r11
@@ -802,11 +801,10 @@ ECRYPT_encrypt_bytes:
# comment:fp stack unchanged by jump
# goto bytesatleast1
jmp ._bytesatleast1
-# enter ECRYPT_keysetup
-.text
-.p2align 5
-.globl ECRYPT_keysetup
-ECRYPT_keysetup:
+ENDPROC(salsa20_encrypt_bytes)
+
+# enter salsa20_keysetup
+ENTRY(salsa20_keysetup)
mov %rsp,%r11
and $31,%r11
add $256,%r11
@@ -892,11 +890,10 @@ ECRYPT_keysetup:
mov %rdi,%rax
mov %rsi,%rdx
ret
-# enter ECRYPT_ivsetup
-.text
-.p2align 5
-.globl ECRYPT_ivsetup
-ECRYPT_ivsetup:
+ENDPROC(salsa20_keysetup)
+
+# enter salsa20_ivsetup
+ENTRY(salsa20_ivsetup)
mov %rsp,%r11
and $31,%r11
add $256,%r11
@@ -918,3 +915,4 @@ ECRYPT_ivsetup:
mov %rdi,%rax
mov %rsi,%rdx
ret
+ENDPROC(salsa20_ivsetup)
diff --git a/arch/x86/crypto/salsa20_glue.c b/arch/x86/crypto/salsa20_glue.c
index a3a3c0205c16..5e8e67739bb5 100644
--- a/arch/x86/crypto/salsa20_glue.c
+++ b/arch/x86/crypto/salsa20_glue.c
@@ -26,11 +26,6 @@
#define SALSA20_MIN_KEY_SIZE 16U
#define SALSA20_MAX_KEY_SIZE 32U
-// use the ECRYPT_* function names
-#define salsa20_keysetup ECRYPT_keysetup
-#define salsa20_ivsetup ECRYPT_ivsetup
-#define salsa20_encrypt_bytes ECRYPT_encrypt_bytes
-
struct salsa20_ctx
{
u32 input[16];
diff --git a/arch/x86/crypto/serpent-avx-x86_64-asm_64.S b/arch/x86/crypto/serpent-avx-x86_64-asm_64.S
index 02b0e9fe997c..43c938612b74 100644
--- a/arch/x86/crypto/serpent-avx-x86_64-asm_64.S
+++ b/arch/x86/crypto/serpent-avx-x86_64-asm_64.S
@@ -24,6 +24,7 @@
*
*/
+#include <linux/linkage.h>
#include "glue_helper-asm-avx.S"
.file "serpent-avx-x86_64-asm_64.S"
@@ -566,8 +567,6 @@
transpose_4x4(x0, x1, x2, x3, t0, t1, t2)
.align 8
-.type __serpent_enc_blk8_avx,@function;
-
__serpent_enc_blk8_avx:
/* input:
* %rdi: ctx, CTX
@@ -619,10 +618,9 @@ __serpent_enc_blk8_avx:
write_blocks(RA2, RB2, RC2, RD2, RK0, RK1, RK2);
ret;
+ENDPROC(__serpent_enc_blk8_avx)
.align 8
-.type __serpent_dec_blk8_avx,@function;
-
__serpent_dec_blk8_avx:
/* input:
* %rdi: ctx, CTX
@@ -674,12 +672,9 @@ __serpent_dec_blk8_avx:
write_blocks(RC2, RD2, RB2, RE2, RK0, RK1, RK2);
ret;
+ENDPROC(__serpent_dec_blk8_avx)
-.align 8
-.global serpent_ecb_enc_8way_avx
-.type serpent_ecb_enc_8way_avx,@function;
-
-serpent_ecb_enc_8way_avx:
+ENTRY(serpent_ecb_enc_8way_avx)
/* input:
* %rdi: ctx, CTX
* %rsi: dst
@@ -693,12 +688,9 @@ serpent_ecb_enc_8way_avx:
store_8way(%rsi, RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2);
ret;
+ENDPROC(serpent_ecb_enc_8way_avx)
-.align 8
-.global serpent_ecb_dec_8way_avx
-.type serpent_ecb_dec_8way_avx,@function;
-
-serpent_ecb_dec_8way_avx:
+ENTRY(serpent_ecb_dec_8way_avx)
/* input:
* %rdi: ctx, CTX
* %rsi: dst
@@ -712,12 +704,9 @@ serpent_ecb_dec_8way_avx:
store_8way(%rsi, RC1, RD1, RB1, RE1, RC2, RD2, RB2, RE2);
ret;
+ENDPROC(serpent_ecb_dec_8way_avx)
-.align 8
-.global serpent_cbc_dec_8way_avx
-.type serpent_cbc_dec_8way_avx,@function;
-
-serpent_cbc_dec_8way_avx:
+ENTRY(serpent_cbc_dec_8way_avx)
/* input:
* %rdi: ctx, CTX
* %rsi: dst
@@ -731,12 +720,9 @@ serpent_cbc_dec_8way_avx:
store_cbc_8way(%rdx, %rsi, RC1, RD1, RB1, RE1, RC2, RD2, RB2, RE2);
ret;
+ENDPROC(serpent_cbc_dec_8way_avx)
-.align 8
-.global serpent_ctr_8way_avx
-.type serpent_ctr_8way_avx,@function;
-
-serpent_ctr_8way_avx:
+ENTRY(serpent_ctr_8way_avx)
/* input:
* %rdi: ctx, CTX
* %rsi: dst
@@ -752,3 +738,4 @@ serpent_ctr_8way_avx:
store_ctr_8way(%rdx, %rsi, RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2);
ret;
+ENDPROC(serpent_ctr_8way_avx)
diff --git a/arch/x86/crypto/serpent-sse2-i586-asm_32.S b/arch/x86/crypto/serpent-sse2-i586-asm_32.S
index c00053d42f99..d348f1553a79 100644
--- a/arch/x86/crypto/serpent-sse2-i586-asm_32.S
+++ b/arch/x86/crypto/serpent-sse2-i586-asm_32.S
@@ -24,6 +24,8 @@
*
*/
+#include <linux/linkage.h>
+
.file "serpent-sse2-i586-asm_32.S"
.text
@@ -510,11 +512,7 @@
pxor t0, x3; \
movdqu x3, (3*4*4)(out);
-.align 8
-.global __serpent_enc_blk_4way
-.type __serpent_enc_blk_4way,@function;
-
-__serpent_enc_blk_4way:
+ENTRY(__serpent_enc_blk_4way)
/* input:
* arg_ctx(%esp): ctx, CTX
* arg_dst(%esp): dst
@@ -566,22 +564,19 @@ __serpent_enc_blk_4way:
movl arg_dst(%esp), %eax;
cmpb $0, arg_xor(%esp);
- jnz __enc_xor4;
+ jnz .L__enc_xor4;
write_blocks(%eax, RA, RB, RC, RD, RT0, RT1, RE);
ret;
-__enc_xor4:
+.L__enc_xor4:
xor_blocks(%eax, RA, RB, RC, RD, RT0, RT1, RE);
ret;
+ENDPROC(__serpent_enc_blk_4way)
-.align 8
-.global serpent_dec_blk_4way
-.type serpent_dec_blk_4way,@function;
-
-serpent_dec_blk_4way:
+ENTRY(serpent_dec_blk_4way)
/* input:
* arg_ctx(%esp): ctx, CTX
* arg_dst(%esp): dst
@@ -633,3 +628,4 @@ serpent_dec_blk_4way:
write_blocks(%eax, RC, RD, RB, RE, RT0, RT1, RA);
ret;
+ENDPROC(serpent_dec_blk_4way)
diff --git a/arch/x86/crypto/serpent-sse2-x86_64-asm_64.S b/arch/x86/crypto/serpent-sse2-x86_64-asm_64.S
index 3ee1ff04d3e9..acc066c7c6b2 100644
--- a/arch/x86/crypto/serpent-sse2-x86_64-asm_64.S
+++ b/arch/x86/crypto/serpent-sse2-x86_64-asm_64.S
@@ -24,6 +24,8 @@
*
*/
+#include <linux/linkage.h>
+
.file "serpent-sse2-x86_64-asm_64.S"
.text
@@ -632,11 +634,7 @@
pxor t0, x3; \
movdqu x3, (3*4*4)(out);
-.align 8
-.global __serpent_enc_blk_8way
-.type __serpent_enc_blk_8way,@function;
-
-__serpent_enc_blk_8way:
+ENTRY(__serpent_enc_blk_8way)
/* input:
* %rdi: ctx, CTX
* %rsi: dst
@@ -687,24 +685,21 @@ __serpent_enc_blk_8way:
leaq (4*4*4)(%rsi), %rax;
testb %cl, %cl;
- jnz __enc_xor8;
+ jnz .L__enc_xor8;
write_blocks(%rsi, RA1, RB1, RC1, RD1, RK0, RK1, RK2);
write_blocks(%rax, RA2, RB2, RC2, RD2, RK0, RK1, RK2);
ret;
-__enc_xor8:
+.L__enc_xor8:
xor_blocks(%rsi, RA1, RB1, RC1, RD1, RK0, RK1, RK2);
xor_blocks(%rax, RA2, RB2, RC2, RD2, RK0, RK1, RK2);
ret;
+ENDPROC(__serpent_enc_blk_8way)
-.align 8
-.global serpent_dec_blk_8way
-.type serpent_dec_blk_8way,@function;
-
-serpent_dec_blk_8way:
+ENTRY(serpent_dec_blk_8way)
/* input:
* %rdi: ctx, CTX
* %rsi: dst
@@ -756,3 +751,4 @@ serpent_dec_blk_8way:
write_blocks(%rax, RC2, RD2, RB2, RE2, RK0, RK1, RK2);
ret;
+ENDPROC(serpent_dec_blk_8way)
diff --git a/arch/x86/crypto/sha1_ssse3_asm.S b/arch/x86/crypto/sha1_ssse3_asm.S
index 49d6987a73d9..a4109506a5e8 100644
--- a/arch/x86/crypto/sha1_ssse3_asm.S
+++ b/arch/x86/crypto/sha1_ssse3_asm.S
@@ -28,6 +28,8 @@
* (at your option) any later version.
*/
+#include <linux/linkage.h>
+
#define CTX %rdi // arg1
#define BUF %rsi // arg2
#define CNT %rdx // arg3
@@ -69,10 +71,8 @@
* param: function's name
*/
.macro SHA1_VECTOR_ASM name
- .global \name
- .type \name, @function
- .align 32
-\name:
+ ENTRY(\name)
+
push %rbx
push %rbp
push %r12
@@ -106,7 +106,7 @@
pop %rbx
ret
- .size \name, .-\name
+ ENDPROC(\name)
.endm
/*
diff --git a/arch/x86/crypto/twofish-avx-x86_64-asm_64.S b/arch/x86/crypto/twofish-avx-x86_64-asm_64.S
index ebac16bfa830..8d3e113b2c95 100644
--- a/arch/x86/crypto/twofish-avx-x86_64-asm_64.S
+++ b/arch/x86/crypto/twofish-avx-x86_64-asm_64.S
@@ -23,6 +23,7 @@
*
*/
+#include <linux/linkage.h>
#include "glue_helper-asm-avx.S"
.file "twofish-avx-x86_64-asm_64.S"
@@ -243,8 +244,6 @@
vpxor x3, wkey, x3;
.align 8
-.type __twofish_enc_blk8,@function;
-
__twofish_enc_blk8:
/* input:
* %rdi: ctx, CTX
@@ -284,10 +283,9 @@ __twofish_enc_blk8:
outunpack_blocks(RC2, RD2, RA2, RB2, RK1, RX0, RY0, RK2);
ret;
+ENDPROC(__twofish_enc_blk8)
.align 8
-.type __twofish_dec_blk8,@function;
-
__twofish_dec_blk8:
/* input:
* %rdi: ctx, CTX
@@ -325,12 +323,9 @@ __twofish_dec_blk8:
outunpack_blocks(RA2, RB2, RC2, RD2, RK1, RX0, RY0, RK2);
ret;
+ENDPROC(__twofish_dec_blk8)
-.align 8
-.global twofish_ecb_enc_8way
-.type twofish_ecb_enc_8way,@function;
-
-twofish_ecb_enc_8way:
+ENTRY(twofish_ecb_enc_8way)
/* input:
* %rdi: ctx, CTX
* %rsi: dst
@@ -346,12 +341,9 @@ twofish_ecb_enc_8way:
store_8way(%r11, RC1, RD1, RA1, RB1, RC2, RD2, RA2, RB2);
ret;
+ENDPROC(twofish_ecb_enc_8way)
-.align 8
-.global twofish_ecb_dec_8way
-.type twofish_ecb_dec_8way,@function;
-
-twofish_ecb_dec_8way:
+ENTRY(twofish_ecb_dec_8way)
/* input:
* %rdi: ctx, CTX
* %rsi: dst
@@ -367,12 +359,9 @@ twofish_ecb_dec_8way:
store_8way(%r11, RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2);
ret;
+ENDPROC(twofish_ecb_dec_8way)
-.align 8
-.global twofish_cbc_dec_8way
-.type twofish_cbc_dec_8way,@function;
-
-twofish_cbc_dec_8way:
+ENTRY(twofish_cbc_dec_8way)
/* input:
* %rdi: ctx, CTX
* %rsi: dst
@@ -393,12 +382,9 @@ twofish_cbc_dec_8way:
popq %r12;
ret;
+ENDPROC(twofish_cbc_dec_8way)
-.align 8
-.global twofish_ctr_8way
-.type twofish_ctr_8way,@function;
-
-twofish_ctr_8way:
+ENTRY(twofish_ctr_8way)
/* input:
* %rdi: ctx, CTX
* %rsi: dst
@@ -421,3 +407,4 @@ twofish_ctr_8way:
popq %r12;
ret;
+ENDPROC(twofish_ctr_8way)
diff --git a/arch/x86/crypto/twofish-i586-asm_32.S b/arch/x86/crypto/twofish-i586-asm_32.S
index 658af4bb35c9..694ea4587ba7 100644
--- a/arch/x86/crypto/twofish-i586-asm_32.S
+++ b/arch/x86/crypto/twofish-i586-asm_32.S
@@ -20,6 +20,7 @@
.file "twofish-i586-asm.S"
.text
+#include <linux/linkage.h>
#include <asm/asm-offsets.h>
/* return address at 0 */
@@ -219,11 +220,7 @@
xor %esi, d ## D;\
ror $1, d ## D;
-.align 4
-.global twofish_enc_blk
-.global twofish_dec_blk
-
-twofish_enc_blk:
+ENTRY(twofish_enc_blk)
push %ebp /* save registers according to calling convention*/
push %ebx
push %esi
@@ -277,8 +274,9 @@ twofish_enc_blk:
pop %ebp
mov $1, %eax
ret
+ENDPROC(twofish_enc_blk)
-twofish_dec_blk:
+ENTRY(twofish_dec_blk)
push %ebp /* save registers according to calling convention*/
push %ebx
push %esi
@@ -333,3 +331,4 @@ twofish_dec_blk:
pop %ebp
mov $1, %eax
ret
+ENDPROC(twofish_dec_blk)
diff --git a/arch/x86/crypto/twofish-x86_64-asm_64-3way.S b/arch/x86/crypto/twofish-x86_64-asm_64-3way.S
index 5b012a2c5119..1c3b7ceb36d2 100644
--- a/arch/x86/crypto/twofish-x86_64-asm_64-3way.S
+++ b/arch/x86/crypto/twofish-x86_64-asm_64-3way.S
@@ -20,6 +20,8 @@
*
*/
+#include <linux/linkage.h>
+
.file "twofish-x86_64-asm-3way.S"
.text
@@ -214,11 +216,7 @@
rorq $32, RAB2; \
outunpack3(mov, RIO, 2, RAB, 2);
-.align 8
-.global __twofish_enc_blk_3way
-.type __twofish_enc_blk_3way,@function;
-
-__twofish_enc_blk_3way:
+ENTRY(__twofish_enc_blk_3way)
/* input:
* %rdi: ctx, CTX
* %rsi: dst
@@ -250,7 +248,7 @@ __twofish_enc_blk_3way:
popq %rbp; /* bool xor */
testb %bpl, %bpl;
- jnz __enc_xor3;
+ jnz .L__enc_xor3;
outunpack_enc3(mov);
@@ -262,7 +260,7 @@ __twofish_enc_blk_3way:
popq %r15;
ret;
-__enc_xor3:
+.L__enc_xor3:
outunpack_enc3(xor);
popq %rbx;
@@ -272,11 +270,9 @@ __enc_xor3:
popq %r14;
popq %r15;
ret;
+ENDPROC(__twofish_enc_blk_3way)
-.global twofish_dec_blk_3way
-.type twofish_dec_blk_3way,@function;
-
-twofish_dec_blk_3way:
+ENTRY(twofish_dec_blk_3way)
/* input:
* %rdi: ctx, CTX
* %rsi: dst
@@ -313,4 +309,4 @@ twofish_dec_blk_3way:
popq %r14;
popq %r15;
ret;
-
+ENDPROC(twofish_dec_blk_3way)
diff --git a/arch/x86/crypto/twofish-x86_64-asm_64.S b/arch/x86/crypto/twofish-x86_64-asm_64.S
index 7bcf3fcc3668..a039d21986a2 100644
--- a/arch/x86/crypto/twofish-x86_64-asm_64.S
+++ b/arch/x86/crypto/twofish-x86_64-asm_64.S
@@ -20,6 +20,7 @@
.file "twofish-x86_64-asm.S"
.text
+#include <linux/linkage.h>
#include <asm/asm-offsets.h>
#define a_offset 0
@@ -214,11 +215,7 @@
xor %r8d, d ## D;\
ror $1, d ## D;
-.align 8
-.global twofish_enc_blk
-.global twofish_dec_blk
-
-twofish_enc_blk:
+ENTRY(twofish_enc_blk)
pushq R1
/* %rdi contains the ctx address */
@@ -269,8 +266,9 @@ twofish_enc_blk:
popq R1
movq $1,%rax
ret
+ENDPROC(twofish_enc_blk)
-twofish_dec_blk:
+ENTRY(twofish_dec_blk)
pushq R1
/* %rdi contains the ctx address */
@@ -320,3 +318,4 @@ twofish_dec_blk:
popq R1
movq $1,%rax
ret
+ENDPROC(twofish_dec_blk)
diff --git a/arch/x86/ia32/ia32_signal.c b/arch/x86/ia32/ia32_signal.c
index a1daf4a65009..cf1a471a18a2 100644
--- a/arch/x86/ia32/ia32_signal.c
+++ b/arch/x86/ia32/ia32_signal.c
@@ -129,13 +129,6 @@ int copy_siginfo_from_user32(siginfo_t *to, compat_siginfo_t __user *from)
return err;
}
-asmlinkage long sys32_sigsuspend(int history0, int history1, old_sigset_t mask)
-{
- sigset_t blocked;
- siginitset(&blocked, mask);
- return sigsuspend(&blocked);
-}
-
/*
* Do a signal return; undo the signal stack.
*/
@@ -215,8 +208,9 @@ static int ia32_restore_sigcontext(struct pt_regs *regs,
return err;
}
-asmlinkage long sys32_sigreturn(struct pt_regs *regs)
+asmlinkage long sys32_sigreturn(void)
{
+ struct pt_regs *regs = current_pt_regs();
struct sigframe_ia32 __user *frame = (struct sigframe_ia32 __user *)(regs->sp-8);
sigset_t set;
unsigned int ax;
@@ -241,8 +235,9 @@ badframe:
return 0;
}
-asmlinkage long sys32_rt_sigreturn(struct pt_regs *regs)
+asmlinkage long sys32_rt_sigreturn(void)
{
+ struct pt_regs *regs = current_pt_regs();
struct rt_sigframe_ia32 __user *frame;
sigset_t set;
unsigned int ax;
@@ -314,7 +309,7 @@ static int ia32_setup_sigcontext(struct sigcontext_ia32 __user *sc,
/*
* Determine which stack to use..
*/
-static void __user *get_sigframe(struct k_sigaction *ka, struct pt_regs *regs,
+static void __user *get_sigframe(struct ksignal *ksig, struct pt_regs *regs,
size_t frame_size,
void __user **fpstate)
{
@@ -324,16 +319,13 @@ static void __user *get_sigframe(struct k_sigaction *ka, struct pt_regs *regs,
sp = regs->sp;
/* This is the X/Open sanctioned signal stack switching. */
- if (ka->sa.sa_flags & SA_ONSTACK) {
- if (sas_ss_flags(sp) == 0)
- sp = current->sas_ss_sp + current->sas_ss_size;
- }
-
+ if (ksig->ka.sa.sa_flags & SA_ONSTACK)
+ sp = sigsp(sp, ksig);
/* This is the legacy signal stack switching. */
else if ((regs->ss & 0xffff) != __USER32_DS &&
- !(ka->sa.sa_flags & SA_RESTORER) &&
- ka->sa.sa_restorer)
- sp = (unsigned long) ka->sa.sa_restorer;
+ !(ksig->ka.sa.sa_flags & SA_RESTORER) &&
+ ksig->ka.sa.sa_restorer)
+ sp = (unsigned long) ksig->ka.sa.sa_restorer;
if (used_math()) {
unsigned long fx_aligned, math_size;
@@ -352,7 +344,7 @@ static void __user *get_sigframe(struct k_sigaction *ka, struct pt_regs *regs,
return (void __user *) sp;
}
-int ia32_setup_frame(int sig, struct k_sigaction *ka,
+int ia32_setup_frame(int sig, struct ksignal *ksig,
compat_sigset_t *set, struct pt_regs *regs)
{
struct sigframe_ia32 __user *frame;
@@ -371,7 +363,7 @@ int ia32_setup_frame(int sig, struct k_sigaction *ka,
0x80cd, /* int $0x80 */
};
- frame = get_sigframe(ka, regs, sizeof(*frame), &fpstate);
+ frame = get_sigframe(ksig, regs, sizeof(*frame), &fpstate);
if (!access_ok(VERIFY_WRITE, frame, sizeof(*frame)))
return -EFAULT;
@@ -388,8 +380,8 @@ int ia32_setup_frame(int sig, struct k_sigaction *ka,
return -EFAULT;
}
- if (ka->sa.sa_flags & SA_RESTORER) {
- restorer = ka->sa.sa_restorer;
+ if (ksig->ka.sa.sa_flags & SA_RESTORER) {
+ restorer = ksig->ka.sa.sa_restorer;
} else {
/* Return stub is in 32bit vsyscall page */
if (current->mm->context.vdso)
@@ -414,7 +406,7 @@ int ia32_setup_frame(int sig, struct k_sigaction *ka,
/* Set up registers for signal handler */
regs->sp = (unsigned long) frame;
- regs->ip = (unsigned long) ka->sa.sa_handler;
+ regs->ip = (unsigned long) ksig->ka.sa.sa_handler;
/* Make -mregparm=3 work */
regs->ax = sig;
@@ -430,7 +422,7 @@ int ia32_setup_frame(int sig, struct k_sigaction *ka,
return 0;
}
-int ia32_setup_rt_frame(int sig, struct k_sigaction *ka, siginfo_t *info,
+int ia32_setup_rt_frame(int sig, struct ksignal *ksig,
compat_sigset_t *set, struct pt_regs *regs)
{
struct rt_sigframe_ia32 __user *frame;
@@ -451,7 +443,7 @@ int ia32_setup_rt_frame(int sig, struct k_sigaction *ka, siginfo_t *info,
0,
};
- frame = get_sigframe(ka, regs, sizeof(*frame), &fpstate);
+ frame = get_sigframe(ksig, regs, sizeof(*frame), &fpstate);
if (!access_ok(VERIFY_WRITE, frame, sizeof(*frame)))
return -EFAULT;
@@ -469,8 +461,8 @@ int ia32_setup_rt_frame(int sig, struct k_sigaction *ka, siginfo_t *info,
put_user_ex(0, &frame->uc.uc_link);
err |= __compat_save_altstack(&frame->uc.uc_stack, regs->sp);
- if (ka->sa.sa_flags & SA_RESTORER)
- restorer = ka->sa.sa_restorer;
+ if (ksig->ka.sa.sa_flags & SA_RESTORER)
+ restorer = ksig->ka.sa.sa_restorer;
else
restorer = VDSO32_SYMBOL(current->mm->context.vdso,
rt_sigreturn);
@@ -483,7 +475,7 @@ int ia32_setup_rt_frame(int sig, struct k_sigaction *ka, siginfo_t *info,
put_user_ex(*((u64 *)&code), (u64 __user *)frame->retcode);
} put_user_catch(err);
- err |= copy_siginfo_to_user32(&frame->info, info);
+ err |= copy_siginfo_to_user32(&frame->info, &ksig->info);
err |= ia32_setup_sigcontext(&frame->uc.uc_mcontext, fpstate,
regs, set->sig[0]);
err |= __copy_to_user(&frame->uc.uc_sigmask, set, sizeof(*set));
@@ -493,7 +485,7 @@ int ia32_setup_rt_frame(int sig, struct k_sigaction *ka, siginfo_t *info,
/* Set up registers for signal handler */
regs->sp = (unsigned long) frame;
- regs->ip = (unsigned long) ka->sa.sa_handler;
+ regs->ip = (unsigned long) ksig->ka.sa.sa_handler;
/* Make -mregparm=3 work */
regs->ax = sig;
diff --git a/arch/x86/ia32/ia32entry.S b/arch/x86/ia32/ia32entry.S
index 142c4ceff112..474dc1b59f72 100644
--- a/arch/x86/ia32/ia32entry.S
+++ b/arch/x86/ia32/ia32entry.S
@@ -456,18 +456,16 @@ ia32_badsys:
ALIGN
GLOBAL(\label)
leaq \func(%rip),%rax
- leaq -ARGOFFSET+8(%rsp),\arg /* 8 for return address */
jmp ia32_ptregs_common
.endm
CFI_STARTPROC32
- PTREGSCALL stub32_rt_sigreturn, sys32_rt_sigreturn, %rdi
- PTREGSCALL stub32_sigreturn, sys32_sigreturn, %rdi
- PTREGSCALL stub32_execve, compat_sys_execve, %rcx
- PTREGSCALL stub32_fork, sys_fork, %rdi
- PTREGSCALL stub32_vfork, sys_vfork, %rdi
- PTREGSCALL stub32_iopl, sys_iopl, %rsi
+ PTREGSCALL stub32_rt_sigreturn, sys32_rt_sigreturn
+ PTREGSCALL stub32_sigreturn, sys32_sigreturn
+ PTREGSCALL stub32_execve, compat_sys_execve
+ PTREGSCALL stub32_fork, sys_fork
+ PTREGSCALL stub32_vfork, sys_vfork
ALIGN
GLOBAL(stub32_clone)
diff --git a/arch/x86/ia32/sys_ia32.c b/arch/x86/ia32/sys_ia32.c
index d0b689ba7be2..592f5a9a9c0e 100644
--- a/arch/x86/ia32/sys_ia32.c
+++ b/arch/x86/ia32/sys_ia32.c
@@ -172,183 +172,12 @@ asmlinkage long sys32_mprotect(unsigned long start, size_t len,
return sys_mprotect(start, len, prot);
}
-asmlinkage long sys32_rt_sigaction(int sig, struct sigaction32 __user *act,
- struct sigaction32 __user *oact,
- unsigned int sigsetsize)
-{
- struct k_sigaction new_ka, old_ka;
- int ret;
- compat_sigset_t set32;
-
- /* XXX: Don't preclude handling different sized sigset_t's. */
- if (sigsetsize != sizeof(compat_sigset_t))
- return -EINVAL;
-
- if (act) {
- compat_uptr_t handler, restorer;
-
- if (!access_ok(VERIFY_READ, act, sizeof(*act)) ||
- __get_user(handler, &act->sa_handler) ||
- __get_user(new_ka.sa.sa_flags, &act->sa_flags) ||
- __get_user(restorer, &act->sa_restorer) ||
- __copy_from_user(&set32, &act->sa_mask,
- sizeof(compat_sigset_t)))
- return -EFAULT;
- new_ka.sa.sa_handler = compat_ptr(handler);
- new_ka.sa.sa_restorer = compat_ptr(restorer);
-
- /*
- * FIXME: here we rely on _COMPAT_NSIG_WORS to be >=
- * than _NSIG_WORDS << 1
- */
- switch (_NSIG_WORDS) {
- case 4: new_ka.sa.sa_mask.sig[3] = set32.sig[6]
- | (((long)set32.sig[7]) << 32);
- case 3: new_ka.sa.sa_mask.sig[2] = set32.sig[4]
- | (((long)set32.sig[5]) << 32);
- case 2: new_ka.sa.sa_mask.sig[1] = set32.sig[2]
- | (((long)set32.sig[3]) << 32);
- case 1: new_ka.sa.sa_mask.sig[0] = set32.sig[0]
- | (((long)set32.sig[1]) << 32);
- }
- }
-
- ret = do_sigaction(sig, act ? &new_ka : NULL, oact ? &old_ka : NULL);
-
- if (!ret && oact) {
- /*
- * FIXME: here we rely on _COMPAT_NSIG_WORS to be >=
- * than _NSIG_WORDS << 1
- */
- switch (_NSIG_WORDS) {
- case 4:
- set32.sig[7] = (old_ka.sa.sa_mask.sig[3] >> 32);
- set32.sig[6] = old_ka.sa.sa_mask.sig[3];
- case 3:
- set32.sig[5] = (old_ka.sa.sa_mask.sig[2] >> 32);
- set32.sig[4] = old_ka.sa.sa_mask.sig[2];
- case 2:
- set32.sig[3] = (old_ka.sa.sa_mask.sig[1] >> 32);
- set32.sig[2] = old_ka.sa.sa_mask.sig[1];
- case 1:
- set32.sig[1] = (old_ka.sa.sa_mask.sig[0] >> 32);
- set32.sig[0] = old_ka.sa.sa_mask.sig[0];
- }
- if (!access_ok(VERIFY_WRITE, oact, sizeof(*oact)) ||
- __put_user(ptr_to_compat(old_ka.sa.sa_handler),
- &oact->sa_handler) ||
- __put_user(ptr_to_compat(old_ka.sa.sa_restorer),
- &oact->sa_restorer) ||
- __put_user(old_ka.sa.sa_flags, &oact->sa_flags) ||
- __copy_to_user(&oact->sa_mask, &set32,
- sizeof(compat_sigset_t)))
- return -EFAULT;
- }
-
- return ret;
-}
-
-asmlinkage long sys32_sigaction(int sig, struct old_sigaction32 __user *act,
- struct old_sigaction32 __user *oact)
-{
- struct k_sigaction new_ka, old_ka;
- int ret;
-
- if (act) {
- compat_old_sigset_t mask;
- compat_uptr_t handler, restorer;
-
- if (!access_ok(VERIFY_READ, act, sizeof(*act)) ||
- __get_user(handler, &act->sa_handler) ||
- __get_user(new_ka.sa.sa_flags, &act->sa_flags) ||
- __get_user(restorer, &act->sa_restorer) ||
- __get_user(mask, &act->sa_mask))
- return -EFAULT;
-
- new_ka.sa.sa_handler = compat_ptr(handler);
- new_ka.sa.sa_restorer = compat_ptr(restorer);
-
- siginitset(&new_ka.sa.sa_mask, mask);
- }
-
- ret = do_sigaction(sig, act ? &new_ka : NULL, oact ? &old_ka : NULL);
-
- if (!ret && oact) {
- if (!access_ok(VERIFY_WRITE, oact, sizeof(*oact)) ||
- __put_user(ptr_to_compat(old_ka.sa.sa_handler),
- &oact->sa_handler) ||
- __put_user(ptr_to_compat(old_ka.sa.sa_restorer),
- &oact->sa_restorer) ||
- __put_user(old_ka.sa.sa_flags, &oact->sa_flags) ||
- __put_user(old_ka.sa.sa_mask.sig[0], &oact->sa_mask))
- return -EFAULT;
- }
-
- return ret;
-}
-
asmlinkage long sys32_waitpid(compat_pid_t pid, unsigned int __user *stat_addr,
int options)
{
return compat_sys_wait4(pid, stat_addr, options, NULL);
}
-/* 32-bit timeval and related flotsam. */
-
-asmlinkage long sys32_sched_rr_get_interval(compat_pid_t pid,
- struct compat_timespec __user *interval)
-{
- struct timespec t;
- int ret;
- mm_segment_t old_fs = get_fs();
-
- set_fs(KERNEL_DS);
- ret = sys_sched_rr_get_interval(pid, (struct timespec __user *)&t);
- set_fs(old_fs);
- if (put_compat_timespec(&t, interval))
- return -EFAULT;
- return ret;
-}
-
-asmlinkage long sys32_rt_sigpending(compat_sigset_t __user *set,
- compat_size_t sigsetsize)
-{
- sigset_t s;
- compat_sigset_t s32;
- int ret;
- mm_segment_t old_fs = get_fs();
-
- set_fs(KERNEL_DS);
- ret = sys_rt_sigpending((sigset_t __user *)&s, sigsetsize);
- set_fs(old_fs);
- if (!ret) {
- switch (_NSIG_WORDS) {
- case 4: s32.sig[7] = (s.sig[3] >> 32); s32.sig[6] = s.sig[3];
- case 3: s32.sig[5] = (s.sig[2] >> 32); s32.sig[4] = s.sig[2];
- case 2: s32.sig[3] = (s.sig[1] >> 32); s32.sig[2] = s.sig[1];
- case 1: s32.sig[1] = (s.sig[0] >> 32); s32.sig[0] = s.sig[0];
- }
- if (copy_to_user(set, &s32, sizeof(compat_sigset_t)))
- return -EFAULT;
- }
- return ret;
-}
-
-asmlinkage long sys32_rt_sigqueueinfo(int pid, int sig,
- compat_siginfo_t __user *uinfo)
-{
- siginfo_t info;
- int ret;
- mm_segment_t old_fs = get_fs();
-
- if (copy_siginfo_from_user32(&info, uinfo))
- return -EFAULT;
- set_fs(KERNEL_DS);
- ret = sys_rt_sigqueueinfo(pid, sig, (siginfo_t __user *)&info);
- set_fs(old_fs);
- return ret;
-}
-
/* warning: next two assume little endian */
asmlinkage long sys32_pread(unsigned int fd, char __user *ubuf, u32 count,
u32 poslo, u32 poshi)
diff --git a/arch/x86/include/asm/acpi.h b/arch/x86/include/asm/acpi.h
index 0c44630d1789..b31bf97775fc 100644
--- a/arch/x86/include/asm/acpi.h
+++ b/arch/x86/include/asm/acpi.h
@@ -49,10 +49,6 @@
/* Asm macros */
-#define ACPI_ASM_MACROS
-#define BREAKPOINT3
-#define ACPI_DISABLE_IRQS() local_irq_disable()
-#define ACPI_ENABLE_IRQS() local_irq_enable()
#define ACPI_FLUSH_CPU_CACHE() wbinvd()
int __acpi_acquire_global_lock(unsigned int *lock);
diff --git a/arch/x86/include/asm/fpu-internal.h b/arch/x86/include/asm/fpu-internal.h
index 41ab26ea6564..e25cc33ec54d 100644
--- a/arch/x86/include/asm/fpu-internal.h
+++ b/arch/x86/include/asm/fpu-internal.h
@@ -26,9 +26,10 @@
#ifdef CONFIG_X86_64
# include <asm/sigcontext32.h>
# include <asm/user32.h>
-int ia32_setup_rt_frame(int sig, struct k_sigaction *ka, siginfo_t *info,
+struct ksignal;
+int ia32_setup_rt_frame(int sig, struct ksignal *ksig,
compat_sigset_t *set, struct pt_regs *regs);
-int ia32_setup_frame(int sig, struct k_sigaction *ka,
+int ia32_setup_frame(int sig, struct ksignal *ksig,
compat_sigset_t *set, struct pt_regs *regs);
#else
# define user_i387_ia32_struct user_i387_struct
diff --git a/arch/x86/include/asm/ftrace.h b/arch/x86/include/asm/ftrace.h
index 86cb51e1ca96..0525a8bdf65d 100644
--- a/arch/x86/include/asm/ftrace.h
+++ b/arch/x86/include/asm/ftrace.h
@@ -72,4 +72,28 @@ int ftrace_int3_handler(struct pt_regs *regs);
#endif /* __ASSEMBLY__ */
#endif /* CONFIG_FUNCTION_TRACER */
+
+#if !defined(__ASSEMBLY__) && !defined(COMPILE_OFFSETS)
+
+#if defined(CONFIG_FTRACE_SYSCALLS) && defined(CONFIG_IA32_EMULATION)
+#include <asm/compat.h>
+
+/*
+ * Because ia32 syscalls do not map to x86_64 syscall numbers
+ * this screws up the trace output when tracing a ia32 task.
+ * Instead of reporting bogus syscalls, just do not trace them.
+ *
+ * If the user realy wants these, then they should use the
+ * raw syscall tracepoints with filtering.
+ */
+#define ARCH_TRACE_IGNORE_COMPAT_SYSCALLS 1
+static inline bool arch_trace_is_compat_syscall(struct pt_regs *regs)
+{
+ if (is_compat_task())
+ return true;
+ return false;
+}
+#endif /* CONFIG_FTRACE_SYSCALLS && CONFIG_IA32_EMULATION */
+#endif /* !__ASSEMBLY__ && !COMPILE_OFFSETS */
+
#endif /* _ASM_X86_FTRACE_H */
diff --git a/arch/x86/include/asm/ia32.h b/arch/x86/include/asm/ia32.h
index 4c6da2e4bb1d..d0e8e0141041 100644
--- a/arch/x86/include/asm/ia32.h
+++ b/arch/x86/include/asm/ia32.h
@@ -13,21 +13,6 @@
#include <asm/sigcontext32.h>
/* signal.h */
-struct sigaction32 {
- unsigned int sa_handler; /* Really a pointer, but need to deal
- with 32 bits */
- unsigned int sa_flags;
- unsigned int sa_restorer; /* Another 32 bit pointer */
- compat_sigset_t sa_mask; /* A 32 bit mask */
-};
-
-struct old_sigaction32 {
- unsigned int sa_handler; /* Really a pointer, but need to deal
- with 32 bits */
- compat_old_sigset_t sa_mask; /* A 32 bit mask */
- unsigned int sa_flags;
- unsigned int sa_restorer; /* Another 32 bit pointer */
-};
struct ucontext_ia32 {
unsigned int uc_flags;
diff --git a/arch/x86/include/asm/init.h b/arch/x86/include/asm/init.h
index adcc0ae73d09..223042086f4e 100644
--- a/arch/x86/include/asm/init.h
+++ b/arch/x86/include/asm/init.h
@@ -1,20 +1,14 @@
-#ifndef _ASM_X86_INIT_32_H
-#define _ASM_X86_INIT_32_H
+#ifndef _ASM_X86_INIT_H
+#define _ASM_X86_INIT_H
-#ifdef CONFIG_X86_32
-extern void __init early_ioremap_page_table_range_init(void);
-#endif
+struct x86_mapping_info {
+ void *(*alloc_pgt_page)(void *); /* allocate buf for page table */
+ void *context; /* context for alloc_pgt_page */
+ unsigned long pmd_flag; /* page flag for PMD entry */
+ bool kernel_mapping; /* kernel mapping or ident mapping */
+};
-extern void __init zone_sizes_init(void);
+int kernel_ident_mapping_init(struct x86_mapping_info *info, pgd_t *pgd_page,
+ unsigned long addr, unsigned long end);
-extern unsigned long __init
-kernel_physical_mapping_init(unsigned long start,
- unsigned long end,
- unsigned long page_size_mask);
-
-
-extern unsigned long __initdata pgt_buf_start;
-extern unsigned long __meminitdata pgt_buf_end;
-extern unsigned long __meminitdata pgt_buf_top;
-
-#endif /* _ASM_X86_INIT_32_H */
+#endif /* _ASM_X86_INIT_H */
diff --git a/arch/x86/include/asm/kexec.h b/arch/x86/include/asm/kexec.h
index 6080d2694bad..17483a492f18 100644
--- a/arch/x86/include/asm/kexec.h
+++ b/arch/x86/include/asm/kexec.h
@@ -48,11 +48,11 @@
# define vmcore_elf_check_arch_cross(x) ((x)->e_machine == EM_X86_64)
#else
/* Maximum physical address we can use pages from */
-# define KEXEC_SOURCE_MEMORY_LIMIT (0xFFFFFFFFFFUL)
+# define KEXEC_SOURCE_MEMORY_LIMIT (MAXMEM-1)
/* Maximum address we can reach in physical address mode */
-# define KEXEC_DESTINATION_MEMORY_LIMIT (0xFFFFFFFFFFUL)
+# define KEXEC_DESTINATION_MEMORY_LIMIT (MAXMEM-1)
/* Maximum address we can use for the control pages */
-# define KEXEC_CONTROL_MEMORY_LIMIT (0xFFFFFFFFFFUL)
+# define KEXEC_CONTROL_MEMORY_LIMIT (MAXMEM-1)
/* Allocate one page for the pdp and the second for the code */
# define KEXEC_CONTROL_PAGE_SIZE (4096UL + 4096UL)
diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index dc87b65e9c3a..635a74d22409 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -33,10 +33,10 @@
#define KVM_MAX_VCPUS 254
#define KVM_SOFT_MAX_VCPUS 160
-#define KVM_MEMORY_SLOTS 32
-/* memory slots that does not exposed to userspace */
-#define KVM_PRIVATE_MEM_SLOTS 4
-#define KVM_MEM_SLOTS_NUM (KVM_MEMORY_SLOTS + KVM_PRIVATE_MEM_SLOTS)
+#define KVM_USER_MEM_SLOTS 125
+/* memory slots that are not exposed to userspace */
+#define KVM_PRIVATE_MEM_SLOTS 3
+#define KVM_MEM_SLOTS_NUM (KVM_USER_MEM_SLOTS + KVM_PRIVATE_MEM_SLOTS)
#define KVM_MMIO_SIZE 16
@@ -219,11 +219,6 @@ struct kvm_mmu_page {
u64 *spt;
/* hold the gfn of each spte inside spt */
gfn_t *gfns;
- /*
- * One bit set per slot which has memory
- * in this shadow page.
- */
- DECLARE_BITMAP(slot_bitmap, KVM_MEM_SLOTS_NUM);
bool unsync;
int root_count; /* Currently serving as active root */
unsigned int unsync_children;
@@ -502,6 +497,13 @@ struct kvm_vcpu_arch {
u64 msr_val;
struct gfn_to_hva_cache data;
} pv_eoi;
+
+ /*
+ * Indicate whether the access faults on its page table in guest
+ * which is set when fix page fault and used to detect unhandeable
+ * instruction.
+ */
+ bool write_fault_to_shadow_pgtable;
};
struct kvm_lpage_info {
@@ -697,6 +699,11 @@ struct kvm_x86_ops {
void (*enable_nmi_window)(struct kvm_vcpu *vcpu);
void (*enable_irq_window)(struct kvm_vcpu *vcpu);
void (*update_cr8_intercept)(struct kvm_vcpu *vcpu, int tpr, int irr);
+ int (*vm_has_apicv)(struct kvm *kvm);
+ void (*hwapic_irr_update)(struct kvm_vcpu *vcpu, int max_irr);
+ void (*hwapic_isr_update)(struct kvm *kvm, int isr);
+ void (*load_eoi_exitmap)(struct kvm_vcpu *vcpu, u64 *eoi_exit_bitmap);
+ void (*set_virtual_x2apic_mode)(struct kvm_vcpu *vcpu, bool set);
int (*set_tss_addr)(struct kvm *kvm, unsigned int addr);
int (*get_tdp_level)(void);
u64 (*get_mt_mask)(struct kvm_vcpu *vcpu, gfn_t gfn, bool is_mmio);
@@ -991,6 +998,7 @@ int kvm_age_hva(struct kvm *kvm, unsigned long hva);
int kvm_test_age_hva(struct kvm *kvm, unsigned long hva);
void kvm_set_spte_hva(struct kvm *kvm, unsigned long hva, pte_t pte);
int cpuid_maxphyaddr(struct kvm_vcpu *vcpu);
+int kvm_cpu_has_injectable_intr(struct kvm_vcpu *v);
int kvm_cpu_has_interrupt(struct kvm_vcpu *vcpu);
int kvm_arch_interrupt_allowed(struct kvm_vcpu *vcpu);
int kvm_cpu_get_interrupt(struct kvm_vcpu *v);
diff --git a/arch/x86/include/asm/kvm_para.h b/arch/x86/include/asm/kvm_para.h
index 65231e173baf..695399f2d5eb 100644
--- a/arch/x86/include/asm/kvm_para.h
+++ b/arch/x86/include/asm/kvm_para.h
@@ -27,7 +27,7 @@ static inline bool kvm_check_and_clear_guest_paused(void)
*
* Up to four arguments may be passed in rbx, rcx, rdx, and rsi respectively.
* The hypercall number should be placed in rax and the return value will be
- * placed in rax. No other registers will be clobbered unless explicited
+ * placed in rax. No other registers will be clobbered unless explicitly
* noted by the particular hypercall.
*/
diff --git a/arch/x86/include/asm/microcode.h b/arch/x86/include/asm/microcode.h
index 43d921b4752c..6825e2efd1b4 100644
--- a/arch/x86/include/asm/microcode.h
+++ b/arch/x86/include/asm/microcode.h
@@ -57,4 +57,18 @@ static inline struct microcode_ops * __init init_amd_microcode(void)
static inline void __exit exit_amd_microcode(void) {}
#endif
+#ifdef CONFIG_MICROCODE_EARLY
+#define MAX_UCODE_COUNT 128
+extern void __init load_ucode_bsp(void);
+extern __init void load_ucode_ap(void);
+extern int __init save_microcode_in_initrd(void);
+#else
+static inline void __init load_ucode_bsp(void) {}
+static inline __init void load_ucode_ap(void) {}
+static inline int __init save_microcode_in_initrd(void)
+{
+ return 0;
+}
+#endif
+
#endif /* _ASM_X86_MICROCODE_H */
diff --git a/arch/x86/include/asm/microcode_intel.h b/arch/x86/include/asm/microcode_intel.h
new file mode 100644
index 000000000000..5356f927d411
--- /dev/null
+++ b/arch/x86/include/asm/microcode_intel.h
@@ -0,0 +1,85 @@
+#ifndef _ASM_X86_MICROCODE_INTEL_H
+#define _ASM_X86_MICROCODE_INTEL_H
+
+#include <asm/microcode.h>
+
+struct microcode_header_intel {
+ unsigned int hdrver;
+ unsigned int rev;
+ unsigned int date;
+ unsigned int sig;
+ unsigned int cksum;
+ unsigned int ldrver;
+ unsigned int pf;
+ unsigned int datasize;
+ unsigned int totalsize;
+ unsigned int reserved[3];
+};
+
+struct microcode_intel {
+ struct microcode_header_intel hdr;
+ unsigned int bits[0];
+};
+
+/* microcode format is extended from prescott processors */
+struct extended_signature {
+ unsigned int sig;
+ unsigned int pf;
+ unsigned int cksum;
+};
+
+struct extended_sigtable {
+ unsigned int count;
+ unsigned int cksum;
+ unsigned int reserved[3];
+ struct extended_signature sigs[0];
+};
+
+#define DEFAULT_UCODE_DATASIZE (2000)
+#define MC_HEADER_SIZE (sizeof(struct microcode_header_intel))
+#define DEFAULT_UCODE_TOTALSIZE (DEFAULT_UCODE_DATASIZE + MC_HEADER_SIZE)
+#define EXT_HEADER_SIZE (sizeof(struct extended_sigtable))
+#define EXT_SIGNATURE_SIZE (sizeof(struct extended_signature))
+#define DWSIZE (sizeof(u32))
+
+#define get_totalsize(mc) \
+ (((struct microcode_intel *)mc)->hdr.totalsize ? \
+ ((struct microcode_intel *)mc)->hdr.totalsize : \
+ DEFAULT_UCODE_TOTALSIZE)
+
+#define get_datasize(mc) \
+ (((struct microcode_intel *)mc)->hdr.datasize ? \
+ ((struct microcode_intel *)mc)->hdr.datasize : DEFAULT_UCODE_DATASIZE)
+
+#define sigmatch(s1, s2, p1, p2) \
+ (((s1) == (s2)) && (((p1) & (p2)) || (((p1) == 0) && ((p2) == 0))))
+
+#define exttable_size(et) ((et)->count * EXT_SIGNATURE_SIZE + EXT_HEADER_SIZE)
+
+extern int
+get_matching_microcode(unsigned int csig, int cpf, void *mc, int rev);
+extern int microcode_sanity_check(void *mc, int print_err);
+extern int get_matching_sig(unsigned int csig, int cpf, void *mc, int rev);
+extern int
+update_match_revision(struct microcode_header_intel *mc_header, int rev);
+
+#ifdef CONFIG_MICROCODE_INTEL_EARLY
+extern void __init load_ucode_intel_bsp(void);
+extern void __cpuinit load_ucode_intel_ap(void);
+extern void show_ucode_info_early(void);
+#else
+static inline __init void load_ucode_intel_bsp(void) {}
+static inline __cpuinit void load_ucode_intel_ap(void) {}
+static inline void show_ucode_info_early(void) {}
+#endif
+
+#if defined(CONFIG_MICROCODE_INTEL_EARLY) && defined(CONFIG_HOTPLUG_CPU)
+extern int save_mc_for_early(u8 *mc);
+#else
+static inline int save_mc_for_early(u8 *mc)
+{
+ return 0;
+}
+#endif
+
+#endif /* _ASM_X86_MICROCODE_INTEL_H */
diff --git a/arch/x86/include/asm/mmzone_32.h b/arch/x86/include/asm/mmzone_32.h
index eb05fb3b02fb..8a9b3e288cb4 100644
--- a/arch/x86/include/asm/mmzone_32.h
+++ b/arch/x86/include/asm/mmzone_32.h
@@ -14,12 +14,6 @@ extern struct pglist_data *node_data[];
#include <asm/numaq.h>
-extern void resume_map_numa_kva(pgd_t *pgd);
-
-#else /* !CONFIG_NUMA */
-
-static inline void resume_map_numa_kva(pgd_t *pgd) {}
-
#endif /* CONFIG_NUMA */
#ifdef CONFIG_DISCONTIGMEM
diff --git a/arch/x86/include/asm/mwait.h b/arch/x86/include/asm/mwait.h
index bcdff997668c..2f366d0ac6b4 100644
--- a/arch/x86/include/asm/mwait.h
+++ b/arch/x86/include/asm/mwait.h
@@ -4,7 +4,8 @@
#define MWAIT_SUBSTATE_MASK 0xf
#define MWAIT_CSTATE_MASK 0xf
#define MWAIT_SUBSTATE_SIZE 4
-#define MWAIT_MAX_NUM_CSTATES 8
+#define MWAIT_HINT2CSTATE(hint) (((hint) >> MWAIT_SUBSTATE_SIZE) & MWAIT_CSTATE_MASK)
+#define MWAIT_HINT2SUBSTATE(hint) ((hint) & MWAIT_CSTATE_MASK)
#define CPUID_MWAIT_LEAF 5
#define CPUID5_ECX_EXTENSIONS_SUPPORTED 0x1
diff --git a/arch/x86/include/asm/numa.h b/arch/x86/include/asm/numa.h
index 49119fcea2dc..1b99ee5c9f00 100644
--- a/arch/x86/include/asm/numa.h
+++ b/arch/x86/include/asm/numa.h
@@ -54,13 +54,11 @@ static inline int numa_cpu_node(int cpu)
#ifdef CONFIG_X86_32
# include <asm/numa_32.h>
-#else
-# include <asm/numa_64.h>
#endif
#ifdef CONFIG_NUMA
-extern void __cpuinit numa_set_node(int cpu, int node);
-extern void __cpuinit numa_clear_node(int cpu);
+extern void numa_set_node(int cpu, int node);
+extern void numa_clear_node(int cpu);
extern void __init init_cpu_to_node(void);
extern void __cpuinit numa_add_cpu(int cpu);
extern void __cpuinit numa_remove_cpu(int cpu);
diff --git a/arch/x86/include/asm/numa_64.h b/arch/x86/include/asm/numa_64.h
deleted file mode 100644
index 0c05f7ae46e8..000000000000
--- a/arch/x86/include/asm/numa_64.h
+++ /dev/null
@@ -1,6 +0,0 @@
-#ifndef _ASM_X86_NUMA_64_H
-#define _ASM_X86_NUMA_64_H
-
-extern unsigned long numa_free_all_bootmem(void);
-
-#endif /* _ASM_X86_NUMA_64_H */
diff --git a/arch/x86/include/asm/page.h b/arch/x86/include/asm/page.h
index 8ca82839288a..c87892442e53 100644
--- a/arch/x86/include/asm/page.h
+++ b/arch/x86/include/asm/page.h
@@ -17,6 +17,10 @@
struct page;
+#include <linux/range.h>
+extern struct range pfn_mapped[];
+extern int nr_pfn_mapped;
+
static inline void clear_user_page(void *page, unsigned long vaddr,
struct page *pg)
{
@@ -44,7 +48,8 @@ static inline void copy_user_page(void *to, void *from, unsigned long vaddr,
* case properly. Once all supported versions of gcc understand it, we can
* remove this Voodoo magic stuff. (i.e. once gcc3.x is deprecated)
*/
-#define __pa_symbol(x) __pa(__phys_reloc_hide((unsigned long)(x)))
+#define __pa_symbol(x) \
+ __phys_addr_symbol(__phys_reloc_hide((unsigned long)(x)))
#define __va(x) ((void *)((unsigned long)(x)+PAGE_OFFSET))
diff --git a/arch/x86/include/asm/page_32.h b/arch/x86/include/asm/page_32.h
index da4e762406f7..4d550d04b609 100644
--- a/arch/x86/include/asm/page_32.h
+++ b/arch/x86/include/asm/page_32.h
@@ -15,6 +15,7 @@ extern unsigned long __phys_addr(unsigned long);
#else
#define __phys_addr(x) __phys_addr_nodebug(x)
#endif
+#define __phys_addr_symbol(x) __phys_addr(x)
#define __phys_reloc_hide(x) RELOC_HIDE((x), 0)
#ifdef CONFIG_FLATMEM
diff --git a/arch/x86/include/asm/page_64.h b/arch/x86/include/asm/page_64.h
index 072694ed81a5..0f1ddee6a0ce 100644
--- a/arch/x86/include/asm/page_64.h
+++ b/arch/x86/include/asm/page_64.h
@@ -3,4 +3,40 @@
#include <asm/page_64_types.h>
+#ifndef __ASSEMBLY__
+
+/* duplicated to the one in bootmem.h */
+extern unsigned long max_pfn;
+extern unsigned long phys_base;
+
+static inline unsigned long __phys_addr_nodebug(unsigned long x)
+{
+ unsigned long y = x - __START_KERNEL_map;
+
+ /* use the carry flag to determine if x was < __START_KERNEL_map */
+ x = y + ((x > y) ? phys_base : (__START_KERNEL_map - PAGE_OFFSET));
+
+ return x;
+}
+
+#ifdef CONFIG_DEBUG_VIRTUAL
+extern unsigned long __phys_addr(unsigned long);
+extern unsigned long __phys_addr_symbol(unsigned long);
+#else
+#define __phys_addr(x) __phys_addr_nodebug(x)
+#define __phys_addr_symbol(x) \
+ ((unsigned long)(x) - __START_KERNEL_map + phys_base)
+#endif
+
+#define __phys_reloc_hide(x) (x)
+
+#ifdef CONFIG_FLATMEM
+#define pfn_valid(pfn) ((pfn) < max_pfn)
+#endif
+
+void clear_page(void *page);
+void copy_page(void *to, void *from);
+
+#endif /* !__ASSEMBLY__ */
+
#endif /* _ASM_X86_PAGE_64_H */
diff --git a/arch/x86/include/asm/page_64_types.h b/arch/x86/include/asm/page_64_types.h
index 320f7bb95f76..8b491e66eaa8 100644
--- a/arch/x86/include/asm/page_64_types.h
+++ b/arch/x86/include/asm/page_64_types.h
@@ -50,26 +50,4 @@
#define KERNEL_IMAGE_SIZE (512 * 1024 * 1024)
#define KERNEL_IMAGE_START _AC(0xffffffff80000000, UL)
-#ifndef __ASSEMBLY__
-void clear_page(void *page);
-void copy_page(void *to, void *from);
-
-/* duplicated to the one in bootmem.h */
-extern unsigned long max_pfn;
-extern unsigned long phys_base;
-
-extern unsigned long __phys_addr(unsigned long);
-#define __phys_reloc_hide(x) (x)
-
-#define vmemmap ((struct page *)VMEMMAP_START)
-
-extern void init_extra_mapping_uc(unsigned long phys, unsigned long size);
-extern void init_extra_mapping_wb(unsigned long phys, unsigned long size);
-
-#endif /* !__ASSEMBLY__ */
-
-#ifdef CONFIG_FLATMEM
-#define pfn_valid(pfn) ((pfn) < max_pfn)
-#endif
-
#endif /* _ASM_X86_PAGE_64_DEFS_H */
diff --git a/arch/x86/include/asm/page_types.h b/arch/x86/include/asm/page_types.h
index e21fdd10479f..54c97879195e 100644
--- a/arch/x86/include/asm/page_types.h
+++ b/arch/x86/include/asm/page_types.h
@@ -51,6 +51,8 @@ static inline phys_addr_t get_max_mapped(void)
return (phys_addr_t)max_pfn_mapped << PAGE_SHIFT;
}
+bool pfn_range_is_mapped(unsigned long start_pfn, unsigned long end_pfn);
+
extern unsigned long init_memory_mapping(unsigned long start,
unsigned long end);
diff --git a/arch/x86/include/asm/pci.h b/arch/x86/include/asm/pci.h
index c28fd02f4bf7..d9e9e6c7ed32 100644
--- a/arch/x86/include/asm/pci.h
+++ b/arch/x86/include/asm/pci.h
@@ -14,6 +14,9 @@
struct pci_sysdata {
int domain; /* PCI domain */
int node; /* NUMA node */
+#ifdef CONFIG_ACPI
+ void *acpi; /* ACPI-specific data */
+#endif
#ifdef CONFIG_X86_64
void *iommu; /* IOMMU private data */
#endif
diff --git a/arch/x86/include/asm/pci_x86.h b/arch/x86/include/asm/pci_x86.h
index 747e5a38b590..fa1195dae425 100644
--- a/arch/x86/include/asm/pci_x86.h
+++ b/arch/x86/include/asm/pci_x86.h
@@ -54,7 +54,6 @@ void pcibios_set_cache_line_size(void);
/* pci-pc.c */
extern int pcibios_last_bus;
-extern struct pci_bus *pci_root_bus;
extern struct pci_ops pci_root_ops;
void pcibios_scan_specific_bus(int busn);
diff --git a/arch/x86/include/asm/pgtable.h b/arch/x86/include/asm/pgtable.h
index fc304279b559..1e672234c4ff 100644
--- a/arch/x86/include/asm/pgtable.h
+++ b/arch/x86/include/asm/pgtable.h
@@ -395,6 +395,7 @@ pte_t *populate_extra_pte(unsigned long vaddr);
#ifndef __ASSEMBLY__
#include <linux/mm_types.h>
+#include <linux/log2.h>
static inline int pte_none(pte_t pte)
{
@@ -620,6 +621,8 @@ static inline int pgd_none(pgd_t pgd)
#ifndef __ASSEMBLY__
extern int direct_gbpages;
+void init_mem_mapping(void);
+void early_alloc_pgt_buf(void);
/* local pte updates need not use xchg for locking */
static inline pte_t native_local_ptep_get_and_clear(pte_t *ptep)
@@ -786,6 +789,20 @@ static inline void clone_pgd_range(pgd_t *dst, pgd_t *src, int count)
memcpy(dst, src, count * sizeof(pgd_t));
}
+#define PTE_SHIFT ilog2(PTRS_PER_PTE)
+static inline int page_level_shift(enum pg_level level)
+{
+ return (PAGE_SHIFT - PTE_SHIFT) + level * PTE_SHIFT;
+}
+static inline unsigned long page_level_size(enum pg_level level)
+{
+ return 1UL << page_level_shift(level);
+}
+static inline unsigned long page_level_mask(enum pg_level level)
+{
+ return ~(page_level_size(level) - 1);
+}
+
/*
* The x86 doesn't have any external MMU info: the kernel page
* tables contain all the necessary information.
diff --git a/arch/x86/include/asm/pgtable_64.h b/arch/x86/include/asm/pgtable_64.h
index 615b0c78449f..e22c1dbf7feb 100644
--- a/arch/x86/include/asm/pgtable_64.h
+++ b/arch/x86/include/asm/pgtable_64.h
@@ -180,6 +180,11 @@ extern void cleanup_highmap(void);
#define __HAVE_ARCH_PTE_SAME
+#define vmemmap ((struct page *)VMEMMAP_START)
+
+extern void init_extra_mapping_uc(unsigned long phys, unsigned long size);
+extern void init_extra_mapping_wb(unsigned long phys, unsigned long size);
+
#endif /* !__ASSEMBLY__ */
#endif /* _ASM_X86_PGTABLE_64_H */
diff --git a/arch/x86/include/asm/pgtable_64_types.h b/arch/x86/include/asm/pgtable_64_types.h
index 766ea16fbbbd..2d883440cb9a 100644
--- a/arch/x86/include/asm/pgtable_64_types.h
+++ b/arch/x86/include/asm/pgtable_64_types.h
@@ -1,6 +1,8 @@
#ifndef _ASM_X86_PGTABLE_64_DEFS_H
#define _ASM_X86_PGTABLE_64_DEFS_H
+#include <asm/sparsemem.h>
+
#ifndef __ASSEMBLY__
#include <linux/types.h>
@@ -60,4 +62,6 @@ typedef struct { pteval_t pte; } pte_t;
#define MODULES_END _AC(0xffffffffff000000, UL)
#define MODULES_LEN (MODULES_END - MODULES_VADDR)
+#define EARLY_DYNAMIC_PAGE_TABLES 64
+
#endif /* _ASM_X86_PGTABLE_64_DEFS_H */
diff --git a/arch/x86/include/asm/pgtable_types.h b/arch/x86/include/asm/pgtable_types.h
index 3c32db8c539d..567b5d0632b2 100644
--- a/arch/x86/include/asm/pgtable_types.h
+++ b/arch/x86/include/asm/pgtable_types.h
@@ -321,7 +321,6 @@ int phys_mem_access_prot_allowed(struct file *file, unsigned long pfn,
/* Install a pte for a particular vaddr in kernel space. */
void set_pte_vaddr(unsigned long vaddr, pte_t pte);
-extern void native_pagetable_reserve(u64 start, u64 end);
#ifdef CONFIG_X86_32
extern void native_pagetable_init(void);
#else
@@ -331,7 +330,7 @@ extern void native_pagetable_init(void);
struct seq_file;
extern void arch_report_meminfo(struct seq_file *m);
-enum {
+enum pg_level {
PG_LEVEL_NONE,
PG_LEVEL_4K,
PG_LEVEL_2M,
@@ -352,6 +351,8 @@ static inline void update_page_count(int level, unsigned long pages) { }
* as a pte too.
*/
extern pte_t *lookup_address(unsigned long address, unsigned int *level);
+extern int __split_large_page(pte_t *kpte, unsigned long address, pte_t *pbase);
+extern phys_addr_t slow_virt_to_phys(void *__address);
#endif /* !__ASSEMBLY__ */
diff --git a/arch/x86/include/asm/processor.h b/arch/x86/include/asm/processor.h
index cf500543f6ff..3270116b1488 100644
--- a/arch/x86/include/asm/processor.h
+++ b/arch/x86/include/asm/processor.h
@@ -89,7 +89,6 @@ struct cpuinfo_x86 {
char wp_works_ok; /* It doesn't on 386's */
/* Problems on some 486Dx4's and old 386's: */
- char hlt_works_ok;
char hard_math;
char rfu;
char fdiv_bug;
@@ -165,15 +164,6 @@ DECLARE_PER_CPU_SHARED_ALIGNED(struct cpuinfo_x86, cpu_info);
extern const struct seq_operations cpuinfo_op;
-static inline int hlt_works(int cpu)
-{
-#ifdef CONFIG_X86_32
- return cpu_data(cpu).hlt_works_ok;
-#else
- return 1;
-#endif
-}
-
#define cache_line_size() (boot_cpu_data.x86_cache_alignment)
extern void cpu_detect(struct cpuinfo_x86 *c);
@@ -190,6 +180,14 @@ extern void init_amd_cacheinfo(struct cpuinfo_x86 *c);
extern void detect_extended_topology(struct cpuinfo_x86 *c);
extern void detect_ht(struct cpuinfo_x86 *c);
+#ifdef CONFIG_X86_32
+extern int have_cpuid_p(void);
+#else
+static inline int have_cpuid_p(void)
+{
+ return 1;
+}
+#endif
static inline void native_cpuid(unsigned int *eax, unsigned int *ebx,
unsigned int *ecx, unsigned int *edx)
{
@@ -725,12 +723,13 @@ extern unsigned long boot_option_idle_override;
extern bool amd_e400_c1e_detected;
enum idle_boot_override {IDLE_NO_OVERRIDE=0, IDLE_HALT, IDLE_NOMWAIT,
- IDLE_POLL, IDLE_FORCE_MWAIT};
+ IDLE_POLL};
extern void enable_sep_cpu(void);
extern int sysenter_setup(void);
extern void early_trap_init(void);
+void early_trap_pf_init(void);
/* Defined in head.S */
extern struct desc_ptr early_gdt_descr;
@@ -998,7 +997,11 @@ extern unsigned long arch_align_stack(unsigned long sp);
extern void free_init_pages(char *what, unsigned long begin, unsigned long end);
void default_idle(void);
-bool set_pm_idle_to_default(void);
+#ifdef CONFIG_XEN
+bool xen_set_default_idle(void);
+#else
+#define xen_set_default_idle 0
+#endif
void stop_this_cpu(void *dummy);
diff --git a/arch/x86/include/asm/proto.h b/arch/x86/include/asm/proto.h
index 6f414ed88620..6fd3fd769796 100644
--- a/arch/x86/include/asm/proto.h
+++ b/arch/x86/include/asm/proto.h
@@ -5,8 +5,6 @@
/* misc architecture specific prototypes */
-void early_idt_handler(void);
-
void system_call(void);
void syscall_init(void);
diff --git a/arch/x86/include/asm/realmode.h b/arch/x86/include/asm/realmode.h
index fe1ec5bcd846..9c6b890d5e7a 100644
--- a/arch/x86/include/asm/realmode.h
+++ b/arch/x86/include/asm/realmode.h
@@ -58,6 +58,7 @@ extern unsigned char boot_gdt[];
extern unsigned char secondary_startup_64[];
#endif
-extern void __init setup_real_mode(void);
+void reserve_real_mode(void);
+void setup_real_mode(void);
#endif /* _ARCH_X86_REALMODE_H */
diff --git a/arch/x86/include/asm/signal.h b/arch/x86/include/asm/signal.h
index 216bf364a7e7..35e67a457182 100644
--- a/arch/x86/include/asm/signal.h
+++ b/arch/x86/include/asm/signal.h
@@ -31,27 +31,9 @@ typedef sigset_t compat_sigset_t;
#include <uapi/asm/signal.h>
#ifndef __ASSEMBLY__
extern void do_notify_resume(struct pt_regs *, void *, __u32);
-#ifdef __i386__
-struct old_sigaction {
- __sighandler_t sa_handler;
- old_sigset_t sa_mask;
- unsigned long sa_flags;
- __sigrestore_t sa_restorer;
-};
-
-struct sigaction {
- __sighandler_t sa_handler;
- unsigned long sa_flags;
- __sigrestore_t sa_restorer;
- sigset_t sa_mask; /* mask last for extensibility */
-};
-
-struct k_sigaction {
- struct sigaction sa;
-};
-#else /* __i386__ */
-#endif /* !__i386__ */
+#define __ARCH_HAS_SA_RESTORER
+
#include <asm/sigcontext.h>
#ifdef __i386__
diff --git a/arch/x86/include/asm/sys_ia32.h b/arch/x86/include/asm/sys_ia32.h
index 31f61f96e0fb..0218d917f509 100644
--- a/arch/x86/include/asm/sys_ia32.h
+++ b/arch/x86/include/asm/sys_ia32.h
@@ -32,22 +32,11 @@ struct mmap_arg_struct32;
asmlinkage long sys32_mmap(struct mmap_arg_struct32 __user *);
asmlinkage long sys32_mprotect(unsigned long, size_t, unsigned long);
-struct sigaction32;
-struct old_sigaction32;
-asmlinkage long sys32_rt_sigaction(int, struct sigaction32 __user *,
- struct sigaction32 __user *, unsigned int);
-asmlinkage long sys32_sigaction(int, struct old_sigaction32 __user *,
- struct old_sigaction32 __user *);
asmlinkage long sys32_alarm(unsigned int);
asmlinkage long sys32_waitpid(compat_pid_t, unsigned int __user *, int);
asmlinkage long sys32_sysfs(int, u32, u32);
-asmlinkage long sys32_sched_rr_get_interval(compat_pid_t,
- struct compat_timespec __user *);
-asmlinkage long sys32_rt_sigpending(compat_sigset_t __user *, compat_size_t);
-asmlinkage long sys32_rt_sigqueueinfo(int, int, compat_siginfo_t __user *);
-
asmlinkage long sys32_pread(unsigned int, char __user *, u32, u32, u32);
asmlinkage long sys32_pwrite(unsigned int, const char __user *, u32, u32, u32);
@@ -68,9 +57,8 @@ asmlinkage long sys32_fallocate(int, int, unsigned,
unsigned, unsigned, unsigned);
/* ia32/ia32_signal.c */
-asmlinkage long sys32_sigsuspend(int, int, old_sigset_t);
-asmlinkage long sys32_sigreturn(struct pt_regs *);
-asmlinkage long sys32_rt_sigreturn(struct pt_regs *);
+asmlinkage long sys32_sigreturn(void);
+asmlinkage long sys32_rt_sigreturn(void);
/* ia32/ipc32.c */
asmlinkage long sys32_ipc(u32, int, int, int, compat_uptr_t, u32);
diff --git a/arch/x86/include/asm/syscalls.h b/arch/x86/include/asm/syscalls.h
index 58b7e3eac0ae..6cf0a9cc60cd 100644
--- a/arch/x86/include/asm/syscalls.h
+++ b/arch/x86/include/asm/syscalls.h
@@ -18,13 +18,13 @@
/* Common in X86_32 and X86_64 */
/* kernel/ioport.c */
asmlinkage long sys_ioperm(unsigned long, unsigned long, int);
-long sys_iopl(unsigned int, struct pt_regs *);
+asmlinkage long sys_iopl(unsigned int);
/* kernel/ldt.c */
asmlinkage int sys_modify_ldt(int, void __user *, unsigned long);
/* kernel/signal.c */
-long sys_rt_sigreturn(struct pt_regs *);
+long sys_rt_sigreturn(void);
/* kernel/tls.c */
asmlinkage int sys_set_thread_area(struct user_desc __user *);
@@ -34,14 +34,11 @@ asmlinkage int sys_get_thread_area(struct user_desc __user *);
#ifdef CONFIG_X86_32
/* kernel/signal.c */
-asmlinkage int sys_sigsuspend(int, int, old_sigset_t);
-asmlinkage int sys_sigaction(int, const struct old_sigaction __user *,
- struct old_sigaction __user *);
-unsigned long sys_sigreturn(struct pt_regs *);
+unsigned long sys_sigreturn(void);
/* kernel/vm86_32.c */
-int sys_vm86old(struct vm86_struct __user *, struct pt_regs *);
-int sys_vm86(unsigned long, unsigned long, struct pt_regs *);
+int sys_vm86old(struct vm86_struct __user *);
+int sys_vm86(unsigned long, unsigned long);
#else /* CONFIG_X86_32 */
diff --git a/arch/x86/include/asm/thread_info.h b/arch/x86/include/asm/thread_info.h
index 2d946e63ee82..2cd056e3ada3 100644
--- a/arch/x86/include/asm/thread_info.h
+++ b/arch/x86/include/asm/thread_info.h
@@ -20,7 +20,6 @@
struct task_struct;
struct exec_domain;
#include <asm/processor.h>
-#include <asm/ftrace.h>
#include <linux/atomic.h>
struct thread_info {
diff --git a/arch/x86/include/asm/tlbflush.h b/arch/x86/include/asm/tlbflush.h
index 0fee48e279cc..50a7fc0f824a 100644
--- a/arch/x86/include/asm/tlbflush.h
+++ b/arch/x86/include/asm/tlbflush.h
@@ -20,10 +20,20 @@ static inline void __native_flush_tlb(void)
native_write_cr3(native_read_cr3());
}
+static inline void __native_flush_tlb_global_irq_disabled(void)
+{
+ unsigned long cr4;
+
+ cr4 = native_read_cr4();
+ /* clear PGE */
+ native_write_cr4(cr4 & ~X86_CR4_PGE);
+ /* write old PGE again and flush TLBs */
+ native_write_cr4(cr4);
+}
+
static inline void __native_flush_tlb_global(void)
{
unsigned long flags;
- unsigned long cr4;
/*
* Read-modify-write to CR4 - protect it from preemption and
@@ -32,11 +42,7 @@ static inline void __native_flush_tlb_global(void)
*/
raw_local_irq_save(flags);
- cr4 = native_read_cr4();
- /* clear PGE */
- native_write_cr4(cr4 & ~X86_CR4_PGE);
- /* write old PGE again and flush TLBs */
- native_write_cr4(cr4);
+ __native_flush_tlb_global_irq_disabled();
raw_local_irq_restore(flags);
}
diff --git a/arch/x86/include/asm/uaccess.h b/arch/x86/include/asm/uaccess.h
index 1709801d18ec..5ee26875baea 100644
--- a/arch/x86/include/asm/uaccess.h
+++ b/arch/x86/include/asm/uaccess.h
@@ -125,13 +125,12 @@ extern int __get_user_4(void);
extern int __get_user_8(void);
extern int __get_user_bad(void);
-#define __get_user_x(size, ret, x, ptr) \
- asm volatile("call __get_user_" #size \
- : "=a" (ret), "=d" (x) \
- : "0" (ptr)) \
-
-/* Careful: we have to cast the result to the type of the pointer
- * for sign reasons */
+/*
+ * This is a type: either unsigned long, if the argument fits into
+ * that type, or otherwise unsigned long long.
+ */
+#define __inttype(x) \
+__typeof__(__builtin_choose_expr(sizeof(x) > sizeof(0UL), 0ULL, 0UL))
/**
* get_user: - Get a simple variable from user space.
@@ -150,38 +149,26 @@ extern int __get_user_bad(void);
* Returns zero on success, or -EFAULT on error.
* On error, the variable @x is set to zero.
*/
-#ifdef CONFIG_X86_32
-#define __get_user_8(__ret_gu, __val_gu, ptr) \
- __get_user_x(X, __ret_gu, __val_gu, ptr)
-#else
-#define __get_user_8(__ret_gu, __val_gu, ptr) \
- __get_user_x(8, __ret_gu, __val_gu, ptr)
-#endif
-
+/*
+ * Careful: we have to cast the result to the type of the pointer
+ * for sign reasons.
+ *
+ * The use of %edx as the register specifier is a bit of a
+ * simplification, as gcc only cares about it as the starting point
+ * and not size: for a 64-bit value it will use %ecx:%edx on 32 bits
+ * (%ecx being the next register in gcc's x86 register sequence), and
+ * %rdx on 64 bits.
+ */
#define get_user(x, ptr) \
({ \
int __ret_gu; \
- unsigned long __val_gu; \
+ register __inttype(*(ptr)) __val_gu asm("%edx"); \
__chk_user_ptr(ptr); \
might_fault(); \
- switch (sizeof(*(ptr))) { \
- case 1: \
- __get_user_x(1, __ret_gu, __val_gu, ptr); \
- break; \
- case 2: \
- __get_user_x(2, __ret_gu, __val_gu, ptr); \
- break; \
- case 4: \
- __get_user_x(4, __ret_gu, __val_gu, ptr); \
- break; \
- case 8: \
- __get_user_8(__ret_gu, __val_gu, ptr); \
- break; \
- default: \
- __get_user_x(X, __ret_gu, __val_gu, ptr); \
- break; \
- } \
- (x) = (__typeof__(*(ptr)))__val_gu; \
+ asm volatile("call __get_user_%P3" \
+ : "=a" (__ret_gu), "=r" (__val_gu) \
+ : "0" (ptr), "i" (sizeof(*(ptr)))); \
+ (x) = (__typeof__(*(ptr))) __val_gu; \
__ret_gu; \
})
diff --git a/arch/x86/include/asm/unistd.h b/arch/x86/include/asm/unistd.h
index a0790e07ba65..3d5df1c4447f 100644
--- a/arch/x86/include/asm/unistd.h
+++ b/arch/x86/include/asm/unistd.h
@@ -38,8 +38,6 @@
# define __ARCH_WANT_SYS_OLD_GETRLIMIT
# define __ARCH_WANT_SYS_OLD_UNAME
# define __ARCH_WANT_SYS_PAUSE
-# define __ARCH_WANT_SYS_RT_SIGACTION
-# define __ARCH_WANT_SYS_RT_SIGSUSPEND
# define __ARCH_WANT_SYS_SGETMASK
# define __ARCH_WANT_SYS_SIGNAL
# define __ARCH_WANT_SYS_SIGPENDING
diff --git a/arch/x86/include/asm/vmx.h b/arch/x86/include/asm/vmx.h
index 235b49fa554b..b6fbf860e398 100644
--- a/arch/x86/include/asm/vmx.h
+++ b/arch/x86/include/asm/vmx.h
@@ -57,9 +57,12 @@
#define SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES 0x00000001
#define SECONDARY_EXEC_ENABLE_EPT 0x00000002
#define SECONDARY_EXEC_RDTSCP 0x00000008
+#define SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE 0x00000010
#define SECONDARY_EXEC_ENABLE_VPID 0x00000020
#define SECONDARY_EXEC_WBINVD_EXITING 0x00000040
#define SECONDARY_EXEC_UNRESTRICTED_GUEST 0x00000080
+#define SECONDARY_EXEC_APIC_REGISTER_VIRT 0x00000100
+#define SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY 0x00000200
#define SECONDARY_EXEC_PAUSE_LOOP_EXITING 0x00000400
#define SECONDARY_EXEC_ENABLE_INVPCID 0x00001000
@@ -97,6 +100,7 @@ enum vmcs_field {
GUEST_GS_SELECTOR = 0x0000080a,
GUEST_LDTR_SELECTOR = 0x0000080c,
GUEST_TR_SELECTOR = 0x0000080e,
+ GUEST_INTR_STATUS = 0x00000810,
HOST_ES_SELECTOR = 0x00000c00,
HOST_CS_SELECTOR = 0x00000c02,
HOST_SS_SELECTOR = 0x00000c04,
@@ -124,6 +128,14 @@ enum vmcs_field {
APIC_ACCESS_ADDR_HIGH = 0x00002015,
EPT_POINTER = 0x0000201a,
EPT_POINTER_HIGH = 0x0000201b,
+ EOI_EXIT_BITMAP0 = 0x0000201c,
+ EOI_EXIT_BITMAP0_HIGH = 0x0000201d,
+ EOI_EXIT_BITMAP1 = 0x0000201e,
+ EOI_EXIT_BITMAP1_HIGH = 0x0000201f,
+ EOI_EXIT_BITMAP2 = 0x00002020,
+ EOI_EXIT_BITMAP2_HIGH = 0x00002021,
+ EOI_EXIT_BITMAP3 = 0x00002022,
+ EOI_EXIT_BITMAP3_HIGH = 0x00002023,
GUEST_PHYSICAL_ADDRESS = 0x00002400,
GUEST_PHYSICAL_ADDRESS_HIGH = 0x00002401,
VMCS_LINK_POINTER = 0x00002800,
@@ -346,9 +358,9 @@ enum vmcs_field {
#define AR_RESERVD_MASK 0xfffe0f00
-#define TSS_PRIVATE_MEMSLOT (KVM_MEMORY_SLOTS + 0)
-#define APIC_ACCESS_PAGE_PRIVATE_MEMSLOT (KVM_MEMORY_SLOTS + 1)
-#define IDENTITY_PAGETABLE_PRIVATE_MEMSLOT (KVM_MEMORY_SLOTS + 2)
+#define TSS_PRIVATE_MEMSLOT (KVM_USER_MEM_SLOTS + 0)
+#define APIC_ACCESS_PAGE_PRIVATE_MEMSLOT (KVM_USER_MEM_SLOTS + 1)
+#define IDENTITY_PAGETABLE_PRIVATE_MEMSLOT (KVM_USER_MEM_SLOTS + 2)
#define VMX_NR_VPIDS (1 << 16)
#define VMX_VPID_EXTENT_SINGLE_CONTEXT 1
diff --git a/arch/x86/include/asm/x86_init.h b/arch/x86/include/asm/x86_init.h
index 7669941cc9d2..d8d99222b36a 100644
--- a/arch/x86/include/asm/x86_init.h
+++ b/arch/x86/include/asm/x86_init.h
@@ -69,17 +69,6 @@ struct x86_init_oem {
};
/**
- * struct x86_init_mapping - platform specific initial kernel pagetable setup
- * @pagetable_reserve: reserve a range of addresses for kernel pagetable usage
- *
- * For more details on the purpose of this hook, look in
- * init_memory_mapping and the commit that added it.
- */
-struct x86_init_mapping {
- void (*pagetable_reserve)(u64 start, u64 end);
-};
-
-/**
* struct x86_init_paging - platform specific paging functions
* @pagetable_init: platform specific paging initialization call to setup
* the kernel pagetables and prepare accessors functions.
@@ -136,7 +125,6 @@ struct x86_init_ops {
struct x86_init_mpparse mpparse;
struct x86_init_irqs irqs;
struct x86_init_oem oem;
- struct x86_init_mapping mapping;
struct x86_init_paging paging;
struct x86_init_timers timers;
struct x86_init_iommu iommu;
diff --git a/arch/x86/include/asm/xen/events.h b/arch/x86/include/asm/xen/events.h
index cc146d51449e..ca842f2769ef 100644
--- a/arch/x86/include/asm/xen/events.h
+++ b/arch/x86/include/asm/xen/events.h
@@ -16,4 +16,7 @@ static inline int xen_irqs_disabled(struct pt_regs *regs)
return raw_irqs_disabled_flags(regs->flags);
}
+/* No need for a barrier -- XCHG is a barrier on x86. */
+#define xchg_xen_ulong(ptr, val) xchg((ptr), (val))
+
#endif /* _ASM_X86_XEN_EVENTS_H */
diff --git a/arch/x86/include/asm/xen/page.h b/arch/x86/include/asm/xen/page.h
index 472b9b783019..6aef9fbc09b7 100644
--- a/arch/x86/include/asm/xen/page.h
+++ b/arch/x86/include/asm/xen/page.h
@@ -212,4 +212,6 @@ unsigned long arbitrary_virt_to_mfn(void *vaddr);
void make_lowmem_page_readonly(void *vaddr);
void make_lowmem_page_readwrite(void *vaddr);
+#define xen_remap(cookie, size) ioremap((cookie), (size));
+
#endif /* _ASM_X86_XEN_PAGE_H */
diff --git a/arch/x86/include/uapi/asm/msr-index.h b/arch/x86/include/uapi/asm/msr-index.h
index 075a40255591..892ce40a7470 100644
--- a/arch/x86/include/uapi/asm/msr-index.h
+++ b/arch/x86/include/uapi/asm/msr-index.h
@@ -103,6 +103,8 @@
#define DEBUGCTLMSR_BTS_OFF_USR (1UL << 10)
#define DEBUGCTLMSR_FREEZE_LBRS_ON_PMI (1UL << 11)
+#define MSR_IA32_POWER_CTL 0x000001fc
+
#define MSR_IA32_MC0_CTL 0x00000400
#define MSR_IA32_MC0_STATUS 0x00000401
#define MSR_IA32_MC0_ADDR 0x00000402
@@ -173,6 +175,7 @@
#define MSR_AMD64_OSVW_ID_LENGTH 0xc0010140
#define MSR_AMD64_OSVW_STATUS 0xc0010141
#define MSR_AMD64_DC_CFG 0xc0011022
+#define MSR_AMD64_BU_CFG2 0xc001102a
#define MSR_AMD64_IBSFETCHCTL 0xc0011030
#define MSR_AMD64_IBSFETCHLINAD 0xc0011031
#define MSR_AMD64_IBSFETCHPHYSAD 0xc0011032
@@ -274,6 +277,7 @@
#define MSR_IA32_PLATFORM_ID 0x00000017
#define MSR_IA32_EBL_CR_POWERON 0x0000002a
#define MSR_EBC_FREQUENCY_ID 0x0000002c
+#define MSR_SMI_COUNT 0x00000034
#define MSR_IA32_FEATURE_CONTROL 0x0000003a
#define MSR_IA32_TSC_ADJUST 0x0000003b
diff --git a/arch/x86/include/uapi/asm/signal.h b/arch/x86/include/uapi/asm/signal.h
index aa7d6ae39e0e..8264f47cf53e 100644
--- a/arch/x86/include/uapi/asm/signal.h
+++ b/arch/x86/include/uapi/asm/signal.h
@@ -95,9 +95,9 @@ typedef unsigned long sigset_t;
#ifndef __ASSEMBLY__
-#ifdef __i386__
# ifndef __KERNEL__
/* Here we must cater to libcs that poke about in kernel headers. */
+#ifdef __i386__
struct sigaction {
union {
@@ -112,7 +112,6 @@ struct sigaction {
#define sa_handler _u._sa_handler
#define sa_sigaction _u._sa_sigaction
-# endif /* ! __KERNEL__ */
#else /* __i386__ */
struct sigaction {
@@ -122,11 +121,8 @@ struct sigaction {
sigset_t sa_mask; /* mask last for extensibility */
};
-struct k_sigaction {
- struct sigaction sa;
-};
-
#endif /* !__i386__ */
+# endif /* ! __KERNEL__ */
typedef struct sigaltstack {
void __user *ss_sp;
diff --git a/arch/x86/include/uapi/asm/vmx.h b/arch/x86/include/uapi/asm/vmx.h
index 979d03bce135..2871fccfee68 100644
--- a/arch/x86/include/uapi/asm/vmx.h
+++ b/arch/x86/include/uapi/asm/vmx.h
@@ -62,10 +62,12 @@
#define EXIT_REASON_MCE_DURING_VMENTRY 41
#define EXIT_REASON_TPR_BELOW_THRESHOLD 43
#define EXIT_REASON_APIC_ACCESS 44
+#define EXIT_REASON_EOI_INDUCED 45
#define EXIT_REASON_EPT_VIOLATION 48
#define EXIT_REASON_EPT_MISCONFIG 49
#define EXIT_REASON_WBINVD 54
#define EXIT_REASON_XSETBV 55
+#define EXIT_REASON_APIC_WRITE 56
#define EXIT_REASON_INVPCID 58
#define VMX_EXIT_REASONS \
@@ -103,7 +105,12 @@
{ EXIT_REASON_APIC_ACCESS, "APIC_ACCESS" }, \
{ EXIT_REASON_EPT_VIOLATION, "EPT_VIOLATION" }, \
{ EXIT_REASON_EPT_MISCONFIG, "EPT_MISCONFIG" }, \
- { EXIT_REASON_WBINVD, "WBINVD" }
+ { EXIT_REASON_WBINVD, "WBINVD" }, \
+ { EXIT_REASON_APIC_WRITE, "APIC_WRITE" }, \
+ { EXIT_REASON_EOI_INDUCED, "EOI_INDUCED" }, \
+ { EXIT_REASON_INVALID_STATE, "INVALID_STATE" }, \
+ { EXIT_REASON_INVD, "INVD" }, \
+ { EXIT_REASON_INVPCID, "INVPCID" }
#endif /* _UAPIVMX_H */
diff --git a/arch/x86/kernel/Makefile b/arch/x86/kernel/Makefile
index ac3b3d002833..7bd3bd310106 100644
--- a/arch/x86/kernel/Makefile
+++ b/arch/x86/kernel/Makefile
@@ -87,6 +87,9 @@ obj-$(CONFIG_PARAVIRT_CLOCK) += pvclock.o
obj-$(CONFIG_PCSPKR_PLATFORM) += pcspeaker.o
+obj-$(CONFIG_MICROCODE_EARLY) += microcode_core_early.o
+obj-$(CONFIG_MICROCODE_INTEL_EARLY) += microcode_intel_early.o
+obj-$(CONFIG_MICROCODE_INTEL_LIB) += microcode_intel_lib.o
microcode-y := microcode_core.o
microcode-$(CONFIG_MICROCODE_INTEL) += microcode_intel.o
microcode-$(CONFIG_MICROCODE_AMD) += microcode_amd.o
diff --git a/arch/x86/kernel/acpi/boot.c b/arch/x86/kernel/acpi/boot.c
index bacf4b0d91f4..230c8ea878e5 100644
--- a/arch/x86/kernel/acpi/boot.c
+++ b/arch/x86/kernel/acpi/boot.c
@@ -51,7 +51,6 @@ EXPORT_SYMBOL(acpi_disabled);
#ifdef CONFIG_X86_64
# include <asm/proto.h>
-# include <asm/numa_64.h>
#endif /* X86 */
#define BAD_MADT_ENTRY(entry, end) ( \
@@ -697,6 +696,10 @@ EXPORT_SYMBOL(acpi_map_lsapic);
int acpi_unmap_lsapic(int cpu)
{
+#ifdef CONFIG_ACPI_NUMA
+ set_apicid_to_node(per_cpu(x86_cpu_to_apicid, cpu), NUMA_NO_NODE);
+#endif
+
per_cpu(x86_cpu_to_apicid, cpu) = -1;
set_cpu_present(cpu, false);
num_processors--;
diff --git a/arch/x86/kernel/acpi/sleep.c b/arch/x86/kernel/acpi/sleep.c
index d5e0d717005a..0532f5d6e4ef 100644
--- a/arch/x86/kernel/acpi/sleep.c
+++ b/arch/x86/kernel/acpi/sleep.c
@@ -69,7 +69,7 @@ int acpi_suspend_lowlevel(void)
#ifndef CONFIG_64BIT
header->pmode_entry = (u32)&wakeup_pmode_return;
- header->pmode_cr3 = (u32)__pa(&initial_page_table);
+ header->pmode_cr3 = (u32)__pa_symbol(initial_page_table);
saved_magic = 0x12345678;
#else /* CONFIG_64BIT */
#ifdef CONFIG_SMP
diff --git a/arch/x86/kernel/amd_gart_64.c b/arch/x86/kernel/amd_gart_64.c
index e66311200cbd..b574b295a2f9 100644
--- a/arch/x86/kernel/amd_gart_64.c
+++ b/arch/x86/kernel/amd_gart_64.c
@@ -768,10 +768,9 @@ int __init gart_iommu_init(void)
aper_base = info.aper_base;
end_pfn = (aper_base>>PAGE_SHIFT) + (aper_size>>PAGE_SHIFT);
- if (end_pfn > max_low_pfn_mapped) {
- start_pfn = (aper_base>>PAGE_SHIFT);
+ start_pfn = PFN_DOWN(aper_base);
+ if (!pfn_range_is_mapped(start_pfn, end_pfn))
init_memory_mapping(start_pfn<<PAGE_SHIFT, end_pfn<<PAGE_SHIFT);
- }
pr_info("PCI-DMA: using GART IOMMU.\n");
iommu_size = check_iommu_size(info.aper_base, aper_size);
diff --git a/arch/x86/kernel/apic/apic_numachip.c b/arch/x86/kernel/apic/apic_numachip.c
index 9c2aa89a11cb..9a9110918ca7 100644
--- a/arch/x86/kernel/apic/apic_numachip.c
+++ b/arch/x86/kernel/apic/apic_numachip.c
@@ -28,6 +28,7 @@
#include <asm/apic.h>
#include <asm/ipi.h>
#include <asm/apic_flat_64.h>
+#include <asm/pgtable.h>
static int numachip_system __read_mostly;
diff --git a/arch/x86/kernel/apm_32.c b/arch/x86/kernel/apm_32.c
index 8d7012b7f402..66b5faffe14a 100644
--- a/arch/x86/kernel/apm_32.c
+++ b/arch/x86/kernel/apm_32.c
@@ -232,6 +232,7 @@
#include <linux/acpi.h>
#include <linux/syscore_ops.h>
#include <linux/i8253.h>
+#include <linux/cpuidle.h>
#include <asm/uaccess.h>
#include <asm/desc.h>
@@ -360,13 +361,35 @@ struct apm_user {
* idle percentage above which bios idle calls are done
*/
#ifdef CONFIG_APM_CPU_IDLE
-#warning deprecated CONFIG_APM_CPU_IDLE will be deleted in 2012
#define DEFAULT_IDLE_THRESHOLD 95
#else
#define DEFAULT_IDLE_THRESHOLD 100
#endif
#define DEFAULT_IDLE_PERIOD (100 / 3)
+static int apm_cpu_idle(struct cpuidle_device *dev,
+ struct cpuidle_driver *drv, int index);
+
+static struct cpuidle_driver apm_idle_driver = {
+ .name = "apm_idle",
+ .owner = THIS_MODULE,
+ .en_core_tk_irqen = 1,
+ .states = {
+ { /* entry 0 is for polling */ },
+ { /* entry 1 is for APM idle */
+ .name = "APM",
+ .desc = "APM idle",
+ .flags = CPUIDLE_FLAG_TIME_VALID,
+ .exit_latency = 250, /* WAG */
+ .target_residency = 500, /* WAG */
+ .enter = &apm_cpu_idle
+ },
+ },
+ .state_count = 2,
+};
+
+static struct cpuidle_device apm_cpuidle_device;
+
/*
* Local variables
*/
@@ -377,7 +400,6 @@ static struct {
static int clock_slowed;
static int idle_threshold __read_mostly = DEFAULT_IDLE_THRESHOLD;
static int idle_period __read_mostly = DEFAULT_IDLE_PERIOD;
-static int set_pm_idle;
static int suspends_pending;
static int standbys_pending;
static int ignore_sys_suspend;
@@ -884,8 +906,6 @@ static void apm_do_busy(void)
#define IDLE_CALC_LIMIT (HZ * 100)
#define IDLE_LEAKY_MAX 16
-static void (*original_pm_idle)(void) __read_mostly;
-
/**
* apm_cpu_idle - cpu idling for APM capable Linux
*
@@ -894,7 +914,8 @@ static void (*original_pm_idle)(void) __read_mostly;
* Furthermore it calls the system default idle routine.
*/
-static void apm_cpu_idle(void)
+static int apm_cpu_idle(struct cpuidle_device *dev,
+ struct cpuidle_driver *drv, int index)
{
static int use_apm_idle; /* = 0 */
static unsigned int last_jiffies; /* = 0 */
@@ -905,7 +926,6 @@ static void apm_cpu_idle(void)
unsigned int jiffies_since_last_check = jiffies - last_jiffies;
unsigned int bucket;
- WARN_ONCE(1, "deprecated apm_cpu_idle will be deleted in 2012");
recalc:
task_cputime(current, NULL, &stime);
if (jiffies_since_last_check > IDLE_CALC_LIMIT) {
@@ -951,10 +971,7 @@ recalc:
break;
}
}
- if (original_pm_idle)
- original_pm_idle();
- else
- default_idle();
+ default_idle();
local_irq_disable();
jiffies_since_last_check = jiffies - last_jiffies;
if (jiffies_since_last_check > idle_period)
@@ -964,7 +981,7 @@ recalc:
if (apm_idle_done)
apm_do_busy();
- local_irq_enable();
+ return index;
}
/**
@@ -2382,9 +2399,9 @@ static int __init apm_init(void)
if (HZ != 100)
idle_period = (idle_period * HZ) / 100;
if (idle_threshold < 100) {
- original_pm_idle = pm_idle;
- pm_idle = apm_cpu_idle;
- set_pm_idle = 1;
+ if (!cpuidle_register_driver(&apm_idle_driver))
+ if (cpuidle_register_device(&apm_cpuidle_device))
+ cpuidle_unregister_driver(&apm_idle_driver);
}
return 0;
@@ -2394,15 +2411,9 @@ static void __exit apm_exit(void)
{
int error;
- if (set_pm_idle) {
- pm_idle = original_pm_idle;
- /*
- * We are about to unload the current idle thread pm callback
- * (pm_idle), Wait for all processors to update cached/local
- * copies of pm_idle before proceeding.
- */
- kick_all_cpus_sync();
- }
+ cpuidle_unregister_device(&apm_cpuidle_device);
+ cpuidle_unregister_driver(&apm_idle_driver);
+
if (((apm_info.bios.flags & APM_BIOS_DISENGAGED) == 0)
&& (apm_info.connection_version > 0x0100)) {
error = apm_engage_power_management(APM_DEVICE_ALL, 0);
diff --git a/arch/x86/kernel/cpu/amd.c b/arch/x86/kernel/cpu/amd.c
index 782c456eaa01..fa96eb0d02fb 100644
--- a/arch/x86/kernel/cpu/amd.c
+++ b/arch/x86/kernel/cpu/amd.c
@@ -12,7 +12,6 @@
#include <asm/pci-direct.h>
#ifdef CONFIG_X86_64
-# include <asm/numa_64.h>
# include <asm/mmconfig.h>
# include <asm/cacheflush.h>
#endif
@@ -220,8 +219,7 @@ static void __cpuinit amd_k7_smp_check(struct cpuinfo_x86 *c)
*/
WARN_ONCE(1, "WARNING: This combination of AMD"
" processors is not suitable for SMP.\n");
- if (!test_taint(TAINT_UNSAFE_SMP))
- add_taint(TAINT_UNSAFE_SMP);
+ add_taint(TAINT_UNSAFE_SMP, LOCKDEP_NOW_UNRELIABLE);
valid_k7:
;
@@ -518,10 +516,9 @@ static void __cpuinit early_init_amd(struct cpuinfo_x86 *c)
static void __cpuinit init_amd(struct cpuinfo_x86 *c)
{
u32 dummy;
-
-#ifdef CONFIG_SMP
unsigned long long value;
+#ifdef CONFIG_SMP
/*
* Disable TLB flush filter by setting HWCR.FFDIS on K8
* bit 6 of msr C001_0015
@@ -559,12 +556,10 @@ static void __cpuinit init_amd(struct cpuinfo_x86 *c)
* (AMD Erratum #110, docId: 25759).
*/
if (c->x86_model < 0x14 && cpu_has(c, X86_FEATURE_LAHF_LM)) {
- u64 val;
-
clear_cpu_cap(c, X86_FEATURE_LAHF_LM);
- if (!rdmsrl_amd_safe(0xc001100d, &val)) {
- val &= ~(1ULL << 32);
- wrmsrl_amd_safe(0xc001100d, val);
+ if (!rdmsrl_amd_safe(0xc001100d, &value)) {
+ value &= ~(1ULL << 32);
+ wrmsrl_amd_safe(0xc001100d, value);
}
}
@@ -617,13 +612,12 @@ static void __cpuinit init_amd(struct cpuinfo_x86 *c)
if ((c->x86 == 0x15) &&
(c->x86_model >= 0x10) && (c->x86_model <= 0x1f) &&
!cpu_has(c, X86_FEATURE_TOPOEXT)) {
- u64 val;
- if (!rdmsrl_safe(0xc0011005, &val)) {
- val |= 1ULL << 54;
- wrmsrl_safe(0xc0011005, val);
- rdmsrl(0xc0011005, val);
- if (val & (1ULL << 54)) {
+ if (!rdmsrl_safe(0xc0011005, &value)) {
+ value |= 1ULL << 54;
+ wrmsrl_safe(0xc0011005, value);
+ rdmsrl(0xc0011005, value);
+ if (value & (1ULL << 54)) {
set_cpu_cap(c, X86_FEATURE_TOPOEXT);
printk(KERN_INFO FW_INFO "CPU: Re-enabling "
"disabled Topology Extensions Support\n");
@@ -637,11 +631,10 @@ static void __cpuinit init_amd(struct cpuinfo_x86 *c)
*/
if ((c->x86 == 0x15) &&
(c->x86_model >= 0x02) && (c->x86_model < 0x20)) {
- u64 val;
- if (!rdmsrl_safe(0xc0011021, &val) && !(val & 0x1E)) {
- val |= 0x1E;
- wrmsrl_safe(0xc0011021, val);
+ if (!rdmsrl_safe(0xc0011021, &value) && !(value & 0x1E)) {
+ value |= 0x1E;
+ wrmsrl_safe(0xc0011021, value);
}
}
@@ -685,12 +678,10 @@ static void __cpuinit init_amd(struct cpuinfo_x86 *c)
* benefit in doing so.
*/
if (!rdmsrl_safe(MSR_K8_TSEG_ADDR, &tseg)) {
+ unsigned long pfn = tseg >> PAGE_SHIFT;
+
printk(KERN_DEBUG "tseg: %010llx\n", tseg);
- if ((tseg>>PMD_SHIFT) <
- (max_low_pfn_mapped>>(PMD_SHIFT-PAGE_SHIFT)) ||
- ((tseg>>PMD_SHIFT) <
- (max_pfn_mapped>>(PMD_SHIFT-PAGE_SHIFT)) &&
- (tseg>>PMD_SHIFT) >= (1ULL<<(32 - PMD_SHIFT))))
+ if (pfn_range_is_mapped(pfn, pfn + 1))
set_memory_4k((unsigned long)__va(tseg), 1);
}
}
@@ -703,13 +694,11 @@ static void __cpuinit init_amd(struct cpuinfo_x86 *c)
if (c->x86 > 0x11)
set_cpu_cap(c, X86_FEATURE_ARAT);
- /*
- * Disable GART TLB Walk Errors on Fam10h. We do this here
- * because this is always needed when GART is enabled, even in a
- * kernel which has no MCE support built in.
- */
if (c->x86 == 0x10) {
/*
+ * Disable GART TLB Walk Errors on Fam10h. We do this here
+ * because this is always needed when GART is enabled, even in a
+ * kernel which has no MCE support built in.
* BIOS should disable GartTlbWlk Errors themself. If
* it doesn't do it here as suggested by the BKDG.
*
@@ -723,6 +712,21 @@ static void __cpuinit init_amd(struct cpuinfo_x86 *c)
mask |= (1 << 10);
wrmsrl_safe(MSR_AMD64_MCx_MASK(4), mask);
}
+
+ /*
+ * On family 10h BIOS may not have properly enabled WC+ support,
+ * causing it to be converted to CD memtype. This may result in
+ * performance degradation for certain nested-paging guests.
+ * Prevent this conversion by clearing bit 24 in
+ * MSR_AMD64_BU_CFG2.
+ *
+ * NOTE: we want to use the _safe accessors so as not to #GP kvm
+ * guests on older kvm hosts.
+ */
+
+ rdmsrl_safe(MSR_AMD64_BU_CFG2, &value);
+ value &= ~(1ULL << 24);
+ wrmsrl_safe(MSR_AMD64_BU_CFG2, value);
}
rdmsr_safe(MSR_AMD64_PATCH_LEVEL, &c->microcode, &dummy);
diff --git a/arch/x86/kernel/cpu/bugs.c b/arch/x86/kernel/cpu/bugs.c
index 92dfec986a48..af6455e3fcc9 100644
--- a/arch/x86/kernel/cpu/bugs.c
+++ b/arch/x86/kernel/cpu/bugs.c
@@ -17,15 +17,6 @@
#include <asm/paravirt.h>
#include <asm/alternative.h>
-static int __init no_halt(char *s)
-{
- WARN_ONCE(1, "\"no-hlt\" is deprecated, please use \"idle=poll\"\n");
- boot_cpu_data.hlt_works_ok = 0;
- return 1;
-}
-
-__setup("no-hlt", no_halt);
-
static int __init no_387(char *s)
{
boot_cpu_data.hard_math = 0;
@@ -89,23 +80,6 @@ static void __init check_fpu(void)
pr_warn("Hmm, FPU with FDIV bug\n");
}
-static void __init check_hlt(void)
-{
- if (boot_cpu_data.x86 >= 5 || paravirt_enabled())
- return;
-
- pr_info("Checking 'hlt' instruction... ");
- if (!boot_cpu_data.hlt_works_ok) {
- pr_cont("disabled\n");
- return;
- }
- halt();
- halt();
- halt();
- halt();
- pr_cont("OK\n");
-}
-
/*
* Check whether we are able to run this kernel safely on SMP.
*
@@ -129,7 +103,6 @@ void __init check_bugs(void)
print_cpu_info(&boot_cpu_data);
#endif
check_config();
- check_hlt();
init_utsname()->machine[1] =
'0' + (boot_cpu_data.x86 > 6 ? 6 : boot_cpu_data.x86);
alternative_instructions();
diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c
index 9c3ab43a6954..d814772c5bed 100644
--- a/arch/x86/kernel/cpu/common.c
+++ b/arch/x86/kernel/cpu/common.c
@@ -37,6 +37,8 @@
#include <asm/mce.h>
#include <asm/msr.h>
#include <asm/pat.h>
+#include <asm/microcode.h>
+#include <asm/microcode_intel.h>
#ifdef CONFIG_X86_LOCAL_APIC
#include <asm/uv/uv.h>
@@ -213,7 +215,7 @@ static inline int flag_is_changeable_p(u32 flag)
}
/* Probe for the CPUID instruction */
-static int __cpuinit have_cpuid_p(void)
+int __cpuinit have_cpuid_p(void)
{
return flag_is_changeable_p(X86_EFLAGS_ID);
}
@@ -249,11 +251,6 @@ static inline int flag_is_changeable_p(u32 flag)
{
return 1;
}
-/* Probe for the CPUID instruction */
-static inline int have_cpuid_p(void)
-{
- return 1;
-}
static inline void squash_the_stupid_serial_number(struct cpuinfo_x86 *c)
{
}
@@ -1223,6 +1220,12 @@ void __cpuinit cpu_init(void)
int cpu;
int i;
+ /*
+ * Load microcode on this cpu if a valid microcode is available.
+ * This is early microcode loading procedure.
+ */
+ load_ucode_ap();
+
cpu = stack_smp_processor_id();
t = &per_cpu(init_tss, cpu);
oist = &per_cpu(orig_ist, cpu);
@@ -1314,6 +1317,8 @@ void __cpuinit cpu_init(void)
struct tss_struct *t = &per_cpu(init_tss, cpu);
struct thread_struct *thread = &curr->thread;
+ show_ucode_info_early();
+
if (cpumask_test_and_set_cpu(cpu, cpu_initialized_mask)) {
printk(KERN_WARNING "CPU#%d already initialized!\n", cpu);
for (;;)
diff --git a/arch/x86/kernel/cpu/intel.c b/arch/x86/kernel/cpu/intel.c
index fcaabd0432c5..1905ce98bee0 100644
--- a/arch/x86/kernel/cpu/intel.c
+++ b/arch/x86/kernel/cpu/intel.c
@@ -17,7 +17,6 @@
#ifdef CONFIG_X86_64
#include <linux/topology.h>
-#include <asm/numa_64.h>
#endif
#include "cpu.h"
@@ -168,7 +167,7 @@ int __cpuinit ppro_with_ram_bug(void)
#ifdef CONFIG_X86_F00F_BUG
static void __cpuinit trap_init_f00f_bug(void)
{
- __set_fixmap(FIX_F00F_IDT, __pa(&idt_table), PAGE_KERNEL_RO);
+ __set_fixmap(FIX_F00F_IDT, __pa_symbol(idt_table), PAGE_KERNEL_RO);
/*
* Update the IDT descriptor and reload the IDT so that
diff --git a/arch/x86/kernel/cpu/mcheck/mce.c b/arch/x86/kernel/cpu/mcheck/mce.c
index fc7608a89d93..7bc126346ace 100644
--- a/arch/x86/kernel/cpu/mcheck/mce.c
+++ b/arch/x86/kernel/cpu/mcheck/mce.c
@@ -1082,7 +1082,7 @@ void do_machine_check(struct pt_regs *regs, long error_code)
/*
* Set taint even when machine check was not enabled.
*/
- add_taint(TAINT_MACHINE_CHECK);
+ add_taint(TAINT_MACHINE_CHECK, LOCKDEP_NOW_UNRELIABLE);
severity = mce_severity(&m, cfg->tolerant, NULL);
diff --git a/arch/x86/kernel/cpu/mcheck/p5.c b/arch/x86/kernel/cpu/mcheck/p5.c
index 2d5454cd2c4f..1c044b1ccc59 100644
--- a/arch/x86/kernel/cpu/mcheck/p5.c
+++ b/arch/x86/kernel/cpu/mcheck/p5.c
@@ -33,7 +33,7 @@ static void pentium_machine_check(struct pt_regs *regs, long error_code)
smp_processor_id());
}
- add_taint(TAINT_MACHINE_CHECK);
+ add_taint(TAINT_MACHINE_CHECK, LOCKDEP_NOW_UNRELIABLE);
}
/* Set up machine check reporting for processors with Intel style MCE: */
diff --git a/arch/x86/kernel/cpu/mcheck/winchip.c b/arch/x86/kernel/cpu/mcheck/winchip.c
index 2d7998fb628c..e9a701aecaa1 100644
--- a/arch/x86/kernel/cpu/mcheck/winchip.c
+++ b/arch/x86/kernel/cpu/mcheck/winchip.c
@@ -15,7 +15,7 @@
static void winchip_machine_check(struct pt_regs *regs, long error_code)
{
printk(KERN_EMERG "CPU0: Machine Check Exception.\n");
- add_taint(TAINT_MACHINE_CHECK);
+ add_taint(TAINT_MACHINE_CHECK, LOCKDEP_NOW_UNRELIABLE);
}
/* Set up machine check reporting on the Winchip C6 series */
diff --git a/arch/x86/kernel/cpu/mtrr/generic.c b/arch/x86/kernel/cpu/mtrr/generic.c
index e9fe907cd249..fa72a39e5d46 100644
--- a/arch/x86/kernel/cpu/mtrr/generic.c
+++ b/arch/x86/kernel/cpu/mtrr/generic.c
@@ -542,7 +542,7 @@ static void generic_get_mtrr(unsigned int reg, unsigned long *base,
if (tmp != mask_lo) {
printk(KERN_WARNING "mtrr: your BIOS has configured an incorrect mask, fixing it.\n");
- add_taint(TAINT_FIRMWARE_WORKAROUND);
+ add_taint(TAINT_FIRMWARE_WORKAROUND, LOCKDEP_STILL_OK);
mask_lo = tmp;
}
}
diff --git a/arch/x86/kernel/cpu/perf_event_amd_ibs.c b/arch/x86/kernel/cpu/perf_event_amd_ibs.c
index 6336bcbd0618..5f0581e713c2 100644
--- a/arch/x86/kernel/cpu/perf_event_amd_ibs.c
+++ b/arch/x86/kernel/cpu/perf_event_amd_ibs.c
@@ -528,7 +528,7 @@ static int perf_ibs_handle_irq(struct perf_ibs *perf_ibs, struct pt_regs *iregs)
if (!test_bit(IBS_STARTED, pcpu->state)) {
/*
* Catch spurious interrupts after stopping IBS: After
- * disabling IBS there could be still incomming NMIs
+ * disabling IBS there could be still incoming NMIs
* with samples that even have the valid bit cleared.
* Mark all this NMIs as handled.
*/
diff --git a/arch/x86/kernel/cpu/perf_event_intel.c b/arch/x86/kernel/cpu/perf_event_intel.c
index 4914e94ad6e8..529c8931fc02 100644
--- a/arch/x86/kernel/cpu/perf_event_intel.c
+++ b/arch/x86/kernel/cpu/perf_event_intel.c
@@ -107,6 +107,27 @@ static struct event_constraint intel_snb_event_constraints[] __read_mostly =
EVENT_CONSTRAINT_END
};
+static struct event_constraint intel_ivb_event_constraints[] __read_mostly =
+{
+ FIXED_EVENT_CONSTRAINT(0x00c0, 0), /* INST_RETIRED.ANY */
+ FIXED_EVENT_CONSTRAINT(0x003c, 1), /* CPU_CLK_UNHALTED.CORE */
+ FIXED_EVENT_CONSTRAINT(0x0300, 2), /* CPU_CLK_UNHALTED.REF */
+ INTEL_UEVENT_CONSTRAINT(0x0148, 0x4), /* L1D_PEND_MISS.PENDING */
+ INTEL_UEVENT_CONSTRAINT(0x0279, 0xf), /* IDQ.EMTPY */
+ INTEL_UEVENT_CONSTRAINT(0x019c, 0xf), /* IDQ_UOPS_NOT_DELIVERED.CORE */
+ INTEL_UEVENT_CONSTRAINT(0x04a3, 0xf), /* CYCLE_ACTIVITY.CYCLES_NO_EXECUTE */
+ INTEL_UEVENT_CONSTRAINT(0x05a3, 0xf), /* CYCLE_ACTIVITY.STALLS_L2_PENDING */
+ INTEL_UEVENT_CONSTRAINT(0x06a3, 0xf), /* CYCLE_ACTIVITY.STALLS_LDM_PENDING */
+ INTEL_UEVENT_CONSTRAINT(0x08a3, 0x4), /* CYCLE_ACTIVITY.CYCLES_L1D_PENDING */
+ INTEL_UEVENT_CONSTRAINT(0x0ca3, 0x4), /* CYCLE_ACTIVITY.STALLS_L1D_PENDING */
+ INTEL_UEVENT_CONSTRAINT(0x01c0, 0x2), /* INST_RETIRED.PREC_DIST */
+ INTEL_EVENT_CONSTRAINT(0xd0, 0xf), /* MEM_UOPS_RETIRED.* */
+ INTEL_EVENT_CONSTRAINT(0xd1, 0xf), /* MEM_LOAD_UOPS_RETIRED.* */
+ INTEL_EVENT_CONSTRAINT(0xd2, 0xf), /* MEM_LOAD_UOPS_LLC_HIT_RETIRED.* */
+ INTEL_EVENT_CONSTRAINT(0xd3, 0xf), /* MEM_LOAD_UOPS_LLC_MISS_RETIRED.* */
+ EVENT_CONSTRAINT_END
+};
+
static struct extra_reg intel_westmere_extra_regs[] __read_mostly =
{
INTEL_EVENT_EXTRA_REG(0xb7, MSR_OFFCORE_RSP_0, 0xffff, RSP_0),
@@ -2095,7 +2116,7 @@ __init int intel_pmu_init(void)
intel_pmu_lbr_init_snb();
- x86_pmu.event_constraints = intel_snb_event_constraints;
+ x86_pmu.event_constraints = intel_ivb_event_constraints;
x86_pmu.pebs_constraints = intel_ivb_pebs_event_constraints;
x86_pmu.pebs_aliases = intel_pebs_aliases_snb;
x86_pmu.extra_regs = intel_snb_extra_regs;
diff --git a/arch/x86/kernel/cpu/proc.c b/arch/x86/kernel/cpu/proc.c
index 3286a92e662a..e280253f6f94 100644
--- a/arch/x86/kernel/cpu/proc.c
+++ b/arch/x86/kernel/cpu/proc.c
@@ -28,7 +28,6 @@ static void show_cpuinfo_misc(struct seq_file *m, struct cpuinfo_x86 *c)
{
seq_printf(m,
"fdiv_bug\t: %s\n"
- "hlt_bug\t\t: %s\n"
"f00f_bug\t: %s\n"
"coma_bug\t: %s\n"
"fpu\t\t: %s\n"
@@ -36,7 +35,6 @@ static void show_cpuinfo_misc(struct seq_file *m, struct cpuinfo_x86 *c)
"cpuid level\t: %d\n"
"wp\t\t: %s\n",
c->fdiv_bug ? "yes" : "no",
- c->hlt_works_ok ? "no" : "yes",
c->f00f_bug ? "yes" : "no",
c->coma_bug ? "yes" : "no",
c->hard_math ? "yes" : "no",
diff --git a/arch/x86/kernel/dumpstack.c b/arch/x86/kernel/dumpstack.c
index ae42418bc50f..c8797d55b245 100644
--- a/arch/x86/kernel/dumpstack.c
+++ b/arch/x86/kernel/dumpstack.c
@@ -232,7 +232,7 @@ void __kprobes oops_end(unsigned long flags, struct pt_regs *regs, int signr)
bust_spinlocks(0);
die_owner = -1;
- add_taint(TAINT_DIE);
+ add_taint(TAINT_DIE, LOCKDEP_NOW_UNRELIABLE);
die_nest_count--;
if (!die_nest_count)
/* Nest count reaches zero, release the lock. */
diff --git a/arch/x86/kernel/e820.c b/arch/x86/kernel/e820.c
index df06ade26bef..d32abeabbda5 100644
--- a/arch/x86/kernel/e820.c
+++ b/arch/x86/kernel/e820.c
@@ -835,7 +835,7 @@ static int __init parse_memopt(char *p)
}
early_param("mem", parse_memopt);
-static int __init parse_memmap_opt(char *p)
+static int __init parse_memmap_one(char *p)
{
char *oldp;
u64 start_at, mem_size;
@@ -877,6 +877,20 @@ static int __init parse_memmap_opt(char *p)
return *p == '\0' ? 0 : -EINVAL;
}
+static int __init parse_memmap_opt(char *str)
+{
+ while (str) {
+ char *k = strchr(str, ',');
+
+ if (k)
+ *k++ = 0;
+
+ parse_memmap_one(str);
+ str = k;
+ }
+
+ return 0;
+}
early_param("memmap", parse_memmap_opt);
void __init finish_e820_parsing(void)
diff --git a/arch/x86/kernel/entry_32.S b/arch/x86/kernel/entry_32.S
index 8831176aa5ef..8f3e2dec1df3 100644
--- a/arch/x86/kernel/entry_32.S
+++ b/arch/x86/kernel/entry_32.S
@@ -699,51 +699,6 @@ END(syscall_badsys)
*/
.popsection
-/*
- * System calls that need a pt_regs pointer.
- */
-#define PTREGSCALL0(name) \
-ENTRY(ptregs_##name) ; \
- leal 4(%esp),%eax; \
- jmp sys_##name; \
-ENDPROC(ptregs_##name)
-
-#define PTREGSCALL1(name) \
-ENTRY(ptregs_##name) ; \
- leal 4(%esp),%edx; \
- movl (PT_EBX+4)(%esp),%eax; \
- jmp sys_##name; \
-ENDPROC(ptregs_##name)
-
-#define PTREGSCALL2(name) \
-ENTRY(ptregs_##name) ; \
- leal 4(%esp),%ecx; \
- movl (PT_ECX+4)(%esp),%edx; \
- movl (PT_EBX+4)(%esp),%eax; \
- jmp sys_##name; \
-ENDPROC(ptregs_##name)
-
-#define PTREGSCALL3(name) \
-ENTRY(ptregs_##name) ; \
- CFI_STARTPROC; \
- leal 4(%esp),%eax; \
- pushl_cfi %eax; \
- movl PT_EDX(%eax),%ecx; \
- movl PT_ECX(%eax),%edx; \
- movl PT_EBX(%eax),%eax; \
- call sys_##name; \
- addl $4,%esp; \
- CFI_ADJUST_CFA_OFFSET -4; \
- ret; \
- CFI_ENDPROC; \
-ENDPROC(ptregs_##name)
-
-PTREGSCALL1(iopl)
-PTREGSCALL0(sigreturn)
-PTREGSCALL0(rt_sigreturn)
-PTREGSCALL2(vm86)
-PTREGSCALL1(vm86old)
-
.macro FIXUP_ESPFIX_STACK
/*
* Switch back for ESPFIX stack to the normal zerobased stack
diff --git a/arch/x86/kernel/entry_64.S b/arch/x86/kernel/entry_64.S
index 048f2240f8e6..c1d01e6ca790 100644
--- a/arch/x86/kernel/entry_64.S
+++ b/arch/x86/kernel/entry_64.S
@@ -828,23 +828,6 @@ int_restore_rest:
CFI_ENDPROC
END(system_call)
-/*
- * Certain special system calls that need to save a complete full stack frame.
- */
- .macro PTREGSCALL label,func,arg
-ENTRY(\label)
- PARTIAL_FRAME 1 8 /* offset 8: return address */
- subq $REST_SKIP, %rsp
- CFI_ADJUST_CFA_OFFSET REST_SKIP
- call save_rest
- DEFAULT_FRAME 0 8 /* offset 8: return address */
- leaq 8(%rsp), \arg /* pt_regs pointer */
- call \func
- jmp ptregscall_common
- CFI_ENDPROC
-END(\label)
- .endm
-
.macro FORK_LIKE func
ENTRY(stub_\func)
CFI_STARTPROC
@@ -861,10 +844,22 @@ ENTRY(stub_\func)
END(stub_\func)
.endm
+ .macro FIXED_FRAME label,func
+ENTRY(\label)
+ CFI_STARTPROC
+ PARTIAL_FRAME 0 8 /* offset 8: return address */
+ FIXUP_TOP_OF_STACK %r11, 8-ARGOFFSET
+ call \func
+ RESTORE_TOP_OF_STACK %r11, 8-ARGOFFSET
+ ret
+ CFI_ENDPROC
+END(\label)
+ .endm
+
FORK_LIKE clone
FORK_LIKE fork
FORK_LIKE vfork
- PTREGSCALL stub_iopl, sys_iopl, %rsi
+ FIXED_FRAME stub_iopl, sys_iopl
ENTRY(ptregscall_common)
DEFAULT_FRAME 1 8 /* offset 8: return address */
@@ -886,7 +881,6 @@ ENTRY(stub_execve)
SAVE_REST
FIXUP_TOP_OF_STACK %r11
call sys_execve
- RESTORE_TOP_OF_STACK %r11
movq %rax,RAX(%rsp)
RESTORE_REST
jmp int_ret_from_sys_call
@@ -902,7 +896,6 @@ ENTRY(stub_rt_sigreturn)
addq $8, %rsp
PARTIAL_FRAME 0
SAVE_REST
- movq %rsp,%rdi
FIXUP_TOP_OF_STACK %r11
call sys_rt_sigreturn
movq %rax,RAX(%rsp) # fixme, this could be done at the higher layer
@@ -917,7 +910,6 @@ ENTRY(stub_x32_rt_sigreturn)
addq $8, %rsp
PARTIAL_FRAME 0
SAVE_REST
- movq %rsp,%rdi
FIXUP_TOP_OF_STACK %r11
call sys32_x32_rt_sigreturn
movq %rax,RAX(%rsp) # fixme, this could be done at the higher layer
diff --git a/arch/x86/kernel/ftrace.c b/arch/x86/kernel/ftrace.c
index 1d414029f1d8..42a392a9fd02 100644
--- a/arch/x86/kernel/ftrace.c
+++ b/arch/x86/kernel/ftrace.c
@@ -89,7 +89,7 @@ do_ftrace_mod_code(unsigned long ip, const void *new_code)
* kernel identity mapping to modify code.
*/
if (within(ip, (unsigned long)_text, (unsigned long)_etext))
- ip = (unsigned long)__va(__pa(ip));
+ ip = (unsigned long)__va(__pa_symbol(ip));
return probe_kernel_write((void *)ip, new_code, MCOUNT_INSN_SIZE);
}
@@ -279,7 +279,7 @@ static int ftrace_write(unsigned long ip, const char *val, int size)
* kernel identity mapping to modify code.
*/
if (within(ip, (unsigned long)_text, (unsigned long)_etext))
- ip = (unsigned long)__va(__pa(ip));
+ ip = (unsigned long)__va(__pa_symbol(ip));
return probe_kernel_write((void *)ip, val, size);
}
diff --git a/arch/x86/kernel/head32.c b/arch/x86/kernel/head32.c
index 6773c918b8cc..138463a24877 100644
--- a/arch/x86/kernel/head32.c
+++ b/arch/x86/kernel/head32.c
@@ -33,20 +33,6 @@ void __init i386_start_kernel(void)
{
sanitize_boot_params(&boot_params);
- memblock_reserve(__pa_symbol(&_text),
- __pa_symbol(&__bss_stop) - __pa_symbol(&_text));
-
-#ifdef CONFIG_BLK_DEV_INITRD
- /* Reserve INITRD */
- if (boot_params.hdr.type_of_loader && boot_params.hdr.ramdisk_image) {
- /* Assume only end is not page aligned */
- u64 ramdisk_image = boot_params.hdr.ramdisk_image;
- u64 ramdisk_size = boot_params.hdr.ramdisk_size;
- u64 ramdisk_end = PAGE_ALIGN(ramdisk_image + ramdisk_size);
- memblock_reserve(ramdisk_image, ramdisk_end - ramdisk_image);
- }
-#endif
-
/* Call the subarch specific early setup function */
switch (boot_params.hdr.hardware_subarch) {
case X86_SUBARCH_MRST:
@@ -60,11 +46,5 @@ void __init i386_start_kernel(void)
break;
}
- /*
- * At this point everything still needed from the boot loader
- * or BIOS or kernel text should be early reserved or marked not
- * RAM in e820. All other memory is free game.
- */
-
start_kernel();
}
diff --git a/arch/x86/kernel/head64.c b/arch/x86/kernel/head64.c
index 849fc9e63c2f..c5e403f6d869 100644
--- a/arch/x86/kernel/head64.c
+++ b/arch/x86/kernel/head64.c
@@ -26,12 +26,83 @@
#include <asm/e820.h>
#include <asm/bios_ebda.h>
#include <asm/bootparam_utils.h>
+#include <asm/microcode.h>
-static void __init zap_identity_mappings(void)
+/*
+ * Manage page tables very early on.
+ */
+extern pgd_t early_level4_pgt[PTRS_PER_PGD];
+extern pmd_t early_dynamic_pgts[EARLY_DYNAMIC_PAGE_TABLES][PTRS_PER_PMD];
+static unsigned int __initdata next_early_pgt = 2;
+
+/* Wipe all early page tables except for the kernel symbol map */
+static void __init reset_early_page_tables(void)
{
- pgd_t *pgd = pgd_offset_k(0UL);
- pgd_clear(pgd);
- __flush_tlb_all();
+ unsigned long i;
+
+ for (i = 0; i < PTRS_PER_PGD-1; i++)
+ early_level4_pgt[i].pgd = 0;
+
+ next_early_pgt = 0;
+
+ write_cr3(__pa(early_level4_pgt));
+}
+
+/* Create a new PMD entry */
+int __init early_make_pgtable(unsigned long address)
+{
+ unsigned long physaddr = address - __PAGE_OFFSET;
+ unsigned long i;
+ pgdval_t pgd, *pgd_p;
+ pudval_t pud, *pud_p;
+ pmdval_t pmd, *pmd_p;
+
+ /* Invalid address or early pgt is done ? */
+ if (physaddr >= MAXMEM || read_cr3() != __pa(early_level4_pgt))
+ return -1;
+
+again:
+ pgd_p = &early_level4_pgt[pgd_index(address)].pgd;
+ pgd = *pgd_p;
+
+ /*
+ * The use of __START_KERNEL_map rather than __PAGE_OFFSET here is
+ * critical -- __PAGE_OFFSET would point us back into the dynamic
+ * range and we might end up looping forever...
+ */
+ if (pgd)
+ pud_p = (pudval_t *)((pgd & PTE_PFN_MASK) + __START_KERNEL_map - phys_base);
+ else {
+ if (next_early_pgt >= EARLY_DYNAMIC_PAGE_TABLES) {
+ reset_early_page_tables();
+ goto again;
+ }
+
+ pud_p = (pudval_t *)early_dynamic_pgts[next_early_pgt++];
+ for (i = 0; i < PTRS_PER_PUD; i++)
+ pud_p[i] = 0;
+ *pgd_p = (pgdval_t)pud_p - __START_KERNEL_map + phys_base + _KERNPG_TABLE;
+ }
+ pud_p += pud_index(address);
+ pud = *pud_p;
+
+ if (pud)
+ pmd_p = (pmdval_t *)((pud & PTE_PFN_MASK) + __START_KERNEL_map - phys_base);
+ else {
+ if (next_early_pgt >= EARLY_DYNAMIC_PAGE_TABLES) {
+ reset_early_page_tables();
+ goto again;
+ }
+
+ pmd_p = (pmdval_t *)early_dynamic_pgts[next_early_pgt++];
+ for (i = 0; i < PTRS_PER_PMD; i++)
+ pmd_p[i] = 0;
+ *pud_p = (pudval_t)pmd_p - __START_KERNEL_map + phys_base + _KERNPG_TABLE;
+ }
+ pmd = (physaddr & PMD_MASK) + (__PAGE_KERNEL_LARGE & ~_PAGE_GLOBAL);
+ pmd_p[pmd_index(address)] = pmd;
+
+ return 0;
}
/* Don't add a printk in there. printk relies on the PDA which is not initialized
@@ -42,14 +113,25 @@ static void __init clear_bss(void)
(unsigned long) __bss_stop - (unsigned long) __bss_start);
}
+static unsigned long get_cmd_line_ptr(void)
+{
+ unsigned long cmd_line_ptr = boot_params.hdr.cmd_line_ptr;
+
+ cmd_line_ptr |= (u64)boot_params.ext_cmd_line_ptr << 32;
+
+ return cmd_line_ptr;
+}
+
static void __init copy_bootdata(char *real_mode_data)
{
char * command_line;
+ unsigned long cmd_line_ptr;
memcpy(&boot_params, real_mode_data, sizeof boot_params);
sanitize_boot_params(&boot_params);
- if (boot_params.hdr.cmd_line_ptr) {
- command_line = __va(boot_params.hdr.cmd_line_ptr);
+ cmd_line_ptr = get_cmd_line_ptr();
+ if (cmd_line_ptr) {
+ command_line = __va(cmd_line_ptr);
memcpy(boot_command_line, command_line, COMMAND_LINE_SIZE);
}
}
@@ -72,54 +154,40 @@ void __init x86_64_start_kernel(char * real_mode_data)
(__START_KERNEL & PGDIR_MASK)));
BUILD_BUG_ON(__fix_to_virt(__end_of_fixed_addresses) <= MODULES_END);
+ /* Kill off the identity-map trampoline */
+ reset_early_page_tables();
+
/* clear bss before set_intr_gate with early_idt_handler */
clear_bss();
- /* Make NULL pointers segfault */
- zap_identity_mappings();
-
- max_pfn_mapped = KERNEL_IMAGE_SIZE >> PAGE_SHIFT;
-
- for (i = 0; i < NUM_EXCEPTION_VECTORS; i++) {
-#ifdef CONFIG_EARLY_PRINTK
+ for (i = 0; i < NUM_EXCEPTION_VECTORS; i++)
set_intr_gate(i, &early_idt_handlers[i]);
-#else
- set_intr_gate(i, early_idt_handler);
-#endif
- }
load_idt((const struct desc_ptr *)&idt_descr);
+ copy_bootdata(__va(real_mode_data));
+
+ /*
+ * Load microcode early on BSP.
+ */
+ load_ucode_bsp();
+
if (console_loglevel == 10)
early_printk("Kernel alive\n");
+ clear_page(init_level4_pgt);
+ /* set init_level4_pgt kernel high mapping*/
+ init_level4_pgt[511] = early_level4_pgt[511];
+
x86_64_start_reservations(real_mode_data);
}
void __init x86_64_start_reservations(char *real_mode_data)
{
- copy_bootdata(__va(real_mode_data));
-
- memblock_reserve(__pa_symbol(&_text),
- __pa_symbol(&__bss_stop) - __pa_symbol(&_text));
-
-#ifdef CONFIG_BLK_DEV_INITRD
- /* Reserve INITRD */
- if (boot_params.hdr.type_of_loader && boot_params.hdr.ramdisk_image) {
- /* Assume only end is not page aligned */
- unsigned long ramdisk_image = boot_params.hdr.ramdisk_image;
- unsigned long ramdisk_size = boot_params.hdr.ramdisk_size;
- unsigned long ramdisk_end = PAGE_ALIGN(ramdisk_image + ramdisk_size);
- memblock_reserve(ramdisk_image, ramdisk_end - ramdisk_image);
- }
-#endif
+ /* version is always not zero if it is copied */
+ if (!boot_params.hdr.version)
+ copy_bootdata(__va(real_mode_data));
reserve_ebda_region();
- /*
- * At this point everything still needed from the boot loader
- * or BIOS or kernel text should be early reserved or marked not
- * RAM in e820. All other memory is free game.
- */
-
start_kernel();
}
diff --git a/arch/x86/kernel/head_32.S b/arch/x86/kernel/head_32.S
index 3c3f58a0808f..73afd11799ca 100644
--- a/arch/x86/kernel/head_32.S
+++ b/arch/x86/kernel/head_32.S
@@ -144,6 +144,11 @@ ENTRY(startup_32)
movl %eax, pa(olpc_ofw_pgd)
#endif
+#ifdef CONFIG_MICROCODE_EARLY
+ /* Early load ucode on BSP. */
+ call load_ucode_bsp
+#endif
+
/*
* Initialize page tables. This creates a PDE and a set of page
* tables, which are located immediately beyond __brk_base. The variable
@@ -299,6 +304,12 @@ ENTRY(startup_32_smp)
movl %eax,%ss
leal -__PAGE_OFFSET(%ecx),%esp
+#ifdef CONFIG_MICROCODE_EARLY
+ /* Early load ucode on AP. */
+ call load_ucode_ap
+#endif
+
+
default_entry:
#define CR0_STATE (X86_CR0_PE | X86_CR0_MP | X86_CR0_ET | \
X86_CR0_NE | X86_CR0_WP | X86_CR0_AM | \
diff --git a/arch/x86/kernel/head_64.S b/arch/x86/kernel/head_64.S
index 980053c4b9cc..b7de3b25adb5 100644
--- a/arch/x86/kernel/head_64.S
+++ b/arch/x86/kernel/head_64.S
@@ -47,14 +47,13 @@ L3_START_KERNEL = pud_index(__START_KERNEL_map)
.code64
.globl startup_64
startup_64:
-
/*
* At this point the CPU runs in 64bit mode CS.L = 1 CS.D = 1,
* and someone has loaded an identity mapped page table
* for us. These identity mapped page tables map all of the
* kernel pages and possibly all of memory.
*
- * %esi holds a physical pointer to real_mode_data.
+ * %rsi holds a physical pointer to real_mode_data.
*
* We come here either directly from a 64bit bootloader, or from
* arch/x86_64/boot/compressed/head.S.
@@ -66,7 +65,8 @@ startup_64:
* tables and then reload them.
*/
- /* Compute the delta between the address I am compiled to run at and the
+ /*
+ * Compute the delta between the address I am compiled to run at and the
* address I am actually running at.
*/
leaq _text(%rip), %rbp
@@ -78,45 +78,62 @@ startup_64:
testl %eax, %eax
jnz bad_address
- /* Is the address too large? */
- leaq _text(%rip), %rdx
- movq $PGDIR_SIZE, %rax
- cmpq %rax, %rdx
- jae bad_address
-
- /* Fixup the physical addresses in the page table
+ /*
+ * Is the address too large?
*/
- addq %rbp, init_level4_pgt + 0(%rip)
- addq %rbp, init_level4_pgt + (L4_PAGE_OFFSET*8)(%rip)
- addq %rbp, init_level4_pgt + (L4_START_KERNEL*8)(%rip)
+ leaq _text(%rip), %rax
+ shrq $MAX_PHYSMEM_BITS, %rax
+ jnz bad_address
- addq %rbp, level3_ident_pgt + 0(%rip)
+ /*
+ * Fixup the physical addresses in the page table
+ */
+ addq %rbp, early_level4_pgt + (L4_START_KERNEL*8)(%rip)
addq %rbp, level3_kernel_pgt + (510*8)(%rip)
addq %rbp, level3_kernel_pgt + (511*8)(%rip)
addq %rbp, level2_fixmap_pgt + (506*8)(%rip)
- /* Add an Identity mapping if I am above 1G */
+ /*
+ * Set up the identity mapping for the switchover. These
+ * entries should *NOT* have the global bit set! This also
+ * creates a bunch of nonsense entries but that is fine --
+ * it avoids problems around wraparound.
+ */
leaq _text(%rip), %rdi
- andq $PMD_PAGE_MASK, %rdi
+ leaq early_level4_pgt(%rip), %rbx
movq %rdi, %rax
- shrq $PUD_SHIFT, %rax
- andq $(PTRS_PER_PUD - 1), %rax
- jz ident_complete
+ shrq $PGDIR_SHIFT, %rax
- leaq (level2_spare_pgt - __START_KERNEL_map + _KERNPG_TABLE)(%rbp), %rdx
- leaq level3_ident_pgt(%rip), %rbx
- movq %rdx, 0(%rbx, %rax, 8)
+ leaq (4096 + _KERNPG_TABLE)(%rbx), %rdx
+ movq %rdx, 0(%rbx,%rax,8)
+ movq %rdx, 8(%rbx,%rax,8)
+ addq $4096, %rdx
movq %rdi, %rax
- shrq $PMD_SHIFT, %rax
- andq $(PTRS_PER_PMD - 1), %rax
- leaq __PAGE_KERNEL_IDENT_LARGE_EXEC(%rdi), %rdx
- leaq level2_spare_pgt(%rip), %rbx
- movq %rdx, 0(%rbx, %rax, 8)
-ident_complete:
+ shrq $PUD_SHIFT, %rax
+ andl $(PTRS_PER_PUD-1), %eax
+ movq %rdx, (4096+0)(%rbx,%rax,8)
+ movq %rdx, (4096+8)(%rbx,%rax,8)
+
+ addq $8192, %rbx
+ movq %rdi, %rax
+ shrq $PMD_SHIFT, %rdi
+ addq $(__PAGE_KERNEL_LARGE_EXEC & ~_PAGE_GLOBAL), %rax
+ leaq (_end - 1)(%rip), %rcx
+ shrq $PMD_SHIFT, %rcx
+ subq %rdi, %rcx
+ incl %ecx
+
+1:
+ andq $(PTRS_PER_PMD - 1), %rdi
+ movq %rax, (%rbx,%rdi,8)
+ incq %rdi
+ addq $PMD_SIZE, %rax
+ decl %ecx
+ jnz 1b
/*
* Fixup the kernel text+data virtual addresses. Note that
@@ -124,7 +141,6 @@ ident_complete:
* cleanup_highmap() fixes this up along with the mappings
* beyond _end.
*/
-
leaq level2_kernel_pgt(%rip), %rdi
leaq 4096(%rdi), %r8
/* See if it is a valid page table entry */
@@ -139,17 +155,14 @@ ident_complete:
/* Fixup phys_base */
addq %rbp, phys_base(%rip)
- /* Due to ENTRY(), sometimes the empty space gets filled with
- * zeros. Better take a jmp than relying on empty space being
- * filled with 0x90 (nop)
- */
- jmp secondary_startup_64
+ movq $(early_level4_pgt - __START_KERNEL_map), %rax
+ jmp 1f
ENTRY(secondary_startup_64)
/*
* At this point the CPU runs in 64bit mode CS.L = 1 CS.D = 1,
* and someone has loaded a mapped page table.
*
- * %esi holds a physical pointer to real_mode_data.
+ * %rsi holds a physical pointer to real_mode_data.
*
* We come here either from startup_64 (using physical addresses)
* or from trampoline.S (using virtual addresses).
@@ -159,12 +172,14 @@ ENTRY(secondary_startup_64)
* after the boot processor executes this code.
*/
+ movq $(init_level4_pgt - __START_KERNEL_map), %rax
+1:
+
/* Enable PAE mode and PGE */
- movl $(X86_CR4_PAE | X86_CR4_PGE), %eax
- movq %rax, %cr4
+ movl $(X86_CR4_PAE | X86_CR4_PGE), %ecx
+ movq %rcx, %cr4
/* Setup early boot stage 4 level pagetables. */
- movq $(init_level4_pgt - __START_KERNEL_map), %rax
addq phys_base(%rip), %rax
movq %rax, %cr3
@@ -196,7 +211,7 @@ ENTRY(secondary_startup_64)
movq %rax, %cr0
/* Setup a boot time stack */
- movq stack_start(%rip),%rsp
+ movq stack_start(%rip), %rsp
/* zero EFLAGS after setting rsp */
pushq $0
@@ -236,15 +251,33 @@ ENTRY(secondary_startup_64)
movl initial_gs+4(%rip),%edx
wrmsr
- /* esi is pointer to real mode structure with interesting info.
+ /* rsi is pointer to real mode structure with interesting info.
pass it to C */
- movl %esi, %edi
+ movq %rsi, %rdi
/* Finally jump to run C code and to be on real kernel address
* Since we are running on identity-mapped space we have to jump
* to the full 64bit address, this is only possible as indirect
* jump. In addition we need to ensure %cs is set so we make this
* a far return.
+ *
+ * Note: do not change to far jump indirect with 64bit offset.
+ *
+ * AMD does not support far jump indirect with 64bit offset.
+ * AMD64 Architecture Programmer's Manual, Volume 3: states only
+ * JMP FAR mem16:16 FF /5 Far jump indirect,
+ * with the target specified by a far pointer in memory.
+ * JMP FAR mem16:32 FF /5 Far jump indirect,
+ * with the target specified by a far pointer in memory.
+ *
+ * Intel64 does support 64bit offset.
+ * Software Developer Manual Vol 2: states:
+ * FF /5 JMP m16:16 Jump far, absolute indirect,
+ * address given in m16:16
+ * FF /5 JMP m16:32 Jump far, absolute indirect,
+ * address given in m16:32.
+ * REX.W + FF /5 JMP m16:64 Jump far, absolute indirect,
+ * address given in m16:64.
*/
movq initial_code(%rip),%rax
pushq $0 # fake return address to stop unwinder
@@ -270,13 +303,13 @@ ENDPROC(start_cpu0)
/* SMP bootup changes these two */
__REFDATA
- .align 8
- ENTRY(initial_code)
+ .balign 8
+ GLOBAL(initial_code)
.quad x86_64_start_kernel
- ENTRY(initial_gs)
+ GLOBAL(initial_gs)
.quad INIT_PER_CPU_VAR(irq_stack_union)
- ENTRY(stack_start)
+ GLOBAL(stack_start)
.quad init_thread_union+THREAD_SIZE-8
.word 0
__FINITDATA
@@ -284,7 +317,7 @@ ENDPROC(start_cpu0)
bad_address:
jmp bad_address
- .section ".init.text","ax"
+ __INIT
.globl early_idt_handlers
early_idt_handlers:
# 104(%rsp) %rflags
@@ -303,6 +336,7 @@ early_idt_handlers:
i = i + 1
.endr
+/* This is global to keep gas from relaxing the jumps */
ENTRY(early_idt_handler)
cld
@@ -321,14 +355,22 @@ ENTRY(early_idt_handler)
pushq %r11 # 0(%rsp)
cmpl $__KERNEL_CS,96(%rsp)
- jne 10f
+ jne 11f
+
+ cmpl $14,72(%rsp) # Page fault?
+ jnz 10f
+ GET_CR2_INTO(%rdi) # can clobber any volatile register if pv
+ call early_make_pgtable
+ andl %eax,%eax
+ jz 20f # All good
+10:
leaq 88(%rsp),%rdi # Pointer to %rip
call early_fixup_exception
andl %eax,%eax
jnz 20f # Found an exception entry
-10:
+11:
#ifdef CONFIG_EARLY_PRINTK
GET_CR2_INTO(%r9) # can clobber any volatile register if pv
movl 80(%rsp),%r8d # error code
@@ -350,7 +392,7 @@ ENTRY(early_idt_handler)
1: hlt
jmp 1b
-20: # Exception table entry found
+20: # Exception table entry found or page table generated
popq %r11
popq %r10
popq %r9
@@ -363,6 +405,9 @@ ENTRY(early_idt_handler)
addq $16,%rsp # drop vector number and error code
decl early_recursion_flag(%rip)
INTERRUPT_RETURN
+ENDPROC(early_idt_handler)
+
+ __INITDATA
.balign 4
early_recursion_flag:
@@ -374,11 +419,10 @@ early_idt_msg:
early_idt_ripmsg:
.asciz "RIP %s\n"
#endif /* CONFIG_EARLY_PRINTK */
- .previous
#define NEXT_PAGE(name) \
.balign PAGE_SIZE; \
-ENTRY(name)
+GLOBAL(name)
/* Automate the creation of 1 to 1 mapping pmd entries */
#define PMDS(START, PERM, COUNT) \
@@ -388,24 +432,37 @@ ENTRY(name)
i = i + 1 ; \
.endr
+ __INITDATA
+NEXT_PAGE(early_level4_pgt)
+ .fill 511,8,0
+ .quad level3_kernel_pgt - __START_KERNEL_map + _PAGE_TABLE
+
+NEXT_PAGE(early_dynamic_pgts)
+ .fill 512*EARLY_DYNAMIC_PAGE_TABLES,8,0
+
.data
- /*
- * This default setting generates an ident mapping at address 0x100000
- * and a mapping for the kernel that precisely maps virtual address
- * 0xffffffff80000000 to physical address 0x000000. (always using
- * 2Mbyte large pages provided by PAE mode)
- */
+
+#ifndef CONFIG_XEN
NEXT_PAGE(init_level4_pgt)
- .quad level3_ident_pgt - __START_KERNEL_map + _KERNPG_TABLE
- .org init_level4_pgt + L4_PAGE_OFFSET*8, 0
- .quad level3_ident_pgt - __START_KERNEL_map + _KERNPG_TABLE
- .org init_level4_pgt + L4_START_KERNEL*8, 0
+ .fill 512,8,0
+#else
+NEXT_PAGE(init_level4_pgt)
+ .quad level3_ident_pgt - __START_KERNEL_map + _KERNPG_TABLE
+ .org init_level4_pgt + L4_PAGE_OFFSET*8, 0
+ .quad level3_ident_pgt - __START_KERNEL_map + _KERNPG_TABLE
+ .org init_level4_pgt + L4_START_KERNEL*8, 0
/* (2^48-(2*1024*1024*1024))/(2^39) = 511 */
- .quad level3_kernel_pgt - __START_KERNEL_map + _PAGE_TABLE
+ .quad level3_kernel_pgt - __START_KERNEL_map + _PAGE_TABLE
NEXT_PAGE(level3_ident_pgt)
.quad level2_ident_pgt - __START_KERNEL_map + _KERNPG_TABLE
- .fill 511,8,0
+ .fill 511, 8, 0
+NEXT_PAGE(level2_ident_pgt)
+ /* Since I easily can, map the first 1G.
+ * Don't set NX because code runs from these pages.
+ */
+ PMDS(0, __PAGE_KERNEL_IDENT_LARGE_EXEC, PTRS_PER_PMD)
+#endif
NEXT_PAGE(level3_kernel_pgt)
.fill L3_START_KERNEL,8,0
@@ -413,21 +470,6 @@ NEXT_PAGE(level3_kernel_pgt)
.quad level2_kernel_pgt - __START_KERNEL_map + _KERNPG_TABLE
.quad level2_fixmap_pgt - __START_KERNEL_map + _PAGE_TABLE
-NEXT_PAGE(level2_fixmap_pgt)
- .fill 506,8,0
- .quad level1_fixmap_pgt - __START_KERNEL_map + _PAGE_TABLE
- /* 8MB reserved for vsyscalls + a 2MB hole = 4 + 1 entries */
- .fill 5,8,0
-
-NEXT_PAGE(level1_fixmap_pgt)
- .fill 512,8,0
-
-NEXT_PAGE(level2_ident_pgt)
- /* Since I easily can, map the first 1G.
- * Don't set NX because code runs from these pages.
- */
- PMDS(0, __PAGE_KERNEL_IDENT_LARGE_EXEC, PTRS_PER_PMD)
-
NEXT_PAGE(level2_kernel_pgt)
/*
* 512 MB kernel mapping. We spend a full page on this pagetable
@@ -442,11 +484,16 @@ NEXT_PAGE(level2_kernel_pgt)
PMDS(0, __PAGE_KERNEL_LARGE_EXEC,
KERNEL_IMAGE_SIZE/PMD_SIZE)
-NEXT_PAGE(level2_spare_pgt)
- .fill 512, 8, 0
+NEXT_PAGE(level2_fixmap_pgt)
+ .fill 506,8,0
+ .quad level1_fixmap_pgt - __START_KERNEL_map + _PAGE_TABLE
+ /* 8MB reserved for vsyscalls + a 2MB hole = 4 + 1 entries */
+ .fill 5,8,0
+
+NEXT_PAGE(level1_fixmap_pgt)
+ .fill 512,8,0
#undef PMDS
-#undef NEXT_PAGE
.data
.align 16
@@ -472,6 +519,5 @@ ENTRY(nmi_idt_table)
.skip IDT_ENTRIES * 16
__PAGE_ALIGNED_BSS
- .align PAGE_SIZE
-ENTRY(empty_zero_page)
+NEXT_PAGE(empty_zero_page)
.skip PAGE_SIZE
diff --git a/arch/x86/kernel/i386_ksyms_32.c b/arch/x86/kernel/i386_ksyms_32.c
index 9c3bd4a2050e..0fa69127209a 100644
--- a/arch/x86/kernel/i386_ksyms_32.c
+++ b/arch/x86/kernel/i386_ksyms_32.c
@@ -26,6 +26,7 @@ EXPORT_SYMBOL(csum_partial_copy_generic);
EXPORT_SYMBOL(__get_user_1);
EXPORT_SYMBOL(__get_user_2);
EXPORT_SYMBOL(__get_user_4);
+EXPORT_SYMBOL(__get_user_8);
EXPORT_SYMBOL(__put_user_1);
EXPORT_SYMBOL(__put_user_2);
diff --git a/arch/x86/kernel/ioport.c b/arch/x86/kernel/ioport.c
index 8c968974253d..4ddaf66ea35f 100644
--- a/arch/x86/kernel/ioport.c
+++ b/arch/x86/kernel/ioport.c
@@ -93,8 +93,9 @@ asmlinkage long sys_ioperm(unsigned long from, unsigned long num, int turn_on)
* on system-call entry - see also fork() and the signal handling
* code.
*/
-long sys_iopl(unsigned int level, struct pt_regs *regs)
+SYSCALL_DEFINE1(iopl, unsigned int, level)
{
+ struct pt_regs *regs = current_pt_regs();
unsigned int old = (regs->flags >> 12) & 3;
struct thread_struct *t = &current->thread;
diff --git a/arch/x86/kernel/kvm.c b/arch/x86/kernel/kvm.c
index 2b44ea5f269d..b686a904d7c3 100644
--- a/arch/x86/kernel/kvm.c
+++ b/arch/x86/kernel/kvm.c
@@ -297,9 +297,9 @@ static void kvm_register_steal_time(void)
memset(st, 0, sizeof(*st));
- wrmsrl(MSR_KVM_STEAL_TIME, (__pa(st) | KVM_MSR_ENABLED));
- printk(KERN_INFO "kvm-stealtime: cpu %d, msr %lx\n",
- cpu, __pa(st));
+ wrmsrl(MSR_KVM_STEAL_TIME, (slow_virt_to_phys(st) | KVM_MSR_ENABLED));
+ pr_info("kvm-stealtime: cpu %d, msr %llx\n",
+ cpu, (unsigned long long) slow_virt_to_phys(st));
}
static DEFINE_PER_CPU(unsigned long, kvm_apic_eoi) = KVM_PV_EOI_DISABLED;
@@ -324,7 +324,7 @@ void __cpuinit kvm_guest_cpu_init(void)
return;
if (kvm_para_has_feature(KVM_FEATURE_ASYNC_PF) && kvmapf) {
- u64 pa = __pa(&__get_cpu_var(apf_reason));
+ u64 pa = slow_virt_to_phys(&__get_cpu_var(apf_reason));
#ifdef CONFIG_PREEMPT
pa |= KVM_ASYNC_PF_SEND_ALWAYS;
@@ -340,7 +340,8 @@ void __cpuinit kvm_guest_cpu_init(void)
/* Size alignment is implied but just to make it explicit. */
BUILD_BUG_ON(__alignof__(kvm_apic_eoi) < 4);
__get_cpu_var(kvm_apic_eoi) = 0;
- pa = __pa(&__get_cpu_var(kvm_apic_eoi)) | KVM_MSR_ENABLED;
+ pa = slow_virt_to_phys(&__get_cpu_var(kvm_apic_eoi))
+ | KVM_MSR_ENABLED;
wrmsrl(MSR_KVM_PV_EOI_EN, pa);
}
diff --git a/arch/x86/kernel/kvmclock.c b/arch/x86/kernel/kvmclock.c
index 220a360010f8..0732f0089a3d 100644
--- a/arch/x86/kernel/kvmclock.c
+++ b/arch/x86/kernel/kvmclock.c
@@ -162,8 +162,8 @@ int kvm_register_clock(char *txt)
int low, high, ret;
struct pvclock_vcpu_time_info *src = &hv_clock[cpu].pvti;
- low = (int)__pa(src) | 1;
- high = ((u64)__pa(src) >> 32);
+ low = (int)slow_virt_to_phys(src) | 1;
+ high = ((u64)slow_virt_to_phys(src) >> 32);
ret = native_write_msr_safe(msr_kvm_system_time, low, high);
printk(KERN_INFO "kvm-clock: cpu %d, msr %x:%x, %s\n",
cpu, high, low, txt);
@@ -218,6 +218,9 @@ static void kvm_shutdown(void)
void __init kvmclock_init(void)
{
unsigned long mem;
+ int size;
+
+ size = PAGE_ALIGN(sizeof(struct pvclock_vsyscall_time_info)*NR_CPUS);
if (!kvm_para_available())
return;
@@ -231,16 +234,14 @@ void __init kvmclock_init(void)
printk(KERN_INFO "kvm-clock: Using msrs %x and %x",
msr_kvm_system_time, msr_kvm_wall_clock);
- mem = memblock_alloc(sizeof(struct pvclock_vsyscall_time_info)*NR_CPUS,
- PAGE_SIZE);
+ mem = memblock_alloc(size, PAGE_SIZE);
if (!mem)
return;
hv_clock = __va(mem);
if (kvm_register_clock("boot clock")) {
hv_clock = NULL;
- memblock_free(mem,
- sizeof(struct pvclock_vsyscall_time_info)*NR_CPUS);
+ memblock_free(mem, size);
return;
}
pv_time_ops.sched_clock = kvm_clock_read;
@@ -275,7 +276,7 @@ int __init kvm_setup_vsyscall_timeinfo(void)
struct pvclock_vcpu_time_info *vcpu_time;
unsigned int size;
- size = sizeof(struct pvclock_vsyscall_time_info)*NR_CPUS;
+ size = PAGE_ALIGN(sizeof(struct pvclock_vsyscall_time_info)*NR_CPUS);
preempt_disable();
cpu = smp_processor_id();
diff --git a/arch/x86/kernel/machine_kexec_64.c b/arch/x86/kernel/machine_kexec_64.c
index b3ea9db39db6..4eabc160696f 100644
--- a/arch/x86/kernel/machine_kexec_64.c
+++ b/arch/x86/kernel/machine_kexec_64.c
@@ -16,125 +16,12 @@
#include <linux/io.h>
#include <linux/suspend.h>
+#include <asm/init.h>
#include <asm/pgtable.h>
#include <asm/tlbflush.h>
#include <asm/mmu_context.h>
#include <asm/debugreg.h>
-static int init_one_level2_page(struct kimage *image, pgd_t *pgd,
- unsigned long addr)
-{
- pud_t *pud;
- pmd_t *pmd;
- struct page *page;
- int result = -ENOMEM;
-
- addr &= PMD_MASK;
- pgd += pgd_index(addr);
- if (!pgd_present(*pgd)) {
- page = kimage_alloc_control_pages(image, 0);
- if (!page)
- goto out;
- pud = (pud_t *)page_address(page);
- clear_page(pud);
- set_pgd(pgd, __pgd(__pa(pud) | _KERNPG_TABLE));
- }
- pud = pud_offset(pgd, addr);
- if (!pud_present(*pud)) {
- page = kimage_alloc_control_pages(image, 0);
- if (!page)
- goto out;
- pmd = (pmd_t *)page_address(page);
- clear_page(pmd);
- set_pud(pud, __pud(__pa(pmd) | _KERNPG_TABLE));
- }
- pmd = pmd_offset(pud, addr);
- if (!pmd_present(*pmd))
- set_pmd(pmd, __pmd(addr | __PAGE_KERNEL_LARGE_EXEC));
- result = 0;
-out:
- return result;
-}
-
-static void init_level2_page(pmd_t *level2p, unsigned long addr)
-{
- unsigned long end_addr;
-
- addr &= PAGE_MASK;
- end_addr = addr + PUD_SIZE;
- while (addr < end_addr) {
- set_pmd(level2p++, __pmd(addr | __PAGE_KERNEL_LARGE_EXEC));
- addr += PMD_SIZE;
- }
-}
-
-static int init_level3_page(struct kimage *image, pud_t *level3p,
- unsigned long addr, unsigned long last_addr)
-{
- unsigned long end_addr;
- int result;
-
- result = 0;
- addr &= PAGE_MASK;
- end_addr = addr + PGDIR_SIZE;
- while ((addr < last_addr) && (addr < end_addr)) {
- struct page *page;
- pmd_t *level2p;
-
- page = kimage_alloc_control_pages(image, 0);
- if (!page) {
- result = -ENOMEM;
- goto out;
- }
- level2p = (pmd_t *)page_address(page);
- init_level2_page(level2p, addr);
- set_pud(level3p++, __pud(__pa(level2p) | _KERNPG_TABLE));
- addr += PUD_SIZE;
- }
- /* clear the unused entries */
- while (addr < end_addr) {
- pud_clear(level3p++);
- addr += PUD_SIZE;
- }
-out:
- return result;
-}
-
-
-static int init_level4_page(struct kimage *image, pgd_t *level4p,
- unsigned long addr, unsigned long last_addr)
-{
- unsigned long end_addr;
- int result;
-
- result = 0;
- addr &= PAGE_MASK;
- end_addr = addr + (PTRS_PER_PGD * PGDIR_SIZE);
- while ((addr < last_addr) && (addr < end_addr)) {
- struct page *page;
- pud_t *level3p;
-
- page = kimage_alloc_control_pages(image, 0);
- if (!page) {
- result = -ENOMEM;
- goto out;
- }
- level3p = (pud_t *)page_address(page);
- result = init_level3_page(image, level3p, addr, last_addr);
- if (result)
- goto out;
- set_pgd(level4p++, __pgd(__pa(level3p) | _KERNPG_TABLE));
- addr += PGDIR_SIZE;
- }
- /* clear the unused entries */
- while (addr < end_addr) {
- pgd_clear(level4p++);
- addr += PGDIR_SIZE;
- }
-out:
- return result;
-}
-
static void free_transition_pgtable(struct kimage *image)
{
free_page((unsigned long)image->arch.pud);
@@ -184,22 +71,62 @@ err:
return result;
}
+static void *alloc_pgt_page(void *data)
+{
+ struct kimage *image = (struct kimage *)data;
+ struct page *page;
+ void *p = NULL;
+
+ page = kimage_alloc_control_pages(image, 0);
+ if (page) {
+ p = page_address(page);
+ clear_page(p);
+ }
+
+ return p;
+}
static int init_pgtable(struct kimage *image, unsigned long start_pgtable)
{
+ struct x86_mapping_info info = {
+ .alloc_pgt_page = alloc_pgt_page,
+ .context = image,
+ .pmd_flag = __PAGE_KERNEL_LARGE_EXEC,
+ };
+ unsigned long mstart, mend;
pgd_t *level4p;
int result;
+ int i;
+
level4p = (pgd_t *)__va(start_pgtable);
- result = init_level4_page(image, level4p, 0, max_pfn << PAGE_SHIFT);
- if (result)
- return result;
+ clear_page(level4p);
+ for (i = 0; i < nr_pfn_mapped; i++) {
+ mstart = pfn_mapped[i].start << PAGE_SHIFT;
+ mend = pfn_mapped[i].end << PAGE_SHIFT;
+
+ result = kernel_ident_mapping_init(&info,
+ level4p, mstart, mend);
+ if (result)
+ return result;
+ }
+
/*
- * image->start may be outside 0 ~ max_pfn, for example when
- * jump back to original kernel from kexeced kernel
+ * segments's mem ranges could be outside 0 ~ max_pfn,
+ * for example when jump back to original kernel from kexeced kernel.
+ * or first kernel is booted with user mem map, and second kernel
+ * could be loaded out of that range.
*/
- result = init_one_level2_page(image, level4p, image->start);
- if (result)
- return result;
+ for (i = 0; i < image->nr_segments; i++) {
+ mstart = image->segment[i].mem;
+ mend = mstart + image->segment[i].memsz;
+
+ result = kernel_ident_mapping_init(&info,
+ level4p, mstart, mend);
+
+ if (result)
+ return result;
+ }
+
return init_transition_pgtable(image, level4p);
}
diff --git a/arch/x86/kernel/microcode_core.c b/arch/x86/kernel/microcode_core.c
index 3a04b224d0c0..22db92bbdf1a 100644
--- a/arch/x86/kernel/microcode_core.c
+++ b/arch/x86/kernel/microcode_core.c
@@ -364,10 +364,7 @@ static struct attribute_group mc_attr_group = {
static void microcode_fini_cpu(int cpu)
{
- struct ucode_cpu_info *uci = ucode_cpu_info + cpu;
-
microcode_ops->microcode_fini_cpu(cpu);
- uci->valid = 0;
}
static enum ucode_state microcode_resume_cpu(int cpu)
@@ -383,6 +380,10 @@ static enum ucode_state microcode_resume_cpu(int cpu)
static enum ucode_state microcode_init_cpu(int cpu, bool refresh_fw)
{
enum ucode_state ustate;
+ struct ucode_cpu_info *uci = ucode_cpu_info + cpu;
+
+ if (uci && uci->valid)
+ return UCODE_OK;
if (collect_cpu_info(cpu))
return UCODE_ERROR;
diff --git a/arch/x86/kernel/microcode_core_early.c b/arch/x86/kernel/microcode_core_early.c
new file mode 100644
index 000000000000..577db8417d15
--- /dev/null
+++ b/arch/x86/kernel/microcode_core_early.c
@@ -0,0 +1,76 @@
+/*
+ * X86 CPU microcode early update for Linux
+ *
+ * Copyright (C) 2012 Fenghua Yu <fenghua.yu@intel.com>
+ * H Peter Anvin" <hpa@zytor.com>
+ *
+ * This driver allows to early upgrade microcode on Intel processors
+ * belonging to IA-32 family - PentiumPro, Pentium II,
+ * Pentium III, Xeon, Pentium 4, etc.
+ *
+ * Reference: Section 9.11 of Volume 3, IA-32 Intel Architecture
+ * Software Developer's Manual.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ */
+#include <linux/module.h>
+#include <asm/microcode_intel.h>
+#include <asm/processor.h>
+
+#define QCHAR(a, b, c, d) ((a) + ((b) << 8) + ((c) << 16) + ((d) << 24))
+#define CPUID_INTEL1 QCHAR('G', 'e', 'n', 'u')
+#define CPUID_INTEL2 QCHAR('i', 'n', 'e', 'I')
+#define CPUID_INTEL3 QCHAR('n', 't', 'e', 'l')
+#define CPUID_AMD1 QCHAR('A', 'u', 't', 'h')
+#define CPUID_AMD2 QCHAR('e', 'n', 't', 'i')
+#define CPUID_AMD3 QCHAR('c', 'A', 'M', 'D')
+
+#define CPUID_IS(a, b, c, ebx, ecx, edx) \
+ (!((ebx ^ (a))|(edx ^ (b))|(ecx ^ (c))))
+
+/*
+ * In early loading microcode phase on BSP, boot_cpu_data is not set up yet.
+ * x86_vendor() gets vendor id for BSP.
+ *
+ * In 32 bit AP case, accessing boot_cpu_data needs linear address. To simplify
+ * coding, we still use x86_vendor() to get vendor id for AP.
+ *
+ * x86_vendor() gets vendor information directly through cpuid.
+ */
+static int __cpuinit x86_vendor(void)
+{
+ u32 eax = 0x00000000;
+ u32 ebx, ecx = 0, edx;
+
+ if (!have_cpuid_p())
+ return X86_VENDOR_UNKNOWN;
+
+ native_cpuid(&eax, &ebx, &ecx, &edx);
+
+ if (CPUID_IS(CPUID_INTEL1, CPUID_INTEL2, CPUID_INTEL3, ebx, ecx, edx))
+ return X86_VENDOR_INTEL;
+
+ if (CPUID_IS(CPUID_AMD1, CPUID_AMD2, CPUID_AMD3, ebx, ecx, edx))
+ return X86_VENDOR_AMD;
+
+ return X86_VENDOR_UNKNOWN;
+}
+
+void __init load_ucode_bsp(void)
+{
+ int vendor = x86_vendor();
+
+ if (vendor == X86_VENDOR_INTEL)
+ load_ucode_intel_bsp();
+}
+
+void __cpuinit load_ucode_ap(void)
+{
+ int vendor = x86_vendor();
+
+ if (vendor == X86_VENDOR_INTEL)
+ load_ucode_intel_ap();
+}
diff --git a/arch/x86/kernel/microcode_intel.c b/arch/x86/kernel/microcode_intel.c
index 3544aed39338..5fb2cebf556b 100644
--- a/arch/x86/kernel/microcode_intel.c
+++ b/arch/x86/kernel/microcode_intel.c
@@ -79,7 +79,7 @@
#include <linux/module.h>
#include <linux/vmalloc.h>
-#include <asm/microcode.h>
+#include <asm/microcode_intel.h>
#include <asm/processor.h>
#include <asm/msr.h>
@@ -87,59 +87,6 @@ MODULE_DESCRIPTION("Microcode Update Driver");
MODULE_AUTHOR("Tigran Aivazian <tigran@aivazian.fsnet.co.uk>");
MODULE_LICENSE("GPL");
-struct microcode_header_intel {
- unsigned int hdrver;
- unsigned int rev;
- unsigned int date;
- unsigned int sig;
- unsigned int cksum;
- unsigned int ldrver;
- unsigned int pf;
- unsigned int datasize;
- unsigned int totalsize;
- unsigned int reserved[3];
-};
-
-struct microcode_intel {
- struct microcode_header_intel hdr;
- unsigned int bits[0];
-};
-
-/* microcode format is extended from prescott processors */
-struct extended_signature {
- unsigned int sig;
- unsigned int pf;
- unsigned int cksum;
-};
-
-struct extended_sigtable {
- unsigned int count;
- unsigned int cksum;
- unsigned int reserved[3];
- struct extended_signature sigs[0];
-};
-
-#define DEFAULT_UCODE_DATASIZE (2000)
-#define MC_HEADER_SIZE (sizeof(struct microcode_header_intel))
-#define DEFAULT_UCODE_TOTALSIZE (DEFAULT_UCODE_DATASIZE + MC_HEADER_SIZE)
-#define EXT_HEADER_SIZE (sizeof(struct extended_sigtable))
-#define EXT_SIGNATURE_SIZE (sizeof(struct extended_signature))
-#define DWSIZE (sizeof(u32))
-
-#define get_totalsize(mc) \
- (((struct microcode_intel *)mc)->hdr.totalsize ? \
- ((struct microcode_intel *)mc)->hdr.totalsize : \
- DEFAULT_UCODE_TOTALSIZE)
-
-#define get_datasize(mc) \
- (((struct microcode_intel *)mc)->hdr.datasize ? \
- ((struct microcode_intel *)mc)->hdr.datasize : DEFAULT_UCODE_DATASIZE)
-
-#define sigmatch(s1, s2, p1, p2) \
- (((s1) == (s2)) && (((p1) & (p2)) || (((p1) == 0) && ((p2) == 0))))
-
-#define exttable_size(et) ((et)->count * EXT_SIGNATURE_SIZE + EXT_HEADER_SIZE)
-
static int collect_cpu_info(int cpu_num, struct cpu_signature *csig)
{
struct cpuinfo_x86 *c = &cpu_data(cpu_num);
@@ -162,128 +109,25 @@ static int collect_cpu_info(int cpu_num, struct cpu_signature *csig)
return 0;
}
-static inline int update_match_cpu(struct cpu_signature *csig, int sig, int pf)
-{
- return (!sigmatch(sig, csig->sig, pf, csig->pf)) ? 0 : 1;
-}
-
-static inline int
-update_match_revision(struct microcode_header_intel *mc_header, int rev)
-{
- return (mc_header->rev <= rev) ? 0 : 1;
-}
-
-static int microcode_sanity_check(void *mc)
-{
- unsigned long total_size, data_size, ext_table_size;
- struct microcode_header_intel *mc_header = mc;
- struct extended_sigtable *ext_header = NULL;
- int sum, orig_sum, ext_sigcount = 0, i;
- struct extended_signature *ext_sig;
-
- total_size = get_totalsize(mc_header);
- data_size = get_datasize(mc_header);
-
- if (data_size + MC_HEADER_SIZE > total_size) {
- pr_err("error! Bad data size in microcode data file\n");
- return -EINVAL;
- }
-
- if (mc_header->ldrver != 1 || mc_header->hdrver != 1) {
- pr_err("error! Unknown microcode update format\n");
- return -EINVAL;
- }
- ext_table_size = total_size - (MC_HEADER_SIZE + data_size);
- if (ext_table_size) {
- if ((ext_table_size < EXT_HEADER_SIZE)
- || ((ext_table_size - EXT_HEADER_SIZE) % EXT_SIGNATURE_SIZE)) {
- pr_err("error! Small exttable size in microcode data file\n");
- return -EINVAL;
- }
- ext_header = mc + MC_HEADER_SIZE + data_size;
- if (ext_table_size != exttable_size(ext_header)) {
- pr_err("error! Bad exttable size in microcode data file\n");
- return -EFAULT;
- }
- ext_sigcount = ext_header->count;
- }
-
- /* check extended table checksum */
- if (ext_table_size) {
- int ext_table_sum = 0;
- int *ext_tablep = (int *)ext_header;
-
- i = ext_table_size / DWSIZE;
- while (i--)
- ext_table_sum += ext_tablep[i];
- if (ext_table_sum) {
- pr_warning("aborting, bad extended signature table checksum\n");
- return -EINVAL;
- }
- }
-
- /* calculate the checksum */
- orig_sum = 0;
- i = (MC_HEADER_SIZE + data_size) / DWSIZE;
- while (i--)
- orig_sum += ((int *)mc)[i];
- if (orig_sum) {
- pr_err("aborting, bad checksum\n");
- return -EINVAL;
- }
- if (!ext_table_size)
- return 0;
- /* check extended signature checksum */
- for (i = 0; i < ext_sigcount; i++) {
- ext_sig = (void *)ext_header + EXT_HEADER_SIZE +
- EXT_SIGNATURE_SIZE * i;
- sum = orig_sum
- - (mc_header->sig + mc_header->pf + mc_header->cksum)
- + (ext_sig->sig + ext_sig->pf + ext_sig->cksum);
- if (sum) {
- pr_err("aborting, bad checksum\n");
- return -EINVAL;
- }
- }
- return 0;
-}
-
/*
* return 0 - no update found
* return 1 - found update
*/
-static int
-get_matching_microcode(struct cpu_signature *cpu_sig, void *mc, int rev)
+static int get_matching_mc(struct microcode_intel *mc_intel, int cpu)
{
- struct microcode_header_intel *mc_header = mc;
- struct extended_sigtable *ext_header;
- unsigned long total_size = get_totalsize(mc_header);
- int ext_sigcount, i;
- struct extended_signature *ext_sig;
-
- if (!update_match_revision(mc_header, rev))
- return 0;
-
- if (update_match_cpu(cpu_sig, mc_header->sig, mc_header->pf))
- return 1;
+ struct cpu_signature cpu_sig;
+ unsigned int csig, cpf, crev;
- /* Look for ext. headers: */
- if (total_size <= get_datasize(mc_header) + MC_HEADER_SIZE)
- return 0;
+ collect_cpu_info(cpu, &cpu_sig);
- ext_header = mc + get_datasize(mc_header) + MC_HEADER_SIZE;
- ext_sigcount = ext_header->count;
- ext_sig = (void *)ext_header + EXT_HEADER_SIZE;
+ csig = cpu_sig.sig;
+ cpf = cpu_sig.pf;
+ crev = cpu_sig.rev;
- for (i = 0; i < ext_sigcount; i++) {
- if (update_match_cpu(cpu_sig, ext_sig->sig, ext_sig->pf))
- return 1;
- ext_sig++;
- }
- return 0;
+ return get_matching_microcode(csig, cpf, mc_intel, crev);
}
-static int apply_microcode(int cpu)
+int apply_microcode(int cpu)
{
struct microcode_intel *mc_intel;
struct ucode_cpu_info *uci;
@@ -300,6 +144,14 @@ static int apply_microcode(int cpu)
if (mc_intel == NULL)
return 0;
+ /*
+ * Microcode on this CPU could be updated earlier. Only apply the
+ * microcode patch in mc_intel when it is newer than the one on this
+ * CPU.
+ */
+ if (get_matching_mc(mc_intel, cpu) == 0)
+ return 0;
+
/* write microcode via MSR 0x79 */
wrmsr(MSR_IA32_UCODE_WRITE,
(unsigned long) mc_intel->bits,
@@ -338,6 +190,7 @@ static enum ucode_state generic_load_microcode(int cpu, void *data, size_t size,
unsigned int leftover = size;
enum ucode_state state = UCODE_OK;
unsigned int curr_mc_size = 0;
+ unsigned int csig, cpf;
while (leftover) {
struct microcode_header_intel mc_header;
@@ -362,11 +215,13 @@ static enum ucode_state generic_load_microcode(int cpu, void *data, size_t size,
}
if (get_ucode_data(mc, ucode_ptr, mc_size) ||
- microcode_sanity_check(mc) < 0) {
+ microcode_sanity_check(mc, 1) < 0) {
break;
}
- if (get_matching_microcode(&uci->cpu_sig, mc, new_rev)) {
+ csig = uci->cpu_sig.sig;
+ cpf = uci->cpu_sig.pf;
+ if (get_matching_microcode(csig, cpf, mc, new_rev)) {
vfree(new_mc);
new_rev = mc_header.rev;
new_mc = mc;
@@ -393,6 +248,13 @@ static enum ucode_state generic_load_microcode(int cpu, void *data, size_t size,
vfree(uci->mc);
uci->mc = (struct microcode_intel *)new_mc;
+ /*
+ * If early loading microcode is supported, save this mc into
+ * permanent memory. So it will be loaded early when a CPU is hot added
+ * or resumes.
+ */
+ save_mc_for_early(new_mc);
+
pr_debug("CPU%d found a matching microcode update with version 0x%x (current=0x%x)\n",
cpu, new_rev, uci->cpu_sig.rev);
out:
diff --git a/arch/x86/kernel/microcode_intel_early.c b/arch/x86/kernel/microcode_intel_early.c
new file mode 100644
index 000000000000..7890bc838952
--- /dev/null
+++ b/arch/x86/kernel/microcode_intel_early.c
@@ -0,0 +1,796 @@
+/*
+ * Intel CPU microcode early update for Linux
+ *
+ * Copyright (C) 2012 Fenghua Yu <fenghua.yu@intel.com>
+ * H Peter Anvin" <hpa@zytor.com>
+ *
+ * This allows to early upgrade microcode on Intel processors
+ * belonging to IA-32 family - PentiumPro, Pentium II,
+ * Pentium III, Xeon, Pentium 4, etc.
+ *
+ * Reference: Section 9.11 of Volume 3, IA-32 Intel Architecture
+ * Software Developer's Manual.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ */
+#include <linux/module.h>
+#include <linux/mm.h>
+#include <linux/slab.h>
+#include <linux/earlycpio.h>
+#include <linux/initrd.h>
+#include <linux/cpu.h>
+#include <asm/msr.h>
+#include <asm/microcode_intel.h>
+#include <asm/processor.h>
+#include <asm/tlbflush.h>
+#include <asm/setup.h>
+
+unsigned long mc_saved_in_initrd[MAX_UCODE_COUNT];
+struct mc_saved_data {
+ unsigned int mc_saved_count;
+ struct microcode_intel **mc_saved;
+} mc_saved_data;
+
+static enum ucode_state __cpuinit
+generic_load_microcode_early(struct microcode_intel **mc_saved_p,
+ unsigned int mc_saved_count,
+ struct ucode_cpu_info *uci)
+{
+ struct microcode_intel *ucode_ptr, *new_mc = NULL;
+ int new_rev = uci->cpu_sig.rev;
+ enum ucode_state state = UCODE_OK;
+ unsigned int mc_size;
+ struct microcode_header_intel *mc_header;
+ unsigned int csig = uci->cpu_sig.sig;
+ unsigned int cpf = uci->cpu_sig.pf;
+ int i;
+
+ for (i = 0; i < mc_saved_count; i++) {
+ ucode_ptr = mc_saved_p[i];
+
+ mc_header = (struct microcode_header_intel *)ucode_ptr;
+ mc_size = get_totalsize(mc_header);
+ if (get_matching_microcode(csig, cpf, ucode_ptr, new_rev)) {
+ new_rev = mc_header->rev;
+ new_mc = ucode_ptr;
+ }
+ }
+
+ if (!new_mc) {
+ state = UCODE_NFOUND;
+ goto out;
+ }
+
+ uci->mc = (struct microcode_intel *)new_mc;
+out:
+ return state;
+}
+
+static void __cpuinit
+microcode_pointer(struct microcode_intel **mc_saved,
+ unsigned long *mc_saved_in_initrd,
+ unsigned long initrd_start, int mc_saved_count)
+{
+ int i;
+
+ for (i = 0; i < mc_saved_count; i++)
+ mc_saved[i] = (struct microcode_intel *)
+ (mc_saved_in_initrd[i] + initrd_start);
+}
+
+#ifdef CONFIG_X86_32
+static void __cpuinit
+microcode_phys(struct microcode_intel **mc_saved_tmp,
+ struct mc_saved_data *mc_saved_data)
+{
+ int i;
+ struct microcode_intel ***mc_saved;
+
+ mc_saved = (struct microcode_intel ***)
+ __pa_symbol(&mc_saved_data->mc_saved);
+ for (i = 0; i < mc_saved_data->mc_saved_count; i++) {
+ struct microcode_intel *p;
+
+ p = *(struct microcode_intel **)
+ __pa(mc_saved_data->mc_saved + i);
+ mc_saved_tmp[i] = (struct microcode_intel *)__pa(p);
+ }
+}
+#endif
+
+static enum ucode_state __cpuinit
+load_microcode(struct mc_saved_data *mc_saved_data,
+ unsigned long *mc_saved_in_initrd,
+ unsigned long initrd_start,
+ struct ucode_cpu_info *uci)
+{
+ struct microcode_intel *mc_saved_tmp[MAX_UCODE_COUNT];
+ unsigned int count = mc_saved_data->mc_saved_count;
+
+ if (!mc_saved_data->mc_saved) {
+ microcode_pointer(mc_saved_tmp, mc_saved_in_initrd,
+ initrd_start, count);
+
+ return generic_load_microcode_early(mc_saved_tmp, count, uci);
+ } else {
+#ifdef CONFIG_X86_32
+ microcode_phys(mc_saved_tmp, mc_saved_data);
+ return generic_load_microcode_early(mc_saved_tmp, count, uci);
+#else
+ return generic_load_microcode_early(mc_saved_data->mc_saved,
+ count, uci);
+#endif
+ }
+}
+
+static u8 get_x86_family(unsigned long sig)
+{
+ u8 x86;
+
+ x86 = (sig >> 8) & 0xf;
+
+ if (x86 == 0xf)
+ x86 += (sig >> 20) & 0xff;
+
+ return x86;
+}
+
+static u8 get_x86_model(unsigned long sig)
+{
+ u8 x86, x86_model;
+
+ x86 = get_x86_family(sig);
+ x86_model = (sig >> 4) & 0xf;
+
+ if (x86 == 0x6 || x86 == 0xf)
+ x86_model += ((sig >> 16) & 0xf) << 4;
+
+ return x86_model;
+}
+
+/*
+ * Given CPU signature and a microcode patch, this function finds if the
+ * microcode patch has matching family and model with the CPU.
+ */
+static enum ucode_state
+matching_model_microcode(struct microcode_header_intel *mc_header,
+ unsigned long sig)
+{
+ u8 x86, x86_model;
+ u8 x86_ucode, x86_model_ucode;
+ struct extended_sigtable *ext_header;
+ unsigned long total_size = get_totalsize(mc_header);
+ unsigned long data_size = get_datasize(mc_header);
+ int ext_sigcount, i;
+ struct extended_signature *ext_sig;
+
+ x86 = get_x86_family(sig);
+ x86_model = get_x86_model(sig);
+
+ x86_ucode = get_x86_family(mc_header->sig);
+ x86_model_ucode = get_x86_model(mc_header->sig);
+
+ if (x86 == x86_ucode && x86_model == x86_model_ucode)
+ return UCODE_OK;
+
+ /* Look for ext. headers: */
+ if (total_size <= data_size + MC_HEADER_SIZE)
+ return UCODE_NFOUND;
+
+ ext_header = (struct extended_sigtable *)
+ mc_header + data_size + MC_HEADER_SIZE;
+ ext_sigcount = ext_header->count;
+ ext_sig = (void *)ext_header + EXT_HEADER_SIZE;
+
+ for (i = 0; i < ext_sigcount; i++) {
+ x86_ucode = get_x86_family(ext_sig->sig);
+ x86_model_ucode = get_x86_model(ext_sig->sig);
+
+ if (x86 == x86_ucode && x86_model == x86_model_ucode)
+ return UCODE_OK;
+
+ ext_sig++;
+ }
+
+ return UCODE_NFOUND;
+}
+
+static int
+save_microcode(struct mc_saved_data *mc_saved_data,
+ struct microcode_intel **mc_saved_src,
+ unsigned int mc_saved_count)
+{
+ int i, j;
+ struct microcode_intel **mc_saved_p;
+ int ret;
+
+ if (!mc_saved_count)
+ return -EINVAL;
+
+ /*
+ * Copy new microcode data.
+ */
+ mc_saved_p = kmalloc(mc_saved_count*sizeof(struct microcode_intel *),
+ GFP_KERNEL);
+ if (!mc_saved_p)
+ return -ENOMEM;
+
+ for (i = 0; i < mc_saved_count; i++) {
+ struct microcode_intel *mc = mc_saved_src[i];
+ struct microcode_header_intel *mc_header = &mc->hdr;
+ unsigned long mc_size = get_totalsize(mc_header);
+ mc_saved_p[i] = kmalloc(mc_size, GFP_KERNEL);
+ if (!mc_saved_p[i]) {
+ ret = -ENOMEM;
+ goto err;
+ }
+ if (!mc_saved_src[i]) {
+ ret = -EINVAL;
+ goto err;
+ }
+ memcpy(mc_saved_p[i], mc, mc_size);
+ }
+
+ /*
+ * Point to newly saved microcode.
+ */
+ mc_saved_data->mc_saved = mc_saved_p;
+ mc_saved_data->mc_saved_count = mc_saved_count;
+
+ return 0;
+
+err:
+ for (j = 0; j <= i; j++)
+ kfree(mc_saved_p[j]);
+ kfree(mc_saved_p);
+
+ return ret;
+}
+
+/*
+ * A microcode patch in ucode_ptr is saved into mc_saved
+ * - if it has matching signature and newer revision compared to an existing
+ * patch mc_saved.
+ * - or if it is a newly discovered microcode patch.
+ *
+ * The microcode patch should have matching model with CPU.
+ */
+static void _save_mc(struct microcode_intel **mc_saved, u8 *ucode_ptr,
+ unsigned int *mc_saved_count_p)
+{
+ int i;
+ int found = 0;
+ unsigned int mc_saved_count = *mc_saved_count_p;
+ struct microcode_header_intel *mc_header;
+
+ mc_header = (struct microcode_header_intel *)ucode_ptr;
+ for (i = 0; i < mc_saved_count; i++) {
+ unsigned int sig, pf;
+ unsigned int new_rev;
+ struct microcode_header_intel *mc_saved_header =
+ (struct microcode_header_intel *)mc_saved[i];
+ sig = mc_saved_header->sig;
+ pf = mc_saved_header->pf;
+ new_rev = mc_header->rev;
+
+ if (get_matching_sig(sig, pf, ucode_ptr, new_rev)) {
+ found = 1;
+ if (update_match_revision(mc_header, new_rev)) {
+ /*
+ * Found an older ucode saved before.
+ * Replace the older one with this newer
+ * one.
+ */
+ mc_saved[i] =
+ (struct microcode_intel *)ucode_ptr;
+ break;
+ }
+ }
+ }
+ if (i >= mc_saved_count && !found)
+ /*
+ * This ucode is first time discovered in ucode file.
+ * Save it to memory.
+ */
+ mc_saved[mc_saved_count++] =
+ (struct microcode_intel *)ucode_ptr;
+
+ *mc_saved_count_p = mc_saved_count;
+}
+
+/*
+ * Get microcode matching with BSP's model. Only CPUs with the same model as
+ * BSP can stay in the platform.
+ */
+static enum ucode_state __init
+get_matching_model_microcode(int cpu, unsigned long start,
+ void *data, size_t size,
+ struct mc_saved_data *mc_saved_data,
+ unsigned long *mc_saved_in_initrd,
+ struct ucode_cpu_info *uci)
+{
+ u8 *ucode_ptr = data;
+ unsigned int leftover = size;
+ enum ucode_state state = UCODE_OK;
+ unsigned int mc_size;
+ struct microcode_header_intel *mc_header;
+ struct microcode_intel *mc_saved_tmp[MAX_UCODE_COUNT];
+ unsigned int mc_saved_count = mc_saved_data->mc_saved_count;
+ int i;
+
+ while (leftover) {
+ mc_header = (struct microcode_header_intel *)ucode_ptr;
+
+ mc_size = get_totalsize(mc_header);
+ if (!mc_size || mc_size > leftover ||
+ microcode_sanity_check(ucode_ptr, 0) < 0)
+ break;
+
+ leftover -= mc_size;
+
+ /*
+ * Since APs with same family and model as the BSP may boot in
+ * the platform, we need to find and save microcode patches
+ * with the same family and model as the BSP.
+ */
+ if (matching_model_microcode(mc_header, uci->cpu_sig.sig) !=
+ UCODE_OK) {
+ ucode_ptr += mc_size;
+ continue;
+ }
+
+ _save_mc(mc_saved_tmp, ucode_ptr, &mc_saved_count);
+
+ ucode_ptr += mc_size;
+ }
+
+ if (leftover) {
+ state = UCODE_ERROR;
+ goto out;
+ }
+
+ if (mc_saved_count == 0) {
+ state = UCODE_NFOUND;
+ goto out;
+ }
+
+ for (i = 0; i < mc_saved_count; i++)
+ mc_saved_in_initrd[i] = (unsigned long)mc_saved_tmp[i] - start;
+
+ mc_saved_data->mc_saved_count = mc_saved_count;
+out:
+ return state;
+}
+
+#define native_rdmsr(msr, val1, val2) \
+do { \
+ u64 __val = native_read_msr((msr)); \
+ (void)((val1) = (u32)__val); \
+ (void)((val2) = (u32)(__val >> 32)); \
+} while (0)
+
+#define native_wrmsr(msr, low, high) \
+ native_write_msr(msr, low, high);
+
+static int __cpuinit collect_cpu_info_early(struct ucode_cpu_info *uci)
+{
+ unsigned int val[2];
+ u8 x86, x86_model;
+ struct cpu_signature csig;
+ unsigned int eax, ebx, ecx, edx;
+
+ csig.sig = 0;
+ csig.pf = 0;
+ csig.rev = 0;
+
+ memset(uci, 0, sizeof(*uci));
+
+ eax = 0x00000001;
+ ecx = 0;
+ native_cpuid(&eax, &ebx, &ecx, &edx);
+ csig.sig = eax;
+
+ x86 = get_x86_family(csig.sig);
+ x86_model = get_x86_model(csig.sig);
+
+ if ((x86_model >= 5) || (x86 > 6)) {
+ /* get processor flags from MSR 0x17 */
+ native_rdmsr(MSR_IA32_PLATFORM_ID, val[0], val[1]);
+ csig.pf = 1 << ((val[1] >> 18) & 7);
+ }
+ native_wrmsr(MSR_IA32_UCODE_REV, 0, 0);
+
+ /* As documented in the SDM: Do a CPUID 1 here */
+ sync_core();
+
+ /* get the current revision from MSR 0x8B */
+ native_rdmsr(MSR_IA32_UCODE_REV, val[0], val[1]);
+
+ csig.rev = val[1];
+
+ uci->cpu_sig = csig;
+ uci->valid = 1;
+
+ return 0;
+}
+
+#ifdef DEBUG
+static void __ref show_saved_mc(void)
+{
+ int i, j;
+ unsigned int sig, pf, rev, total_size, data_size, date;
+ struct ucode_cpu_info uci;
+
+ if (mc_saved_data.mc_saved_count == 0) {
+ pr_debug("no micorcode data saved.\n");
+ return;
+ }
+ pr_debug("Total microcode saved: %d\n", mc_saved_data.mc_saved_count);
+
+ collect_cpu_info_early(&uci);
+
+ sig = uci.cpu_sig.sig;
+ pf = uci.cpu_sig.pf;
+ rev = uci.cpu_sig.rev;
+ pr_debug("CPU%d: sig=0x%x, pf=0x%x, rev=0x%x\n",
+ smp_processor_id(), sig, pf, rev);
+
+ for (i = 0; i < mc_saved_data.mc_saved_count; i++) {
+ struct microcode_header_intel *mc_saved_header;
+ struct extended_sigtable *ext_header;
+ int ext_sigcount;
+ struct extended_signature *ext_sig;
+
+ mc_saved_header = (struct microcode_header_intel *)
+ mc_saved_data.mc_saved[i];
+ sig = mc_saved_header->sig;
+ pf = mc_saved_header->pf;
+ rev = mc_saved_header->rev;
+ total_size = get_totalsize(mc_saved_header);
+ data_size = get_datasize(mc_saved_header);
+ date = mc_saved_header->date;
+
+ pr_debug("mc_saved[%d]: sig=0x%x, pf=0x%x, rev=0x%x, toal size=0x%x, date = %04x-%02x-%02x\n",
+ i, sig, pf, rev, total_size,
+ date & 0xffff,
+ date >> 24,
+ (date >> 16) & 0xff);
+
+ /* Look for ext. headers: */
+ if (total_size <= data_size + MC_HEADER_SIZE)
+ continue;
+
+ ext_header = (struct extended_sigtable *)
+ mc_saved_header + data_size + MC_HEADER_SIZE;
+ ext_sigcount = ext_header->count;
+ ext_sig = (void *)ext_header + EXT_HEADER_SIZE;
+
+ for (j = 0; j < ext_sigcount; j++) {
+ sig = ext_sig->sig;
+ pf = ext_sig->pf;
+
+ pr_debug("\tExtended[%d]: sig=0x%x, pf=0x%x\n",
+ j, sig, pf);
+
+ ext_sig++;
+ }
+
+ }
+}
+#else
+static inline void show_saved_mc(void)
+{
+}
+#endif
+
+#if defined(CONFIG_MICROCODE_INTEL_EARLY) && defined(CONFIG_HOTPLUG_CPU)
+/*
+ * Save this mc into mc_saved_data. So it will be loaded early when a CPU is
+ * hot added or resumes.
+ *
+ * Please make sure this mc should be a valid microcode patch before calling
+ * this function.
+ */
+int save_mc_for_early(u8 *mc)
+{
+ struct microcode_intel *mc_saved_tmp[MAX_UCODE_COUNT];
+ unsigned int mc_saved_count_init;
+ unsigned int mc_saved_count;
+ struct microcode_intel **mc_saved;
+ int ret = 0;
+ int i;
+
+ /*
+ * Hold hotplug lock so mc_saved_data is not accessed by a CPU in
+ * hotplug.
+ */
+ cpu_hotplug_driver_lock();
+
+ mc_saved_count_init = mc_saved_data.mc_saved_count;
+ mc_saved_count = mc_saved_data.mc_saved_count;
+ mc_saved = mc_saved_data.mc_saved;
+
+ if (mc_saved && mc_saved_count)
+ memcpy(mc_saved_tmp, mc_saved,
+ mc_saved_count * sizeof(struct mirocode_intel *));
+ /*
+ * Save the microcode patch mc in mc_save_tmp structure if it's a newer
+ * version.
+ */
+
+ _save_mc(mc_saved_tmp, mc, &mc_saved_count);
+
+ /*
+ * Save the mc_save_tmp in global mc_saved_data.
+ */
+ ret = save_microcode(&mc_saved_data, mc_saved_tmp, mc_saved_count);
+ if (ret) {
+ pr_err("Can not save microcode patch.\n");
+ goto out;
+ }
+
+ show_saved_mc();
+
+ /*
+ * Free old saved microcod data.
+ */
+ if (mc_saved) {
+ for (i = 0; i < mc_saved_count_init; i++)
+ kfree(mc_saved[i]);
+ kfree(mc_saved);
+ }
+
+out:
+ cpu_hotplug_driver_unlock();
+
+ return ret;
+}
+EXPORT_SYMBOL_GPL(save_mc_for_early);
+#endif
+
+static __initdata char ucode_name[] = "kernel/x86/microcode/GenuineIntel.bin";
+static __init enum ucode_state
+scan_microcode(unsigned long start, unsigned long end,
+ struct mc_saved_data *mc_saved_data,
+ unsigned long *mc_saved_in_initrd,
+ struct ucode_cpu_info *uci)
+{
+ unsigned int size = end - start + 1;
+ struct cpio_data cd;
+ long offset = 0;
+#ifdef CONFIG_X86_32
+ char *p = (char *)__pa_symbol(ucode_name);
+#else
+ char *p = ucode_name;
+#endif
+
+ cd.data = NULL;
+ cd.size = 0;
+
+ cd = find_cpio_data(p, (void *)start, size, &offset);
+ if (!cd.data)
+ return UCODE_ERROR;
+
+
+ return get_matching_model_microcode(0, start, cd.data, cd.size,
+ mc_saved_data, mc_saved_in_initrd,
+ uci);
+}
+
+/*
+ * Print ucode update info.
+ */
+static void __cpuinit
+print_ucode_info(struct ucode_cpu_info *uci, unsigned int date)
+{
+ int cpu = smp_processor_id();
+
+ pr_info("CPU%d microcode updated early to revision 0x%x, date = %04x-%02x-%02x\n",
+ cpu,
+ uci->cpu_sig.rev,
+ date & 0xffff,
+ date >> 24,
+ (date >> 16) & 0xff);
+}
+
+#ifdef CONFIG_X86_32
+
+static int delay_ucode_info;
+static int current_mc_date;
+
+/*
+ * Print early updated ucode info after printk works. This is delayed info dump.
+ */
+void __cpuinit show_ucode_info_early(void)
+{
+ struct ucode_cpu_info uci;
+
+ if (delay_ucode_info) {
+ collect_cpu_info_early(&uci);
+ print_ucode_info(&uci, current_mc_date);
+ delay_ucode_info = 0;
+ }
+}
+
+/*
+ * At this point, we can not call printk() yet. Keep microcode patch number in
+ * mc_saved_data.mc_saved and delay printing microcode info in
+ * show_ucode_info_early() until printk() works.
+ */
+static void __cpuinit print_ucode(struct ucode_cpu_info *uci)
+{
+ struct microcode_intel *mc_intel;
+ int *delay_ucode_info_p;
+ int *current_mc_date_p;
+
+ mc_intel = uci->mc;
+ if (mc_intel == NULL)
+ return;
+
+ delay_ucode_info_p = (int *)__pa_symbol(&delay_ucode_info);
+ current_mc_date_p = (int *)__pa_symbol(&current_mc_date);
+
+ *delay_ucode_info_p = 1;
+ *current_mc_date_p = mc_intel->hdr.date;
+}
+#else
+
+/*
+ * Flush global tlb. We only do this in x86_64 where paging has been enabled
+ * already and PGE should be enabled as well.
+ */
+static inline void __cpuinit flush_tlb_early(void)
+{
+ __native_flush_tlb_global_irq_disabled();
+}
+
+static inline void __cpuinit print_ucode(struct ucode_cpu_info *uci)
+{
+ struct microcode_intel *mc_intel;
+
+ mc_intel = uci->mc;
+ if (mc_intel == NULL)
+ return;
+
+ print_ucode_info(uci, mc_intel->hdr.date);
+}
+#endif
+
+static int apply_microcode_early(struct mc_saved_data *mc_saved_data,
+ struct ucode_cpu_info *uci)
+{
+ struct microcode_intel *mc_intel;
+ unsigned int val[2];
+
+ mc_intel = uci->mc;
+ if (mc_intel == NULL)
+ return 0;
+
+ /* write microcode via MSR 0x79 */
+ native_wrmsr(MSR_IA32_UCODE_WRITE,
+ (unsigned long) mc_intel->bits,
+ (unsigned long) mc_intel->bits >> 16 >> 16);
+ native_wrmsr(MSR_IA32_UCODE_REV, 0, 0);
+
+ /* As documented in the SDM: Do a CPUID 1 here */
+ sync_core();
+
+ /* get the current revision from MSR 0x8B */
+ native_rdmsr(MSR_IA32_UCODE_REV, val[0], val[1]);
+ if (val[1] != mc_intel->hdr.rev)
+ return -1;
+
+#ifdef CONFIG_X86_64
+ /* Flush global tlb. This is precaution. */
+ flush_tlb_early();
+#endif
+ uci->cpu_sig.rev = val[1];
+
+ print_ucode(uci);
+
+ return 0;
+}
+
+/*
+ * This function converts microcode patch offsets previously stored in
+ * mc_saved_in_initrd to pointers and stores the pointers in mc_saved_data.
+ */
+int __init save_microcode_in_initrd(void)
+{
+ unsigned int count = mc_saved_data.mc_saved_count;
+ struct microcode_intel *mc_saved[MAX_UCODE_COUNT];
+ int ret = 0;
+
+ if (count == 0)
+ return ret;
+
+ microcode_pointer(mc_saved, mc_saved_in_initrd, initrd_start, count);
+ ret = save_microcode(&mc_saved_data, mc_saved, count);
+ if (ret)
+ pr_err("Can not save microcod patches from initrd");
+
+ show_saved_mc();
+
+ return ret;
+}
+
+static void __init
+_load_ucode_intel_bsp(struct mc_saved_data *mc_saved_data,
+ unsigned long *mc_saved_in_initrd,
+ unsigned long initrd_start_early,
+ unsigned long initrd_end_early,
+ struct ucode_cpu_info *uci)
+{
+ collect_cpu_info_early(uci);
+ scan_microcode(initrd_start_early, initrd_end_early, mc_saved_data,
+ mc_saved_in_initrd, uci);
+ load_microcode(mc_saved_data, mc_saved_in_initrd,
+ initrd_start_early, uci);
+ apply_microcode_early(mc_saved_data, uci);
+}
+
+void __init
+load_ucode_intel_bsp(void)
+{
+ u64 ramdisk_image, ramdisk_size;
+ unsigned long initrd_start_early, initrd_end_early;
+ struct ucode_cpu_info uci;
+#ifdef CONFIG_X86_32
+ struct boot_params *boot_params_p;
+
+ boot_params_p = (struct boot_params *)__pa_symbol(&boot_params);
+ ramdisk_image = boot_params_p->hdr.ramdisk_image;
+ ramdisk_size = boot_params_p->hdr.ramdisk_size;
+ initrd_start_early = ramdisk_image;
+ initrd_end_early = initrd_start_early + ramdisk_size;
+
+ _load_ucode_intel_bsp(
+ (struct mc_saved_data *)__pa_symbol(&mc_saved_data),
+ (unsigned long *)__pa_symbol(&mc_saved_in_initrd),
+ initrd_start_early, initrd_end_early, &uci);
+#else
+ ramdisk_image = boot_params.hdr.ramdisk_image;
+ ramdisk_size = boot_params.hdr.ramdisk_size;
+ initrd_start_early = ramdisk_image + PAGE_OFFSET;
+ initrd_end_early = initrd_start_early + ramdisk_size;
+
+ _load_ucode_intel_bsp(&mc_saved_data, mc_saved_in_initrd,
+ initrd_start_early, initrd_end_early, &uci);
+#endif
+}
+
+void __cpuinit load_ucode_intel_ap(void)
+{
+ struct mc_saved_data *mc_saved_data_p;
+ struct ucode_cpu_info uci;
+ unsigned long *mc_saved_in_initrd_p;
+ unsigned long initrd_start_addr;
+#ifdef CONFIG_X86_32
+ unsigned long *initrd_start_p;
+
+ mc_saved_in_initrd_p =
+ (unsigned long *)__pa_symbol(mc_saved_in_initrd);
+ mc_saved_data_p = (struct mc_saved_data *)__pa_symbol(&mc_saved_data);
+ initrd_start_p = (unsigned long *)__pa_symbol(&initrd_start);
+ initrd_start_addr = (unsigned long)__pa_symbol(*initrd_start_p);
+#else
+ mc_saved_data_p = &mc_saved_data;
+ mc_saved_in_initrd_p = mc_saved_in_initrd;
+ initrd_start_addr = initrd_start;
+#endif
+
+ /*
+ * If there is no valid ucode previously saved in memory, no need to
+ * update ucode on this AP.
+ */
+ if (mc_saved_data_p->mc_saved_count == 0)
+ return;
+
+ collect_cpu_info_early(&uci);
+ load_microcode(mc_saved_data_p, mc_saved_in_initrd_p,
+ initrd_start_addr, &uci);
+ apply_microcode_early(mc_saved_data_p, &uci);
+}
diff --git a/arch/x86/kernel/microcode_intel_lib.c b/arch/x86/kernel/microcode_intel_lib.c
new file mode 100644
index 000000000000..ce69320d0179
--- /dev/null
+++ b/arch/x86/kernel/microcode_intel_lib.c
@@ -0,0 +1,174 @@
+/*
+ * Intel CPU Microcode Update Driver for Linux
+ *
+ * Copyright (C) 2012 Fenghua Yu <fenghua.yu@intel.com>
+ * H Peter Anvin" <hpa@zytor.com>
+ *
+ * This driver allows to upgrade microcode on Intel processors
+ * belonging to IA-32 family - PentiumPro, Pentium II,
+ * Pentium III, Xeon, Pentium 4, etc.
+ *
+ * Reference: Section 8.11 of Volume 3a, IA-32 Intel? Architecture
+ * Software Developer's Manual
+ * Order Number 253668 or free download from:
+ *
+ * http://developer.intel.com/Assets/PDF/manual/253668.pdf
+ *
+ * For more information, go to http://www.urbanmyth.org/microcode
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ *
+ */
+#include <linux/firmware.h>
+#include <linux/uaccess.h>
+#include <linux/kernel.h>
+#include <linux/module.h>
+
+#include <asm/microcode_intel.h>
+#include <asm/processor.h>
+#include <asm/msr.h>
+
+static inline int
+update_match_cpu(unsigned int csig, unsigned int cpf,
+ unsigned int sig, unsigned int pf)
+{
+ return (!sigmatch(sig, csig, pf, cpf)) ? 0 : 1;
+}
+
+int
+update_match_revision(struct microcode_header_intel *mc_header, int rev)
+{
+ return (mc_header->rev <= rev) ? 0 : 1;
+}
+
+int microcode_sanity_check(void *mc, int print_err)
+{
+ unsigned long total_size, data_size, ext_table_size;
+ struct microcode_header_intel *mc_header = mc;
+ struct extended_sigtable *ext_header = NULL;
+ int sum, orig_sum, ext_sigcount = 0, i;
+ struct extended_signature *ext_sig;
+
+ total_size = get_totalsize(mc_header);
+ data_size = get_datasize(mc_header);
+
+ if (data_size + MC_HEADER_SIZE > total_size) {
+ if (print_err)
+ pr_err("error! Bad data size in microcode data file\n");
+ return -EINVAL;
+ }
+
+ if (mc_header->ldrver != 1 || mc_header->hdrver != 1) {
+ if (print_err)
+ pr_err("error! Unknown microcode update format\n");
+ return -EINVAL;
+ }
+ ext_table_size = total_size - (MC_HEADER_SIZE + data_size);
+ if (ext_table_size) {
+ if ((ext_table_size < EXT_HEADER_SIZE)
+ || ((ext_table_size - EXT_HEADER_SIZE) % EXT_SIGNATURE_SIZE)) {
+ if (print_err)
+ pr_err("error! Small exttable size in microcode data file\n");
+ return -EINVAL;
+ }
+ ext_header = mc + MC_HEADER_SIZE + data_size;
+ if (ext_table_size != exttable_size(ext_header)) {
+ if (print_err)
+ pr_err("error! Bad exttable size in microcode data file\n");
+ return -EFAULT;
+ }
+ ext_sigcount = ext_header->count;
+ }
+
+ /* check extended table checksum */
+ if (ext_table_size) {
+ int ext_table_sum = 0;
+ int *ext_tablep = (int *)ext_header;
+
+ i = ext_table_size / DWSIZE;
+ while (i--)
+ ext_table_sum += ext_tablep[i];
+ if (ext_table_sum) {
+ if (print_err)
+ pr_warn("aborting, bad extended signature table checksum\n");
+ return -EINVAL;
+ }
+ }
+
+ /* calculate the checksum */
+ orig_sum = 0;
+ i = (MC_HEADER_SIZE + data_size) / DWSIZE;
+ while (i--)
+ orig_sum += ((int *)mc)[i];
+ if (orig_sum) {
+ if (print_err)
+ pr_err("aborting, bad checksum\n");
+ return -EINVAL;
+ }
+ if (!ext_table_size)
+ return 0;
+ /* check extended signature checksum */
+ for (i = 0; i < ext_sigcount; i++) {
+ ext_sig = (void *)ext_header + EXT_HEADER_SIZE +
+ EXT_SIGNATURE_SIZE * i;
+ sum = orig_sum
+ - (mc_header->sig + mc_header->pf + mc_header->cksum)
+ + (ext_sig->sig + ext_sig->pf + ext_sig->cksum);
+ if (sum) {
+ if (print_err)
+ pr_err("aborting, bad checksum\n");
+ return -EINVAL;
+ }
+ }
+ return 0;
+}
+EXPORT_SYMBOL_GPL(microcode_sanity_check);
+
+/*
+ * return 0 - no update found
+ * return 1 - found update
+ */
+int get_matching_sig(unsigned int csig, int cpf, void *mc, int rev)
+{
+ struct microcode_header_intel *mc_header = mc;
+ struct extended_sigtable *ext_header;
+ unsigned long total_size = get_totalsize(mc_header);
+ int ext_sigcount, i;
+ struct extended_signature *ext_sig;
+
+ if (update_match_cpu(csig, cpf, mc_header->sig, mc_header->pf))
+ return 1;
+
+ /* Look for ext. headers: */
+ if (total_size <= get_datasize(mc_header) + MC_HEADER_SIZE)
+ return 0;
+
+ ext_header = mc + get_datasize(mc_header) + MC_HEADER_SIZE;
+ ext_sigcount = ext_header->count;
+ ext_sig = (void *)ext_header + EXT_HEADER_SIZE;
+
+ for (i = 0; i < ext_sigcount; i++) {
+ if (update_match_cpu(csig, cpf, ext_sig->sig, ext_sig->pf))
+ return 1;
+ ext_sig++;
+ }
+ return 0;
+}
+
+/*
+ * return 0 - no update found
+ * return 1 - found update
+ */
+int get_matching_microcode(unsigned int csig, int cpf, void *mc, int rev)
+{
+ struct microcode_header_intel *mc_header = mc;
+
+ if (!update_match_revision(mc_header, rev))
+ return 0;
+
+ return get_matching_sig(csig, cpf, mc, rev);
+}
+EXPORT_SYMBOL_GPL(get_matching_microcode);
diff --git a/arch/x86/kernel/process.c b/arch/x86/kernel/process.c
index 2ed787f15bf0..14ae10031ff0 100644
--- a/arch/x86/kernel/process.c
+++ b/arch/x86/kernel/process.c
@@ -268,13 +268,7 @@ void __switch_to_xtra(struct task_struct *prev_p, struct task_struct *next_p,
unsigned long boot_option_idle_override = IDLE_NO_OVERRIDE;
EXPORT_SYMBOL(boot_option_idle_override);
-/*
- * Powermanagement idle function, if any..
- */
-void (*pm_idle)(void);
-#ifdef CONFIG_APM_MODULE
-EXPORT_SYMBOL(pm_idle);
-#endif
+static void (*x86_idle)(void);
#ifndef CONFIG_SMP
static inline void play_dead(void)
@@ -351,7 +345,7 @@ void cpu_idle(void)
rcu_idle_enter();
if (cpuidle_idle_call())
- pm_idle();
+ x86_idle();
rcu_idle_exit();
start_critical_timings();
@@ -375,7 +369,6 @@ void cpu_idle(void)
*/
void default_idle(void)
{
- trace_power_start_rcuidle(POWER_CSTATE, 1, smp_processor_id());
trace_cpu_idle_rcuidle(1, smp_processor_id());
current_thread_info()->status &= ~TS_POLLING;
/*
@@ -389,21 +382,22 @@ void default_idle(void)
else
local_irq_enable();
current_thread_info()->status |= TS_POLLING;
- trace_power_end_rcuidle(smp_processor_id());
trace_cpu_idle_rcuidle(PWR_EVENT_EXIT, smp_processor_id());
}
#ifdef CONFIG_APM_MODULE
EXPORT_SYMBOL(default_idle);
#endif
-bool set_pm_idle_to_default(void)
+#ifdef CONFIG_XEN
+bool xen_set_default_idle(void)
{
- bool ret = !!pm_idle;
+ bool ret = !!x86_idle;
- pm_idle = default_idle;
+ x86_idle = default_idle;
return ret;
}
+#endif
void stop_this_cpu(void *dummy)
{
local_irq_disable();
@@ -413,31 +407,8 @@ void stop_this_cpu(void *dummy)
set_cpu_online(smp_processor_id(), false);
disable_local_APIC();
- for (;;) {
- if (hlt_works(smp_processor_id()))
- halt();
- }
-}
-
-/* Default MONITOR/MWAIT with no hints, used for default C1 state */
-static void mwait_idle(void)
-{
- if (!need_resched()) {
- trace_power_start_rcuidle(POWER_CSTATE, 1, smp_processor_id());
- trace_cpu_idle_rcuidle(1, smp_processor_id());
- if (this_cpu_has(X86_FEATURE_CLFLUSH_MONITOR))
- clflush((void *)&current_thread_info()->flags);
-
- __monitor((void *)&current_thread_info()->flags, 0, 0);
- smp_mb();
- if (!need_resched())
- __sti_mwait(0, 0);
- else
- local_irq_enable();
- trace_power_end_rcuidle(smp_processor_id());
- trace_cpu_idle_rcuidle(PWR_EVENT_EXIT, smp_processor_id());
- } else
- local_irq_enable();
+ for (;;)
+ halt();
}
/*
@@ -447,62 +418,13 @@ static void mwait_idle(void)
*/
static void poll_idle(void)
{
- trace_power_start_rcuidle(POWER_CSTATE, 0, smp_processor_id());
trace_cpu_idle_rcuidle(0, smp_processor_id());
local_irq_enable();
while (!need_resched())
cpu_relax();
- trace_power_end_rcuidle(smp_processor_id());
trace_cpu_idle_rcuidle(PWR_EVENT_EXIT, smp_processor_id());
}
-/*
- * mwait selection logic:
- *
- * It depends on the CPU. For AMD CPUs that support MWAIT this is
- * wrong. Family 0x10 and 0x11 CPUs will enter C1 on HLT. Powersavings
- * then depend on a clock divisor and current Pstate of the core. If
- * all cores of a processor are in halt state (C1) the processor can
- * enter the C1E (C1 enhanced) state. If mwait is used this will never
- * happen.
- *
- * idle=mwait overrides this decision and forces the usage of mwait.
- */
-
-#define MWAIT_INFO 0x05
-#define MWAIT_ECX_EXTENDED_INFO 0x01
-#define MWAIT_EDX_C1 0xf0
-
-int mwait_usable(const struct cpuinfo_x86 *c)
-{
- u32 eax, ebx, ecx, edx;
-
- /* Use mwait if idle=mwait boot option is given */
- if (boot_option_idle_override == IDLE_FORCE_MWAIT)
- return 1;
-
- /*
- * Any idle= boot option other than idle=mwait means that we must not
- * use mwait. Eg: idle=halt or idle=poll or idle=nomwait
- */
- if (boot_option_idle_override != IDLE_NO_OVERRIDE)
- return 0;
-
- if (c->cpuid_level < MWAIT_INFO)
- return 0;
-
- cpuid(MWAIT_INFO, &eax, &ebx, &ecx, &edx);
- /* Check, whether EDX has extended info about MWAIT */
- if (!(ecx & MWAIT_ECX_EXTENDED_INFO))
- return 1;
-
- /*
- * edx enumeratios MONITOR/MWAIT extensions. Check, whether
- * C1 supports MWAIT
- */
- return (edx & MWAIT_EDX_C1);
-}
-
bool amd_e400_c1e_detected;
EXPORT_SYMBOL(amd_e400_c1e_detected);
@@ -567,31 +489,24 @@ static void amd_e400_idle(void)
void __cpuinit select_idle_routine(const struct cpuinfo_x86 *c)
{
#ifdef CONFIG_SMP
- if (pm_idle == poll_idle && smp_num_siblings > 1) {
+ if (x86_idle == poll_idle && smp_num_siblings > 1)
pr_warn_once("WARNING: polling idle and HT enabled, performance may degrade\n");
- }
#endif
- if (pm_idle)
+ if (x86_idle)
return;
- if (cpu_has(c, X86_FEATURE_MWAIT) && mwait_usable(c)) {
- /*
- * One CPU supports mwait => All CPUs supports mwait
- */
- pr_info("using mwait in idle threads\n");
- pm_idle = mwait_idle;
- } else if (cpu_has_amd_erratum(amd_erratum_400)) {
+ if (cpu_has_amd_erratum(amd_erratum_400)) {
/* E400: APIC timer interrupt does not wake up CPU from C1e */
pr_info("using AMD E400 aware idle routine\n");
- pm_idle = amd_e400_idle;
+ x86_idle = amd_e400_idle;
} else
- pm_idle = default_idle;
+ x86_idle = default_idle;
}
void __init init_amd_e400_c1e_mask(void)
{
/* If we're using amd_e400_idle, we need to allocate amd_e400_c1e_mask. */
- if (pm_idle == amd_e400_idle)
+ if (x86_idle == amd_e400_idle)
zalloc_cpumask_var(&amd_e400_c1e_mask, GFP_KERNEL);
}
@@ -602,11 +517,8 @@ static int __init idle_setup(char *str)
if (!strcmp(str, "poll")) {
pr_info("using polling idle threads\n");
- pm_idle = poll_idle;
+ x86_idle = poll_idle;
boot_option_idle_override = IDLE_POLL;
- } else if (!strcmp(str, "mwait")) {
- boot_option_idle_override = IDLE_FORCE_MWAIT;
- WARN_ONCE(1, "\"idle=mwait\" will be removed in 2012\n");
} else if (!strcmp(str, "halt")) {
/*
* When the boot option of idle=halt is added, halt is
@@ -615,7 +527,7 @@ static int __init idle_setup(char *str)
* To continue to load the CPU idle driver, don't touch
* the boot_option_idle_override.
*/
- pm_idle = default_idle;
+ x86_idle = default_idle;
boot_option_idle_override = IDLE_HALT;
} else if (!strcmp(str, "nomwait")) {
/*
diff --git a/arch/x86/kernel/process_64.c b/arch/x86/kernel/process_64.c
index 6e68a6194965..0f49677da51e 100644
--- a/arch/x86/kernel/process_64.c
+++ b/arch/x86/kernel/process_64.c
@@ -117,7 +117,7 @@ void release_thread(struct task_struct *dead_task)
{
if (dead_task->mm) {
if (dead_task->mm->context.size) {
- pr_warn("WARNING: dead process %8s still has LDT? <%p/%d>\n",
+ pr_warn("WARNING: dead process %s still has LDT? <%p/%d>\n",
dead_task->comm,
dead_task->mm->context.ldt,
dead_task->mm->context.size);
diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c
index 8b24289cc10c..9c857f05cef0 100644
--- a/arch/x86/kernel/setup.c
+++ b/arch/x86/kernel/setup.c
@@ -108,17 +108,16 @@
#include <asm/topology.h>
#include <asm/apicdef.h>
#include <asm/amd_nb.h>
-#ifdef CONFIG_X86_64
-#include <asm/numa_64.h>
-#endif
#include <asm/mce.h>
#include <asm/alternative.h>
#include <asm/prom.h>
/*
- * end_pfn only includes RAM, while max_pfn_mapped includes all e820 entries.
- * The direct mapping extends to max_pfn_mapped, so that we can directly access
- * apertures, ACPI and other tables without having to play with fixmaps.
+ * max_low_pfn_mapped: highest direct mapped pfn under 4GB
+ * max_pfn_mapped: highest direct mapped pfn over 4GB
+ *
+ * The direct mapping only covers E820_RAM regions, so the ranges and gaps are
+ * represented by pfn_mapped
*/
unsigned long max_low_pfn_mapped;
unsigned long max_pfn_mapped;
@@ -276,18 +275,7 @@ void * __init extend_brk(size_t size, size_t align)
return ret;
}
-#ifdef CONFIG_X86_64
-static void __init init_gbpages(void)
-{
- if (direct_gbpages && cpu_has_gbpages)
- printk(KERN_INFO "Using GB pages for direct mapping\n");
- else
- direct_gbpages = 0;
-}
-#else
-static inline void init_gbpages(void)
-{
-}
+#ifdef CONFIG_X86_32
static void __init cleanup_highmap(void)
{
}
@@ -296,8 +284,8 @@ static void __init cleanup_highmap(void)
static void __init reserve_brk(void)
{
if (_brk_end > _brk_start)
- memblock_reserve(__pa(_brk_start),
- __pa(_brk_end) - __pa(_brk_start));
+ memblock_reserve(__pa_symbol(_brk_start),
+ _brk_end - _brk_start);
/* Mark brk area as locked down and no longer taking any
new allocations */
@@ -306,27 +294,43 @@ static void __init reserve_brk(void)
#ifdef CONFIG_BLK_DEV_INITRD
+static u64 __init get_ramdisk_image(void)
+{
+ u64 ramdisk_image = boot_params.hdr.ramdisk_image;
+
+ ramdisk_image |= (u64)boot_params.ext_ramdisk_image << 32;
+
+ return ramdisk_image;
+}
+static u64 __init get_ramdisk_size(void)
+{
+ u64 ramdisk_size = boot_params.hdr.ramdisk_size;
+
+ ramdisk_size |= (u64)boot_params.ext_ramdisk_size << 32;
+
+ return ramdisk_size;
+}
+
#define MAX_MAP_CHUNK (NR_FIX_BTMAPS << PAGE_SHIFT)
static void __init relocate_initrd(void)
{
/* Assume only end is not page aligned */
- u64 ramdisk_image = boot_params.hdr.ramdisk_image;
- u64 ramdisk_size = boot_params.hdr.ramdisk_size;
+ u64 ramdisk_image = get_ramdisk_image();
+ u64 ramdisk_size = get_ramdisk_size();
u64 area_size = PAGE_ALIGN(ramdisk_size);
- u64 end_of_lowmem = max_low_pfn_mapped << PAGE_SHIFT;
u64 ramdisk_here;
unsigned long slop, clen, mapaddr;
char *p, *q;
- /* We need to move the initrd down into lowmem */
- ramdisk_here = memblock_find_in_range(0, end_of_lowmem, area_size,
- PAGE_SIZE);
+ /* We need to move the initrd down into directly mapped mem */
+ ramdisk_here = memblock_find_in_range(0, PFN_PHYS(max_pfn_mapped),
+ area_size, PAGE_SIZE);
if (!ramdisk_here)
panic("Cannot find place for new RAMDISK of size %lld\n",
ramdisk_size);
- /* Note: this includes all the lowmem currently occupied by
+ /* Note: this includes all the mem currently occupied by
the initrd, we rely on that fact to keep the data intact. */
memblock_reserve(ramdisk_here, area_size);
initrd_start = ramdisk_here + PAGE_OFFSET;
@@ -336,17 +340,7 @@ static void __init relocate_initrd(void)
q = (char *)initrd_start;
- /* Copy any lowmem portion of the initrd */
- if (ramdisk_image < end_of_lowmem) {
- clen = end_of_lowmem - ramdisk_image;
- p = (char *)__va(ramdisk_image);
- memcpy(q, p, clen);
- q += clen;
- ramdisk_image += clen;
- ramdisk_size -= clen;
- }
-
- /* Copy the highmem portion of the initrd */
+ /* Copy the initrd */
while (ramdisk_size) {
slop = ramdisk_image & ~PAGE_MASK;
clen = ramdisk_size;
@@ -360,22 +354,35 @@ static void __init relocate_initrd(void)
ramdisk_image += clen;
ramdisk_size -= clen;
}
- /* high pages is not converted by early_res_to_bootmem */
- ramdisk_image = boot_params.hdr.ramdisk_image;
- ramdisk_size = boot_params.hdr.ramdisk_size;
+
+ ramdisk_image = get_ramdisk_image();
+ ramdisk_size = get_ramdisk_size();
printk(KERN_INFO "Move RAMDISK from [mem %#010llx-%#010llx] to"
" [mem %#010llx-%#010llx]\n",
ramdisk_image, ramdisk_image + ramdisk_size - 1,
ramdisk_here, ramdisk_here + ramdisk_size - 1);
}
+static void __init early_reserve_initrd(void)
+{
+ /* Assume only end is not page aligned */
+ u64 ramdisk_image = get_ramdisk_image();
+ u64 ramdisk_size = get_ramdisk_size();
+ u64 ramdisk_end = PAGE_ALIGN(ramdisk_image + ramdisk_size);
+
+ if (!boot_params.hdr.type_of_loader ||
+ !ramdisk_image || !ramdisk_size)
+ return; /* No initrd provided by bootloader */
+
+ memblock_reserve(ramdisk_image, ramdisk_end - ramdisk_image);
+}
static void __init reserve_initrd(void)
{
/* Assume only end is not page aligned */
- u64 ramdisk_image = boot_params.hdr.ramdisk_image;
- u64 ramdisk_size = boot_params.hdr.ramdisk_size;
+ u64 ramdisk_image = get_ramdisk_image();
+ u64 ramdisk_size = get_ramdisk_size();
u64 ramdisk_end = PAGE_ALIGN(ramdisk_image + ramdisk_size);
- u64 end_of_lowmem = max_low_pfn_mapped << PAGE_SHIFT;
+ u64 mapped_size;
if (!boot_params.hdr.type_of_loader ||
!ramdisk_image || !ramdisk_size)
@@ -383,22 +390,18 @@ static void __init reserve_initrd(void)
initrd_start = 0;
- if (ramdisk_size >= (end_of_lowmem>>1)) {
+ mapped_size = memblock_mem_size(max_pfn_mapped);
+ if (ramdisk_size >= (mapped_size>>1))
panic("initrd too large to handle, "
"disabling initrd (%lld needed, %lld available)\n",
- ramdisk_size, end_of_lowmem>>1);
- }
+ ramdisk_size, mapped_size>>1);
printk(KERN_INFO "RAMDISK: [mem %#010llx-%#010llx]\n", ramdisk_image,
ramdisk_end - 1);
-
- if (ramdisk_end <= end_of_lowmem) {
- /* All in lowmem, easy case */
- /*
- * don't need to reserve again, already reserved early
- * in i386_start_kernel
- */
+ if (pfn_range_is_mapped(PFN_DOWN(ramdisk_image),
+ PFN_DOWN(ramdisk_end))) {
+ /* All are mapped, easy case */
initrd_start = ramdisk_image + PAGE_OFFSET;
initrd_end = initrd_start + ramdisk_size;
return;
@@ -409,6 +412,9 @@ static void __init reserve_initrd(void)
memblock_free(ramdisk_image, ramdisk_end - ramdisk_image);
}
#else
+static void __init early_reserve_initrd(void)
+{
+}
static void __init reserve_initrd(void)
{
}
@@ -419,8 +425,6 @@ static void __init parse_setup_data(void)
struct setup_data *data;
u64 pa_data;
- if (boot_params.hdr.version < 0x0209)
- return;
pa_data = boot_params.hdr.setup_data;
while (pa_data) {
u32 data_len, map_len;
@@ -456,8 +460,6 @@ static void __init e820_reserve_setup_data(void)
u64 pa_data;
int found = 0;
- if (boot_params.hdr.version < 0x0209)
- return;
pa_data = boot_params.hdr.setup_data;
while (pa_data) {
data = early_memremap(pa_data, sizeof(*data));
@@ -481,8 +483,6 @@ static void __init memblock_x86_reserve_range_setup_data(void)
struct setup_data *data;
u64 pa_data;
- if (boot_params.hdr.version < 0x0209)
- return;
pa_data = boot_params.hdr.setup_data;
while (pa_data) {
data = early_memremap(pa_data, sizeof(*data));
@@ -501,17 +501,51 @@ static void __init memblock_x86_reserve_range_setup_data(void)
/*
* Keep the crash kernel below this limit. On 32 bits earlier kernels
* would limit the kernel to the low 512 MiB due to mapping restrictions.
- * On 64 bits, kexec-tools currently limits us to 896 MiB; increase this
- * limit once kexec-tools are fixed.
*/
#ifdef CONFIG_X86_32
# define CRASH_KERNEL_ADDR_MAX (512 << 20)
#else
-# define CRASH_KERNEL_ADDR_MAX (896 << 20)
+# define CRASH_KERNEL_ADDR_MAX MAXMEM
#endif
+static void __init reserve_crashkernel_low(void)
+{
+#ifdef CONFIG_X86_64
+ const unsigned long long alignment = 16<<20; /* 16M */
+ unsigned long long low_base = 0, low_size = 0;
+ unsigned long total_low_mem;
+ unsigned long long base;
+ int ret;
+
+ total_low_mem = memblock_mem_size(1UL<<(32-PAGE_SHIFT));
+ ret = parse_crashkernel_low(boot_command_line, total_low_mem,
+ &low_size, &base);
+ if (ret != 0 || low_size <= 0)
+ return;
+
+ low_base = memblock_find_in_range(low_size, (1ULL<<32),
+ low_size, alignment);
+
+ if (!low_base) {
+ pr_info("crashkernel low reservation failed - No suitable area found.\n");
+
+ return;
+ }
+
+ memblock_reserve(low_base, low_size);
+ pr_info("Reserving %ldMB of low memory at %ldMB for crashkernel (System low RAM: %ldMB)\n",
+ (unsigned long)(low_size >> 20),
+ (unsigned long)(low_base >> 20),
+ (unsigned long)(total_low_mem >> 20));
+ crashk_low_res.start = low_base;
+ crashk_low_res.end = low_base + low_size - 1;
+ insert_resource(&iomem_resource, &crashk_low_res);
+#endif
+}
+
static void __init reserve_crashkernel(void)
{
+ const unsigned long long alignment = 16<<20; /* 16M */
unsigned long long total_mem;
unsigned long long crash_size, crash_base;
int ret;
@@ -525,8 +559,6 @@ static void __init reserve_crashkernel(void)
/* 0 means: find the address automatically */
if (crash_base <= 0) {
- const unsigned long long alignment = 16<<20; /* 16M */
-
/*
* kexec want bzImage is below CRASH_KERNEL_ADDR_MAX
*/
@@ -537,6 +569,7 @@ static void __init reserve_crashkernel(void)
pr_info("crashkernel reservation failed - No suitable area found.\n");
return;
}
+
} else {
unsigned long long start;
@@ -558,6 +591,9 @@ static void __init reserve_crashkernel(void)
crashk_res.start = crash_base;
crashk_res.end = crash_base + crash_size - 1;
insert_resource(&iomem_resource, &crashk_res);
+
+ if (crash_base >= (1ULL<<32))
+ reserve_crashkernel_low();
}
#else
static void __init reserve_crashkernel(void)
@@ -608,8 +644,6 @@ static __init void reserve_ibft_region(void)
memblock_reserve(addr, size);
}
-static unsigned reserve_low = CONFIG_X86_RESERVE_LOW << 10;
-
static bool __init snb_gfx_workaround_needed(void)
{
#ifdef CONFIG_PCI
@@ -698,8 +732,7 @@ static void __init trim_bios_range(void)
* since some BIOSes are known to corrupt low memory. See the
* Kconfig help text for X86_RESERVE_LOW.
*/
- e820_update_range(0, ALIGN(reserve_low, PAGE_SIZE),
- E820_RAM, E820_RESERVED);
+ e820_update_range(0, PAGE_SIZE, E820_RAM, E820_RESERVED);
/*
* special case: Some BIOSen report the PC BIOS
@@ -711,6 +744,29 @@ static void __init trim_bios_range(void)
sanitize_e820_map(e820.map, ARRAY_SIZE(e820.map), &e820.nr_map);
}
+/* called before trim_bios_range() to spare extra sanitize */
+static void __init e820_add_kernel_range(void)
+{
+ u64 start = __pa_symbol(_text);
+ u64 size = __pa_symbol(_end) - start;
+
+ /*
+ * Complain if .text .data and .bss are not marked as E820_RAM and
+ * attempt to fix it by adding the range. We may have a confused BIOS,
+ * or the user may have used memmap=exactmap or memmap=xxM$yyM to
+ * exclude kernel range. If we really are running on top non-RAM,
+ * we will crash later anyways.
+ */
+ if (e820_all_mapped(start, start + size, E820_RAM))
+ return;
+
+ pr_warn(".text .data .bss are not marked as E820_RAM!\n");
+ e820_remove_range(start, size, E820_RAM, 0);
+ e820_add_region(start, size, E820_RAM);
+}
+
+static unsigned reserve_low = CONFIG_X86_RESERVE_LOW << 10;
+
static int __init parse_reservelow(char *p)
{
unsigned long long size;
@@ -733,6 +789,11 @@ static int __init parse_reservelow(char *p)
early_param("reservelow", parse_reservelow);
+static void __init trim_low_memory_range(void)
+{
+ memblock_reserve(0, ALIGN(reserve_low, PAGE_SIZE));
+}
+
/*
* Determine if we were loaded by an EFI loader. If so, then we have also been
* passed the efi memmap, systab, etc., so we should use these data structures
@@ -748,6 +809,17 @@ early_param("reservelow", parse_reservelow);
void __init setup_arch(char **cmdline_p)
{
+ memblock_reserve(__pa_symbol(_text),
+ (unsigned long)__bss_stop - (unsigned long)_text);
+
+ early_reserve_initrd();
+
+ /*
+ * At this point everything still needed from the boot loader
+ * or BIOS or kernel text should be early reserved or marked not
+ * RAM in e820. All other memory is free game.
+ */
+
#ifdef CONFIG_X86_32
memcpy(&boot_cpu_data, &new_cpu_data, sizeof(new_cpu_data));
visws_early_detect();
@@ -835,12 +907,12 @@ void __init setup_arch(char **cmdline_p)
init_mm.end_data = (unsigned long) _edata;
init_mm.brk = _brk_end;
- code_resource.start = virt_to_phys(_text);
- code_resource.end = virt_to_phys(_etext)-1;
- data_resource.start = virt_to_phys(_etext);
- data_resource.end = virt_to_phys(_edata)-1;
- bss_resource.start = virt_to_phys(&__bss_start);
- bss_resource.end = virt_to_phys(&__bss_stop)-1;
+ code_resource.start = __pa_symbol(_text);
+ code_resource.end = __pa_symbol(_etext)-1;
+ data_resource.start = __pa_symbol(_etext);
+ data_resource.end = __pa_symbol(_edata)-1;
+ bss_resource.start = __pa_symbol(__bss_start);
+ bss_resource.end = __pa_symbol(__bss_stop)-1;
#ifdef CONFIG_CMDLINE_BOOL
#ifdef CONFIG_CMDLINE_OVERRIDE
@@ -906,6 +978,7 @@ void __init setup_arch(char **cmdline_p)
insert_resource(&iomem_resource, &data_resource);
insert_resource(&iomem_resource, &bss_resource);
+ e820_add_kernel_range();
trim_bios_range();
#ifdef CONFIG_X86_32
if (ppro_with_ram_bug()) {
@@ -955,6 +1028,8 @@ void __init setup_arch(char **cmdline_p)
reserve_ibft_region();
+ early_alloc_pgt_buf();
+
/*
* Need to conclude brk, before memblock_x86_fill()
* it could use memblock_find_in_range, could overlap with
@@ -964,7 +1039,7 @@ void __init setup_arch(char **cmdline_p)
cleanup_highmap();
- memblock.current_limit = get_max_mapped();
+ memblock.current_limit = ISA_END_ADDRESS;
memblock_x86_fill();
/*
@@ -981,41 +1056,31 @@ void __init setup_arch(char **cmdline_p)
setup_bios_corruption_check();
#endif
+ /*
+ * In the memory hotplug case, the kernel needs info from SRAT to
+ * determine which memory is hotpluggable before allocating memory
+ * using memblock.
+ */
+ acpi_boot_table_init();
+ early_acpi_boot_init();
+ early_parse_srat();
+
+#ifdef CONFIG_X86_32
printk(KERN_DEBUG "initial memory mapped: [mem 0x00000000-%#010lx]\n",
(max_pfn_mapped<<PAGE_SHIFT) - 1);
+#endif
- setup_real_mode();
+ reserve_real_mode();
trim_platform_memory_ranges();
+ trim_low_memory_range();
- init_gbpages();
+ init_mem_mapping();
- /* max_pfn_mapped is updated here */
- max_low_pfn_mapped = init_memory_mapping(0, max_low_pfn<<PAGE_SHIFT);
- max_pfn_mapped = max_low_pfn_mapped;
+ early_trap_pf_init();
-#ifdef CONFIG_X86_64
- if (max_pfn > max_low_pfn) {
- int i;
- unsigned long start, end;
- unsigned long start_pfn, end_pfn;
-
- for_each_mem_pfn_range(i, MAX_NUMNODES, &start_pfn, &end_pfn,
- NULL) {
-
- end = PFN_PHYS(end_pfn);
- if (end <= (1UL<<32))
- continue;
-
- start = PFN_PHYS(start_pfn);
- max_pfn_mapped = init_memory_mapping(
- max((1UL<<32), start), end);
- }
+ setup_real_mode();
- /* can we preseve max_low_pfn ?*/
- max_low_pfn = max_pfn;
- }
-#endif
memblock.current_limit = get_max_mapped();
dma_contiguous_reserve(0);
@@ -1045,10 +1110,6 @@ void __init setup_arch(char **cmdline_p)
/*
* Parse the ACPI tables for possible boot-time SMP configuration.
*/
- acpi_boot_table_init();
-
- early_acpi_boot_init();
-
initmem_init();
memblock_find_dma_reserve();
diff --git a/arch/x86/kernel/signal.c b/arch/x86/kernel/signal.c
index d6bf1f34a6e9..69562992e457 100644
--- a/arch/x86/kernel/signal.c
+++ b/arch/x86/kernel/signal.c
@@ -278,7 +278,7 @@ static const struct {
};
static int
-__setup_frame(int sig, struct k_sigaction *ka, sigset_t *set,
+__setup_frame(int sig, struct ksignal *ksig, sigset_t *set,
struct pt_regs *regs)
{
struct sigframe __user *frame;
@@ -286,7 +286,7 @@ __setup_frame(int sig, struct k_sigaction *ka, sigset_t *set,
int err = 0;
void __user *fpstate = NULL;
- frame = get_sigframe(ka, regs, sizeof(*frame), &fpstate);
+ frame = get_sigframe(&ksig->ka, regs, sizeof(*frame), &fpstate);
if (!access_ok(VERIFY_WRITE, frame, sizeof(*frame)))
return -EFAULT;
@@ -307,8 +307,8 @@ __setup_frame(int sig, struct k_sigaction *ka, sigset_t *set,
restorer = VDSO32_SYMBOL(current->mm->context.vdso, sigreturn);
else
restorer = &frame->retcode;
- if (ka->sa.sa_flags & SA_RESTORER)
- restorer = ka->sa.sa_restorer;
+ if (ksig->ka.sa.sa_flags & SA_RESTORER)
+ restorer = ksig->ka.sa.sa_restorer;
/* Set up to return from userspace. */
err |= __put_user(restorer, &frame->pretcode);
@@ -327,7 +327,7 @@ __setup_frame(int sig, struct k_sigaction *ka, sigset_t *set,
/* Set up registers for signal handler */
regs->sp = (unsigned long)frame;
- regs->ip = (unsigned long)ka->sa.sa_handler;
+ regs->ip = (unsigned long)ksig->ka.sa.sa_handler;
regs->ax = (unsigned long)sig;
regs->dx = 0;
regs->cx = 0;
@@ -340,7 +340,7 @@ __setup_frame(int sig, struct k_sigaction *ka, sigset_t *set,
return 0;
}
-static int __setup_rt_frame(int sig, struct k_sigaction *ka, siginfo_t *info,
+static int __setup_rt_frame(int sig, struct ksignal *ksig,
sigset_t *set, struct pt_regs *regs)
{
struct rt_sigframe __user *frame;
@@ -348,7 +348,7 @@ static int __setup_rt_frame(int sig, struct k_sigaction *ka, siginfo_t *info,
int err = 0;
void __user *fpstate = NULL;
- frame = get_sigframe(ka, regs, sizeof(*frame), &fpstate);
+ frame = get_sigframe(&ksig->ka, regs, sizeof(*frame), &fpstate);
if (!access_ok(VERIFY_WRITE, frame, sizeof(*frame)))
return -EFAULT;
@@ -368,8 +368,8 @@ static int __setup_rt_frame(int sig, struct k_sigaction *ka, siginfo_t *info,
/* Set up to return from userspace. */
restorer = VDSO32_SYMBOL(current->mm->context.vdso, rt_sigreturn);
- if (ka->sa.sa_flags & SA_RESTORER)
- restorer = ka->sa.sa_restorer;
+ if (ksig->ka.sa.sa_flags & SA_RESTORER)
+ restorer = ksig->ka.sa.sa_restorer;
put_user_ex(restorer, &frame->pretcode);
/*
@@ -382,7 +382,7 @@ static int __setup_rt_frame(int sig, struct k_sigaction *ka, siginfo_t *info,
put_user_ex(*((u64 *)&rt_retcode), (u64 *)frame->retcode);
} put_user_catch(err);
- err |= copy_siginfo_to_user(&frame->info, info);
+ err |= copy_siginfo_to_user(&frame->info, &ksig->info);
err |= setup_sigcontext(&frame->uc.uc_mcontext, fpstate,
regs, set->sig[0]);
err |= __copy_to_user(&frame->uc.uc_sigmask, set, sizeof(*set));
@@ -392,7 +392,7 @@ static int __setup_rt_frame(int sig, struct k_sigaction *ka, siginfo_t *info,
/* Set up registers for signal handler */
regs->sp = (unsigned long)frame;
- regs->ip = (unsigned long)ka->sa.sa_handler;
+ regs->ip = (unsigned long)ksig->ka.sa.sa_handler;
regs->ax = (unsigned long)sig;
regs->dx = (unsigned long)&frame->info;
regs->cx = (unsigned long)&frame->uc;
@@ -405,20 +405,20 @@ static int __setup_rt_frame(int sig, struct k_sigaction *ka, siginfo_t *info,
return 0;
}
#else /* !CONFIG_X86_32 */
-static int __setup_rt_frame(int sig, struct k_sigaction *ka, siginfo_t *info,
+static int __setup_rt_frame(int sig, struct ksignal *ksig,
sigset_t *set, struct pt_regs *regs)
{
struct rt_sigframe __user *frame;
void __user *fp = NULL;
int err = 0;
- frame = get_sigframe(ka, regs, sizeof(struct rt_sigframe), &fp);
+ frame = get_sigframe(&ksig->ka, regs, sizeof(struct rt_sigframe), &fp);
if (!access_ok(VERIFY_WRITE, frame, sizeof(*frame)))
return -EFAULT;
- if (ka->sa.sa_flags & SA_SIGINFO) {
- if (copy_siginfo_to_user(&frame->info, info))
+ if (ksig->ka.sa.sa_flags & SA_SIGINFO) {
+ if (copy_siginfo_to_user(&frame->info, &ksig->info))
return -EFAULT;
}
@@ -434,8 +434,8 @@ static int __setup_rt_frame(int sig, struct k_sigaction *ka, siginfo_t *info,
/* Set up to return from userspace. If provided, use a stub
already in userspace. */
/* x86-64 should always use SA_RESTORER. */
- if (ka->sa.sa_flags & SA_RESTORER) {
- put_user_ex(ka->sa.sa_restorer, &frame->pretcode);
+ if (ksig->ka.sa.sa_flags & SA_RESTORER) {
+ put_user_ex(ksig->ka.sa.sa_restorer, &frame->pretcode);
} else {
/* could use a vstub here */
err |= -EFAULT;
@@ -457,7 +457,7 @@ static int __setup_rt_frame(int sig, struct k_sigaction *ka, siginfo_t *info,
next argument after the signal number on the stack. */
regs->si = (unsigned long)&frame->info;
regs->dx = (unsigned long)&frame->uc;
- regs->ip = (unsigned long) ka->sa.sa_handler;
+ regs->ip = (unsigned long) ksig->ka.sa.sa_handler;
regs->sp = (unsigned long)frame;
@@ -469,8 +469,8 @@ static int __setup_rt_frame(int sig, struct k_sigaction *ka, siginfo_t *info,
}
#endif /* CONFIG_X86_32 */
-static int x32_setup_rt_frame(int sig, struct k_sigaction *ka,
- siginfo_t *info, compat_sigset_t *set,
+static int x32_setup_rt_frame(struct ksignal *ksig,
+ compat_sigset_t *set,
struct pt_regs *regs)
{
#ifdef CONFIG_X86_X32_ABI
@@ -479,13 +479,13 @@ static int x32_setup_rt_frame(int sig, struct k_sigaction *ka,
int err = 0;
void __user *fpstate = NULL;
- frame = get_sigframe(ka, regs, sizeof(*frame), &fpstate);
+ frame = get_sigframe(&ksig->ka, regs, sizeof(*frame), &fpstate);
if (!access_ok(VERIFY_WRITE, frame, sizeof(*frame)))
return -EFAULT;
- if (ka->sa.sa_flags & SA_SIGINFO) {
- if (copy_siginfo_to_user32(&frame->info, info))
+ if (ksig->ka.sa.sa_flags & SA_SIGINFO) {
+ if (copy_siginfo_to_user32(&frame->info, &ksig->info))
return -EFAULT;
}
@@ -499,8 +499,8 @@ static int x32_setup_rt_frame(int sig, struct k_sigaction *ka,
err |= __compat_save_altstack(&frame->uc.uc_stack, regs->sp);
put_user_ex(0, &frame->uc.uc__pad0);
- if (ka->sa.sa_flags & SA_RESTORER) {
- restorer = ka->sa.sa_restorer;
+ if (ksig->ka.sa.sa_flags & SA_RESTORER) {
+ restorer = ksig->ka.sa.sa_restorer;
} else {
/* could use a vstub here */
restorer = NULL;
@@ -518,10 +518,10 @@ static int x32_setup_rt_frame(int sig, struct k_sigaction *ka,
/* Set up registers for signal handler */
regs->sp = (unsigned long) frame;
- regs->ip = (unsigned long) ka->sa.sa_handler;
+ regs->ip = (unsigned long) ksig->ka.sa.sa_handler;
/* We use the x32 calling convention here... */
- regs->di = sig;
+ regs->di = ksig->sig;
regs->si = (unsigned long) &frame->info;
regs->dx = (unsigned long) &frame->uc;
@@ -535,70 +535,13 @@ static int x32_setup_rt_frame(int sig, struct k_sigaction *ka,
return 0;
}
-#ifdef CONFIG_X86_32
-/*
- * Atomically swap in the new signal mask, and wait for a signal.
- */
-asmlinkage int
-sys_sigsuspend(int history0, int history1, old_sigset_t mask)
-{
- sigset_t blocked;
- siginitset(&blocked, mask);
- return sigsuspend(&blocked);
-}
-
-asmlinkage int
-sys_sigaction(int sig, const struct old_sigaction __user *act,
- struct old_sigaction __user *oact)
-{
- struct k_sigaction new_ka, old_ka;
- int ret = 0;
-
- if (act) {
- old_sigset_t mask;
-
- if (!access_ok(VERIFY_READ, act, sizeof(*act)))
- return -EFAULT;
-
- get_user_try {
- get_user_ex(new_ka.sa.sa_handler, &act->sa_handler);
- get_user_ex(new_ka.sa.sa_flags, &act->sa_flags);
- get_user_ex(mask, &act->sa_mask);
- get_user_ex(new_ka.sa.sa_restorer, &act->sa_restorer);
- } get_user_catch(ret);
-
- if (ret)
- return -EFAULT;
- siginitset(&new_ka.sa.sa_mask, mask);
- }
-
- ret = do_sigaction(sig, act ? &new_ka : NULL, oact ? &old_ka : NULL);
-
- if (!ret && oact) {
- if (!access_ok(VERIFY_WRITE, oact, sizeof(*oact)))
- return -EFAULT;
-
- put_user_try {
- put_user_ex(old_ka.sa.sa_handler, &oact->sa_handler);
- put_user_ex(old_ka.sa.sa_flags, &oact->sa_flags);
- put_user_ex(old_ka.sa.sa_mask.sig[0], &oact->sa_mask);
- put_user_ex(old_ka.sa.sa_restorer, &oact->sa_restorer);
- } put_user_catch(ret);
-
- if (ret)
- return -EFAULT;
- }
-
- return ret;
-}
-#endif /* CONFIG_X86_32 */
-
/*
* Do a signal return; undo the signal stack.
*/
#ifdef CONFIG_X86_32
-unsigned long sys_sigreturn(struct pt_regs *regs)
+unsigned long sys_sigreturn(void)
{
+ struct pt_regs *regs = current_pt_regs();
struct sigframe __user *frame;
unsigned long ax;
sigset_t set;
@@ -625,8 +568,9 @@ badframe:
}
#endif /* CONFIG_X86_32 */
-long sys_rt_sigreturn(struct pt_regs *regs)
+long sys_rt_sigreturn(void)
{
+ struct pt_regs *regs = current_pt_regs();
struct rt_sigframe __user *frame;
unsigned long ax;
sigset_t set;
@@ -667,30 +611,29 @@ static int signr_convert(int sig)
}
static int
-setup_rt_frame(int sig, struct k_sigaction *ka, siginfo_t *info,
- struct pt_regs *regs)
+setup_rt_frame(struct ksignal *ksig, struct pt_regs *regs)
{
- int usig = signr_convert(sig);
+ int usig = signr_convert(ksig->sig);
sigset_t *set = sigmask_to_save();
compat_sigset_t *cset = (compat_sigset_t *) set;
/* Set up the stack frame */
if (is_ia32_frame()) {
- if (ka->sa.sa_flags & SA_SIGINFO)
- return ia32_setup_rt_frame(usig, ka, info, cset, regs);
+ if (ksig->ka.sa.sa_flags & SA_SIGINFO)
+ return ia32_setup_rt_frame(usig, ksig, cset, regs);
else
- return ia32_setup_frame(usig, ka, cset, regs);
+ return ia32_setup_frame(usig, ksig, cset, regs);
} else if (is_x32_frame()) {
- return x32_setup_rt_frame(usig, ka, info, cset, regs);
+ return x32_setup_rt_frame(ksig, cset, regs);
} else {
- return __setup_rt_frame(sig, ka, info, set, regs);
+ return __setup_rt_frame(ksig->sig, ksig, set, regs);
}
}
static void
-handle_signal(unsigned long sig, siginfo_t *info, struct k_sigaction *ka,
- struct pt_regs *regs)
+handle_signal(struct ksignal *ksig, struct pt_regs *regs)
{
+ bool failed;
/* Are we from a system call? */
if (syscall_get_nr(current, regs) >= 0) {
/* If so, check system call restarting.. */
@@ -701,7 +644,7 @@ handle_signal(unsigned long sig, siginfo_t *info, struct k_sigaction *ka,
break;
case -ERESTARTSYS:
- if (!(ka->sa.sa_flags & SA_RESTART)) {
+ if (!(ksig->ka.sa.sa_flags & SA_RESTART)) {
regs->ax = -EINTR;
break;
}
@@ -721,26 +664,21 @@ handle_signal(unsigned long sig, siginfo_t *info, struct k_sigaction *ka,
likely(test_and_clear_thread_flag(TIF_FORCED_TF)))
regs->flags &= ~X86_EFLAGS_TF;
- if (setup_rt_frame(sig, ka, info, regs) < 0) {
- force_sigsegv(sig, current);
- return;
+ failed = (setup_rt_frame(ksig, regs) < 0);
+ if (!failed) {
+ /*
+ * Clear the direction flag as per the ABI for function entry.
+ */
+ regs->flags &= ~X86_EFLAGS_DF;
+ /*
+ * Clear TF when entering the signal handler, but
+ * notify any tracer that was single-stepping it.
+ * The tracer may want to single-step inside the
+ * handler too.
+ */
+ regs->flags &= ~X86_EFLAGS_TF;
}
-
- /*
- * Clear the direction flag as per the ABI for function entry.
- */
- regs->flags &= ~X86_EFLAGS_DF;
-
- /*
- * Clear TF when entering the signal handler, but
- * notify any tracer that was single-stepping it.
- * The tracer may want to single-step inside the
- * handler too.
- */
- regs->flags &= ~X86_EFLAGS_TF;
-
- signal_delivered(sig, info, ka, regs,
- test_thread_flag(TIF_SINGLESTEP));
+ signal_setup_done(failed, ksig, test_thread_flag(TIF_SINGLESTEP));
}
#ifdef CONFIG_X86_32
@@ -757,14 +695,11 @@ handle_signal(unsigned long sig, siginfo_t *info, struct k_sigaction *ka,
*/
static void do_signal(struct pt_regs *regs)
{
- struct k_sigaction ka;
- siginfo_t info;
- int signr;
+ struct ksignal ksig;
- signr = get_signal_to_deliver(&info, &ka, regs, NULL);
- if (signr > 0) {
+ if (get_signal(&ksig)) {
/* Whee! Actually deliver the signal. */
- handle_signal(signr, &info, &ka, regs);
+ handle_signal(&ksig, regs);
return;
}
@@ -843,8 +778,9 @@ void signal_fault(struct pt_regs *regs, void __user *frame, char *where)
}
#ifdef CONFIG_X86_X32_ABI
-asmlinkage long sys32_x32_rt_sigreturn(struct pt_regs *regs)
+asmlinkage long sys32_x32_rt_sigreturn(void)
{
+ struct pt_regs *regs = current_pt_regs();
struct rt_sigframe_x32 __user *frame;
sigset_t set;
unsigned long ax;
diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c
index ed0fe385289d..a6ceaedc396a 100644
--- a/arch/x86/kernel/smpboot.c
+++ b/arch/x86/kernel/smpboot.c
@@ -1369,7 +1369,7 @@ static inline void mwait_play_dead(void)
void *mwait_ptr;
struct cpuinfo_x86 *c = __this_cpu_ptr(&cpu_info);
- if (!(this_cpu_has(X86_FEATURE_MWAIT) && mwait_usable(c)))
+ if (!this_cpu_has(X86_FEATURE_MWAIT))
return;
if (!this_cpu_has(X86_FEATURE_CLFLSH))
return;
diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c
index ecffca11f4e9..68bda7a84159 100644
--- a/arch/x86/kernel/traps.c
+++ b/arch/x86/kernel/traps.c
@@ -688,10 +688,19 @@ void __init early_trap_init(void)
set_intr_gate_ist(X86_TRAP_DB, &debug, DEBUG_STACK);
/* int3 can be called from all */
set_system_intr_gate_ist(X86_TRAP_BP, &int3, DEBUG_STACK);
+#ifdef CONFIG_X86_32
set_intr_gate(X86_TRAP_PF, &page_fault);
+#endif
load_idt(&idt_descr);
}
+void __init early_trap_pf_init(void)
+{
+#ifdef CONFIG_X86_64
+ set_intr_gate(X86_TRAP_PF, &page_fault);
+#endif
+}
+
void __init trap_init(void)
{
int i;
diff --git a/arch/x86/kernel/vm86_32.c b/arch/x86/kernel/vm86_32.c
index 1dfe69cc78a8..1cf5766dde16 100644
--- a/arch/x86/kernel/vm86_32.c
+++ b/arch/x86/kernel/vm86_32.c
@@ -202,7 +202,7 @@ out:
static int do_vm86_irq_handling(int subfunction, int irqnumber);
static void do_sys_vm86(struct kernel_vm86_struct *info, struct task_struct *tsk);
-int sys_vm86old(struct vm86_struct __user *v86, struct pt_regs *regs)
+int sys_vm86old(struct vm86_struct __user *v86)
{
struct kernel_vm86_struct info; /* declare this _on top_,
* this avoids wasting of stack space.
@@ -222,7 +222,7 @@ int sys_vm86old(struct vm86_struct __user *v86, struct pt_regs *regs)
if (tmp)
goto out;
memset(&info.vm86plus, 0, (int)&info.regs32 - (int)&info.vm86plus);
- info.regs32 = regs;
+ info.regs32 = current_pt_regs();
tsk->thread.vm86_info = v86;
do_sys_vm86(&info, tsk);
ret = 0; /* we never return here */
@@ -231,7 +231,7 @@ out:
}
-int sys_vm86(unsigned long cmd, unsigned long arg, struct pt_regs *regs)
+int sys_vm86(unsigned long cmd, unsigned long arg)
{
struct kernel_vm86_struct info; /* declare this _on top_,
* this avoids wasting of stack space.
@@ -272,7 +272,7 @@ int sys_vm86(unsigned long cmd, unsigned long arg, struct pt_regs *regs)
ret = -EFAULT;
if (tmp)
goto out;
- info.regs32 = regs;
+ info.regs32 = current_pt_regs();
info.vm86plus.is_vm86pus = 1;
tsk->thread.vm86_info = (struct vm86_struct __user *)v86;
do_sys_vm86(&info, tsk);
diff --git a/arch/x86/kernel/x8664_ksyms_64.c b/arch/x86/kernel/x8664_ksyms_64.c
index 1330dd102950..b014d9414d08 100644
--- a/arch/x86/kernel/x8664_ksyms_64.c
+++ b/arch/x86/kernel/x8664_ksyms_64.c
@@ -59,6 +59,9 @@ EXPORT_SYMBOL(memcpy);
EXPORT_SYMBOL(__memcpy);
EXPORT_SYMBOL(memmove);
+#ifndef CONFIG_DEBUG_VIRTUAL
+EXPORT_SYMBOL(phys_base);
+#endif
EXPORT_SYMBOL(empty_zero_page);
#ifndef CONFIG_PARAVIRT
EXPORT_SYMBOL(native_load_gs_index);
diff --git a/arch/x86/kernel/x86_init.c b/arch/x86/kernel/x86_init.c
index d065d67c2672..45a14dbbddaf 100644
--- a/arch/x86/kernel/x86_init.c
+++ b/arch/x86/kernel/x86_init.c
@@ -63,10 +63,6 @@ struct x86_init_ops x86_init __initdata = {
.banner = default_banner,
},
- .mapping = {
- .pagetable_reserve = native_pagetable_reserve,
- },
-
.paging = {
.pagetable_init = native_pagetable_init,
},
diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c
index a27e76371108..a335cc6cde72 100644
--- a/arch/x86/kvm/emulate.c
+++ b/arch/x86/kvm/emulate.c
@@ -24,6 +24,7 @@
#include "kvm_cache_regs.h"
#include <linux/module.h>
#include <asm/kvm_emulate.h>
+#include <linux/stringify.h>
#include "x86.h"
#include "tss.h"
@@ -43,7 +44,7 @@
#define OpCL 9ull /* CL register (for shifts) */
#define OpImmByte 10ull /* 8-bit sign extended immediate */
#define OpOne 11ull /* Implied 1 */
-#define OpImm 12ull /* Sign extended immediate */
+#define OpImm 12ull /* Sign extended up to 32-bit immediate */
#define OpMem16 13ull /* Memory operand (16-bit). */
#define OpMem32 14ull /* Memory operand (32-bit). */
#define OpImmU 15ull /* Immediate operand, zero extended */
@@ -58,6 +59,7 @@
#define OpFS 24ull /* FS */
#define OpGS 25ull /* GS */
#define OpMem8 26ull /* 8-bit zero extended memory operand */
+#define OpImm64 27ull /* Sign extended 16/32/64-bit immediate */
#define OpBits 5 /* Width of operand field */
#define OpMask ((1ull << OpBits) - 1)
@@ -101,6 +103,7 @@
#define SrcMemFAddr (OpMemFAddr << SrcShift)
#define SrcAcc (OpAcc << SrcShift)
#define SrcImmU16 (OpImmU16 << SrcShift)
+#define SrcImm64 (OpImm64 << SrcShift)
#define SrcDX (OpDX << SrcShift)
#define SrcMem8 (OpMem8 << SrcShift)
#define SrcMask (OpMask << SrcShift)
@@ -113,6 +116,7 @@
#define GroupDual (2<<15) /* Alternate decoding of mod == 3 */
#define Prefix (3<<15) /* Instruction varies with 66/f2/f3 prefix */
#define RMExt (4<<15) /* Opcode extension in ModRM r/m if mod == 3 */
+#define Escape (5<<15) /* Escape to coprocessor instruction */
#define Sse (1<<18) /* SSE Vector instruction */
/* Generic ModRM decode. */
#define ModRM (1<<19)
@@ -146,6 +150,8 @@
#define Aligned ((u64)1 << 41) /* Explicitly aligned (e.g. MOVDQA) */
#define Unaligned ((u64)1 << 42) /* Explicitly unaligned (e.g. MOVDQU) */
#define Avx ((u64)1 << 43) /* Advanced Vector Extensions */
+#define Fastop ((u64)1 << 44) /* Use opcode::u.fastop */
+#define NoWrite ((u64)1 << 45) /* No writeback */
#define X2(x...) x, x
#define X3(x...) X2(x), x
@@ -156,6 +162,27 @@
#define X8(x...) X4(x), X4(x)
#define X16(x...) X8(x), X8(x)
+#define NR_FASTOP (ilog2(sizeof(ulong)) + 1)
+#define FASTOP_SIZE 8
+
+/*
+ * fastop functions have a special calling convention:
+ *
+ * dst: [rdx]:rax (in/out)
+ * src: rbx (in/out)
+ * src2: rcx (in)
+ * flags: rflags (in/out)
+ *
+ * Moreover, they are all exactly FASTOP_SIZE bytes long, so functions for
+ * different operand sizes can be reached by calculation, rather than a jump
+ * table (which would be bigger than the code).
+ *
+ * fastop functions are declared as taking a never-defined fastop parameter,
+ * so they can't be called from C directly.
+ */
+
+struct fastop;
+
struct opcode {
u64 flags : 56;
u64 intercept : 8;
@@ -164,6 +191,8 @@ struct opcode {
const struct opcode *group;
const struct group_dual *gdual;
const struct gprefix *gprefix;
+ const struct escape *esc;
+ void (*fastop)(struct fastop *fake);
} u;
int (*check_perm)(struct x86_emulate_ctxt *ctxt);
};
@@ -180,6 +209,11 @@ struct gprefix {
struct opcode pfx_f3;
};
+struct escape {
+ struct opcode op[8];
+ struct opcode high[64];
+};
+
/* EFLAGS bit definitions. */
#define EFLG_ID (1<<21)
#define EFLG_VIP (1<<20)
@@ -407,6 +441,97 @@ static void invalidate_registers(struct x86_emulate_ctxt *ctxt)
} \
} while (0)
+static int fastop(struct x86_emulate_ctxt *ctxt, void (*fop)(struct fastop *));
+
+#define FOP_ALIGN ".align " __stringify(FASTOP_SIZE) " \n\t"
+#define FOP_RET "ret \n\t"
+
+#define FOP_START(op) \
+ extern void em_##op(struct fastop *fake); \
+ asm(".pushsection .text, \"ax\" \n\t" \
+ ".global em_" #op " \n\t" \
+ FOP_ALIGN \
+ "em_" #op ": \n\t"
+
+#define FOP_END \
+ ".popsection")
+
+#define FOPNOP() FOP_ALIGN FOP_RET
+
+#define FOP1E(op, dst) \
+ FOP_ALIGN #op " %" #dst " \n\t" FOP_RET
+
+#define FASTOP1(op) \
+ FOP_START(op) \
+ FOP1E(op##b, al) \
+ FOP1E(op##w, ax) \
+ FOP1E(op##l, eax) \
+ ON64(FOP1E(op##q, rax)) \
+ FOP_END
+
+#define FOP2E(op, dst, src) \
+ FOP_ALIGN #op " %" #src ", %" #dst " \n\t" FOP_RET
+
+#define FASTOP2(op) \
+ FOP_START(op) \
+ FOP2E(op##b, al, bl) \
+ FOP2E(op##w, ax, bx) \
+ FOP2E(op##l, eax, ebx) \
+ ON64(FOP2E(op##q, rax, rbx)) \
+ FOP_END
+
+/* 2 operand, word only */
+#define FASTOP2W(op) \
+ FOP_START(op) \
+ FOPNOP() \
+ FOP2E(op##w, ax, bx) \
+ FOP2E(op##l, eax, ebx) \
+ ON64(FOP2E(op##q, rax, rbx)) \
+ FOP_END
+
+/* 2 operand, src is CL */
+#define FASTOP2CL(op) \
+ FOP_START(op) \
+ FOP2E(op##b, al, cl) \
+ FOP2E(op##w, ax, cl) \
+ FOP2E(op##l, eax, cl) \
+ ON64(FOP2E(op##q, rax, cl)) \
+ FOP_END
+
+#define FOP3E(op, dst, src, src2) \
+ FOP_ALIGN #op " %" #src2 ", %" #src ", %" #dst " \n\t" FOP_RET
+
+/* 3-operand, word-only, src2=cl */
+#define FASTOP3WCL(op) \
+ FOP_START(op) \
+ FOPNOP() \
+ FOP3E(op##w, ax, bx, cl) \
+ FOP3E(op##l, eax, ebx, cl) \
+ ON64(FOP3E(op##q, rax, rbx, cl)) \
+ FOP_END
+
+/* Special case for SETcc - 1 instruction per cc */
+#define FOP_SETCC(op) ".align 4; " #op " %al; ret \n\t"
+
+FOP_START(setcc)
+FOP_SETCC(seto)
+FOP_SETCC(setno)
+FOP_SETCC(setc)
+FOP_SETCC(setnc)
+FOP_SETCC(setz)
+FOP_SETCC(setnz)
+FOP_SETCC(setbe)
+FOP_SETCC(setnbe)
+FOP_SETCC(sets)
+FOP_SETCC(setns)
+FOP_SETCC(setp)
+FOP_SETCC(setnp)
+FOP_SETCC(setl)
+FOP_SETCC(setnl)
+FOP_SETCC(setle)
+FOP_SETCC(setnle)
+FOP_END;
+
#define __emulate_1op_rax_rdx(ctxt, _op, _suffix, _ex) \
do { \
unsigned long _tmp; \
@@ -663,7 +788,7 @@ static int __linearize(struct x86_emulate_ctxt *ctxt,
ulong la;
u32 lim;
u16 sel;
- unsigned cpl, rpl;
+ unsigned cpl;
la = seg_base(ctxt, addr.seg) + addr.ea;
switch (ctxt->mode) {
@@ -697,11 +822,6 @@ static int __linearize(struct x86_emulate_ctxt *ctxt,
goto bad;
}
cpl = ctxt->ops->cpl(ctxt);
- if (ctxt->mode == X86EMUL_MODE_REAL)
- rpl = 0;
- else
- rpl = sel & 3;
- cpl = max(cpl, rpl);
if (!(desc.type & 8)) {
/* data segment */
if (cpl > desc.dpl)
@@ -852,39 +972,50 @@ static int read_descriptor(struct x86_emulate_ctxt *ctxt,
return rc;
}
-static int test_cc(unsigned int condition, unsigned int flags)
-{
- int rc = 0;
-
- switch ((condition & 15) >> 1) {
- case 0: /* o */
- rc |= (flags & EFLG_OF);
- break;
- case 1: /* b/c/nae */
- rc |= (flags & EFLG_CF);
- break;
- case 2: /* z/e */
- rc |= (flags & EFLG_ZF);
- break;
- case 3: /* be/na */
- rc |= (flags & (EFLG_CF|EFLG_ZF));
- break;
- case 4: /* s */
- rc |= (flags & EFLG_SF);
- break;
- case 5: /* p/pe */
- rc |= (flags & EFLG_PF);
- break;
- case 7: /* le/ng */
- rc |= (flags & EFLG_ZF);
- /* fall through */
- case 6: /* l/nge */
- rc |= (!(flags & EFLG_SF) != !(flags & EFLG_OF));
- break;
- }
-
- /* Odd condition identifiers (lsb == 1) have inverted sense. */
- return (!!rc ^ (condition & 1));
+FASTOP2(add);
+FASTOP2(or);
+FASTOP2(adc);
+FASTOP2(sbb);
+FASTOP2(and);
+FASTOP2(sub);
+FASTOP2(xor);
+FASTOP2(cmp);
+FASTOP2(test);
+
+FASTOP3WCL(shld);
+FASTOP3WCL(shrd);
+
+FASTOP2W(imul);
+
+FASTOP1(not);
+FASTOP1(neg);
+FASTOP1(inc);
+FASTOP1(dec);
+
+FASTOP2CL(rol);
+FASTOP2CL(ror);
+FASTOP2CL(rcl);
+FASTOP2CL(rcr);
+FASTOP2CL(shl);
+FASTOP2CL(shr);
+FASTOP2CL(sar);
+
+FASTOP2W(bsf);
+FASTOP2W(bsr);
+FASTOP2W(bt);
+FASTOP2W(bts);
+FASTOP2W(btr);
+FASTOP2W(btc);
+
+static u8 test_cc(unsigned int condition, unsigned long flags)
+{
+ u8 rc;
+ void (*fop)(void) = (void *)em_setcc + 4 * (condition & 0xf);
+
+ flags = (flags & EFLAGS_MASK) | X86_EFLAGS_IF;
+ asm("push %[flags]; popf; call *%[fastop]"
+ : "=a"(rc) : [fastop]"r"(fop), [flags]"r"(flags));
+ return rc;
}
static void fetch_register_operand(struct operand *op)
@@ -994,6 +1125,53 @@ static void write_mmx_reg(struct x86_emulate_ctxt *ctxt, u64 *data, int reg)
ctxt->ops->put_fpu(ctxt);
}
+static int em_fninit(struct x86_emulate_ctxt *ctxt)
+{
+ if (ctxt->ops->get_cr(ctxt, 0) & (X86_CR0_TS | X86_CR0_EM))
+ return emulate_nm(ctxt);
+
+ ctxt->ops->get_fpu(ctxt);
+ asm volatile("fninit");
+ ctxt->ops->put_fpu(ctxt);
+ return X86EMUL_CONTINUE;
+}
+
+static int em_fnstcw(struct x86_emulate_ctxt *ctxt)
+{
+ u16 fcw;
+
+ if (ctxt->ops->get_cr(ctxt, 0) & (X86_CR0_TS | X86_CR0_EM))
+ return emulate_nm(ctxt);
+
+ ctxt->ops->get_fpu(ctxt);
+ asm volatile("fnstcw %0": "+m"(fcw));
+ ctxt->ops->put_fpu(ctxt);
+
+ /* force 2 byte destination */
+ ctxt->dst.bytes = 2;
+ ctxt->dst.val = fcw;
+
+ return X86EMUL_CONTINUE;
+}
+
+static int em_fnstsw(struct x86_emulate_ctxt *ctxt)
+{
+ u16 fsw;
+
+ if (ctxt->ops->get_cr(ctxt, 0) & (X86_CR0_TS | X86_CR0_EM))
+ return emulate_nm(ctxt);
+
+ ctxt->ops->get_fpu(ctxt);
+ asm volatile("fnstsw %0": "+m"(fsw));
+ ctxt->ops->put_fpu(ctxt);
+
+ /* force 2 byte destination */
+ ctxt->dst.bytes = 2;
+ ctxt->dst.val = fsw;
+
+ return X86EMUL_CONTINUE;
+}
+
static void decode_register_operand(struct x86_emulate_ctxt *ctxt,
struct operand *op)
{
@@ -1534,6 +1712,9 @@ static int writeback(struct x86_emulate_ctxt *ctxt)
{
int rc;
+ if (ctxt->d & NoWrite)
+ return X86EMUL_CONTINUE;
+
switch (ctxt->dst.type) {
case OP_REG:
write_register_operand(&ctxt->dst);
@@ -1918,47 +2099,6 @@ static int em_jmp_far(struct x86_emulate_ctxt *ctxt)
return X86EMUL_CONTINUE;
}
-static int em_grp2(struct x86_emulate_ctxt *ctxt)
-{
- switch (ctxt->modrm_reg) {
- case 0: /* rol */
- emulate_2op_SrcB(ctxt, "rol");
- break;
- case 1: /* ror */
- emulate_2op_SrcB(ctxt, "ror");
- break;
- case 2: /* rcl */
- emulate_2op_SrcB(ctxt, "rcl");
- break;
- case 3: /* rcr */
- emulate_2op_SrcB(ctxt, "rcr");
- break;
- case 4: /* sal/shl */
- case 6: /* sal/shl */
- emulate_2op_SrcB(ctxt, "sal");
- break;
- case 5: /* shr */
- emulate_2op_SrcB(ctxt, "shr");
- break;
- case 7: /* sar */
- emulate_2op_SrcB(ctxt, "sar");
- break;
- }
- return X86EMUL_CONTINUE;
-}
-
-static int em_not(struct x86_emulate_ctxt *ctxt)
-{
- ctxt->dst.val = ~ctxt->dst.val;
- return X86EMUL_CONTINUE;
-}
-
-static int em_neg(struct x86_emulate_ctxt *ctxt)
-{
- emulate_1op(ctxt, "neg");
- return X86EMUL_CONTINUE;
-}
-
static int em_mul_ex(struct x86_emulate_ctxt *ctxt)
{
u8 ex = 0;
@@ -2000,12 +2140,6 @@ static int em_grp45(struct x86_emulate_ctxt *ctxt)
int rc = X86EMUL_CONTINUE;
switch (ctxt->modrm_reg) {
- case 0: /* inc */
- emulate_1op(ctxt, "inc");
- break;
- case 1: /* dec */
- emulate_1op(ctxt, "dec");
- break;
case 2: /* call near abs */ {
long int old_eip;
old_eip = ctxt->_eip;
@@ -2075,7 +2209,7 @@ static int em_cmpxchg(struct x86_emulate_ctxt *ctxt)
/* Save real source value, then compare EAX against destination. */
ctxt->src.orig_val = ctxt->src.val;
ctxt->src.val = reg_read(ctxt, VCPU_REGS_RAX);
- emulate_2op_SrcV(ctxt, "cmp");
+ fastop(ctxt, em_cmp);
if (ctxt->eflags & EFLG_ZF) {
/* Success: write back to memory. */
@@ -2843,7 +2977,7 @@ static int em_das(struct x86_emulate_ctxt *ctxt)
ctxt->src.type = OP_IMM;
ctxt->src.val = 0;
ctxt->src.bytes = 1;
- emulate_2op_SrcV(ctxt, "or");
+ fastop(ctxt, em_or);
ctxt->eflags &= ~(X86_EFLAGS_AF | X86_EFLAGS_CF);
if (cf)
ctxt->eflags |= X86_EFLAGS_CF;
@@ -2852,6 +2986,24 @@ static int em_das(struct x86_emulate_ctxt *ctxt)
return X86EMUL_CONTINUE;
}
+static int em_aad(struct x86_emulate_ctxt *ctxt)
+{
+ u8 al = ctxt->dst.val & 0xff;
+ u8 ah = (ctxt->dst.val >> 8) & 0xff;
+
+ al = (al + (ah * ctxt->src.val)) & 0xff;
+
+ ctxt->dst.val = (ctxt->dst.val & 0xffff0000) | al;
+
+ /* Set PF, ZF, SF */
+ ctxt->src.type = OP_IMM;
+ ctxt->src.val = 0;
+ ctxt->src.bytes = 1;
+ fastop(ctxt, em_or);
+
+ return X86EMUL_CONTINUE;
+}
+
static int em_call(struct x86_emulate_ctxt *ctxt)
{
long rel = ctxt->src.val;
@@ -2900,64 +3052,6 @@ static int em_ret_near_imm(struct x86_emulate_ctxt *ctxt)
return X86EMUL_CONTINUE;
}
-static int em_add(struct x86_emulate_ctxt *ctxt)
-{
- emulate_2op_SrcV(ctxt, "add");
- return X86EMUL_CONTINUE;
-}
-
-static int em_or(struct x86_emulate_ctxt *ctxt)
-{
- emulate_2op_SrcV(ctxt, "or");
- return X86EMUL_CONTINUE;
-}
-
-static int em_adc(struct x86_emulate_ctxt *ctxt)
-{
- emulate_2op_SrcV(ctxt, "adc");
- return X86EMUL_CONTINUE;
-}
-
-static int em_sbb(struct x86_emulate_ctxt *ctxt)
-{
- emulate_2op_SrcV(ctxt, "sbb");
- return X86EMUL_CONTINUE;
-}
-
-static int em_and(struct x86_emulate_ctxt *ctxt)
-{
- emulate_2op_SrcV(ctxt, "and");
- return X86EMUL_CONTINUE;
-}
-
-static int em_sub(struct x86_emulate_ctxt *ctxt)
-{
- emulate_2op_SrcV(ctxt, "sub");
- return X86EMUL_CONTINUE;
-}
-
-static int em_xor(struct x86_emulate_ctxt *ctxt)
-{
- emulate_2op_SrcV(ctxt, "xor");
- return X86EMUL_CONTINUE;
-}
-
-static int em_cmp(struct x86_emulate_ctxt *ctxt)
-{
- emulate_2op_SrcV(ctxt, "cmp");
- /* Disable writeback. */
- ctxt->dst.type = OP_NONE;
- return X86EMUL_CONTINUE;
-}
-
-static int em_test(struct x86_emulate_ctxt *ctxt)
-{
- emulate_2op_SrcV(ctxt, "test");
- /* Disable writeback. */
- ctxt->dst.type = OP_NONE;
- return X86EMUL_CONTINUE;
-}
-
static int em_xchg(struct x86_emulate_ctxt *ctxt)
{
/* Write back the register source. */
@@ -2970,16 +3064,10 @@ static int em_xchg(struct x86_emulate_ctxt *ctxt)
return X86EMUL_CONTINUE;
}
-static int em_imul(struct x86_emulate_ctxt *ctxt)
-{
- emulate_2op_SrcV_nobyte(ctxt, "imul");
- return X86EMUL_CONTINUE;
-}
-
static int em_imul_3op(struct x86_emulate_ctxt *ctxt)
{
ctxt->dst.val = ctxt->src2.val;
- return em_imul(ctxt);
+ return fastop(ctxt, em_imul);
}
static int em_cwd(struct x86_emulate_ctxt *ctxt)
@@ -3300,47 +3388,6 @@ static int em_sti(struct x86_emulate_ctxt *ctxt)
return X86EMUL_CONTINUE;
}
-static int em_bt(struct x86_emulate_ctxt *ctxt)
-{
- /* Disable writeback. */
- ctxt->dst.type = OP_NONE;
- /* only subword offset */
- ctxt->src.val &= (ctxt->dst.bytes << 3) - 1;
-
- emulate_2op_SrcV_nobyte(ctxt, "bt");
- return X86EMUL_CONTINUE;
-}
-
-static int em_bts(struct x86_emulate_ctxt *ctxt)
-{
- emulate_2op_SrcV_nobyte(ctxt, "bts");
- return X86EMUL_CONTINUE;
-}
-
-static int em_btr(struct x86_emulate_ctxt *ctxt)
-{
- emulate_2op_SrcV_nobyte(ctxt, "btr");
- return X86EMUL_CONTINUE;
-}
-
-static int em_btc(struct x86_emulate_ctxt *ctxt)
-{
- emulate_2op_SrcV_nobyte(ctxt, "btc");
- return X86EMUL_CONTINUE;
-}
-
-static int em_bsf(struct x86_emulate_ctxt *ctxt)
-{
- emulate_2op_SrcV_nobyte(ctxt, "bsf");
- return X86EMUL_CONTINUE;
-}
-
-static int em_bsr(struct x86_emulate_ctxt *ctxt)
-{
- emulate_2op_SrcV_nobyte(ctxt, "bsr");
- return X86EMUL_CONTINUE;
-}
-
static int em_cpuid(struct x86_emulate_ctxt *ctxt)
{
u32 eax, ebx, ecx, edx;
@@ -3572,7 +3619,9 @@ static int check_perm_out(struct x86_emulate_ctxt *ctxt)
#define EXT(_f, _e) { .flags = ((_f) | RMExt), .u.group = (_e) }
#define G(_f, _g) { .flags = ((_f) | Group | ModRM), .u.group = (_g) }
#define GD(_f, _g) { .flags = ((_f) | GroupDual | ModRM), .u.gdual = (_g) }
+#define E(_f, _e) { .flags = ((_f) | Escape | ModRM), .u.esc = (_e) }
#define I(_f, _e) { .flags = (_f), .u.execute = (_e) }
+#define F(_f, _e) { .flags = (_f) | Fastop, .u.fastop = (_e) }
#define II(_f, _e, _i) \
{ .flags = (_f), .u.execute = (_e), .intercept = x86_intercept_##_i }
#define IIP(_f, _e, _i, _p) \
@@ -3583,12 +3632,13 @@ static int check_perm_out(struct x86_emulate_ctxt *ctxt)
#define D2bv(_f) D((_f) | ByteOp), D(_f)
#define D2bvIP(_f, _i, _p) DIP((_f) | ByteOp, _i, _p), DIP(_f, _i, _p)
#define I2bv(_f, _e) I((_f) | ByteOp, _e), I(_f, _e)
+#define F2bv(_f, _e) F((_f) | ByteOp, _e), F(_f, _e)
#define I2bvIP(_f, _e, _i, _p) \
IIP((_f) | ByteOp, _e, _i, _p), IIP(_f, _e, _i, _p)
-#define I6ALU(_f, _e) I2bv((_f) | DstMem | SrcReg | ModRM, _e), \
- I2bv(((_f) | DstReg | SrcMem | ModRM) & ~Lock, _e), \
- I2bv(((_f) & ~Lock) | DstAcc | SrcImm, _e)
+#define F6ALU(_f, _e) F2bv((_f) | DstMem | SrcReg | ModRM, _e), \
+ F2bv(((_f) | DstReg | SrcMem | ModRM) & ~Lock, _e), \
+ F2bv(((_f) & ~Lock) | DstAcc | SrcImm, _e)
static const struct opcode group7_rm1[] = {
DI(SrcNone | Priv, monitor),
@@ -3614,25 +3664,36 @@ static const struct opcode group7_rm7[] = {
};
static const struct opcode group1[] = {
- I(Lock, em_add),
- I(Lock | PageTable, em_or),
- I(Lock, em_adc),
- I(Lock, em_sbb),
- I(Lock | PageTable, em_and),
- I(Lock, em_sub),
- I(Lock, em_xor),
- I(0, em_cmp),
+ F(Lock, em_add),
+ F(Lock | PageTable, em_or),
+ F(Lock, em_adc),
+ F(Lock, em_sbb),
+ F(Lock | PageTable, em_and),
+ F(Lock, em_sub),
+ F(Lock, em_xor),
+ F(NoWrite, em_cmp),
};
static const struct opcode group1A[] = {
I(DstMem | SrcNone | Mov | Stack, em_pop), N, N, N, N, N, N, N,
};
+static const struct opcode group2[] = {
+ F(DstMem | ModRM, em_rol),
+ F(DstMem | ModRM, em_ror),
+ F(DstMem | ModRM, em_rcl),
+ F(DstMem | ModRM, em_rcr),
+ F(DstMem | ModRM, em_shl),
+ F(DstMem | ModRM, em_shr),
+ F(DstMem | ModRM, em_shl),
+ F(DstMem | ModRM, em_sar),
+};
+
static const struct opcode group3[] = {
- I(DstMem | SrcImm, em_test),
- I(DstMem | SrcImm, em_test),
- I(DstMem | SrcNone | Lock, em_not),
- I(DstMem | SrcNone | Lock, em_neg),
+ F(DstMem | SrcImm | NoWrite, em_test),
+ F(DstMem | SrcImm | NoWrite, em_test),
+ F(DstMem | SrcNone | Lock, em_not),
+ F(DstMem | SrcNone | Lock, em_neg),
I(SrcMem, em_mul_ex),
I(SrcMem, em_imul_ex),
I(SrcMem, em_div_ex),
@@ -3640,14 +3701,14 @@ static const struct opcode group3[] = {
};
static const struct opcode group4[] = {
- I(ByteOp | DstMem | SrcNone | Lock, em_grp45),
- I(ByteOp | DstMem | SrcNone | Lock, em_grp45),
+ F(ByteOp | DstMem | SrcNone | Lock, em_inc),
+ F(ByteOp | DstMem | SrcNone | Lock, em_dec),
N, N, N, N, N, N,
};
static const struct opcode group5[] = {
- I(DstMem | SrcNone | Lock, em_grp45),
- I(DstMem | SrcNone | Lock, em_grp45),
+ F(DstMem | SrcNone | Lock, em_inc),
+ F(DstMem | SrcNone | Lock, em_dec),
I(SrcMem | Stack, em_grp45),
I(SrcMemFAddr | ImplicitOps | Stack, em_call_far),
I(SrcMem | Stack, em_grp45),
@@ -3682,10 +3743,10 @@ static const struct group_dual group7 = { {
static const struct opcode group8[] = {
N, N, N, N,
- I(DstMem | SrcImmByte, em_bt),
- I(DstMem | SrcImmByte | Lock | PageTable, em_bts),
- I(DstMem | SrcImmByte | Lock, em_btr),
- I(DstMem | SrcImmByte | Lock | PageTable, em_btc),
+ F(DstMem | SrcImmByte | NoWrite, em_bt),
+ F(DstMem | SrcImmByte | Lock | PageTable, em_bts),
+ F(DstMem | SrcImmByte | Lock, em_btr),
+ F(DstMem | SrcImmByte | Lock | PageTable, em_btc),
};
static const struct group_dual group9 = { {
@@ -3707,33 +3768,96 @@ static const struct gprefix pfx_vmovntpx = {
I(0, em_mov), N, N, N,
};
+static const struct escape escape_d9 = { {
+ N, N, N, N, N, N, N, I(DstMem, em_fnstcw),
+}, {
+ /* 0xC0 - 0xC7 */
+ N, N, N, N, N, N, N, N,
+ /* 0xC8 - 0xCF */
+ N, N, N, N, N, N, N, N,
+ /* 0xD0 - 0xC7 */
+ N, N, N, N, N, N, N, N,
+ /* 0xD8 - 0xDF */
+ N, N, N, N, N, N, N, N,
+ /* 0xE0 - 0xE7 */
+ N, N, N, N, N, N, N, N,
+ /* 0xE8 - 0xEF */
+ N, N, N, N, N, N, N, N,
+ /* 0xF0 - 0xF7 */
+ N, N, N, N, N, N, N, N,
+ /* 0xF8 - 0xFF */
+ N, N, N, N, N, N, N, N,
+} };
+
+static const struct escape escape_db = { {
+ N, N, N, N, N, N, N, N,
+}, {
+ /* 0xC0 - 0xC7 */
+ N, N, N, N, N, N, N, N,
+ /* 0xC8 - 0xCF */
+ N, N, N, N, N, N, N, N,
+ /* 0xD0 - 0xC7 */
+ N, N, N, N, N, N, N, N,
+ /* 0xD8 - 0xDF */
+ N, N, N, N, N, N, N, N,
+ /* 0xE0 - 0xE7 */
+ N, N, N, I(ImplicitOps, em_fninit), N, N, N, N,
+ /* 0xE8 - 0xEF */
+ N, N, N, N, N, N, N, N,
+ /* 0xF0 - 0xF7 */
+ N, N, N, N, N, N, N, N,
+ /* 0xF8 - 0xFF */
+ N, N, N, N, N, N, N, N,
+} };
+
+static const struct escape escape_dd = { {
+ N, N, N, N, N, N, N, I(DstMem, em_fnstsw),
+}, {
+ /* 0xC0 - 0xC7 */
+ N, N, N, N, N, N, N, N,
+ /* 0xC8 - 0xCF */
+ N, N, N, N, N, N, N, N,
+ /* 0xD0 - 0xC7 */
+ N, N, N, N, N, N, N, N,
+ /* 0xD8 - 0xDF */
+ N, N, N, N, N, N, N, N,
+ /* 0xE0 - 0xE7 */
+ N, N, N, N, N, N, N, N,
+ /* 0xE8 - 0xEF */
+ N, N, N, N, N, N, N, N,
+ /* 0xF0 - 0xF7 */
+ N, N, N, N, N, N, N, N,
+ /* 0xF8 - 0xFF */
+ N, N, N, N, N, N, N, N,
+} };
+
static const struct opcode opcode_table[256] = {
/* 0x00 - 0x07 */
- I6ALU(Lock, em_add),
+ F6ALU(Lock, em_add),
I(ImplicitOps | Stack | No64 | Src2ES, em_push_sreg),
I(ImplicitOps | Stack | No64 | Src2ES, em_pop_sreg),
/* 0x08 - 0x0F */
- I6ALU(Lock | PageTable, em_or),
+ F6ALU(Lock | PageTable, em_or),
I(ImplicitOps | Stack | No64 | Src2CS, em_push_sreg),
N,
/* 0x10 - 0x17 */
- I6ALU(Lock, em_adc),
+ F6ALU(Lock, em_adc),
I(ImplicitOps | Stack | No64 | Src2SS, em_push_sreg),
I(ImplicitOps | Stack | No64 | Src2SS, em_pop_sreg),
/* 0x18 - 0x1F */
- I6ALU(Lock, em_sbb),
+ F6ALU(Lock, em_sbb),
I(ImplicitOps | Stack | No64 | Src2DS, em_push_sreg),
I(ImplicitOps | Stack | No64 | Src2DS, em_pop_sreg),
/* 0x20 - 0x27 */
- I6ALU(Lock | PageTable, em_and), N, N,
+ F6ALU(Lock | PageTable, em_and), N, N,
/* 0x28 - 0x2F */
- I6ALU(Lock, em_sub), N, I(ByteOp | DstAcc | No64, em_das),
+ F6ALU(Lock, em_sub), N, I(ByteOp | DstAcc | No64, em_das),
/* 0x30 - 0x37 */
- I6ALU(Lock, em_xor), N, N,
+ F6ALU(Lock, em_xor), N, N,
/* 0x38 - 0x3F */
- I6ALU(0, em_cmp), N, N,
+ F6ALU(NoWrite, em_cmp), N, N,
/* 0x40 - 0x4F */
- X16(D(DstReg)),
+ X8(F(DstReg, em_inc)), X8(F(DstReg, em_dec)),
/* 0x50 - 0x57 */
X8(I(SrcReg | Stack, em_push)),
/* 0x58 - 0x5F */
@@ -3757,7 +3881,7 @@ static const struct opcode opcode_table[256] = {
G(DstMem | SrcImm, group1),
G(ByteOp | DstMem | SrcImm | No64, group1),
G(DstMem | SrcImmByte, group1),
- I2bv(DstMem | SrcReg | ModRM, em_test),
+ F2bv(DstMem | SrcReg | ModRM | NoWrite, em_test),
I2bv(DstMem | SrcReg | ModRM | Lock | PageTable, em_xchg),
/* 0x88 - 0x8F */
I2bv(DstMem | SrcReg | ModRM | Mov | PageTable, em_mov),
@@ -3777,18 +3901,18 @@ static const struct opcode opcode_table[256] = {
I2bv(DstAcc | SrcMem | Mov | MemAbs, em_mov),
I2bv(DstMem | SrcAcc | Mov | MemAbs | PageTable, em_mov),
I2bv(SrcSI | DstDI | Mov | String, em_mov),
- I2bv(SrcSI | DstDI | String, em_cmp),
+ F2bv(SrcSI | DstDI | String | NoWrite, em_cmp),
/* 0xA8 - 0xAF */
- I2bv(DstAcc | SrcImm, em_test),
+ F2bv(DstAcc | SrcImm | NoWrite, em_test),
I2bv(SrcAcc | DstDI | Mov | String, em_mov),
I2bv(SrcSI | DstAcc | Mov | String, em_mov),
- I2bv(SrcAcc | DstDI | String, em_cmp),
+ F2bv(SrcAcc | DstDI | String | NoWrite, em_cmp),
/* 0xB0 - 0xB7 */
X8(I(ByteOp | DstReg | SrcImm | Mov, em_mov)),
/* 0xB8 - 0xBF */
- X8(I(DstReg | SrcImm | Mov, em_mov)),
+ X8(I(DstReg | SrcImm64 | Mov, em_mov)),
/* 0xC0 - 0xC7 */
- D2bv(DstMem | SrcImmByte | ModRM),
+ G(ByteOp | Src2ImmByte, group2), G(Src2ImmByte, group2),
I(ImplicitOps | Stack | SrcImmU16, em_ret_near_imm),
I(ImplicitOps | Stack, em_ret),
I(DstReg | SrcMemFAddr | ModRM | No64 | Src2ES, em_lseg),
@@ -3800,10 +3924,11 @@ static const struct opcode opcode_table[256] = {
D(ImplicitOps), DI(SrcImmByte, intn),
D(ImplicitOps | No64), II(ImplicitOps, em_iret, iret),
/* 0xD0 - 0xD7 */
- D2bv(DstMem | SrcOne | ModRM), D2bv(DstMem | ModRM),
- N, N, N, N,
+ G(Src2One | ByteOp, group2), G(Src2One, group2),
+ G(Src2CL | ByteOp, group2), G(Src2CL, group2),
+ N, I(DstAcc | SrcImmByte | No64, em_aad), N, N,
/* 0xD8 - 0xDF */
- N, N, N, N, N, N, N, N,
+ N, E(0, &escape_d9), N, E(0, &escape_db), N, E(0, &escape_dd), N, N,
/* 0xE0 - 0xE7 */
X3(I(SrcImmByte, em_loop)),
I(SrcImmByte, em_jcxz),
@@ -3870,28 +3995,29 @@ static const struct opcode twobyte_table[256] = {
X16(D(ByteOp | DstMem | SrcNone | ModRM| Mov)),
/* 0xA0 - 0xA7 */
I(Stack | Src2FS, em_push_sreg), I(Stack | Src2FS, em_pop_sreg),
- II(ImplicitOps, em_cpuid, cpuid), I(DstMem | SrcReg | ModRM | BitOp, em_bt),
- D(DstMem | SrcReg | Src2ImmByte | ModRM),
- D(DstMem | SrcReg | Src2CL | ModRM), N, N,
+ II(ImplicitOps, em_cpuid, cpuid),
+ F(DstMem | SrcReg | ModRM | BitOp | NoWrite, em_bt),
+ F(DstMem | SrcReg | Src2ImmByte | ModRM, em_shld),
+ F(DstMem | SrcReg | Src2CL | ModRM, em_shld), N, N,
/* 0xA8 - 0xAF */
I(Stack | Src2GS, em_push_sreg), I(Stack | Src2GS, em_pop_sreg),
DI(ImplicitOps, rsm),
- I(DstMem | SrcReg | ModRM | BitOp | Lock | PageTable, em_bts),
- D(DstMem | SrcReg | Src2ImmByte | ModRM),
- D(DstMem | SrcReg | Src2CL | ModRM),
- D(ModRM), I(DstReg | SrcMem | ModRM, em_imul),
+ F(DstMem | SrcReg | ModRM | BitOp | Lock | PageTable, em_bts),
+ F(DstMem | SrcReg | Src2ImmByte | ModRM, em_shrd),
+ F(DstMem | SrcReg | Src2CL | ModRM, em_shrd),
+ D(ModRM), F(DstReg | SrcMem | ModRM, em_imul),
/* 0xB0 - 0xB7 */
I2bv(DstMem | SrcReg | ModRM | Lock | PageTable, em_cmpxchg),
I(DstReg | SrcMemFAddr | ModRM | Src2SS, em_lseg),
- I(DstMem | SrcReg | ModRM | BitOp | Lock, em_btr),
+ F(DstMem | SrcReg | ModRM | BitOp | Lock, em_btr),
I(DstReg | SrcMemFAddr | ModRM | Src2FS, em_lseg),
I(DstReg | SrcMemFAddr | ModRM | Src2GS, em_lseg),
D(DstReg | SrcMem8 | ModRM | Mov), D(DstReg | SrcMem16 | ModRM | Mov),
/* 0xB8 - 0xBF */
N, N,
G(BitOp, group8),
- I(DstMem | SrcReg | ModRM | BitOp | Lock | PageTable, em_btc),
- I(DstReg | SrcMem | ModRM, em_bsf), I(DstReg | SrcMem | ModRM, em_bsr),
+ F(DstMem | SrcReg | ModRM | BitOp | Lock | PageTable, em_btc),
+ F(DstReg | SrcMem | ModRM, em_bsf), F(DstReg | SrcMem | ModRM, em_bsr),
D(DstReg | SrcMem8 | ModRM | Mov), D(DstReg | SrcMem16 | ModRM | Mov),
/* 0xC0 - 0xC7 */
D2bv(DstMem | SrcReg | ModRM | Lock),
@@ -3950,6 +4076,9 @@ static int decode_imm(struct x86_emulate_ctxt *ctxt, struct operand *op,
case 4:
op->val = insn_fetch(s32, ctxt);
break;
+ case 8:
+ op->val = insn_fetch(s64, ctxt);
+ break;
}
if (!sign_extension) {
switch (op->bytes) {
@@ -4028,6 +4157,9 @@ static int decode_operand(struct x86_emulate_ctxt *ctxt, struct operand *op,
case OpImm:
rc = decode_imm(ctxt, op, imm_size(ctxt), true);
break;
+ case OpImm64:
+ rc = decode_imm(ctxt, op, ctxt->op_bytes, true);
+ break;
case OpMem8:
ctxt->memop.bytes = 1;
goto mem_common;
@@ -4222,6 +4354,12 @@ done_prefixes:
case 0xf3: opcode = opcode.u.gprefix->pfx_f3; break;
}
break;
+ case Escape:
+ if (ctxt->modrm > 0xbf)
+ opcode = opcode.u.esc->high[ctxt->modrm - 0xc0];
+ else
+ opcode = opcode.u.esc->op[(ctxt->modrm >> 3) & 7];
+ break;
default:
return EMULATION_FAILED;
}
@@ -4354,6 +4492,16 @@ static void fetch_possible_mmx_operand(struct x86_emulate_ctxt *ctxt,
read_mmx_reg(ctxt, &op->mm_val, op->addr.mm);
}
+static int fastop(struct x86_emulate_ctxt *ctxt, void (*fop)(struct fastop *))
+{
+ ulong flags = (ctxt->eflags & EFLAGS_MASK) | X86_EFLAGS_IF;
+ fop += __ffs(ctxt->dst.bytes) * FASTOP_SIZE;
+ asm("push %[flags]; popf; call *%[fastop]; pushf; pop %[flags]\n"
+ : "+a"(ctxt->dst.val), "+b"(ctxt->src.val), [flags]"+D"(flags)
+ : "c"(ctxt->src2.val), [fastop]"S"(fop));
+ ctxt->eflags = (ctxt->eflags & ~EFLAGS_MASK) | (flags & EFLAGS_MASK);
+ return X86EMUL_CONTINUE;
+}
int x86_emulate_insn(struct x86_emulate_ctxt *ctxt)
{
@@ -4483,6 +4631,13 @@ special_insn:
}
if (ctxt->execute) {
+ if (ctxt->d & Fastop) {
+ void (*fop)(struct fastop *) = (void *)ctxt->execute;
+ rc = fastop(ctxt, fop);
+ if (rc != X86EMUL_CONTINUE)
+ goto done;
+ goto writeback;
+ }
rc = ctxt->execute(ctxt);
if (rc != X86EMUL_CONTINUE)
goto done;
@@ -4493,12 +4648,6 @@ special_insn:
goto twobyte_insn;
switch (ctxt->b) {
- case 0x40 ... 0x47: /* inc r16/r32 */
- emulate_1op(ctxt, "inc");
- break;
- case 0x48 ... 0x4f: /* dec r16/r32 */
- emulate_1op(ctxt, "dec");
- break;
case 0x63: /* movsxd */
if (ctxt->mode != X86EMUL_MODE_PROT64)
goto cannot_emulate;
@@ -4523,9 +4672,6 @@ special_insn:
case 8: ctxt->dst.val = (s32)ctxt->dst.val; break;
}
break;
- case 0xc0 ... 0xc1:
- rc = em_grp2(ctxt);
- break;
case 0xcc: /* int3 */
rc = emulate_int(ctxt, 3);
break;
@@ -4536,13 +4682,6 @@ special_insn:
if (ctxt->eflags & EFLG_OF)
rc = emulate_int(ctxt, 4);
break;
- case 0xd0 ... 0xd1: /* Grp2 */
- rc = em_grp2(ctxt);
- break;
- case 0xd2 ... 0xd3: /* Grp2 */
- ctxt->src.val = reg_read(ctxt, VCPU_REGS_RCX);
- rc = em_grp2(ctxt);
- break;
case 0xe9: /* jmp rel */
case 0xeb: /* jmp rel short */
jmp_rel(ctxt, ctxt->src.val);
@@ -4661,14 +4800,6 @@ twobyte_insn:
case 0x90 ... 0x9f: /* setcc r/m8 */
ctxt->dst.val = test_cc(ctxt->b, ctxt->eflags);
break;
- case 0xa4: /* shld imm8, r, r/m */
- case 0xa5: /* shld cl, r, r/m */
- emulate_2op_cl(ctxt, "shld");
- break;
- case 0xac: /* shrd imm8, r, r/m */
- case 0xad: /* shrd cl, r, r/m */
- emulate_2op_cl(ctxt, "shrd");
- break;
case 0xae: /* clflush */
break;
case 0xb6 ... 0xb7: /* movzx */
@@ -4682,7 +4813,7 @@ twobyte_insn:
(s16) ctxt->src.val;
break;
case 0xc0 ... 0xc1: /* xadd */
- emulate_2op_SrcV(ctxt, "add");
+ fastop(ctxt, em_add);
/* Write back the register source. */
ctxt->src.val = ctxt->dst.orig_val;
write_register_operand(&ctxt->src);
diff --git a/arch/x86/kvm/i8254.c b/arch/x86/kvm/i8254.c
index 11300d2fa714..c1d30b2fc9bb 100644
--- a/arch/x86/kvm/i8254.c
+++ b/arch/x86/kvm/i8254.c
@@ -122,7 +122,6 @@ static s64 __kpit_elapsed(struct kvm *kvm)
*/
remaining = hrtimer_get_remaining(&ps->timer);
elapsed = ps->period - ktime_to_ns(remaining);
- elapsed = mod_64(elapsed, ps->period);
return elapsed;
}
diff --git a/arch/x86/kvm/i8259.c b/arch/x86/kvm/i8259.c
index 848206df0967..cc31f7c06d3d 100644
--- a/arch/x86/kvm/i8259.c
+++ b/arch/x86/kvm/i8259.c
@@ -241,6 +241,8 @@ int kvm_pic_read_irq(struct kvm *kvm)
int irq, irq2, intno;
struct kvm_pic *s = pic_irqchip(kvm);
+ s->output = 0;
+
pic_lock(s);
irq = pic_get_irq(&s->pics[0]);
if (irq >= 0) {
diff --git a/arch/x86/kvm/irq.c b/arch/x86/kvm/irq.c
index 7e06ba1618bd..484bc874688b 100644
--- a/arch/x86/kvm/irq.c
+++ b/arch/x86/kvm/irq.c
@@ -38,49 +38,81 @@ int kvm_cpu_has_pending_timer(struct kvm_vcpu *vcpu)
EXPORT_SYMBOL(kvm_cpu_has_pending_timer);
/*
+ * check if there is pending interrupt from
+ * non-APIC source without intack.
+ */
+static int kvm_cpu_has_extint(struct kvm_vcpu *v)
+{
+ if (kvm_apic_accept_pic_intr(v))
+ return pic_irqchip(v->kvm)->output; /* PIC */
+ else
+ return 0;
+}
+
+/*
+ * check if there is injectable interrupt:
+ * when virtual interrupt delivery enabled,
+ * interrupt from apic will handled by hardware,
+ * we don't need to check it here.
+ */
+int kvm_cpu_has_injectable_intr(struct kvm_vcpu *v)
+{
+ if (!irqchip_in_kernel(v->kvm))
+ return v->arch.interrupt.pending;
+
+ if (kvm_cpu_has_extint(v))
+ return 1;
+
+ if (kvm_apic_vid_enabled(v->kvm))
+ return 0;
+
+ return kvm_apic_has_interrupt(v) != -1; /* LAPIC */
+}
+
+/*
* check if there is pending interrupt without
* intack.
*/
int kvm_cpu_has_interrupt(struct kvm_vcpu *v)
{
- struct kvm_pic *s;
-
if (!irqchip_in_kernel(v->kvm))
return v->arch.interrupt.pending;
- if (kvm_apic_has_interrupt(v) == -1) { /* LAPIC */
- if (kvm_apic_accept_pic_intr(v)) {
- s = pic_irqchip(v->kvm); /* PIC */
- return s->output;
- } else
- return 0;
- }
- return 1;
+ if (kvm_cpu_has_extint(v))
+ return 1;
+
+ return kvm_apic_has_interrupt(v) != -1; /* LAPIC */
}
EXPORT_SYMBOL_GPL(kvm_cpu_has_interrupt);
/*
+ * Read pending interrupt(from non-APIC source)
+ * vector and intack.
+ */
+static int kvm_cpu_get_extint(struct kvm_vcpu *v)
+{
+ if (kvm_cpu_has_extint(v))
+ return kvm_pic_read_irq(v->kvm); /* PIC */
+ return -1;
+}
+
+/*
* Read pending interrupt vector and intack.
*/
int kvm_cpu_get_interrupt(struct kvm_vcpu *v)
{
- struct kvm_pic *s;
int vector;
if (!irqchip_in_kernel(v->kvm))
return v->arch.interrupt.nr;
- vector = kvm_get_apic_interrupt(v); /* APIC */
- if (vector == -1) {
- if (kvm_apic_accept_pic_intr(v)) {
- s = pic_irqchip(v->kvm);
- s->output = 0; /* PIC */
- vector = kvm_pic_read_irq(v->kvm);
- }
- }
- return vector;
+ vector = kvm_cpu_get_extint(v);
+
+ if (kvm_apic_vid_enabled(v->kvm) || vector != -1)
+ return vector; /* PIC */
+
+ return kvm_get_apic_interrupt(v); /* APIC */
}
-EXPORT_SYMBOL_GPL(kvm_cpu_get_interrupt);
void kvm_inject_pending_timer_irqs(struct kvm_vcpu *vcpu)
{
diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c
index 9392f527f107..02b51dd4e4ad 100644
--- a/arch/x86/kvm/lapic.c
+++ b/arch/x86/kvm/lapic.c
@@ -140,31 +140,56 @@ static inline int apic_enabled(struct kvm_lapic *apic)
(LVT_MASK | APIC_MODE_MASK | APIC_INPUT_POLARITY | \
APIC_LVT_REMOTE_IRR | APIC_LVT_LEVEL_TRIGGER)
-static inline int apic_x2apic_mode(struct kvm_lapic *apic)
-{
- return apic->vcpu->arch.apic_base & X2APIC_ENABLE;
-}
-
static inline int kvm_apic_id(struct kvm_lapic *apic)
{
return (kvm_apic_get_reg(apic, APIC_ID) >> 24) & 0xff;
}
-static inline u16 apic_cluster_id(struct kvm_apic_map *map, u32 ldr)
+void kvm_calculate_eoi_exitmap(struct kvm_vcpu *vcpu,
+ struct kvm_lapic_irq *irq,
+ u64 *eoi_exit_bitmap)
{
- u16 cid;
- ldr >>= 32 - map->ldr_bits;
- cid = (ldr >> map->cid_shift) & map->cid_mask;
+ struct kvm_lapic **dst;
+ struct kvm_apic_map *map;
+ unsigned long bitmap = 1;
+ int i;
- BUG_ON(cid >= ARRAY_SIZE(map->logical_map));
+ rcu_read_lock();
+ map = rcu_dereference(vcpu->kvm->arch.apic_map);
- return cid;
-}
+ if (unlikely(!map)) {
+ __set_bit(irq->vector, (unsigned long *)eoi_exit_bitmap);
+ goto out;
+ }
-static inline u16 apic_logical_id(struct kvm_apic_map *map, u32 ldr)
-{
- ldr >>= (32 - map->ldr_bits);
- return ldr & map->lid_mask;
+ if (irq->dest_mode == 0) { /* physical mode */
+ if (irq->delivery_mode == APIC_DM_LOWEST ||
+ irq->dest_id == 0xff) {
+ __set_bit(irq->vector,
+ (unsigned long *)eoi_exit_bitmap);
+ goto out;
+ }
+ dst = &map->phys_map[irq->dest_id & 0xff];
+ } else {
+ u32 mda = irq->dest_id << (32 - map->ldr_bits);
+
+ dst = map->logical_map[apic_cluster_id(map, mda)];
+
+ bitmap = apic_logical_id(map, mda);
+ }
+
+ for_each_set_bit(i, &bitmap, 16) {
+ if (!dst[i])
+ continue;
+ if (dst[i]->vcpu == vcpu) {
+ __set_bit(irq->vector,
+ (unsigned long *)eoi_exit_bitmap);
+ break;
+ }
+ }
+
+out:
+ rcu_read_unlock();
}
static void recalculate_apic_map(struct kvm *kvm)
@@ -230,6 +255,8 @@ out:
if (old)
kfree_rcu(old, rcu);
+
+ kvm_ioapic_make_eoibitmap_request(kvm);
}
static inline void kvm_apic_set_id(struct kvm_lapic *apic, u8 id)
@@ -345,6 +372,10 @@ static inline int apic_find_highest_irr(struct kvm_lapic *apic)
{
int result;
+ /*
+ * Note that irr_pending is just a hint. It will be always
+ * true with virtual interrupt delivery enabled.
+ */
if (!apic->irr_pending)
return -1;
@@ -461,6 +492,8 @@ static void pv_eoi_clr_pending(struct kvm_vcpu *vcpu)
static inline int apic_find_highest_isr(struct kvm_lapic *apic)
{
int result;
+
+ /* Note that isr_count is always 1 with vid enabled */
if (!apic->isr_count)
return -1;
if (likely(apic->highest_isr_cache != -1))
@@ -740,6 +773,19 @@ int kvm_apic_compare_prio(struct kvm_vcpu *vcpu1, struct kvm_vcpu *vcpu2)
return vcpu1->arch.apic_arb_prio - vcpu2->arch.apic_arb_prio;
}
+static void kvm_ioapic_send_eoi(struct kvm_lapic *apic, int vector)
+{
+ if (!(kvm_apic_get_reg(apic, APIC_SPIV) & APIC_SPIV_DIRECTED_EOI) &&
+ kvm_ioapic_handles_vector(apic->vcpu->kvm, vector)) {
+ int trigger_mode;
+ if (apic_test_vector(vector, apic->regs + APIC_TMR))
+ trigger_mode = IOAPIC_LEVEL_TRIG;
+ else
+ trigger_mode = IOAPIC_EDGE_TRIG;
+ kvm_ioapic_update_eoi(apic->vcpu->kvm, vector, trigger_mode);
+ }
+}
+
static int apic_set_eoi(struct kvm_lapic *apic)
{
int vector = apic_find_highest_isr(apic);
@@ -756,19 +802,26 @@ static int apic_set_eoi(struct kvm_lapic *apic)
apic_clear_isr(vector, apic);
apic_update_ppr(apic);
- if (!(kvm_apic_get_reg(apic, APIC_SPIV) & APIC_SPIV_DIRECTED_EOI) &&
- kvm_ioapic_handles_vector(apic->vcpu->kvm, vector)) {
- int trigger_mode;
- if (apic_test_vector(vector, apic->regs + APIC_TMR))
- trigger_mode = IOAPIC_LEVEL_TRIG;
- else
- trigger_mode = IOAPIC_EDGE_TRIG;
- kvm_ioapic_update_eoi(apic->vcpu->kvm, vector, trigger_mode);
- }
+ kvm_ioapic_send_eoi(apic, vector);
kvm_make_request(KVM_REQ_EVENT, apic->vcpu);
return vector;
}
+/*
+ * this interface assumes a trap-like exit, which has already finished
+ * desired side effect including vISR and vPPR update.
+ */
+void kvm_apic_set_eoi_accelerated(struct kvm_vcpu *vcpu, int vector)
+{
+ struct kvm_lapic *apic = vcpu->arch.apic;
+
+ trace_kvm_eoi(apic, vector);
+
+ kvm_ioapic_send_eoi(apic, vector);
+ kvm_make_request(KVM_REQ_EVENT, apic->vcpu);
+}
+EXPORT_SYMBOL_GPL(kvm_apic_set_eoi_accelerated);
+
static void apic_send_ipi(struct kvm_lapic *apic)
{
u32 icr_low = kvm_apic_get_reg(apic, APIC_ICR);
@@ -1212,6 +1265,21 @@ void kvm_lapic_set_eoi(struct kvm_vcpu *vcpu)
}
EXPORT_SYMBOL_GPL(kvm_lapic_set_eoi);
+/* emulate APIC access in a trap manner */
+void kvm_apic_write_nodecode(struct kvm_vcpu *vcpu, u32 offset)
+{
+ u32 val = 0;
+
+ /* hw has done the conditional check and inst decode */
+ offset &= 0xff0;
+
+ apic_reg_read(vcpu->arch.apic, offset, 4, &val);
+
+ /* TODO: optimize to just emulate side effect w/o one more write */
+ apic_reg_write(vcpu->arch.apic, offset, val);
+}
+EXPORT_SYMBOL_GPL(kvm_apic_write_nodecode);
+
void kvm_free_lapic(struct kvm_vcpu *vcpu)
{
struct kvm_lapic *apic = vcpu->arch.apic;
@@ -1288,6 +1356,7 @@ u64 kvm_lapic_get_cr8(struct kvm_vcpu *vcpu)
void kvm_lapic_set_base(struct kvm_vcpu *vcpu, u64 value)
{
+ u64 old_value = vcpu->arch.apic_base;
struct kvm_lapic *apic = vcpu->arch.apic;
if (!apic) {
@@ -1309,11 +1378,16 @@ void kvm_lapic_set_base(struct kvm_vcpu *vcpu, u64 value)
value &= ~MSR_IA32_APICBASE_BSP;
vcpu->arch.apic_base = value;
- if (apic_x2apic_mode(apic)) {
- u32 id = kvm_apic_id(apic);
- u32 ldr = ((id >> 4) << 16) | (1 << (id & 0xf));
- kvm_apic_set_ldr(apic, ldr);
+ if ((old_value ^ value) & X2APIC_ENABLE) {
+ if (value & X2APIC_ENABLE) {
+ u32 id = kvm_apic_id(apic);
+ u32 ldr = ((id >> 4) << 16) | (1 << (id & 0xf));
+ kvm_apic_set_ldr(apic, ldr);
+ kvm_x86_ops->set_virtual_x2apic_mode(vcpu, true);
+ } else
+ kvm_x86_ops->set_virtual_x2apic_mode(vcpu, false);
}
+
apic->base_address = apic->vcpu->arch.apic_base &
MSR_IA32_APICBASE_BASE;
@@ -1359,8 +1433,8 @@ void kvm_lapic_reset(struct kvm_vcpu *vcpu)
apic_set_reg(apic, APIC_ISR + 0x10 * i, 0);
apic_set_reg(apic, APIC_TMR + 0x10 * i, 0);
}
- apic->irr_pending = false;
- apic->isr_count = 0;
+ apic->irr_pending = kvm_apic_vid_enabled(vcpu->kvm);
+ apic->isr_count = kvm_apic_vid_enabled(vcpu->kvm);
apic->highest_isr_cache = -1;
update_divide_count(apic);
atomic_set(&apic->lapic_timer.pending, 0);
@@ -1575,8 +1649,10 @@ void kvm_apic_post_state_restore(struct kvm_vcpu *vcpu,
update_divide_count(apic);
start_apic_timer(apic);
apic->irr_pending = true;
- apic->isr_count = count_vectors(apic->regs + APIC_ISR);
+ apic->isr_count = kvm_apic_vid_enabled(vcpu->kvm) ?
+ 1 : count_vectors(apic->regs + APIC_ISR);
apic->highest_isr_cache = -1;
+ kvm_x86_ops->hwapic_isr_update(vcpu->kvm, apic_find_highest_isr(apic));
kvm_make_request(KVM_REQ_EVENT, vcpu);
}
diff --git a/arch/x86/kvm/lapic.h b/arch/x86/kvm/lapic.h
index e5ebf9f3571f..1676d34ddb4e 100644
--- a/arch/x86/kvm/lapic.h
+++ b/arch/x86/kvm/lapic.h
@@ -64,6 +64,9 @@ int kvm_lapic_find_highest_irr(struct kvm_vcpu *vcpu);
u64 kvm_get_lapic_tscdeadline_msr(struct kvm_vcpu *vcpu);
void kvm_set_lapic_tscdeadline_msr(struct kvm_vcpu *vcpu, u64 data);
+void kvm_apic_write_nodecode(struct kvm_vcpu *vcpu, u32 offset);
+void kvm_apic_set_eoi_accelerated(struct kvm_vcpu *vcpu, int vector);
+
void kvm_lapic_set_vapic_addr(struct kvm_vcpu *vcpu, gpa_t vapic_addr);
void kvm_lapic_sync_from_vapic(struct kvm_vcpu *vcpu);
void kvm_lapic_sync_to_vapic(struct kvm_vcpu *vcpu);
@@ -124,4 +127,35 @@ static inline int kvm_lapic_enabled(struct kvm_vcpu *vcpu)
return kvm_apic_present(vcpu) && kvm_apic_sw_enabled(vcpu->arch.apic);
}
+static inline int apic_x2apic_mode(struct kvm_lapic *apic)
+{
+ return apic->vcpu->arch.apic_base & X2APIC_ENABLE;
+}
+
+static inline bool kvm_apic_vid_enabled(struct kvm *kvm)
+{
+ return kvm_x86_ops->vm_has_apicv(kvm);
+}
+
+static inline u16 apic_cluster_id(struct kvm_apic_map *map, u32 ldr)
+{
+ u16 cid;
+ ldr >>= 32 - map->ldr_bits;
+ cid = (ldr >> map->cid_shift) & map->cid_mask;
+
+ BUG_ON(cid >= ARRAY_SIZE(map->logical_map));
+
+ return cid;
+}
+
+static inline u16 apic_logical_id(struct kvm_apic_map *map, u32 ldr)
+{
+ ldr >>= (32 - map->ldr_bits);
+ return ldr & map->lid_mask;
+}
+
+void kvm_calculate_eoi_exitmap(struct kvm_vcpu *vcpu,
+ struct kvm_lapic_irq *irq,
+ u64 *eoi_bitmap);
+
#endif
diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index 01d7c2ad05f5..4ed3edbe06bd 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -448,7 +448,8 @@ static bool __check_direct_spte_mmio_pf(u64 spte)
static bool spte_is_locklessly_modifiable(u64 spte)
{
- return !(~spte & (SPTE_HOST_WRITEABLE | SPTE_MMU_WRITEABLE));
+ return (spte & (SPTE_HOST_WRITEABLE | SPTE_MMU_WRITEABLE)) ==
+ (SPTE_HOST_WRITEABLE | SPTE_MMU_WRITEABLE);
}
static bool spte_has_volatile_bits(u64 spte)
@@ -831,8 +832,7 @@ static int mapping_level(struct kvm_vcpu *vcpu, gfn_t large_gfn)
if (host_level == PT_PAGE_TABLE_LEVEL)
return host_level;
- max_level = kvm_x86_ops->get_lpage_level() < host_level ?
- kvm_x86_ops->get_lpage_level() : host_level;
+ max_level = min(kvm_x86_ops->get_lpage_level(), host_level);
for (level = PT_DIRECTORY_LEVEL; level <= max_level; ++level)
if (has_wrprotected_page(vcpu->kvm, large_gfn, level))
@@ -1142,7 +1142,7 @@ spte_write_protect(struct kvm *kvm, u64 *sptep, bool *flush, bool pt_protect)
}
static bool __rmap_write_protect(struct kvm *kvm, unsigned long *rmapp,
- int level, bool pt_protect)
+ bool pt_protect)
{
u64 *sptep;
struct rmap_iterator iter;
@@ -1180,7 +1180,7 @@ void kvm_mmu_write_protect_pt_masked(struct kvm *kvm,
while (mask) {
rmapp = __gfn_to_rmap(slot->base_gfn + gfn_offset + __ffs(mask),
PT_PAGE_TABLE_LEVEL, slot);
- __rmap_write_protect(kvm, rmapp, PT_PAGE_TABLE_LEVEL, false);
+ __rmap_write_protect(kvm, rmapp, false);
/* clear the first set bit */
mask &= mask - 1;
@@ -1199,7 +1199,7 @@ static bool rmap_write_protect(struct kvm *kvm, u64 gfn)
for (i = PT_PAGE_TABLE_LEVEL;
i < PT_PAGE_TABLE_LEVEL + KVM_NR_PAGE_SIZES; ++i) {
rmapp = __gfn_to_rmap(gfn, i, slot);
- write_protected |= __rmap_write_protect(kvm, rmapp, i, true);
+ write_protected |= __rmap_write_protect(kvm, rmapp, true);
}
return write_protected;
@@ -1460,28 +1460,14 @@ static inline void kvm_mod_used_mmu_pages(struct kvm *kvm, int nr)
percpu_counter_add(&kvm_total_used_mmu_pages, nr);
}
-/*
- * Remove the sp from shadow page cache, after call it,
- * we can not find this sp from the cache, and the shadow
- * page table is still valid.
- * It should be under the protection of mmu lock.
- */
-static void kvm_mmu_isolate_page(struct kvm_mmu_page *sp)
+static void kvm_mmu_free_page(struct kvm_mmu_page *sp)
{
ASSERT(is_empty_shadow_page(sp->spt));
hlist_del(&sp->hash_link);
- if (!sp->role.direct)
- free_page((unsigned long)sp->gfns);
-}
-
-/*
- * Free the shadow page table and the sp, we can do it
- * out of the protection of mmu lock.
- */
-static void kvm_mmu_free_page(struct kvm_mmu_page *sp)
-{
list_del(&sp->link);
free_page((unsigned long)sp->spt);
+ if (!sp->role.direct)
+ free_page((unsigned long)sp->gfns);
kmem_cache_free(mmu_page_header_cache, sp);
}
@@ -1522,7 +1508,6 @@ static struct kvm_mmu_page *kvm_mmu_alloc_page(struct kvm_vcpu *vcpu,
sp->gfns = mmu_memory_cache_alloc(&vcpu->arch.mmu_page_cache);
set_page_private(virt_to_page(sp->spt), (unsigned long)sp);
list_add(&sp->link, &vcpu->kvm->arch.active_mmu_pages);
- bitmap_zero(sp->slot_bitmap, KVM_MEM_SLOTS_NUM);
sp->parent_ptes = 0;
mmu_page_add_parent_pte(vcpu, sp, parent_pte);
kvm_mod_used_mmu_pages(vcpu->kvm, +1);
@@ -1973,9 +1958,9 @@ static void link_shadow_page(u64 *sptep, struct kvm_mmu_page *sp)
{
u64 spte;
- spte = __pa(sp->spt)
- | PT_PRESENT_MASK | PT_ACCESSED_MASK
- | PT_WRITABLE_MASK | PT_USER_MASK;
+ spte = __pa(sp->spt) | PT_PRESENT_MASK | PT_WRITABLE_MASK |
+ shadow_user_mask | shadow_x_mask | shadow_accessed_mask;
+
mmu_spte_set(sptep, spte);
}
@@ -2126,7 +2111,6 @@ static void kvm_mmu_commit_zap_page(struct kvm *kvm,
do {
sp = list_first_entry(invalid_list, struct kvm_mmu_page, link);
WARN_ON(!sp->role.invalid || sp->root_count);
- kvm_mmu_isolate_page(sp);
kvm_mmu_free_page(sp);
} while (!list_empty(invalid_list));
}
@@ -2144,6 +2128,8 @@ void kvm_mmu_change_mmu_pages(struct kvm *kvm, unsigned int goal_nr_mmu_pages)
* change the value
*/
+ spin_lock(&kvm->mmu_lock);
+
if (kvm->arch.n_used_mmu_pages > goal_nr_mmu_pages) {
while (kvm->arch.n_used_mmu_pages > goal_nr_mmu_pages &&
!list_empty(&kvm->arch.active_mmu_pages)) {
@@ -2158,6 +2144,8 @@ void kvm_mmu_change_mmu_pages(struct kvm *kvm, unsigned int goal_nr_mmu_pages)
}
kvm->arch.n_max_mmu_pages = goal_nr_mmu_pages;
+
+ spin_unlock(&kvm->mmu_lock);
}
int kvm_mmu_unprotect_page(struct kvm *kvm, gfn_t gfn)
@@ -2183,14 +2171,6 @@ int kvm_mmu_unprotect_page(struct kvm *kvm, gfn_t gfn)
}
EXPORT_SYMBOL_GPL(kvm_mmu_unprotect_page);
-static void page_header_update_slot(struct kvm *kvm, void *pte, gfn_t gfn)
-{
- int slot = memslot_id(kvm, gfn);
- struct kvm_mmu_page *sp = page_header(__pa(pte));
-
- __set_bit(slot, sp->slot_bitmap);
-}
-
/*
* The function is based on mtrr_type_lookup() in
* arch/x86/kernel/cpu/mtrr/generic.c
@@ -2332,9 +2312,8 @@ static int mmu_need_write_protect(struct kvm_vcpu *vcpu, gfn_t gfn,
if (s->role.level != PT_PAGE_TABLE_LEVEL)
return 1;
- if (!need_unsync && !s->unsync) {
+ if (!s->unsync)
need_unsync = true;
- }
}
if (need_unsync)
kvm_unsync_pages(vcpu, gfn);
@@ -2342,8 +2321,7 @@ static int mmu_need_write_protect(struct kvm_vcpu *vcpu, gfn_t gfn,
}
static int set_spte(struct kvm_vcpu *vcpu, u64 *sptep,
- unsigned pte_access, int user_fault,
- int write_fault, int level,
+ unsigned pte_access, int level,
gfn_t gfn, pfn_t pfn, bool speculative,
bool can_unsync, bool host_writable)
{
@@ -2378,20 +2356,13 @@ static int set_spte(struct kvm_vcpu *vcpu, u64 *sptep,
spte |= (u64)pfn << PAGE_SHIFT;
- if ((pte_access & ACC_WRITE_MASK)
- || (!vcpu->arch.mmu.direct_map && write_fault
- && !is_write_protection(vcpu) && !user_fault)) {
+ if (pte_access & ACC_WRITE_MASK) {
/*
- * There are two cases:
- * - the one is other vcpu creates new sp in the window
- * between mapping_level() and acquiring mmu-lock.
- * - the another case is the new sp is created by itself
- * (page-fault path) when guest uses the target gfn as
- * its page table.
- * Both of these cases can be fixed by allowing guest to
- * retry the access, it will refault, then we can establish
- * the mapping by using small page.
+ * Other vcpu creates new sp in the window between
+ * mapping_level() and acquiring mmu-lock. We can
+ * allow guest to retry the access, the mapping can
+ * be fixed if guest refault.
*/
if (level > PT_PAGE_TABLE_LEVEL &&
has_wrprotected_page(vcpu->kvm, gfn, level))
@@ -2399,19 +2370,6 @@ static int set_spte(struct kvm_vcpu *vcpu, u64 *sptep,
spte |= PT_WRITABLE_MASK | SPTE_MMU_WRITEABLE;
- if (!vcpu->arch.mmu.direct_map
- && !(pte_access & ACC_WRITE_MASK)) {
- spte &= ~PT_USER_MASK;
- /*
- * If we converted a user page to a kernel page,
- * so that the kernel can write to it when cr0.wp=0,
- * then we should prevent the kernel from executing it
- * if SMEP is enabled.
- */
- if (kvm_read_cr4_bits(vcpu, X86_CR4_SMEP))
- spte |= PT64_NX_MASK;
- }
-
/*
* Optimization: for pte sync, if spte was writable the hash
* lookup is unnecessary (and expensive). Write protection
@@ -2441,19 +2399,15 @@ done:
}
static void mmu_set_spte(struct kvm_vcpu *vcpu, u64 *sptep,
- unsigned pt_access, unsigned pte_access,
- int user_fault, int write_fault,
- int *emulate, int level, gfn_t gfn,
- pfn_t pfn, bool speculative,
+ unsigned pte_access, int write_fault, int *emulate,
+ int level, gfn_t gfn, pfn_t pfn, bool speculative,
bool host_writable)
{
int was_rmapped = 0;
int rmap_count;
- pgprintk("%s: spte %llx access %x write_fault %d"
- " user_fault %d gfn %llx\n",
- __func__, *sptep, pt_access,
- write_fault, user_fault, gfn);
+ pgprintk("%s: spte %llx write_fault %d gfn %llx\n", __func__,
+ *sptep, write_fault, gfn);
if (is_rmap_spte(*sptep)) {
/*
@@ -2477,9 +2431,8 @@ static void mmu_set_spte(struct kvm_vcpu *vcpu, u64 *sptep,
was_rmapped = 1;
}
- if (set_spte(vcpu, sptep, pte_access, user_fault, write_fault,
- level, gfn, pfn, speculative, true,
- host_writable)) {
+ if (set_spte(vcpu, sptep, pte_access, level, gfn, pfn, speculative,
+ true, host_writable)) {
if (write_fault)
*emulate = 1;
kvm_mmu_flush_tlb(vcpu);
@@ -2497,7 +2450,6 @@ static void mmu_set_spte(struct kvm_vcpu *vcpu, u64 *sptep,
++vcpu->kvm->stat.lpages;
if (is_shadow_present_pte(*sptep)) {
- page_header_update_slot(vcpu->kvm, sptep, gfn);
if (!was_rmapped) {
rmap_count = rmap_add(vcpu, sptep, gfn);
if (rmap_count > RMAP_RECYCLE_THRESHOLD)
@@ -2571,10 +2523,9 @@ static int direct_pte_prefetch_many(struct kvm_vcpu *vcpu,
return -1;
for (i = 0; i < ret; i++, gfn++, start++)
- mmu_set_spte(vcpu, start, ACC_ALL,
- access, 0, 0, NULL,
- sp->role.level, gfn,
- page_to_pfn(pages[i]), true, true);
+ mmu_set_spte(vcpu, start, access, 0, NULL,
+ sp->role.level, gfn, page_to_pfn(pages[i]),
+ true, true);
return 0;
}
@@ -2633,11 +2584,9 @@ static int __direct_map(struct kvm_vcpu *vcpu, gpa_t v, int write,
for_each_shadow_entry(vcpu, (u64)gfn << PAGE_SHIFT, iterator) {
if (iterator.level == level) {
- unsigned pte_access = ACC_ALL;
-
- mmu_set_spte(vcpu, iterator.sptep, ACC_ALL, pte_access,
- 0, write, &emulate,
- level, gfn, pfn, prefault, map_writable);
+ mmu_set_spte(vcpu, iterator.sptep, ACC_ALL,
+ write, &emulate, level, gfn, pfn,
+ prefault, map_writable);
direct_pte_prefetch(vcpu, iterator.sptep);
++vcpu->stat.pf_fixed;
break;
@@ -2652,11 +2601,7 @@ static int __direct_map(struct kvm_vcpu *vcpu, gpa_t v, int write,
iterator.level - 1,
1, ACC_ALL, iterator.sptep);
- mmu_spte_set(iterator.sptep,
- __pa(sp->spt)
- | PT_PRESENT_MASK | PT_WRITABLE_MASK
- | shadow_user_mask | shadow_x_mask
- | shadow_accessed_mask);
+ link_shadow_page(iterator.sptep, sp);
}
}
return emulate;
@@ -3719,6 +3664,7 @@ int kvm_init_shadow_mmu(struct kvm_vcpu *vcpu, struct kvm_mmu *context)
else
r = paging32_init_context(vcpu, context);
+ vcpu->arch.mmu.base_role.nxe = is_nx(vcpu);
vcpu->arch.mmu.base_role.cr4_pae = !!is_pae(vcpu);
vcpu->arch.mmu.base_role.cr0_wp = is_write_protection(vcpu);
vcpu->arch.mmu.base_role.smep_andnot_wp
@@ -3885,7 +3831,7 @@ static u64 mmu_pte_write_fetch_gpte(struct kvm_vcpu *vcpu, gpa_t *gpa,
/* Handle a 32-bit guest writing two halves of a 64-bit gpte */
*gpa &= ~(gpa_t)7;
*bytes = 8;
- r = kvm_read_guest(vcpu->kvm, *gpa, &gentry, min(*bytes, 8));
+ r = kvm_read_guest(vcpu->kvm, *gpa, &gentry, 8);
if (r)
gentry = 0;
new = (const u8 *)&gentry;
@@ -4039,7 +3985,7 @@ void kvm_mmu_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa,
!((sp->role.word ^ vcpu->arch.mmu.base_role.word)
& mask.word) && rmap_can_add(vcpu))
mmu_pte_write_new_pte(vcpu, sp, spte, &gentry);
- if (!remote_flush && need_remote_flush(entry, *spte))
+ if (need_remote_flush(entry, *spte))
remote_flush = true;
++spte;
}
@@ -4198,26 +4144,36 @@ int kvm_mmu_setup(struct kvm_vcpu *vcpu)
void kvm_mmu_slot_remove_write_access(struct kvm *kvm, int slot)
{
- struct kvm_mmu_page *sp;
- bool flush = false;
+ struct kvm_memory_slot *memslot;
+ gfn_t last_gfn;
+ int i;
- list_for_each_entry(sp, &kvm->arch.active_mmu_pages, link) {
- int i;
- u64 *pt;
+ memslot = id_to_memslot(kvm->memslots, slot);
+ last_gfn = memslot->base_gfn + memslot->npages - 1;
- if (!test_bit(slot, sp->slot_bitmap))
- continue;
+ spin_lock(&kvm->mmu_lock);
- pt = sp->spt;
- for (i = 0; i < PT64_ENT_PER_PAGE; ++i) {
- if (!is_shadow_present_pte(pt[i]) ||
- !is_last_spte(pt[i], sp->role.level))
- continue;
+ for (i = PT_PAGE_TABLE_LEVEL;
+ i < PT_PAGE_TABLE_LEVEL + KVM_NR_PAGE_SIZES; ++i) {
+ unsigned long *rmapp;
+ unsigned long last_index, index;
- spte_write_protect(kvm, &pt[i], &flush, false);
+ rmapp = memslot->arch.rmap[i - PT_PAGE_TABLE_LEVEL];
+ last_index = gfn_to_index(last_gfn, memslot->base_gfn, i);
+
+ for (index = 0; index <= last_index; ++index, ++rmapp) {
+ if (*rmapp)
+ __rmap_write_protect(kvm, rmapp, false);
+
+ if (need_resched() || spin_needbreak(&kvm->mmu_lock)) {
+ kvm_flush_remote_tlbs(kvm);
+ cond_resched_lock(&kvm->mmu_lock);
+ }
}
}
+
kvm_flush_remote_tlbs(kvm);
+ spin_unlock(&kvm->mmu_lock);
}
void kvm_mmu_zap_all(struct kvm *kvm)
diff --git a/arch/x86/kvm/mmutrace.h b/arch/x86/kvm/mmutrace.h
index cd6e98333ba3..b8f6172f4174 100644
--- a/arch/x86/kvm/mmutrace.h
+++ b/arch/x86/kvm/mmutrace.h
@@ -195,12 +195,6 @@ DEFINE_EVENT(kvm_mmu_page_class, kvm_mmu_prepare_zap_page,
TP_ARGS(sp)
);
-DEFINE_EVENT(kvm_mmu_page_class, kvm_mmu_delay_free_pages,
- TP_PROTO(struct kvm_mmu_page *sp),
-
- TP_ARGS(sp)
-);
-
TRACE_EVENT(
mark_mmio_spte,
TP_PROTO(u64 *sptep, gfn_t gfn, unsigned access),
diff --git a/arch/x86/kvm/paging_tmpl.h b/arch/x86/kvm/paging_tmpl.h
index 891eb6d93b8b..105dd5bd550e 100644
--- a/arch/x86/kvm/paging_tmpl.h
+++ b/arch/x86/kvm/paging_tmpl.h
@@ -151,7 +151,7 @@ static int FNAME(walk_addr_generic)(struct guest_walker *walker,
pt_element_t pte;
pt_element_t __user *uninitialized_var(ptep_user);
gfn_t table_gfn;
- unsigned index, pt_access, pte_access, accessed_dirty, shift;
+ unsigned index, pt_access, pte_access, accessed_dirty;
gpa_t pte_gpa;
int offset;
const int write_fault = access & PFERR_WRITE_MASK;
@@ -249,16 +249,12 @@ retry_walk:
if (!write_fault)
protect_clean_gpte(&pte_access, pte);
-
- /*
- * On a write fault, fold the dirty bit into accessed_dirty by shifting it one
- * place right.
- *
- * On a read fault, do nothing.
- */
- shift = write_fault >> ilog2(PFERR_WRITE_MASK);
- shift *= PT_DIRTY_SHIFT - PT_ACCESSED_SHIFT;
- accessed_dirty &= pte >> shift;
+ else
+ /*
+ * On a write fault, fold the dirty bit into accessed_dirty by
+ * shifting it one place right.
+ */
+ accessed_dirty &= pte >> (PT_DIRTY_SHIFT - PT_ACCESSED_SHIFT);
if (unlikely(!accessed_dirty)) {
ret = FNAME(update_accessed_dirty_bits)(vcpu, mmu, walker, write_fault);
@@ -330,8 +326,8 @@ FNAME(prefetch_gpte)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp,
* we call mmu_set_spte() with host_writable = true because
* pte_prefetch_gfn_to_pfn always gets a writable pfn.
*/
- mmu_set_spte(vcpu, spte, sp->role.access, pte_access, 0, 0,
- NULL, PT_PAGE_TABLE_LEVEL, gfn, pfn, true, true);
+ mmu_set_spte(vcpu, spte, pte_access, 0, NULL, PT_PAGE_TABLE_LEVEL,
+ gfn, pfn, true, true);
return true;
}
@@ -405,7 +401,7 @@ static void FNAME(pte_prefetch)(struct kvm_vcpu *vcpu, struct guest_walker *gw,
*/
static int FNAME(fetch)(struct kvm_vcpu *vcpu, gva_t addr,
struct guest_walker *gw,
- int user_fault, int write_fault, int hlevel,
+ int write_fault, int hlevel,
pfn_t pfn, bool map_writable, bool prefault)
{
struct kvm_mmu_page *sp = NULL;
@@ -413,9 +409,6 @@ static int FNAME(fetch)(struct kvm_vcpu *vcpu, gva_t addr,
unsigned direct_access, access = gw->pt_access;
int top_level, emulate = 0;
- if (!is_present_gpte(gw->ptes[gw->level - 1]))
- return 0;
-
direct_access = gw->pte_access;
top_level = vcpu->arch.mmu.root_level;
@@ -477,9 +470,8 @@ static int FNAME(fetch)(struct kvm_vcpu *vcpu, gva_t addr,
}
clear_sp_write_flooding_count(it.sptep);
- mmu_set_spte(vcpu, it.sptep, access, gw->pte_access,
- user_fault, write_fault, &emulate, it.level,
- gw->gfn, pfn, prefault, map_writable);
+ mmu_set_spte(vcpu, it.sptep, gw->pte_access, write_fault, &emulate,
+ it.level, gw->gfn, pfn, prefault, map_writable);
FNAME(pte_prefetch)(vcpu, gw, it.sptep);
return emulate;
@@ -491,6 +483,46 @@ out_gpte_changed:
return 0;
}
+ /*
+ * To see whether the mapped gfn can write its page table in the current
+ * mapping.
+ *
+ * It is the helper function of FNAME(page_fault). When guest uses large page
+ * size to map the writable gfn which is used as current page table, we should
+ * force kvm to use small page size to map it because new shadow page will be
+ * created when kvm establishes shadow page table that stop kvm using large
+ * page size. Do it early can avoid unnecessary #PF and emulation.
+ *
+ * @write_fault_to_shadow_pgtable will return true if the fault gfn is
+ * currently used as its page table.
+ *
+ * Note: the PDPT page table is not checked for PAE-32 bit guest. It is ok
+ * since the PDPT is always shadowed, that means, we can not use large page
+ * size to map the gfn which is used as PDPT.
+ */
+static bool
+FNAME(is_self_change_mapping)(struct kvm_vcpu *vcpu,
+ struct guest_walker *walker, int user_fault,
+ bool *write_fault_to_shadow_pgtable)
+{
+ int level;
+ gfn_t mask = ~(KVM_PAGES_PER_HPAGE(walker->level) - 1);
+ bool self_changed = false;
+
+ if (!(walker->pte_access & ACC_WRITE_MASK ||
+ (!is_write_protection(vcpu) && !user_fault)))
+ return false;
+
+ for (level = walker->level; level <= walker->max_level; level++) {
+ gfn_t gfn = walker->gfn ^ walker->table_gfn[level - 1];
+
+ self_changed |= !(gfn & mask);
+ *write_fault_to_shadow_pgtable |= !gfn;
+ }
+
+ return self_changed;
+}
+
/*
* Page fault handler. There are several causes for a page fault:
* - there is no shadow pte for the guest pte
@@ -516,7 +548,7 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gva_t addr, u32 error_code,
int level = PT_PAGE_TABLE_LEVEL;
int force_pt_level;
unsigned long mmu_seq;
- bool map_writable;
+ bool map_writable, is_self_change_mapping;
pgprintk("%s: addr %lx err %x\n", __func__, addr, error_code);
@@ -544,8 +576,14 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gva_t addr, u32 error_code,
return 0;
}
+ vcpu->arch.write_fault_to_shadow_pgtable = false;
+
+ is_self_change_mapping = FNAME(is_self_change_mapping)(vcpu,
+ &walker, user_fault, &vcpu->arch.write_fault_to_shadow_pgtable);
+
if (walker.level >= PT_DIRECTORY_LEVEL)
- force_pt_level = mapping_level_dirty_bitmap(vcpu, walker.gfn);
+ force_pt_level = mapping_level_dirty_bitmap(vcpu, walker.gfn)
+ || is_self_change_mapping;
else
force_pt_level = 1;
if (!force_pt_level) {
@@ -564,6 +602,26 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gva_t addr, u32 error_code,
walker.gfn, pfn, walker.pte_access, &r))
return r;
+ /*
+ * Do not change pte_access if the pfn is a mmio page, otherwise
+ * we will cache the incorrect access into mmio spte.
+ */
+ if (write_fault && !(walker.pte_access & ACC_WRITE_MASK) &&
+ !is_write_protection(vcpu) && !user_fault &&
+ !is_noslot_pfn(pfn)) {
+ walker.pte_access |= ACC_WRITE_MASK;
+ walker.pte_access &= ~ACC_USER_MASK;
+
+ /*
+ * If we converted a user page to a kernel page,
+ * so that the kernel can write to it when cr0.wp=0,
+ * then we should prevent the kernel from executing it
+ * if SMEP is enabled.
+ */
+ if (kvm_read_cr4_bits(vcpu, X86_CR4_SMEP))
+ walker.pte_access &= ~ACC_EXEC_MASK;
+ }
+
spin_lock(&vcpu->kvm->mmu_lock);
if (mmu_notifier_retry(vcpu->kvm, mmu_seq))
goto out_unlock;
@@ -572,7 +630,7 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gva_t addr, u32 error_code,
kvm_mmu_free_some_pages(vcpu);
if (!force_pt_level)
transparent_hugepage_adjust(vcpu, &walker.gfn, &pfn, &level);
- r = FNAME(fetch)(vcpu, addr, &walker, user_fault, write_fault,
+ r = FNAME(fetch)(vcpu, addr, &walker, write_fault,
level, pfn, map_writable, prefault);
++vcpu->stat.pf_fixed;
kvm_mmu_audit(vcpu, AUDIT_POST_PAGE_FAULT);
@@ -747,7 +805,7 @@ static int FNAME(sync_page)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp)
host_writable = sp->spt[i] & SPTE_HOST_WRITEABLE;
- set_spte(vcpu, &sp->spt[i], pte_access, 0, 0,
+ set_spte(vcpu, &sp->spt[i], pte_access,
PT_PAGE_TABLE_LEVEL, gfn,
spte_to_pfn(sp->spt[i]), true, false,
host_writable);
diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c
index d29d3cd1c156..e1b1ce21bc00 100644
--- a/arch/x86/kvm/svm.c
+++ b/arch/x86/kvm/svm.c
@@ -3571,6 +3571,26 @@ static void update_cr8_intercept(struct kvm_vcpu *vcpu, int tpr, int irr)
set_cr_intercept(svm, INTERCEPT_CR8_WRITE);
}
+static void svm_set_virtual_x2apic_mode(struct kvm_vcpu *vcpu, bool set)
+{
+ return;
+}
+
+static int svm_vm_has_apicv(struct kvm *kvm)
+{
+ return 0;
+}
+
+static void svm_load_eoi_exitmap(struct kvm_vcpu *vcpu, u64 *eoi_exit_bitmap)
+{
+ return;
+}
+
+static void svm_hwapic_isr_update(struct kvm *kvm, int isr)
+{
+ return;
+}
+
static int svm_nmi_allowed(struct kvm_vcpu *vcpu)
{
struct vcpu_svm *svm = to_svm(vcpu);
@@ -4290,6 +4310,10 @@ static struct kvm_x86_ops svm_x86_ops = {
.enable_nmi_window = enable_nmi_window,
.enable_irq_window = enable_irq_window,
.update_cr8_intercept = update_cr8_intercept,
+ .set_virtual_x2apic_mode = svm_set_virtual_x2apic_mode,
+ .vm_has_apicv = svm_vm_has_apicv,
+ .load_eoi_exitmap = svm_load_eoi_exitmap,
+ .hwapic_isr_update = svm_hwapic_isr_update,
.set_tss_addr = svm_set_tss_addr,
.get_tdp_level = get_npt_level,
diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index 9120ae1901e4..6667042714cc 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -84,6 +84,8 @@ module_param(vmm_exclusive, bool, S_IRUGO);
static bool __read_mostly fasteoi = 1;
module_param(fasteoi, bool, S_IRUGO);
+static bool __read_mostly enable_apicv_reg_vid;
+
/*
* If nested=1, nested virtualization is supported, i.e., guests may use
* VMX and be a hypervisor for its own guests. If nested=0, guests may not
@@ -92,12 +94,8 @@ module_param(fasteoi, bool, S_IRUGO);
static bool __read_mostly nested = 0;
module_param(nested, bool, S_IRUGO);
-#define KVM_GUEST_CR0_MASK_UNRESTRICTED_GUEST \
- (X86_CR0_WP | X86_CR0_NE | X86_CR0_NW | X86_CR0_CD)
-#define KVM_GUEST_CR0_MASK \
- (KVM_GUEST_CR0_MASK_UNRESTRICTED_GUEST | X86_CR0_PG | X86_CR0_PE)
-#define KVM_VM_CR0_ALWAYS_ON_UNRESTRICTED_GUEST \
- (X86_CR0_WP | X86_CR0_NE)
+#define KVM_GUEST_CR0_MASK (X86_CR0_NW | X86_CR0_CD)
+#define KVM_VM_CR0_ALWAYS_ON_UNRESTRICTED_GUEST (X86_CR0_WP | X86_CR0_NE)
#define KVM_VM_CR0_ALWAYS_ON \
(KVM_VM_CR0_ALWAYS_ON_UNRESTRICTED_GUEST | X86_CR0_PG | X86_CR0_PE)
#define KVM_CR4_GUEST_OWNED_BITS \
@@ -624,6 +622,8 @@ static void vmx_set_segment(struct kvm_vcpu *vcpu,
struct kvm_segment *var, int seg);
static void vmx_get_segment(struct kvm_vcpu *vcpu,
struct kvm_segment *var, int seg);
+static bool guest_state_valid(struct kvm_vcpu *vcpu);
+static u32 vmx_segment_access_rights(struct kvm_segment *var);
static DEFINE_PER_CPU(struct vmcs *, vmxarea);
static DEFINE_PER_CPU(struct vmcs *, current_vmcs);
@@ -638,6 +638,8 @@ static unsigned long *vmx_io_bitmap_a;
static unsigned long *vmx_io_bitmap_b;
static unsigned long *vmx_msr_bitmap_legacy;
static unsigned long *vmx_msr_bitmap_longmode;
+static unsigned long *vmx_msr_bitmap_legacy_x2apic;
+static unsigned long *vmx_msr_bitmap_longmode_x2apic;
static bool cpu_has_load_ia32_efer;
static bool cpu_has_load_perf_global_ctrl;
@@ -762,6 +764,24 @@ static inline bool cpu_has_vmx_virtualize_apic_accesses(void)
SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES;
}
+static inline bool cpu_has_vmx_virtualize_x2apic_mode(void)
+{
+ return vmcs_config.cpu_based_2nd_exec_ctrl &
+ SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE;
+}
+
+static inline bool cpu_has_vmx_apic_register_virt(void)
+{
+ return vmcs_config.cpu_based_2nd_exec_ctrl &
+ SECONDARY_EXEC_APIC_REGISTER_VIRT;
+}
+
+static inline bool cpu_has_vmx_virtual_intr_delivery(void)
+{
+ return vmcs_config.cpu_based_2nd_exec_ctrl &
+ SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY;
+}
+
static inline bool cpu_has_vmx_flexpriority(void)
{
return cpu_has_vmx_tpr_shadow() &&
@@ -1694,7 +1714,6 @@ static unsigned long vmx_get_rflags(struct kvm_vcpu *vcpu)
static void vmx_set_rflags(struct kvm_vcpu *vcpu, unsigned long rflags)
{
__set_bit(VCPU_EXREG_RFLAGS, (ulong *)&vcpu->arch.regs_avail);
- __clear_bit(VCPU_EXREG_CPL, (ulong *)&vcpu->arch.regs_avail);
to_vmx(vcpu)->rflags = rflags;
if (to_vmx(vcpu)->rmode.vm86_active) {
to_vmx(vcpu)->rmode.save_rflags = rflags;
@@ -1820,6 +1839,25 @@ static void move_msr_up(struct vcpu_vmx *vmx, int from, int to)
vmx->guest_msrs[from] = tmp;
}
+static void vmx_set_msr_bitmap(struct kvm_vcpu *vcpu)
+{
+ unsigned long *msr_bitmap;
+
+ if (irqchip_in_kernel(vcpu->kvm) && apic_x2apic_mode(vcpu->arch.apic)) {
+ if (is_long_mode(vcpu))
+ msr_bitmap = vmx_msr_bitmap_longmode_x2apic;
+ else
+ msr_bitmap = vmx_msr_bitmap_legacy_x2apic;
+ } else {
+ if (is_long_mode(vcpu))
+ msr_bitmap = vmx_msr_bitmap_longmode;
+ else
+ msr_bitmap = vmx_msr_bitmap_legacy;
+ }
+
+ vmcs_write64(MSR_BITMAP, __pa(msr_bitmap));
+}
+
/*
* Set up the vmcs to automatically save and restore system
* msrs. Don't touch the 64-bit msrs if the guest is in legacy
@@ -1828,7 +1866,6 @@ static void move_msr_up(struct vcpu_vmx *vmx, int from, int to)
static void setup_msrs(struct vcpu_vmx *vmx)
{
int save_nmsrs, index;
- unsigned long *msr_bitmap;
save_nmsrs = 0;
#ifdef CONFIG_X86_64
@@ -1860,14 +1897,8 @@ static void setup_msrs(struct vcpu_vmx *vmx)
vmx->save_nmsrs = save_nmsrs;
- if (cpu_has_vmx_msr_bitmap()) {
- if (is_long_mode(&vmx->vcpu))
- msr_bitmap = vmx_msr_bitmap_longmode;
- else
- msr_bitmap = vmx_msr_bitmap_legacy;
-
- vmcs_write64(MSR_BITMAP, __pa(msr_bitmap));
- }
+ if (cpu_has_vmx_msr_bitmap())
+ vmx_set_msr_bitmap(&vmx->vcpu);
}
/*
@@ -2533,13 +2564,16 @@ static __init int setup_vmcs_config(struct vmcs_config *vmcs_conf)
if (_cpu_based_exec_control & CPU_BASED_ACTIVATE_SECONDARY_CONTROLS) {
min2 = 0;
opt2 = SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES |
+ SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE |
SECONDARY_EXEC_WBINVD_EXITING |
SECONDARY_EXEC_ENABLE_VPID |
SECONDARY_EXEC_ENABLE_EPT |
SECONDARY_EXEC_UNRESTRICTED_GUEST |
SECONDARY_EXEC_PAUSE_LOOP_EXITING |
SECONDARY_EXEC_RDTSCP |
- SECONDARY_EXEC_ENABLE_INVPCID;
+ SECONDARY_EXEC_ENABLE_INVPCID |
+ SECONDARY_EXEC_APIC_REGISTER_VIRT |
+ SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY;
if (adjust_vmx_controls(min2, opt2,
MSR_IA32_VMX_PROCBASED_CTLS2,
&_cpu_based_2nd_exec_control) < 0)
@@ -2550,6 +2584,13 @@ static __init int setup_vmcs_config(struct vmcs_config *vmcs_conf)
SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES))
_cpu_based_exec_control &= ~CPU_BASED_TPR_SHADOW;
#endif
+
+ if (!(_cpu_based_exec_control & CPU_BASED_TPR_SHADOW))
+ _cpu_based_2nd_exec_control &= ~(
+ SECONDARY_EXEC_APIC_REGISTER_VIRT |
+ SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE |
+ SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY);
+
if (_cpu_based_2nd_exec_control & SECONDARY_EXEC_ENABLE_EPT) {
/* CR3 accesses and invlpg don't need to cause VM Exits when EPT
enabled */
@@ -2747,6 +2788,15 @@ static __init int hardware_setup(void)
if (!cpu_has_vmx_ple())
ple_gap = 0;
+ if (!cpu_has_vmx_apic_register_virt() ||
+ !cpu_has_vmx_virtual_intr_delivery())
+ enable_apicv_reg_vid = 0;
+
+ if (enable_apicv_reg_vid)
+ kvm_x86_ops->update_cr8_intercept = NULL;
+ else
+ kvm_x86_ops->hwapic_irr_update = NULL;
+
if (nested)
nested_vmx_setup_ctls_msrs();
@@ -2758,18 +2808,28 @@ static __exit void hardware_unsetup(void)
free_kvm_area();
}
-static void fix_pmode_dataseg(struct kvm_vcpu *vcpu, int seg, struct kvm_segment *save)
+static bool emulation_required(struct kvm_vcpu *vcpu)
{
- const struct kvm_vmx_segment_field *sf = &kvm_vmx_segment_fields[seg];
- struct kvm_segment tmp = *save;
+ return emulate_invalid_guest_state && !guest_state_valid(vcpu);
+}
- if (!(vmcs_readl(sf->base) == tmp.base && tmp.s)) {
- tmp.base = vmcs_readl(sf->base);
- tmp.selector = vmcs_read16(sf->selector);
- tmp.dpl = tmp.selector & SELECTOR_RPL_MASK;
- tmp.s = 1;
+static void fix_pmode_seg(struct kvm_vcpu *vcpu, int seg,
+ struct kvm_segment *save)
+{
+ if (!emulate_invalid_guest_state) {
+ /*
+ * CS and SS RPL should be equal during guest entry according
+ * to VMX spec, but in reality it is not always so. Since vcpu
+ * is in the middle of the transition from real mode to
+ * protected mode it is safe to assume that RPL 0 is a good
+ * default value.
+ */
+ if (seg == VCPU_SREG_CS || seg == VCPU_SREG_SS)
+ save->selector &= ~SELECTOR_RPL_MASK;
+ save->dpl = save->selector & SELECTOR_RPL_MASK;
+ save->s = 1;
}
- vmx_set_segment(vcpu, &tmp, seg);
+ vmx_set_segment(vcpu, save, seg);
}
static void enter_pmode(struct kvm_vcpu *vcpu)
@@ -2777,7 +2837,17 @@ static void enter_pmode(struct kvm_vcpu *vcpu)
unsigned long flags;
struct vcpu_vmx *vmx = to_vmx(vcpu);
- vmx->emulation_required = 1;
+ /*
+ * Update real mode segment cache. It may be not up-to-date if sement
+ * register was written while vcpu was in a guest mode.
+ */
+ vmx_get_segment(vcpu, &vmx->rmode.segs[VCPU_SREG_ES], VCPU_SREG_ES);
+ vmx_get_segment(vcpu, &vmx->rmode.segs[VCPU_SREG_DS], VCPU_SREG_DS);
+ vmx_get_segment(vcpu, &vmx->rmode.segs[VCPU_SREG_FS], VCPU_SREG_FS);
+ vmx_get_segment(vcpu, &vmx->rmode.segs[VCPU_SREG_GS], VCPU_SREG_GS);
+ vmx_get_segment(vcpu, &vmx->rmode.segs[VCPU_SREG_SS], VCPU_SREG_SS);
+ vmx_get_segment(vcpu, &vmx->rmode.segs[VCPU_SREG_CS], VCPU_SREG_CS);
+
vmx->rmode.vm86_active = 0;
vmx_segment_cache_clear(vmx);
@@ -2794,22 +2864,16 @@ static void enter_pmode(struct kvm_vcpu *vcpu)
update_exception_bitmap(vcpu);
- if (emulate_invalid_guest_state)
- return;
-
- fix_pmode_dataseg(vcpu, VCPU_SREG_ES, &vmx->rmode.segs[VCPU_SREG_ES]);
- fix_pmode_dataseg(vcpu, VCPU_SREG_DS, &vmx->rmode.segs[VCPU_SREG_DS]);
- fix_pmode_dataseg(vcpu, VCPU_SREG_FS, &vmx->rmode.segs[VCPU_SREG_FS]);
- fix_pmode_dataseg(vcpu, VCPU_SREG_GS, &vmx->rmode.segs[VCPU_SREG_GS]);
-
- vmx_segment_cache_clear(vmx);
+ fix_pmode_seg(vcpu, VCPU_SREG_CS, &vmx->rmode.segs[VCPU_SREG_CS]);
+ fix_pmode_seg(vcpu, VCPU_SREG_SS, &vmx->rmode.segs[VCPU_SREG_SS]);
+ fix_pmode_seg(vcpu, VCPU_SREG_ES, &vmx->rmode.segs[VCPU_SREG_ES]);
+ fix_pmode_seg(vcpu, VCPU_SREG_DS, &vmx->rmode.segs[VCPU_SREG_DS]);
+ fix_pmode_seg(vcpu, VCPU_SREG_FS, &vmx->rmode.segs[VCPU_SREG_FS]);
+ fix_pmode_seg(vcpu, VCPU_SREG_GS, &vmx->rmode.segs[VCPU_SREG_GS]);
- vmcs_write16(GUEST_SS_SELECTOR, 0);
- vmcs_write32(GUEST_SS_AR_BYTES, 0x93);
-
- vmcs_write16(GUEST_CS_SELECTOR,
- vmcs_read16(GUEST_CS_SELECTOR) & ~SELECTOR_RPL_MASK);
- vmcs_write32(GUEST_CS_AR_BYTES, 0x9b);
+ /* CPL is always 0 when CPU enters protected mode */
+ __set_bit(VCPU_EXREG_CPL, (ulong *)&vcpu->arch.regs_avail);
+ vmx->cpl = 0;
}
static gva_t rmode_tss_base(struct kvm *kvm)
@@ -2831,36 +2895,51 @@ static gva_t rmode_tss_base(struct kvm *kvm)
static void fix_rmode_seg(int seg, struct kvm_segment *save)
{
const struct kvm_vmx_segment_field *sf = &kvm_vmx_segment_fields[seg];
-
- vmcs_write16(sf->selector, save->base >> 4);
- vmcs_write32(sf->base, save->base & 0xffff0);
- vmcs_write32(sf->limit, 0xffff);
- vmcs_write32(sf->ar_bytes, 0xf3);
- if (save->base & 0xf)
- printk_once(KERN_WARNING "kvm: segment base is not paragraph"
- " aligned when entering protected mode (seg=%d)",
- seg);
+ struct kvm_segment var = *save;
+
+ var.dpl = 0x3;
+ if (seg == VCPU_SREG_CS)
+ var.type = 0x3;
+
+ if (!emulate_invalid_guest_state) {
+ var.selector = var.base >> 4;
+ var.base = var.base & 0xffff0;
+ var.limit = 0xffff;
+ var.g = 0;
+ var.db = 0;
+ var.present = 1;
+ var.s = 1;
+ var.l = 0;
+ var.unusable = 0;
+ var.type = 0x3;
+ var.avl = 0;
+ if (save->base & 0xf)
+ printk_once(KERN_WARNING "kvm: segment base is not "
+ "paragraph aligned when entering "
+ "protected mode (seg=%d)", seg);
+ }
+
+ vmcs_write16(sf->selector, var.selector);
+ vmcs_write32(sf->base, var.base);
+ vmcs_write32(sf->limit, var.limit);
+ vmcs_write32(sf->ar_bytes, vmx_segment_access_rights(&var));
}
static void enter_rmode(struct kvm_vcpu *vcpu)
{
unsigned long flags;
struct vcpu_vmx *vmx = to_vmx(vcpu);
- struct kvm_segment var;
-
- if (enable_unrestricted_guest)
- return;
vmx_get_segment(vcpu, &vmx->rmode.segs[VCPU_SREG_TR], VCPU_SREG_TR);
vmx_get_segment(vcpu, &vmx->rmode.segs[VCPU_SREG_ES], VCPU_SREG_ES);
vmx_get_segment(vcpu, &vmx->rmode.segs[VCPU_SREG_DS], VCPU_SREG_DS);
vmx_get_segment(vcpu, &vmx->rmode.segs[VCPU_SREG_FS], VCPU_SREG_FS);
vmx_get_segment(vcpu, &vmx->rmode.segs[VCPU_SREG_GS], VCPU_SREG_GS);
+ vmx_get_segment(vcpu, &vmx->rmode.segs[VCPU_SREG_SS], VCPU_SREG_SS);
+ vmx_get_segment(vcpu, &vmx->rmode.segs[VCPU_SREG_CS], VCPU_SREG_CS);
- vmx->emulation_required = 1;
vmx->rmode.vm86_active = 1;
-
/*
* Very old userspace does not call KVM_SET_TSS_ADDR before entering
* vcpu. Call it here with phys address pointing 16M below 4G.
@@ -2888,28 +2967,13 @@ static void enter_rmode(struct kvm_vcpu *vcpu)
vmcs_writel(GUEST_CR4, vmcs_readl(GUEST_CR4) | X86_CR4_VME);
update_exception_bitmap(vcpu);
- if (emulate_invalid_guest_state)
- goto continue_rmode;
-
- vmx_get_segment(vcpu, &var, VCPU_SREG_SS);
- vmx_set_segment(vcpu, &var, VCPU_SREG_SS);
-
- vmx_get_segment(vcpu, &var, VCPU_SREG_CS);
- vmx_set_segment(vcpu, &var, VCPU_SREG_CS);
-
- vmx_get_segment(vcpu, &var, VCPU_SREG_ES);
- vmx_set_segment(vcpu, &var, VCPU_SREG_ES);
-
- vmx_get_segment(vcpu, &var, VCPU_SREG_DS);
- vmx_set_segment(vcpu, &var, VCPU_SREG_DS);
+ fix_rmode_seg(VCPU_SREG_SS, &vmx->rmode.segs[VCPU_SREG_SS]);
+ fix_rmode_seg(VCPU_SREG_CS, &vmx->rmode.segs[VCPU_SREG_CS]);
+ fix_rmode_seg(VCPU_SREG_ES, &vmx->rmode.segs[VCPU_SREG_ES]);
+ fix_rmode_seg(VCPU_SREG_DS, &vmx->rmode.segs[VCPU_SREG_DS]);
+ fix_rmode_seg(VCPU_SREG_GS, &vmx->rmode.segs[VCPU_SREG_GS]);
+ fix_rmode_seg(VCPU_SREG_FS, &vmx->rmode.segs[VCPU_SREG_FS]);
- vmx_get_segment(vcpu, &var, VCPU_SREG_GS);
- vmx_set_segment(vcpu, &var, VCPU_SREG_GS);
-
- vmx_get_segment(vcpu, &var, VCPU_SREG_FS);
- vmx_set_segment(vcpu, &var, VCPU_SREG_FS);
-
-continue_rmode:
kvm_mmu_reset_context(vcpu);
}
@@ -3068,17 +3132,18 @@ static void vmx_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0)
struct vcpu_vmx *vmx = to_vmx(vcpu);
unsigned long hw_cr0;
+ hw_cr0 = (cr0 & ~KVM_GUEST_CR0_MASK);
if (enable_unrestricted_guest)
- hw_cr0 = (cr0 & ~KVM_GUEST_CR0_MASK_UNRESTRICTED_GUEST)
- | KVM_VM_CR0_ALWAYS_ON_UNRESTRICTED_GUEST;
- else
- hw_cr0 = (cr0 & ~KVM_GUEST_CR0_MASK) | KVM_VM_CR0_ALWAYS_ON;
+ hw_cr0 |= KVM_VM_CR0_ALWAYS_ON_UNRESTRICTED_GUEST;
+ else {
+ hw_cr0 |= KVM_VM_CR0_ALWAYS_ON;
- if (vmx->rmode.vm86_active && (cr0 & X86_CR0_PE))
- enter_pmode(vcpu);
+ if (vmx->rmode.vm86_active && (cr0 & X86_CR0_PE))
+ enter_pmode(vcpu);
- if (!vmx->rmode.vm86_active && !(cr0 & X86_CR0_PE))
- enter_rmode(vcpu);
+ if (!vmx->rmode.vm86_active && !(cr0 & X86_CR0_PE))
+ enter_rmode(vcpu);
+ }
#ifdef CONFIG_X86_64
if (vcpu->arch.efer & EFER_LME) {
@@ -3098,7 +3163,9 @@ static void vmx_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0)
vmcs_writel(CR0_READ_SHADOW, cr0);
vmcs_writel(GUEST_CR0, hw_cr0);
vcpu->arch.cr0 = cr0;
- __clear_bit(VCPU_EXREG_CPL, (ulong *)&vcpu->arch.regs_avail);
+
+ /* depends on vcpu->arch.cr0 to be set to a new value */
+ vmx->emulation_required = emulation_required(vcpu);
}
static u64 construct_eptp(unsigned long root_hpa)
@@ -3155,6 +3222,14 @@ static int vmx_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4)
if (!is_paging(vcpu)) {
hw_cr4 &= ~X86_CR4_PAE;
hw_cr4 |= X86_CR4_PSE;
+ /*
+ * SMEP is disabled if CPU is in non-paging mode in
+ * hardware. However KVM always uses paging mode to
+ * emulate guest non-paging mode with TDP.
+ * To emulate this behavior, SMEP needs to be manually
+ * disabled when guest switches to non-paging mode.
+ */
+ hw_cr4 &= ~X86_CR4_SMEP;
} else if (!(cr4 & X86_CR4_PAE)) {
hw_cr4 &= ~X86_CR4_PAE;
}
@@ -3171,10 +3246,7 @@ static void vmx_get_segment(struct kvm_vcpu *vcpu,
struct vcpu_vmx *vmx = to_vmx(vcpu);
u32 ar;
- if (vmx->rmode.vm86_active
- && (seg == VCPU_SREG_TR || seg == VCPU_SREG_ES
- || seg == VCPU_SREG_DS || seg == VCPU_SREG_FS
- || seg == VCPU_SREG_GS)) {
+ if (vmx->rmode.vm86_active && seg != VCPU_SREG_LDTR) {
*var = vmx->rmode.segs[seg];
if (seg == VCPU_SREG_TR
|| var->selector == vmx_read_guest_seg_selector(vmx, seg))
@@ -3187,8 +3259,6 @@ static void vmx_get_segment(struct kvm_vcpu *vcpu,
var->limit = vmx_read_guest_seg_limit(vmx, seg);
var->selector = vmx_read_guest_seg_selector(vmx, seg);
ar = vmx_read_guest_seg_ar(vmx, seg);
- if ((ar & AR_UNUSABLE_MASK) && !emulate_invalid_guest_state)
- ar = 0;
var->type = ar & 15;
var->s = (ar >> 4) & 1;
var->dpl = (ar >> 5) & 3;
@@ -3211,8 +3281,10 @@ static u64 vmx_get_segment_base(struct kvm_vcpu *vcpu, int seg)
return vmx_read_guest_seg_base(to_vmx(vcpu), seg);
}
-static int __vmx_get_cpl(struct kvm_vcpu *vcpu)
+static int vmx_get_cpl(struct kvm_vcpu *vcpu)
{
+ struct vcpu_vmx *vmx = to_vmx(vcpu);
+
if (!is_protmode(vcpu))
return 0;
@@ -3220,24 +3292,9 @@ static int __vmx_get_cpl(struct kvm_vcpu *vcpu)
&& (kvm_get_rflags(vcpu) & X86_EFLAGS_VM)) /* if virtual 8086 */
return 3;
- return vmx_read_guest_seg_selector(to_vmx(vcpu), VCPU_SREG_CS) & 3;
-}
-
-static int vmx_get_cpl(struct kvm_vcpu *vcpu)
-{
- struct vcpu_vmx *vmx = to_vmx(vcpu);
-
- /*
- * If we enter real mode with cs.sel & 3 != 0, the normal CPL calculations
- * fail; use the cache instead.
- */
- if (unlikely(vmx->emulation_required && emulate_invalid_guest_state)) {
- return vmx->cpl;
- }
-
if (!test_bit(VCPU_EXREG_CPL, (ulong *)&vcpu->arch.regs_avail)) {
__set_bit(VCPU_EXREG_CPL, (ulong *)&vcpu->arch.regs_avail);
- vmx->cpl = __vmx_get_cpl(vcpu);
+ vmx->cpl = vmx_read_guest_seg_selector(vmx, VCPU_SREG_CS) & 3;
}
return vmx->cpl;
@@ -3269,28 +3326,23 @@ static void vmx_set_segment(struct kvm_vcpu *vcpu,
{
struct vcpu_vmx *vmx = to_vmx(vcpu);
const struct kvm_vmx_segment_field *sf = &kvm_vmx_segment_fields[seg];
- u32 ar;
vmx_segment_cache_clear(vmx);
+ if (seg == VCPU_SREG_CS)
+ __clear_bit(VCPU_EXREG_CPL, (ulong *)&vcpu->arch.regs_avail);
- if (vmx->rmode.vm86_active && seg == VCPU_SREG_TR) {
- vmcs_write16(sf->selector, var->selector);
- vmx->rmode.segs[VCPU_SREG_TR] = *var;
- return;
+ if (vmx->rmode.vm86_active && seg != VCPU_SREG_LDTR) {
+ vmx->rmode.segs[seg] = *var;
+ if (seg == VCPU_SREG_TR)
+ vmcs_write16(sf->selector, var->selector);
+ else if (var->s)
+ fix_rmode_seg(seg, &vmx->rmode.segs[seg]);
+ goto out;
}
+
vmcs_writel(sf->base, var->base);
vmcs_write32(sf->limit, var->limit);
vmcs_write16(sf->selector, var->selector);
- if (vmx->rmode.vm86_active && var->s) {
- vmx->rmode.segs[seg] = *var;
- /*
- * Hack real-mode segments into vm86 compatibility.
- */
- if (var->base == 0xffff0000 && var->selector == 0xf000)
- vmcs_writel(sf->base, 0xf0000);
- ar = 0xf3;
- } else
- ar = vmx_segment_access_rights(var);
/*
* Fix the "Accessed" bit in AR field of segment registers for older
@@ -3304,42 +3356,12 @@ static void vmx_set_segment(struct kvm_vcpu *vcpu,
* kvm hack.
*/
if (enable_unrestricted_guest && (seg != VCPU_SREG_LDTR))
- ar |= 0x1; /* Accessed */
+ var->type |= 0x1; /* Accessed */
- vmcs_write32(sf->ar_bytes, ar);
- __clear_bit(VCPU_EXREG_CPL, (ulong *)&vcpu->arch.regs_avail);
+ vmcs_write32(sf->ar_bytes, vmx_segment_access_rights(var));
- /*
- * Fix segments for real mode guest in hosts that don't have
- * "unrestricted_mode" or it was disabled.
- * This is done to allow migration of the guests from hosts with
- * unrestricted guest like Westmere to older host that don't have
- * unrestricted guest like Nehelem.
- */
- if (vmx->rmode.vm86_active) {
- switch (seg) {
- case VCPU_SREG_CS:
- vmcs_write32(GUEST_CS_AR_BYTES, 0xf3);
- vmcs_write32(GUEST_CS_LIMIT, 0xffff);
- if (vmcs_readl(GUEST_CS_BASE) == 0xffff0000)
- vmcs_writel(GUEST_CS_BASE, 0xf0000);
- vmcs_write16(GUEST_CS_SELECTOR,
- vmcs_readl(GUEST_CS_BASE) >> 4);
- break;
- case VCPU_SREG_ES:
- case VCPU_SREG_DS:
- case VCPU_SREG_GS:
- case VCPU_SREG_FS:
- fix_rmode_seg(seg, &vmx->rmode.segs[seg]);
- break;
- case VCPU_SREG_SS:
- vmcs_write16(GUEST_SS_SELECTOR,
- vmcs_readl(GUEST_SS_BASE) >> 4);
- vmcs_write32(GUEST_SS_LIMIT, 0xffff);
- vmcs_write32(GUEST_SS_AR_BYTES, 0xf3);
- break;
- }
- }
+out:
+ vmx->emulation_required |= emulation_required(vcpu);
}
static void vmx_get_cs_db_l_bits(struct kvm_vcpu *vcpu, int *db, int *l)
@@ -3380,13 +3402,16 @@ static bool rmode_segment_valid(struct kvm_vcpu *vcpu, int seg)
u32 ar;
vmx_get_segment(vcpu, &var, seg);
+ var.dpl = 0x3;
+ if (seg == VCPU_SREG_CS)
+ var.type = 0x3;
ar = vmx_segment_access_rights(&var);
if (var.base != (var.selector << 4))
return false;
- if (var.limit < 0xffff)
+ if (var.limit != 0xffff)
return false;
- if (((ar | (3 << AR_DPL_SHIFT)) & ~(AR_G_MASK | AR_DB_MASK)) != 0xf3)
+ if (ar != 0xf3)
return false;
return true;
@@ -3521,6 +3546,9 @@ static bool cs_ss_rpl_check(struct kvm_vcpu *vcpu)
*/
static bool guest_state_valid(struct kvm_vcpu *vcpu)
{
+ if (enable_unrestricted_guest)
+ return true;
+
/* real mode guest state checks */
if (!is_protmode(vcpu)) {
if (!rmode_segment_valid(vcpu, VCPU_SREG_CS))
@@ -3644,12 +3672,9 @@ static void seg_setup(int seg)
vmcs_write16(sf->selector, 0);
vmcs_writel(sf->base, 0);
vmcs_write32(sf->limit, 0xffff);
- if (enable_unrestricted_guest) {
- ar = 0x93;
- if (seg == VCPU_SREG_CS)
- ar |= 0x08; /* code segment */
- } else
- ar = 0xf3;
+ ar = 0x93;
+ if (seg == VCPU_SREG_CS)
+ ar |= 0x08; /* code segment */
vmcs_write32(sf->ar_bytes, ar);
}
@@ -3667,7 +3692,7 @@ static int alloc_apic_access_page(struct kvm *kvm)
kvm_userspace_mem.flags = 0;
kvm_userspace_mem.guest_phys_addr = 0xfee00000ULL;
kvm_userspace_mem.memory_size = PAGE_SIZE;
- r = __kvm_set_memory_region(kvm, &kvm_userspace_mem, 0);
+ r = __kvm_set_memory_region(kvm, &kvm_userspace_mem, false);
if (r)
goto out;
@@ -3697,7 +3722,7 @@ static int alloc_identity_pagetable(struct kvm *kvm)
kvm_userspace_mem.guest_phys_addr =
kvm->arch.ept_identity_map_addr;
kvm_userspace_mem.memory_size = PAGE_SIZE;
- r = __kvm_set_memory_region(kvm, &kvm_userspace_mem, 0);
+ r = __kvm_set_memory_region(kvm, &kvm_userspace_mem, false);
if (r)
goto out;
@@ -3739,7 +3764,10 @@ static void free_vpid(struct vcpu_vmx *vmx)
spin_unlock(&vmx_vpid_lock);
}
-static void __vmx_disable_intercept_for_msr(unsigned long *msr_bitmap, u32 msr)
+#define MSR_TYPE_R 1
+#define MSR_TYPE_W 2
+static void __vmx_disable_intercept_for_msr(unsigned long *msr_bitmap,
+ u32 msr, int type)
{
int f = sizeof(unsigned long);
@@ -3752,20 +3780,93 @@ static void __vmx_disable_intercept_for_msr(unsigned long *msr_bitmap, u32 msr)
* We can control MSRs 0x00000000-0x00001fff and 0xc0000000-0xc0001fff.
*/
if (msr <= 0x1fff) {
- __clear_bit(msr, msr_bitmap + 0x000 / f); /* read-low */
- __clear_bit(msr, msr_bitmap + 0x800 / f); /* write-low */
+ if (type & MSR_TYPE_R)
+ /* read-low */
+ __clear_bit(msr, msr_bitmap + 0x000 / f);
+
+ if (type & MSR_TYPE_W)
+ /* write-low */
+ __clear_bit(msr, msr_bitmap + 0x800 / f);
+
} else if ((msr >= 0xc0000000) && (msr <= 0xc0001fff)) {
msr &= 0x1fff;
- __clear_bit(msr, msr_bitmap + 0x400 / f); /* read-high */
- __clear_bit(msr, msr_bitmap + 0xc00 / f); /* write-high */
+ if (type & MSR_TYPE_R)
+ /* read-high */
+ __clear_bit(msr, msr_bitmap + 0x400 / f);
+
+ if (type & MSR_TYPE_W)
+ /* write-high */
+ __clear_bit(msr, msr_bitmap + 0xc00 / f);
+
+ }
+}
+
+static void __vmx_enable_intercept_for_msr(unsigned long *msr_bitmap,
+ u32 msr, int type)
+{
+ int f = sizeof(unsigned long);
+
+ if (!cpu_has_vmx_msr_bitmap())
+ return;
+
+ /*
+ * See Intel PRM Vol. 3, 20.6.9 (MSR-Bitmap Address). Early manuals
+ * have the write-low and read-high bitmap offsets the wrong way round.
+ * We can control MSRs 0x00000000-0x00001fff and 0xc0000000-0xc0001fff.
+ */
+ if (msr <= 0x1fff) {
+ if (type & MSR_TYPE_R)
+ /* read-low */
+ __set_bit(msr, msr_bitmap + 0x000 / f);
+
+ if (type & MSR_TYPE_W)
+ /* write-low */
+ __set_bit(msr, msr_bitmap + 0x800 / f);
+
+ } else if ((msr >= 0xc0000000) && (msr <= 0xc0001fff)) {
+ msr &= 0x1fff;
+ if (type & MSR_TYPE_R)
+ /* read-high */
+ __set_bit(msr, msr_bitmap + 0x400 / f);
+
+ if (type & MSR_TYPE_W)
+ /* write-high */
+ __set_bit(msr, msr_bitmap + 0xc00 / f);
+
}
}
static void vmx_disable_intercept_for_msr(u32 msr, bool longmode_only)
{
if (!longmode_only)
- __vmx_disable_intercept_for_msr(vmx_msr_bitmap_legacy, msr);
- __vmx_disable_intercept_for_msr(vmx_msr_bitmap_longmode, msr);
+ __vmx_disable_intercept_for_msr(vmx_msr_bitmap_legacy,
+ msr, MSR_TYPE_R | MSR_TYPE_W);
+ __vmx_disable_intercept_for_msr(vmx_msr_bitmap_longmode,
+ msr, MSR_TYPE_R | MSR_TYPE_W);
+}
+
+static void vmx_enable_intercept_msr_read_x2apic(u32 msr)
+{
+ __vmx_enable_intercept_for_msr(vmx_msr_bitmap_legacy_x2apic,
+ msr, MSR_TYPE_R);
+ __vmx_enable_intercept_for_msr(vmx_msr_bitmap_longmode_x2apic,
+ msr, MSR_TYPE_R);
+}
+
+static void vmx_disable_intercept_msr_read_x2apic(u32 msr)
+{
+ __vmx_disable_intercept_for_msr(vmx_msr_bitmap_legacy_x2apic,
+ msr, MSR_TYPE_R);
+ __vmx_disable_intercept_for_msr(vmx_msr_bitmap_longmode_x2apic,
+ msr, MSR_TYPE_R);
+}
+
+static void vmx_disable_intercept_msr_write_x2apic(u32 msr)
+{
+ __vmx_disable_intercept_for_msr(vmx_msr_bitmap_legacy_x2apic,
+ msr, MSR_TYPE_W);
+ __vmx_disable_intercept_for_msr(vmx_msr_bitmap_longmode_x2apic,
+ msr, MSR_TYPE_W);
}
/*
@@ -3844,6 +3945,11 @@ static u32 vmx_exec_control(struct vcpu_vmx *vmx)
return exec_control;
}
+static int vmx_vm_has_apicv(struct kvm *kvm)
+{
+ return enable_apicv_reg_vid && irqchip_in_kernel(kvm);
+}
+
static u32 vmx_secondary_exec_control(struct vcpu_vmx *vmx)
{
u32 exec_control = vmcs_config.cpu_based_2nd_exec_ctrl;
@@ -3861,6 +3967,10 @@ static u32 vmx_secondary_exec_control(struct vcpu_vmx *vmx)
exec_control &= ~SECONDARY_EXEC_UNRESTRICTED_GUEST;
if (!ple_gap)
exec_control &= ~SECONDARY_EXEC_PAUSE_LOOP_EXITING;
+ if (!vmx_vm_has_apicv(vmx->vcpu.kvm))
+ exec_control &= ~(SECONDARY_EXEC_APIC_REGISTER_VIRT |
+ SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY);
+ exec_control &= ~SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE;
return exec_control;
}
@@ -3905,6 +4015,15 @@ static int vmx_vcpu_setup(struct vcpu_vmx *vmx)
vmx_secondary_exec_control(vmx));
}
+ if (enable_apicv_reg_vid) {
+ vmcs_write64(EOI_EXIT_BITMAP0, 0);
+ vmcs_write64(EOI_EXIT_BITMAP1, 0);
+ vmcs_write64(EOI_EXIT_BITMAP2, 0);
+ vmcs_write64(EOI_EXIT_BITMAP3, 0);
+
+ vmcs_write16(GUEST_INTR_STATUS, 0);
+ }
+
if (ple_gap) {
vmcs_write32(PLE_GAP, ple_gap);
vmcs_write32(PLE_WINDOW, ple_window);
@@ -3990,14 +4109,9 @@ static int vmx_vcpu_reset(struct kvm_vcpu *vcpu)
vmx_segment_cache_clear(vmx);
seg_setup(VCPU_SREG_CS);
- /*
- * GUEST_CS_BASE should really be 0xffff0000, but VT vm86 mode
- * insists on having GUEST_CS_BASE == GUEST_CS_SELECTOR << 4. Sigh.
- */
- if (kvm_vcpu_is_bsp(&vmx->vcpu)) {
+ if (kvm_vcpu_is_bsp(&vmx->vcpu))
vmcs_write16(GUEST_CS_SELECTOR, 0xf000);
- vmcs_writel(GUEST_CS_BASE, 0x000f0000);
- } else {
+ else {
vmcs_write16(GUEST_CS_SELECTOR, vmx->vcpu.arch.sipi_vector << 8);
vmcs_writel(GUEST_CS_BASE, vmx->vcpu.arch.sipi_vector << 12);
}
@@ -4073,9 +4187,6 @@ static int vmx_vcpu_reset(struct kvm_vcpu *vcpu)
ret = 0;
- /* HACK: Don't enable emulation on guest boot/reset */
- vmx->emulation_required = 0;
-
return ret;
}
@@ -4251,7 +4362,7 @@ static int vmx_set_tss_addr(struct kvm *kvm, unsigned int addr)
.flags = 0,
};
- ret = kvm_set_memory_region(kvm, &tss_mem, 0);
+ ret = kvm_set_memory_region(kvm, &tss_mem, false);
if (ret)
return ret;
kvm->arch.tss_addr = addr;
@@ -4261,28 +4372,9 @@ static int vmx_set_tss_addr(struct kvm *kvm, unsigned int addr)
return 0;
}
-static int handle_rmode_exception(struct kvm_vcpu *vcpu,
- int vec, u32 err_code)
+static bool rmode_exception(struct kvm_vcpu *vcpu, int vec)
{
- /*
- * Instruction with address size override prefix opcode 0x67
- * Cause the #SS fault with 0 error code in VM86 mode.
- */
- if (((vec == GP_VECTOR) || (vec == SS_VECTOR)) && err_code == 0)
- if (emulate_instruction(vcpu, 0) == EMULATE_DONE)
- return 1;
- /*
- * Forward all other exceptions that are valid in real mode.
- * FIXME: Breaks guest debugging in real mode, needs to be fixed with
- * the required debugging infrastructure rework.
- */
switch (vec) {
- case DB_VECTOR:
- if (vcpu->guest_debug &
- (KVM_GUESTDBG_SINGLESTEP | KVM_GUESTDBG_USE_HW_BP))
- return 0;
- kvm_queue_exception(vcpu, vec);
- return 1;
case BP_VECTOR:
/*
* Update instruction length as we may reinject the exception
@@ -4291,7 +4383,12 @@ static int handle_rmode_exception(struct kvm_vcpu *vcpu,
to_vmx(vcpu)->vcpu.arch.event_exit_inst_len =
vmcs_read32(VM_EXIT_INSTRUCTION_LEN);
if (vcpu->guest_debug & KVM_GUESTDBG_USE_SW_BP)
- return 0;
+ return false;
+ /* fall through */
+ case DB_VECTOR:
+ if (vcpu->guest_debug &
+ (KVM_GUESTDBG_SINGLESTEP | KVM_GUESTDBG_USE_HW_BP))
+ return false;
/* fall through */
case DE_VECTOR:
case OF_VECTOR:
@@ -4301,10 +4398,37 @@ static int handle_rmode_exception(struct kvm_vcpu *vcpu,
case SS_VECTOR:
case GP_VECTOR:
case MF_VECTOR:
- kvm_queue_exception(vcpu, vec);
- return 1;
+ return true;
+ break;
}
- return 0;
+ return false;
+}
+
+static int handle_rmode_exception(struct kvm_vcpu *vcpu,
+ int vec, u32 err_code)
+{
+ /*
+ * Instruction with address size override prefix opcode 0x67
+ * Cause the #SS fault with 0 error code in VM86 mode.
+ */
+ if (((vec == GP_VECTOR) || (vec == SS_VECTOR)) && err_code == 0) {
+ if (emulate_instruction(vcpu, 0) == EMULATE_DONE) {
+ if (vcpu->arch.halt_request) {
+ vcpu->arch.halt_request = 0;
+ return kvm_emulate_halt(vcpu);
+ }
+ return 1;
+ }
+ return 0;
+ }
+
+ /*
+ * Forward all other exceptions that are valid in real mode.
+ * FIXME: Breaks guest debugging in real mode, needs to be fixed with
+ * the required debugging infrastructure rework.
+ */
+ kvm_queue_exception(vcpu, vec);
+ return 1;
}
/*
@@ -4392,17 +4516,11 @@ static int handle_exception(struct kvm_vcpu *vcpu)
return kvm_mmu_page_fault(vcpu, cr2, error_code, NULL, 0);
}
- if (vmx->rmode.vm86_active &&
- handle_rmode_exception(vcpu, intr_info & INTR_INFO_VECTOR_MASK,
- error_code)) {
- if (vcpu->arch.halt_request) {
- vcpu->arch.halt_request = 0;
- return kvm_emulate_halt(vcpu);
- }
- return 1;
- }
-
ex_no = intr_info & INTR_INFO_VECTOR_MASK;
+
+ if (vmx->rmode.vm86_active && rmode_exception(vcpu, ex_no))
+ return handle_rmode_exception(vcpu, ex_no, error_code);
+
switch (ex_no) {
case DB_VECTOR:
dr6 = vmcs_readl(EXIT_QUALIFICATION);
@@ -4820,6 +4938,26 @@ static int handle_apic_access(struct kvm_vcpu *vcpu)
return emulate_instruction(vcpu, 0) == EMULATE_DONE;
}
+static int handle_apic_eoi_induced(struct kvm_vcpu *vcpu)
+{
+ unsigned long exit_qualification = vmcs_readl(EXIT_QUALIFICATION);
+ int vector = exit_qualification & 0xff;
+
+ /* EOI-induced VM exit is trap-like and thus no need to adjust IP */
+ kvm_apic_set_eoi_accelerated(vcpu, vector);
+ return 1;
+}
+
+static int handle_apic_write(struct kvm_vcpu *vcpu)
+{
+ unsigned long exit_qualification = vmcs_readl(EXIT_QUALIFICATION);
+ u32 offset = exit_qualification & 0xfff;
+
+ /* APIC-write VM exit is trap-like and thus no need to adjust IP */
+ kvm_apic_write_nodecode(vcpu, offset);
+ return 1;
+}
+
static int handle_task_switch(struct kvm_vcpu *vcpu)
{
struct vcpu_vmx *vmx = to_vmx(vcpu);
@@ -5065,7 +5203,7 @@ static int handle_invalid_guest_state(struct kvm_vcpu *vcpu)
schedule();
}
- vmx->emulation_required = !guest_state_valid(vcpu);
+ vmx->emulation_required = emulation_required(vcpu);
out:
return ret;
}
@@ -5754,6 +5892,8 @@ static int (*const kvm_vmx_exit_handlers[])(struct kvm_vcpu *vcpu) = {
[EXIT_REASON_VMON] = handle_vmon,
[EXIT_REASON_TPR_BELOW_THRESHOLD] = handle_tpr_below_threshold,
[EXIT_REASON_APIC_ACCESS] = handle_apic_access,
+ [EXIT_REASON_APIC_WRITE] = handle_apic_write,
+ [EXIT_REASON_EOI_INDUCED] = handle_apic_eoi_induced,
[EXIT_REASON_WBINVD] = handle_wbinvd,
[EXIT_REASON_XSETBV] = handle_xsetbv,
[EXIT_REASON_TASK_SWITCH] = handle_task_switch,
@@ -5780,7 +5920,7 @@ static bool nested_vmx_exit_handled_msr(struct kvm_vcpu *vcpu,
u32 msr_index = vcpu->arch.regs[VCPU_REGS_RCX];
gpa_t bitmap;
- if (!nested_cpu_has(get_vmcs12(vcpu), CPU_BASED_USE_MSR_BITMAPS))
+ if (!nested_cpu_has(vmcs12, CPU_BASED_USE_MSR_BITMAPS))
return 1;
/*
@@ -6008,7 +6148,7 @@ static int vmx_handle_exit(struct kvm_vcpu *vcpu)
u32 vectoring_info = vmx->idt_vectoring_info;
/* If guest state is invalid, start emulating */
- if (vmx->emulation_required && emulate_invalid_guest_state)
+ if (vmx->emulation_required)
return handle_invalid_guest_state(vcpu);
/*
@@ -6103,6 +6243,85 @@ static void update_cr8_intercept(struct kvm_vcpu *vcpu, int tpr, int irr)
vmcs_write32(TPR_THRESHOLD, irr);
}
+static void vmx_set_virtual_x2apic_mode(struct kvm_vcpu *vcpu, bool set)
+{
+ u32 sec_exec_control;
+
+ /*
+ * There is not point to enable virtualize x2apic without enable
+ * apicv
+ */
+ if (!cpu_has_vmx_virtualize_x2apic_mode() ||
+ !vmx_vm_has_apicv(vcpu->kvm))
+ return;
+
+ if (!vm_need_tpr_shadow(vcpu->kvm))
+ return;
+
+ sec_exec_control = vmcs_read32(SECONDARY_VM_EXEC_CONTROL);
+
+ if (set) {
+ sec_exec_control &= ~SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES;
+ sec_exec_control |= SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE;
+ } else {
+ sec_exec_control &= ~SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE;
+ sec_exec_control |= SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES;
+ }
+ vmcs_write32(SECONDARY_VM_EXEC_CONTROL, sec_exec_control);
+
+ vmx_set_msr_bitmap(vcpu);
+}
+
+static void vmx_hwapic_isr_update(struct kvm *kvm, int isr)
+{
+ u16 status;
+ u8 old;
+
+ if (!vmx_vm_has_apicv(kvm))
+ return;
+
+ if (isr == -1)
+ isr = 0;
+
+ status = vmcs_read16(GUEST_INTR_STATUS);
+ old = status >> 8;
+ if (isr != old) {
+ status &= 0xff;
+ status |= isr << 8;
+ vmcs_write16(GUEST_INTR_STATUS, status);
+ }
+}
+
+static void vmx_set_rvi(int vector)
+{
+ u16 status;
+ u8 old;
+
+ status = vmcs_read16(GUEST_INTR_STATUS);
+ old = (u8)status & 0xff;
+ if ((u8)vector != old) {
+ status &= ~0xff;
+ status |= (u8)vector;
+ vmcs_write16(GUEST_INTR_STATUS, status);
+ }
+}
+
+static void vmx_hwapic_irr_update(struct kvm_vcpu *vcpu, int max_irr)
+{
+ if (max_irr == -1)
+ return;
+
+ vmx_set_rvi(max_irr);
+}
+
+static void vmx_load_eoi_exitmap(struct kvm_vcpu *vcpu, u64 *eoi_exit_bitmap)
+{
+ vmcs_write64(EOI_EXIT_BITMAP0, eoi_exit_bitmap[0]);
+ vmcs_write64(EOI_EXIT_BITMAP1, eoi_exit_bitmap[1]);
+ vmcs_write64(EOI_EXIT_BITMAP2, eoi_exit_bitmap[2]);
+ vmcs_write64(EOI_EXIT_BITMAP3, eoi_exit_bitmap[3]);
+}
+
static void vmx_complete_atomic_exit(struct vcpu_vmx *vmx)
{
u32 exit_intr_info;
@@ -6291,7 +6510,7 @@ static void __noclone vmx_vcpu_run(struct kvm_vcpu *vcpu)
/* Don't enter VMX if guest state is invalid, let the exit handler
start emulation until we arrive back to a valid state */
- if (vmx->emulation_required && emulate_invalid_guest_state)
+ if (vmx->emulation_required)
return;
if (test_bit(VCPU_REGS_RSP, (unsigned long *)&vcpu->arch.regs_dirty))
@@ -7366,6 +7585,11 @@ static struct kvm_x86_ops vmx_x86_ops = {
.enable_nmi_window = enable_nmi_window,
.enable_irq_window = enable_irq_window,
.update_cr8_intercept = update_cr8_intercept,
+ .set_virtual_x2apic_mode = vmx_set_virtual_x2apic_mode,
+ .vm_has_apicv = vmx_vm_has_apicv,
+ .load_eoi_exitmap = vmx_load_eoi_exitmap,
+ .hwapic_irr_update = vmx_hwapic_irr_update,
+ .hwapic_isr_update = vmx_hwapic_isr_update,
.set_tss_addr = vmx_set_tss_addr,
.get_tdp_level = get_ept_level,
@@ -7398,7 +7622,7 @@ static struct kvm_x86_ops vmx_x86_ops = {
static int __init vmx_init(void)
{
- int r, i;
+ int r, i, msr;
rdmsrl_safe(MSR_EFER, &host_efer);
@@ -7419,11 +7643,19 @@ static int __init vmx_init(void)
if (!vmx_msr_bitmap_legacy)
goto out1;
+ vmx_msr_bitmap_legacy_x2apic =
+ (unsigned long *)__get_free_page(GFP_KERNEL);
+ if (!vmx_msr_bitmap_legacy_x2apic)
+ goto out2;
vmx_msr_bitmap_longmode = (unsigned long *)__get_free_page(GFP_KERNEL);
if (!vmx_msr_bitmap_longmode)
- goto out2;
+ goto out3;
+ vmx_msr_bitmap_longmode_x2apic =
+ (unsigned long *)__get_free_page(GFP_KERNEL);
+ if (!vmx_msr_bitmap_longmode_x2apic)
+ goto out4;
/*
* Allow direct access to the PC debug port (it is often used for I/O
@@ -7455,6 +7687,28 @@ static int __init vmx_init(void)
vmx_disable_intercept_for_msr(MSR_IA32_SYSENTER_CS, false);
vmx_disable_intercept_for_msr(MSR_IA32_SYSENTER_ESP, false);
vmx_disable_intercept_for_msr(MSR_IA32_SYSENTER_EIP, false);
+ memcpy(vmx_msr_bitmap_legacy_x2apic,
+ vmx_msr_bitmap_legacy, PAGE_SIZE);
+ memcpy(vmx_msr_bitmap_longmode_x2apic,
+ vmx_msr_bitmap_longmode, PAGE_SIZE);
+
+ if (enable_apicv_reg_vid) {
+ for (msr = 0x800; msr <= 0x8ff; msr++)
+ vmx_disable_intercept_msr_read_x2apic(msr);
+
+ /* According SDM, in x2apic mode, the whole id reg is used.
+ * But in KVM, it only use the highest eight bits. Need to
+ * intercept it */
+ vmx_enable_intercept_msr_read_x2apic(0x802);
+ /* TMCCT */
+ vmx_enable_intercept_msr_read_x2apic(0x839);
+ /* TPR */
+ vmx_disable_intercept_msr_write_x2apic(0x808);
+ /* EOI */
+ vmx_disable_intercept_msr_write_x2apic(0x80b);
+ /* SELF-IPI */
+ vmx_disable_intercept_msr_write_x2apic(0x83f);
+ }
if (enable_ept) {
kvm_mmu_set_mask_ptes(0ull,
@@ -7468,8 +7722,10 @@ static int __init vmx_init(void)
return 0;
-out3:
+out4:
free_page((unsigned long)vmx_msr_bitmap_longmode);
+out3:
+ free_page((unsigned long)vmx_msr_bitmap_legacy_x2apic);
out2:
free_page((unsigned long)vmx_msr_bitmap_legacy);
out1:
@@ -7481,6 +7737,8 @@ out:
static void __exit vmx_exit(void)
{
+ free_page((unsigned long)vmx_msr_bitmap_legacy_x2apic);
+ free_page((unsigned long)vmx_msr_bitmap_longmode_x2apic);
free_page((unsigned long)vmx_msr_bitmap_legacy);
free_page((unsigned long)vmx_msr_bitmap_longmode);
free_page((unsigned long)vmx_io_bitmap_b);
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index c243b81e3c74..f71500af1f81 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -872,8 +872,6 @@ static int set_efer(struct kvm_vcpu *vcpu, u64 efer)
kvm_x86_ops->set_efer(vcpu, efer);
- vcpu->arch.mmu.base_role.nxe = (efer & EFER_NX) && !tdp_enabled;
-
/* Update reserved bits */
if ((efer ^ old_efer) & EFER_NX)
kvm_mmu_reset_context(vcpu);
@@ -1881,6 +1879,14 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
u64 data = msr_info->data;
switch (msr) {
+ case MSR_AMD64_NB_CFG:
+ case MSR_IA32_UCODE_REV:
+ case MSR_IA32_UCODE_WRITE:
+ case MSR_VM_HSAVE_PA:
+ case MSR_AMD64_PATCH_LOADER:
+ case MSR_AMD64_BU_CFG2:
+ break;
+
case MSR_EFER:
return set_efer(vcpu, data);
case MSR_K7_HWCR:
@@ -1900,8 +1906,6 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
return 1;
}
break;
- case MSR_AMD64_NB_CFG:
- break;
case MSR_IA32_DEBUGCTLMSR:
if (!data) {
/* We support the non-activated case already */
@@ -1914,11 +1918,6 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
vcpu_unimpl(vcpu, "%s: MSR_IA32_DEBUGCTLMSR 0x%llx, nop\n",
__func__, data);
break;
- case MSR_IA32_UCODE_REV:
- case MSR_IA32_UCODE_WRITE:
- case MSR_VM_HSAVE_PA:
- case MSR_AMD64_PATCH_LOADER:
- break;
case 0x200 ... 0x2ff:
return set_msr_mtrr(vcpu, msr, data);
case MSR_IA32_APICBASE:
@@ -2253,6 +2252,7 @@ int kvm_get_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata)
case MSR_K8_INT_PENDING_MSG:
case MSR_AMD64_NB_CFG:
case MSR_FAM10H_MMIO_CONF_BASE:
+ case MSR_AMD64_BU_CFG2:
data = 0;
break;
case MSR_P6_PERFCTR0:
@@ -2520,7 +2520,7 @@ int kvm_dev_ioctl_check_extension(long ext)
r = KVM_MAX_VCPUS;
break;
case KVM_CAP_NR_MEMSLOTS:
- r = KVM_MEMORY_SLOTS;
+ r = KVM_USER_MEM_SLOTS;
break;
case KVM_CAP_PV_MMU: /* obsolete */
r = 0;
@@ -3272,12 +3272,10 @@ static int kvm_vm_ioctl_set_nr_mmu_pages(struct kvm *kvm,
return -EINVAL;
mutex_lock(&kvm->slots_lock);
- spin_lock(&kvm->mmu_lock);
kvm_mmu_change_mmu_pages(kvm, kvm_nr_mmu_pages);
kvm->arch.n_requested_mmu_pages = kvm_nr_mmu_pages;
- spin_unlock(&kvm->mmu_lock);
mutex_unlock(&kvm->slots_lock);
return 0;
}
@@ -3437,7 +3435,7 @@ int kvm_vm_ioctl_get_dirty_log(struct kvm *kvm, struct kvm_dirty_log *log)
mutex_lock(&kvm->slots_lock);
r = -EINVAL;
- if (log->slot >= KVM_MEMORY_SLOTS)
+ if (log->slot >= KVM_USER_MEM_SLOTS)
goto out;
memslot = id_to_memslot(kvm->memslots, log->slot);
@@ -4493,8 +4491,10 @@ static bool emulator_get_segment(struct x86_emulate_ctxt *ctxt, u16 *selector,
kvm_get_segment(emul_to_vcpu(ctxt), &var, seg);
*selector = var.selector;
- if (var.unusable)
+ if (var.unusable) {
+ memset(desc, 0, sizeof(*desc));
return false;
+ }
if (var.g)
var.limit >>= 12;
@@ -4755,26 +4755,26 @@ static int handle_emulation_failure(struct kvm_vcpu *vcpu)
return r;
}
-static bool reexecute_instruction(struct kvm_vcpu *vcpu, gva_t gva)
+static bool reexecute_instruction(struct kvm_vcpu *vcpu, gva_t cr2,
+ bool write_fault_to_shadow_pgtable)
{
- gpa_t gpa;
+ gpa_t gpa = cr2;
pfn_t pfn;
- if (tdp_enabled)
- return false;
-
- /*
- * if emulation was due to access to shadowed page table
- * and it failed try to unshadow page and re-enter the
- * guest to let CPU execute the instruction.
- */
- if (kvm_mmu_unprotect_page_virt(vcpu, gva))
- return true;
-
- gpa = kvm_mmu_gva_to_gpa_system(vcpu, gva, NULL);
+ if (!vcpu->arch.mmu.direct_map) {
+ /*
+ * Write permission should be allowed since only
+ * write access need to be emulated.
+ */
+ gpa = kvm_mmu_gva_to_gpa_write(vcpu, cr2, NULL);
- if (gpa == UNMAPPED_GVA)
- return true; /* let cpu generate fault */
+ /*
+ * If the mapping is invalid in guest, let cpu retry
+ * it to generate fault.
+ */
+ if (gpa == UNMAPPED_GVA)
+ return true;
+ }
/*
* Do not retry the unhandleable instruction if it faults on the
@@ -4783,12 +4783,43 @@ static bool reexecute_instruction(struct kvm_vcpu *vcpu, gva_t gva)
* instruction -> ...
*/
pfn = gfn_to_pfn(vcpu->kvm, gpa_to_gfn(gpa));
- if (!is_error_noslot_pfn(pfn)) {
- kvm_release_pfn_clean(pfn);
+
+ /*
+ * If the instruction failed on the error pfn, it can not be fixed,
+ * report the error to userspace.
+ */
+ if (is_error_noslot_pfn(pfn))
+ return false;
+
+ kvm_release_pfn_clean(pfn);
+
+ /* The instructions are well-emulated on direct mmu. */
+ if (vcpu->arch.mmu.direct_map) {
+ unsigned int indirect_shadow_pages;
+
+ spin_lock(&vcpu->kvm->mmu_lock);
+ indirect_shadow_pages = vcpu->kvm->arch.indirect_shadow_pages;
+ spin_unlock(&vcpu->kvm->mmu_lock);
+
+ if (indirect_shadow_pages)
+ kvm_mmu_unprotect_page(vcpu->kvm, gpa_to_gfn(gpa));
+
return true;
}
- return false;
+ /*
+ * if emulation was due to access to shadowed page table
+ * and it failed try to unshadow page and re-enter the
+ * guest to let CPU execute the instruction.
+ */
+ kvm_mmu_unprotect_page(vcpu->kvm, gpa_to_gfn(gpa));
+
+ /*
+ * If the access faults on its page table, it can not
+ * be fixed by unprotecting shadow page and it should
+ * be reported to userspace.
+ */
+ return !write_fault_to_shadow_pgtable;
}
static bool retry_instruction(struct x86_emulate_ctxt *ctxt,
@@ -4830,7 +4861,7 @@ static bool retry_instruction(struct x86_emulate_ctxt *ctxt,
if (!vcpu->arch.mmu.direct_map)
gpa = kvm_mmu_gva_to_gpa_write(vcpu, cr2, NULL);
- kvm_mmu_unprotect_page(vcpu->kvm, gpa >> PAGE_SHIFT);
+ kvm_mmu_unprotect_page(vcpu->kvm, gpa_to_gfn(gpa));
return true;
}
@@ -4847,7 +4878,13 @@ int x86_emulate_instruction(struct kvm_vcpu *vcpu,
int r;
struct x86_emulate_ctxt *ctxt = &vcpu->arch.emulate_ctxt;
bool writeback = true;
+ bool write_fault_to_spt = vcpu->arch.write_fault_to_shadow_pgtable;
+ /*
+ * Clear write_fault_to_shadow_pgtable here to ensure it is
+ * never reused.
+ */
+ vcpu->arch.write_fault_to_shadow_pgtable = false;
kvm_clear_exception_queue(vcpu);
if (!(emulation_type & EMULTYPE_NO_DECODE)) {
@@ -4866,7 +4903,8 @@ int x86_emulate_instruction(struct kvm_vcpu *vcpu,
if (r != EMULATION_OK) {
if (emulation_type & EMULTYPE_TRAP_UD)
return EMULATE_FAIL;
- if (reexecute_instruction(vcpu, cr2))
+ if (reexecute_instruction(vcpu, cr2,
+ write_fault_to_spt))
return EMULATE_DONE;
if (emulation_type & EMULTYPE_SKIP)
return EMULATE_FAIL;
@@ -4896,7 +4934,7 @@ restart:
return EMULATE_DONE;
if (r == EMULATION_FAILED) {
- if (reexecute_instruction(vcpu, cr2))
+ if (reexecute_instruction(vcpu, cr2, write_fault_to_spt))
return EMULATE_DONE;
return handle_emulation_failure(vcpu);
@@ -5539,7 +5577,7 @@ static void inject_pending_event(struct kvm_vcpu *vcpu)
vcpu->arch.nmi_injected = true;
kvm_x86_ops->set_nmi(vcpu);
}
- } else if (kvm_cpu_has_interrupt(vcpu)) {
+ } else if (kvm_cpu_has_injectable_intr(vcpu)) {
if (kvm_x86_ops->interrupt_allowed(vcpu)) {
kvm_queue_interrupt(vcpu, kvm_cpu_get_interrupt(vcpu),
false);
@@ -5607,6 +5645,16 @@ static void kvm_gen_update_masterclock(struct kvm *kvm)
#endif
}
+static void update_eoi_exitmap(struct kvm_vcpu *vcpu)
+{
+ u64 eoi_exit_bitmap[4];
+
+ memset(eoi_exit_bitmap, 0, 32);
+
+ kvm_ioapic_calculate_eoi_exitmap(vcpu, eoi_exit_bitmap);
+ kvm_x86_ops->load_eoi_exitmap(vcpu, eoi_exit_bitmap);
+}
+
static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
{
int r;
@@ -5660,6 +5708,8 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
kvm_handle_pmu_event(vcpu);
if (kvm_check_request(KVM_REQ_PMI, vcpu))
kvm_deliver_pmi(vcpu);
+ if (kvm_check_request(KVM_REQ_EOIBITMAP, vcpu))
+ update_eoi_exitmap(vcpu);
}
if (kvm_check_request(KVM_REQ_EVENT, vcpu) || req_int_win) {
@@ -5668,10 +5718,17 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
/* enable NMI/IRQ window open exits if needed */
if (vcpu->arch.nmi_pending)
kvm_x86_ops->enable_nmi_window(vcpu);
- else if (kvm_cpu_has_interrupt(vcpu) || req_int_win)
+ else if (kvm_cpu_has_injectable_intr(vcpu) || req_int_win)
kvm_x86_ops->enable_irq_window(vcpu);
if (kvm_lapic_enabled(vcpu)) {
+ /*
+ * Update architecture specific hints for APIC
+ * virtual interrupt delivery.
+ */
+ if (kvm_x86_ops->hwapic_irr_update)
+ kvm_x86_ops->hwapic_irr_update(vcpu,
+ kvm_lapic_find_highest_irr(vcpu));
update_cr8_intercept(vcpu);
kvm_lapic_sync_to_vapic(vcpu);
}
@@ -6851,48 +6908,43 @@ int kvm_arch_prepare_memory_region(struct kvm *kvm,
struct kvm_memory_slot *memslot,
struct kvm_memory_slot old,
struct kvm_userspace_memory_region *mem,
- int user_alloc)
+ bool user_alloc)
{
int npages = memslot->npages;
- int map_flags = MAP_PRIVATE | MAP_ANONYMOUS;
-
- /* Prevent internal slot pages from being moved by fork()/COW. */
- if (memslot->id >= KVM_MEMORY_SLOTS)
- map_flags = MAP_SHARED | MAP_ANONYMOUS;
- /*To keep backward compatibility with older userspace,
- *x86 needs to handle !user_alloc case.
+ /*
+ * Only private memory slots need to be mapped here since
+ * KVM_SET_MEMORY_REGION ioctl is no longer supported.
*/
- if (!user_alloc) {
- if (npages && !old.npages) {
- unsigned long userspace_addr;
+ if ((memslot->id >= KVM_USER_MEM_SLOTS) && npages && !old.npages) {
+ unsigned long userspace_addr;
- userspace_addr = vm_mmap(NULL, 0,
- npages * PAGE_SIZE,
- PROT_READ | PROT_WRITE,
- map_flags,
- 0);
+ /*
+ * MAP_SHARED to prevent internal slot pages from being moved
+ * by fork()/COW.
+ */
+ userspace_addr = vm_mmap(NULL, 0, npages * PAGE_SIZE,
+ PROT_READ | PROT_WRITE,
+ MAP_SHARED | MAP_ANONYMOUS, 0);
- if (IS_ERR((void *)userspace_addr))
- return PTR_ERR((void *)userspace_addr);
+ if (IS_ERR((void *)userspace_addr))
+ return PTR_ERR((void *)userspace_addr);
- memslot->userspace_addr = userspace_addr;
- }
+ memslot->userspace_addr = userspace_addr;
}
-
return 0;
}
void kvm_arch_commit_memory_region(struct kvm *kvm,
struct kvm_userspace_memory_region *mem,
struct kvm_memory_slot old,
- int user_alloc)
+ bool user_alloc)
{
int nr_mmu_pages = 0, npages = mem->memory_size >> PAGE_SHIFT;
- if (!user_alloc && !old.user_alloc && old.npages && !npages) {
+ if ((mem->slot >= KVM_USER_MEM_SLOTS) && old.npages && !npages) {
int ret;
ret = vm_munmap(old.userspace_addr,
@@ -6906,11 +6958,15 @@ void kvm_arch_commit_memory_region(struct kvm *kvm,
if (!kvm->arch.n_requested_mmu_pages)
nr_mmu_pages = kvm_mmu_calculate_mmu_pages(kvm);
- spin_lock(&kvm->mmu_lock);
if (nr_mmu_pages)
kvm_mmu_change_mmu_pages(kvm, nr_mmu_pages);
- kvm_mmu_slot_remove_write_access(kvm, mem->slot);
- spin_unlock(&kvm->mmu_lock);
+ /*
+ * Write protect all pages for dirty logging.
+ * Existing largepage mappings are destroyed here and new ones will
+ * not be created until the end of the logging.
+ */
+ if (npages && (mem->flags & KVM_MEM_LOG_DIRTY_PAGES))
+ kvm_mmu_slot_remove_write_access(kvm, mem->slot);
/*
* If memory slot is created, or moved, we need to clear all
* mmio sptes.
diff --git a/arch/x86/lguest/Kconfig b/arch/x86/lguest/Kconfig
index 7872a3330fb5..29043d2048a0 100644
--- a/arch/x86/lguest/Kconfig
+++ b/arch/x86/lguest/Kconfig
@@ -2,6 +2,7 @@ config LGUEST_GUEST
bool "Lguest guest support"
select PARAVIRT
depends on X86_32
+ select TTY
select VIRTUALIZATION
select VIRTIO
select VIRTIO_CONSOLE
diff --git a/arch/x86/lguest/boot.c b/arch/x86/lguest/boot.c
index df4176cdbb32..1cbd89ca5569 100644
--- a/arch/x86/lguest/boot.c
+++ b/arch/x86/lguest/boot.c
@@ -552,7 +552,8 @@ static void lguest_write_cr3(unsigned long cr3)
current_cr3 = cr3;
/* These two page tables are simple, linear, and used during boot */
- if (cr3 != __pa(swapper_pg_dir) && cr3 != __pa(initial_page_table))
+ if (cr3 != __pa_symbol(swapper_pg_dir) &&
+ cr3 != __pa_symbol(initial_page_table))
cr3_changed = true;
}
diff --git a/arch/x86/lib/getuser.S b/arch/x86/lib/getuser.S
index 156b9c804670..a4512359656a 100644
--- a/arch/x86/lib/getuser.S
+++ b/arch/x86/lib/getuser.S
@@ -15,11 +15,10 @@
* __get_user_X
*
* Inputs: %[r|e]ax contains the address.
- * The register is modified, but all changes are undone
- * before returning because the C code doesn't know about it.
*
* Outputs: %[r|e]ax is error code (0 or -EFAULT)
* %[r|e]dx contains zero-extended value
+ * %ecx contains the high half for 32-bit __get_user_8
*
*
* These functions should not modify any other registers,
@@ -42,7 +41,7 @@ ENTRY(__get_user_1)
cmp TI_addr_limit(%_ASM_DX),%_ASM_AX
jae bad_get_user
ASM_STAC
-1: movzb (%_ASM_AX),%edx
+1: movzbl (%_ASM_AX),%edx
xor %eax,%eax
ASM_CLAC
ret
@@ -72,29 +71,42 @@ ENTRY(__get_user_4)
cmp TI_addr_limit(%_ASM_DX),%_ASM_AX
jae bad_get_user
ASM_STAC
-3: mov -3(%_ASM_AX),%edx
+3: movl -3(%_ASM_AX),%edx
xor %eax,%eax
ASM_CLAC
ret
CFI_ENDPROC
ENDPROC(__get_user_4)
-#ifdef CONFIG_X86_64
ENTRY(__get_user_8)
CFI_STARTPROC
+#ifdef CONFIG_X86_64
add $7,%_ASM_AX
jc bad_get_user
GET_THREAD_INFO(%_ASM_DX)
cmp TI_addr_limit(%_ASM_DX),%_ASM_AX
- jae bad_get_user
+ jae bad_get_user
ASM_STAC
-4: movq -7(%_ASM_AX),%_ASM_DX
+4: movq -7(%_ASM_AX),%rdx
xor %eax,%eax
ASM_CLAC
ret
+#else
+ add $7,%_ASM_AX
+ jc bad_get_user_8
+ GET_THREAD_INFO(%_ASM_DX)
+ cmp TI_addr_limit(%_ASM_DX),%_ASM_AX
+ jae bad_get_user_8
+ ASM_STAC
+4: movl -7(%_ASM_AX),%edx
+5: movl -3(%_ASM_AX),%ecx
+ xor %eax,%eax
+ ASM_CLAC
+ ret
+#endif
CFI_ENDPROC
ENDPROC(__get_user_8)
-#endif
+
bad_get_user:
CFI_STARTPROC
@@ -105,9 +117,24 @@ bad_get_user:
CFI_ENDPROC
END(bad_get_user)
+#ifdef CONFIG_X86_32
+bad_get_user_8:
+ CFI_STARTPROC
+ xor %edx,%edx
+ xor %ecx,%ecx
+ mov $(-EFAULT),%_ASM_AX
+ ASM_CLAC
+ ret
+ CFI_ENDPROC
+END(bad_get_user_8)
+#endif
+
_ASM_EXTABLE(1b,bad_get_user)
_ASM_EXTABLE(2b,bad_get_user)
_ASM_EXTABLE(3b,bad_get_user)
#ifdef CONFIG_X86_64
_ASM_EXTABLE(4b,bad_get_user)
+#else
+ _ASM_EXTABLE(4b,bad_get_user_8)
+ _ASM_EXTABLE(5b,bad_get_user_8)
#endif
diff --git a/arch/x86/mm/init.c b/arch/x86/mm/init.c
index d7aea41563b3..4903a03ae876 100644
--- a/arch/x86/mm/init.c
+++ b/arch/x86/mm/init.c
@@ -16,87 +16,134 @@
#include <asm/tlb.h>
#include <asm/proto.h>
#include <asm/dma.h> /* for MAX_DMA_PFN */
+#include <asm/microcode.h>
-unsigned long __initdata pgt_buf_start;
-unsigned long __meminitdata pgt_buf_end;
-unsigned long __meminitdata pgt_buf_top;
+#include "mm_internal.h"
-int after_bootmem;
+static unsigned long __initdata pgt_buf_start;
+static unsigned long __initdata pgt_buf_end;
+static unsigned long __initdata pgt_buf_top;
-int direct_gbpages
-#ifdef CONFIG_DIRECT_GBPAGES
- = 1
-#endif
-;
+static unsigned long min_pfn_mapped;
-struct map_range {
- unsigned long start;
- unsigned long end;
- unsigned page_size_mask;
-};
+static bool __initdata can_use_brk_pgt = true;
/*
- * First calculate space needed for kernel direct mapping page tables to cover
- * mr[0].start to mr[nr_range - 1].end, while accounting for possible 2M and 1GB
- * pages. Then find enough contiguous space for those page tables.
+ * Pages returned are already directly mapped.
+ *
+ * Changing that is likely to break Xen, see commit:
+ *
+ * 279b706 x86,xen: introduce x86_init.mapping.pagetable_reserve
+ *
+ * for detailed information.
*/
-static void __init find_early_table_space(struct map_range *mr, int nr_range)
+__ref void *alloc_low_pages(unsigned int num)
{
+ unsigned long pfn;
int i;
- unsigned long puds = 0, pmds = 0, ptes = 0, tables;
- unsigned long start = 0, good_end;
- phys_addr_t base;
- for (i = 0; i < nr_range; i++) {
- unsigned long range, extra;
+ if (after_bootmem) {
+ unsigned int order;
- range = mr[i].end - mr[i].start;
- puds += (range + PUD_SIZE - 1) >> PUD_SHIFT;
+ order = get_order((unsigned long)num << PAGE_SHIFT);
+ return (void *)__get_free_pages(GFP_ATOMIC | __GFP_NOTRACK |
+ __GFP_ZERO, order);
+ }
- if (mr[i].page_size_mask & (1 << PG_LEVEL_1G)) {
- extra = range - ((range >> PUD_SHIFT) << PUD_SHIFT);
- pmds += (extra + PMD_SIZE - 1) >> PMD_SHIFT;
- } else {
- pmds += (range + PMD_SIZE - 1) >> PMD_SHIFT;
- }
+ if ((pgt_buf_end + num) > pgt_buf_top || !can_use_brk_pgt) {
+ unsigned long ret;
+ if (min_pfn_mapped >= max_pfn_mapped)
+ panic("alloc_low_page: ran out of memory");
+ ret = memblock_find_in_range(min_pfn_mapped << PAGE_SHIFT,
+ max_pfn_mapped << PAGE_SHIFT,
+ PAGE_SIZE * num , PAGE_SIZE);
+ if (!ret)
+ panic("alloc_low_page: can not alloc memory");
+ memblock_reserve(ret, PAGE_SIZE * num);
+ pfn = ret >> PAGE_SHIFT;
+ } else {
+ pfn = pgt_buf_end;
+ pgt_buf_end += num;
+ printk(KERN_DEBUG "BRK [%#010lx, %#010lx] PGTABLE\n",
+ pfn << PAGE_SHIFT, (pgt_buf_end << PAGE_SHIFT) - 1);
+ }
- if (mr[i].page_size_mask & (1 << PG_LEVEL_2M)) {
- extra = range - ((range >> PMD_SHIFT) << PMD_SHIFT);
-#ifdef CONFIG_X86_32
- extra += PMD_SIZE;
-#endif
- ptes += (extra + PAGE_SIZE - 1) >> PAGE_SHIFT;
- } else {
- ptes += (range + PAGE_SIZE - 1) >> PAGE_SHIFT;
- }
+ for (i = 0; i < num; i++) {
+ void *adr;
+
+ adr = __va((pfn + i) << PAGE_SHIFT);
+ clear_page(adr);
}
- tables = roundup(puds * sizeof(pud_t), PAGE_SIZE);
- tables += roundup(pmds * sizeof(pmd_t), PAGE_SIZE);
- tables += roundup(ptes * sizeof(pte_t), PAGE_SIZE);
+ return __va(pfn << PAGE_SHIFT);
+}
-#ifdef CONFIG_X86_32
- /* for fixmap */
- tables += roundup(__end_of_fixed_addresses * sizeof(pte_t), PAGE_SIZE);
-#endif
- good_end = max_pfn_mapped << PAGE_SHIFT;
+/* need 4 4k for initial PMD_SIZE, 4k for 0-ISA_END_ADDRESS */
+#define INIT_PGT_BUF_SIZE (5 * PAGE_SIZE)
+RESERVE_BRK(early_pgt_alloc, INIT_PGT_BUF_SIZE);
+void __init early_alloc_pgt_buf(void)
+{
+ unsigned long tables = INIT_PGT_BUF_SIZE;
+ phys_addr_t base;
- base = memblock_find_in_range(start, good_end, tables, PAGE_SIZE);
- if (!base)
- panic("Cannot find space for the kernel page tables");
+ base = __pa(extend_brk(tables, PAGE_SIZE));
pgt_buf_start = base >> PAGE_SHIFT;
pgt_buf_end = pgt_buf_start;
pgt_buf_top = pgt_buf_start + (tables >> PAGE_SHIFT);
+}
- printk(KERN_DEBUG "kernel direct mapping tables up to %#lx @ [mem %#010lx-%#010lx]\n",
- mr[nr_range - 1].end - 1, pgt_buf_start << PAGE_SHIFT,
- (pgt_buf_top << PAGE_SHIFT) - 1);
+int after_bootmem;
+
+int direct_gbpages
+#ifdef CONFIG_DIRECT_GBPAGES
+ = 1
+#endif
+;
+
+static void __init init_gbpages(void)
+{
+#ifdef CONFIG_X86_64
+ if (direct_gbpages && cpu_has_gbpages)
+ printk(KERN_INFO "Using GB pages for direct mapping\n");
+ else
+ direct_gbpages = 0;
+#endif
}
-void __init native_pagetable_reserve(u64 start, u64 end)
+struct map_range {
+ unsigned long start;
+ unsigned long end;
+ unsigned page_size_mask;
+};
+
+static int page_size_mask;
+
+static void __init probe_page_size_mask(void)
{
- memblock_reserve(start, end - start);
+ init_gbpages();
+
+#if !defined(CONFIG_DEBUG_PAGEALLOC) && !defined(CONFIG_KMEMCHECK)
+ /*
+ * For CONFIG_DEBUG_PAGEALLOC, identity mapping will use small pages.
+ * This will simplify cpa(), which otherwise needs to support splitting
+ * large pages into small in interrupt context, etc.
+ */
+ if (direct_gbpages)
+ page_size_mask |= 1 << PG_LEVEL_1G;
+ if (cpu_has_pse)
+ page_size_mask |= 1 << PG_LEVEL_2M;
+#endif
+
+ /* Enable PSE if available */
+ if (cpu_has_pse)
+ set_in_cr4(X86_CR4_PSE);
+
+ /* Enable PGE if available */
+ if (cpu_has_pge) {
+ set_in_cr4(X86_CR4_PGE);
+ __supported_pte_mask |= _PAGE_GLOBAL;
+ }
}
#ifdef CONFIG_X86_32
@@ -122,58 +169,51 @@ static int __meminit save_mr(struct map_range *mr, int nr_range,
}
/*
- * Setup the direct mapping of the physical memory at PAGE_OFFSET.
- * This runs before bootmem is initialized and gets pages directly from
- * the physical memory. To access them they are temporarily mapped.
+ * adjust the page_size_mask for small range to go with
+ * big page size instead small one if nearby are ram too.
*/
-unsigned long __init_refok init_memory_mapping(unsigned long start,
- unsigned long end)
+static void __init_refok adjust_range_page_size_mask(struct map_range *mr,
+ int nr_range)
{
- unsigned long page_size_mask = 0;
- unsigned long start_pfn, end_pfn;
- unsigned long ret = 0;
- unsigned long pos;
-
- struct map_range mr[NR_RANGE_MR];
- int nr_range, i;
- int use_pse, use_gbpages;
+ int i;
- printk(KERN_INFO "init_memory_mapping: [mem %#010lx-%#010lx]\n",
- start, end - 1);
+ for (i = 0; i < nr_range; i++) {
+ if ((page_size_mask & (1<<PG_LEVEL_2M)) &&
+ !(mr[i].page_size_mask & (1<<PG_LEVEL_2M))) {
+ unsigned long start = round_down(mr[i].start, PMD_SIZE);
+ unsigned long end = round_up(mr[i].end, PMD_SIZE);
-#if defined(CONFIG_DEBUG_PAGEALLOC) || defined(CONFIG_KMEMCHECK)
- /*
- * For CONFIG_DEBUG_PAGEALLOC, identity mapping will use small pages.
- * This will simplify cpa(), which otherwise needs to support splitting
- * large pages into small in interrupt context, etc.
- */
- use_pse = use_gbpages = 0;
-#else
- use_pse = cpu_has_pse;
- use_gbpages = direct_gbpages;
+#ifdef CONFIG_X86_32
+ if ((end >> PAGE_SHIFT) > max_low_pfn)
+ continue;
#endif
- /* Enable PSE if available */
- if (cpu_has_pse)
- set_in_cr4(X86_CR4_PSE);
+ if (memblock_is_region_memory(start, end - start))
+ mr[i].page_size_mask |= 1<<PG_LEVEL_2M;
+ }
+ if ((page_size_mask & (1<<PG_LEVEL_1G)) &&
+ !(mr[i].page_size_mask & (1<<PG_LEVEL_1G))) {
+ unsigned long start = round_down(mr[i].start, PUD_SIZE);
+ unsigned long end = round_up(mr[i].end, PUD_SIZE);
- /* Enable PGE if available */
- if (cpu_has_pge) {
- set_in_cr4(X86_CR4_PGE);
- __supported_pte_mask |= _PAGE_GLOBAL;
+ if (memblock_is_region_memory(start, end - start))
+ mr[i].page_size_mask |= 1<<PG_LEVEL_1G;
+ }
}
+}
- if (use_gbpages)
- page_size_mask |= 1 << PG_LEVEL_1G;
- if (use_pse)
- page_size_mask |= 1 << PG_LEVEL_2M;
+static int __meminit split_mem_range(struct map_range *mr, int nr_range,
+ unsigned long start,
+ unsigned long end)
+{
+ unsigned long start_pfn, end_pfn, limit_pfn;
+ unsigned long pfn;
+ int i;
- memset(mr, 0, sizeof(mr));
- nr_range = 0;
+ limit_pfn = PFN_DOWN(end);
/* head if not big page alignment ? */
- start_pfn = start >> PAGE_SHIFT;
- pos = start_pfn << PAGE_SHIFT;
+ pfn = start_pfn = PFN_DOWN(start);
#ifdef CONFIG_X86_32
/*
* Don't use a large page for the first 2/4MB of memory
@@ -181,66 +221,60 @@ unsigned long __init_refok init_memory_mapping(unsigned long start,
* and overlapping MTRRs into large pages can cause
* slowdowns.
*/
- if (pos == 0)
- end_pfn = 1<<(PMD_SHIFT - PAGE_SHIFT);
+ if (pfn == 0)
+ end_pfn = PFN_DOWN(PMD_SIZE);
else
- end_pfn = ((pos + (PMD_SIZE - 1))>>PMD_SHIFT)
- << (PMD_SHIFT - PAGE_SHIFT);
+ end_pfn = round_up(pfn, PFN_DOWN(PMD_SIZE));
#else /* CONFIG_X86_64 */
- end_pfn = ((pos + (PMD_SIZE - 1)) >> PMD_SHIFT)
- << (PMD_SHIFT - PAGE_SHIFT);
+ end_pfn = round_up(pfn, PFN_DOWN(PMD_SIZE));
#endif
- if (end_pfn > (end >> PAGE_SHIFT))
- end_pfn = end >> PAGE_SHIFT;
+ if (end_pfn > limit_pfn)
+ end_pfn = limit_pfn;
if (start_pfn < end_pfn) {
nr_range = save_mr(mr, nr_range, start_pfn, end_pfn, 0);
- pos = end_pfn << PAGE_SHIFT;
+ pfn = end_pfn;
}
/* big page (2M) range */
- start_pfn = ((pos + (PMD_SIZE - 1))>>PMD_SHIFT)
- << (PMD_SHIFT - PAGE_SHIFT);
+ start_pfn = round_up(pfn, PFN_DOWN(PMD_SIZE));
#ifdef CONFIG_X86_32
- end_pfn = (end>>PMD_SHIFT) << (PMD_SHIFT - PAGE_SHIFT);
+ end_pfn = round_down(limit_pfn, PFN_DOWN(PMD_SIZE));
#else /* CONFIG_X86_64 */
- end_pfn = ((pos + (PUD_SIZE - 1))>>PUD_SHIFT)
- << (PUD_SHIFT - PAGE_SHIFT);
- if (end_pfn > ((end>>PMD_SHIFT)<<(PMD_SHIFT - PAGE_SHIFT)))
- end_pfn = ((end>>PMD_SHIFT)<<(PMD_SHIFT - PAGE_SHIFT));
+ end_pfn = round_up(pfn, PFN_DOWN(PUD_SIZE));
+ if (end_pfn > round_down(limit_pfn, PFN_DOWN(PMD_SIZE)))
+ end_pfn = round_down(limit_pfn, PFN_DOWN(PMD_SIZE));
#endif
if (start_pfn < end_pfn) {
nr_range = save_mr(mr, nr_range, start_pfn, end_pfn,
page_size_mask & (1<<PG_LEVEL_2M));
- pos = end_pfn << PAGE_SHIFT;
+ pfn = end_pfn;
}
#ifdef CONFIG_X86_64
/* big page (1G) range */
- start_pfn = ((pos + (PUD_SIZE - 1))>>PUD_SHIFT)
- << (PUD_SHIFT - PAGE_SHIFT);
- end_pfn = (end >> PUD_SHIFT) << (PUD_SHIFT - PAGE_SHIFT);
+ start_pfn = round_up(pfn, PFN_DOWN(PUD_SIZE));
+ end_pfn = round_down(limit_pfn, PFN_DOWN(PUD_SIZE));
if (start_pfn < end_pfn) {
nr_range = save_mr(mr, nr_range, start_pfn, end_pfn,
page_size_mask &
((1<<PG_LEVEL_2M)|(1<<PG_LEVEL_1G)));
- pos = end_pfn << PAGE_SHIFT;
+ pfn = end_pfn;
}
/* tail is not big page (1G) alignment */
- start_pfn = ((pos + (PMD_SIZE - 1))>>PMD_SHIFT)
- << (PMD_SHIFT - PAGE_SHIFT);
- end_pfn = (end >> PMD_SHIFT) << (PMD_SHIFT - PAGE_SHIFT);
+ start_pfn = round_up(pfn, PFN_DOWN(PMD_SIZE));
+ end_pfn = round_down(limit_pfn, PFN_DOWN(PMD_SIZE));
if (start_pfn < end_pfn) {
nr_range = save_mr(mr, nr_range, start_pfn, end_pfn,
page_size_mask & (1<<PG_LEVEL_2M));
- pos = end_pfn << PAGE_SHIFT;
+ pfn = end_pfn;
}
#endif
/* tail is not big page (2M) alignment */
- start_pfn = pos>>PAGE_SHIFT;
- end_pfn = end>>PAGE_SHIFT;
+ start_pfn = pfn;
+ end_pfn = limit_pfn;
nr_range = save_mr(mr, nr_range, start_pfn, end_pfn, 0);
/* try to merge same page size and continuous */
@@ -257,59 +291,169 @@ unsigned long __init_refok init_memory_mapping(unsigned long start,
nr_range--;
}
+ if (!after_bootmem)
+ adjust_range_page_size_mask(mr, nr_range);
+
for (i = 0; i < nr_range; i++)
printk(KERN_DEBUG " [mem %#010lx-%#010lx] page %s\n",
mr[i].start, mr[i].end - 1,
(mr[i].page_size_mask & (1<<PG_LEVEL_1G))?"1G":(
(mr[i].page_size_mask & (1<<PG_LEVEL_2M))?"2M":"4k"));
- /*
- * Find space for the kernel direct mapping tables.
- *
- * Later we should allocate these tables in the local node of the
- * memory mapped. Unfortunately this is done currently before the
- * nodes are discovered.
- */
- if (!after_bootmem)
- find_early_table_space(mr, nr_range);
+ return nr_range;
+}
+
+struct range pfn_mapped[E820_X_MAX];
+int nr_pfn_mapped;
+
+static void add_pfn_range_mapped(unsigned long start_pfn, unsigned long end_pfn)
+{
+ nr_pfn_mapped = add_range_with_merge(pfn_mapped, E820_X_MAX,
+ nr_pfn_mapped, start_pfn, end_pfn);
+ nr_pfn_mapped = clean_sort_range(pfn_mapped, E820_X_MAX);
+
+ max_pfn_mapped = max(max_pfn_mapped, end_pfn);
+
+ if (start_pfn < (1UL<<(32-PAGE_SHIFT)))
+ max_low_pfn_mapped = max(max_low_pfn_mapped,
+ min(end_pfn, 1UL<<(32-PAGE_SHIFT)));
+}
+
+bool pfn_range_is_mapped(unsigned long start_pfn, unsigned long end_pfn)
+{
+ int i;
+
+ for (i = 0; i < nr_pfn_mapped; i++)
+ if ((start_pfn >= pfn_mapped[i].start) &&
+ (end_pfn <= pfn_mapped[i].end))
+ return true;
+
+ return false;
+}
+
+/*
+ * Setup the direct mapping of the physical memory at PAGE_OFFSET.
+ * This runs before bootmem is initialized and gets pages directly from
+ * the physical memory. To access them they are temporarily mapped.
+ */
+unsigned long __init_refok init_memory_mapping(unsigned long start,
+ unsigned long end)
+{
+ struct map_range mr[NR_RANGE_MR];
+ unsigned long ret = 0;
+ int nr_range, i;
+
+ pr_info("init_memory_mapping: [mem %#010lx-%#010lx]\n",
+ start, end - 1);
+
+ memset(mr, 0, sizeof(mr));
+ nr_range = split_mem_range(mr, 0, start, end);
for (i = 0; i < nr_range; i++)
ret = kernel_physical_mapping_init(mr[i].start, mr[i].end,
mr[i].page_size_mask);
-#ifdef CONFIG_X86_32
- early_ioremap_page_table_range_init();
+ add_pfn_range_mapped(start >> PAGE_SHIFT, ret >> PAGE_SHIFT);
- load_cr3(swapper_pg_dir);
-#endif
+ return ret >> PAGE_SHIFT;
+}
- __flush_tlb_all();
+/*
+ * would have hole in the middle or ends, and only ram parts will be mapped.
+ */
+static unsigned long __init init_range_memory_mapping(
+ unsigned long r_start,
+ unsigned long r_end)
+{
+ unsigned long start_pfn, end_pfn;
+ unsigned long mapped_ram_size = 0;
+ int i;
- /*
- * Reserve the kernel pagetable pages we used (pgt_buf_start -
- * pgt_buf_end) and free the other ones (pgt_buf_end - pgt_buf_top)
- * so that they can be reused for other purposes.
- *
- * On native it just means calling memblock_reserve, on Xen it also
- * means marking RW the pagetable pages that we allocated before
- * but that haven't been used.
- *
- * In fact on xen we mark RO the whole range pgt_buf_start -
- * pgt_buf_top, because we have to make sure that when
- * init_memory_mapping reaches the pagetable pages area, it maps
- * RO all the pagetable pages, including the ones that are beyond
- * pgt_buf_end at that time.
- */
- if (!after_bootmem && pgt_buf_end > pgt_buf_start)
- x86_init.mapping.pagetable_reserve(PFN_PHYS(pgt_buf_start),
- PFN_PHYS(pgt_buf_end));
+ for_each_mem_pfn_range(i, MAX_NUMNODES, &start_pfn, &end_pfn, NULL) {
+ u64 start = clamp_val(PFN_PHYS(start_pfn), r_start, r_end);
+ u64 end = clamp_val(PFN_PHYS(end_pfn), r_start, r_end);
+ if (start >= end)
+ continue;
- if (!after_bootmem)
- early_memtest(start, end);
+ /*
+ * if it is overlapping with brk pgt, we need to
+ * alloc pgt buf from memblock instead.
+ */
+ can_use_brk_pgt = max(start, (u64)pgt_buf_end<<PAGE_SHIFT) >=
+ min(end, (u64)pgt_buf_top<<PAGE_SHIFT);
+ init_memory_mapping(start, end);
+ mapped_ram_size += end - start;
+ can_use_brk_pgt = true;
+ }
- return ret >> PAGE_SHIFT;
+ return mapped_ram_size;
}
+/* (PUD_SHIFT-PMD_SHIFT)/2 */
+#define STEP_SIZE_SHIFT 5
+void __init init_mem_mapping(void)
+{
+ unsigned long end, real_end, start, last_start;
+ unsigned long step_size;
+ unsigned long addr;
+ unsigned long mapped_ram_size = 0;
+ unsigned long new_mapped_ram_size;
+
+ probe_page_size_mask();
+
+#ifdef CONFIG_X86_64
+ end = max_pfn << PAGE_SHIFT;
+#else
+ end = max_low_pfn << PAGE_SHIFT;
+#endif
+
+ /* the ISA range is always mapped regardless of memory holes */
+ init_memory_mapping(0, ISA_END_ADDRESS);
+
+ /* xen has big range in reserved near end of ram, skip it at first */
+ addr = memblock_find_in_range(ISA_END_ADDRESS, end, PMD_SIZE,
+ PAGE_SIZE);
+ real_end = addr + PMD_SIZE;
+
+ /* step_size need to be small so pgt_buf from BRK could cover it */
+ step_size = PMD_SIZE;
+ max_pfn_mapped = 0; /* will get exact value next */
+ min_pfn_mapped = real_end >> PAGE_SHIFT;
+ last_start = start = real_end;
+ while (last_start > ISA_END_ADDRESS) {
+ if (last_start > step_size) {
+ start = round_down(last_start - 1, step_size);
+ if (start < ISA_END_ADDRESS)
+ start = ISA_END_ADDRESS;
+ } else
+ start = ISA_END_ADDRESS;
+ new_mapped_ram_size = init_range_memory_mapping(start,
+ last_start);
+ last_start = start;
+ min_pfn_mapped = last_start >> PAGE_SHIFT;
+ /* only increase step_size after big range get mapped */
+ if (new_mapped_ram_size > mapped_ram_size)
+ step_size <<= STEP_SIZE_SHIFT;
+ mapped_ram_size += new_mapped_ram_size;
+ }
+
+ if (real_end < end)
+ init_range_memory_mapping(real_end, end);
+
+#ifdef CONFIG_X86_64
+ if (max_pfn > max_low_pfn) {
+ /* can we preseve max_low_pfn ?*/
+ max_low_pfn = max_pfn;
+ }
+#else
+ early_ioremap_page_table_range_init();
+#endif
+
+ load_cr3(swapper_pg_dir);
+ __flush_tlb_all();
+
+ early_memtest(0, max_pfn_mapped << PAGE_SHIFT);
+}
/*
* devmem_is_allowed() checks to see if /dev/mem access to a certain address
@@ -391,6 +535,15 @@ void free_initmem(void)
#ifdef CONFIG_BLK_DEV_INITRD
void __init free_initrd_mem(unsigned long start, unsigned long end)
{
+#ifdef CONFIG_MICROCODE_EARLY
+ /*
+ * Remember, initrd memory may contain microcode or other useful things.
+ * Before we lose initrd mem, we need to find a place to hold them
+ * now that normal virtual memory is enabled.
+ */
+ save_microcode_in_initrd();
+#endif
+
/*
* end could be not aligned, and We can not align that,
* decompresser could be confused by aligned initrd_end
diff --git a/arch/x86/mm/init_32.c b/arch/x86/mm/init_32.c
index 745d66b843c8..2d19001151d5 100644
--- a/arch/x86/mm/init_32.c
+++ b/arch/x86/mm/init_32.c
@@ -53,25 +53,14 @@
#include <asm/page_types.h>
#include <asm/init.h>
+#include "mm_internal.h"
+
unsigned long highstart_pfn, highend_pfn;
static noinline int do_test_wp_bit(void);
bool __read_mostly __vmalloc_start_set = false;
-static __init void *alloc_low_page(void)
-{
- unsigned long pfn = pgt_buf_end++;
- void *adr;
-
- if (pfn >= pgt_buf_top)
- panic("alloc_low_page: ran out of memory");
-
- adr = __va(pfn * PAGE_SIZE);
- clear_page(adr);
- return adr;
-}
-
/*
* Creates a middle page table and puts a pointer to it in the
* given global directory entry. This only returns the gd entry
@@ -84,10 +73,7 @@ static pmd_t * __init one_md_table_init(pgd_t *pgd)
#ifdef CONFIG_X86_PAE
if (!(pgd_val(*pgd) & _PAGE_PRESENT)) {
- if (after_bootmem)
- pmd_table = (pmd_t *)alloc_bootmem_pages(PAGE_SIZE);
- else
- pmd_table = (pmd_t *)alloc_low_page();
+ pmd_table = (pmd_t *)alloc_low_page();
paravirt_alloc_pmd(&init_mm, __pa(pmd_table) >> PAGE_SHIFT);
set_pgd(pgd, __pgd(__pa(pmd_table) | _PAGE_PRESENT));
pud = pud_offset(pgd, 0);
@@ -109,17 +95,7 @@ static pmd_t * __init one_md_table_init(pgd_t *pgd)
static pte_t * __init one_page_table_init(pmd_t *pmd)
{
if (!(pmd_val(*pmd) & _PAGE_PRESENT)) {
- pte_t *page_table = NULL;
-
- if (after_bootmem) {
-#if defined(CONFIG_DEBUG_PAGEALLOC) || defined(CONFIG_KMEMCHECK)
- page_table = (pte_t *) alloc_bootmem_pages(PAGE_SIZE);
-#endif
- if (!page_table)
- page_table =
- (pte_t *)alloc_bootmem_pages(PAGE_SIZE);
- } else
- page_table = (pte_t *)alloc_low_page();
+ pte_t *page_table = (pte_t *)alloc_low_page();
paravirt_alloc_pte(&init_mm, __pa(page_table) >> PAGE_SHIFT);
set_pmd(pmd, __pmd(__pa(page_table) | _PAGE_TABLE));
@@ -146,8 +122,39 @@ pte_t * __init populate_extra_pte(unsigned long vaddr)
return one_page_table_init(pmd) + pte_idx;
}
+static unsigned long __init
+page_table_range_init_count(unsigned long start, unsigned long end)
+{
+ unsigned long count = 0;
+#ifdef CONFIG_HIGHMEM
+ int pmd_idx_kmap_begin = fix_to_virt(FIX_KMAP_END) >> PMD_SHIFT;
+ int pmd_idx_kmap_end = fix_to_virt(FIX_KMAP_BEGIN) >> PMD_SHIFT;
+ int pgd_idx, pmd_idx;
+ unsigned long vaddr;
+
+ if (pmd_idx_kmap_begin == pmd_idx_kmap_end)
+ return 0;
+
+ vaddr = start;
+ pgd_idx = pgd_index(vaddr);
+
+ for ( ; (pgd_idx < PTRS_PER_PGD) && (vaddr != end); pgd_idx++) {
+ for (; (pmd_idx < PTRS_PER_PMD) && (vaddr != end);
+ pmd_idx++) {
+ if ((vaddr >> PMD_SHIFT) >= pmd_idx_kmap_begin &&
+ (vaddr >> PMD_SHIFT) <= pmd_idx_kmap_end)
+ count++;
+ vaddr += PMD_SIZE;
+ }
+ pmd_idx = 0;
+ }
+#endif
+ return count;
+}
+
static pte_t *__init page_table_kmap_check(pte_t *pte, pmd_t *pmd,
- unsigned long vaddr, pte_t *lastpte)
+ unsigned long vaddr, pte_t *lastpte,
+ void **adr)
{
#ifdef CONFIG_HIGHMEM
/*
@@ -161,16 +168,15 @@ static pte_t *__init page_table_kmap_check(pte_t *pte, pmd_t *pmd,
if (pmd_idx_kmap_begin != pmd_idx_kmap_end
&& (vaddr >> PMD_SHIFT) >= pmd_idx_kmap_begin
- && (vaddr >> PMD_SHIFT) <= pmd_idx_kmap_end
- && ((__pa(pte) >> PAGE_SHIFT) < pgt_buf_start
- || (__pa(pte) >> PAGE_SHIFT) >= pgt_buf_end)) {
+ && (vaddr >> PMD_SHIFT) <= pmd_idx_kmap_end) {
pte_t *newpte;
int i;
BUG_ON(after_bootmem);
- newpte = alloc_low_page();
+ newpte = *adr;
for (i = 0; i < PTRS_PER_PTE; i++)
set_pte(newpte + i, pte[i]);
+ *adr = (void *)(((unsigned long)(*adr)) + PAGE_SIZE);
paravirt_alloc_pte(&init_mm, __pa(newpte) >> PAGE_SHIFT);
set_pmd(pmd, __pmd(__pa(newpte)|_PAGE_TABLE));
@@ -204,6 +210,11 @@ page_table_range_init(unsigned long start, unsigned long end, pgd_t *pgd_base)
pgd_t *pgd;
pmd_t *pmd;
pte_t *pte = NULL;
+ unsigned long count = page_table_range_init_count(start, end);
+ void *adr = NULL;
+
+ if (count)
+ adr = alloc_low_pages(count);
vaddr = start;
pgd_idx = pgd_index(vaddr);
@@ -216,7 +227,7 @@ page_table_range_init(unsigned long start, unsigned long end, pgd_t *pgd_base)
for (; (pmd_idx < PTRS_PER_PMD) && (vaddr != end);
pmd++, pmd_idx++) {
pte = page_table_kmap_check(one_page_table_init(pmd),
- pmd, vaddr, pte);
+ pmd, vaddr, pte, &adr);
vaddr += PMD_SIZE;
}
@@ -310,6 +321,7 @@ repeat:
__pgprot(PTE_IDENT_ATTR |
_PAGE_PSE);
+ pfn &= PMD_MASK >> PAGE_SHIFT;
addr2 = (pfn + PTRS_PER_PTE-1) * PAGE_SIZE +
PAGE_OFFSET + PAGE_SIZE-1;
@@ -455,9 +467,14 @@ void __init native_pagetable_init(void)
/*
* Remove any mappings which extend past the end of physical
- * memory from the boot time page table:
+ * memory from the boot time page table.
+ * In virtual address space, we should have at least two pages
+ * from VMALLOC_END to pkmap or fixmap according to VMALLOC_END
+ * definition. And max_low_pfn is set to VMALLOC_END physical
+ * address. If initial memory mapping is doing right job, we
+ * should have pte used near max_low_pfn or one pmd is not present.
*/
- for (pfn = max_low_pfn + 1; pfn < 1<<(32-PAGE_SHIFT); pfn++) {
+ for (pfn = max_low_pfn; pfn < 1<<(32-PAGE_SHIFT); pfn++) {
va = PAGE_OFFSET + (pfn<<PAGE_SHIFT);
pgd = base + pgd_index(va);
if (!pgd_present(*pgd))
@@ -468,10 +485,19 @@ void __init native_pagetable_init(void)
if (!pmd_present(*pmd))
break;
+ /* should not be large page here */
+ if (pmd_large(*pmd)) {
+ pr_warn("try to clear pte for ram above max_low_pfn: pfn: %lx pmd: %p pmd phys: %lx, but pmd is big page and is not using pte !\n",
+ pfn, pmd, __pa(pmd));
+ BUG_ON(1);
+ }
+
pte = pte_offset_kernel(pmd, va);
if (!pte_present(*pte))
break;
+ printk(KERN_DEBUG "clearing pte for ram above max_low_pfn: pfn: %lx pmd: %p pmd phys: %lx pte: %p pte phys: %lx\n",
+ pfn, pmd, __pa(pmd), pte, __pa(pte));
pte_clear(NULL, va, pte);
}
paravirt_alloc_pmd(&init_mm, __pa(base) >> PAGE_SHIFT);
@@ -550,7 +576,7 @@ early_param("highmem", parse_highmem);
* artificially via the highmem=x boot parameter then create
* it:
*/
-void __init lowmem_pfn_init(void)
+static void __init lowmem_pfn_init(void)
{
/* max_low_pfn is 0, we already have early_res support */
max_low_pfn = max_pfn;
@@ -586,7 +612,7 @@ void __init lowmem_pfn_init(void)
* We have more RAM than fits into lowmem - we try to put it into
* highmem, also taking the highmem=x boot parameter into account:
*/
-void __init highmem_pfn_init(void)
+static void __init highmem_pfn_init(void)
{
max_low_pfn = MAXMEM_PFN;
@@ -669,8 +695,6 @@ void __init setup_bootmem_allocator(void)
printk(KERN_INFO " mapped low ram: 0 - %08lx\n",
max_pfn_mapped<<PAGE_SHIFT);
printk(KERN_INFO " low ram: 0 - %08lx\n", max_low_pfn<<PAGE_SHIFT);
-
- after_bootmem = 1;
}
/*
@@ -753,6 +777,8 @@ void __init mem_init(void)
if (page_is_ram(tmp) && PageReserved(pfn_to_page(tmp)))
reservedpages++;
+ after_bootmem = 1;
+
codesize = (unsigned long) &_etext - (unsigned long) &_text;
datasize = (unsigned long) &_edata - (unsigned long) &_etext;
initsize = (unsigned long) &__init_end - (unsigned long) &__init_begin;
@@ -836,6 +862,18 @@ int arch_add_memory(int nid, u64 start, u64 size)
return __add_pages(nid, zone, start_pfn, nr_pages);
}
+
+#ifdef CONFIG_MEMORY_HOTREMOVE
+int arch_remove_memory(u64 start, u64 size)
+{
+ unsigned long start_pfn = start >> PAGE_SHIFT;
+ unsigned long nr_pages = size >> PAGE_SHIFT;
+ struct zone *zone;
+
+ zone = page_zone(pfn_to_page(start_pfn));
+ return __remove_pages(zone, start_pfn, nr_pages);
+}
+#endif
#endif
/*
diff --git a/arch/x86/mm/init_64.c b/arch/x86/mm/init_64.c
index d6eeead43758..474e28f10815 100644
--- a/arch/x86/mm/init_64.c
+++ b/arch/x86/mm/init_64.c
@@ -54,6 +54,82 @@
#include <asm/uv/uv.h>
#include <asm/setup.h>
+#include "mm_internal.h"
+
+static void ident_pmd_init(unsigned long pmd_flag, pmd_t *pmd_page,
+ unsigned long addr, unsigned long end)
+{
+ addr &= PMD_MASK;
+ for (; addr < end; addr += PMD_SIZE) {
+ pmd_t *pmd = pmd_page + pmd_index(addr);
+
+ if (!pmd_present(*pmd))
+ set_pmd(pmd, __pmd(addr | pmd_flag));
+ }
+}
+static int ident_pud_init(struct x86_mapping_info *info, pud_t *pud_page,
+ unsigned long addr, unsigned long end)
+{
+ unsigned long next;
+
+ for (; addr < end; addr = next) {
+ pud_t *pud = pud_page + pud_index(addr);
+ pmd_t *pmd;
+
+ next = (addr & PUD_MASK) + PUD_SIZE;
+ if (next > end)
+ next = end;
+
+ if (pud_present(*pud)) {
+ pmd = pmd_offset(pud, 0);
+ ident_pmd_init(info->pmd_flag, pmd, addr, next);
+ continue;
+ }
+ pmd = (pmd_t *)info->alloc_pgt_page(info->context);
+ if (!pmd)
+ return -ENOMEM;
+ ident_pmd_init(info->pmd_flag, pmd, addr, next);
+ set_pud(pud, __pud(__pa(pmd) | _KERNPG_TABLE));
+ }
+
+ return 0;
+}
+
+int kernel_ident_mapping_init(struct x86_mapping_info *info, pgd_t *pgd_page,
+ unsigned long addr, unsigned long end)
+{
+ unsigned long next;
+ int result;
+ int off = info->kernel_mapping ? pgd_index(__PAGE_OFFSET) : 0;
+
+ for (; addr < end; addr = next) {
+ pgd_t *pgd = pgd_page + pgd_index(addr) + off;
+ pud_t *pud;
+
+ next = (addr & PGDIR_MASK) + PGDIR_SIZE;
+ if (next > end)
+ next = end;
+
+ if (pgd_present(*pgd)) {
+ pud = pud_offset(pgd, 0);
+ result = ident_pud_init(info, pud, addr, next);
+ if (result)
+ return result;
+ continue;
+ }
+
+ pud = (pud_t *)info->alloc_pgt_page(info->context);
+ if (!pud)
+ return -ENOMEM;
+ result = ident_pud_init(info, pud, addr, next);
+ if (result)
+ return result;
+ set_pgd(pgd, __pgd(__pa(pud) | _KERNPG_TABLE));
+ }
+
+ return 0;
+}
+
static int __init parse_direct_gbpages_off(char *arg)
{
direct_gbpages = 0;
@@ -302,10 +378,18 @@ void __init init_extra_mapping_uc(unsigned long phys, unsigned long size)
void __init cleanup_highmap(void)
{
unsigned long vaddr = __START_KERNEL_map;
- unsigned long vaddr_end = __START_KERNEL_map + (max_pfn_mapped << PAGE_SHIFT);
+ unsigned long vaddr_end = __START_KERNEL_map + KERNEL_IMAGE_SIZE;
unsigned long end = roundup((unsigned long)_brk_end, PMD_SIZE) - 1;
pmd_t *pmd = level2_kernel_pgt;
+ /*
+ * Native path, max_pfn_mapped is not set yet.
+ * Xen has valid max_pfn_mapped set in
+ * arch/x86/xen/mmu.c:xen_setup_kernel_pagetable().
+ */
+ if (max_pfn_mapped)
+ vaddr_end = __START_KERNEL_map + (max_pfn_mapped << PAGE_SHIFT);
+
for (; vaddr + PMD_SIZE - 1 < vaddr_end; pmd++, vaddr += PMD_SIZE) {
if (pmd_none(*pmd))
continue;
@@ -314,69 +398,24 @@ void __init cleanup_highmap(void)
}
}
-static __ref void *alloc_low_page(unsigned long *phys)
-{
- unsigned long pfn = pgt_buf_end++;
- void *adr;
-
- if (after_bootmem) {
- adr = (void *)get_zeroed_page(GFP_ATOMIC | __GFP_NOTRACK);
- *phys = __pa(adr);
-
- return adr;
- }
-
- if (pfn >= pgt_buf_top)
- panic("alloc_low_page: ran out of memory");
-
- adr = early_memremap(pfn * PAGE_SIZE, PAGE_SIZE);
- clear_page(adr);
- *phys = pfn * PAGE_SIZE;
- return adr;
-}
-
-static __ref void *map_low_page(void *virt)
-{
- void *adr;
- unsigned long phys, left;
-
- if (after_bootmem)
- return virt;
-
- phys = __pa(virt);
- left = phys & (PAGE_SIZE - 1);
- adr = early_memremap(phys & PAGE_MASK, PAGE_SIZE);
- adr = (void *)(((unsigned long)adr) | left);
-
- return adr;
-}
-
-static __ref void unmap_low_page(void *adr)
-{
- if (after_bootmem)
- return;
-
- early_iounmap((void *)((unsigned long)adr & PAGE_MASK), PAGE_SIZE);
-}
-
static unsigned long __meminit
phys_pte_init(pte_t *pte_page, unsigned long addr, unsigned long end,
pgprot_t prot)
{
- unsigned pages = 0;
+ unsigned long pages = 0, next;
unsigned long last_map_addr = end;
int i;
pte_t *pte = pte_page + pte_index(addr);
- for(i = pte_index(addr); i < PTRS_PER_PTE; i++, addr += PAGE_SIZE, pte++) {
-
+ for (i = pte_index(addr); i < PTRS_PER_PTE; i++, addr = next, pte++) {
+ next = (addr & PAGE_MASK) + PAGE_SIZE;
if (addr >= end) {
- if (!after_bootmem) {
- for(; i < PTRS_PER_PTE; i++, pte++)
- set_pte(pte, __pte(0));
- }
- break;
+ if (!after_bootmem &&
+ !e820_any_mapped(addr & PAGE_MASK, next, E820_RAM) &&
+ !e820_any_mapped(addr & PAGE_MASK, next, E820_RESERVED_KERN))
+ set_pte(pte, __pte(0));
+ continue;
}
/*
@@ -414,28 +453,25 @@ phys_pmd_init(pmd_t *pmd_page, unsigned long address, unsigned long end,
int i = pmd_index(address);
for (; i < PTRS_PER_PMD; i++, address = next) {
- unsigned long pte_phys;
pmd_t *pmd = pmd_page + pmd_index(address);
pte_t *pte;
pgprot_t new_prot = prot;
+ next = (address & PMD_MASK) + PMD_SIZE;
if (address >= end) {
- if (!after_bootmem) {
- for (; i < PTRS_PER_PMD; i++, pmd++)
- set_pmd(pmd, __pmd(0));
- }
- break;
+ if (!after_bootmem &&
+ !e820_any_mapped(address & PMD_MASK, next, E820_RAM) &&
+ !e820_any_mapped(address & PMD_MASK, next, E820_RESERVED_KERN))
+ set_pmd(pmd, __pmd(0));
+ continue;
}
- next = (address & PMD_MASK) + PMD_SIZE;
-
if (pmd_val(*pmd)) {
if (!pmd_large(*pmd)) {
spin_lock(&init_mm.page_table_lock);
- pte = map_low_page((pte_t *)pmd_page_vaddr(*pmd));
+ pte = (pte_t *)pmd_page_vaddr(*pmd);
last_map_addr = phys_pte_init(pte, address,
end, prot);
- unmap_low_page(pte);
spin_unlock(&init_mm.page_table_lock);
continue;
}
@@ -464,19 +500,18 @@ phys_pmd_init(pmd_t *pmd_page, unsigned long address, unsigned long end,
pages++;
spin_lock(&init_mm.page_table_lock);
set_pte((pte_t *)pmd,
- pfn_pte(address >> PAGE_SHIFT,
+ pfn_pte((address & PMD_MASK) >> PAGE_SHIFT,
__pgprot(pgprot_val(prot) | _PAGE_PSE)));
spin_unlock(&init_mm.page_table_lock);
last_map_addr = next;
continue;
}
- pte = alloc_low_page(&pte_phys);
+ pte = alloc_low_page();
last_map_addr = phys_pte_init(pte, address, end, new_prot);
- unmap_low_page(pte);
spin_lock(&init_mm.page_table_lock);
- pmd_populate_kernel(&init_mm, pmd, __va(pte_phys));
+ pmd_populate_kernel(&init_mm, pmd, pte);
spin_unlock(&init_mm.page_table_lock);
}
update_page_count(PG_LEVEL_2M, pages);
@@ -492,27 +527,24 @@ phys_pud_init(pud_t *pud_page, unsigned long addr, unsigned long end,
int i = pud_index(addr);
for (; i < PTRS_PER_PUD; i++, addr = next) {
- unsigned long pmd_phys;
pud_t *pud = pud_page + pud_index(addr);
pmd_t *pmd;
pgprot_t prot = PAGE_KERNEL;
- if (addr >= end)
- break;
-
next = (addr & PUD_MASK) + PUD_SIZE;
-
- if (!after_bootmem && !e820_any_mapped(addr, next, 0)) {
- set_pud(pud, __pud(0));
+ if (addr >= end) {
+ if (!after_bootmem &&
+ !e820_any_mapped(addr & PUD_MASK, next, E820_RAM) &&
+ !e820_any_mapped(addr & PUD_MASK, next, E820_RESERVED_KERN))
+ set_pud(pud, __pud(0));
continue;
}
if (pud_val(*pud)) {
if (!pud_large(*pud)) {
- pmd = map_low_page(pmd_offset(pud, 0));
+ pmd = pmd_offset(pud, 0);
last_map_addr = phys_pmd_init(pmd, addr, end,
page_size_mask, prot);
- unmap_low_page(pmd);
__flush_tlb_all();
continue;
}
@@ -541,19 +573,19 @@ phys_pud_init(pud_t *pud_page, unsigned long addr, unsigned long end,
pages++;
spin_lock(&init_mm.page_table_lock);
set_pte((pte_t *)pud,
- pfn_pte(addr >> PAGE_SHIFT, PAGE_KERNEL_LARGE));
+ pfn_pte((addr & PUD_MASK) >> PAGE_SHIFT,
+ PAGE_KERNEL_LARGE));
spin_unlock(&init_mm.page_table_lock);
last_map_addr = next;
continue;
}
- pmd = alloc_low_page(&pmd_phys);
+ pmd = alloc_low_page();
last_map_addr = phys_pmd_init(pmd, addr, end, page_size_mask,
prot);
- unmap_low_page(pmd);
spin_lock(&init_mm.page_table_lock);
- pud_populate(&init_mm, pud, __va(pmd_phys));
+ pud_populate(&init_mm, pud, pmd);
spin_unlock(&init_mm.page_table_lock);
}
__flush_tlb_all();
@@ -578,28 +610,23 @@ kernel_physical_mapping_init(unsigned long start,
for (; start < end; start = next) {
pgd_t *pgd = pgd_offset_k(start);
- unsigned long pud_phys;
pud_t *pud;
- next = (start + PGDIR_SIZE) & PGDIR_MASK;
- if (next > end)
- next = end;
+ next = (start & PGDIR_MASK) + PGDIR_SIZE;
if (pgd_val(*pgd)) {
- pud = map_low_page((pud_t *)pgd_page_vaddr(*pgd));
+ pud = (pud_t *)pgd_page_vaddr(*pgd);
last_map_addr = phys_pud_init(pud, __pa(start),
__pa(end), page_size_mask);
- unmap_low_page(pud);
continue;
}
- pud = alloc_low_page(&pud_phys);
- last_map_addr = phys_pud_init(pud, __pa(start), __pa(next),
+ pud = alloc_low_page();
+ last_map_addr = phys_pud_init(pud, __pa(start), __pa(end),
page_size_mask);
- unmap_low_page(pud);
spin_lock(&init_mm.page_table_lock);
- pgd_populate(&init_mm, pgd, __va(pud_phys));
+ pgd_populate(&init_mm, pgd, pud);
spin_unlock(&init_mm.page_table_lock);
pgd_changed = true;
}
@@ -664,13 +691,11 @@ int arch_add_memory(int nid, u64 start, u64 size)
{
struct pglist_data *pgdat = NODE_DATA(nid);
struct zone *zone = pgdat->node_zones + ZONE_NORMAL;
- unsigned long last_mapped_pfn, start_pfn = start >> PAGE_SHIFT;
+ unsigned long start_pfn = start >> PAGE_SHIFT;
unsigned long nr_pages = size >> PAGE_SHIFT;
int ret;
- last_mapped_pfn = init_memory_mapping(start, start + size);
- if (last_mapped_pfn > max_pfn_mapped)
- max_pfn_mapped = last_mapped_pfn;
+ init_memory_mapping(start, start + size);
ret = __add_pages(nid, zone, start_pfn, nr_pages);
WARN_ON_ONCE(ret);
@@ -682,10 +707,357 @@ int arch_add_memory(int nid, u64 start, u64 size)
}
EXPORT_SYMBOL_GPL(arch_add_memory);
+#define PAGE_INUSE 0xFD
+
+static void __meminit free_pagetable(struct page *page, int order)
+{
+ struct zone *zone;
+ bool bootmem = false;
+ unsigned long magic;
+ unsigned int nr_pages = 1 << order;
+
+ /* bootmem page has reserved flag */
+ if (PageReserved(page)) {
+ __ClearPageReserved(page);
+ bootmem = true;
+
+ magic = (unsigned long)page->lru.next;
+ if (magic == SECTION_INFO || magic == MIX_SECTION_INFO) {
+ while (nr_pages--)
+ put_page_bootmem(page++);
+ } else
+ __free_pages_bootmem(page, order);
+ } else
+ free_pages((unsigned long)page_address(page), order);
+
+ /*
+ * SECTION_INFO pages and MIX_SECTION_INFO pages
+ * are all allocated by bootmem.
+ */
+ if (bootmem) {
+ zone = page_zone(page);
+ zone_span_writelock(zone);
+ zone->present_pages += nr_pages;
+ zone_span_writeunlock(zone);
+ totalram_pages += nr_pages;
+ }
+}
+
+static void __meminit free_pte_table(pte_t *pte_start, pmd_t *pmd)
+{
+ pte_t *pte;
+ int i;
+
+ for (i = 0; i < PTRS_PER_PTE; i++) {
+ pte = pte_start + i;
+ if (pte_val(*pte))
+ return;
+ }
+
+ /* free a pte talbe */
+ free_pagetable(pmd_page(*pmd), 0);
+ spin_lock(&init_mm.page_table_lock);
+ pmd_clear(pmd);
+ spin_unlock(&init_mm.page_table_lock);
+}
+
+static void __meminit free_pmd_table(pmd_t *pmd_start, pud_t *pud)
+{
+ pmd_t *pmd;
+ int i;
+
+ for (i = 0; i < PTRS_PER_PMD; i++) {
+ pmd = pmd_start + i;
+ if (pmd_val(*pmd))
+ return;
+ }
+
+ /* free a pmd talbe */
+ free_pagetable(pud_page(*pud), 0);
+ spin_lock(&init_mm.page_table_lock);
+ pud_clear(pud);
+ spin_unlock(&init_mm.page_table_lock);
+}
+
+/* Return true if pgd is changed, otherwise return false. */
+static bool __meminit free_pud_table(pud_t *pud_start, pgd_t *pgd)
+{
+ pud_t *pud;
+ int i;
+
+ for (i = 0; i < PTRS_PER_PUD; i++) {
+ pud = pud_start + i;
+ if (pud_val(*pud))
+ return false;
+ }
+
+ /* free a pud table */
+ free_pagetable(pgd_page(*pgd), 0);
+ spin_lock(&init_mm.page_table_lock);
+ pgd_clear(pgd);
+ spin_unlock(&init_mm.page_table_lock);
+
+ return true;
+}
+
+static void __meminit
+remove_pte_table(pte_t *pte_start, unsigned long addr, unsigned long end,
+ bool direct)
+{
+ unsigned long next, pages = 0;
+ pte_t *pte;
+ void *page_addr;
+ phys_addr_t phys_addr;
+
+ pte = pte_start + pte_index(addr);
+ for (; addr < end; addr = next, pte++) {
+ next = (addr + PAGE_SIZE) & PAGE_MASK;
+ if (next > end)
+ next = end;
+
+ if (!pte_present(*pte))
+ continue;
+
+ /*
+ * We mapped [0,1G) memory as identity mapping when
+ * initializing, in arch/x86/kernel/head_64.S. These
+ * pagetables cannot be removed.
+ */
+ phys_addr = pte_val(*pte) + (addr & PAGE_MASK);
+ if (phys_addr < (phys_addr_t)0x40000000)
+ return;
+
+ if (IS_ALIGNED(addr, PAGE_SIZE) &&
+ IS_ALIGNED(next, PAGE_SIZE)) {
+ /*
+ * Do not free direct mapping pages since they were
+ * freed when offlining, or simplely not in use.
+ */
+ if (!direct)
+ free_pagetable(pte_page(*pte), 0);
+
+ spin_lock(&init_mm.page_table_lock);
+ pte_clear(&init_mm, addr, pte);
+ spin_unlock(&init_mm.page_table_lock);
+
+ /* For non-direct mapping, pages means nothing. */
+ pages++;
+ } else {
+ /*
+ * If we are here, we are freeing vmemmap pages since
+ * direct mapped memory ranges to be freed are aligned.
+ *
+ * If we are not removing the whole page, it means
+ * other page structs in this page are being used and
+ * we canot remove them. So fill the unused page_structs
+ * with 0xFD, and remove the page when it is wholly
+ * filled with 0xFD.
+ */
+ memset((void *)addr, PAGE_INUSE, next - addr);
+
+ page_addr = page_address(pte_page(*pte));
+ if (!memchr_inv(page_addr, PAGE_INUSE, PAGE_SIZE)) {
+ free_pagetable(pte_page(*pte), 0);
+
+ spin_lock(&init_mm.page_table_lock);
+ pte_clear(&init_mm, addr, pte);
+ spin_unlock(&init_mm.page_table_lock);
+ }
+ }
+ }
+
+ /* Call free_pte_table() in remove_pmd_table(). */
+ flush_tlb_all();
+ if (direct)
+ update_page_count(PG_LEVEL_4K, -pages);
+}
+
+static void __meminit
+remove_pmd_table(pmd_t *pmd_start, unsigned long addr, unsigned long end,
+ bool direct)
+{
+ unsigned long next, pages = 0;
+ pte_t *pte_base;
+ pmd_t *pmd;
+ void *page_addr;
+
+ pmd = pmd_start + pmd_index(addr);
+ for (; addr < end; addr = next, pmd++) {
+ next = pmd_addr_end(addr, end);
+
+ if (!pmd_present(*pmd))
+ continue;
+
+ if (pmd_large(*pmd)) {
+ if (IS_ALIGNED(addr, PMD_SIZE) &&
+ IS_ALIGNED(next, PMD_SIZE)) {
+ if (!direct)
+ free_pagetable(pmd_page(*pmd),
+ get_order(PMD_SIZE));
+
+ spin_lock(&init_mm.page_table_lock);
+ pmd_clear(pmd);
+ spin_unlock(&init_mm.page_table_lock);
+ pages++;
+ } else {
+ /* If here, we are freeing vmemmap pages. */
+ memset((void *)addr, PAGE_INUSE, next - addr);
+
+ page_addr = page_address(pmd_page(*pmd));
+ if (!memchr_inv(page_addr, PAGE_INUSE,
+ PMD_SIZE)) {
+ free_pagetable(pmd_page(*pmd),
+ get_order(PMD_SIZE));
+
+ spin_lock(&init_mm.page_table_lock);
+ pmd_clear(pmd);
+ spin_unlock(&init_mm.page_table_lock);
+ }
+ }
+
+ continue;
+ }
+
+ pte_base = (pte_t *)pmd_page_vaddr(*pmd);
+ remove_pte_table(pte_base, addr, next, direct);
+ free_pte_table(pte_base, pmd);
+ }
+
+ /* Call free_pmd_table() in remove_pud_table(). */
+ if (direct)
+ update_page_count(PG_LEVEL_2M, -pages);
+}
+
+static void __meminit
+remove_pud_table(pud_t *pud_start, unsigned long addr, unsigned long end,
+ bool direct)
+{
+ unsigned long next, pages = 0;
+ pmd_t *pmd_base;
+ pud_t *pud;
+ void *page_addr;
+
+ pud = pud_start + pud_index(addr);
+ for (; addr < end; addr = next, pud++) {
+ next = pud_addr_end(addr, end);
+
+ if (!pud_present(*pud))
+ continue;
+
+ if (pud_large(*pud)) {
+ if (IS_ALIGNED(addr, PUD_SIZE) &&
+ IS_ALIGNED(next, PUD_SIZE)) {
+ if (!direct)
+ free_pagetable(pud_page(*pud),
+ get_order(PUD_SIZE));
+
+ spin_lock(&init_mm.page_table_lock);
+ pud_clear(pud);
+ spin_unlock(&init_mm.page_table_lock);
+ pages++;
+ } else {
+ /* If here, we are freeing vmemmap pages. */
+ memset((void *)addr, PAGE_INUSE, next - addr);
+
+ page_addr = page_address(pud_page(*pud));
+ if (!memchr_inv(page_addr, PAGE_INUSE,
+ PUD_SIZE)) {
+ free_pagetable(pud_page(*pud),
+ get_order(PUD_SIZE));
+
+ spin_lock(&init_mm.page_table_lock);
+ pud_clear(pud);
+ spin_unlock(&init_mm.page_table_lock);
+ }
+ }
+
+ continue;
+ }
+
+ pmd_base = (pmd_t *)pud_page_vaddr(*pud);
+ remove_pmd_table(pmd_base, addr, next, direct);
+ free_pmd_table(pmd_base, pud);
+ }
+
+ if (direct)
+ update_page_count(PG_LEVEL_1G, -pages);
+}
+
+/* start and end are both virtual address. */
+static void __meminit
+remove_pagetable(unsigned long start, unsigned long end, bool direct)
+{
+ unsigned long next;
+ pgd_t *pgd;
+ pud_t *pud;
+ bool pgd_changed = false;
+
+ for (; start < end; start = next) {
+ next = pgd_addr_end(start, end);
+
+ pgd = pgd_offset_k(start);
+ if (!pgd_present(*pgd))
+ continue;
+
+ pud = (pud_t *)pgd_page_vaddr(*pgd);
+ remove_pud_table(pud, start, next, direct);
+ if (free_pud_table(pud, pgd))
+ pgd_changed = true;
+ }
+
+ if (pgd_changed)
+ sync_global_pgds(start, end - 1);
+
+ flush_tlb_all();
+}
+
+void __ref vmemmap_free(struct page *memmap, unsigned long nr_pages)
+{
+ unsigned long start = (unsigned long)memmap;
+ unsigned long end = (unsigned long)(memmap + nr_pages);
+
+ remove_pagetable(start, end, false);
+}
+
+static void __meminit
+kernel_physical_mapping_remove(unsigned long start, unsigned long end)
+{
+ start = (unsigned long)__va(start);
+ end = (unsigned long)__va(end);
+
+ remove_pagetable(start, end, true);
+}
+
+#ifdef CONFIG_MEMORY_HOTREMOVE
+int __ref arch_remove_memory(u64 start, u64 size)
+{
+ unsigned long start_pfn = start >> PAGE_SHIFT;
+ unsigned long nr_pages = size >> PAGE_SHIFT;
+ struct zone *zone;
+ int ret;
+
+ zone = page_zone(pfn_to_page(start_pfn));
+ kernel_physical_mapping_remove(start, start + size);
+ ret = __remove_pages(zone, start_pfn, nr_pages);
+ WARN_ON_ONCE(ret);
+
+ return ret;
+}
+#endif
#endif /* CONFIG_MEMORY_HOTPLUG */
static struct kcore_list kcore_vsyscall;
+static void __init register_page_bootmem_info(void)
+{
+#ifdef CONFIG_NUMA
+ int i;
+
+ for_each_online_node(i)
+ register_page_bootmem_info_node(NODE_DATA(i));
+#endif
+}
+
void __init mem_init(void)
{
long codesize, reservedpages, datasize, initsize;
@@ -698,11 +1070,8 @@ void __init mem_init(void)
reservedpages = 0;
/* this will put all low memory onto the freelists */
-#ifdef CONFIG_NUMA
- totalram_pages = numa_free_all_bootmem();
-#else
+ register_page_bootmem_info();
totalram_pages = free_all_bootmem();
-#endif
absent_pages = absent_pages_in_range(0, max_pfn);
reservedpages = max_pfn - totalram_pages - absent_pages;
@@ -772,12 +1141,11 @@ void set_kernel_text_ro(void)
void mark_rodata_ro(void)
{
unsigned long start = PFN_ALIGN(_text);
- unsigned long rodata_start =
- ((unsigned long)__start_rodata + PAGE_SIZE - 1) & PAGE_MASK;
+ unsigned long rodata_start = PFN_ALIGN(__start_rodata);
unsigned long end = (unsigned long) &__end_rodata_hpage_align;
- unsigned long text_end = PAGE_ALIGN((unsigned long) &__stop___ex_table);
- unsigned long rodata_end = PAGE_ALIGN((unsigned long) &__end_rodata);
- unsigned long data_start = (unsigned long) &_sdata;
+ unsigned long text_end = PFN_ALIGN(&__stop___ex_table);
+ unsigned long rodata_end = PFN_ALIGN(&__end_rodata);
+ unsigned long all_end = PFN_ALIGN(&_end);
printk(KERN_INFO "Write protecting the kernel read-only data: %luk\n",
(end - start) >> 10);
@@ -786,10 +1154,10 @@ void mark_rodata_ro(void)
kernel_set_to_readonly = 1;
/*
- * The rodata section (but not the kernel text!) should also be
- * not-executable.
+ * The rodata/data/bss/brk section (but not the kernel text!)
+ * should also be not-executable.
*/
- set_memory_nx(rodata_start, (end - rodata_start) >> PAGE_SHIFT);
+ set_memory_nx(rodata_start, (all_end - rodata_start) >> PAGE_SHIFT);
rodata_test();
@@ -802,12 +1170,12 @@ void mark_rodata_ro(void)
#endif
free_init_pages("unused kernel memory",
- (unsigned long) page_address(virt_to_page(text_end)),
- (unsigned long)
- page_address(virt_to_page(rodata_start)));
+ (unsigned long) __va(__pa_symbol(text_end)),
+ (unsigned long) __va(__pa_symbol(rodata_start)));
+
free_init_pages("unused kernel memory",
- (unsigned long) page_address(virt_to_page(rodata_end)),
- (unsigned long) page_address(virt_to_page(data_start)));
+ (unsigned long) __va(__pa_symbol(rodata_end)),
+ (unsigned long) __va(__pa_symbol(_sdata)));
}
#endif
@@ -988,6 +1356,66 @@ vmemmap_populate(struct page *start_page, unsigned long size, int node)
return 0;
}
+#if defined(CONFIG_MEMORY_HOTPLUG_SPARSE) && defined(CONFIG_HAVE_BOOTMEM_INFO_NODE)
+void register_page_bootmem_memmap(unsigned long section_nr,
+ struct page *start_page, unsigned long size)
+{
+ unsigned long addr = (unsigned long)start_page;
+ unsigned long end = (unsigned long)(start_page + size);
+ unsigned long next;
+ pgd_t *pgd;
+ pud_t *pud;
+ pmd_t *pmd;
+ unsigned int nr_pages;
+ struct page *page;
+
+ for (; addr < end; addr = next) {
+ pte_t *pte = NULL;
+
+ pgd = pgd_offset_k(addr);
+ if (pgd_none(*pgd)) {
+ next = (addr + PAGE_SIZE) & PAGE_MASK;
+ continue;
+ }
+ get_page_bootmem(section_nr, pgd_page(*pgd), MIX_SECTION_INFO);
+
+ pud = pud_offset(pgd, addr);
+ if (pud_none(*pud)) {
+ next = (addr + PAGE_SIZE) & PAGE_MASK;
+ continue;
+ }
+ get_page_bootmem(section_nr, pud_page(*pud), MIX_SECTION_INFO);
+
+ if (!cpu_has_pse) {
+ next = (addr + PAGE_SIZE) & PAGE_MASK;
+ pmd = pmd_offset(pud, addr);
+ if (pmd_none(*pmd))
+ continue;
+ get_page_bootmem(section_nr, pmd_page(*pmd),
+ MIX_SECTION_INFO);
+
+ pte = pte_offset_kernel(pmd, addr);
+ if (pte_none(*pte))
+ continue;
+ get_page_bootmem(section_nr, pte_page(*pte),
+ SECTION_INFO);
+ } else {
+ next = pmd_addr_end(addr, end);
+
+ pmd = pmd_offset(pud, addr);
+ if (pmd_none(*pmd))
+ continue;
+
+ nr_pages = 1 << (get_order(PMD_SIZE));
+ page = pmd_page(*pmd);
+ while (nr_pages--)
+ get_page_bootmem(section_nr, page++,
+ SECTION_INFO);
+ }
+ }
+}
+#endif
+
void __meminit vmemmap_populate_print_last(void)
{
if (p_start) {
diff --git a/arch/x86/mm/mm_internal.h b/arch/x86/mm/mm_internal.h
new file mode 100644
index 000000000000..6b563a118891
--- /dev/null
+++ b/arch/x86/mm/mm_internal.h
@@ -0,0 +1,19 @@
+#ifndef __X86_MM_INTERNAL_H
+#define __X86_MM_INTERNAL_H
+
+void *alloc_low_pages(unsigned int num);
+static inline void *alloc_low_page(void)
+{
+ return alloc_low_pages(1);
+}
+
+void early_ioremap_page_table_range_init(void);
+
+unsigned long kernel_physical_mapping_init(unsigned long start,
+ unsigned long end,
+ unsigned long page_size_mask);
+void zone_sizes_init(void);
+
+extern int after_bootmem;
+
+#endif /* __X86_MM_INTERNAL_H */
diff --git a/arch/x86/mm/numa.c b/arch/x86/mm/numa.c
index 21d02f0d7a2c..ff3633c794c6 100644
--- a/arch/x86/mm/numa.c
+++ b/arch/x86/mm/numa.c
@@ -56,7 +56,7 @@ early_param("numa", numa_setup);
/*
* apicid, cpu, node mappings
*/
-s16 __apicid_to_node[MAX_LOCAL_APIC] __cpuinitdata = {
+s16 __apicid_to_node[MAX_LOCAL_APIC] = {
[0 ... MAX_LOCAL_APIC-1] = NUMA_NO_NODE
};
@@ -78,7 +78,7 @@ EXPORT_SYMBOL(node_to_cpumask_map);
DEFINE_EARLY_PER_CPU(int, x86_cpu_to_node_map, NUMA_NO_NODE);
EXPORT_EARLY_PER_CPU_SYMBOL(x86_cpu_to_node_map);
-void __cpuinit numa_set_node(int cpu, int node)
+void numa_set_node(int cpu, int node)
{
int *cpu_to_node_map = early_per_cpu_ptr(x86_cpu_to_node_map);
@@ -100,7 +100,7 @@ void __cpuinit numa_set_node(int cpu, int node)
set_cpu_numa_node(cpu, node);
}
-void __cpuinit numa_clear_node(int cpu)
+void numa_clear_node(int cpu)
{
numa_set_node(cpu, NUMA_NO_NODE);
}
@@ -192,7 +192,6 @@ int __init numa_add_memblk(int nid, u64 start, u64 end)
static void __init setup_node_data(int nid, u64 start, u64 end)
{
const size_t nd_size = roundup(sizeof(pg_data_t), PAGE_SIZE);
- bool remapped = false;
u64 nd_pa;
void *nd;
int tnid;
@@ -204,37 +203,27 @@ static void __init setup_node_data(int nid, u64 start, u64 end)
if (end && (end - start) < NODE_MIN_SIZE)
return;
- /* initialize remap allocator before aligning to ZONE_ALIGN */
- init_alloc_remap(nid, start, end);
-
start = roundup(start, ZONE_ALIGN);
printk(KERN_INFO "Initmem setup node %d [mem %#010Lx-%#010Lx]\n",
nid, start, end - 1);
/*
- * Allocate node data. Try remap allocator first, node-local
- * memory and then any node. Never allocate in DMA zone.
+ * Allocate node data. Try node-local memory and then any node.
+ * Never allocate in DMA zone.
*/
- nd = alloc_remap(nid, nd_size);
- if (nd) {
- nd_pa = __pa(nd);
- remapped = true;
- } else {
- nd_pa = memblock_alloc_nid(nd_size, SMP_CACHE_BYTES, nid);
- if (!nd_pa) {
- pr_err("Cannot find %zu bytes in node %d\n",
- nd_size, nid);
- return;
- }
- nd = __va(nd_pa);
+ nd_pa = memblock_alloc_try_nid(nd_size, SMP_CACHE_BYTES, nid);
+ if (!nd_pa) {
+ pr_err("Cannot find %zu bytes in any node\n", nd_size);
+ return;
}
+ nd = __va(nd_pa);
/* report and initialize */
- printk(KERN_INFO " NODE_DATA [mem %#010Lx-%#010Lx]%s\n",
- nd_pa, nd_pa + nd_size - 1, remapped ? " (remapped)" : "");
+ printk(KERN_INFO " NODE_DATA [mem %#010Lx-%#010Lx]\n",
+ nd_pa, nd_pa + nd_size - 1);
tnid = early_pfn_to_nid(nd_pa >> PAGE_SHIFT);
- if (!remapped && tnid != nid)
+ if (tnid != nid)
printk(KERN_INFO " NODE_DATA(%d) on node %d\n", nid, tnid);
node_data[nid] = nd;
@@ -570,10 +559,12 @@ static int __init numa_init(int (*init_func)(void))
for (i = 0; i < MAX_LOCAL_APIC; i++)
set_apicid_to_node(i, NUMA_NO_NODE);
- nodes_clear(numa_nodes_parsed);
+ /*
+ * Do not clear numa_nodes_parsed or zero numa_meminfo here, because
+ * SRAT was parsed earlier in early_parse_srat().
+ */
nodes_clear(node_possible_map);
nodes_clear(node_online_map);
- memset(&numa_meminfo, 0, sizeof(numa_meminfo));
WARN_ON(memblock_set_node(0, ULLONG_MAX, MAX_NUMNODES));
numa_reset_distance();
diff --git a/arch/x86/mm/numa_32.c b/arch/x86/mm/numa_32.c
index 534255a36b6b..73a6d7395bd3 100644
--- a/arch/x86/mm/numa_32.c
+++ b/arch/x86/mm/numa_32.c
@@ -73,167 +73,6 @@ unsigned long node_memmap_size_bytes(int nid, unsigned long start_pfn,
extern unsigned long highend_pfn, highstart_pfn;
-#define LARGE_PAGE_BYTES (PTRS_PER_PTE * PAGE_SIZE)
-
-static void *node_remap_start_vaddr[MAX_NUMNODES];
-void set_pmd_pfn(unsigned long vaddr, unsigned long pfn, pgprot_t flags);
-
-/*
- * Remap memory allocator
- */
-static unsigned long node_remap_start_pfn[MAX_NUMNODES];
-static void *node_remap_end_vaddr[MAX_NUMNODES];
-static void *node_remap_alloc_vaddr[MAX_NUMNODES];
-
-/**
- * alloc_remap - Allocate remapped memory
- * @nid: NUMA node to allocate memory from
- * @size: The size of allocation
- *
- * Allocate @size bytes from the remap area of NUMA node @nid. The
- * size of the remap area is predetermined by init_alloc_remap() and
- * only the callers considered there should call this function. For
- * more info, please read the comment on top of init_alloc_remap().
- *
- * The caller must be ready to handle allocation failure from this
- * function and fall back to regular memory allocator in such cases.
- *
- * CONTEXT:
- * Single CPU early boot context.
- *
- * RETURNS:
- * Pointer to the allocated memory on success, %NULL on failure.
- */
-void *alloc_remap(int nid, unsigned long size)
-{
- void *allocation = node_remap_alloc_vaddr[nid];
-
- size = ALIGN(size, L1_CACHE_BYTES);
-
- if (!allocation || (allocation + size) > node_remap_end_vaddr[nid])
- return NULL;
-
- node_remap_alloc_vaddr[nid] += size;
- memset(allocation, 0, size);
-
- return allocation;
-}
-
-#ifdef CONFIG_HIBERNATION
-/**
- * resume_map_numa_kva - add KVA mapping to the temporary page tables created
- * during resume from hibernation
- * @pgd_base - temporary resume page directory
- */
-void resume_map_numa_kva(pgd_t *pgd_base)
-{
- int node;
-
- for_each_online_node(node) {
- unsigned long start_va, start_pfn, nr_pages, pfn;
-
- start_va = (unsigned long)node_remap_start_vaddr[node];
- start_pfn = node_remap_start_pfn[node];
- nr_pages = (node_remap_end_vaddr[node] -
- node_remap_start_vaddr[node]) >> PAGE_SHIFT;
-
- printk(KERN_DEBUG "%s: node %d\n", __func__, node);
-
- for (pfn = 0; pfn < nr_pages; pfn += PTRS_PER_PTE) {
- unsigned long vaddr = start_va + (pfn << PAGE_SHIFT);
- pgd_t *pgd = pgd_base + pgd_index(vaddr);
- pud_t *pud = pud_offset(pgd, vaddr);
- pmd_t *pmd = pmd_offset(pud, vaddr);
-
- set_pmd(pmd, pfn_pmd(start_pfn + pfn,
- PAGE_KERNEL_LARGE_EXEC));
-
- printk(KERN_DEBUG "%s: %08lx -> pfn %08lx\n",
- __func__, vaddr, start_pfn + pfn);
- }
- }
-}
-#endif
-
-/**
- * init_alloc_remap - Initialize remap allocator for a NUMA node
- * @nid: NUMA node to initizlie remap allocator for
- *
- * NUMA nodes may end up without any lowmem. As allocating pgdat and
- * memmap on a different node with lowmem is inefficient, a special
- * remap allocator is implemented which can be used by alloc_remap().
- *
- * For each node, the amount of memory which will be necessary for
- * pgdat and memmap is calculated and two memory areas of the size are
- * allocated - one in the node and the other in lowmem; then, the area
- * in the node is remapped to the lowmem area.
- *
- * As pgdat and memmap must be allocated in lowmem anyway, this
- * doesn't waste lowmem address space; however, the actual lowmem
- * which gets remapped over is wasted. The amount shouldn't be
- * problematic on machines this feature will be used.
- *
- * Initialization failure isn't fatal. alloc_remap() is used
- * opportunistically and the callers will fall back to other memory
- * allocation mechanisms on failure.
- */
-void __init init_alloc_remap(int nid, u64 start, u64 end)
-{
- unsigned long start_pfn = start >> PAGE_SHIFT;
- unsigned long end_pfn = end >> PAGE_SHIFT;
- unsigned long size, pfn;
- u64 node_pa, remap_pa;
- void *remap_va;
-
- /*
- * The acpi/srat node info can show hot-add memroy zones where
- * memory could be added but not currently present.
- */
- printk(KERN_DEBUG "node %d pfn: [%lx - %lx]\n",
- nid, start_pfn, end_pfn);
-
- /* calculate the necessary space aligned to large page size */
- size = node_memmap_size_bytes(nid, start_pfn, end_pfn);
- size += ALIGN(sizeof(pg_data_t), PAGE_SIZE);
- size = ALIGN(size, LARGE_PAGE_BYTES);
-
- /* allocate node memory and the lowmem remap area */
- node_pa = memblock_find_in_range(start, end, size, LARGE_PAGE_BYTES);
- if (!node_pa) {
- pr_warning("remap_alloc: failed to allocate %lu bytes for node %d\n",
- size, nid);
- return;
- }
- memblock_reserve(node_pa, size);
-
- remap_pa = memblock_find_in_range(min_low_pfn << PAGE_SHIFT,
- max_low_pfn << PAGE_SHIFT,
- size, LARGE_PAGE_BYTES);
- if (!remap_pa) {
- pr_warning("remap_alloc: failed to allocate %lu bytes remap area for node %d\n",
- size, nid);
- memblock_free(node_pa, size);
- return;
- }
- memblock_reserve(remap_pa, size);
- remap_va = phys_to_virt(remap_pa);
-
- /* perform actual remap */
- for (pfn = 0; pfn < size >> PAGE_SHIFT; pfn += PTRS_PER_PTE)
- set_pmd_pfn((unsigned long)remap_va + (pfn << PAGE_SHIFT),
- (node_pa >> PAGE_SHIFT) + pfn,
- PAGE_KERNEL_LARGE);
-
- /* initialize remap allocator parameters */
- node_remap_start_pfn[nid] = node_pa >> PAGE_SHIFT;
- node_remap_start_vaddr[nid] = remap_va;
- node_remap_end_vaddr[nid] = remap_va + size;
- node_remap_alloc_vaddr[nid] = remap_va;
-
- printk(KERN_DEBUG "remap_alloc: node %d [%08llx-%08llx) -> [%p-%p)\n",
- nid, node_pa, node_pa + size, remap_va, remap_va + size);
-}
-
void __init initmem_init(void)
{
x86_numa_init();
diff --git a/arch/x86/mm/numa_64.c b/arch/x86/mm/numa_64.c
index 92e27119ee1a..9405ffc91502 100644
--- a/arch/x86/mm/numa_64.c
+++ b/arch/x86/mm/numa_64.c
@@ -10,16 +10,3 @@ void __init initmem_init(void)
{
x86_numa_init();
}
-
-unsigned long __init numa_free_all_bootmem(void)
-{
- unsigned long pages = 0;
- int i;
-
- for_each_online_node(i)
- pages += free_all_bootmem_node(NODE_DATA(i));
-
- pages += free_low_memory_core_early(MAX_NUMNODES);
-
- return pages;
-}
diff --git a/arch/x86/mm/numa_internal.h b/arch/x86/mm/numa_internal.h
index 7178c3afe05e..ad86ec91e640 100644
--- a/arch/x86/mm/numa_internal.h
+++ b/arch/x86/mm/numa_internal.h
@@ -21,12 +21,6 @@ void __init numa_reset_distance(void);
void __init x86_numa_init(void);
-#ifdef CONFIG_X86_64
-static inline void init_alloc_remap(int nid, u64 start, u64 end) { }
-#else
-void __init init_alloc_remap(int nid, u64 start, u64 end);
-#endif
-
#ifdef CONFIG_NUMA_EMU
void __init numa_emulation(struct numa_meminfo *numa_meminfo,
int numa_dist_cnt);
diff --git a/arch/x86/mm/pageattr.c b/arch/x86/mm/pageattr.c
index 2713be4bca41..091934e1d0d9 100644
--- a/arch/x86/mm/pageattr.c
+++ b/arch/x86/mm/pageattr.c
@@ -94,12 +94,12 @@ static inline void split_page_count(int level) { }
static inline unsigned long highmap_start_pfn(void)
{
- return __pa(_text) >> PAGE_SHIFT;
+ return __pa_symbol(_text) >> PAGE_SHIFT;
}
static inline unsigned long highmap_end_pfn(void)
{
- return __pa(roundup(_brk_end, PMD_SIZE)) >> PAGE_SHIFT;
+ return __pa_symbol(roundup(_brk_end, PMD_SIZE)) >> PAGE_SHIFT;
}
#endif
@@ -276,8 +276,8 @@ static inline pgprot_t static_protections(pgprot_t prot, unsigned long address,
* The .rodata section needs to be read-only. Using the pfn
* catches all aliases.
*/
- if (within(pfn, __pa((unsigned long)__start_rodata) >> PAGE_SHIFT,
- __pa((unsigned long)__end_rodata) >> PAGE_SHIFT))
+ if (within(pfn, __pa_symbol(__start_rodata) >> PAGE_SHIFT,
+ __pa_symbol(__end_rodata) >> PAGE_SHIFT))
pgprot_val(forbidden) |= _PAGE_RW;
#if defined(CONFIG_X86_64) && defined(CONFIG_DEBUG_RODATA)
@@ -364,6 +364,37 @@ pte_t *lookup_address(unsigned long address, unsigned int *level)
EXPORT_SYMBOL_GPL(lookup_address);
/*
+ * This is necessary because __pa() does not work on some
+ * kinds of memory, like vmalloc() or the alloc_remap()
+ * areas on 32-bit NUMA systems. The percpu areas can
+ * end up in this kind of memory, for instance.
+ *
+ * This could be optimized, but it is only intended to be
+ * used at inititalization time, and keeping it
+ * unoptimized should increase the testing coverage for
+ * the more obscure platforms.
+ */
+phys_addr_t slow_virt_to_phys(void *__virt_addr)
+{
+ unsigned long virt_addr = (unsigned long)__virt_addr;
+ phys_addr_t phys_addr;
+ unsigned long offset;
+ enum pg_level level;
+ unsigned long psize;
+ unsigned long pmask;
+ pte_t *pte;
+
+ pte = lookup_address(virt_addr, &level);
+ BUG_ON(!pte);
+ psize = page_level_size(level);
+ pmask = page_level_mask(level);
+ offset = virt_addr & ~pmask;
+ phys_addr = pte_pfn(*pte) << PAGE_SHIFT;
+ return (phys_addr | offset);
+}
+EXPORT_SYMBOL_GPL(slow_virt_to_phys);
+
+/*
* Set the new pmd in all the pgds we know about:
*/
static void __set_pmd_pte(pte_t *kpte, unsigned long address, pte_t pte)
@@ -396,7 +427,7 @@ try_preserve_large_page(pte_t *kpte, unsigned long address,
pte_t new_pte, old_pte, *tmp;
pgprot_t old_prot, new_prot, req_prot;
int i, do_split = 1;
- unsigned int level;
+ enum pg_level level;
if (cpa->force_split)
return 1;
@@ -412,15 +443,12 @@ try_preserve_large_page(pte_t *kpte, unsigned long address,
switch (level) {
case PG_LEVEL_2M:
- psize = PMD_PAGE_SIZE;
- pmask = PMD_PAGE_MASK;
- break;
#ifdef CONFIG_X86_64
case PG_LEVEL_1G:
- psize = PUD_PAGE_SIZE;
- pmask = PUD_PAGE_MASK;
- break;
#endif
+ psize = page_level_size(level);
+ pmask = page_level_mask(level);
+ break;
default:
do_split = -EINVAL;
goto out_unlock;
@@ -514,21 +542,13 @@ out_unlock:
return do_split;
}
-static int split_large_page(pte_t *kpte, unsigned long address)
+int __split_large_page(pte_t *kpte, unsigned long address, pte_t *pbase)
{
unsigned long pfn, pfninc = 1;
unsigned int i, level;
- pte_t *pbase, *tmp;
+ pte_t *tmp;
pgprot_t ref_prot;
- struct page *base;
-
- if (!debug_pagealloc)
- spin_unlock(&cpa_lock);
- base = alloc_pages(GFP_KERNEL | __GFP_NOTRACK, 0);
- if (!debug_pagealloc)
- spin_lock(&cpa_lock);
- if (!base)
- return -ENOMEM;
+ struct page *base = virt_to_page(pbase);
spin_lock(&pgd_lock);
/*
@@ -536,10 +556,11 @@ static int split_large_page(pte_t *kpte, unsigned long address)
* up for us already:
*/
tmp = lookup_address(address, &level);
- if (tmp != kpte)
- goto out_unlock;
+ if (tmp != kpte) {
+ spin_unlock(&pgd_lock);
+ return 1;
+ }
- pbase = (pte_t *)page_address(base);
paravirt_alloc_pte(&init_mm, page_to_pfn(base));
ref_prot = pte_pgprot(pte_clrhuge(*kpte));
/*
@@ -583,16 +604,10 @@ static int split_large_page(pte_t *kpte, unsigned long address)
for (i = 0; i < PTRS_PER_PTE; i++, pfn += pfninc)
set_pte(&pbase[i], pfn_pte(pfn, canon_pgprot(ref_prot)));
- if (address >= (unsigned long)__va(0) &&
- address < (unsigned long)__va(max_low_pfn_mapped << PAGE_SHIFT))
+ if (pfn_range_is_mapped(PFN_DOWN(__pa(address)),
+ PFN_DOWN(__pa(address)) + 1))
split_page_count(level);
-#ifdef CONFIG_X86_64
- if (address >= (unsigned long)__va(1UL<<32) &&
- address < (unsigned long)__va(max_pfn_mapped << PAGE_SHIFT))
- split_page_count(level);
-#endif
-
/*
* Install the new, split up pagetable.
*
@@ -611,17 +626,27 @@ static int split_large_page(pte_t *kpte, unsigned long address)
* going on.
*/
__flush_tlb_all();
+ spin_unlock(&pgd_lock);
+
+ return 0;
+}
- base = NULL;
+static int split_large_page(pte_t *kpte, unsigned long address)
+{
+ pte_t *pbase;
+ struct page *base;
-out_unlock:
- /*
- * If we dropped out via the lookup_address check under
- * pgd_lock then stick the page back into the pool:
- */
- if (base)
+ if (!debug_pagealloc)
+ spin_unlock(&cpa_lock);
+ base = alloc_pages(GFP_KERNEL | __GFP_NOTRACK, 0);
+ if (!debug_pagealloc)
+ spin_lock(&cpa_lock);
+ if (!base)
+ return -ENOMEM;
+
+ pbase = (pte_t *)page_address(base);
+ if (__split_large_page(kpte, address, pbase))
__free_page(base);
- spin_unlock(&pgd_lock);
return 0;
}
@@ -773,13 +798,9 @@ static int cpa_process_alias(struct cpa_data *cpa)
unsigned long vaddr;
int ret;
- if (cpa->pfn >= max_pfn_mapped)
+ if (!pfn_range_is_mapped(cpa->pfn, cpa->pfn + 1))
return 0;
-#ifdef CONFIG_X86_64
- if (cpa->pfn >= max_low_pfn_mapped && cpa->pfn < (1UL<<(32-PAGE_SHIFT)))
- return 0;
-#endif
/*
* No need to redo, when the primary call touched the direct
* mapping already:
diff --git a/arch/x86/mm/pat.c b/arch/x86/mm/pat.c
index 0eb572eda406..2610bd93c896 100644
--- a/arch/x86/mm/pat.c
+++ b/arch/x86/mm/pat.c
@@ -560,10 +560,10 @@ int kernel_map_sync_memtype(u64 base, unsigned long size, unsigned long flags)
{
unsigned long id_sz;
- if (base >= __pa(high_memory))
+ if (base > __pa(high_memory-1))
return 0;
- id_sz = (__pa(high_memory) < base + size) ?
+ id_sz = (__pa(high_memory-1) <= base + size) ?
__pa(high_memory) - base :
size;
diff --git a/arch/x86/mm/pgtable.c b/arch/x86/mm/pgtable.c
index e27fbf887f3b..193350b51f90 100644
--- a/arch/x86/mm/pgtable.c
+++ b/arch/x86/mm/pgtable.c
@@ -334,7 +334,12 @@ int pmdp_set_access_flags(struct vm_area_struct *vma,
if (changed && dirty) {
*pmdp = entry;
pmd_update_defer(vma->vm_mm, address, pmdp);
- flush_tlb_range(vma, address, address + HPAGE_PMD_SIZE);
+ /*
+ * We had a write-protection fault here and changed the pmd
+ * to to more permissive. No need to flush the TLB for that,
+ * #PF is architecturally guaranteed to do that and in the
+ * worst-case we'll generate a spurious fault.
+ */
}
return changed;
diff --git a/arch/x86/mm/physaddr.c b/arch/x86/mm/physaddr.c
index d2e2735327b4..e666cbbb9261 100644
--- a/arch/x86/mm/physaddr.c
+++ b/arch/x86/mm/physaddr.c
@@ -1,3 +1,4 @@
+#include <linux/bootmem.h>
#include <linux/mmdebug.h>
#include <linux/module.h>
#include <linux/mm.h>
@@ -8,33 +9,54 @@
#ifdef CONFIG_X86_64
+#ifdef CONFIG_DEBUG_VIRTUAL
unsigned long __phys_addr(unsigned long x)
{
- if (x >= __START_KERNEL_map) {
- x -= __START_KERNEL_map;
- VIRTUAL_BUG_ON(x >= KERNEL_IMAGE_SIZE);
- x += phys_base;
+ unsigned long y = x - __START_KERNEL_map;
+
+ /* use the carry flag to determine if x was < __START_KERNEL_map */
+ if (unlikely(x > y)) {
+ x = y + phys_base;
+
+ VIRTUAL_BUG_ON(y >= KERNEL_IMAGE_SIZE);
} else {
- VIRTUAL_BUG_ON(x < PAGE_OFFSET);
- x -= PAGE_OFFSET;
- VIRTUAL_BUG_ON(!phys_addr_valid(x));
+ x = y + (__START_KERNEL_map - PAGE_OFFSET);
+
+ /* carry flag will be set if starting x was >= PAGE_OFFSET */
+ VIRTUAL_BUG_ON((x > y) || !phys_addr_valid(x));
}
+
return x;
}
EXPORT_SYMBOL(__phys_addr);
+unsigned long __phys_addr_symbol(unsigned long x)
+{
+ unsigned long y = x - __START_KERNEL_map;
+
+ /* only check upper bounds since lower bounds will trigger carry */
+ VIRTUAL_BUG_ON(y >= KERNEL_IMAGE_SIZE);
+
+ return y + phys_base;
+}
+EXPORT_SYMBOL(__phys_addr_symbol);
+#endif
+
bool __virt_addr_valid(unsigned long x)
{
- if (x >= __START_KERNEL_map) {
- x -= __START_KERNEL_map;
- if (x >= KERNEL_IMAGE_SIZE)
+ unsigned long y = x - __START_KERNEL_map;
+
+ /* use the carry flag to determine if x was < __START_KERNEL_map */
+ if (unlikely(x > y)) {
+ x = y + phys_base;
+
+ if (y >= KERNEL_IMAGE_SIZE)
return false;
- x += phys_base;
} else {
- if (x < PAGE_OFFSET)
- return false;
- x -= PAGE_OFFSET;
- if (!phys_addr_valid(x))
+ x = y + (__START_KERNEL_map - PAGE_OFFSET);
+
+ /* carry flag will be set if starting x was >= PAGE_OFFSET */
+ if ((x > y) || !phys_addr_valid(x))
return false;
}
@@ -47,10 +69,16 @@ EXPORT_SYMBOL(__virt_addr_valid);
#ifdef CONFIG_DEBUG_VIRTUAL
unsigned long __phys_addr(unsigned long x)
{
+ unsigned long phys_addr = x - PAGE_OFFSET;
/* VMALLOC_* aren't constants */
VIRTUAL_BUG_ON(x < PAGE_OFFSET);
VIRTUAL_BUG_ON(__vmalloc_start_set && is_vmalloc_addr((void *) x));
- return x - PAGE_OFFSET;
+ /* max_low_pfn is set early, but not _that_ early */
+ if (max_low_pfn) {
+ VIRTUAL_BUG_ON((phys_addr >> PAGE_SHIFT) > max_low_pfn);
+ BUG_ON(slow_virt_to_phys((void *)x) != phys_addr);
+ }
+ return phys_addr;
}
EXPORT_SYMBOL(__phys_addr);
#endif
diff --git a/arch/x86/mm/srat.c b/arch/x86/mm/srat.c
index cdd0da9dd530..79836d01f789 100644
--- a/arch/x86/mm/srat.c
+++ b/arch/x86/mm/srat.c
@@ -141,11 +141,126 @@ static inline int save_add_info(void) {return 1;}
static inline int save_add_info(void) {return 0;}
#endif
+#ifdef CONFIG_HAVE_MEMBLOCK_NODE_MAP
+static void __init
+handle_movablemem(int node, u64 start, u64 end, u32 hotpluggable)
+{
+ int overlap, i;
+ unsigned long start_pfn, end_pfn;
+
+ start_pfn = PFN_DOWN(start);
+ end_pfn = PFN_UP(end);
+
+ /*
+ * For movablemem_map=acpi:
+ *
+ * SRAT: |_____| |_____| |_________| |_________| ......
+ * node id: 0 1 1 2
+ * hotpluggable: n y y n
+ * movablemem_map: |_____| |_________|
+ *
+ * Using movablemem_map, we can prevent memblock from allocating memory
+ * on ZONE_MOVABLE at boot time.
+ *
+ * Before parsing SRAT, memblock has already reserve some memory ranges
+ * for other purposes, such as for kernel image. We cannot prevent
+ * kernel from using these memory, so we need to exclude these memory
+ * even if it is hotpluggable.
+ * Furthermore, to ensure the kernel has enough memory to boot, we make
+ * all the memory on the node which the kernel resides in
+ * un-hotpluggable.
+ */
+ if (hotpluggable && movablemem_map.acpi) {
+ /* Exclude ranges reserved by memblock. */
+ struct memblock_type *rgn = &memblock.reserved;
+
+ for (i = 0; i < rgn->cnt; i++) {
+ if (end <= rgn->regions[i].base ||
+ start >= rgn->regions[i].base +
+ rgn->regions[i].size)
+ continue;
+
+ /*
+ * If the memory range overlaps the memory reserved by
+ * memblock, then the kernel resides in this node.
+ */
+ node_set(node, movablemem_map.numa_nodes_kernel);
+
+ goto out;
+ }
+
+ /*
+ * If the kernel resides in this node, then the whole node
+ * should not be hotpluggable.
+ */
+ if (node_isset(node, movablemem_map.numa_nodes_kernel))
+ goto out;
+
+ insert_movablemem_map(start_pfn, end_pfn);
+
+ /*
+ * numa_nodes_hotplug nodemask represents which nodes are put
+ * into movablemem_map.map[].
+ */
+ node_set(node, movablemem_map.numa_nodes_hotplug);
+ goto out;
+ }
+
+ /*
+ * For movablemem_map=nn[KMG]@ss[KMG]:
+ *
+ * SRAT: |_____| |_____| |_________| |_________| ......
+ * node id: 0 1 1 2
+ * user specified: |__| |___|
+ * movablemem_map: |___| |_________| |______| ......
+ *
+ * Using movablemem_map, we can prevent memblock from allocating memory
+ * on ZONE_MOVABLE at boot time.
+ *
+ * NOTE: In this case, SRAT info will be ingored.
+ */
+ overlap = movablemem_map_overlap(start_pfn, end_pfn);
+ if (overlap >= 0) {
+ /*
+ * If part of this range is in movablemem_map, we need to
+ * add the range after it to extend the range to the end
+ * of the node, because from the min address specified to
+ * the end of the node will be ZONE_MOVABLE.
+ */
+ start_pfn = max(start_pfn,
+ movablemem_map.map[overlap].start_pfn);
+ insert_movablemem_map(start_pfn, end_pfn);
+
+ /*
+ * Set the nodemask, so that if the address range on one node
+ * is not continuse, we can add the subsequent ranges on the
+ * same node into movablemem_map.
+ */
+ node_set(node, movablemem_map.numa_nodes_hotplug);
+ } else {
+ if (node_isset(node, movablemem_map.numa_nodes_hotplug))
+ /*
+ * Insert the range if we already have movable ranges
+ * on the same node.
+ */
+ insert_movablemem_map(start_pfn, end_pfn);
+ }
+out:
+ return;
+}
+#else /* CONFIG_HAVE_MEMBLOCK_NODE_MAP */
+static inline void
+handle_movablemem(int node, u64 start, u64 end, u32 hotpluggable)
+{
+}
+#endif /* CONFIG_HAVE_MEMBLOCK_NODE_MAP */
+
/* Callback for parsing of the Proximity Domain <-> Memory Area mappings */
int __init
acpi_numa_memory_affinity_init(struct acpi_srat_mem_affinity *ma)
{
u64 start, end;
+ u32 hotpluggable;
int node, pxm;
if (srat_disabled())
@@ -154,7 +269,8 @@ acpi_numa_memory_affinity_init(struct acpi_srat_mem_affinity *ma)
goto out_err_bad_srat;
if ((ma->flags & ACPI_SRAT_MEM_ENABLED) == 0)
goto out_err;
- if ((ma->flags & ACPI_SRAT_MEM_HOT_PLUGGABLE) && !save_add_info())
+ hotpluggable = ma->flags & ACPI_SRAT_MEM_HOT_PLUGGABLE;
+ if (hotpluggable && !save_add_info())
goto out_err;
start = ma->base_address;
@@ -174,9 +290,12 @@ acpi_numa_memory_affinity_init(struct acpi_srat_mem_affinity *ma)
node_set(node, numa_nodes_parsed);
- printk(KERN_INFO "SRAT: Node %u PXM %u [mem %#010Lx-%#010Lx]\n",
+ printk(KERN_INFO "SRAT: Node %u PXM %u [mem %#010Lx-%#010Lx] %s\n",
node, pxm,
- (unsigned long long) start, (unsigned long long) end - 1);
+ (unsigned long long) start, (unsigned long long) end - 1,
+ hotpluggable ? "Hot Pluggable": "");
+
+ handle_movablemem(node, start, end, hotpluggable);
return 0;
out_err_bad_srat:
diff --git a/arch/x86/net/bpf_jit_comp.c b/arch/x86/net/bpf_jit_comp.c
index d11a47099d33..3cbe45381bbb 100644
--- a/arch/x86/net/bpf_jit_comp.c
+++ b/arch/x86/net/bpf_jit_comp.c
@@ -1,6 +1,6 @@
/* bpf_jit_comp.c : BPF JIT compiler
*
- * Copyright (C) 2011 Eric Dumazet (eric.dumazet@gmail.com)
+ * Copyright (C) 2011-2013 Eric Dumazet (eric.dumazet@gmail.com)
*
* This program is free software; you can redistribute it and/or
* modify it under the terms of the GNU General Public License
@@ -124,6 +124,26 @@ static inline void bpf_flush_icache(void *start, void *end)
#define CHOOSE_LOAD_FUNC(K, func) \
((int)K < 0 ? ((int)K >= SKF_LL_OFF ? func##_negative_offset : func) : func##_positive_offset)
+/* Helper to find the offset of pkt_type in sk_buff
+ * We want to make sure its still a 3bit field starting at a byte boundary.
+ */
+#define PKT_TYPE_MAX 7
+static int pkt_type_offset(void)
+{
+ struct sk_buff skb_probe = {
+ .pkt_type = ~0,
+ };
+ char *ct = (char *)&skb_probe;
+ unsigned int off;
+
+ for (off = 0; off < sizeof(struct sk_buff); off++) {
+ if (ct[off] == PKT_TYPE_MAX)
+ return off;
+ }
+ pr_err_once("Please fix pkt_type_offset(), as pkt_type couldn't be found\n");
+ return -1;
+}
+
void bpf_jit_compile(struct sk_filter *fp)
{
u8 temp[64];
@@ -216,6 +236,7 @@ void bpf_jit_compile(struct sk_filter *fp)
case BPF_S_ANC_VLAN_TAG:
case BPF_S_ANC_VLAN_TAG_PRESENT:
case BPF_S_ANC_QUEUE:
+ case BPF_S_ANC_PKTTYPE:
case BPF_S_LD_W_ABS:
case BPF_S_LD_H_ABS:
case BPF_S_LD_B_ABS:
@@ -536,6 +557,23 @@ void bpf_jit_compile(struct sk_filter *fp)
EMIT3(0x83, 0xe0, 0x01); /* and $0x1,%eax */
}
break;
+ case BPF_S_ANC_PKTTYPE:
+ {
+ int off = pkt_type_offset();
+
+ if (off < 0)
+ goto out;
+ if (is_imm8(off)) {
+ /* movzbl off8(%rdi),%eax */
+ EMIT4(0x0f, 0xb6, 0x47, off);
+ } else {
+ /* movbl off32(%rdi),%eax */
+ EMIT3(0x0f, 0xb6, 0x87);
+ EMIT(off, 4);
+ }
+ EMIT3(0x83, 0xe0, PKT_TYPE_MAX); /* and $0x7,%eax */
+ break;
+ }
case BPF_S_LD_W_ABS:
func = CHOOSE_LOAD_FUNC(K, sk_load_word);
common_load: seen |= SEEN_DATAREF;
diff --git a/arch/x86/pci/acpi.c b/arch/x86/pci/acpi.c
index 53ea60458e01..3e724256dbee 100644
--- a/arch/x86/pci/acpi.c
+++ b/arch/x86/pci/acpi.c
@@ -521,6 +521,7 @@ struct pci_bus *pci_acpi_scan_root(struct acpi_pci_root *root)
sd = &info->sd;
sd->domain = domain;
sd->node = node;
+ sd->acpi = device->handle;
/*
* Maybe the desired pci bus has been already scanned. In such case
* it is unnecessary to scan the pci bus with the given domain,busnum.
@@ -592,6 +593,14 @@ struct pci_bus *pci_acpi_scan_root(struct acpi_pci_root *root)
return bus;
}
+int pcibios_root_bridge_prepare(struct pci_host_bridge *bridge)
+{
+ struct pci_sysdata *sd = bridge->bus->sysdata;
+
+ ACPI_HANDLE_SET(&bridge->dev, sd->acpi);
+ return 0;
+}
+
int __init pci_acpi_init(void)
{
struct pci_dev *dev = NULL;
diff --git a/arch/x86/pci/common.c b/arch/x86/pci/common.c
index ccd0ab3ab899..901177d75ff5 100644
--- a/arch/x86/pci/common.c
+++ b/arch/x86/pci/common.c
@@ -34,7 +34,6 @@ int noioapicreroute = 1;
#endif
int pcibios_last_bus = -1;
unsigned long pirq_table_addr;
-struct pci_bus *pci_root_bus;
const struct pci_raw_ops *__read_mostly raw_pci_ops;
const struct pci_raw_ops *__read_mostly raw_pci_ext_ops;
diff --git a/arch/x86/pci/i386.c b/arch/x86/pci/i386.c
index dd8ca6f7223b..94919e307f8e 100644
--- a/arch/x86/pci/i386.c
+++ b/arch/x86/pci/i386.c
@@ -51,6 +51,7 @@ struct pcibios_fwaddrmap {
static LIST_HEAD(pcibios_fwaddrmappings);
static DEFINE_SPINLOCK(pcibios_fwaddrmap_lock);
+static bool pcibios_fw_addr_done;
/* Must be called with 'pcibios_fwaddrmap_lock' lock held. */
static struct pcibios_fwaddrmap *pcibios_fwaddrmap_lookup(struct pci_dev *dev)
@@ -72,6 +73,9 @@ pcibios_save_fw_addr(struct pci_dev *dev, int idx, resource_size_t fw_addr)
unsigned long flags;
struct pcibios_fwaddrmap *map;
+ if (pcibios_fw_addr_done)
+ return;
+
spin_lock_irqsave(&pcibios_fwaddrmap_lock, flags);
map = pcibios_fwaddrmap_lookup(dev);
if (!map) {
@@ -97,6 +101,9 @@ resource_size_t pcibios_retrieve_fw_addr(struct pci_dev *dev, int idx)
struct pcibios_fwaddrmap *map;
resource_size_t fw_addr = 0;
+ if (pcibios_fw_addr_done)
+ return 0;
+
spin_lock_irqsave(&pcibios_fwaddrmap_lock, flags);
map = pcibios_fwaddrmap_lookup(dev);
if (map)
@@ -106,7 +113,7 @@ resource_size_t pcibios_retrieve_fw_addr(struct pci_dev *dev, int idx)
return fw_addr;
}
-static void pcibios_fw_addr_list_del(void)
+static void __init pcibios_fw_addr_list_del(void)
{
unsigned long flags;
struct pcibios_fwaddrmap *entry, *next;
@@ -118,6 +125,7 @@ static void pcibios_fw_addr_list_del(void)
kfree(entry);
}
spin_unlock_irqrestore(&pcibios_fwaddrmap_lock, flags);
+ pcibios_fw_addr_done = true;
}
static int
@@ -193,46 +201,46 @@ EXPORT_SYMBOL(pcibios_align_resource);
* as well.
*/
-static void __init pcibios_allocate_bus_resources(struct list_head *bus_list)
+static void pcibios_allocate_bridge_resources(struct pci_dev *dev)
{
- struct pci_bus *bus;
- struct pci_dev *dev;
int idx;
struct resource *r;
- /* Depth-First Search on bus tree */
- list_for_each_entry(bus, bus_list, node) {
- if ((dev = bus->self)) {
- for (idx = PCI_BRIDGE_RESOURCES;
- idx < PCI_NUM_RESOURCES; idx++) {
- r = &dev->resource[idx];
- if (!r->flags)
- continue;
- if (!r->start ||
- pci_claim_resource(dev, idx) < 0) {
- /*
- * Something is wrong with the region.
- * Invalidate the resource to prevent
- * child resource allocations in this
- * range.
- */
- r->start = r->end = 0;
- r->flags = 0;
- }
- }
+ for (idx = PCI_BRIDGE_RESOURCES; idx < PCI_NUM_RESOURCES; idx++) {
+ r = &dev->resource[idx];
+ if (!r->flags)
+ continue;
+ if (!r->start || pci_claim_resource(dev, idx) < 0) {
+ /*
+ * Something is wrong with the region.
+ * Invalidate the resource to prevent
+ * child resource allocations in this
+ * range.
+ */
+ r->start = r->end = 0;
+ r->flags = 0;
}
- pcibios_allocate_bus_resources(&bus->children);
}
}
+static void pcibios_allocate_bus_resources(struct pci_bus *bus)
+{
+ struct pci_bus *child;
+
+ /* Depth-First Search on bus tree */
+ if (bus->self)
+ pcibios_allocate_bridge_resources(bus->self);
+ list_for_each_entry(child, &bus->children, node)
+ pcibios_allocate_bus_resources(child);
+}
+
struct pci_check_idx_range {
int start;
int end;
};
-static void __init pcibios_allocate_resources(int pass)
+static void pcibios_allocate_dev_resources(struct pci_dev *dev, int pass)
{
- struct pci_dev *dev = NULL;
int idx, disabled, i;
u16 command;
struct resource *r;
@@ -244,14 +252,13 @@ static void __init pcibios_allocate_resources(int pass)
#endif
};
- for_each_pci_dev(dev) {
- pci_read_config_word(dev, PCI_COMMAND, &command);
- for (i = 0; i < ARRAY_SIZE(idx_range); i++)
+ pci_read_config_word(dev, PCI_COMMAND, &command);
+ for (i = 0; i < ARRAY_SIZE(idx_range); i++)
for (idx = idx_range[i].start; idx <= idx_range[i].end; idx++) {
r = &dev->resource[idx];
- if (r->parent) /* Already allocated */
+ if (r->parent) /* Already allocated */
continue;
- if (!r->start) /* Address not assigned at all */
+ if (!r->start) /* Address not assigned at all */
continue;
if (r->flags & IORESOURCE_IO)
disabled = !(command & PCI_COMMAND_IO);
@@ -270,44 +277,74 @@ static void __init pcibios_allocate_resources(int pass)
}
}
}
- if (!pass) {
- r = &dev->resource[PCI_ROM_RESOURCE];
- if (r->flags & IORESOURCE_ROM_ENABLE) {
- /* Turn the ROM off, leave the resource region,
- * but keep it unregistered. */
- u32 reg;
- dev_dbg(&dev->dev, "disabling ROM %pR\n", r);
- r->flags &= ~IORESOURCE_ROM_ENABLE;
- pci_read_config_dword(dev,
- dev->rom_base_reg, &reg);
- pci_write_config_dword(dev, dev->rom_base_reg,
+ if (!pass) {
+ r = &dev->resource[PCI_ROM_RESOURCE];
+ if (r->flags & IORESOURCE_ROM_ENABLE) {
+ /* Turn the ROM off, leave the resource region,
+ * but keep it unregistered. */
+ u32 reg;
+ dev_dbg(&dev->dev, "disabling ROM %pR\n", r);
+ r->flags &= ~IORESOURCE_ROM_ENABLE;
+ pci_read_config_dword(dev, dev->rom_base_reg, &reg);
+ pci_write_config_dword(dev, dev->rom_base_reg,
reg & ~PCI_ROM_ADDRESS_ENABLE);
- }
}
}
}
-static int __init pcibios_assign_resources(void)
+static void pcibios_allocate_resources(struct pci_bus *bus, int pass)
+{
+ struct pci_dev *dev;
+ struct pci_bus *child;
+
+ list_for_each_entry(dev, &bus->devices, bus_list) {
+ pcibios_allocate_dev_resources(dev, pass);
+
+ child = dev->subordinate;
+ if (child)
+ pcibios_allocate_resources(child, pass);
+ }
+}
+
+static void pcibios_allocate_dev_rom_resource(struct pci_dev *dev)
{
- struct pci_dev *dev = NULL;
struct resource *r;
- if (!(pci_probe & PCI_ASSIGN_ROMS)) {
- /*
- * Try to use BIOS settings for ROMs, otherwise let
- * pci_assign_unassigned_resources() allocate the new
- * addresses.
- */
- for_each_pci_dev(dev) {
- r = &dev->resource[PCI_ROM_RESOURCE];
- if (!r->flags || !r->start)
- continue;
- if (pci_claim_resource(dev, PCI_ROM_RESOURCE) < 0) {
- r->end -= r->start;
- r->start = 0;
- }
- }
+ /*
+ * Try to use BIOS settings for ROMs, otherwise let
+ * pci_assign_unassigned_resources() allocate the new
+ * addresses.
+ */
+ r = &dev->resource[PCI_ROM_RESOURCE];
+ if (!r->flags || !r->start)
+ return;
+
+ if (pci_claim_resource(dev, PCI_ROM_RESOURCE) < 0) {
+ r->end -= r->start;
+ r->start = 0;
}
+}
+static void pcibios_allocate_rom_resources(struct pci_bus *bus)
+{
+ struct pci_dev *dev;
+ struct pci_bus *child;
+
+ list_for_each_entry(dev, &bus->devices, bus_list) {
+ pcibios_allocate_dev_rom_resource(dev);
+
+ child = dev->subordinate;
+ if (child)
+ pcibios_allocate_rom_resources(child);
+ }
+}
+
+static int __init pcibios_assign_resources(void)
+{
+ struct pci_bus *bus;
+
+ if (!(pci_probe & PCI_ASSIGN_ROMS))
+ list_for_each_entry(bus, &pci_root_buses, node)
+ pcibios_allocate_rom_resources(bus);
pci_assign_unassigned_resources();
pcibios_fw_addr_list_del();
@@ -315,12 +352,32 @@ static int __init pcibios_assign_resources(void)
return 0;
}
+void pcibios_resource_survey_bus(struct pci_bus *bus)
+{
+ dev_printk(KERN_DEBUG, &bus->dev, "Allocating resources\n");
+
+ pcibios_allocate_bus_resources(bus);
+
+ pcibios_allocate_resources(bus, 0);
+ pcibios_allocate_resources(bus, 1);
+
+ if (!(pci_probe & PCI_ASSIGN_ROMS))
+ pcibios_allocate_rom_resources(bus);
+}
+
void __init pcibios_resource_survey(void)
{
+ struct pci_bus *bus;
+
DBG("PCI: Allocating resources\n");
- pcibios_allocate_bus_resources(&pci_root_buses);
- pcibios_allocate_resources(0);
- pcibios_allocate_resources(1);
+
+ list_for_each_entry(bus, &pci_root_buses, node)
+ pcibios_allocate_bus_resources(bus);
+
+ list_for_each_entry(bus, &pci_root_buses, node)
+ pcibios_allocate_resources(bus, 0);
+ list_for_each_entry(bus, &pci_root_buses, node)
+ pcibios_allocate_resources(bus, 1);
e820_reserve_resources_late();
/*
diff --git a/arch/x86/pci/legacy.c b/arch/x86/pci/legacy.c
index 4a2ab9cb3659..4db96fb1c232 100644
--- a/arch/x86/pci/legacy.c
+++ b/arch/x86/pci/legacy.c
@@ -30,7 +30,7 @@ int __init pci_legacy_init(void)
}
printk("PCI: Probing PCI hardware\n");
- pci_root_bus = pcibios_scan_root(0);
+ pcibios_scan_root(0);
return 0;
}
diff --git a/arch/x86/pci/numaq_32.c b/arch/x86/pci/numaq_32.c
index b96b14c250b6..72c229f9ebcf 100644
--- a/arch/x86/pci/numaq_32.c
+++ b/arch/x86/pci/numaq_32.c
@@ -152,7 +152,7 @@ int __init pci_numaq_init(void)
raw_pci_ops = &pci_direct_conf1_mq;
- pci_root_bus = pcibios_scan_root(0);
+ pcibios_scan_root(0);
if (num_online_nodes() > 1)
for_each_online_node(quad) {
if (quad == 0)
diff --git a/arch/x86/platform/efi/efi.c b/arch/x86/platform/efi/efi.c
index e2cd38ffc406..2f81db40d7ca 100644
--- a/arch/x86/platform/efi/efi.c
+++ b/arch/x86/platform/efi/efi.c
@@ -417,8 +417,8 @@ void __init efi_reserve_boot_services(void)
* - Not within any part of the kernel
* - Not the bios reserved area
*/
- if ((start+size >= virt_to_phys(_text)
- && start <= virt_to_phys(_end)) ||
+ if ((start+size >= __pa_symbol(_text)
+ && start <= __pa_symbol(_end)) ||
!e820_all_mapped(start, start+size, E820_RAM) ||
memblock_is_region_reserved(start, size)) {
/* Could not reserve, skip it */
@@ -844,7 +844,7 @@ void __init efi_enter_virtual_mode(void)
efi_memory_desc_t *md, *prev_md = NULL;
efi_status_t status;
unsigned long size;
- u64 end, systab, end_pfn;
+ u64 end, systab, start_pfn, end_pfn;
void *p, *va, *new_memmap = NULL;
int count = 0;
@@ -897,10 +897,9 @@ void __init efi_enter_virtual_mode(void)
size = md->num_pages << EFI_PAGE_SHIFT;
end = md->phys_addr + size;
+ start_pfn = PFN_DOWN(md->phys_addr);
end_pfn = PFN_UP(end);
- if (end_pfn <= max_low_pfn_mapped
- || (end_pfn > (1UL << (32 - PAGE_SHIFT))
- && end_pfn <= max_pfn_mapped)) {
+ if (pfn_range_is_mapped(start_pfn, end_pfn)) {
va = __va(md->phys_addr);
if (!(md->attribute & EFI_MEMORY_WB))
diff --git a/arch/x86/platform/olpc/olpc-xo15-sci.c b/arch/x86/platform/olpc/olpc-xo15-sci.c
index 2fdca25905ae..fef7d0ba7e3a 100644
--- a/arch/x86/platform/olpc/olpc-xo15-sci.c
+++ b/arch/x86/platform/olpc/olpc-xo15-sci.c
@@ -195,7 +195,7 @@ err_sysfs:
return r;
}
-static int xo15_sci_remove(struct acpi_device *device, int type)
+static int xo15_sci_remove(struct acpi_device *device)
{
acpi_disable_gpe(NULL, xo15_sci_gpe);
acpi_remove_gpe_handler(NULL, xo15_sci_gpe, xo15_sci_gpe_handler);
diff --git a/arch/x86/power/hibernate_32.c b/arch/x86/power/hibernate_32.c
index 74202c1910cd..7d28c885d238 100644
--- a/arch/x86/power/hibernate_32.c
+++ b/arch/x86/power/hibernate_32.c
@@ -129,8 +129,6 @@ static int resume_physical_mapping_init(pgd_t *pgd_base)
}
}
- resume_map_numa_kva(pgd_base);
-
return 0;
}
diff --git a/arch/x86/power/hibernate_64.c b/arch/x86/power/hibernate_64.c
index 460f314d13e5..a0fde91c16cf 100644
--- a/arch/x86/power/hibernate_64.c
+++ b/arch/x86/power/hibernate_64.c
@@ -11,6 +11,8 @@
#include <linux/gfp.h>
#include <linux/smp.h>
#include <linux/suspend.h>
+
+#include <asm/init.h>
#include <asm/proto.h>
#include <asm/page.h>
#include <asm/pgtable.h>
@@ -39,41 +41,21 @@ pgd_t *temp_level4_pgt;
void *relocated_restore_code;
-static int res_phys_pud_init(pud_t *pud, unsigned long address, unsigned long end)
+static void *alloc_pgt_page(void *context)
{
- long i, j;
-
- i = pud_index(address);
- pud = pud + i;
- for (; i < PTRS_PER_PUD; pud++, i++) {
- unsigned long paddr;
- pmd_t *pmd;
-
- paddr = address + i*PUD_SIZE;
- if (paddr >= end)
- break;
-
- pmd = (pmd_t *)get_safe_page(GFP_ATOMIC);
- if (!pmd)
- return -ENOMEM;
- set_pud(pud, __pud(__pa(pmd) | _KERNPG_TABLE));
- for (j = 0; j < PTRS_PER_PMD; pmd++, j++, paddr += PMD_SIZE) {
- unsigned long pe;
-
- if (paddr >= end)
- break;
- pe = __PAGE_KERNEL_LARGE_EXEC | paddr;
- pe &= __supported_pte_mask;
- set_pmd(pmd, __pmd(pe));
- }
- }
- return 0;
+ return (void *)get_safe_page(GFP_ATOMIC);
}
static int set_up_temporary_mappings(void)
{
- unsigned long start, end, next;
- int error;
+ struct x86_mapping_info info = {
+ .alloc_pgt_page = alloc_pgt_page,
+ .pmd_flag = __PAGE_KERNEL_LARGE_EXEC,
+ .kernel_mapping = true,
+ };
+ unsigned long mstart, mend;
+ int result;
+ int i;
temp_level4_pgt = (pgd_t *)get_safe_page(GFP_ATOMIC);
if (!temp_level4_pgt)
@@ -84,21 +66,17 @@ static int set_up_temporary_mappings(void)
init_level4_pgt[pgd_index(__START_KERNEL_map)]);
/* Set up the direct mapping from scratch */
- start = (unsigned long)pfn_to_kaddr(0);
- end = (unsigned long)pfn_to_kaddr(max_pfn);
-
- for (; start < end; start = next) {
- pud_t *pud = (pud_t *)get_safe_page(GFP_ATOMIC);
- if (!pud)
- return -ENOMEM;
- next = start + PGDIR_SIZE;
- if (next > end)
- next = end;
- if ((error = res_phys_pud_init(pud, __pa(start), __pa(next))))
- return error;
- set_pgd(temp_level4_pgt + pgd_index(start),
- mk_kernel_pgd(__pa(pud)));
+ for (i = 0; i < nr_pfn_mapped; i++) {
+ mstart = pfn_mapped[i].start << PAGE_SHIFT;
+ mend = pfn_mapped[i].end << PAGE_SHIFT;
+
+ result = kernel_ident_mapping_init(&info, temp_level4_pgt,
+ mstart, mend);
+
+ if (result)
+ return result;
}
+
return 0;
}
diff --git a/arch/x86/realmode/init.c b/arch/x86/realmode/init.c
index cbca565af5bd..a44f457e70a1 100644
--- a/arch/x86/realmode/init.c
+++ b/arch/x86/realmode/init.c
@@ -8,9 +8,26 @@
struct real_mode_header *real_mode_header;
u32 *trampoline_cr4_features;
-void __init setup_real_mode(void)
+void __init reserve_real_mode(void)
{
phys_addr_t mem;
+ unsigned char *base;
+ size_t size = PAGE_ALIGN(real_mode_blob_end - real_mode_blob);
+
+ /* Has to be under 1M so we can execute real-mode AP code. */
+ mem = memblock_find_in_range(0, 1<<20, size, PAGE_SIZE);
+ if (!mem)
+ panic("Cannot allocate trampoline\n");
+
+ base = __va(mem);
+ memblock_reserve(mem, size);
+ real_mode_header = (struct real_mode_header *) base;
+ printk(KERN_DEBUG "Base memory trampoline at [%p] %llx size %zu\n",
+ base, (unsigned long long)mem, size);
+}
+
+void __init setup_real_mode(void)
+{
u16 real_mode_seg;
u32 *rel;
u32 count;
@@ -25,16 +42,7 @@ void __init setup_real_mode(void)
u64 efer;
#endif
- /* Has to be in very low memory so we can execute real-mode AP code. */
- mem = memblock_find_in_range(0, 1<<20, size, PAGE_SIZE);
- if (!mem)
- panic("Cannot allocate trampoline\n");
-
- base = __va(mem);
- memblock_reserve(mem, size);
- real_mode_header = (struct real_mode_header *) base;
- printk(KERN_DEBUG "Base memory trampoline at [%p] %llx size %zu\n",
- base, (unsigned long long)mem, size);
+ base = (unsigned char *)real_mode_header;
memcpy(base, real_mode_blob, size);
@@ -62,9 +70,9 @@ void __init setup_real_mode(void)
__va(real_mode_header->trampoline_header);
#ifdef CONFIG_X86_32
- trampoline_header->start = __pa(startup_32_smp);
+ trampoline_header->start = __pa_symbol(startup_32_smp);
trampoline_header->gdt_limit = __BOOT_DS + 7;
- trampoline_header->gdt_base = __pa(boot_gdt);
+ trampoline_header->gdt_base = __pa_symbol(boot_gdt);
#else
/*
* Some AMD processors will #GP(0) if EFER.LMA is set in WRMSR
@@ -78,16 +86,18 @@ void __init setup_real_mode(void)
*trampoline_cr4_features = read_cr4();
trampoline_pgd = (u64 *) __va(real_mode_header->trampoline_pgd);
- trampoline_pgd[0] = __pa(level3_ident_pgt) + _KERNPG_TABLE;
- trampoline_pgd[511] = __pa(level3_kernel_pgt) + _KERNPG_TABLE;
+ trampoline_pgd[0] = init_level4_pgt[pgd_index(__PAGE_OFFSET)].pgd;
+ trampoline_pgd[511] = init_level4_pgt[511].pgd;
#endif
}
/*
- * set_real_mode_permissions() gets called very early, to guarantee the
- * availability of low memory. This is before the proper kernel page
+ * reserve_real_mode() gets called very early, to guarantee the
+ * availability of low memory. This is before the proper kernel page
* tables are set up, so we cannot set page permissions in that
- * function. Thus, we use an arch_initcall instead.
+ * function. Also trampoline code will be executed by APs so we
+ * need to mark it executable at do_pre_smp_initcalls() at least,
+ * thus run it as a early_initcall().
*/
static int __init set_real_mode_permissions(void)
{
@@ -111,5 +121,4 @@ static int __init set_real_mode_permissions(void)
return 0;
}
-
-arch_initcall(set_real_mode_permissions);
+early_initcall(set_real_mode_permissions);
diff --git a/arch/x86/syscalls/syscall_32.tbl b/arch/x86/syscalls/syscall_32.tbl
index 28e3fa9056ea..f2fe78ff22cc 100644
--- a/arch/x86/syscalls/syscall_32.tbl
+++ b/arch/x86/syscalls/syscall_32.tbl
@@ -73,12 +73,12 @@
64 i386 getppid sys_getppid
65 i386 getpgrp sys_getpgrp
66 i386 setsid sys_setsid
-67 i386 sigaction sys_sigaction sys32_sigaction
+67 i386 sigaction sys_sigaction compat_sys_sigaction
68 i386 sgetmask sys_sgetmask
69 i386 ssetmask sys_ssetmask
70 i386 setreuid sys_setreuid16
71 i386 setregid sys_setregid16
-72 i386 sigsuspend sys_sigsuspend sys32_sigsuspend
+72 i386 sigsuspend sys_sigsuspend sys_sigsuspend
73 i386 sigpending sys_sigpending compat_sys_sigpending
74 i386 sethostname sys_sethostname
75 i386 setrlimit sys_setrlimit compat_sys_setrlimit
@@ -116,16 +116,16 @@
107 i386 lstat sys_newlstat compat_sys_newlstat
108 i386 fstat sys_newfstat compat_sys_newfstat
109 i386 olduname sys_uname
-110 i386 iopl ptregs_iopl stub32_iopl
+110 i386 iopl sys_iopl
111 i386 vhangup sys_vhangup
112 i386 idle
-113 i386 vm86old ptregs_vm86old sys32_vm86_warning
+113 i386 vm86old sys_vm86old sys32_vm86_warning
114 i386 wait4 sys_wait4 compat_sys_wait4
115 i386 swapoff sys_swapoff
116 i386 sysinfo sys_sysinfo compat_sys_sysinfo
117 i386 ipc sys_ipc sys32_ipc
118 i386 fsync sys_fsync
-119 i386 sigreturn ptregs_sigreturn stub32_sigreturn
+119 i386 sigreturn sys_sigreturn stub32_sigreturn
120 i386 clone sys_clone stub32_clone
121 i386 setdomainname sys_setdomainname
122 i386 uname sys_newuname
@@ -167,24 +167,24 @@
158 i386 sched_yield sys_sched_yield
159 i386 sched_get_priority_max sys_sched_get_priority_max
160 i386 sched_get_priority_min sys_sched_get_priority_min
-161 i386 sched_rr_get_interval sys_sched_rr_get_interval sys32_sched_rr_get_interval
+161 i386 sched_rr_get_interval sys_sched_rr_get_interval compat_sys_sched_rr_get_interval
162 i386 nanosleep sys_nanosleep compat_sys_nanosleep
163 i386 mremap sys_mremap
164 i386 setresuid sys_setresuid16
165 i386 getresuid sys_getresuid16
-166 i386 vm86 ptregs_vm86 sys32_vm86_warning
+166 i386 vm86 sys_vm86 sys32_vm86_warning
167 i386 query_module
168 i386 poll sys_poll
169 i386 nfsservctl
170 i386 setresgid sys_setresgid16
171 i386 getresgid sys_getresgid16
172 i386 prctl sys_prctl
-173 i386 rt_sigreturn ptregs_rt_sigreturn stub32_rt_sigreturn
-174 i386 rt_sigaction sys_rt_sigaction sys32_rt_sigaction
+173 i386 rt_sigreturn sys_rt_sigreturn stub32_rt_sigreturn
+174 i386 rt_sigaction sys_rt_sigaction compat_sys_rt_sigaction
175 i386 rt_sigprocmask sys_rt_sigprocmask
-176 i386 rt_sigpending sys_rt_sigpending sys32_rt_sigpending
+176 i386 rt_sigpending sys_rt_sigpending compat_sys_rt_sigpending
177 i386 rt_sigtimedwait sys_rt_sigtimedwait compat_sys_rt_sigtimedwait
-178 i386 rt_sigqueueinfo sys_rt_sigqueueinfo sys32_rt_sigqueueinfo
+178 i386 rt_sigqueueinfo sys_rt_sigqueueinfo compat_sys_rt_sigqueueinfo
179 i386 rt_sigsuspend sys_rt_sigsuspend
180 i386 pread64 sys_pread64 sys32_pread
181 i386 pwrite64 sys_pwrite64 sys32_pwrite
diff --git a/arch/x86/syscalls/syscall_64.tbl b/arch/x86/syscalls/syscall_64.tbl
index dc97328bd90a..38ae65dfd14f 100644
--- a/arch/x86/syscalls/syscall_64.tbl
+++ b/arch/x86/syscalls/syscall_64.tbl
@@ -325,7 +325,7 @@
# x32-specific system call numbers start at 512 to avoid cache impact
# for native 64-bit operation.
#
-512 x32 rt_sigaction sys32_rt_sigaction
+512 x32 rt_sigaction compat_sys_rt_sigaction
513 x32 rt_sigreturn stub_x32_rt_sigreturn
514 x32 ioctl compat_sys_ioctl
515 x32 readv compat_sys_readv
@@ -335,9 +335,9 @@
519 x32 recvmsg compat_sys_recvmsg
520 x32 execve stub_x32_execve
521 x32 ptrace compat_sys_ptrace
-522 x32 rt_sigpending sys32_rt_sigpending
+522 x32 rt_sigpending compat_sys_rt_sigpending
523 x32 rt_sigtimedwait compat_sys_rt_sigtimedwait
-524 x32 rt_sigqueueinfo sys32_rt_sigqueueinfo
+524 x32 rt_sigqueueinfo compat_sys_rt_sigqueueinfo
525 x32 sigaltstack compat_sys_sigaltstack
526 x32 timer_create compat_sys_timer_create
527 x32 mq_notify compat_sys_mq_notify
diff --git a/arch/x86/um/Kconfig b/arch/x86/um/Kconfig
index 53c90fd412d1..14ef8d1dbc33 100644
--- a/arch/x86/um/Kconfig
+++ b/arch/x86/um/Kconfig
@@ -13,7 +13,6 @@ endmenu
config UML_X86
def_bool y
select GENERIC_FIND_FIRST_BIT
- select GENERIC_SIGALTSTACK
config 64BIT
bool "64-bit kernel" if SUBARCH = "x86"
@@ -25,6 +24,8 @@ config X86_32
select ARCH_WANT_IPC_PARSE_VERSION
select MODULES_USE_ELF_REL
select CLONE_BACKWARDS
+ select OLD_SIGSUSPEND3
+ select OLD_SIGACTION
config X86_64
def_bool 64BIT
@@ -37,9 +38,8 @@ config RWSEM_GENERIC_SPINLOCK
def_bool !RWSEM_XCHGADD_ALGORITHM
config 3_LEVEL_PGTABLES
- bool "Three-level pagetables (EXPERIMENTAL)" if !64BIT
+ bool "Three-level pagetables" if !64BIT
default 64BIT
- depends on EXPERIMENTAL
help
Three-level pagetables will let UML have more than 4G of physical
memory. All the memory that can't be mapped directly will be treated
diff --git a/arch/x86/um/Makefile b/arch/x86/um/Makefile
index 5d065b2222d3..eafa324eb7a5 100644
--- a/arch/x86/um/Makefile
+++ b/arch/x86/um/Makefile
@@ -10,7 +10,7 @@ endif
obj-y = bug.o bugs_$(BITS).o delay.o fault.o ksyms.o ldt.o \
ptrace_$(BITS).o ptrace_user.o setjmp_$(BITS).o signal.o \
- stub_$(BITS).o stub_segv.o syscalls_$(BITS).o \
+ stub_$(BITS).o stub_segv.o \
sys_call_table_$(BITS).o sysrq_$(BITS).o tls_$(BITS).o \
mem_$(BITS).o subarch.o os-$(OS)/
@@ -25,7 +25,7 @@ subarch-$(CONFIG_HIGHMEM) += ../mm/highmem_32.o
else
-obj-y += vdso/
+obj-y += syscalls_64.o vdso/
subarch-y = ../lib/csum-partial_64.o ../lib/memcpy_64.o ../lib/thunk_64.o \
../lib/rwsem.o
diff --git a/arch/x86/um/shared/sysdep/syscalls_32.h b/arch/x86/um/shared/sysdep/syscalls_32.h
index 8436079be914..68fd2cf526fd 100644
--- a/arch/x86/um/shared/sysdep/syscalls_32.h
+++ b/arch/x86/um/shared/sysdep/syscalls_32.h
@@ -8,11 +8,6 @@
typedef long syscall_handler_t(struct pt_regs);
-/* Not declared on x86, incompatible declarations on x86_64, so these have
- * to go here rather than in sys_call_table.c
- */
-extern syscall_handler_t sys_rt_sigaction;
-
extern syscall_handler_t *sys_call_table[];
#define EXECUTE_SYSCALL(syscall, regs) \
diff --git a/arch/x86/um/signal.c b/arch/x86/um/signal.c
index 71cef48ea5cd..ae7319db18ee 100644
--- a/arch/x86/um/signal.c
+++ b/arch/x86/um/signal.c
@@ -464,7 +464,7 @@ int setup_signal_stack_si(unsigned long stack_top, int sig,
return 0;
}
-long sys_sigreturn(struct pt_regs *regs)
+long sys_sigreturn(void)
{
unsigned long sp = PT_REGS_SP(&current->thread.regs);
struct sigframe __user *frame = (struct sigframe __user *)(sp - 8);
@@ -577,7 +577,7 @@ int setup_signal_stack_si(unsigned long stack_top, int sig,
}
#endif
-long sys_rt_sigreturn(struct pt_regs *regs)
+long sys_rt_sigreturn(void)
{
unsigned long sp = PT_REGS_SP(&current->thread.regs);
struct rt_sigframe __user *frame =
@@ -601,14 +601,3 @@ long sys_rt_sigreturn(struct pt_regs *regs)
force_sig(SIGSEGV, current);
return 0;
}
-
-#ifdef CONFIG_X86_32
-long ptregs_sigreturn(void)
-{
- return sys_sigreturn(NULL);
-}
-long ptregs_rt_sigreturn(void)
-{
- return sys_rt_sigreturn(NULL);
-}
-#endif
diff --git a/arch/x86/um/sys_call_table_32.c b/arch/x86/um/sys_call_table_32.c
index a0c3b0d1a122..531d4269e2e3 100644
--- a/arch/x86/um/sys_call_table_32.c
+++ b/arch/x86/um/sys_call_table_32.c
@@ -24,10 +24,6 @@
#define old_mmap sys_old_mmap
-#define ptregs_iopl sys_iopl
-#define ptregs_vm86old sys_vm86old
-#define ptregs_vm86 sys_vm86
-
#define __SYSCALL_I386(nr, sym, compat) extern asmlinkage void sym(void) ;
#include <asm/syscalls_32.h>
diff --git a/arch/x86/um/syscalls_32.c b/arch/x86/um/syscalls_32.c
deleted file mode 100644
index e8bcea99acdb..000000000000
--- a/arch/x86/um/syscalls_32.c
+++ /dev/null
@@ -1,38 +0,0 @@
-/*
- * Copyright (C) 2000 - 2003 Jeff Dike (jdike@addtoit.com)
- * Licensed under the GPL
- */
-
-#include <linux/syscalls.h>
-#include <sysdep/syscalls.h>
-
-long sys_sigaction(int sig, const struct old_sigaction __user *act,
- struct old_sigaction __user *oact)
-{
- struct k_sigaction new_ka, old_ka;
- int ret;
-
- if (act) {
- old_sigset_t mask;
- if (!access_ok(VERIFY_READ, act, sizeof(*act)) ||
- __get_user(new_ka.sa.sa_handler, &act->sa_handler) ||
- __get_user(new_ka.sa.sa_restorer, &act->sa_restorer) ||
- __get_user(new_ka.sa.sa_flags, &act->sa_flags) ||
- __get_user(mask, &act->sa_mask))
- return -EFAULT;
- siginitset(&new_ka.sa.sa_mask, mask);
- }
-
- ret = do_sigaction(sig, act ? &new_ka : NULL, oact ? &old_ka : NULL);
-
- if (!ret && oact) {
- if (!access_ok(VERIFY_WRITE, oact, sizeof(*oact)) ||
- __put_user(old_ka.sa.sa_handler, &oact->sa_handler) ||
- __put_user(old_ka.sa.sa_restorer, &oact->sa_restorer) ||
- __put_user(old_ka.sa.sa_flags, &oact->sa_flags) ||
- __put_user(old_ka.sa.sa_mask.sig[0], &oact->sa_mask))
- return -EFAULT;
- }
-
- return ret;
-}
diff --git a/arch/x86/xen/mmu.c b/arch/x86/xen/mmu.c
index 01de35c77221..e8e34938c57d 100644
--- a/arch/x86/xen/mmu.c
+++ b/arch/x86/xen/mmu.c
@@ -1178,20 +1178,6 @@ static void xen_exit_mmap(struct mm_struct *mm)
static void xen_post_allocator_init(void);
-static __init void xen_mapping_pagetable_reserve(u64 start, u64 end)
-{
- /* reserve the range used */
- native_pagetable_reserve(start, end);
-
- /* set as RW the rest */
- printk(KERN_DEBUG "xen: setting RW the range %llx - %llx\n", end,
- PFN_PHYS(pgt_buf_top));
- while (end < PFN_PHYS(pgt_buf_top)) {
- make_lowmem_page_readwrite(__va(end));
- end += PAGE_SIZE;
- }
-}
-
#ifdef CONFIG_X86_64
static void __init xen_cleanhighmap(unsigned long vaddr,
unsigned long vaddr_end)
@@ -1422,7 +1408,6 @@ static void __xen_write_cr3(bool kernel, unsigned long cr3)
xen_mc_callback(set_current_cr3, (void *)cr3);
}
}
-
static void xen_write_cr3(unsigned long cr3)
{
BUG_ON(preemptible());
@@ -1448,6 +1433,45 @@ static void xen_write_cr3(unsigned long cr3)
xen_mc_issue(PARAVIRT_LAZY_CPU); /* interrupts restored */
}
+#ifdef CONFIG_X86_64
+/*
+ * At the start of the day - when Xen launches a guest, it has already
+ * built pagetables for the guest. We diligently look over them
+ * in xen_setup_kernel_pagetable and graft as appropiate them in the
+ * init_level4_pgt and its friends. Then when we are happy we load
+ * the new init_level4_pgt - and continue on.
+ *
+ * The generic code starts (start_kernel) and 'init_mem_mapping' sets
+ * up the rest of the pagetables. When it has completed it loads the cr3.
+ * N.B. that baremetal would start at 'start_kernel' (and the early
+ * #PF handler would create bootstrap pagetables) - so we are running
+ * with the same assumptions as what to do when write_cr3 is executed
+ * at this point.
+ *
+ * Since there are no user-page tables at all, we have two variants
+ * of xen_write_cr3 - the early bootup (this one), and the late one
+ * (xen_write_cr3). The reason we have to do that is that in 64-bit
+ * the Linux kernel and user-space are both in ring 3 while the
+ * hypervisor is in ring 0.
+ */
+static void __init xen_write_cr3_init(unsigned long cr3)
+{
+ BUG_ON(preemptible());
+
+ xen_mc_batch(); /* disables interrupts */
+
+ /* Update while interrupts are disabled, so its atomic with
+ respect to ipis */
+ this_cpu_write(xen_cr3, cr3);
+
+ __xen_write_cr3(true, cr3);
+
+ xen_mc_issue(PARAVIRT_LAZY_CPU); /* interrupts restored */
+
+ pv_mmu_ops.write_cr3 = &xen_write_cr3;
+}
+#endif
+
static int xen_pgd_alloc(struct mm_struct *mm)
{
pgd_t *pgd = mm->pgd;
@@ -1503,19 +1527,6 @@ static pte_t __init mask_rw_pte(pte_t *ptep, pte_t pte)
#else /* CONFIG_X86_64 */
static pte_t __init mask_rw_pte(pte_t *ptep, pte_t pte)
{
- unsigned long pfn = pte_pfn(pte);
-
- /*
- * If the new pfn is within the range of the newly allocated
- * kernel pagetable, and it isn't being mapped into an
- * early_ioremap fixmap slot as a freshly allocated page, make sure
- * it is RO.
- */
- if (((!is_early_ioremap_ptep(ptep) &&
- pfn >= pgt_buf_start && pfn < pgt_buf_top)) ||
- (is_early_ioremap_ptep(ptep) && pfn != (pgt_buf_end - 1)))
- pte = pte_wrprotect(pte);
-
return pte;
}
#endif /* CONFIG_X86_64 */
@@ -2129,11 +2140,7 @@ static const struct pv_mmu_ops xen_mmu_ops __initconst = {
.write_cr2 = xen_write_cr2,
.read_cr3 = xen_read_cr3,
-#ifdef CONFIG_X86_32
.write_cr3 = xen_write_cr3_init,
-#else
- .write_cr3 = xen_write_cr3,
-#endif
.flush_tlb_user = xen_flush_tlb,
.flush_tlb_kernel = xen_flush_tlb,
@@ -2197,7 +2204,6 @@ static const struct pv_mmu_ops xen_mmu_ops __initconst = {
void __init xen_init_mmu_ops(void)
{
- x86_init.mapping.pagetable_reserve = xen_mapping_pagetable_reserve;
x86_init.paging.pagetable_init = xen_pagetable_init;
pv_mmu_ops = xen_mmu_ops;
diff --git a/arch/x86/xen/setup.c b/arch/x86/xen/setup.c
index 8971a26d21ab..94eac5c85cdc 100644
--- a/arch/x86/xen/setup.c
+++ b/arch/x86/xen/setup.c
@@ -556,12 +556,9 @@ void __init xen_arch_setup(void)
COMMAND_LINE_SIZE : MAX_GUEST_CMDLINE);
/* Set up idle, making sure it calls safe_halt() pvop */
-#ifdef CONFIG_X86_32
- boot_cpu_data.hlt_works_ok = 1;
-#endif
disable_cpuidle();
disable_cpufreq();
- WARN_ON(set_pm_idle_to_default());
+ WARN_ON(xen_set_default_idle());
fiddle_vdso();
#ifdef CONFIG_NUMA
numa_off = 1;
diff --git a/arch/x86/xen/smp.c b/arch/x86/xen/smp.c
index 34bc4cee8887..09ea61d2e02f 100644
--- a/arch/x86/xen/smp.c
+++ b/arch/x86/xen/smp.c
@@ -300,8 +300,6 @@ cpu_initialize_context(unsigned int cpu, struct task_struct *idle)
gdt = get_cpu_gdt_table(cpu);
ctxt->flags = VGCF_IN_KERNEL;
- ctxt->user_regs.ds = __USER_DS;
- ctxt->user_regs.es = __USER_DS;
ctxt->user_regs.ss = __KERNEL_DS;
#ifdef CONFIG_X86_32
ctxt->user_regs.fs = __KERNEL_PERCPU;
@@ -310,35 +308,41 @@ cpu_initialize_context(unsigned int cpu, struct task_struct *idle)
ctxt->gs_base_kernel = per_cpu_offset(cpu);
#endif
ctxt->user_regs.eip = (unsigned long)cpu_bringup_and_idle;
- ctxt->user_regs.eflags = 0x1000; /* IOPL_RING1 */
memset(&ctxt->fpu_ctxt, 0, sizeof(ctxt->fpu_ctxt));
- xen_copy_trap_info(ctxt->trap_ctxt);
+ {
+ ctxt->user_regs.eflags = 0x1000; /* IOPL_RING1 */
+ ctxt->user_regs.ds = __USER_DS;
+ ctxt->user_regs.es = __USER_DS;
- ctxt->ldt_ents = 0;
+ xen_copy_trap_info(ctxt->trap_ctxt);
- BUG_ON((unsigned long)gdt & ~PAGE_MASK);
+ ctxt->ldt_ents = 0;
- gdt_mfn = arbitrary_virt_to_mfn(gdt);
- make_lowmem_page_readonly(gdt);
- make_lowmem_page_readonly(mfn_to_virt(gdt_mfn));
+ BUG_ON((unsigned long)gdt & ~PAGE_MASK);
- ctxt->gdt_frames[0] = gdt_mfn;
- ctxt->gdt_ents = GDT_ENTRIES;
+ gdt_mfn = arbitrary_virt_to_mfn(gdt);
+ make_lowmem_page_readonly(gdt);
+ make_lowmem_page_readonly(mfn_to_virt(gdt_mfn));
- ctxt->user_regs.cs = __KERNEL_CS;
- ctxt->user_regs.esp = idle->thread.sp0 - sizeof(struct pt_regs);
+ ctxt->gdt_frames[0] = gdt_mfn;
+ ctxt->gdt_ents = GDT_ENTRIES;
- ctxt->kernel_ss = __KERNEL_DS;
- ctxt->kernel_sp = idle->thread.sp0;
+ ctxt->kernel_ss = __KERNEL_DS;
+ ctxt->kernel_sp = idle->thread.sp0;
#ifdef CONFIG_X86_32
- ctxt->event_callback_cs = __KERNEL_CS;
- ctxt->failsafe_callback_cs = __KERNEL_CS;
+ ctxt->event_callback_cs = __KERNEL_CS;
+ ctxt->failsafe_callback_cs = __KERNEL_CS;
#endif
- ctxt->event_callback_eip = (unsigned long)xen_hypervisor_callback;
- ctxt->failsafe_callback_eip = (unsigned long)xen_failsafe_callback;
+ ctxt->event_callback_eip =
+ (unsigned long)xen_hypervisor_callback;
+ ctxt->failsafe_callback_eip =
+ (unsigned long)xen_failsafe_callback;
+ }
+ ctxt->user_regs.cs = __KERNEL_CS;
+ ctxt->user_regs.esp = idle->thread.sp0 - sizeof(struct pt_regs);
per_cpu(xen_cr3, cpu) = __pa(swapper_pg_dir);
ctxt->ctrlreg[3] = xen_pfn_to_cr3(virt_to_mfn(swapper_pg_dir));
diff --git a/arch/x86/xen/spinlock.c b/arch/x86/xen/spinlock.c
index 83e866d714ce..f7a080ef0354 100644
--- a/arch/x86/xen/spinlock.c
+++ b/arch/x86/xen/spinlock.c
@@ -328,7 +328,6 @@ static noinline void xen_spin_unlock_slow(struct xen_spinlock *xl)
if (per_cpu(lock_spinners, cpu) == xl) {
ADD_STATS(released_slow_kicked, 1);
xen_send_IPI_one(cpu, XEN_SPIN_UNLOCK_VECTOR);
- break;
}
}
}