diff options
Diffstat (limited to 'arch/x86')
201 files changed, 4238 insertions, 3812 deletions
diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index c2fb8a87dccb..faff6934c05a 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -235,12 +235,10 @@ config ARCH_WANT_GENERAL_HUGETLB def_bool y config ZONE_DMA32 - bool - default X86_64 + def_bool y if X86_64 config AUDIT_ARCH - bool - default X86_64 + def_bool y if X86_64 config ARCH_SUPPORTS_OPTIMIZED_INLINING def_bool y @@ -499,6 +497,7 @@ config X86_INTEL_QUARK depends on X86_IO_APIC select IOSF_MBI select INTEL_IMR + select COMMON_CLK ---help--- Select to include support for Quark X1000 SoC. Say Y here if you have a Quark based system such as the Arduino @@ -890,7 +889,8 @@ config UP_LATE_INIT depends on !SMP && X86_LOCAL_APIC config X86_UP_APIC - bool "Local APIC support on uniprocessors" + bool "Local APIC support on uniprocessors" if !PCI_MSI + default PCI_MSI depends on X86_32 && !SMP && !X86_32_NON_STANDARD ---help--- A local APIC (Advanced Programmable Interrupt Controller) is an @@ -902,10 +902,6 @@ config X86_UP_APIC performance counters), and the NMI watchdog which detects hard lockups. -config X86_UP_APIC_MSI - def_bool y - select X86_UP_APIC if X86_32 && !SMP && !X86_32_NON_STANDARD && PCI_MSI - config X86_UP_IOAPIC bool "IO-APIC support on uniprocessors" depends on X86_UP_APIC @@ -924,8 +920,8 @@ config X86_LOCAL_APIC select GENERIC_IRQ_LEGACY_ALLOC_HWIRQ config X86_IO_APIC - def_bool X86_64 || SMP || X86_32_NON_STANDARD || X86_UP_IOAPIC - depends on X86_LOCAL_APIC + def_bool y + depends on X86_LOCAL_APIC || X86_UP_IOAPIC select IRQ_DOMAIN config X86_REROUTE_FOR_BROKEN_BOOT_IRQS @@ -1144,10 +1140,10 @@ config MICROCODE_OLD_INTERFACE depends on MICROCODE config MICROCODE_INTEL_EARLY - def_bool n + bool config MICROCODE_AMD_EARLY - def_bool n + bool config MICROCODE_EARLY bool "Early load microcode" @@ -1299,14 +1295,14 @@ config ARCH_DMA_ADDR_T_64BIT def_bool y depends on X86_64 || HIGHMEM64G -config DIRECT_GBPAGES - bool "Enable 1GB pages for kernel pagetables" if EXPERT - default y - depends on X86_64 +config X86_DIRECT_GBPAGES + def_bool y + depends on X86_64 && !DEBUG_PAGEALLOC && !KMEMCHECK ---help--- - Allow the kernel linear mapping to use 1GB pages on CPUs that - support it. This can improve the kernel's performance a tiny bit by - reducing TLB pressure. If in doubt, say "Y". + Certain kernel features effectively disable kernel + linear 1 GB mappings (even if the CPU otherwise + supports them), so don't confuse the user by printing + that we have them enabled. # Common NUMA Features config NUMA @@ -1746,14 +1742,11 @@ config KEXEC_VERIFY_SIG depends on KEXEC_FILE ---help--- This option makes kernel signature verification mandatory for - kexec_file_load() syscall. If kernel is signature can not be - verified, kexec_file_load() will fail. - - This option enforces signature verification at generic level. - One needs to enable signature verification for type of kernel - image being loaded to make sure it works. For example, enable - bzImage signature verification option to be able to load and - verify signatures of bzImage. Otherwise kernel loading will fail. + the kexec_file_load() syscall. + + In addition to that option, you need to enable signature + verification for the corresponding kernel image type being + loaded in order for this to work. config KEXEC_BZIMAGE_VERIFY_SIG bool "Enable bzImage signature verification support" diff --git a/arch/x86/boot/compressed/aslr.c b/arch/x86/boot/compressed/aslr.c index 7083c16cccba..d7b1f655b3ef 100644 --- a/arch/x86/boot/compressed/aslr.c +++ b/arch/x86/boot/compressed/aslr.c @@ -14,13 +14,6 @@ static const char build_str[] = UTS_RELEASE " (" LINUX_COMPILE_BY "@" LINUX_COMPILE_HOST ") (" LINUX_COMPILER ") " UTS_VERSION; -struct kaslr_setup_data { - __u64 next; - __u32 type; - __u32 len; - __u8 data[1]; -} kaslr_setup_data; - #define I8254_PORT_CONTROL 0x43 #define I8254_PORT_COUNTER0 0x40 #define I8254_CMD_READBACK 0xC0 @@ -302,28 +295,7 @@ static unsigned long find_random_addr(unsigned long minimum, return slots_fetch_random(); } -static void add_kaslr_setup_data(struct boot_params *params, __u8 enabled) -{ - struct setup_data *data; - - kaslr_setup_data.type = SETUP_KASLR; - kaslr_setup_data.len = 1; - kaslr_setup_data.next = 0; - kaslr_setup_data.data[0] = enabled; - - data = (struct setup_data *)(unsigned long)params->hdr.setup_data; - - while (data && data->next) - data = (struct setup_data *)(unsigned long)data->next; - - if (data) - data->next = (unsigned long)&kaslr_setup_data; - else - params->hdr.setup_data = (unsigned long)&kaslr_setup_data; - -} - -unsigned char *choose_kernel_location(struct boot_params *params, +unsigned char *choose_kernel_location(struct boot_params *boot_params, unsigned char *input, unsigned long input_size, unsigned char *output, @@ -335,17 +307,16 @@ unsigned char *choose_kernel_location(struct boot_params *params, #ifdef CONFIG_HIBERNATION if (!cmdline_find_option_bool("kaslr")) { debug_putstr("KASLR disabled by default...\n"); - add_kaslr_setup_data(params, 0); goto out; } #else if (cmdline_find_option_bool("nokaslr")) { debug_putstr("KASLR disabled by cmdline...\n"); - add_kaslr_setup_data(params, 0); goto out; } #endif - add_kaslr_setup_data(params, 1); + + boot_params->hdr.loadflags |= KASLR_FLAG; /* Record the various known unsafe memory ranges. */ mem_avoid_init((unsigned long)input, input_size, diff --git a/arch/x86/boot/compressed/head_32.S b/arch/x86/boot/compressed/head_32.S index 1d7fbbcc196d..8ef964ddc18e 100644 --- a/arch/x86/boot/compressed/head_32.S +++ b/arch/x86/boot/compressed/head_32.S @@ -29,6 +29,7 @@ #include <asm/page_types.h> #include <asm/boot.h> #include <asm/asm-offsets.h> +#include <asm/bootparam.h> __HEAD ENTRY(startup_32) @@ -102,7 +103,7 @@ preferred_addr: * Test KEEP_SEGMENTS flag to see if the bootloader is asking * us to not reload segments */ - testb $(1<<6), BP_loadflags(%esi) + testb $KEEP_SEGMENTS, BP_loadflags(%esi) jnz 1f cli diff --git a/arch/x86/boot/compressed/head_64.S b/arch/x86/boot/compressed/head_64.S index 6b1766c6c082..b0c0d16ef58d 100644 --- a/arch/x86/boot/compressed/head_64.S +++ b/arch/x86/boot/compressed/head_64.S @@ -31,6 +31,7 @@ #include <asm/msr.h> #include <asm/processor-flags.h> #include <asm/asm-offsets.h> +#include <asm/bootparam.h> __HEAD .code32 @@ -46,7 +47,7 @@ ENTRY(startup_32) * Test KEEP_SEGMENTS flag to see if the bootloader is asking * us to not reload segments */ - testb $(1<<6), BP_loadflags(%esi) + testb $KEEP_SEGMENTS, BP_loadflags(%esi) jnz 1f cli @@ -164,7 +165,7 @@ ENTRY(startup_32) /* After gdt is loaded */ xorl %eax, %eax lldt %ax - movl $0x20, %eax + movl $__BOOT_TSS, %eax ltr %ax /* diff --git a/arch/x86/boot/compressed/misc.c b/arch/x86/boot/compressed/misc.c index 5903089c818f..a107b935e22f 100644 --- a/arch/x86/boot/compressed/misc.c +++ b/arch/x86/boot/compressed/misc.c @@ -377,6 +377,9 @@ asmlinkage __visible void *decompress_kernel(void *rmode, memptr heap, real_mode = rmode; + /* Clear it for solely in-kernel use */ + real_mode->hdr.loadflags &= ~KASLR_FLAG; + sanitize_boot_params(real_mode); if (real_mode->screen_info.orig_video_mode == 7) { @@ -401,8 +404,7 @@ asmlinkage __visible void *decompress_kernel(void *rmode, memptr heap, * the entire decompressed kernel plus relocation table, or the * entire decompressed kernel plus .bss and .brk sections. */ - output = choose_kernel_location(real_mode, input_data, input_len, - output, + output = choose_kernel_location(real_mode, input_data, input_len, output, output_len > run_size ? output_len : run_size); diff --git a/arch/x86/boot/compressed/misc.h b/arch/x86/boot/compressed/misc.h index ee3576b2666b..89dd0d78013a 100644 --- a/arch/x86/boot/compressed/misc.h +++ b/arch/x86/boot/compressed/misc.h @@ -57,7 +57,7 @@ int cmdline_find_option_bool(const char *option); #if CONFIG_RANDOMIZE_BASE /* aslr.c */ -unsigned char *choose_kernel_location(struct boot_params *params, +unsigned char *choose_kernel_location(struct boot_params *boot_params, unsigned char *input, unsigned long input_size, unsigned char *output, @@ -66,7 +66,7 @@ unsigned char *choose_kernel_location(struct boot_params *params, bool has_cpuflag(int flag); #else static inline -unsigned char *choose_kernel_location(struct boot_params *params, +unsigned char *choose_kernel_location(struct boot_params *boot_params, unsigned char *input, unsigned long input_size, unsigned char *output, diff --git a/arch/x86/boot/string.c b/arch/x86/boot/string.c index 493f3fd9f139..318b8465d302 100644 --- a/arch/x86/boot/string.c +++ b/arch/x86/boot/string.c @@ -30,7 +30,7 @@ int strcmp(const char *str1, const char *str2) int delta = 0; while (*s1 || *s2) { - delta = *s2 - *s1; + delta = *s1 - *s2; if (delta) return delta; s1++; diff --git a/arch/x86/boot/video-mode.c b/arch/x86/boot/video-mode.c index 748e8d06290a..aa8a96b052e3 100644 --- a/arch/x86/boot/video-mode.c +++ b/arch/x86/boot/video-mode.c @@ -22,10 +22,8 @@ /* * Common variables */ -int adapter; /* 0=CGA/MDA/HGC, 1=EGA, 2=VGA+ */ -u16 video_segment; +int adapter; /* 0=CGA/MDA/HGC, 1=EGA, 2=VGA+ */ int force_x, force_y; /* Don't query the BIOS for cols/rows */ - int do_restore; /* Screen contents changed during mode flip */ int graphic_mode; /* Graphic mode with linear frame buffer */ diff --git a/arch/x86/boot/video.c b/arch/x86/boot/video.c index 43eda284d27f..05111bb8d018 100644 --- a/arch/x86/boot/video.c +++ b/arch/x86/boot/video.c @@ -17,6 +17,8 @@ #include "video.h" #include "vesa.h" +static u16 video_segment; + static void store_cursor_position(void) { struct biosregs ireg, oreg; diff --git a/arch/x86/boot/video.h b/arch/x86/boot/video.h index 0bb25491262d..b54e0328c449 100644 --- a/arch/x86/boot/video.h +++ b/arch/x86/boot/video.h @@ -91,7 +91,6 @@ int mode_defined(u16 mode); /* video.c */ #define ADAPTER_VGA 2 extern int adapter; -extern u16 video_segment; extern int force_x, force_y; /* Don't query the BIOS for cols/rows */ extern int do_restore; /* Restore screen contents */ extern int graphic_mode; /* Graphics mode with linear frame buffer */ diff --git a/arch/x86/configs/i386_defconfig b/arch/x86/configs/i386_defconfig index 419819d6dab3..aaa1118bf01e 100644 --- a/arch/x86/configs/i386_defconfig +++ b/arch/x86/configs/i386_defconfig @@ -248,7 +248,7 @@ CONFIG_USB=y CONFIG_USB_ANNOUNCE_NEW_DEVICES=y CONFIG_USB_MON=y CONFIG_USB_EHCI_HCD=y -# CONFIG_USB_EHCI_TT_NEWSCHED is not set +CONFIG_USB_EHCI_TT_NEWSCHED=y CONFIG_USB_OHCI_HCD=y CONFIG_USB_UHCI_HCD=y CONFIG_USB_PRINTER=y diff --git a/arch/x86/configs/x86_64_defconfig b/arch/x86/configs/x86_64_defconfig index 4c311ddd973b..315b86106572 100644 --- a/arch/x86/configs/x86_64_defconfig +++ b/arch/x86/configs/x86_64_defconfig @@ -243,7 +243,7 @@ CONFIG_USB=y CONFIG_USB_ANNOUNCE_NEW_DEVICES=y CONFIG_USB_MON=y CONFIG_USB_EHCI_HCD=y -# CONFIG_USB_EHCI_TT_NEWSCHED is not set +CONFIG_USB_EHCI_TT_NEWSCHED=y CONFIG_USB_OHCI_HCD=y CONFIG_USB_UHCI_HCD=y CONFIG_USB_PRINTER=y diff --git a/arch/x86/crypto/aesni-intel_glue.c b/arch/x86/crypto/aesni-intel_glue.c index 947c6bf52c33..54f60ab41c63 100644 --- a/arch/x86/crypto/aesni-intel_glue.c +++ b/arch/x86/crypto/aesni-intel_glue.c @@ -1155,7 +1155,7 @@ static int __driver_rfc4106_decrypt(struct aead_request *req) src = kmalloc(req->cryptlen + req->assoclen, GFP_ATOMIC); if (!src) return -ENOMEM; - assoc = (src + req->cryptlen + auth_tag_len); + assoc = (src + req->cryptlen); scatterwalk_map_and_copy(src, req->src, 0, req->cryptlen, 0); scatterwalk_map_and_copy(assoc, req->assoc, 0, req->assoclen, 0); @@ -1180,7 +1180,7 @@ static int __driver_rfc4106_decrypt(struct aead_request *req) scatterwalk_done(&src_sg_walk, 0, 0); scatterwalk_done(&assoc_sg_walk, 0, 0); } else { - scatterwalk_map_and_copy(dst, req->dst, 0, req->cryptlen, 1); + scatterwalk_map_and_copy(dst, req->dst, 0, tempCipherLen, 1); kfree(src); } return retval; diff --git a/arch/x86/crypto/crc32c-pcl-intel-asm_64.S b/arch/x86/crypto/crc32c-pcl-intel-asm_64.S index 26d49ebae040..225be06edc80 100644 --- a/arch/x86/crypto/crc32c-pcl-intel-asm_64.S +++ b/arch/x86/crypto/crc32c-pcl-intel-asm_64.S @@ -178,7 +178,7 @@ continue_block: ## 2a) PROCESS FULL BLOCKS: ################################################################ full_block: - movq $128,%rax + movl $128,%eax lea 128*8*2(block_0), block_1 lea 128*8*3(block_0), block_2 add $128*8*1, block_0 diff --git a/arch/x86/crypto/twofish-x86_64-asm_64.S b/arch/x86/crypto/twofish-x86_64-asm_64.S index a039d21986a2..a350c990dc86 100644 --- a/arch/x86/crypto/twofish-x86_64-asm_64.S +++ b/arch/x86/crypto/twofish-x86_64-asm_64.S @@ -264,7 +264,7 @@ ENTRY(twofish_enc_blk) movq R1, 8(%rsi) popq R1 - movq $1,%rax + movl $1,%eax ret ENDPROC(twofish_enc_blk) @@ -316,6 +316,6 @@ ENTRY(twofish_dec_blk) movq R1, 8(%rsi) popq R1 - movq $1,%rax + movl $1,%eax ret ENDPROC(twofish_dec_blk) diff --git a/arch/x86/ia32/Makefile b/arch/x86/ia32/Makefile index e785b422b766..bb635c641869 100644 --- a/arch/x86/ia32/Makefile +++ b/arch/x86/ia32/Makefile @@ -3,7 +3,6 @@ # obj-$(CONFIG_IA32_EMULATION) := ia32entry.o sys_ia32.o ia32_signal.o -obj-$(CONFIG_IA32_EMULATION) += nosyscall.o syscall_ia32.o obj-$(CONFIG_IA32_AOUT) += ia32_aout.o diff --git a/arch/x86/ia32/ia32_signal.c b/arch/x86/ia32/ia32_signal.c index d0165c9a2932..c81d35e6c7f1 100644 --- a/arch/x86/ia32/ia32_signal.c +++ b/arch/x86/ia32/ia32_signal.c @@ -161,8 +161,7 @@ int copy_siginfo_from_user32(siginfo_t *to, compat_siginfo_t __user *from) } static int ia32_restore_sigcontext(struct pt_regs *regs, - struct sigcontext_ia32 __user *sc, - unsigned int *pax) + struct sigcontext_ia32 __user *sc) { unsigned int tmpflags, err = 0; void __user *buf; @@ -184,7 +183,7 @@ static int ia32_restore_sigcontext(struct pt_regs *regs, RELOAD_SEG(es); COPY(di); COPY(si); COPY(bp); COPY(sp); COPY(bx); - COPY(dx); COPY(cx); COPY(ip); + COPY(dx); COPY(cx); COPY(ip); COPY(ax); /* Don't touch extended registers */ COPY_SEG_CPL3(cs); @@ -197,12 +196,12 @@ static int ia32_restore_sigcontext(struct pt_regs *regs, get_user_ex(tmp, &sc->fpstate); buf = compat_ptr(tmp); - - get_user_ex(*pax, &sc->ax); } get_user_catch(err); err |= restore_xstate_sig(buf, 1); + force_iret(); + return err; } @@ -211,7 +210,6 @@ asmlinkage long sys32_sigreturn(void) struct pt_regs *regs = current_pt_regs(); struct sigframe_ia32 __user *frame = (struct sigframe_ia32 __user *)(regs->sp-8); sigset_t set; - unsigned int ax; if (!access_ok(VERIFY_READ, frame, sizeof(*frame))) goto badframe; @@ -224,9 +222,9 @@ asmlinkage long sys32_sigreturn(void) set_current_blocked(&set); - if (ia32_restore_sigcontext(regs, &frame->sc, &ax)) + if (ia32_restore_sigcontext(regs, &frame->sc)) goto badframe; - return ax; + return regs->ax; badframe: signal_fault(regs, frame, "32bit sigreturn"); @@ -238,7 +236,6 @@ asmlinkage long sys32_rt_sigreturn(void) struct pt_regs *regs = current_pt_regs(); struct rt_sigframe_ia32 __user *frame; sigset_t set; - unsigned int ax; frame = (struct rt_sigframe_ia32 __user *)(regs->sp - 4); @@ -249,13 +246,13 @@ asmlinkage long sys32_rt_sigreturn(void) set_current_blocked(&set); - if (ia32_restore_sigcontext(regs, &frame->uc.uc_mcontext, &ax)) + if (ia32_restore_sigcontext(regs, &frame->uc.uc_mcontext)) goto badframe; if (compat_restore_altstack(&frame->uc.uc_stack)) goto badframe; - return ax; + return regs->ax; badframe: signal_fault(regs, frame, "32bit rt sigreturn"); diff --git a/arch/x86/ia32/ia32entry.S b/arch/x86/ia32/ia32entry.S index 156ebcab4ada..a821b1cd4fa7 100644 --- a/arch/x86/ia32/ia32entry.S +++ b/arch/x86/ia32/ia32entry.S @@ -30,24 +30,13 @@ .section .entry.text, "ax" - .macro IA32_ARG_FIXUP noebp=0 - movl %edi,%r8d - .if \noebp - .else - movl %ebp,%r9d - .endif - xchg %ecx,%esi - movl %ebx,%edi - movl %edx,%edx /* zero extension */ - .endm - - /* clobbers %eax */ - .macro CLEAR_RREGS offset=0, _r9=rax + /* clobbers %rax */ + .macro CLEAR_RREGS _r9=rax xorl %eax,%eax - movq %rax,\offset+R11(%rsp) - movq %rax,\offset+R10(%rsp) - movq %\_r9,\offset+R9(%rsp) - movq %rax,\offset+R8(%rsp) + movq %rax,R11(%rsp) + movq %rax,R10(%rsp) + movq %\_r9,R9(%rsp) + movq %rax,R8(%rsp) .endm /* @@ -60,14 +49,14 @@ * If it's -1 to make us punt the syscall, then (u32)-1 is still * an appropriately invalid value. */ - .macro LOAD_ARGS32 offset, _r9=0 + .macro LOAD_ARGS32 _r9=0 .if \_r9 - movl \offset+16(%rsp),%r9d + movl R9(%rsp),%r9d .endif - movl \offset+40(%rsp),%ecx - movl \offset+48(%rsp),%edx - movl \offset+56(%rsp),%esi - movl \offset+64(%rsp),%edi + movl RCX(%rsp),%ecx + movl RDX(%rsp),%edx + movl RSI(%rsp),%esi + movl RDI(%rsp),%edi movl %eax,%eax /* zero extension */ .endm @@ -99,54 +88,69 @@ ENDPROC(native_irq_enable_sysexit) /* * 32bit SYSENTER instruction entry. * + * SYSENTER loads ss, rsp, cs, and rip from previously programmed MSRs. + * IF and VM in rflags are cleared (IOW: interrupts are off). + * SYSENTER does not save anything on the stack, + * and does not save old rip (!!!) and rflags. + * * Arguments: - * %eax System call number. - * %ebx Arg1 - * %ecx Arg2 - * %edx Arg3 - * %esi Arg4 - * %edi Arg5 - * %ebp user stack - * 0(%ebp) Arg6 - * - * Interrupts off. - * + * eax system call number + * ebx arg1 + * ecx arg2 + * edx arg3 + * esi arg4 + * edi arg5 + * ebp user stack + * 0(%ebp) arg6 + * * This is purely a fast path. For anything complicated we use the int 0x80 - * path below. Set up a complete hardware stack frame to share code + * path below. We set up a complete hardware stack frame to share code * with the int 0x80 path. - */ + */ ENTRY(ia32_sysenter_target) CFI_STARTPROC32 simple CFI_SIGNAL_FRAME CFI_DEF_CFA rsp,0 CFI_REGISTER rsp,rbp - SWAPGS_UNSAFE_STACK - movq PER_CPU_VAR(kernel_stack), %rsp - addq $(KERNEL_STACK_OFFSET),%rsp + /* - * No need to follow this irqs on/off section: the syscall - * disabled irqs, here we enable it straight after entry: + * Interrupts are off on entry. + * We do not frame this tiny irq-off block with TRACE_IRQS_OFF/ON, + * it is too small to ever cause noticeable irq latency. */ + SWAPGS_UNSAFE_STACK + movq PER_CPU_VAR(cpu_tss + TSS_sp0), %rsp ENABLE_INTERRUPTS(CLBR_NONE) - movl %ebp,%ebp /* zero extension */ - pushq_cfi $__USER32_DS - /*CFI_REL_OFFSET ss,0*/ - pushq_cfi %rbp - CFI_REL_OFFSET rsp,0 - pushfq_cfi - /*CFI_REL_OFFSET rflags,0*/ - movl TI_sysenter_return+THREAD_INFO(%rsp,3*8-KERNEL_STACK_OFFSET),%r10d - CFI_REGISTER rip,r10 - pushq_cfi $__USER32_CS - /*CFI_REL_OFFSET cs,0*/ + + /* Zero-extending 32-bit regs, do not remove */ + movl %ebp, %ebp movl %eax, %eax - pushq_cfi %r10 - CFI_REL_OFFSET rip,0 - pushq_cfi %rax + + movl ASM_THREAD_INFO(TI_sysenter_return, %rsp, 0), %r10d + CFI_REGISTER rip,r10 + + /* Construct struct pt_regs on stack */ + pushq_cfi $__USER32_DS /* pt_regs->ss */ + pushq_cfi %rbp /* pt_regs->sp */ + CFI_REL_OFFSET rsp,0 + pushfq_cfi /* pt_regs->flags */ + pushq_cfi $__USER32_CS /* pt_regs->cs */ + pushq_cfi %r10 /* pt_regs->ip = thread_info->sysenter_return */ + CFI_REL_OFFSET rip,0 + pushq_cfi_reg rax /* pt_regs->orig_ax */ + pushq_cfi_reg rdi /* pt_regs->di */ + pushq_cfi_reg rsi /* pt_regs->si */ + pushq_cfi_reg rdx /* pt_regs->dx */ + pushq_cfi_reg rcx /* pt_regs->cx */ + pushq_cfi_reg rax /* pt_regs->ax */ cld - SAVE_ARGS 0,1,0 - /* no need to do an access_ok check here because rbp has been - 32bit zero extended */ + sub $(10*8),%rsp /* pt_regs->r8-11,bp,bx,r12-15 not saved */ + CFI_ADJUST_CFA_OFFSET 10*8 + + /* + * no need to do an access_ok check here because rbp has been + * 32bit zero extended + */ ASM_STAC 1: movl (%rbp),%ebp _ASM_EXTABLE(1b,ia32_badarg) @@ -157,42 +161,80 @@ ENTRY(ia32_sysenter_target) * ourselves. To save a few cycles, we can check whether * NT was set instead of doing an unconditional popfq. */ - testl $X86_EFLAGS_NT,EFLAGS-ARGOFFSET(%rsp) + testl $X86_EFLAGS_NT,EFLAGS(%rsp) jnz sysenter_fix_flags sysenter_flags_fixed: - orl $TS_COMPAT,TI_status+THREAD_INFO(%rsp,RIP-ARGOFFSET) - testl $_TIF_WORK_SYSCALL_ENTRY,TI_flags+THREAD_INFO(%rsp,RIP-ARGOFFSET) + orl $TS_COMPAT, ASM_THREAD_INFO(TI_status, %rsp, SIZEOF_PTREGS) + testl $_TIF_WORK_SYSCALL_ENTRY, ASM_THREAD_INFO(TI_flags, %rsp, SIZEOF_PTREGS) CFI_REMEMBER_STATE jnz sysenter_tracesys cmpq $(IA32_NR_syscalls-1),%rax ja ia32_badsys sysenter_do_call: - IA32_ARG_FIXUP + /* 32bit syscall -> 64bit C ABI argument conversion */ + movl %edi,%r8d /* arg5 */ + movl %ebp,%r9d /* arg6 */ + xchg %ecx,%esi /* rsi:arg2, rcx:arg4 */ + movl %ebx,%edi /* arg1 */ + movl %edx,%edx /* arg3 (zero extension) */ sysenter_dispatch: call *ia32_sys_call_table(,%rax,8) - movq %rax,RAX-ARGOFFSET(%rsp) + movq %rax,RAX(%rsp) DISABLE_INTERRUPTS(CLBR_NONE) TRACE_IRQS_OFF - testl $_TIF_ALLWORK_MASK,TI_flags+THREAD_INFO(%rsp,RIP-ARGOFFSET) + testl $_TIF_ALLWORK_MASK, ASM_THREAD_INFO(TI_flags, %rsp, SIZEOF_PTREGS) jnz sysexit_audit sysexit_from_sys_call: - andl $~TS_COMPAT,TI_status+THREAD_INFO(%rsp,RIP-ARGOFFSET) - /* clear IF, that popfq doesn't enable interrupts early */ - andl $~0x200,EFLAGS-ARGOFFSET(%rsp) - movl RIP-ARGOFFSET(%rsp),%edx /* User %eip */ - CFI_REGISTER rip,rdx - RESTORE_ARGS 0,24,0,0,0,0 + /* + * NB: SYSEXIT is not obviously safe for 64-bit kernels -- an + * NMI between STI and SYSEXIT has poorly specified behavior, + * and and NMI followed by an IRQ with usergs is fatal. So + * we just pretend we're using SYSEXIT but we really use + * SYSRETL instead. + * + * This code path is still called 'sysexit' because it pairs + * with 'sysenter' and it uses the SYSENTER calling convention. + */ + andl $~TS_COMPAT,ASM_THREAD_INFO(TI_status, %rsp, SIZEOF_PTREGS) + movl RIP(%rsp),%ecx /* User %eip */ + CFI_REGISTER rip,rcx + RESTORE_RSI_RDI + xorl %edx,%edx /* avoid info leaks */ xorq %r8,%r8 xorq %r9,%r9 xorq %r10,%r10 - xorq %r11,%r11 - popfq_cfi + movl EFLAGS(%rsp),%r11d /* User eflags */ /*CFI_RESTORE rflags*/ - popq_cfi %rcx /* User %esp */ - CFI_REGISTER rsp,rcx TRACE_IRQS_ON - ENABLE_INTERRUPTS_SYSEXIT32 + + /* + * SYSRETL works even on Intel CPUs. Use it in preference to SYSEXIT, + * since it avoids a dicey window with interrupts enabled. + */ + movl RSP(%rsp),%esp + + /* + * USERGS_SYSRET32 does: + * gsbase = user's gs base + * eip = ecx + * rflags = r11 + * cs = __USER32_CS + * ss = __USER_DS + * + * The prologue set RIP(%rsp) to VDSO32_SYSENTER_RETURN, which does: + * + * pop %ebp + * pop %edx + * pop %ecx + * + * Therefore, we invoke SYSRETL with EDX and R8-R10 zeroed to + * avoid info leaks. R11 ends up with VDSO32_SYSENTER_RETURN's + * address (already known to user code), and R12-R15 are + * callee-saved and therefore don't contain any interesting + * kernel data. + */ + USERGS_SYSRET32 CFI_RESTORE_STATE @@ -205,18 +247,18 @@ sysexit_from_sys_call: movl %ebx,%esi /* 2nd arg: 1st syscall arg */ movl %eax,%edi /* 1st arg: syscall number */ call __audit_syscall_entry - movl RAX-ARGOFFSET(%rsp),%eax /* reload syscall number */ + movl RAX(%rsp),%eax /* reload syscall number */ cmpq $(IA32_NR_syscalls-1),%rax ja ia32_badsys movl %ebx,%edi /* reload 1st syscall arg */ - movl RCX-ARGOFFSET(%rsp),%esi /* reload 2nd syscall arg */ - movl RDX-ARGOFFSET(%rsp),%edx /* reload 3rd syscall arg */ - movl RSI-ARGOFFSET(%rsp),%ecx /* reload 4th syscall arg */ - movl RDI-ARGOFFSET(%rsp),%r8d /* reload 5th syscall arg */ + movl RCX(%rsp),%esi /* reload 2nd syscall arg */ + movl RDX(%rsp),%edx /* reload 3rd syscall arg */ + movl RSI(%rsp),%ecx /* reload 4th syscall arg */ + movl RDI(%rsp),%r8d /* reload 5th syscall arg */ .endm .macro auditsys_exit exit - testl $(_TIF_ALLWORK_MASK & ~_TIF_SYSCALL_AUDIT),TI_flags+THREAD_INFO(%rsp,RIP-ARGOFFSET) + testl $(_TIF_ALLWORK_MASK & ~_TIF_SYSCALL_AUDIT), ASM_THREAD_INFO(TI_flags, %rsp, SIZEOF_PTREGS) jnz ia32_ret_from_sys_call TRACE_IRQS_ON ENABLE_INTERRUPTS(CLBR_NONE) @@ -227,13 +269,13 @@ sysexit_from_sys_call: 1: setbe %al /* 1 if error, 0 if not */ movzbl %al,%edi /* zero-extend that into %edi */ call __audit_syscall_exit - movq RAX-ARGOFFSET(%rsp),%rax /* reload syscall return value */ + movq RAX(%rsp),%rax /* reload syscall return value */ movl $(_TIF_ALLWORK_MASK & ~_TIF_SYSCALL_AUDIT),%edi DISABLE_INTERRUPTS(CLBR_NONE) TRACE_IRQS_OFF - testl %edi,TI_flags+THREAD_INFO(%rsp,RIP-ARGOFFSET) + testl %edi, ASM_THREAD_INFO(TI_flags, %rsp, SIZEOF_PTREGS) jz \exit - CLEAR_RREGS -ARGOFFSET + CLEAR_RREGS jmp int_with_check .endm @@ -253,16 +295,16 @@ sysenter_fix_flags: sysenter_tracesys: #ifdef CONFIG_AUDITSYSCALL - testl $(_TIF_WORK_SYSCALL_ENTRY & ~_TIF_SYSCALL_AUDIT),TI_flags+THREAD_INFO(%rsp,RIP-ARGOFFSET) + testl $(_TIF_WORK_SYSCALL_ENTRY & ~_TIF_SYSCALL_AUDIT), ASM_THREAD_INFO(TI_flags, %rsp, SIZEOF_PTREGS) jz sysenter_auditsys #endif - SAVE_REST + SAVE_EXTRA_REGS CLEAR_RREGS movq $-ENOSYS,RAX(%rsp)/* ptrace can change this for a bad syscall */ movq %rsp,%rdi /* &pt_regs -> arg1 */ call syscall_trace_enter - LOAD_ARGS32 ARGOFFSET /* reload args from stack in case ptrace changed it */ - RESTORE_REST + LOAD_ARGS32 /* reload args from stack in case ptrace changed it */ + RESTORE_EXTRA_REGS cmpq $(IA32_NR_syscalls-1),%rax ja int_ret_from_sys_call /* sysenter_tracesys has set RAX(%rsp) */ jmp sysenter_do_call @@ -272,94 +314,128 @@ ENDPROC(ia32_sysenter_target) /* * 32bit SYSCALL instruction entry. * + * 32bit SYSCALL saves rip to rcx, clears rflags.RF, then saves rflags to r11, + * then loads new ss, cs, and rip from previously programmed MSRs. + * rflags gets masked by a value from another MSR (so CLD and CLAC + * are not needed). SYSCALL does not save anything on the stack + * and does not change rsp. + * + * Note: rflags saving+masking-with-MSR happens only in Long mode + * (in legacy 32bit mode, IF, RF and VM bits are cleared and that's it). + * Don't get confused: rflags saving+masking depends on Long Mode Active bit + * (EFER.LMA=1), NOT on bitness of userspace where SYSCALL executes + * or target CS descriptor's L bit (SYSCALL does not read segment descriptors). + * * Arguments: - * %eax System call number. - * %ebx Arg1 - * %ecx return EIP - * %edx Arg3 - * %esi Arg4 - * %edi Arg5 - * %ebp Arg2 [note: not saved in the stack frame, should not be touched] - * %esp user stack - * 0(%esp) Arg6 - * - * Interrupts off. - * + * eax system call number + * ecx return address + * ebx arg1 + * ebp arg2 (note: not saved in the stack frame, should not be touched) + * edx arg3 + * esi arg4 + * edi arg5 + * esp user stack + * 0(%esp) arg6 + * * This is purely a fast path. For anything complicated we use the int 0x80 - * path below. Set up a complete hardware stack frame to share code - * with the int 0x80 path. - */ + * path below. We set up a complete hardware stack frame to share code + * with the int 0x80 path. + */ ENTRY(ia32_cstar_target) CFI_STARTPROC32 simple CFI_SIGNAL_FRAME - CFI_DEF_CFA rsp,KERNEL_STACK_OFFSET + CFI_DEF_CFA rsp,0 CFI_REGISTER rip,rcx /*CFI_REGISTER rflags,r11*/ + + /* + * Interrupts are off on entry. + * We do not frame this tiny irq-off block with TRACE_IRQS_OFF/ON, + * it is too small to ever cause noticeable irq latency. + */ SWAPGS_UNSAFE_STACK movl %esp,%r8d CFI_REGISTER rsp,r8 movq PER_CPU_VAR(kernel_stack),%rsp - /* - * No need to follow this irqs on/off section: the syscall - * disabled irqs and here we enable it straight after entry: - */ ENABLE_INTERRUPTS(CLBR_NONE) - SAVE_ARGS 8,0,0 - movl %eax,%eax /* zero extension */ - movq %rax,ORIG_RAX-ARGOFFSET(%rsp) - movq %rcx,RIP-ARGOFFSET(%rsp) - CFI_REL_OFFSET rip,RIP-ARGOFFSET - movq %rbp,RCX-ARGOFFSET(%rsp) /* this lies slightly to ptrace */ + + /* Zero-extending 32-bit regs, do not remove */ + movl %eax,%eax + + /* Construct struct pt_regs on stack */ + pushq_cfi $__USER32_DS /* pt_regs->ss */ + pushq_cfi %r8 /* pt_regs->sp */ + CFI_REL_OFFSET rsp,0 + pushq_cfi %r11 /* pt_regs->flags */ + pushq_cfi $__USER32_CS /* pt_regs->cs */ + pushq_cfi %rcx /* pt_regs->ip */ + CFI_REL_OFFSET rip,0 + pushq_cfi_reg rax /* pt_regs->orig_ax */ + pushq_cfi_reg rdi /* pt_regs->di */ + pushq_cfi_reg rsi /* pt_regs->si */ + pushq_cfi_reg rdx /* pt_regs->dx */ + pushq_cfi_reg rbp /* pt_regs->cx */ movl %ebp,%ecx - movq $__USER32_CS,CS-ARGOFFSET(%rsp) - movq $__USER32_DS,SS-ARGOFFSET(%rsp) - movq %r11,EFLAGS-ARGOFFSET(%rsp) - /*CFI_REL_OFFSET rflags,EFLAGS-ARGOFFSET*/ - movq %r8,RSP-ARGOFFSET(%rsp) - CFI_REL_OFFSET rsp,RSP-ARGOFFSET - /* no need to do an access_ok check here because r8 has been - 32bit zero extended */ - /* hardware stack frame is complete now */ + pushq_cfi_reg rax /* pt_regs->ax */ + sub $(10*8),%rsp /* pt_regs->r8-11,bp,bx,r12-15 not saved */ + CFI_ADJUST_CFA_OFFSET 10*8 + + /* + * no need to do an access_ok check here because r8 has been + * 32bit zero extended + */ ASM_STAC 1: movl (%r8),%r9d _ASM_EXTABLE(1b,ia32_badarg) ASM_CLAC - orl $TS_COMPAT,TI_status+THREAD_INFO(%rsp,RIP-ARGOFFSET) - testl $_TIF_WORK_SYSCALL_ENTRY,TI_flags+THREAD_INFO(%rsp,RIP-ARGOFFSET) + orl $TS_COMPAT, ASM_THREAD_INFO(TI_status, %rsp, SIZEOF_PTREGS) + testl $_TIF_WORK_SYSCALL_ENTRY, ASM_THREAD_INFO(TI_flags, %rsp, SIZEOF_PTREGS) CFI_REMEMBER_STATE jnz cstar_tracesys cmpq $IA32_NR_syscalls-1,%rax ja ia32_badsys cstar_do_call: - IA32_ARG_FIXUP 1 + /* 32bit syscall -> 64bit C ABI argument conversion */ + movl %edi,%r8d /* arg5 */ + /* r9 already loaded */ /* arg6 */ + xchg %ecx,%esi /* rsi:arg2, rcx:arg4 */ + movl %ebx,%edi /* arg1 */ + movl %edx,%edx /* arg3 (zero extension) */ cstar_dispatch: call *ia32_sys_call_table(,%rax,8) - movq %rax,RAX-ARGOFFSET(%rsp) + movq %rax,RAX(%rsp) DISABLE_INTERRUPTS(CLBR_NONE) TRACE_IRQS_OFF - testl $_TIF_ALLWORK_MASK,TI_flags+THREAD_INFO(%rsp,RIP-ARGOFFSET) + testl $_TIF_ALLWORK_MASK, ASM_THREAD_INFO(TI_flags, %rsp, SIZEOF_PTREGS) jnz sysretl_audit sysretl_from_sys_call: - andl $~TS_COMPAT,TI_status+THREAD_INFO(%rsp,RIP-ARGOFFSET) - RESTORE_ARGS 0,-ARG_SKIP,0,0,0 - movl RIP-ARGOFFSET(%rsp),%ecx + andl $~TS_COMPAT, ASM_THREAD_INFO(TI_status, %rsp, SIZEOF_PTREGS) + RESTORE_RSI_RDI_RDX + movl RIP(%rsp),%ecx CFI_REGISTER rip,rcx - movl EFLAGS-ARGOFFSET(%rsp),%r11d + movl EFLAGS(%rsp),%r11d /*CFI_REGISTER rflags,r11*/ xorq %r10,%r10 xorq %r9,%r9 xorq %r8,%r8 TRACE_IRQS_ON - movl RSP-ARGOFFSET(%rsp),%esp + movl RSP(%rsp),%esp CFI_RESTORE rsp + /* + * 64bit->32bit SYSRET restores eip from ecx, + * eflags from r11 (but RF and VM bits are forced to 0), + * cs and ss are loaded from MSRs. + * (Note: 32bit->32bit SYSRET is different: since r11 + * does not exist, it merely sets eflags.IF=1). + */ USERGS_SYSRET32 - + #ifdef CONFIG_AUDITSYSCALL cstar_auditsys: CFI_RESTORE_STATE - movl %r9d,R9-ARGOFFSET(%rsp) /* register to be clobbered by call */ + movl %r9d,R9(%rsp) /* register to be clobbered by call */ auditsys_entry_common - movl R9-ARGOFFSET(%rsp),%r9d /* reload 6th syscall arg */ + movl R9(%rsp),%r9d /* reload 6th syscall arg */ jmp cstar_dispatch sysretl_audit: @@ -368,17 +444,17 @@ sysretl_audit: cstar_tracesys: #ifdef CONFIG_AUDITSYSCALL - testl $(_TIF_WORK_SYSCALL_ENTRY & ~_TIF_SYSCALL_AUDIT),TI_flags+THREAD_INFO(%rsp,RIP-ARGOFFSET) + testl $(_TIF_WORK_SYSCALL_ENTRY & ~_TIF_SYSCALL_AUDIT), ASM_THREAD_INFO(TI_flags, %rsp, SIZEOF_PTREGS) jz cstar_auditsys #endif xchgl %r9d,%ebp - SAVE_REST - CLEAR_RREGS 0, r9 + SAVE_EXTRA_REGS + CLEAR_RREGS r9 movq $-ENOSYS,RAX(%rsp) /* ptrace can change this for a bad syscall */ movq %rsp,%rdi /* &pt_regs -> arg1 */ call syscall_trace_enter - LOAD_ARGS32 ARGOFFSET, 1 /* reload args from stack in case ptrace changed it */ - RESTORE_REST + LOAD_ARGS32 1 /* reload args from stack in case ptrace changed it */ + RESTORE_EXTRA_REGS xchgl %ebp,%r9d cmpq $(IA32_NR_syscalls-1),%rax ja int_ret_from_sys_call /* cstar_tracesys has set RAX(%rsp) */ @@ -391,78 +467,94 @@ ia32_badarg: jmp ia32_sysret CFI_ENDPROC -/* - * Emulated IA32 system calls via int 0x80. +/* + * Emulated IA32 system calls via int 0x80. * - * Arguments: - * %eax System call number. - * %ebx Arg1 - * %ecx Arg2 - * %edx Arg3 - * %esi Arg4 - * %edi Arg5 - * %ebp Arg6 [note: not saved in the stack frame, should not be touched] + * Arguments: + * eax system call number + * ebx arg1 + * ecx arg2 + * edx arg3 + * esi arg4 + * edi arg5 + * ebp arg6 (note: not saved in the stack frame, should not be touched) * * Notes: - * Uses the same stack frame as the x86-64 version. - * All registers except %eax must be saved (but ptrace may violate that) + * Uses the same stack frame as the x86-64 version. + * All registers except eax must be saved (but ptrace may violate that). * Arguments are zero extended. For system calls that want sign extension and * take long arguments a wrapper is needed. Most calls can just be called * directly. - * Assumes it is only called from user space and entered with interrupts off. - */ + * Assumes it is only called from user space and entered with interrupts off. + */ ENTRY(ia32_syscall) CFI_STARTPROC32 simple CFI_SIGNAL_FRAME - CFI_DEF_CFA rsp,SS+8-RIP - /*CFI_REL_OFFSET ss,SS-RIP*/ - CFI_REL_OFFSET rsp,RSP-RIP - /*CFI_REL_OFFSET rflags,EFLAGS-RIP*/ - /*CFI_REL_OFFSET cs,CS-RIP*/ - CFI_REL_OFFSET rip,RIP-RIP - PARAVIRT_ADJUST_EXCEPTION_FRAME - SWAPGS + CFI_DEF_CFA rsp,5*8 + /*CFI_REL_OFFSET ss,4*8 */ + CFI_REL_OFFSET rsp,3*8 + /*CFI_REL_OFFSET rflags,2*8 */ + /*CFI_REL_OFFSET cs,1*8 */ + CFI_REL_OFFSET rip,0*8 + /* - * No need to follow this irqs on/off section: the syscall - * disabled irqs and here we enable it straight after entry: + * Interrupts are off on entry. + * We do not frame this tiny irq-off block with TRACE_IRQS_OFF/ON, + * it is too small to ever cause noticeable irq latency. */ + PARAVIRT_ADJUST_EXCEPTION_FRAME + SWAPGS ENABLE_INTERRUPTS(CLBR_NONE) - movl %eax,%eax - pushq_cfi %rax + + /* Zero-extending 32-bit regs, do not remove */ + movl %eax,%eax + + /* Construct struct pt_regs on stack (iret frame is already on stack) */ + pushq_cfi_reg rax /* pt_regs->orig_ax */ + pushq_cfi_reg rdi /* pt_regs->di */ + pushq_cfi_reg rsi /* pt_regs->si */ + pushq_cfi_reg rdx /* pt_regs->dx */ + pushq_cfi_reg rcx /* pt_regs->cx */ + pushq_cfi_reg rax /* pt_regs->ax */ cld - /* note the registers are not zero extended to the sf. - this could be a problem. */ - SAVE_ARGS 0,1,0 - orl $TS_COMPAT,TI_status+THREAD_INFO(%rsp,RIP-ARGOFFSET) - testl $_TIF_WORK_SYSCALL_ENTRY,TI_flags+THREAD_INFO(%rsp,RIP-ARGOFFSET) + sub $(10*8),%rsp /* pt_regs->r8-11,bp,bx,r12-15 not saved */ + CFI_ADJUST_CFA_OFFSET 10*8 + + orl $TS_COMPAT, ASM_THREAD_INFO(TI_status, %rsp, SIZEOF_PTREGS) + testl $_TIF_WORK_SYSCALL_ENTRY, ASM_THREAD_INFO(TI_flags, %rsp, SIZEOF_PTREGS) jnz ia32_tracesys cmpq $(IA32_NR_syscalls-1),%rax ja ia32_badsys ia32_do_call: - IA32_ARG_FIXUP + /* 32bit syscall -> 64bit C ABI argument conversion */ + movl %edi,%r8d /* arg5 */ + movl %ebp,%r9d /* arg6 */ + xchg %ecx,%esi /* rsi:arg2, rcx:arg4 */ + movl %ebx,%edi /* arg1 */ + movl %edx,%edx /* arg3 (zero extension) */ call *ia32_sys_call_table(,%rax,8) # xxx: rip relative ia32_sysret: - movq %rax,RAX-ARGOFFSET(%rsp) + movq %rax,RAX(%rsp) ia32_ret_from_sys_call: - CLEAR_RREGS -ARGOFFSET - jmp int_ret_from_sys_call + CLEAR_RREGS + jmp int_ret_from_sys_call -ia32_tracesys: - SAVE_REST +ia32_tracesys: + SAVE_EXTRA_REGS CLEAR_RREGS movq $-ENOSYS,RAX(%rsp) /* ptrace can change this for a bad syscall */ movq %rsp,%rdi /* &pt_regs -> arg1 */ call syscall_trace_enter - LOAD_ARGS32 ARGOFFSET /* reload args from stack in case ptrace changed it */ - RESTORE_REST + LOAD_ARGS32 /* reload args from stack in case ptrace changed it */ + RESTORE_EXTRA_REGS cmpq $(IA32_NR_syscalls-1),%rax ja int_ret_from_sys_call /* ia32_tracesys has set RAX(%rsp) */ jmp ia32_do_call END(ia32_syscall) ia32_badsys: - movq $0,ORIG_RAX-ARGOFFSET(%rsp) + movq $0,ORIG_RAX(%rsp) movq $-ENOSYS,%rax jmp ia32_sysret @@ -479,8 +571,6 @@ GLOBAL(\label) PTREGSCALL stub32_rt_sigreturn, sys32_rt_sigreturn PTREGSCALL stub32_sigreturn, sys32_sigreturn - PTREGSCALL stub32_execve, compat_sys_execve - PTREGSCALL stub32_execveat, compat_sys_execveat PTREGSCALL stub32_fork, sys_fork PTREGSCALL stub32_vfork, sys_vfork @@ -492,24 +582,23 @@ GLOBAL(stub32_clone) ALIGN ia32_ptregs_common: - popq %r11 CFI_ENDPROC CFI_STARTPROC32 simple CFI_SIGNAL_FRAME - CFI_DEF_CFA rsp,SS+8-ARGOFFSET - CFI_REL_OFFSET rax,RAX-ARGOFFSET - CFI_REL_OFFSET rcx,RCX-ARGOFFSET - CFI_REL_OFFSET rdx,RDX-ARGOFFSET - CFI_REL_OFFSET rsi,RSI-ARGOFFSET - CFI_REL_OFFSET rdi,RDI-ARGOFFSET - CFI_REL_OFFSET rip,RIP-ARGOFFSET -/* CFI_REL_OFFSET cs,CS-ARGOFFSET*/ -/* CFI_REL_OFFSET rflags,EFLAGS-ARGOFFSET*/ - CFI_REL_OFFSET rsp,RSP-ARGOFFSET -/* CFI_REL_OFFSET ss,SS-ARGOFFSET*/ - SAVE_REST + CFI_DEF_CFA rsp,SIZEOF_PTREGS + CFI_REL_OFFSET rax,RAX + CFI_REL_OFFSET rcx,RCX + CFI_REL_OFFSET rdx,RDX + CFI_REL_OFFSET rsi,RSI + CFI_REL_OFFSET rdi,RDI + CFI_REL_OFFSET rip,RIP +/* CFI_REL_OFFSET cs,CS*/ +/* CFI_REL_OFFSET rflags,EFLAGS*/ + CFI_REL_OFFSET rsp,RSP +/* CFI_REL_OFFSET ss,SS*/ + SAVE_EXTRA_REGS 8 call *%rax - RESTORE_REST - jmp ia32_sysret /* misbalances the return cache */ + RESTORE_EXTRA_REGS 8 + ret CFI_ENDPROC END(ia32_ptregs_common) diff --git a/arch/x86/ia32/nosyscall.c b/arch/x86/ia32/nosyscall.c deleted file mode 100644 index 51ecd5b4e787..000000000000 --- a/arch/x86/ia32/nosyscall.c +++ /dev/null @@ -1,7 +0,0 @@ -#include <linux/kernel.h> -#include <linux/errno.h> - -long compat_ni_syscall(void) -{ - return -ENOSYS; -} diff --git a/arch/x86/ia32/sys_ia32.c b/arch/x86/ia32/sys_ia32.c index 8e0ceecdc957..719cd702b0a4 100644 --- a/arch/x86/ia32/sys_ia32.c +++ b/arch/x86/ia32/sys_ia32.c @@ -201,20 +201,6 @@ long sys32_fadvise64_64(int fd, __u32 offset_low, __u32 offset_high, advice); } -long sys32_vm86_warning(void) -{ - struct task_struct *me = current; - static char lastcomm[sizeof(me->comm)]; - - if (strncmp(lastcomm, me->comm, sizeof(lastcomm))) { - compat_printk(KERN_INFO - "%s: vm86 mode not supported on 64 bit kernel\n", - me->comm); - strncpy(lastcomm, me->comm, sizeof(lastcomm)); - } - return -ENOSYS; -} - asmlinkage ssize_t sys32_readahead(int fd, unsigned off_lo, unsigned off_hi, size_t count) { diff --git a/arch/x86/ia32/syscall_ia32.c b/arch/x86/ia32/syscall_ia32.c deleted file mode 100644 index 4754ba0f5d9f..000000000000 --- a/arch/x86/ia32/syscall_ia32.c +++ /dev/null @@ -1,25 +0,0 @@ -/* System call table for ia32 emulation. */ - -#include <linux/linkage.h> -#include <linux/sys.h> -#include <linux/cache.h> -#include <asm/asm-offsets.h> - -#define __SYSCALL_I386(nr, sym, compat) extern asmlinkage void compat(void) ; -#include <asm/syscalls_32.h> -#undef __SYSCALL_I386 - -#define __SYSCALL_I386(nr, sym, compat) [nr] = compat, - -typedef void (*sys_call_ptr_t)(void); - -extern void compat_ni_syscall(void); - -const sys_call_ptr_t ia32_sys_call_table[__NR_ia32_syscall_max+1] = { - /* - * Smells like a compiler bug -- it doesn't work - * when the & below is removed. - */ - [0 ... __NR_ia32_syscall_max] = &compat_ni_syscall, -#include <asm/syscalls_32.h> -}; diff --git a/arch/x86/include/asm/alternative-asm.h b/arch/x86/include/asm/alternative-asm.h index 372231c22a47..bdf02eeee765 100644 --- a/arch/x86/include/asm/alternative-asm.h +++ b/arch/x86/include/asm/alternative-asm.h @@ -18,12 +18,63 @@ .endm #endif -.macro altinstruction_entry orig alt feature orig_len alt_len +.macro altinstruction_entry orig alt feature orig_len alt_len pad_len .long \orig - . .long \alt - . .word \feature .byte \orig_len .byte \alt_len + .byte \pad_len +.endm + +.macro ALTERNATIVE oldinstr, newinstr, feature +140: + \oldinstr +141: + .skip -(((144f-143f)-(141b-140b)) > 0) * ((144f-143f)-(141b-140b)),0x90 +142: + + .pushsection .altinstructions,"a" + altinstruction_entry 140b,143f,\feature,142b-140b,144f-143f,142b-141b + .popsection + + .pushsection .altinstr_replacement,"ax" +143: + \newinstr +144: + .popsection +.endm + +#define old_len 141b-140b +#define new_len1 144f-143f +#define new_len2 145f-144f + +/* + * max without conditionals. Idea adapted from: + * http://graphics.stanford.edu/~seander/bithacks.html#IntegerMinOrMax + */ +#define alt_max_short(a, b) ((a) ^ (((a) ^ (b)) & -(-((a) < (b))))) + +.macro ALTERNATIVE_2 oldinstr, newinstr1, feature1, newinstr2, feature2 +140: + \oldinstr +141: + .skip -((alt_max_short(new_len1, new_len2) - (old_len)) > 0) * \ + (alt_max_short(new_len1, new_len2) - (old_len)),0x90 +142: + + .pushsection .altinstructions,"a" + altinstruction_entry 140b,143f,\feature1,142b-140b,144f-143f,142b-141b + altinstruction_entry 140b,144f,\feature2,142b-140b,145f-144f,142b-141b + .popsection + + .pushsection .altinstr_replacement,"ax" +143: + \newinstr1 +144: + \newinstr2 +145: + .popsection .endm #endif /* __ASSEMBLY__ */ diff --git a/arch/x86/include/asm/alternative.h b/arch/x86/include/asm/alternative.h index 473bdbee378a..ba32af062f61 100644 --- a/arch/x86/include/asm/alternative.h +++ b/arch/x86/include/asm/alternative.h @@ -48,8 +48,9 @@ struct alt_instr { s32 repl_offset; /* offset to replacement instruction */ u16 cpuid; /* cpuid bit set for replacement */ u8 instrlen; /* length of original instruction */ - u8 replacementlen; /* length of new instruction, <= instrlen */ -}; + u8 replacementlen; /* length of new instruction */ + u8 padlen; /* length of build-time padding */ +} __packed; extern void alternative_instructions(void); extern void apply_alternatives(struct alt_instr *start, struct alt_instr *end); @@ -76,50 +77,69 @@ static inline int alternatives_text_reserved(void *start, void *end) } #endif /* CONFIG_SMP */ -#define OLDINSTR(oldinstr) "661:\n\t" oldinstr "\n662:\n" +#define b_replacement(num) "664"#num +#define e_replacement(num) "665"#num -#define b_replacement(number) "663"#number -#define e_replacement(number) "664"#number +#define alt_end_marker "663" +#define alt_slen "662b-661b" +#define alt_pad_len alt_end_marker"b-662b" +#define alt_total_slen alt_end_marker"b-661b" +#define alt_rlen(num) e_replacement(num)"f-"b_replacement(num)"f" -#define alt_slen "662b-661b" -#define alt_rlen(number) e_replacement(number)"f-"b_replacement(number)"f" +#define __OLDINSTR(oldinstr, num) \ + "661:\n\t" oldinstr "\n662:\n" \ + ".skip -(((" alt_rlen(num) ")-(" alt_slen ")) > 0) * " \ + "((" alt_rlen(num) ")-(" alt_slen ")),0x90\n" -#define ALTINSTR_ENTRY(feature, number) \ +#define OLDINSTR(oldinstr, num) \ + __OLDINSTR(oldinstr, num) \ + alt_end_marker ":\n" + +/* + * max without conditionals. Idea adapted from: + * http://graphics.stanford.edu/~seander/bithacks.html#IntegerMinOrMax + * + * The additional "-" is needed because gas works with s32s. + */ +#define alt_max_short(a, b) "((" a ") ^ (((" a ") ^ (" b ")) & -(-((" a ") - (" b ")))))" + +/* + * Pad the second replacement alternative with additional NOPs if it is + * additionally longer than the first replacement alternative. + */ +#define OLDINSTR_2(oldinstr, num1, num2) \ + "661:\n\t" oldinstr "\n662:\n" \ + ".skip -((" alt_max_short(alt_rlen(num1), alt_rlen(num2)) " - (" alt_slen ")) > 0) * " \ + "(" alt_max_short(alt_rlen(num1), alt_rlen(num2)) " - (" alt_slen ")), 0x90\n" \ + alt_end_marker ":\n" + +#define ALTINSTR_ENTRY(feature, num) \ " .long 661b - .\n" /* label */ \ - " .long " b_replacement(number)"f - .\n" /* new instruction */ \ + " .long " b_replacement(num)"f - .\n" /* new instruction */ \ " .word " __stringify(feature) "\n" /* feature bit */ \ - " .byte " alt_slen "\n" /* source len */ \ - " .byte " alt_rlen(number) "\n" /* replacement len */ - -#define DISCARD_ENTRY(number) /* rlen <= slen */ \ - " .byte 0xff + (" alt_rlen(number) ") - (" alt_slen ")\n" + " .byte " alt_total_slen "\n" /* source len */ \ + " .byte " alt_rlen(num) "\n" /* replacement len */ \ + " .byte " alt_pad_len "\n" /* pad len */ -#define ALTINSTR_REPLACEMENT(newinstr, feature, number) /* replacement */ \ - b_replacement(number)":\n\t" newinstr "\n" e_replacement(number) ":\n\t" +#define ALTINSTR_REPLACEMENT(newinstr, feature, num) /* replacement */ \ + b_replacement(num)":\n\t" newinstr "\n" e_replacement(num) ":\n\t" /* alternative assembly primitive: */ #define ALTERNATIVE(oldinstr, newinstr, feature) \ - OLDINSTR(oldinstr) \ + OLDINSTR(oldinstr, 1) \ ".pushsection .altinstructions,\"a\"\n" \ ALTINSTR_ENTRY(feature, 1) \ ".popsection\n" \ - ".pushsection .discard,\"aw\",@progbits\n" \ - DISCARD_ENTRY(1) \ - ".popsection\n" \ ".pushsection .altinstr_replacement, \"ax\"\n" \ ALTINSTR_REPLACEMENT(newinstr, feature, 1) \ ".popsection" #define ALTERNATIVE_2(oldinstr, newinstr1, feature1, newinstr2, feature2)\ - OLDINSTR(oldinstr) \ + OLDINSTR_2(oldinstr, 1, 2) \ ".pushsection .altinstructions,\"a\"\n" \ ALTINSTR_ENTRY(feature1, 1) \ ALTINSTR_ENTRY(feature2, 2) \ ".popsection\n" \ - ".pushsection .discard,\"aw\",@progbits\n" \ - DISCARD_ENTRY(1) \ - DISCARD_ENTRY(2) \ - ".popsection\n" \ ".pushsection .altinstr_replacement, \"ax\"\n" \ ALTINSTR_REPLACEMENT(newinstr1, feature1, 1) \ ALTINSTR_REPLACEMENT(newinstr2, feature2, 2) \ @@ -146,6 +166,9 @@ static inline int alternatives_text_reserved(void *start, void *end) #define alternative(oldinstr, newinstr, feature) \ asm volatile (ALTERNATIVE(oldinstr, newinstr, feature) : : : "memory") +#define alternative_2(oldinstr, newinstr1, feature1, newinstr2, feature2) \ + asm volatile(ALTERNATIVE_2(oldinstr, newinstr1, feature1, newinstr2, feature2) ::: "memory") + /* * Alternative inline assembly with input. * diff --git a/arch/x86/include/asm/apic.h b/arch/x86/include/asm/apic.h index efc3b22d896e..976b86a325e5 100644 --- a/arch/x86/include/asm/apic.h +++ b/arch/x86/include/asm/apic.h @@ -91,7 +91,7 @@ static inline void native_apic_mem_write(u32 reg, u32 v) { volatile u32 *addr = (volatile u32 *)(APIC_BASE + reg); - alternative_io("movl %0, %1", "xchgl %0, %1", X86_BUG_11AP, + alternative_io("movl %0, %P1", "xchgl %0, %P1", X86_BUG_11AP, ASM_OUTPUT2("=r" (v), "=m" (*addr)), ASM_OUTPUT2("0" (v), "m" (*addr))); } @@ -204,7 +204,6 @@ extern void clear_local_APIC(void); extern void disconnect_bsp_APIC(int virt_wire_setup); extern void disable_local_APIC(void); extern void lapic_shutdown(void); -extern int verify_local_APIC(void); extern void sync_Arb_IDs(void); extern void init_bsp_APIC(void); extern void setup_local_APIC(void); diff --git a/arch/x86/include/asm/barrier.h b/arch/x86/include/asm/barrier.h index 2ab1eb33106e..959e45b81fe2 100644 --- a/arch/x86/include/asm/barrier.h +++ b/arch/x86/include/asm/barrier.h @@ -95,13 +95,11 @@ do { \ * Stop RDTSC speculation. This is needed when you need to use RDTSC * (or get_cycles or vread that possibly accesses the TSC) in a defined * code region. - * - * (Could use an alternative three way for this if there was one.) */ static __always_inline void rdtsc_barrier(void) { - alternative(ASM_NOP3, "mfence", X86_FEATURE_MFENCE_RDTSC); - alternative(ASM_NOP3, "lfence", X86_FEATURE_LFENCE_RDTSC); + alternative_2("", "mfence", X86_FEATURE_MFENCE_RDTSC, + "lfence", X86_FEATURE_LFENCE_RDTSC); } #endif /* _ASM_X86_BARRIER_H */ diff --git a/arch/x86/include/asm/calling.h b/arch/x86/include/asm/calling.h index 1f1297b46f83..1c8b50edb2db 100644 --- a/arch/x86/include/asm/calling.h +++ b/arch/x86/include/asm/calling.h @@ -55,143 +55,157 @@ For 32-bit we have the following conventions - kernel is built with * for assembly code: */ -#define R15 0 -#define R14 8 -#define R13 16 -#define R12 24 -#define RBP 32 -#define RBX 40 - -/* arguments: interrupts/non tracing syscalls only save up to here: */ -#define R11 48 -#define R10 56 -#define R9 64 -#define R8 72 -#define RAX 80 -#define RCX 88 -#define RDX 96 -#define RSI 104 -#define RDI 112 -#define ORIG_RAX 120 /* + error_code */ -/* end of arguments */ - -/* cpu exception frame or undefined in case of fast syscall: */ -#define RIP 128 -#define CS 136 -#define EFLAGS 144 -#define RSP 152 -#define SS 160 - -#define ARGOFFSET R11 - - .macro SAVE_ARGS addskip=0, save_rcx=1, save_r891011=1, rax_enosys=0 - subq $9*8+\addskip, %rsp - CFI_ADJUST_CFA_OFFSET 9*8+\addskip - movq_cfi rdi, 8*8 - movq_cfi rsi, 7*8 - movq_cfi rdx, 6*8 - - .if \save_rcx - movq_cfi rcx, 5*8 - .endif +/* The layout forms the "struct pt_regs" on the stack: */ +/* + * C ABI says these regs are callee-preserved. They aren't saved on kernel entry + * unless syscall needs a complete, fully filled "struct pt_regs". + */ +#define R15 0*8 +#define R14 1*8 +#define R13 2*8 +#define R12 3*8 +#define RBP 4*8 +#define RBX 5*8 +/* These regs are callee-clobbered. Always saved on kernel entry. */ +#define R11 6*8 +#define R10 7*8 +#define R9 8*8 +#define R8 9*8 +#define RAX 10*8 +#define RCX 11*8 +#define RDX 12*8 +#define RSI 13*8 +#define RDI 14*8 +/* + * On syscall entry, this is syscall#. On CPU exception, this is error code. + * On hw interrupt, it's IRQ number: + */ +#define ORIG_RAX 15*8 +/* Return frame for iretq */ +#define RIP 16*8 +#define CS 17*8 +#define EFLAGS 18*8 +#define RSP 19*8 +#define SS 20*8 + +#define SIZEOF_PTREGS 21*8 + + .macro ALLOC_PT_GPREGS_ON_STACK addskip=0 + subq $15*8+\addskip, %rsp + CFI_ADJUST_CFA_OFFSET 15*8+\addskip + .endm - .if \rax_enosys - movq $-ENOSYS, 4*8(%rsp) - .else - movq_cfi rax, 4*8 + .macro SAVE_C_REGS_HELPER offset=0 rax=1 rcx=1 r8910=1 r11=1 + .if \r11 + movq_cfi r11, 6*8+\offset .endif - - .if \save_r891011 - movq_cfi r8, 3*8 - movq_cfi r9, 2*8 - movq_cfi r10, 1*8 - movq_cfi r11, 0*8 + .if \r8910 + movq_cfi r10, 7*8+\offset + movq_cfi r9, 8*8+\offset + movq_cfi r8, 9*8+\offset + .endif + .if \rax + movq_cfi rax, 10*8+\offset + .endif + .if \rcx + movq_cfi rcx, 11*8+\offset .endif + movq_cfi rdx, 12*8+\offset + movq_cfi rsi, 13*8+\offset + movq_cfi rdi, 14*8+\offset + .endm + .macro SAVE_C_REGS offset=0 + SAVE_C_REGS_HELPER \offset, 1, 1, 1, 1 + .endm + .macro SAVE_C_REGS_EXCEPT_RAX_RCX offset=0 + SAVE_C_REGS_HELPER \offset, 0, 0, 1, 1 + .endm + .macro SAVE_C_REGS_EXCEPT_R891011 + SAVE_C_REGS_HELPER 0, 1, 1, 0, 0 + .endm + .macro SAVE_C_REGS_EXCEPT_RCX_R891011 + SAVE_C_REGS_HELPER 0, 1, 0, 0, 0 + .endm + .macro SAVE_C_REGS_EXCEPT_RAX_RCX_R11 + SAVE_C_REGS_HELPER 0, 0, 0, 1, 0 + .endm + + .macro SAVE_EXTRA_REGS offset=0 + movq_cfi r15, 0*8+\offset + movq_cfi r14, 1*8+\offset + movq_cfi r13, 2*8+\offset + movq_cfi r12, 3*8+\offset + movq_cfi rbp, 4*8+\offset + movq_cfi rbx, 5*8+\offset + .endm + .macro SAVE_EXTRA_REGS_RBP offset=0 + movq_cfi rbp, 4*8+\offset + .endm + .macro RESTORE_EXTRA_REGS offset=0 + movq_cfi_restore 0*8+\offset, r15 + movq_cfi_restore 1*8+\offset, r14 + movq_cfi_restore 2*8+\offset, r13 + movq_cfi_restore 3*8+\offset, r12 + movq_cfi_restore 4*8+\offset, rbp + movq_cfi_restore 5*8+\offset, rbx .endm -#define ARG_SKIP (9*8) + .macro ZERO_EXTRA_REGS + xorl %r15d, %r15d + xorl %r14d, %r14d + xorl %r13d, %r13d + xorl %r12d, %r12d + xorl %ebp, %ebp + xorl %ebx, %ebx + .endm - .macro RESTORE_ARGS rstor_rax=1, addskip=0, rstor_rcx=1, rstor_r11=1, \ - rstor_r8910=1, rstor_rdx=1 + .macro RESTORE_C_REGS_HELPER rstor_rax=1, rstor_rcx=1, rstor_r11=1, rstor_r8910=1, rstor_rdx=1 .if \rstor_r11 - movq_cfi_restore 0*8, r11 + movq_cfi_restore 6*8, r11 .endif - .if \rstor_r8910 - movq_cfi_restore 1*8, r10 - movq_cfi_restore 2*8, r9 - movq_cfi_restore 3*8, r8 + movq_cfi_restore 7*8, r10 + movq_cfi_restore 8*8, r9 + movq_cfi_restore 9*8, r8 .endif - .if \rstor_rax - movq_cfi_restore 4*8, rax + movq_cfi_restore 10*8, rax .endif - .if \rstor_rcx - movq_cfi_restore 5*8, rcx + movq_cfi_restore 11*8, rcx .endif - .if \rstor_rdx - movq_cfi_restore 6*8, rdx - .endif - - movq_cfi_restore 7*8, rsi - movq_cfi_restore 8*8, rdi - - .if ARG_SKIP+\addskip > 0 - addq $ARG_SKIP+\addskip, %rsp - CFI_ADJUST_CFA_OFFSET -(ARG_SKIP+\addskip) + movq_cfi_restore 12*8, rdx .endif + movq_cfi_restore 13*8, rsi + movq_cfi_restore 14*8, rdi .endm - - .macro LOAD_ARGS offset, skiprax=0 - movq \offset(%rsp), %r11 - movq \offset+8(%rsp), %r10 - movq \offset+16(%rsp), %r9 - movq \offset+24(%rsp), %r8 - movq \offset+40(%rsp), %rcx - movq \offset+48(%rsp), %rdx - movq \offset+56(%rsp), %rsi - movq \offset+64(%rsp), %rdi - .if \skiprax - .else - movq \offset+72(%rsp), %rax - .endif + .macro RESTORE_C_REGS + RESTORE_C_REGS_HELPER 1,1,1,1,1 .endm - -#define REST_SKIP (6*8) - - .macro SAVE_REST - subq $REST_SKIP, %rsp - CFI_ADJUST_CFA_OFFSET REST_SKIP - movq_cfi rbx, 5*8 - movq_cfi rbp, 4*8 - movq_cfi r12, 3*8 - movq_cfi r13, 2*8 - movq_cfi r14, 1*8 - movq_cfi r15, 0*8 + .macro RESTORE_C_REGS_EXCEPT_RAX + RESTORE_C_REGS_HELPER 0,1,1,1,1 .endm - - .macro RESTORE_REST - movq_cfi_restore 0*8, r15 - movq_cfi_restore 1*8, r14 - movq_cfi_restore 2*8, r13 - movq_cfi_restore 3*8, r12 - movq_cfi_restore 4*8, rbp - movq_cfi_restore 5*8, rbx - addq $REST_SKIP, %rsp - CFI_ADJUST_CFA_OFFSET -(REST_SKIP) + .macro RESTORE_C_REGS_EXCEPT_RCX + RESTORE_C_REGS_HELPER 1,0,1,1,1 .endm - - .macro SAVE_ALL - SAVE_ARGS - SAVE_REST + .macro RESTORE_C_REGS_EXCEPT_R11 + RESTORE_C_REGS_HELPER 1,1,0,1,1 + .endm + .macro RESTORE_C_REGS_EXCEPT_RCX_R11 + RESTORE_C_REGS_HELPER 1,0,0,1,1 + .endm + .macro RESTORE_RSI_RDI + RESTORE_C_REGS_HELPER 0,0,0,0,0 + .endm + .macro RESTORE_RSI_RDI_RDX + RESTORE_C_REGS_HELPER 0,0,0,0,1 .endm - .macro RESTORE_ALL addskip=0 - RESTORE_REST - RESTORE_ARGS 1, \addskip + .macro REMOVE_PT_GPREGS_FROM_STACK addskip=0 + addq $15*8+\addskip, %rsp + CFI_ADJUST_CFA_OFFSET -(15*8+\addskip) .endm .macro icebp @@ -210,37 +224,23 @@ For 32-bit we have the following conventions - kernel is built with */ .macro SAVE_ALL - pushl_cfi %eax - CFI_REL_OFFSET eax, 0 - pushl_cfi %ebp - CFI_REL_OFFSET ebp, 0 - pushl_cfi %edi - CFI_REL_OFFSET edi, 0 - pushl_cfi %esi - CFI_REL_OFFSET esi, 0 - pushl_cfi %edx - CFI_REL_OFFSET edx, 0 - pushl_cfi %ecx - CFI_REL_OFFSET ecx, 0 - pushl_cfi %ebx - CFI_REL_OFFSET ebx, 0 + pushl_cfi_reg eax + pushl_cfi_reg ebp + pushl_cfi_reg edi + pushl_cfi_reg esi + pushl_cfi_reg edx + pushl_cfi_reg ecx + pushl_cfi_reg ebx .endm .macro RESTORE_ALL - popl_cfi %ebx - CFI_RESTORE ebx - popl_cfi %ecx - CFI_RESTORE ecx - popl_cfi %edx - CFI_RESTORE edx - popl_cfi %esi - CFI_RESTORE esi - popl_cfi %edi - CFI_RESTORE edi - popl_cfi %ebp - CFI_RESTORE ebp - popl_cfi %eax - CFI_RESTORE eax + popl_cfi_reg ebx + popl_cfi_reg ecx + popl_cfi_reg edx + popl_cfi_reg esi + popl_cfi_reg edi + popl_cfi_reg ebp + popl_cfi_reg eax .endm #endif /* CONFIG_X86_64 */ diff --git a/arch/x86/include/asm/compat.h b/arch/x86/include/asm/compat.h index 59c6c401f79f..acdee09228b3 100644 --- a/arch/x86/include/asm/compat.h +++ b/arch/x86/include/asm/compat.h @@ -301,7 +301,7 @@ static inline void __user *arch_compat_alloc_user_space(long len) sp = task_pt_regs(current)->sp; } else { /* -128 for the x32 ABI redzone */ - sp = this_cpu_read(old_rsp) - 128; + sp = task_pt_regs(current)->sp - 128; } return (void __user *)round_down(sp - len, 16); diff --git a/arch/x86/include/asm/cpufeature.h b/arch/x86/include/asm/cpufeature.h index 90a54851aedc..854c04b3c9c2 100644 --- a/arch/x86/include/asm/cpufeature.h +++ b/arch/x86/include/asm/cpufeature.h @@ -231,7 +231,9 @@ #define X86_FEATURE_RDSEED ( 9*32+18) /* The RDSEED instruction */ #define X86_FEATURE_ADX ( 9*32+19) /* The ADCX and ADOX instructions */ #define X86_FEATURE_SMAP ( 9*32+20) /* Supervisor Mode Access Prevention */ +#define X86_FEATURE_PCOMMIT ( 9*32+22) /* PCOMMIT instruction */ #define X86_FEATURE_CLFLUSHOPT ( 9*32+23) /* CLFLUSHOPT instruction */ +#define X86_FEATURE_CLWB ( 9*32+24) /* CLWB instruction */ #define X86_FEATURE_AVX512PF ( 9*32+26) /* AVX-512 Prefetch */ #define X86_FEATURE_AVX512ER ( 9*32+27) /* AVX-512 Exponential and Reciprocal */ #define X86_FEATURE_AVX512CD ( 9*32+28) /* AVX-512 Conflict Detection */ @@ -418,6 +420,7 @@ static __always_inline __pure bool __static_cpu_has(u16 bit) " .word %P0\n" /* 1: do replace */ " .byte 2b - 1b\n" /* source len */ " .byte 0\n" /* replacement len */ + " .byte 0\n" /* pad len */ ".previous\n" /* skipping size check since replacement size = 0 */ : : "i" (X86_FEATURE_ALWAYS) : : t_warn); @@ -432,6 +435,7 @@ static __always_inline __pure bool __static_cpu_has(u16 bit) " .word %P0\n" /* feature bit */ " .byte 2b - 1b\n" /* source len */ " .byte 0\n" /* replacement len */ + " .byte 0\n" /* pad len */ ".previous\n" /* skipping size check since replacement size = 0 */ : : "i" (bit) : : t_no); @@ -457,6 +461,7 @@ static __always_inline __pure bool __static_cpu_has(u16 bit) " .word %P1\n" /* feature bit */ " .byte 2b - 1b\n" /* source len */ " .byte 4f - 3f\n" /* replacement len */ + " .byte 0\n" /* pad len */ ".previous\n" ".section .discard,\"aw\",@progbits\n" " .byte 0xff + (4f-3f) - (2b-1b)\n" /* size check */ @@ -483,31 +488,30 @@ static __always_inline __pure bool __static_cpu_has(u16 bit) static __always_inline __pure bool _static_cpu_has_safe(u16 bit) { #ifdef CC_HAVE_ASM_GOTO -/* - * We need to spell the jumps to the compiler because, depending on the offset, - * the replacement jump can be bigger than the original jump, and this we cannot - * have. Thus, we force the jump to the widest, 4-byte, signed relative - * offset even though the last would often fit in less bytes. - */ - asm_volatile_goto("1: .byte 0xe9\n .long %l[t_dynamic] - 2f\n" + asm_volatile_goto("1: jmp %l[t_dynamic]\n" "2:\n" + ".skip -(((5f-4f) - (2b-1b)) > 0) * " + "((5f-4f) - (2b-1b)),0x90\n" + "3:\n" ".section .altinstructions,\"a\"\n" " .long 1b - .\n" /* src offset */ - " .long 3f - .\n" /* repl offset */ + " .long 4f - .\n" /* repl offset */ " .word %P1\n" /* always replace */ - " .byte 2b - 1b\n" /* src len */ - " .byte 4f - 3f\n" /* repl len */ + " .byte 3b - 1b\n" /* src len */ + " .byte 5f - 4f\n" /* repl len */ + " .byte 3b - 2b\n" /* pad len */ ".previous\n" ".section .altinstr_replacement,\"ax\"\n" - "3: .byte 0xe9\n .long %l[t_no] - 2b\n" - "4:\n" + "4: jmp %l[t_no]\n" + "5:\n" ".previous\n" ".section .altinstructions,\"a\"\n" " .long 1b - .\n" /* src offset */ " .long 0\n" /* no replacement */ " .word %P0\n" /* feature bit */ - " .byte 2b - 1b\n" /* src len */ + " .byte 3b - 1b\n" /* src len */ " .byte 0\n" /* repl len */ + " .byte 0\n" /* pad len */ ".previous\n" : : "i" (bit), "i" (X86_FEATURE_ALWAYS) : : t_dynamic, t_no); @@ -527,6 +531,7 @@ static __always_inline __pure bool _static_cpu_has_safe(u16 bit) " .word %P2\n" /* always replace */ " .byte 2b - 1b\n" /* source len */ " .byte 4f - 3f\n" /* replacement len */ + " .byte 0\n" /* pad len */ ".previous\n" ".section .discard,\"aw\",@progbits\n" " .byte 0xff + (4f-3f) - (2b-1b)\n" /* size check */ @@ -541,6 +546,7 @@ static __always_inline __pure bool _static_cpu_has_safe(u16 bit) " .word %P1\n" /* feature bit */ " .byte 4b - 3b\n" /* src len */ " .byte 6f - 5f\n" /* repl len */ + " .byte 0\n" /* pad len */ ".previous\n" ".section .discard,\"aw\",@progbits\n" " .byte 0xff + (6f-5f) - (4b-3b)\n" /* size check */ diff --git a/arch/x86/include/asm/desc.h b/arch/x86/include/asm/desc.h index a94b82e8f156..a0bf89fd2647 100644 --- a/arch/x86/include/asm/desc.h +++ b/arch/x86/include/asm/desc.h @@ -376,11 +376,16 @@ static inline void _set_gate(int gate, unsigned type, void *addr, * Pentium F0 0F bugfix can have resulted in the mapped * IDT being write-protected. */ -#define set_intr_gate(n, addr) \ +#define set_intr_gate_notrace(n, addr) \ do { \ BUG_ON((unsigned)n > 0xFF); \ _set_gate(n, GATE_INTERRUPT, (void *)addr, 0, 0, \ __KERNEL_CS); \ + } while (0) + +#define set_intr_gate(n, addr) \ + do { \ + set_intr_gate_notrace(n, addr); \ _trace_set_gate(n, GATE_INTERRUPT, (void *)trace_##addr,\ 0, 0, __KERNEL_CS); \ } while (0) diff --git a/arch/x86/include/asm/dwarf2.h b/arch/x86/include/asm/dwarf2.h index f6f15986df6c..de1cdaf4d743 100644 --- a/arch/x86/include/asm/dwarf2.h +++ b/arch/x86/include/asm/dwarf2.h @@ -86,11 +86,23 @@ CFI_ADJUST_CFA_OFFSET 8 .endm + .macro pushq_cfi_reg reg + pushq %\reg + CFI_ADJUST_CFA_OFFSET 8 + CFI_REL_OFFSET \reg, 0 + .endm + .macro popq_cfi reg popq \reg CFI_ADJUST_CFA_OFFSET -8 .endm + .macro popq_cfi_reg reg + popq %\reg + CFI_ADJUST_CFA_OFFSET -8 + CFI_RESTORE \reg + .endm + .macro pushfq_cfi pushfq CFI_ADJUST_CFA_OFFSET 8 @@ -116,11 +128,23 @@ CFI_ADJUST_CFA_OFFSET 4 .endm + .macro pushl_cfi_reg reg + pushl %\reg + CFI_ADJUST_CFA_OFFSET 4 + CFI_REL_OFFSET \reg, 0 + .endm + .macro popl_cfi reg popl \reg CFI_ADJUST_CFA_OFFSET -4 .endm + .macro popl_cfi_reg reg + popl %\reg + CFI_ADJUST_CFA_OFFSET -4 + CFI_RESTORE \reg + .endm + .macro pushfl_cfi pushfl CFI_ADJUST_CFA_OFFSET 4 diff --git a/arch/x86/include/asm/efi.h b/arch/x86/include/asm/efi.h index 25bce45c6fc4..3738b138b843 100644 --- a/arch/x86/include/asm/efi.h +++ b/arch/x86/include/asm/efi.h @@ -2,6 +2,8 @@ #define _ASM_X86_EFI_H #include <asm/i387.h> +#include <asm/pgtable.h> + /* * We map the EFI regions needed for runtime services non-contiguously, * with preserved alignment on virtual addresses starting from -4G down @@ -89,8 +91,8 @@ extern void __iomem *__init efi_ioremap(unsigned long addr, unsigned long size, extern struct efi_scratch efi_scratch; extern void __init efi_set_executable(efi_memory_desc_t *md, bool executable); extern int __init efi_memblock_x86_reserve_range(void); -extern void __init efi_call_phys_prolog(void); -extern void __init efi_call_phys_epilog(void); +extern pgd_t * __init efi_call_phys_prolog(void); +extern void __init efi_call_phys_epilog(pgd_t *save_pgd); extern void __init efi_unmap_memmap(void); extern void __init efi_memory_uc(u64 addr, unsigned long size); extern void __init efi_map_region(efi_memory_desc_t *md); diff --git a/arch/x86/include/asm/elf.h b/arch/x86/include/asm/elf.h index ca3347a9dab5..935588d95c82 100644 --- a/arch/x86/include/asm/elf.h +++ b/arch/x86/include/asm/elf.h @@ -171,10 +171,11 @@ do { \ static inline void elf_common_init(struct thread_struct *t, struct pt_regs *regs, const u16 ds) { - regs->ax = regs->bx = regs->cx = regs->dx = 0; - regs->si = regs->di = regs->bp = 0; + /* Commented-out registers are cleared in stub_execve */ + /*regs->ax = regs->bx =*/ regs->cx = regs->dx = 0; + regs->si = regs->di /*= regs->bp*/ = 0; regs->r8 = regs->r9 = regs->r10 = regs->r11 = 0; - regs->r12 = regs->r13 = regs->r14 = regs->r15 = 0; + /*regs->r12 = regs->r13 = regs->r14 = regs->r15 = 0;*/ t->fs = t->gs = 0; t->fsindex = t->gsindex = 0; t->ds = t->es = ds; @@ -365,6 +366,7 @@ enum align_flags { struct va_alignment { int flags; unsigned long mask; + unsigned long bits; } ____cacheline_aligned; extern struct va_alignment va_align; diff --git a/arch/x86/include/asm/fpu-internal.h b/arch/x86/include/asm/fpu-internal.h index 0dbc08282291..da5e96756570 100644 --- a/arch/x86/include/asm/fpu-internal.h +++ b/arch/x86/include/asm/fpu-internal.h @@ -67,6 +67,34 @@ extern void finit_soft_fpu(struct i387_soft_struct *soft); static inline void finit_soft_fpu(struct i387_soft_struct *soft) {} #endif +/* + * Must be run with preemption disabled: this clears the fpu_owner_task, + * on this CPU. + * + * This will disable any lazy FPU state restore of the current FPU state, + * but if the current thread owns the FPU, it will still be saved by. + */ +static inline void __cpu_disable_lazy_restore(unsigned int cpu) +{ + per_cpu(fpu_owner_task, cpu) = NULL; +} + +/* + * Used to indicate that the FPU state in memory is newer than the FPU + * state in registers, and the FPU state should be reloaded next time the + * task is run. Only safe on the current task, or non-running tasks. + */ +static inline void task_disable_lazy_fpu_restore(struct task_struct *tsk) +{ + tsk->thread.fpu.last_cpu = ~0; +} + +static inline int fpu_lazy_restore(struct task_struct *new, unsigned int cpu) +{ + return new == this_cpu_read_stable(fpu_owner_task) && + cpu == new->thread.fpu.last_cpu; +} + static inline int is_ia32_compat_frame(void) { return config_enabled(CONFIG_IA32_EMULATION) && @@ -107,7 +135,6 @@ static __always_inline __pure bool use_fxsr(void) static inline void fx_finit(struct i387_fxsave_struct *fx) { - memset(fx, 0, xstate_size); fx->cwd = 0x37f; fx->mxcsr = MXCSR_DEFAULT; } @@ -351,8 +378,14 @@ static inline void __thread_fpu_begin(struct task_struct *tsk) __thread_set_has_fpu(tsk); } -static inline void __drop_fpu(struct task_struct *tsk) +static inline void drop_fpu(struct task_struct *tsk) { + /* + * Forget coprocessor state.. + */ + preempt_disable(); + tsk->thread.fpu_counter = 0; + if (__thread_has_fpu(tsk)) { /* Ignore delayed exceptions from user space */ asm volatile("1: fwait\n" @@ -360,30 +393,29 @@ static inline void __drop_fpu(struct task_struct *tsk) _ASM_EXTABLE(1b, 2b)); __thread_fpu_end(tsk); } + + clear_stopped_child_used_math(tsk); + preempt_enable(); } -static inline void drop_fpu(struct task_struct *tsk) +static inline void restore_init_xstate(void) { - /* - * Forget coprocessor state.. - */ - preempt_disable(); - tsk->thread.fpu_counter = 0; - __drop_fpu(tsk); - clear_used_math(); - preempt_enable(); + if (use_xsave()) + xrstor_state(init_xstate_buf, -1); + else + fxrstor_checking(&init_xstate_buf->i387); } -static inline void drop_init_fpu(struct task_struct *tsk) +/* + * Reset the FPU state in the eager case and drop it in the lazy case (later use + * will reinit it). + */ +static inline void fpu_reset_state(struct task_struct *tsk) { if (!use_eager_fpu()) drop_fpu(tsk); - else { - if (use_xsave()) - xrstor_state(init_xstate_buf, -1); - else - fxrstor_checking(&init_xstate_buf->i387); - } + else + restore_init_xstate(); } /* @@ -400,24 +432,6 @@ static inline void drop_init_fpu(struct task_struct *tsk) */ typedef struct { int preload; } fpu_switch_t; -/* - * Must be run with preemption disabled: this clears the fpu_owner_task, - * on this CPU. - * - * This will disable any lazy FPU state restore of the current FPU state, - * but if the current thread owns the FPU, it will still be saved by. - */ -static inline void __cpu_disable_lazy_restore(unsigned int cpu) -{ - per_cpu(fpu_owner_task, cpu) = NULL; -} - -static inline int fpu_lazy_restore(struct task_struct *new, unsigned int cpu) -{ - return new == this_cpu_read_stable(fpu_owner_task) && - cpu == new->thread.fpu.last_cpu; -} - static inline fpu_switch_t switch_fpu_prepare(struct task_struct *old, struct task_struct *new, int cpu) { fpu_switch_t fpu; @@ -426,13 +440,17 @@ static inline fpu_switch_t switch_fpu_prepare(struct task_struct *old, struct ta * If the task has used the math, pre-load the FPU on xsave processors * or if the past 5 consecutive context-switches used math. */ - fpu.preload = tsk_used_math(new) && (use_eager_fpu() || - new->thread.fpu_counter > 5); + fpu.preload = tsk_used_math(new) && + (use_eager_fpu() || new->thread.fpu_counter > 5); + if (__thread_has_fpu(old)) { if (!__save_init_fpu(old)) - cpu = ~0; - old->thread.fpu.last_cpu = cpu; - old->thread.fpu.has_fpu = 0; /* But leave fpu_owner_task! */ + task_disable_lazy_fpu_restore(old); + else + old->thread.fpu.last_cpu = cpu; + + /* But leave fpu_owner_task! */ + old->thread.fpu.has_fpu = 0; /* Don't change CR0.TS if we just switch! */ if (fpu.preload) { @@ -443,10 +461,10 @@ static inline fpu_switch_t switch_fpu_prepare(struct task_struct *old, struct ta stts(); } else { old->thread.fpu_counter = 0; - old->thread.fpu.last_cpu = ~0; + task_disable_lazy_fpu_restore(old); if (fpu.preload) { new->thread.fpu_counter++; - if (!use_eager_fpu() && fpu_lazy_restore(new, cpu)) + if (fpu_lazy_restore(new, cpu)) fpu.preload = 0; else prefetch(new->thread.fpu.state); @@ -466,7 +484,7 @@ static inline void switch_fpu_finish(struct task_struct *new, fpu_switch_t fpu) { if (fpu.preload) { if (unlikely(restore_fpu_checking(new))) - drop_init_fpu(new); + fpu_reset_state(new); } } @@ -495,10 +513,12 @@ static inline int restore_xstate_sig(void __user *buf, int ia32_frame) } /* - * Need to be preemption-safe. + * Needs to be preemption-safe. * * NOTE! user_fpu_begin() must be used only immediately before restoring - * it. This function does not do any save/restore on their own. + * the save state. It does not do any saving/restoring on its own. In + * lazy FPU mode, it is just an optimization to avoid a #NM exception, + * the task can lose the FPU right after preempt_enable(). */ static inline void user_fpu_begin(void) { @@ -520,24 +540,6 @@ static inline void __save_fpu(struct task_struct *tsk) } /* - * These disable preemption on their own and are safe - */ -static inline void save_init_fpu(struct task_struct *tsk) -{ - WARN_ON_ONCE(!__thread_has_fpu(tsk)); - - if (use_eager_fpu()) { - __save_fpu(tsk); - return; - } - - preempt_disable(); - __save_init_fpu(tsk); - __thread_fpu_end(tsk); - preempt_enable(); -} - -/* * i387 state interaction */ static inline unsigned short get_fpu_cwd(struct task_struct *tsk) diff --git a/arch/x86/include/asm/hw_irq.h b/arch/x86/include/asm/hw_irq.h index 9662290e0b20..e9571ddabc4f 100644 --- a/arch/x86/include/asm/hw_irq.h +++ b/arch/x86/include/asm/hw_irq.h @@ -181,10 +181,9 @@ extern __visible void smp_call_function_single_interrupt(struct pt_regs *); extern __visible void smp_invalidate_interrupt(struct pt_regs *); #endif -extern void (*__initconst interrupt[FIRST_SYSTEM_VECTOR - - FIRST_EXTERNAL_VECTOR])(void); +extern char irq_entries_start[]; #ifdef CONFIG_TRACING -#define trace_interrupt interrupt +#define trace_irq_entries_start irq_entries_start #endif #define VECTOR_UNDEFINED (-1) diff --git a/arch/x86/include/asm/insn.h b/arch/x86/include/asm/insn.h index 47f29b1d1846..e7814b74caf8 100644 --- a/arch/x86/include/asm/insn.h +++ b/arch/x86/include/asm/insn.h @@ -69,7 +69,7 @@ struct insn { const insn_byte_t *next_byte; }; -#define MAX_INSN_SIZE 16 +#define MAX_INSN_SIZE 15 #define X86_MODRM_MOD(modrm) (((modrm) & 0xc0) >> 6) #define X86_MODRM_REG(modrm) (((modrm) & 0x38) >> 3) diff --git a/arch/x86/include/asm/iommu_table.h b/arch/x86/include/asm/iommu_table.h index f42a04735a0a..e37d6b3ad983 100644 --- a/arch/x86/include/asm/iommu_table.h +++ b/arch/x86/include/asm/iommu_table.h @@ -79,11 +79,12 @@ struct iommu_table_entry { * d). Similar to the 'init', except that this gets called from pci_iommu_init * where we do have a memory allocator. * - * The standard vs the _FINISH differs in that the _FINISH variant will - * continue detecting other IOMMUs in the call list after the - * the detection routine returns a positive number. The _FINISH will - * stop the execution chain. Both will still call the 'init' and - * 'late_init' functions if they are set. + * The standard IOMMU_INIT differs from the IOMMU_INIT_FINISH variant + * in that the former will continue detecting other IOMMUs in the call + * list after the detection routine returns a positive number, while the + * latter will stop the execution chain upon first successful detection. + * Both variants will still call the 'init' and 'late_init' functions if + * they are set. */ #define IOMMU_INIT_FINISH(_detect, _depend, _init, _late_init) \ __IOMMU_INIT(_detect, _depend, _init, _late_init, 1) diff --git a/arch/x86/include/asm/irqflags.h b/arch/x86/include/asm/irqflags.h index 0a8b519226b8..b77f5edb03b0 100644 --- a/arch/x86/include/asm/irqflags.h +++ b/arch/x86/include/asm/irqflags.h @@ -136,10 +136,6 @@ static inline notrace unsigned long arch_local_irq_save(void) #define USERGS_SYSRET32 \ swapgs; \ sysretl -#define ENABLE_INTERRUPTS_SYSEXIT32 \ - swapgs; \ - sti; \ - sysexit #else #define INTERRUPT_RETURN iret @@ -163,22 +159,27 @@ static inline int arch_irqs_disabled(void) return arch_irqs_disabled_flags(flags); } +#endif /* !__ASSEMBLY__ */ +#ifdef __ASSEMBLY__ +#ifdef CONFIG_TRACE_IRQFLAGS +# define TRACE_IRQS_ON call trace_hardirqs_on_thunk; +# define TRACE_IRQS_OFF call trace_hardirqs_off_thunk; #else - -#ifdef CONFIG_X86_64 -#define ARCH_LOCKDEP_SYS_EXIT call lockdep_sys_exit_thunk -#define ARCH_LOCKDEP_SYS_EXIT_IRQ \ +# define TRACE_IRQS_ON +# define TRACE_IRQS_OFF +#endif +#ifdef CONFIG_DEBUG_LOCK_ALLOC +# ifdef CONFIG_X86_64 +# define LOCKDEP_SYS_EXIT call lockdep_sys_exit_thunk +# define LOCKDEP_SYS_EXIT_IRQ \ TRACE_IRQS_ON; \ sti; \ - SAVE_REST; \ - LOCKDEP_SYS_EXIT; \ - RESTORE_REST; \ + call lockdep_sys_exit_thunk; \ cli; \ TRACE_IRQS_OFF; - -#else -#define ARCH_LOCKDEP_SYS_EXIT \ +# else +# define LOCKDEP_SYS_EXIT \ pushl %eax; \ pushl %ecx; \ pushl %edx; \ @@ -186,24 +187,12 @@ static inline int arch_irqs_disabled(void) popl %edx; \ popl %ecx; \ popl %eax; - -#define ARCH_LOCKDEP_SYS_EXIT_IRQ -#endif - -#ifdef CONFIG_TRACE_IRQFLAGS -# define TRACE_IRQS_ON call trace_hardirqs_on_thunk; -# define TRACE_IRQS_OFF call trace_hardirqs_off_thunk; +# define LOCKDEP_SYS_EXIT_IRQ +# endif #else -# define TRACE_IRQS_ON -# define TRACE_IRQS_OFF -#endif -#ifdef CONFIG_DEBUG_LOCK_ALLOC -# define LOCKDEP_SYS_EXIT ARCH_LOCKDEP_SYS_EXIT -# define LOCKDEP_SYS_EXIT_IRQ ARCH_LOCKDEP_SYS_EXIT_IRQ -# else # define LOCKDEP_SYS_EXIT # define LOCKDEP_SYS_EXIT_IRQ -# endif - +#endif #endif /* __ASSEMBLY__ */ + #endif diff --git a/arch/x86/include/asm/jump_label.h b/arch/x86/include/asm/jump_label.h index 6a2cefb4395a..a4c1cf7e93f8 100644 --- a/arch/x86/include/asm/jump_label.h +++ b/arch/x86/include/asm/jump_label.h @@ -1,7 +1,7 @@ #ifndef _ASM_X86_JUMP_LABEL_H #define _ASM_X86_JUMP_LABEL_H -#ifdef __KERNEL__ +#ifndef __ASSEMBLY__ #include <linux/stringify.h> #include <linux/types.h> @@ -30,8 +30,6 @@ l_yes: return true; } -#endif /* __KERNEL__ */ - #ifdef CONFIG_X86_64 typedef u64 jump_label_t; #else @@ -44,4 +42,5 @@ struct jump_entry { jump_label_t key; }; +#endif /* __ASSEMBLY__ */ #endif diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index a236e39cc385..dea2e7e962e3 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -81,11 +81,6 @@ static inline gfn_t gfn_to_index(gfn_t gfn, gfn_t base_gfn, int level) (base_gfn >> KVM_HPAGE_GFN_SHIFT(level)); } -#define SELECTOR_TI_MASK (1 << 2) -#define SELECTOR_RPL_MASK 0x03 - -#define IOPL_SHIFT 12 - #define KVM_PERMILLE_MMU_PAGES 20 #define KVM_MIN_ALLOC_MMU_PAGES 64 #define KVM_MMU_HASH_SHIFT 10 @@ -345,6 +340,7 @@ struct kvm_pmu { enum { KVM_DEBUGREG_BP_ENABLED = 1, KVM_DEBUGREG_WONT_EXIT = 2, + KVM_DEBUGREG_RELOAD = 4, }; struct kvm_vcpu_arch { @@ -431,6 +427,9 @@ struct kvm_vcpu_arch { int cpuid_nent; struct kvm_cpuid_entry2 cpuid_entries[KVM_MAX_CPUID_ENTRIES]; + + int maxphyaddr; + /* emulate context */ struct x86_emulate_ctxt emulate_ctxt; @@ -550,11 +549,20 @@ struct kvm_arch_memory_slot { struct kvm_lpage_info *lpage_info[KVM_NR_PAGE_SIZES - 1]; }; +/* + * We use as the mode the number of bits allocated in the LDR for the + * logical processor ID. It happens that these are all powers of two. + * This makes it is very easy to detect cases where the APICs are + * configured for multiple modes; in that case, we cannot use the map and + * hence cannot use kvm_irq_delivery_to_apic_fast either. + */ +#define KVM_APIC_MODE_XAPIC_CLUSTER 4 +#define KVM_APIC_MODE_XAPIC_FLAT 8 +#define KVM_APIC_MODE_X2APIC 16 + struct kvm_apic_map { struct rcu_head rcu; - u8 ldr_bits; - /* fields bellow are used to decode ldr values in different modes */ - u32 cid_shift, cid_mask, lid_mask, broadcast; + u8 mode; struct kvm_lapic *phys_map[256]; /* first index is cluster id second is cpu id in a cluster */ struct kvm_lapic *logical_map[16][16]; @@ -859,6 +867,8 @@ void kvm_mmu_set_mask_ptes(u64 user_mask, u64 accessed_mask, void kvm_mmu_reset_context(struct kvm_vcpu *vcpu); void kvm_mmu_slot_remove_write_access(struct kvm *kvm, struct kvm_memory_slot *memslot); +void kvm_mmu_zap_collapsible_sptes(struct kvm *kvm, + struct kvm_memory_slot *memslot); void kvm_mmu_slot_leaf_clear_dirty(struct kvm *kvm, struct kvm_memory_slot *memslot); void kvm_mmu_slot_largepage_remove_write_access(struct kvm *kvm, @@ -933,6 +943,7 @@ struct x86_emulate_ctxt; int kvm_fast_pio_out(struct kvm_vcpu *vcpu, int size, unsigned short port); void kvm_emulate_cpuid(struct kvm_vcpu *vcpu); int kvm_emulate_halt(struct kvm_vcpu *vcpu); +int kvm_vcpu_halt(struct kvm_vcpu *vcpu); int kvm_emulate_wbinvd(struct kvm_vcpu *vcpu); void kvm_get_segment(struct kvm_vcpu *vcpu, struct kvm_segment *var, int seg); @@ -1128,7 +1139,6 @@ int kvm_unmap_hva_range(struct kvm *kvm, unsigned long start, unsigned long end) int kvm_age_hva(struct kvm *kvm, unsigned long start, unsigned long end); int kvm_test_age_hva(struct kvm *kvm, unsigned long hva); void kvm_set_spte_hva(struct kvm *kvm, unsigned long hva, pte_t pte); -int cpuid_maxphyaddr(struct kvm_vcpu *vcpu); int kvm_cpu_has_injectable_intr(struct kvm_vcpu *v); int kvm_cpu_has_interrupt(struct kvm_vcpu *vcpu); int kvm_arch_interrupt_allowed(struct kvm_vcpu *vcpu); diff --git a/arch/x86/include/asm/kvm_para.h b/arch/x86/include/asm/kvm_para.h index e62cf897f781..c1adf33fdd0d 100644 --- a/arch/x86/include/asm/kvm_para.h +++ b/arch/x86/include/asm/kvm_para.h @@ -115,7 +115,7 @@ static inline void kvm_spinlock_init(void) static inline bool kvm_para_available(void) { - return 0; + return false; } static inline unsigned int kvm_arch_para_features(void) diff --git a/arch/x86/include/asm/mce.h b/arch/x86/include/asm/mce.h index 9b3de99dc004..1f5a86d518db 100644 --- a/arch/x86/include/asm/mce.h +++ b/arch/x86/include/asm/mce.h @@ -116,6 +116,12 @@ struct mca_config { u32 rip_msr; }; +struct mce_vendor_flags { + __u64 overflow_recov : 1, /* cpuid_ebx(80000007) */ + __reserved_0 : 63; +}; +extern struct mce_vendor_flags mce_flags; + extern struct mca_config mca_cfg; extern void mce_register_decode_chain(struct notifier_block *nb); extern void mce_unregister_decode_chain(struct notifier_block *nb); @@ -128,9 +134,11 @@ extern int mce_p5_enabled; #ifdef CONFIG_X86_MCE int mcheck_init(void); void mcheck_cpu_init(struct cpuinfo_x86 *c); +void mcheck_vendor_init_severity(void); #else static inline int mcheck_init(void) { return 0; } static inline void mcheck_cpu_init(struct cpuinfo_x86 *c) {} +static inline void mcheck_vendor_init_severity(void) {} #endif #ifdef CONFIG_X86_ANCIENT_MCE @@ -183,11 +191,11 @@ typedef DECLARE_BITMAP(mce_banks_t, MAX_NR_BANKS); DECLARE_PER_CPU(mce_banks_t, mce_poll_banks); enum mcp_flags { - MCP_TIMESTAMP = (1 << 0), /* log time stamp */ - MCP_UC = (1 << 1), /* log uncorrected errors */ - MCP_DONTLOG = (1 << 2), /* only clear, don't log */ + MCP_TIMESTAMP = BIT(0), /* log time stamp */ + MCP_UC = BIT(1), /* log uncorrected errors */ + MCP_DONTLOG = BIT(2), /* only clear, don't log */ }; -void machine_check_poll(enum mcp_flags flags, mce_banks_t *b); +bool machine_check_poll(enum mcp_flags flags, mce_banks_t *b); int mce_notify_irq(void); diff --git a/arch/x86/include/asm/microcode.h b/arch/x86/include/asm/microcode.h index 201b520521ed..2fb20d6f7e23 100644 --- a/arch/x86/include/asm/microcode.h +++ b/arch/x86/include/asm/microcode.h @@ -75,6 +75,79 @@ static inline void __exit exit_amd_microcode(void) {} #ifdef CONFIG_MICROCODE_EARLY #define MAX_UCODE_COUNT 128 + +#define QCHAR(a, b, c, d) ((a) + ((b) << 8) + ((c) << 16) + ((d) << 24)) +#define CPUID_INTEL1 QCHAR('G', 'e', 'n', 'u') +#define CPUID_INTEL2 QCHAR('i', 'n', 'e', 'I') +#define CPUID_INTEL3 QCHAR('n', 't', 'e', 'l') +#define CPUID_AMD1 QCHAR('A', 'u', 't', 'h') +#define CPUID_AMD2 QCHAR('e', 'n', 't', 'i') +#define CPUID_AMD3 QCHAR('c', 'A', 'M', 'D') + +#define CPUID_IS(a, b, c, ebx, ecx, edx) \ + (!((ebx ^ (a))|(edx ^ (b))|(ecx ^ (c)))) + +/* + * In early loading microcode phase on BSP, boot_cpu_data is not set up yet. + * x86_vendor() gets vendor id for BSP. + * + * In 32 bit AP case, accessing boot_cpu_data needs linear address. To simplify + * coding, we still use x86_vendor() to get vendor id for AP. + * + * x86_vendor() gets vendor information directly from CPUID. + */ +static inline int x86_vendor(void) +{ + u32 eax = 0x00000000; + u32 ebx, ecx = 0, edx; + + native_cpuid(&eax, &ebx, &ecx, &edx); + + if (CPUID_IS(CPUID_INTEL1, CPUID_INTEL2, CPUID_INTEL3, ebx, ecx, edx)) + return X86_VENDOR_INTEL; + + if (CPUID_IS(CPUID_AMD1, CPUID_AMD2, CPUID_AMD3, ebx, ecx, edx)) + return X86_VENDOR_AMD; + + return X86_VENDOR_UNKNOWN; +} + +static inline unsigned int __x86_family(unsigned int sig) +{ + unsigned int x86; + + x86 = (sig >> 8) & 0xf; + + if (x86 == 0xf) + x86 += (sig >> 20) & 0xff; + + return x86; +} + +static inline unsigned int x86_family(void) +{ + u32 eax = 0x00000001; + u32 ebx, ecx = 0, edx; + + native_cpuid(&eax, &ebx, &ecx, &edx); + + return __x86_family(eax); +} + +static inline unsigned int x86_model(unsigned int sig) +{ + unsigned int x86, model; + + x86 = __x86_family(sig); + + model = (sig >> 4) & 0xf; + + if (x86 == 0x6 || x86 == 0xf) + model += ((sig >> 16) & 0xf) << 4; + + return model; +} + extern void __init load_ucode_bsp(void); extern void load_ucode_ap(void); extern int __init save_microcode_in_initrd(void); diff --git a/arch/x86/include/asm/microcode_intel.h b/arch/x86/include/asm/microcode_intel.h index dd4c20043ce7..2b9209c46ca9 100644 --- a/arch/x86/include/asm/microcode_intel.h +++ b/arch/x86/include/asm/microcode_intel.h @@ -56,12 +56,15 @@ struct extended_sigtable { #define exttable_size(et) ((et)->count * EXT_SIGNATURE_SIZE + EXT_HEADER_SIZE) -extern int -get_matching_microcode(unsigned int csig, int cpf, void *mc, int rev); +extern int get_matching_microcode(unsigned int csig, int cpf, int rev, void *mc); extern int microcode_sanity_check(void *mc, int print_err); -extern int get_matching_sig(unsigned int csig, int cpf, void *mc, int rev); -extern int -update_match_revision(struct microcode_header_intel *mc_header, int rev); +extern int get_matching_sig(unsigned int csig, int cpf, int rev, void *mc); + +static inline int +revision_is_newer(struct microcode_header_intel *mc_header, int rev) +{ + return (mc_header->rev <= rev) ? 0 : 1; +} #ifdef CONFIG_MICROCODE_INTEL_EARLY extern void __init load_ucode_intel_bsp(void); diff --git a/arch/x86/include/asm/mwait.h b/arch/x86/include/asm/mwait.h index a1410db38a1a..653dfa7662e1 100644 --- a/arch/x86/include/asm/mwait.h +++ b/arch/x86/include/asm/mwait.h @@ -30,6 +30,14 @@ static inline void __mwait(unsigned long eax, unsigned long ecx) :: "a" (eax), "c" (ecx)); } +static inline void __sti_mwait(unsigned long eax, unsigned long ecx) +{ + trace_hardirqs_on(); + /* "mwait %eax, %ecx;" */ + asm volatile("sti; .byte 0x0f, 0x01, 0xc9;" + :: "a" (eax), "c" (ecx)); +} + /* * This uses new MONITOR/MWAIT instructions on P4 processors with PNI, * which can obviate IPI to trigger checking of need_resched. diff --git a/arch/x86/include/asm/page_types.h b/arch/x86/include/asm/page_types.h index 95e11f79f123..f97fbe3abb67 100644 --- a/arch/x86/include/asm/page_types.h +++ b/arch/x86/include/asm/page_types.h @@ -51,8 +51,6 @@ extern int devmem_is_allowed(unsigned long pagenr); extern unsigned long max_low_pfn_mapped; extern unsigned long max_pfn_mapped; -extern bool kaslr_enabled; - static inline phys_addr_t get_max_mapped(void) { return (phys_addr_t)max_pfn_mapped << PAGE_SHIFT; diff --git a/arch/x86/include/asm/paravirt.h b/arch/x86/include/asm/paravirt.h index 965c47d254aa..5f6051d5d139 100644 --- a/arch/x86/include/asm/paravirt.h +++ b/arch/x86/include/asm/paravirt.h @@ -976,11 +976,6 @@ extern void default_banner(void); PARA_SITE(PARA_PATCH(pv_cpu_ops, PV_CPU_usergs_sysret64), \ CLBR_NONE, \ jmp PARA_INDIRECT(pv_cpu_ops+PV_CPU_usergs_sysret64)) - -#define ENABLE_INTERRUPTS_SYSEXIT32 \ - PARA_SITE(PARA_PATCH(pv_cpu_ops, PV_CPU_irq_enable_sysexit), \ - CLBR_NONE, \ - jmp PARA_INDIRECT(pv_cpu_ops+PV_CPU_irq_enable_sysexit)) #endif /* CONFIG_X86_32 */ #endif /* __ASSEMBLY__ */ diff --git a/arch/x86/include/asm/pci_x86.h b/arch/x86/include/asm/pci_x86.h index fa1195dae425..164e3f8d3c3d 100644 --- a/arch/x86/include/asm/pci_x86.h +++ b/arch/x86/include/asm/pci_x86.h @@ -93,6 +93,8 @@ extern raw_spinlock_t pci_config_lock; extern int (*pcibios_enable_irq)(struct pci_dev *dev); extern void (*pcibios_disable_irq)(struct pci_dev *dev); +extern bool mp_should_keep_irq(struct device *dev); + struct pci_raw_ops { int (*read)(unsigned int domain, unsigned int bus, unsigned int devfn, int reg, int len, u32 *val); diff --git a/arch/x86/include/asm/processor.h b/arch/x86/include/asm/processor.h index ec1c93588cef..d2203b5d9538 100644 --- a/arch/x86/include/asm/processor.h +++ b/arch/x86/include/asm/processor.h @@ -210,8 +210,23 @@ struct x86_hw_tss { unsigned long sp0; unsigned short ss0, __ss0h; unsigned long sp1; - /* ss1 caches MSR_IA32_SYSENTER_CS: */ - unsigned short ss1, __ss1h; + + /* + * We don't use ring 1, so ss1 is a convenient scratch space in + * the same cacheline as sp0. We use ss1 to cache the value in + * MSR_IA32_SYSENTER_CS. When we context switch + * MSR_IA32_SYSENTER_CS, we first check if the new value being + * written matches ss1, and, if it's not, then we wrmsr the new + * value and update ss1. + * + * The only reason we context switch MSR_IA32_SYSENTER_CS is + * that we set it to zero in vm86 tasks to avoid corrupting the + * stack if we were to go through the sysenter path from vm86 + * mode. + */ + unsigned short ss1; /* MSR_IA32_SYSENTER_CS */ + + unsigned short __ss1h; unsigned long sp2; unsigned short ss2, __ss2h; unsigned long __cr3; @@ -276,13 +291,17 @@ struct tss_struct { unsigned long io_bitmap[IO_BITMAP_LONGS + 1]; /* - * .. and then another 0x100 bytes for the emergency kernel stack: + * Space for the temporary SYSENTER stack: */ - unsigned long stack[64]; + unsigned long SYSENTER_stack[64]; } ____cacheline_aligned; -DECLARE_PER_CPU_SHARED_ALIGNED(struct tss_struct, init_tss); +DECLARE_PER_CPU_SHARED_ALIGNED(struct tss_struct, cpu_tss); + +#ifdef CONFIG_X86_32 +DECLARE_PER_CPU(unsigned long, cpu_current_top_of_stack); +#endif /* * Save the original ist values for checking stack pointers during debugging @@ -474,7 +493,6 @@ struct thread_struct { #ifdef CONFIG_X86_32 unsigned long sysenter_cs; #else - unsigned long usersp; /* Copy from PDA */ unsigned short es; unsigned short ds; unsigned short fsindex; @@ -564,6 +582,16 @@ static inline void native_swapgs(void) #endif } +static inline unsigned long current_top_of_stack(void) +{ +#ifdef CONFIG_X86_64 + return this_cpu_read_stable(cpu_tss.x86_tss.sp0); +#else + /* sp0 on x86_32 is special in and around vm86 mode. */ + return this_cpu_read_stable(cpu_current_top_of_stack); +#endif +} + #ifdef CONFIG_PARAVIRT #include <asm/paravirt.h> #else @@ -761,10 +789,10 @@ extern char ignore_fpu_irq; #define ARCH_HAS_SPINLOCK_PREFETCH #ifdef CONFIG_X86_32 -# define BASE_PREFETCH ASM_NOP4 +# define BASE_PREFETCH "" # define ARCH_HAS_PREFETCH #else -# define BASE_PREFETCH "prefetcht0 (%1)" +# define BASE_PREFETCH "prefetcht0 %P1" #endif /* @@ -775,10 +803,9 @@ extern char ignore_fpu_irq; */ static inline void prefetch(const void *x) { - alternative_input(BASE_PREFETCH, - "prefetchnta (%1)", + alternative_input(BASE_PREFETCH, "prefetchnta %P1", X86_FEATURE_XMM, - "r" (x)); + "m" (*(const char *)x)); } /* @@ -788,10 +815,9 @@ static inline void prefetch(const void *x) */ static inline void prefetchw(const void *x) { - alternative_input(BASE_PREFETCH, - "prefetchw (%1)", - X86_FEATURE_3DNOW, - "r" (x)); + alternative_input(BASE_PREFETCH, "prefetchw %P1", + X86_FEATURE_3DNOWPREFETCH, + "m" (*(const char *)x)); } static inline void spin_lock_prefetch(const void *x) @@ -799,6 +825,9 @@ static inline void spin_lock_prefetch(const void *x) prefetchw(x); } +#define TOP_OF_INIT_STACK ((unsigned long)&init_stack + sizeof(init_stack) - \ + TOP_OF_KERNEL_STACK_PADDING) + #ifdef CONFIG_X86_32 /* * User space process size: 3GB (default). @@ -809,39 +838,16 @@ static inline void spin_lock_prefetch(const void *x) #define STACK_TOP_MAX STACK_TOP #define INIT_THREAD { \ - .sp0 = sizeof(init_stack) + (long)&init_stack, \ + .sp0 = TOP_OF_INIT_STACK, \ .vm86_info = NULL, \ .sysenter_cs = __KERNEL_CS, \ .io_bitmap_ptr = NULL, \ } -/* - * Note that the .io_bitmap member must be extra-big. This is because - * the CPU will access an additional byte beyond the end of the IO - * permission bitmap. The extra byte must be all 1 bits, and must - * be within the limit. - */ -#define INIT_TSS { \ - .x86_tss = { \ - .sp0 = sizeof(init_stack) + (long)&init_stack, \ - .ss0 = __KERNEL_DS, \ - .ss1 = __KERNEL_CS, \ - .io_bitmap_base = INVALID_IO_BITMAP_OFFSET, \ - }, \ - .io_bitmap = { [0 ... IO_BITMAP_LONGS] = ~0 }, \ -} - extern unsigned long thread_saved_pc(struct task_struct *tsk); -#define THREAD_SIZE_LONGS (THREAD_SIZE/sizeof(unsigned long)) -#define KSTK_TOP(info) \ -({ \ - unsigned long *__ptr = (unsigned long *)(info); \ - (unsigned long)(&__ptr[THREAD_SIZE_LONGS]); \ -}) - /* - * The below -8 is to reserve 8 bytes on top of the ring0 stack. + * TOP_OF_KERNEL_STACK_PADDING reserves 8 bytes on top of the ring0 stack. * This is necessary to guarantee that the entire "struct pt_regs" * is accessible even if the CPU haven't stored the SS/ESP registers * on the stack (interrupt gate does not save these registers @@ -850,11 +856,11 @@ extern unsigned long thread_saved_pc(struct task_struct *tsk); * "struct pt_regs" is possible, but they may contain the * completely wrong values. */ -#define task_pt_regs(task) \ -({ \ - struct pt_regs *__regs__; \ - __regs__ = (struct pt_regs *)(KSTK_TOP(task_stack_page(task))-8); \ - __regs__ - 1; \ +#define task_pt_regs(task) \ +({ \ + unsigned long __ptr = (unsigned long)task_stack_page(task); \ + __ptr += THREAD_SIZE - TOP_OF_KERNEL_STACK_PADDING; \ + ((struct pt_regs *)__ptr) - 1; \ }) #define KSTK_ESP(task) (task_pt_regs(task)->sp) @@ -886,11 +892,7 @@ extern unsigned long thread_saved_pc(struct task_struct *tsk); #define STACK_TOP_MAX TASK_SIZE_MAX #define INIT_THREAD { \ - .sp0 = (unsigned long)&init_stack + sizeof(init_stack) \ -} - -#define INIT_TSS { \ - .x86_tss.sp0 = (unsigned long)&init_stack + sizeof(init_stack) \ + .sp0 = TOP_OF_INIT_STACK \ } /* @@ -902,11 +904,6 @@ extern unsigned long thread_saved_pc(struct task_struct *tsk); #define task_pt_regs(tsk) ((struct pt_regs *)(tsk)->thread.sp0 - 1) extern unsigned long KSTK_ESP(struct task_struct *task); -/* - * User space RSP while inside the SYSCALL fast path - */ -DECLARE_PER_CPU(unsigned long, old_rsp); - #endif /* CONFIG_X86_64 */ extern void start_thread(struct pt_regs *regs, unsigned long new_ip, diff --git a/arch/x86/include/asm/ptrace.h b/arch/x86/include/asm/ptrace.h index 86fc2bb82287..19507ffa5d28 100644 --- a/arch/x86/include/asm/ptrace.h +++ b/arch/x86/include/asm/ptrace.h @@ -31,13 +31,17 @@ struct pt_regs { #else /* __i386__ */ struct pt_regs { +/* + * C ABI says these regs are callee-preserved. They aren't saved on kernel entry + * unless syscall needs a complete, fully filled "struct pt_regs". + */ unsigned long r15; unsigned long r14; unsigned long r13; unsigned long r12; unsigned long bp; unsigned long bx; -/* arguments: non interrupts/non tracing syscalls only save up to here*/ +/* These regs are callee-clobbered. Always saved on kernel entry. */ unsigned long r11; unsigned long r10; unsigned long r9; @@ -47,9 +51,12 @@ struct pt_regs { unsigned long dx; unsigned long si; unsigned long di; +/* + * On syscall entry, this is syscall#. On CPU exception, this is error code. + * On hw interrupt, it's IRQ number: + */ unsigned long orig_ax; -/* end of arguments */ -/* cpu exception frame or undefined */ +/* Return frame for iretq */ unsigned long ip; unsigned long cs; unsigned long flags; @@ -89,11 +96,13 @@ static inline unsigned long regs_return_value(struct pt_regs *regs) } /* - * user_mode_vm(regs) determines whether a register set came from user mode. - * This is true if V8086 mode was enabled OR if the register set was from - * protected mode with RPL-3 CS value. This tricky test checks that with - * one comparison. Many places in the kernel can bypass this full check - * if they have already ruled out V8086 mode, so user_mode(regs) can be used. + * user_mode(regs) determines whether a register set came from user + * mode. On x86_32, this is true if V8086 mode was enabled OR if the + * register set was from protected mode with RPL-3 CS value. This + * tricky test checks that with one comparison. + * + * On x86_64, vm86 mode is mercifully nonexistent, and we don't need + * the extra check. */ static inline int user_mode(struct pt_regs *regs) { @@ -104,16 +113,6 @@ static inline int user_mode(struct pt_regs *regs) #endif } -static inline int user_mode_vm(struct pt_regs *regs) -{ -#ifdef CONFIG_X86_32 - return ((regs->cs & SEGMENT_RPL_MASK) | (regs->flags & X86_VM_MASK)) >= - USER_RPL; -#else - return user_mode(regs); -#endif -} - static inline int v8086_mode(struct pt_regs *regs) { #ifdef CONFIG_X86_32 @@ -138,12 +137,8 @@ static inline bool user_64bit_mode(struct pt_regs *regs) #endif } -#define current_user_stack_pointer() this_cpu_read(old_rsp) -/* ia32 vs. x32 difference */ -#define compat_user_stack_pointer() \ - (test_thread_flag(TIF_IA32) \ - ? current_pt_regs()->sp \ - : this_cpu_read(old_rsp)) +#define current_user_stack_pointer() current_pt_regs()->sp +#define compat_user_stack_pointer() current_pt_regs()->sp #endif #ifdef CONFIG_X86_32 @@ -248,7 +243,7 @@ static inline unsigned long regs_get_kernel_stack_nth(struct pt_regs *regs, */ #define arch_ptrace_stop_needed(code, info) \ ({ \ - set_thread_flag(TIF_NOTIFY_RESUME); \ + force_iret(); \ false; \ }) diff --git a/arch/x86/include/asm/pvclock.h b/arch/x86/include/asm/pvclock.h index d6b078e9fa28..25b1cc07d496 100644 --- a/arch/x86/include/asm/pvclock.h +++ b/arch/x86/include/asm/pvclock.h @@ -95,6 +95,7 @@ unsigned __pvclock_read_cycles(const struct pvclock_vcpu_time_info *src, struct pvclock_vsyscall_time_info { struct pvclock_vcpu_time_info pvti; + u32 migrate_count; } __attribute__((__aligned__(SMP_CACHE_BYTES))); #define PVTI_SIZE sizeof(struct pvclock_vsyscall_time_info) diff --git a/arch/x86/include/asm/segment.h b/arch/x86/include/asm/segment.h index db257a58571f..5a9856eb12ba 100644 --- a/arch/x86/include/asm/segment.h +++ b/arch/x86/include/asm/segment.h @@ -3,8 +3,10 @@ #include <linux/const.h> -/* Constructor for a conventional segment GDT (or LDT) entry */ -/* This is a macro so it can be used in initializers */ +/* + * Constructor for a conventional segment GDT (or LDT) entry. + * This is a macro so it can be used in initializers. + */ #define GDT_ENTRY(flags, base, limit) \ ((((base) & _AC(0xff000000,ULL)) << (56-24)) | \ (((flags) & _AC(0x0000f0ff,ULL)) << 40) | \ @@ -12,198 +14,228 @@ (((base) & _AC(0x00ffffff,ULL)) << 16) | \ (((limit) & _AC(0x0000ffff,ULL)))) -/* Simple and small GDT entries for booting only */ +/* Simple and small GDT entries for booting only: */ #define GDT_ENTRY_BOOT_CS 2 -#define __BOOT_CS (GDT_ENTRY_BOOT_CS * 8) +#define GDT_ENTRY_BOOT_DS 3 +#define GDT_ENTRY_BOOT_TSS 4 +#define __BOOT_CS (GDT_ENTRY_BOOT_CS*8) +#define __BOOT_DS (GDT_ENTRY_BOOT_DS*8) +#define __BOOT_TSS (GDT_ENTRY_BOOT_TSS*8) + +/* + * Bottom two bits of selector give the ring + * privilege level + */ +#define SEGMENT_RPL_MASK 0x3 -#define GDT_ENTRY_BOOT_DS (GDT_ENTRY_BOOT_CS + 1) -#define __BOOT_DS (GDT_ENTRY_BOOT_DS * 8) +/* User mode is privilege level 3: */ +#define USER_RPL 0x3 -#define GDT_ENTRY_BOOT_TSS (GDT_ENTRY_BOOT_CS + 2) -#define __BOOT_TSS (GDT_ENTRY_BOOT_TSS * 8) +/* Bit 2 is Table Indicator (TI): selects between LDT or GDT */ +#define SEGMENT_TI_MASK 0x4 +/* LDT segment has TI set ... */ +#define SEGMENT_LDT 0x4 +/* ... GDT has it cleared */ +#define SEGMENT_GDT 0x0 -#define SEGMENT_RPL_MASK 0x3 /* - * Bottom two bits of selector give the ring - * privilege level - */ -#define SEGMENT_TI_MASK 0x4 /* Bit 2 is table indicator (LDT/GDT) */ -#define USER_RPL 0x3 /* User mode is privilege level 3 */ -#define SEGMENT_LDT 0x4 /* LDT segment has TI set... */ -#define SEGMENT_GDT 0x0 /* ... GDT has it cleared */ +#define GDT_ENTRY_INVALID_SEG 0 #ifdef CONFIG_X86_32 /* * The layout of the per-CPU GDT under Linux: * - * 0 - null + * 0 - null <=== cacheline #1 * 1 - reserved * 2 - reserved * 3 - reserved * - * 4 - unused <==== new cacheline + * 4 - unused <=== cacheline #2 * 5 - unused * * ------- start of TLS (Thread-Local Storage) segments: * * 6 - TLS segment #1 [ glibc's TLS segment ] * 7 - TLS segment #2 [ Wine's %fs Win32 segment ] - * 8 - TLS segment #3 + * 8 - TLS segment #3 <=== cacheline #3 * 9 - reserved * 10 - reserved * 11 - reserved * * ------- start of kernel segments: * - * 12 - kernel code segment <==== new cacheline + * 12 - kernel code segment <=== cacheline #4 * 13 - kernel data segment * 14 - default user CS * 15 - default user DS - * 16 - TSS + * 16 - TSS <=== cacheline #5 * 17 - LDT * 18 - PNPBIOS support (16->32 gate) * 19 - PNPBIOS support - * 20 - PNPBIOS support + * 20 - PNPBIOS support <=== cacheline #6 * 21 - PNPBIOS support * 22 - PNPBIOS support * 23 - APM BIOS support - * 24 - APM BIOS support + * 24 - APM BIOS support <=== cacheline #7 * 25 - APM BIOS support * * 26 - ESPFIX small SS * 27 - per-cpu [ offset to per-cpu data area ] - * 28 - stack_canary-20 [ for stack protector ] + * 28 - stack_canary-20 [ for stack protector ] <=== cacheline #8 * 29 - unused * 30 - unused * 31 - TSS for double fault handler */ -#define GDT_ENTRY_TLS_MIN 6 -#define GDT_ENTRY_TLS_MAX (GDT_ENTRY_TLS_MIN + GDT_ENTRY_TLS_ENTRIES - 1) +#define GDT_ENTRY_TLS_MIN 6 +#define GDT_ENTRY_TLS_MAX (GDT_ENTRY_TLS_MIN + GDT_ENTRY_TLS_ENTRIES - 1) +#define GDT_ENTRY_KERNEL_CS 12 +#define GDT_ENTRY_KERNEL_DS 13 #define GDT_ENTRY_DEFAULT_USER_CS 14 - #define GDT_ENTRY_DEFAULT_USER_DS 15 +#define GDT_ENTRY_TSS 16 +#define GDT_ENTRY_LDT 17 +#define GDT_ENTRY_PNPBIOS_CS32 18 +#define GDT_ENTRY_PNPBIOS_CS16 19 +#define GDT_ENTRY_PNPBIOS_DS 20 +#define GDT_ENTRY_PNPBIOS_TS1 21 +#define GDT_ENTRY_PNPBIOS_TS2 22 +#define GDT_ENTRY_APMBIOS_BASE 23 + +#define GDT_ENTRY_ESPFIX_SS 26 +#define GDT_ENTRY_PERCPU 27 +#define GDT_ENTRY_STACK_CANARY 28 + +#define GDT_ENTRY_DOUBLEFAULT_TSS 31 -#define GDT_ENTRY_KERNEL_BASE (12) +/* + * Number of entries in the GDT table: + */ +#define GDT_ENTRIES 32 -#define GDT_ENTRY_KERNEL_CS (GDT_ENTRY_KERNEL_BASE+0) +/* + * Segment selector values corresponding to the above entries: + */ -#define GDT_ENTRY_KERNEL_DS (GDT_ENTRY_KERNEL_BASE+1) +#define __KERNEL_CS (GDT_ENTRY_KERNEL_CS*8) +#define __KERNEL_DS (GDT_ENTRY_KERNEL_DS*8) +#define __USER_DS (GDT_ENTRY_DEFAULT_USER_DS*8 + 3) +#define __USER_CS (GDT_ENTRY_DEFAULT_USER_CS*8 + 3) +#define __ESPFIX_SS (GDT_ENTRY_ESPFIX_SS*8) -#define GDT_ENTRY_TSS (GDT_ENTRY_KERNEL_BASE+4) -#define GDT_ENTRY_LDT (GDT_ENTRY_KERNEL_BASE+5) +/* segment for calling fn: */ +#define PNP_CS32 (GDT_ENTRY_PNPBIOS_CS32*8) +/* code segment for BIOS: */ +#define PNP_CS16 (GDT_ENTRY_PNPBIOS_CS16*8) -#define GDT_ENTRY_PNPBIOS_BASE (GDT_ENTRY_KERNEL_BASE+6) -#define GDT_ENTRY_APMBIOS_BASE (GDT_ENTRY_KERNEL_BASE+11) +/* "Is this PNP code selector (PNP_CS32 or PNP_CS16)?" */ +#define SEGMENT_IS_PNP_CODE(x) (((x) & 0xf4) == PNP_CS32) -#define GDT_ENTRY_ESPFIX_SS (GDT_ENTRY_KERNEL_BASE+14) -#define __ESPFIX_SS (GDT_ENTRY_ESPFIX_SS*8) +/* data segment for BIOS: */ +#define PNP_DS (GDT_ENTRY_PNPBIOS_DS*8) +/* transfer data segment: */ +#define PNP_TS1 (GDT_ENTRY_PNPBIOS_TS1*8) +/* another data segment: */ +#define PNP_TS2 (GDT_ENTRY_PNPBIOS_TS2*8) -#define GDT_ENTRY_PERCPU (GDT_ENTRY_KERNEL_BASE+15) #ifdef CONFIG_SMP -#define __KERNEL_PERCPU (GDT_ENTRY_PERCPU * 8) +# define __KERNEL_PERCPU (GDT_ENTRY_PERCPU*8) #else -#define __KERNEL_PERCPU 0 +# define __KERNEL_PERCPU 0 #endif -#define GDT_ENTRY_STACK_CANARY (GDT_ENTRY_KERNEL_BASE+16) #ifdef CONFIG_CC_STACKPROTECTOR -#define __KERNEL_STACK_CANARY (GDT_ENTRY_STACK_CANARY*8) +# define __KERNEL_STACK_CANARY (GDT_ENTRY_STACK_CANARY*8) #else -#define __KERNEL_STACK_CANARY 0 +# define __KERNEL_STACK_CANARY 0 #endif -#define GDT_ENTRY_DOUBLEFAULT_TSS 31 - -/* - * The GDT has 32 entries - */ -#define GDT_ENTRIES 32 +#else /* 64-bit: */ -/* The PnP BIOS entries in the GDT */ -#define GDT_ENTRY_PNPBIOS_CS32 (GDT_ENTRY_PNPBIOS_BASE + 0) -#define GDT_ENTRY_PNPBIOS_CS16 (GDT_ENTRY_PNPBIOS_BASE + 1) -#define GDT_ENTRY_PNPBIOS_DS (GDT_ENTRY_PNPBIOS_BASE + 2) -#define GDT_ENTRY_PNPBIOS_TS1 (GDT_ENTRY_PNPBIOS_BASE + 3) -#define GDT_ENTRY_PNPBIOS_TS2 (GDT_ENTRY_PNPBIOS_BASE + 4) - -/* The PnP BIOS selectors */ -#define PNP_CS32 (GDT_ENTRY_PNPBIOS_CS32 * 8) /* segment for calling fn */ -#define PNP_CS16 (GDT_ENTRY_PNPBIOS_CS16 * 8) /* code segment for BIOS */ -#define PNP_DS (GDT_ENTRY_PNPBIOS_DS * 8) /* data segment for BIOS */ -#define PNP_TS1 (GDT_ENTRY_PNPBIOS_TS1 * 8) /* transfer data segment */ -#define PNP_TS2 (GDT_ENTRY_PNPBIOS_TS2 * 8) /* another data segment */ +#include <asm/cache.h> +#define GDT_ENTRY_KERNEL32_CS 1 +#define GDT_ENTRY_KERNEL_CS 2 +#define GDT_ENTRY_KERNEL_DS 3 /* - * Matching rules for certain types of segments. + * We cannot use the same code segment descriptor for user and kernel mode, + * not even in long flat mode, because of different DPL. + * + * GDT layout to get 64-bit SYSCALL/SYSRET support right. SYSRET hardcodes + * selectors: + * + * if returning to 32-bit userspace: cs = STAR.SYSRET_CS, + * if returning to 64-bit userspace: cs = STAR.SYSRET_CS+16, + * + * ss = STAR.SYSRET_CS+8 (in either case) + * + * thus USER_DS should be between 32-bit and 64-bit code selectors: */ +#define GDT_ENTRY_DEFAULT_USER32_CS 4 +#define GDT_ENTRY_DEFAULT_USER_DS 5 +#define GDT_ENTRY_DEFAULT_USER_CS 6 -/* Matches PNP_CS32 and PNP_CS16 (they must be consecutive) */ -#define SEGMENT_IS_PNP_CODE(x) (((x) & 0xf4) == GDT_ENTRY_PNPBIOS_BASE * 8) - +/* Needs two entries */ +#define GDT_ENTRY_TSS 8 +/* Needs two entries */ +#define GDT_ENTRY_LDT 10 -#else -#include <asm/cache.h> - -#define GDT_ENTRY_KERNEL32_CS 1 -#define GDT_ENTRY_KERNEL_CS 2 -#define GDT_ENTRY_KERNEL_DS 3 +#define GDT_ENTRY_TLS_MIN 12 +#define GDT_ENTRY_TLS_MAX 14 -#define __KERNEL32_CS (GDT_ENTRY_KERNEL32_CS * 8) +/* Abused to load per CPU data from limit */ +#define GDT_ENTRY_PER_CPU 15 /* - * we cannot use the same code segment descriptor for user and kernel - * -- not even in the long flat mode, because of different DPL /kkeil - * The segment offset needs to contain a RPL. Grr. -AK - * GDT layout to get 64bit syscall right (sysret hardcodes gdt offsets) + * Number of entries in the GDT table: */ -#define GDT_ENTRY_DEFAULT_USER32_CS 4 -#define GDT_ENTRY_DEFAULT_USER_DS 5 -#define GDT_ENTRY_DEFAULT_USER_CS 6 -#define __USER32_CS (GDT_ENTRY_DEFAULT_USER32_CS*8+3) -#define __USER32_DS __USER_DS - -#define GDT_ENTRY_TSS 8 /* needs two entries */ -#define GDT_ENTRY_LDT 10 /* needs two entries */ -#define GDT_ENTRY_TLS_MIN 12 -#define GDT_ENTRY_TLS_MAX 14 - -#define GDT_ENTRY_PER_CPU 15 /* Abused to load per CPU data from limit */ -#define __PER_CPU_SEG (GDT_ENTRY_PER_CPU * 8 + 3) +#define GDT_ENTRIES 16 -/* TLS indexes for 64bit - hardcoded in arch_prctl */ -#define FS_TLS 0 -#define GS_TLS 1 - -#define GS_TLS_SEL ((GDT_ENTRY_TLS_MIN+GS_TLS)*8 + 3) -#define FS_TLS_SEL ((GDT_ENTRY_TLS_MIN+FS_TLS)*8 + 3) - -#define GDT_ENTRIES 16 +/* + * Segment selector values corresponding to the above entries: + * + * Note, selectors also need to have a correct RPL, + * expressed with the +3 value for user-space selectors: + */ +#define __KERNEL32_CS (GDT_ENTRY_KERNEL32_CS*8) +#define __KERNEL_CS (GDT_ENTRY_KERNEL_CS*8) +#define __KERNEL_DS (GDT_ENTRY_KERNEL_DS*8) +#define __USER32_CS (GDT_ENTRY_DEFAULT_USER32_CS*8 + 3) +#define __USER_DS (GDT_ENTRY_DEFAULT_USER_DS*8 + 3) +#define __USER32_DS __USER_DS +#define __USER_CS (GDT_ENTRY_DEFAULT_USER_CS*8 + 3) +#define __PER_CPU_SEG (GDT_ENTRY_PER_CPU*8 + 3) + +/* TLS indexes for 64-bit - hardcoded in arch_prctl(): */ +#define FS_TLS 0 +#define GS_TLS 1 + +#define GS_TLS_SEL ((GDT_ENTRY_TLS_MIN+GS_TLS)*8 + 3) +#define FS_TLS_SEL ((GDT_ENTRY_TLS_MIN+FS_TLS)*8 + 3) #endif -#define __KERNEL_CS (GDT_ENTRY_KERNEL_CS*8) -#define __KERNEL_DS (GDT_ENTRY_KERNEL_DS*8) -#define __USER_DS (GDT_ENTRY_DEFAULT_USER_DS*8+3) -#define __USER_CS (GDT_ENTRY_DEFAULT_USER_CS*8+3) #ifndef CONFIG_PARAVIRT -#define get_kernel_rpl() 0 +# define get_kernel_rpl() 0 #endif -#define IDT_ENTRIES 256 -#define NUM_EXCEPTION_VECTORS 32 -/* Bitmask of exception vectors which push an error code on the stack */ -#define EXCEPTION_ERRCODE_MASK 0x00027d00 -#define GDT_SIZE (GDT_ENTRIES * 8) -#define GDT_ENTRY_TLS_ENTRIES 3 -#define TLS_SIZE (GDT_ENTRY_TLS_ENTRIES * 8) +#define IDT_ENTRIES 256 +#define NUM_EXCEPTION_VECTORS 32 + +/* Bitmask of exception vectors which push an error code on the stack: */ +#define EXCEPTION_ERRCODE_MASK 0x00027d00 + +#define GDT_SIZE (GDT_ENTRIES*8) +#define GDT_ENTRY_TLS_ENTRIES 3 +#define TLS_SIZE (GDT_ENTRY_TLS_ENTRIES* 8) #ifdef __KERNEL__ #ifndef __ASSEMBLY__ + extern const char early_idt_handlers[NUM_EXCEPTION_VECTORS][2+2+5]; #ifdef CONFIG_TRACING -#define trace_early_idt_handlers early_idt_handlers +# define trace_early_idt_handlers early_idt_handlers #endif /* @@ -228,37 +260,30 @@ do { \ } while (0) /* - * Save a segment register away + * Save a segment register away: */ #define savesegment(seg, value) \ asm("mov %%" #seg ",%0":"=r" (value) : : "memory") /* - * x86_32 user gs accessors. + * x86-32 user GS accessors: */ #ifdef CONFIG_X86_32 -#ifdef CONFIG_X86_32_LAZY_GS -#define get_user_gs(regs) (u16)({unsigned long v; savesegment(gs, v); v;}) -#define set_user_gs(regs, v) loadsegment(gs, (unsigned long)(v)) -#define task_user_gs(tsk) ((tsk)->thread.gs) -#define lazy_save_gs(v) savesegment(gs, (v)) -#define lazy_load_gs(v) loadsegment(gs, (v)) -#else /* X86_32_LAZY_GS */ -#define get_user_gs(regs) (u16)((regs)->gs) -#define set_user_gs(regs, v) do { (regs)->gs = (v); } while (0) -#define task_user_gs(tsk) (task_pt_regs(tsk)->gs) -#define lazy_save_gs(v) do { } while (0) -#define lazy_load_gs(v) do { } while (0) -#endif /* X86_32_LAZY_GS */ +# ifdef CONFIG_X86_32_LAZY_GS +# define get_user_gs(regs) (u16)({ unsigned long v; savesegment(gs, v); v; }) +# define set_user_gs(regs, v) loadsegment(gs, (unsigned long)(v)) +# define task_user_gs(tsk) ((tsk)->thread.gs) +# define lazy_save_gs(v) savesegment(gs, (v)) +# define lazy_load_gs(v) loadsegment(gs, (v)) +# else /* X86_32_LAZY_GS */ +# define get_user_gs(regs) (u16)((regs)->gs) +# define set_user_gs(regs, v) do { (regs)->gs = (v); } while (0) +# define task_user_gs(tsk) (task_pt_regs(tsk)->gs) +# define lazy_save_gs(v) do { } while (0) +# define lazy_load_gs(v) do { } while (0) +# endif /* X86_32_LAZY_GS */ #endif /* X86_32 */ -static inline unsigned long get_limit(unsigned long segment) -{ - unsigned long __limit; - asm("lsll %1,%0" : "=r" (__limit) : "r" (segment)); - return __limit + 1; -} - #endif /* !__ASSEMBLY__ */ #endif /* __KERNEL__ */ diff --git a/arch/x86/include/asm/setup.h b/arch/x86/include/asm/setup.h index ff4e7b236e21..f69e06b283fb 100644 --- a/arch/x86/include/asm/setup.h +++ b/arch/x86/include/asm/setup.h @@ -66,6 +66,11 @@ static inline void x86_ce4100_early_setup(void) { } */ extern struct boot_params boot_params; +static inline bool kaslr_enabled(void) +{ + return !!(boot_params.hdr.loadflags & KASLR_FLAG); +} + /* * Do NOT EVER look at the BIOS memory size location. * It does not work on many machines. diff --git a/arch/x86/include/asm/sigcontext.h b/arch/x86/include/asm/sigcontext.h index 9dfce4e0417d..6fe6b182c998 100644 --- a/arch/x86/include/asm/sigcontext.h +++ b/arch/x86/include/asm/sigcontext.h @@ -57,9 +57,9 @@ struct sigcontext { unsigned long ip; unsigned long flags; unsigned short cs; - unsigned short gs; - unsigned short fs; - unsigned short __pad0; + unsigned short __pad2; /* Was called gs, but was always zero. */ + unsigned short __pad1; /* Was called fs, but was always zero. */ + unsigned short ss; unsigned long err; unsigned long trapno; unsigned long oldmask; diff --git a/arch/x86/include/asm/sighandling.h b/arch/x86/include/asm/sighandling.h index 7a958164088c..89db46752a8f 100644 --- a/arch/x86/include/asm/sighandling.h +++ b/arch/x86/include/asm/sighandling.h @@ -13,9 +13,7 @@ X86_EFLAGS_CF | X86_EFLAGS_RF) void signal_fault(struct pt_regs *regs, void __user *frame, char *where); - -int restore_sigcontext(struct pt_regs *regs, struct sigcontext __user *sc, - unsigned long *pax); +int restore_sigcontext(struct pt_regs *regs, struct sigcontext __user *sc); int setup_sigcontext(struct sigcontext __user *sc, void __user *fpstate, struct pt_regs *regs, unsigned long mask); diff --git a/arch/x86/include/asm/smap.h b/arch/x86/include/asm/smap.h index 8d3120f4e270..ba665ebd17bb 100644 --- a/arch/x86/include/asm/smap.h +++ b/arch/x86/include/asm/smap.h @@ -27,23 +27,11 @@ #ifdef CONFIG_X86_SMAP -#define ASM_CLAC \ - 661: ASM_NOP3 ; \ - .pushsection .altinstr_replacement, "ax" ; \ - 662: __ASM_CLAC ; \ - .popsection ; \ - .pushsection .altinstructions, "a" ; \ - altinstruction_entry 661b, 662b, X86_FEATURE_SMAP, 3, 3 ; \ - .popsection - -#define ASM_STAC \ - 661: ASM_NOP3 ; \ - .pushsection .altinstr_replacement, "ax" ; \ - 662: __ASM_STAC ; \ - .popsection ; \ - .pushsection .altinstructions, "a" ; \ - altinstruction_entry 661b, 662b, X86_FEATURE_SMAP, 3, 3 ; \ - .popsection +#define ASM_CLAC \ + ALTERNATIVE "", __stringify(__ASM_CLAC), X86_FEATURE_SMAP + +#define ASM_STAC \ + ALTERNATIVE "", __stringify(__ASM_STAC), X86_FEATURE_SMAP #else /* CONFIG_X86_SMAP */ @@ -61,20 +49,20 @@ static __always_inline void clac(void) { /* Note: a barrier is implicit in alternative() */ - alternative(ASM_NOP3, __stringify(__ASM_CLAC), X86_FEATURE_SMAP); + alternative("", __stringify(__ASM_CLAC), X86_FEATURE_SMAP); } static __always_inline void stac(void) { /* Note: a barrier is implicit in alternative() */ - alternative(ASM_NOP3, __stringify(__ASM_STAC), X86_FEATURE_SMAP); + alternative("", __stringify(__ASM_STAC), X86_FEATURE_SMAP); } /* These macros can be used in asm() statements */ #define ASM_CLAC \ - ALTERNATIVE(ASM_NOP3, __stringify(__ASM_CLAC), X86_FEATURE_SMAP) + ALTERNATIVE("", __stringify(__ASM_CLAC), X86_FEATURE_SMAP) #define ASM_STAC \ - ALTERNATIVE(ASM_NOP3, __stringify(__ASM_STAC), X86_FEATURE_SMAP) + ALTERNATIVE("", __stringify(__ASM_STAC), X86_FEATURE_SMAP) #else /* CONFIG_X86_SMAP */ diff --git a/arch/x86/include/asm/smp.h b/arch/x86/include/asm/smp.h index 8cd1cc3bc835..81d02fc7dafa 100644 --- a/arch/x86/include/asm/smp.h +++ b/arch/x86/include/asm/smp.h @@ -154,6 +154,7 @@ void cpu_die_common(unsigned int cpu); void native_smp_prepare_boot_cpu(void); void native_smp_prepare_cpus(unsigned int max_cpus); void native_smp_cpus_done(unsigned int max_cpus); +void common_cpu_up(unsigned int cpunum, struct task_struct *tidle); int native_cpu_up(unsigned int cpunum, struct task_struct *tidle); int native_cpu_disable(void); void native_cpu_die(unsigned int cpu); diff --git a/arch/x86/include/asm/special_insns.h b/arch/x86/include/asm/special_insns.h index 6a4b00fafb00..aeb4666e0c0a 100644 --- a/arch/x86/include/asm/special_insns.h +++ b/arch/x86/include/asm/special_insns.h @@ -4,6 +4,8 @@ #ifdef __KERNEL__ +#include <asm/nops.h> + static inline void native_clts(void) { asm volatile("clts"); @@ -199,6 +201,28 @@ static inline void clflushopt(volatile void *__p) "+m" (*(volatile char __force *)__p)); } +static inline void clwb(volatile void *__p) +{ + volatile struct { char x[64]; } *p = __p; + + asm volatile(ALTERNATIVE_2( + ".byte " __stringify(NOP_DS_PREFIX) "; clflush (%[pax])", + ".byte 0x66; clflush (%[pax])", /* clflushopt (%%rax) */ + X86_FEATURE_CLFLUSHOPT, + ".byte 0x66, 0x0f, 0xae, 0x30", /* clwb (%%rax) */ + X86_FEATURE_CLWB) + : [p] "+m" (*p) + : [pax] "a" (p)); +} + +static inline void pcommit_sfence(void) +{ + alternative(ASM_NOP7, + ".byte 0x66, 0x0f, 0xae, 0xf8\n\t" /* pcommit */ + "sfence", + X86_FEATURE_PCOMMIT); +} + #define nop() asm volatile ("nop") diff --git a/arch/x86/include/asm/thread_info.h b/arch/x86/include/asm/thread_info.h index 1d4e4f279a32..ea2dbe82cba3 100644 --- a/arch/x86/include/asm/thread_info.h +++ b/arch/x86/include/asm/thread_info.h @@ -13,6 +13,33 @@ #include <asm/types.h> /* + * TOP_OF_KERNEL_STACK_PADDING is a number of unused bytes that we + * reserve at the top of the kernel stack. We do it because of a nasty + * 32-bit corner case. On x86_32, the hardware stack frame is + * variable-length. Except for vm86 mode, struct pt_regs assumes a + * maximum-length frame. If we enter from CPL 0, the top 8 bytes of + * pt_regs don't actually exist. Ordinarily this doesn't matter, but it + * does in at least one case: + * + * If we take an NMI early enough in SYSENTER, then we can end up with + * pt_regs that extends above sp0. On the way out, in the espfix code, + * we can read the saved SS value, but that value will be above sp0. + * Without this offset, that can result in a page fault. (We are + * careful that, in this case, the value we read doesn't matter.) + * + * In vm86 mode, the hardware frame is much longer still, but we neither + * access the extra members from NMI context, nor do we write such a + * frame at sp0 at all. + * + * x86_64 has a fixed-length stack frame. + */ +#ifdef CONFIG_X86_32 +# define TOP_OF_KERNEL_STACK_PADDING 8 +#else +# define TOP_OF_KERNEL_STACK_PADDING 0 +#endif + +/* * low level task data that entry.S needs immediate access to * - this struct should fit entirely inside of one cache line * - this struct shares the supervisor stack pages @@ -145,7 +172,6 @@ struct thread_info { #define _TIF_WORK_CTXSW_NEXT (_TIF_WORK_CTXSW) #define STACK_WARN (THREAD_SIZE/8) -#define KERNEL_STACK_OFFSET (5*(BITS_PER_LONG/8)) /* * macros/functions for gaining access to the thread information structure @@ -158,10 +184,7 @@ DECLARE_PER_CPU(unsigned long, kernel_stack); static inline struct thread_info *current_thread_info(void) { - struct thread_info *ti; - ti = (void *)(this_cpu_read_stable(kernel_stack) + - KERNEL_STACK_OFFSET - THREAD_SIZE); - return ti; + return (struct thread_info *)(current_top_of_stack() - THREAD_SIZE); } static inline unsigned long current_stack_pointer(void) @@ -177,16 +200,37 @@ static inline unsigned long current_stack_pointer(void) #else /* !__ASSEMBLY__ */ -/* how to get the thread information struct from ASM */ +/* Load thread_info address into "reg" */ #define GET_THREAD_INFO(reg) \ _ASM_MOV PER_CPU_VAR(kernel_stack),reg ; \ - _ASM_SUB $(THREAD_SIZE-KERNEL_STACK_OFFSET),reg ; + _ASM_SUB $(THREAD_SIZE),reg ; /* - * Same if PER_CPU_VAR(kernel_stack) is, perhaps with some offset, already in - * a certain register (to be used in assembler memory operands). + * ASM operand which evaluates to a 'thread_info' address of + * the current task, if it is known that "reg" is exactly "off" + * bytes below the top of the stack currently. + * + * ( The kernel stack's size is known at build time, it is usually + * 2 or 4 pages, and the bottom of the kernel stack contains + * the thread_info structure. So to access the thread_info very + * quickly from assembly code we can calculate down from the + * top of the kernel stack to the bottom, using constant, + * build-time calculations only. ) + * + * For example, to fetch the current thread_info->flags value into %eax + * on x86-64 defconfig kernels, in syscall entry code where RSP is + * currently at exactly SIZEOF_PTREGS bytes away from the top of the + * stack: + * + * mov ASM_THREAD_INFO(TI_flags, %rsp, SIZEOF_PTREGS), %eax + * + * will translate to: + * + * 8b 84 24 b8 c0 ff ff mov -0x3f48(%rsp), %eax + * + * which is below the current RSP by almost 16K. */ -#define THREAD_INFO(reg, off) KERNEL_STACK_OFFSET+(off)-THREAD_SIZE(reg) +#define ASM_THREAD_INFO(field, reg, off) ((field)+(off)-THREAD_SIZE)(reg) #endif @@ -236,6 +280,16 @@ static inline bool is_ia32_task(void) #endif return false; } + +/* + * Force syscall return via IRET by making it look as if there was + * some work pending. IRET is our most capable (but slowest) syscall + * return path, which is able to restore modified SS, CS and certain + * EFLAGS values that other (fast) syscall return instructions + * are not able to restore properly. + */ +#define force_iret() set_thread_flag(TIF_NOTIFY_RESUME) + #endif /* !__ASSEMBLY__ */ #ifndef __ASSEMBLY__ diff --git a/arch/x86/include/asm/uaccess_64.h b/arch/x86/include/asm/uaccess_64.h index 12a26b979bf1..f2f9b39b274a 100644 --- a/arch/x86/include/asm/uaccess_64.h +++ b/arch/x86/include/asm/uaccess_64.h @@ -231,6 +231,6 @@ __copy_from_user_inatomic_nocache(void *dst, const void __user *src, } unsigned long -copy_user_handle_tail(char *to, char *from, unsigned len, unsigned zerorest); +copy_user_handle_tail(char *to, char *from, unsigned len); #endif /* _ASM_X86_UACCESS_64_H */ diff --git a/arch/x86/include/asm/xsave.h b/arch/x86/include/asm/xsave.h index 5fa9770035dc..c9a6d68b8d62 100644 --- a/arch/x86/include/asm/xsave.h +++ b/arch/x86/include/asm/xsave.h @@ -82,18 +82,15 @@ static inline int xsave_state_booting(struct xsave_struct *fx, u64 mask) if (boot_cpu_has(X86_FEATURE_XSAVES)) asm volatile("1:"XSAVES"\n\t" "2:\n\t" - : : "D" (fx), "m" (*fx), "a" (lmask), "d" (hmask) + xstate_fault + : "D" (fx), "m" (*fx), "a" (lmask), "d" (hmask) : "memory"); else asm volatile("1:"XSAVE"\n\t" "2:\n\t" - : : "D" (fx), "m" (*fx), "a" (lmask), "d" (hmask) + xstate_fault + : "D" (fx), "m" (*fx), "a" (lmask), "d" (hmask) : "memory"); - - asm volatile(xstate_fault - : "0" (0) - : "memory"); - return err; } @@ -112,18 +109,15 @@ static inline int xrstor_state_booting(struct xsave_struct *fx, u64 mask) if (boot_cpu_has(X86_FEATURE_XSAVES)) asm volatile("1:"XRSTORS"\n\t" "2:\n\t" - : : "D" (fx), "m" (*fx), "a" (lmask), "d" (hmask) + xstate_fault + : "D" (fx), "m" (*fx), "a" (lmask), "d" (hmask) : "memory"); else asm volatile("1:"XRSTOR"\n\t" "2:\n\t" - : : "D" (fx), "m" (*fx), "a" (lmask), "d" (hmask) + xstate_fault + : "D" (fx), "m" (*fx), "a" (lmask), "d" (hmask) : "memory"); - - asm volatile(xstate_fault - : "0" (0) - : "memory"); - return err; } @@ -149,9 +143,9 @@ static inline int xsave_state(struct xsave_struct *fx, u64 mask) */ alternative_input_2( "1:"XSAVE, - "1:"XSAVEOPT, + XSAVEOPT, X86_FEATURE_XSAVEOPT, - "1:"XSAVES, + XSAVES, X86_FEATURE_XSAVES, [fx] "D" (fx), "a" (lmask), "d" (hmask) : "memory"); @@ -178,7 +172,7 @@ static inline int xrstor_state(struct xsave_struct *fx, u64 mask) */ alternative_input( "1: " XRSTOR, - "1: " XRSTORS, + XRSTORS, X86_FEATURE_XSAVES, "D" (fx), "m" (*fx), "a" (lmask), "d" (hmask) : "memory"); diff --git a/arch/x86/include/uapi/asm/bootparam.h b/arch/x86/include/uapi/asm/bootparam.h index 44e6dd7e36a2..ab456dc233b5 100644 --- a/arch/x86/include/uapi/asm/bootparam.h +++ b/arch/x86/include/uapi/asm/bootparam.h @@ -7,7 +7,6 @@ #define SETUP_DTB 2 #define SETUP_PCI 3 #define SETUP_EFI 4 -#define SETUP_KASLR 5 /* ram_size flags */ #define RAMDISK_IMAGE_START_MASK 0x07FF @@ -16,6 +15,7 @@ /* loadflags */ #define LOADED_HIGH (1<<0) +#define KASLR_FLAG (1<<1) #define QUIET_FLAG (1<<5) #define KEEP_SEGMENTS (1<<6) #define CAN_USE_HEAP (1<<7) diff --git a/arch/x86/include/uapi/asm/ptrace-abi.h b/arch/x86/include/uapi/asm/ptrace-abi.h index 7b0a55a88851..580aee3072e0 100644 --- a/arch/x86/include/uapi/asm/ptrace-abi.h +++ b/arch/x86/include/uapi/asm/ptrace-abi.h @@ -25,13 +25,17 @@ #else /* __i386__ */ #if defined(__ASSEMBLY__) || defined(__FRAME_OFFSETS) +/* + * C ABI says these regs are callee-preserved. They aren't saved on kernel entry + * unless syscall needs a complete, fully filled "struct pt_regs". + */ #define R15 0 #define R14 8 #define R13 16 #define R12 24 #define RBP 32 #define RBX 40 -/* arguments: interrupts/non tracing syscalls only save up to here*/ +/* These regs are callee-clobbered. Always saved on kernel entry. */ #define R11 48 #define R10 56 #define R9 64 @@ -41,15 +45,17 @@ #define RDX 96 #define RSI 104 #define RDI 112 -#define ORIG_RAX 120 /* = ERROR */ -/* end of arguments */ -/* cpu exception frame or undefined in case of fast syscall. */ +/* + * On syscall entry, this is syscall#. On CPU exception, this is error code. + * On hw interrupt, it's IRQ number: + */ +#define ORIG_RAX 120 +/* Return frame for iretq */ #define RIP 128 #define CS 136 #define EFLAGS 144 #define RSP 152 #define SS 160 -#define ARGOFFSET R11 #endif /* __ASSEMBLY__ */ /* top of stack page */ diff --git a/arch/x86/include/uapi/asm/ptrace.h b/arch/x86/include/uapi/asm/ptrace.h index ac4b9aa4d999..bc16115af39b 100644 --- a/arch/x86/include/uapi/asm/ptrace.h +++ b/arch/x86/include/uapi/asm/ptrace.h @@ -41,13 +41,17 @@ struct pt_regs { #ifndef __KERNEL__ struct pt_regs { +/* + * C ABI says these regs are callee-preserved. They aren't saved on kernel entry + * unless syscall needs a complete, fully filled "struct pt_regs". + */ unsigned long r15; unsigned long r14; unsigned long r13; unsigned long r12; unsigned long rbp; unsigned long rbx; -/* arguments: non interrupts/non tracing syscalls only save up to here*/ +/* These regs are callee-clobbered. Always saved on kernel entry. */ unsigned long r11; unsigned long r10; unsigned long r9; @@ -57,9 +61,12 @@ struct pt_regs { unsigned long rdx; unsigned long rsi; unsigned long rdi; +/* + * On syscall entry, this is syscall#. On CPU exception, this is error code. + * On hw interrupt, it's IRQ number: + */ unsigned long orig_rax; -/* end of arguments */ -/* cpu exception frame or undefined */ +/* Return frame for iretq */ unsigned long rip; unsigned long cs; unsigned long eflags; diff --git a/arch/x86/include/uapi/asm/sigcontext.h b/arch/x86/include/uapi/asm/sigcontext.h index d8b9f9081e86..16dc4e8a2cd3 100644 --- a/arch/x86/include/uapi/asm/sigcontext.h +++ b/arch/x86/include/uapi/asm/sigcontext.h @@ -177,9 +177,24 @@ struct sigcontext { __u64 rip; __u64 eflags; /* RFLAGS */ __u16 cs; - __u16 gs; - __u16 fs; - __u16 __pad0; + + /* + * Prior to 2.5.64 ("[PATCH] x86-64 updates for 2.5.64-bk3"), + * Linux saved and restored fs and gs in these slots. This + * was counterproductive, as fsbase and gsbase were never + * saved, so arch_prctl was presumably unreliable. + * + * If these slots are ever needed for any other purpose, there + * is some risk that very old 64-bit binaries could get + * confused. I doubt that many such binaries still work, + * though, since the same patch in 2.5.64 also removed the + * 64-bit set_thread_area syscall, so it appears that there is + * no TLS API that works in both pre- and post-2.5.64 kernels. + */ + __u16 __pad2; /* Was gs. */ + __u16 __pad1; /* Was fs. */ + + __u16 ss; __u64 err; __u64 trapno; __u64 oldmask; diff --git a/arch/x86/include/uapi/asm/vmx.h b/arch/x86/include/uapi/asm/vmx.h index c5f1a1deb91a..1fe92181ee9e 100644 --- a/arch/x86/include/uapi/asm/vmx.h +++ b/arch/x86/include/uapi/asm/vmx.h @@ -67,6 +67,7 @@ #define EXIT_REASON_EPT_VIOLATION 48 #define EXIT_REASON_EPT_MISCONFIG 49 #define EXIT_REASON_INVEPT 50 +#define EXIT_REASON_RDTSCP 51 #define EXIT_REASON_PREEMPTION_TIMER 52 #define EXIT_REASON_INVVPID 53 #define EXIT_REASON_WBINVD 54 diff --git a/arch/x86/kernel/Makefile b/arch/x86/kernel/Makefile index cdb1b70ddad0..c887cd944f0c 100644 --- a/arch/x86/kernel/Makefile +++ b/arch/x86/kernel/Makefile @@ -32,6 +32,7 @@ obj-$(CONFIG_X86_32) += i386_ksyms_32.o obj-$(CONFIG_X86_64) += sys_x86_64.o x8664_ksyms_64.o obj-$(CONFIG_X86_64) += mcount_64.o obj-y += syscall_$(BITS).o vsyscall_gtod.o +obj-$(CONFIG_IA32_EMULATION) += syscall_32.o obj-$(CONFIG_X86_VSYSCALL_EMULATION) += vsyscall_64.o vsyscall_emu_64.o obj-$(CONFIG_X86_ESPFIX64) += espfix_64.o obj-$(CONFIG_SYSFS) += ksysfs.o diff --git a/arch/x86/kernel/acpi/boot.c b/arch/x86/kernel/acpi/boot.c index 3d525c6124f6..803b684676ff 100644 --- a/arch/x86/kernel/acpi/boot.c +++ b/arch/x86/kernel/acpi/boot.c @@ -1338,6 +1338,26 @@ static int __init dmi_ignore_irq0_timer_override(const struct dmi_system_id *d) } /* + * ACPI offers an alternative platform interface model that removes + * ACPI hardware requirements for platforms that do not implement + * the PC Architecture. + * + * We initialize the Hardware-reduced ACPI model here: + */ +static void __init acpi_reduced_hw_init(void) +{ + if (acpi_gbl_reduced_hardware) { + /* + * Override x86_init functions and bypass legacy pic + * in Hardware-reduced ACPI mode + */ + x86_init.timers.timer_init = x86_init_noop; + x86_init.irqs.pre_vector_init = x86_init_noop; + legacy_pic = &null_legacy_pic; + } +} + +/* * If your system is blacklisted here, but you find that acpi=force * works for you, please contact linux-acpi@vger.kernel.org */ @@ -1536,6 +1556,11 @@ int __init early_acpi_boot_init(void) */ early_acpi_process_madt(); + /* + * Hardware-reduced ACPI mode initialization: + */ + acpi_reduced_hw_init(); + return 0; } diff --git a/arch/x86/kernel/alternative.c b/arch/x86/kernel/alternative.c index 703130f469ec..aef653193160 100644 --- a/arch/x86/kernel/alternative.c +++ b/arch/x86/kernel/alternative.c @@ -52,10 +52,25 @@ static int __init setup_noreplace_paravirt(char *str) __setup("noreplace-paravirt", setup_noreplace_paravirt); #endif -#define DPRINTK(fmt, ...) \ -do { \ - if (debug_alternative) \ - printk(KERN_DEBUG fmt, ##__VA_ARGS__); \ +#define DPRINTK(fmt, args...) \ +do { \ + if (debug_alternative) \ + printk(KERN_DEBUG "%s: " fmt "\n", __func__, ##args); \ +} while (0) + +#define DUMP_BYTES(buf, len, fmt, args...) \ +do { \ + if (unlikely(debug_alternative)) { \ + int j; \ + \ + if (!(len)) \ + break; \ + \ + printk(KERN_DEBUG fmt, ##args); \ + for (j = 0; j < (len) - 1; j++) \ + printk(KERN_CONT "%02hhx ", buf[j]); \ + printk(KERN_CONT "%02hhx\n", buf[j]); \ + } \ } while (0) /* @@ -243,12 +258,89 @@ extern struct alt_instr __alt_instructions[], __alt_instructions_end[]; extern s32 __smp_locks[], __smp_locks_end[]; void *text_poke_early(void *addr, const void *opcode, size_t len); -/* Replace instructions with better alternatives for this CPU type. - This runs before SMP is initialized to avoid SMP problems with - self modifying code. This implies that asymmetric systems where - APs have less capabilities than the boot processor are not handled. - Tough. Make sure you disable such features by hand. */ +/* + * Are we looking at a near JMP with a 1 or 4-byte displacement. + */ +static inline bool is_jmp(const u8 opcode) +{ + return opcode == 0xeb || opcode == 0xe9; +} + +static void __init_or_module +recompute_jump(struct alt_instr *a, u8 *orig_insn, u8 *repl_insn, u8 *insnbuf) +{ + u8 *next_rip, *tgt_rip; + s32 n_dspl, o_dspl; + int repl_len; + + if (a->replacementlen != 5) + return; + + o_dspl = *(s32 *)(insnbuf + 1); + + /* next_rip of the replacement JMP */ + next_rip = repl_insn + a->replacementlen; + /* target rip of the replacement JMP */ + tgt_rip = next_rip + o_dspl; + n_dspl = tgt_rip - orig_insn; + + DPRINTK("target RIP: %p, new_displ: 0x%x", tgt_rip, n_dspl); + + if (tgt_rip - orig_insn >= 0) { + if (n_dspl - 2 <= 127) + goto two_byte_jmp; + else + goto five_byte_jmp; + /* negative offset */ + } else { + if (((n_dspl - 2) & 0xff) == (n_dspl - 2)) + goto two_byte_jmp; + else + goto five_byte_jmp; + } + +two_byte_jmp: + n_dspl -= 2; + + insnbuf[0] = 0xeb; + insnbuf[1] = (s8)n_dspl; + add_nops(insnbuf + 2, 3); + + repl_len = 2; + goto done; + +five_byte_jmp: + n_dspl -= 5; + + insnbuf[0] = 0xe9; + *(s32 *)&insnbuf[1] = n_dspl; + repl_len = 5; + +done: + + DPRINTK("final displ: 0x%08x, JMP 0x%lx", + n_dspl, (unsigned long)orig_insn + n_dspl + repl_len); +} + +static void __init_or_module optimize_nops(struct alt_instr *a, u8 *instr) +{ + if (instr[0] != 0x90) + return; + + add_nops(instr + (a->instrlen - a->padlen), a->padlen); + + DUMP_BYTES(instr, a->instrlen, "%p: [%d:%d) optimized NOPs: ", + instr, a->instrlen - a->padlen, a->padlen); +} + +/* + * Replace instructions with better alternatives for this CPU type. This runs + * before SMP is initialized to avoid SMP problems with self modifying code. + * This implies that asymmetric systems where APs have less capabilities than + * the boot processor are not handled. Tough. Make sure you disable such + * features by hand. + */ void __init_or_module apply_alternatives(struct alt_instr *start, struct alt_instr *end) { @@ -256,10 +348,10 @@ void __init_or_module apply_alternatives(struct alt_instr *start, u8 *instr, *replacement; u8 insnbuf[MAX_PATCH_LEN]; - DPRINTK("%s: alt table %p -> %p\n", __func__, start, end); + DPRINTK("alt table %p -> %p", start, end); /* * The scan order should be from start to end. A later scanned - * alternative code can overwrite a previous scanned alternative code. + * alternative code can overwrite previously scanned alternative code. * Some kernel functions (e.g. memcpy, memset, etc) use this order to * patch code. * @@ -267,29 +359,54 @@ void __init_or_module apply_alternatives(struct alt_instr *start, * order. */ for (a = start; a < end; a++) { + int insnbuf_sz = 0; + instr = (u8 *)&a->instr_offset + a->instr_offset; replacement = (u8 *)&a->repl_offset + a->repl_offset; - BUG_ON(a->replacementlen > a->instrlen); BUG_ON(a->instrlen > sizeof(insnbuf)); BUG_ON(a->cpuid >= (NCAPINTS + NBUGINTS) * 32); - if (!boot_cpu_has(a->cpuid)) + if (!boot_cpu_has(a->cpuid)) { + if (a->padlen > 1) + optimize_nops(a, instr); + continue; + } + + DPRINTK("feat: %d*32+%d, old: (%p, len: %d), repl: (%p, len: %d), pad: %d", + a->cpuid >> 5, + a->cpuid & 0x1f, + instr, a->instrlen, + replacement, a->replacementlen, a->padlen); + + DUMP_BYTES(instr, a->instrlen, "%p: old_insn: ", instr); + DUMP_BYTES(replacement, a->replacementlen, "%p: rpl_insn: ", replacement); memcpy(insnbuf, replacement, a->replacementlen); + insnbuf_sz = a->replacementlen; /* 0xe8 is a relative jump; fix the offset. */ - if (*insnbuf == 0xe8 && a->replacementlen == 5) - *(s32 *)(insnbuf + 1) += replacement - instr; + if (*insnbuf == 0xe8 && a->replacementlen == 5) { + *(s32 *)(insnbuf + 1) += replacement - instr; + DPRINTK("Fix CALL offset: 0x%x, CALL 0x%lx", + *(s32 *)(insnbuf + 1), + (unsigned long)instr + *(s32 *)(insnbuf + 1) + 5); + } + + if (a->replacementlen && is_jmp(replacement[0])) + recompute_jump(a, instr, replacement, insnbuf); - add_nops(insnbuf + a->replacementlen, - a->instrlen - a->replacementlen); + if (a->instrlen > a->replacementlen) { + add_nops(insnbuf + a->replacementlen, + a->instrlen - a->replacementlen); + insnbuf_sz += a->instrlen - a->replacementlen; + } + DUMP_BYTES(insnbuf, insnbuf_sz, "%p: final_insn: ", instr); - text_poke_early(instr, insnbuf, a->instrlen); + text_poke_early(instr, insnbuf, insnbuf_sz); } } #ifdef CONFIG_SMP - static void alternatives_smp_lock(const s32 *start, const s32 *end, u8 *text, u8 *text_end) { @@ -371,8 +488,8 @@ void __init_or_module alternatives_smp_module_add(struct module *mod, smp->locks_end = locks_end; smp->text = text; smp->text_end = text_end; - DPRINTK("%s: locks %p -> %p, text %p -> %p, name %s\n", - __func__, smp->locks, smp->locks_end, + DPRINTK("locks %p -> %p, text %p -> %p, name %s\n", + smp->locks, smp->locks_end, smp->text, smp->text_end, smp->name); list_add_tail(&smp->next, &smp_alt_modules); @@ -440,7 +557,7 @@ int alternatives_text_reserved(void *start, void *end) return 0; } -#endif +#endif /* CONFIG_SMP */ #ifdef CONFIG_PARAVIRT void __init_or_module apply_paravirt(struct paravirt_patch_site *start, @@ -601,7 +718,7 @@ int poke_int3_handler(struct pt_regs *regs) if (likely(!bp_patching_in_progress)) return 0; - if (user_mode_vm(regs) || regs->ip != (unsigned long)bp_int3_addr) + if (user_mode(regs) || regs->ip != (unsigned long)bp_int3_addr) return 0; /* set up the specified breakpoint handler */ diff --git a/arch/x86/kernel/apic/apic.c b/arch/x86/kernel/apic/apic.c index ad3639ae1b9b..dcb52850a28f 100644 --- a/arch/x86/kernel/apic/apic.c +++ b/arch/x86/kernel/apic/apic.c @@ -1084,67 +1084,6 @@ void lapic_shutdown(void) local_irq_restore(flags); } -/* - * This is to verify that we're looking at a real local APIC. - * Check these against your board if the CPUs aren't getting - * started for no apparent reason. - */ -int __init verify_local_APIC(void) -{ - unsigned int reg0, reg1; - - /* - * The version register is read-only in a real APIC. - */ - reg0 = apic_read(APIC_LVR); - apic_printk(APIC_DEBUG, "Getting VERSION: %x\n", reg0); - apic_write(APIC_LVR, reg0 ^ APIC_LVR_MASK); - reg1 = apic_read(APIC_LVR); - apic_printk(APIC_DEBUG, "Getting VERSION: %x\n", reg1); - - /* - * The two version reads above should print the same - * numbers. If the second one is different, then we - * poke at a non-APIC. - */ - if (reg1 != reg0) - return 0; - - /* - * Check if the version looks reasonably. - */ - reg1 = GET_APIC_VERSION(reg0); - if (reg1 == 0x00 || reg1 == 0xff) - return 0; - reg1 = lapic_get_maxlvt(); - if (reg1 < 0x02 || reg1 == 0xff) - return 0; - - /* - * The ID register is read/write in a real APIC. - */ - reg0 = apic_read(APIC_ID); - apic_printk(APIC_DEBUG, "Getting ID: %x\n", reg0); - apic_write(APIC_ID, reg0 ^ apic->apic_id_mask); - reg1 = apic_read(APIC_ID); - apic_printk(APIC_DEBUG, "Getting ID: %x\n", reg1); - apic_write(APIC_ID, reg0); - if (reg1 != (reg0 ^ apic->apic_id_mask)) - return 0; - - /* - * The next two are just to see if we have sane values. - * They're only really relevant if we're in Virtual Wire - * compatibility mode, but most boxes are anymore. - */ - reg0 = apic_read(APIC_LVT0); - apic_printk(APIC_DEBUG, "Getting LVT0: %x\n", reg0); - reg1 = apic_read(APIC_LVT1); - apic_printk(APIC_DEBUG, "Getting LVT1: %x\n", reg1); - - return 1; -} - /** * sync_Arb_IDs - synchronize APIC bus arbitration IDs */ @@ -2283,7 +2222,6 @@ int __init APIC_init_uniprocessor(void) disable_ioapic_support(); default_setup_apic_routing(); - verify_local_APIC(); apic_bsp_setup(true); return 0; } diff --git a/arch/x86/kernel/apic/apic_numachip.c b/arch/x86/kernel/apic/apic_numachip.c index c2fd21fed002..017149cded07 100644 --- a/arch/x86/kernel/apic/apic_numachip.c +++ b/arch/x86/kernel/apic/apic_numachip.c @@ -37,10 +37,12 @@ static const struct apic apic_numachip; static unsigned int get_apic_id(unsigned long x) { unsigned long value; - unsigned int id; + unsigned int id = (x >> 24) & 0xff; - rdmsrl(MSR_FAM10H_NODE_ID, value); - id = ((x >> 24) & 0xffU) | ((value << 2) & 0xff00U); + if (static_cpu_has_safe(X86_FEATURE_NODEID_MSR)) { + rdmsrl(MSR_FAM10H_NODE_ID, value); + id |= (value << 2) & 0xff00; + } return id; } @@ -155,10 +157,18 @@ static int __init numachip_probe(void) static void fixup_cpu_id(struct cpuinfo_x86 *c, int node) { - if (c->phys_proc_id != node) { - c->phys_proc_id = node; - per_cpu(cpu_llc_id, smp_processor_id()) = node; + u64 val; + u32 nodes = 1; + + this_cpu_write(cpu_llc_id, node); + + /* Account for nodes per socket in multi-core-module processors */ + if (static_cpu_has_safe(X86_FEATURE_NODEID_MSR)) { + rdmsrl(MSR_FAM10H_NODE_ID, val); + nodes = ((val >> 3) & 7) + 1; } + + c->phys_proc_id = node / nodes; } static int __init numachip_system_init(void) diff --git a/arch/x86/kernel/apic/x2apic_cluster.c b/arch/x86/kernel/apic/x2apic_cluster.c index e658f21681c8..d9d0bd2faaf4 100644 --- a/arch/x86/kernel/apic/x2apic_cluster.c +++ b/arch/x86/kernel/apic/x2apic_cluster.c @@ -135,12 +135,12 @@ static void init_x2apic_ldr(void) per_cpu(x86_cpu_to_logical_apicid, this_cpu) = apic_read(APIC_LDR); - __cpu_set(this_cpu, per_cpu(cpus_in_cluster, this_cpu)); + cpumask_set_cpu(this_cpu, per_cpu(cpus_in_cluster, this_cpu)); for_each_online_cpu(cpu) { if (x2apic_cluster(this_cpu) != x2apic_cluster(cpu)) continue; - __cpu_set(this_cpu, per_cpu(cpus_in_cluster, cpu)); - __cpu_set(cpu, per_cpu(cpus_in_cluster, this_cpu)); + cpumask_set_cpu(this_cpu, per_cpu(cpus_in_cluster, cpu)); + cpumask_set_cpu(cpu, per_cpu(cpus_in_cluster, this_cpu)); } } @@ -195,7 +195,7 @@ static int x2apic_init_cpu_notifier(void) BUG_ON(!per_cpu(cpus_in_cluster, cpu) || !per_cpu(ipi_mask, cpu)); - __cpu_set(cpu, per_cpu(cpus_in_cluster, cpu)); + cpumask_set_cpu(cpu, per_cpu(cpus_in_cluster, cpu)); register_hotcpu_notifier(&x2apic_cpu_notifier); return 1; } diff --git a/arch/x86/kernel/apic/x2apic_uv_x.c b/arch/x86/kernel/apic/x2apic_uv_x.c index 8e9dcfd630e4..c8d92950bc04 100644 --- a/arch/x86/kernel/apic/x2apic_uv_x.c +++ b/arch/x86/kernel/apic/x2apic_uv_x.c @@ -144,33 +144,60 @@ static void __init uv_set_apicid_hibit(void) static int __init uv_acpi_madt_oem_check(char *oem_id, char *oem_table_id) { - int pnodeid, is_uv1, is_uv2, is_uv3; - - is_uv1 = !strcmp(oem_id, "SGI"); - is_uv2 = !strcmp(oem_id, "SGI2"); - is_uv3 = !strncmp(oem_id, "SGI3", 4); /* there are varieties of UV3 */ - if (is_uv1 || is_uv2 || is_uv3) { - uv_hub_info->hub_revision = - (is_uv1 ? UV1_HUB_REVISION_BASE : - (is_uv2 ? UV2_HUB_REVISION_BASE : - UV3_HUB_REVISION_BASE)); - pnodeid = early_get_pnodeid(); - early_get_apic_pnode_shift(); - x86_platform.is_untracked_pat_range = uv_is_untracked_pat_range; - x86_platform.nmi_init = uv_nmi_init; - if (!strcmp(oem_table_id, "UVL")) - uv_system_type = UV_LEGACY_APIC; - else if (!strcmp(oem_table_id, "UVX")) - uv_system_type = UV_X2APIC; - else if (!strcmp(oem_table_id, "UVH")) { - __this_cpu_write(x2apic_extra_bits, - pnodeid << uvh_apicid.s.pnode_shift); - uv_system_type = UV_NON_UNIQUE_APIC; - uv_set_apicid_hibit(); - return 1; - } + int pnodeid; + int uv_apic; + + if (strncmp(oem_id, "SGI", 3) != 0) + return 0; + + /* + * Determine UV arch type. + * SGI: UV100/1000 + * SGI2: UV2000/3000 + * SGI3: UV300 (truncated to 4 chars because of different varieties) + */ + uv_hub_info->hub_revision = + !strncmp(oem_id, "SGI3", 4) ? UV3_HUB_REVISION_BASE : + !strcmp(oem_id, "SGI2") ? UV2_HUB_REVISION_BASE : + !strcmp(oem_id, "SGI") ? UV1_HUB_REVISION_BASE : 0; + + if (uv_hub_info->hub_revision == 0) + goto badbios; + + pnodeid = early_get_pnodeid(); + early_get_apic_pnode_shift(); + x86_platform.is_untracked_pat_range = uv_is_untracked_pat_range; + x86_platform.nmi_init = uv_nmi_init; + + if (!strcmp(oem_table_id, "UVX")) { /* most common */ + uv_system_type = UV_X2APIC; + uv_apic = 0; + + } else if (!strcmp(oem_table_id, "UVH")) { /* only UV1 systems */ + uv_system_type = UV_NON_UNIQUE_APIC; + __this_cpu_write(x2apic_extra_bits, + pnodeid << uvh_apicid.s.pnode_shift); + uv_set_apicid_hibit(); + uv_apic = 1; + + } else if (!strcmp(oem_table_id, "UVL")) { /* only used for */ + uv_system_type = UV_LEGACY_APIC; /* very small systems */ + uv_apic = 0; + + } else { + goto badbios; } - return 0; + + pr_info("UV: OEM IDs %s/%s, System/HUB Types %d/%d, uv_apic %d\n", + oem_id, oem_table_id, uv_system_type, + uv_min_hub_revision_id, uv_apic); + + return uv_apic; + +badbios: + pr_err("UV: OEM_ID:%s OEM_TABLE_ID:%s\n", oem_id, oem_table_id); + pr_err("Current BIOS not supported, update kernel and/or BIOS\n"); + BUG(); } enum uv_system_type get_uv_system_type(void) @@ -854,10 +881,14 @@ void __init uv_system_init(void) unsigned long mmr_base, present, paddr; unsigned short pnode_mask; unsigned char n_lshift; - char *hub = (is_uv1_hub() ? "UV1" : - (is_uv2_hub() ? "UV2" : - "UV3")); + char *hub = (is_uv1_hub() ? "UV100/1000" : + (is_uv2_hub() ? "UV2000/3000" : + (is_uv3_hub() ? "UV300" : NULL))); + if (!hub) { + pr_err("UV: Unknown/unsupported UV hub\n"); + return; + } pr_info("UV: Found %s hub\n", hub); map_low_mmrs(); diff --git a/arch/x86/kernel/asm-offsets_32.c b/arch/x86/kernel/asm-offsets_32.c index 3b3b9d33ac1d..47703aed74cf 100644 --- a/arch/x86/kernel/asm-offsets_32.c +++ b/arch/x86/kernel/asm-offsets_32.c @@ -68,7 +68,7 @@ void foo(void) /* Offset from the sysenter stack to tss.sp0 */ DEFINE(TSS_sysenter_sp0, offsetof(struct tss_struct, x86_tss.sp0) - - sizeof(struct tss_struct)); + offsetofend(struct tss_struct, SYSENTER_stack)); #if defined(CONFIG_LGUEST) || defined(CONFIG_LGUEST_GUEST) || defined(CONFIG_LGUEST_MODULE) BLANK(); diff --git a/arch/x86/kernel/asm-offsets_64.c b/arch/x86/kernel/asm-offsets_64.c index fdcbb4d27c9f..5ce6f2da8763 100644 --- a/arch/x86/kernel/asm-offsets_64.c +++ b/arch/x86/kernel/asm-offsets_64.c @@ -81,6 +81,7 @@ int main(void) #undef ENTRY OFFSET(TSS_ist, tss_struct, x86_tss.ist); + OFFSET(TSS_sp0, tss_struct, x86_tss.sp0); BLANK(); DEFINE(__NR_syscall_max, sizeof(syscalls_64) - 1); diff --git a/arch/x86/kernel/cpu/amd.c b/arch/x86/kernel/cpu/amd.c index a220239cea65..fd470ebf924e 100644 --- a/arch/x86/kernel/cpu/amd.c +++ b/arch/x86/kernel/cpu/amd.c @@ -5,6 +5,7 @@ #include <linux/io.h> #include <linux/sched.h> +#include <linux/random.h> #include <asm/processor.h> #include <asm/apic.h> #include <asm/cpu.h> @@ -488,6 +489,9 @@ static void bsp_init_amd(struct cpuinfo_x86 *c) va_align.mask = (upperbit - 1) & PAGE_MASK; va_align.flags = ALIGN_VA_32 | ALIGN_VA_64; + + /* A random value per boot for bit slice [12:upper_bit) */ + va_align.bits = get_random_int() & va_align.mask; } } @@ -711,6 +715,11 @@ static void init_amd(struct cpuinfo_x86 *c) set_cpu_bug(c, X86_BUG_AMD_APIC_C1E); rdmsr_safe(MSR_AMD64_PATCH_LEVEL, &c->microcode, &dummy); + + /* 3DNow or LM implies PREFETCHW */ + if (!cpu_has(c, X86_FEATURE_3DNOWPREFETCH)) + if (cpu_has(c, X86_FEATURE_3DNOW) || cpu_has(c, X86_FEATURE_LM)) + set_cpu_cap(c, X86_FEATURE_3DNOWPREFETCH); } #ifdef CONFIG_X86_32 diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c index b5c8ff5e9dfc..3f70538012e2 100644 --- a/arch/x86/kernel/cpu/common.c +++ b/arch/x86/kernel/cpu/common.c @@ -959,38 +959,37 @@ static void identify_cpu(struct cpuinfo_x86 *c) #endif } -#ifdef CONFIG_X86_64 -#ifdef CONFIG_IA32_EMULATION -/* May not be __init: called during resume */ -static void syscall32_cpu_init(void) -{ - /* Load these always in case some future AMD CPU supports - SYSENTER from compat mode too. */ - wrmsrl_safe(MSR_IA32_SYSENTER_CS, (u64)__KERNEL_CS); - wrmsrl_safe(MSR_IA32_SYSENTER_ESP, 0ULL); - wrmsrl_safe(MSR_IA32_SYSENTER_EIP, (u64)ia32_sysenter_target); - - wrmsrl(MSR_CSTAR, ia32_cstar_target); -} -#endif /* CONFIG_IA32_EMULATION */ -#endif /* CONFIG_X86_64 */ - +/* + * Set up the CPU state needed to execute SYSENTER/SYSEXIT instructions + * on 32-bit kernels: + */ #ifdef CONFIG_X86_32 void enable_sep_cpu(void) { - int cpu = get_cpu(); - struct tss_struct *tss = &per_cpu(init_tss, cpu); + struct tss_struct *tss; + int cpu; - if (!boot_cpu_has(X86_FEATURE_SEP)) { - put_cpu(); - return; - } + cpu = get_cpu(); + tss = &per_cpu(cpu_tss, cpu); + + if (!boot_cpu_has(X86_FEATURE_SEP)) + goto out; + + /* + * We cache MSR_IA32_SYSENTER_CS's value in the TSS's ss1 field -- + * see the big comment in struct x86_hw_tss's definition. + */ tss->x86_tss.ss1 = __KERNEL_CS; - tss->x86_tss.sp1 = sizeof(struct tss_struct) + (unsigned long) tss; - wrmsr(MSR_IA32_SYSENTER_CS, __KERNEL_CS, 0); - wrmsr(MSR_IA32_SYSENTER_ESP, tss->x86_tss.sp1, 0); - wrmsr(MSR_IA32_SYSENTER_EIP, (unsigned long) ia32_sysenter_target, 0); + wrmsr(MSR_IA32_SYSENTER_CS, tss->x86_tss.ss1, 0); + + wrmsr(MSR_IA32_SYSENTER_ESP, + (unsigned long)tss + offsetofend(struct tss_struct, SYSENTER_stack), + 0); + + wrmsr(MSR_IA32_SYSENTER_EIP, (unsigned long)ia32_sysenter_target, 0); + +out: put_cpu(); } #endif @@ -1118,7 +1117,7 @@ static __init int setup_disablecpuid(char *arg) __setup("clearcpuid=", setup_disablecpuid); DEFINE_PER_CPU(unsigned long, kernel_stack) = - (unsigned long)&init_thread_union - KERNEL_STACK_OFFSET + THREAD_SIZE; + (unsigned long)&init_thread_union + THREAD_SIZE; EXPORT_PER_CPU_SYMBOL(kernel_stack); #ifdef CONFIG_X86_64 @@ -1130,8 +1129,8 @@ DEFINE_PER_CPU_FIRST(union irq_stack_union, irq_stack_union) __aligned(PAGE_SIZE) __visible; /* - * The following four percpu variables are hot. Align current_task to - * cacheline size such that all four fall in the same cacheline. + * The following percpu variables are hot. Align current_task to + * cacheline size such that they fall in the same cacheline. */ DEFINE_PER_CPU(struct task_struct *, current_task) ____cacheline_aligned = &init_task; @@ -1171,10 +1170,23 @@ void syscall_init(void) */ wrmsrl(MSR_STAR, ((u64)__USER32_CS)<<48 | ((u64)__KERNEL_CS)<<32); wrmsrl(MSR_LSTAR, system_call); - wrmsrl(MSR_CSTAR, ignore_sysret); #ifdef CONFIG_IA32_EMULATION - syscall32_cpu_init(); + wrmsrl(MSR_CSTAR, ia32_cstar_target); + /* + * This only works on Intel CPUs. + * On AMD CPUs these MSRs are 32-bit, CPU truncates MSR_IA32_SYSENTER_EIP. + * This does not cause SYSENTER to jump to the wrong location, because + * AMD doesn't allow SYSENTER in long mode (either 32- or 64-bit). + */ + wrmsrl_safe(MSR_IA32_SYSENTER_CS, (u64)__KERNEL_CS); + wrmsrl_safe(MSR_IA32_SYSENTER_ESP, 0ULL); + wrmsrl_safe(MSR_IA32_SYSENTER_EIP, (u64)ia32_sysenter_target); +#else + wrmsrl(MSR_CSTAR, ignore_sysret); + wrmsrl_safe(MSR_IA32_SYSENTER_CS, (u64)GDT_ENTRY_INVALID_SEG); + wrmsrl_safe(MSR_IA32_SYSENTER_ESP, 0ULL); + wrmsrl_safe(MSR_IA32_SYSENTER_EIP, 0ULL); #endif /* Flags to clear on syscall */ @@ -1226,6 +1238,15 @@ DEFINE_PER_CPU(int, __preempt_count) = INIT_PREEMPT_COUNT; EXPORT_PER_CPU_SYMBOL(__preempt_count); DEFINE_PER_CPU(struct task_struct *, fpu_owner_task); +/* + * On x86_32, vm86 modifies tss.sp0, so sp0 isn't a reliable way to find + * the top of the kernel stack. Use an extra percpu variable to track the + * top of the kernel stack directly. + */ +DEFINE_PER_CPU(unsigned long, cpu_current_top_of_stack) = + (unsigned long)&init_thread_union + THREAD_SIZE; +EXPORT_PER_CPU_SYMBOL(cpu_current_top_of_stack); + #ifdef CONFIG_CC_STACKPROTECTOR DEFINE_PER_CPU_ALIGNED(struct stack_canary, stack_canary); #endif @@ -1307,7 +1328,7 @@ void cpu_init(void) */ load_ucode_ap(); - t = &per_cpu(init_tss, cpu); + t = &per_cpu(cpu_tss, cpu); oist = &per_cpu(orig_ist, cpu); #ifdef CONFIG_NUMA @@ -1391,11 +1412,17 @@ void cpu_init(void) { int cpu = smp_processor_id(); struct task_struct *curr = current; - struct tss_struct *t = &per_cpu(init_tss, cpu); + struct tss_struct *t = &per_cpu(cpu_tss, cpu); struct thread_struct *thread = &curr->thread; wait_for_master_cpu(cpu); + /* + * Initialize the CR4 shadow before doing anything that could + * try to read it. + */ + cr4_init_shadow(); + show_ucode_info_early(); printk(KERN_INFO "Initializing CPU#%d\n", cpu); diff --git a/arch/x86/kernel/cpu/intel.c b/arch/x86/kernel/cpu/intel.c index 94d7dcb12145..50163fa9034f 100644 --- a/arch/x86/kernel/cpu/intel.c +++ b/arch/x86/kernel/cpu/intel.c @@ -565,8 +565,8 @@ static const struct _tlb_table intel_tlb_table[] = { { 0xb2, TLB_INST_4K, 64, " TLB_INST 4KByte pages, 4-way set associative" }, { 0xb3, TLB_DATA_4K, 128, " TLB_DATA 4 KByte pages, 4-way set associative" }, { 0xb4, TLB_DATA_4K, 256, " TLB_DATA 4 KByte pages, 4-way associative" }, - { 0xb5, TLB_INST_4K, 64, " TLB_INST 4 KByte pages, 8-way set ssociative" }, - { 0xb6, TLB_INST_4K, 128, " TLB_INST 4 KByte pages, 8-way set ssociative" }, + { 0xb5, TLB_INST_4K, 64, " TLB_INST 4 KByte pages, 8-way set associative" }, + { 0xb6, TLB_INST_4K, 128, " TLB_INST 4 KByte pages, 8-way set associative" }, { 0xba, TLB_DATA_4K, 64, " TLB_DATA 4 KByte pages, 4-way associative" }, { 0xc0, TLB_DATA_4K_4M, 8, " TLB_DATA 4 KByte and 4 MByte pages, 4-way associative" }, { 0xc1, STLB_4K_2M, 1024, " STLB 4 KByte and 2 MByte pages, 8-way associative" }, diff --git a/arch/x86/kernel/cpu/intel_cacheinfo.c b/arch/x86/kernel/cpu/intel_cacheinfo.c index 659643376dbf..edcb0e28c336 100644 --- a/arch/x86/kernel/cpu/intel_cacheinfo.c +++ b/arch/x86/kernel/cpu/intel_cacheinfo.c @@ -7,16 +7,14 @@ * Andi Kleen / Andreas Herrmann : CPUID4 emulation on AMD. */ -#include <linux/init.h> #include <linux/slab.h> -#include <linux/device.h> -#include <linux/compiler.h> +#include <linux/cacheinfo.h> #include <linux/cpu.h> #include <linux/sched.h> +#include <linux/sysfs.h> #include <linux/pci.h> #include <asm/processor.h> -#include <linux/smp.h> #include <asm/amd_nb.h> #include <asm/smp.h> @@ -116,10 +114,10 @@ static const struct _cache_table cache_table[] = enum _cache_type { - CACHE_TYPE_NULL = 0, - CACHE_TYPE_DATA = 1, - CACHE_TYPE_INST = 2, - CACHE_TYPE_UNIFIED = 3 + CTYPE_NULL = 0, + CTYPE_DATA = 1, + CTYPE_INST = 2, + CTYPE_UNIFIED = 3 }; union _cpuid4_leaf_eax { @@ -159,11 +157,6 @@ struct _cpuid4_info_regs { struct amd_northbridge *nb; }; -struct _cpuid4_info { - struct _cpuid4_info_regs base; - DECLARE_BITMAP(shared_cpu_map, NR_CPUS); -}; - unsigned short num_cache_leaves; /* AMD doesn't have CPUID4. Emulate it here to report the same @@ -220,6 +213,13 @@ static const unsigned short assocs[] = { static const unsigned char levels[] = { 1, 1, 2, 3 }; static const unsigned char types[] = { 1, 2, 3, 3 }; +static const enum cache_type cache_type_map[] = { + [CTYPE_NULL] = CACHE_TYPE_NOCACHE, + [CTYPE_DATA] = CACHE_TYPE_DATA, + [CTYPE_INST] = CACHE_TYPE_INST, + [CTYPE_UNIFIED] = CACHE_TYPE_UNIFIED, +}; + static void amd_cpuid4(int leaf, union _cpuid4_leaf_eax *eax, union _cpuid4_leaf_ebx *ebx, @@ -291,14 +291,8 @@ amd_cpuid4(int leaf, union _cpuid4_leaf_eax *eax, (ebx->split.ways_of_associativity + 1) - 1; } -struct _cache_attr { - struct attribute attr; - ssize_t (*show)(struct _cpuid4_info *, char *, unsigned int); - ssize_t (*store)(struct _cpuid4_info *, const char *, size_t count, - unsigned int); -}; - #if defined(CONFIG_AMD_NB) && defined(CONFIG_SYSFS) + /* * L3 cache descriptors */ @@ -325,20 +319,6 @@ static void amd_calc_l3_indices(struct amd_northbridge *nb) l3->indices = (max(max3(sc0, sc1, sc2), sc3) << 10) - 1; } -static void amd_init_l3_cache(struct _cpuid4_info_regs *this_leaf, int index) -{ - int node; - - /* only for L3, and not in virtualized environments */ - if (index < 3) - return; - - node = amd_get_nb_id(smp_processor_id()); - this_leaf->nb = node_to_amd_nb(node); - if (this_leaf->nb && !this_leaf->nb->l3_cache.indices) - amd_calc_l3_indices(this_leaf->nb); -} - /* * check whether a slot used for disabling an L3 index is occupied. * @l3: L3 cache descriptor @@ -359,15 +339,13 @@ int amd_get_l3_disable_slot(struct amd_northbridge *nb, unsigned slot) return -1; } -static ssize_t show_cache_disable(struct _cpuid4_info *this_leaf, char *buf, +static ssize_t show_cache_disable(struct cacheinfo *this_leaf, char *buf, unsigned int slot) { int index; + struct amd_northbridge *nb = this_leaf->priv; - if (!this_leaf->base.nb || !amd_nb_has_feature(AMD_NB_L3_INDEX_DISABLE)) - return -EINVAL; - - index = amd_get_l3_disable_slot(this_leaf->base.nb, slot); + index = amd_get_l3_disable_slot(nb, slot); if (index >= 0) return sprintf(buf, "%d\n", index); @@ -376,9 +354,10 @@ static ssize_t show_cache_disable(struct _cpuid4_info *this_leaf, char *buf, #define SHOW_CACHE_DISABLE(slot) \ static ssize_t \ -show_cache_disable_##slot(struct _cpuid4_info *this_leaf, char *buf, \ - unsigned int cpu) \ +cache_disable_##slot##_show(struct device *dev, \ + struct device_attribute *attr, char *buf) \ { \ + struct cacheinfo *this_leaf = dev_get_drvdata(dev); \ return show_cache_disable(this_leaf, buf, slot); \ } SHOW_CACHE_DISABLE(0) @@ -446,25 +425,23 @@ int amd_set_l3_disable_slot(struct amd_northbridge *nb, int cpu, unsigned slot, return 0; } -static ssize_t store_cache_disable(struct _cpuid4_info *this_leaf, - const char *buf, size_t count, - unsigned int slot) +static ssize_t store_cache_disable(struct cacheinfo *this_leaf, + const char *buf, size_t count, + unsigned int slot) { unsigned long val = 0; int cpu, err = 0; + struct amd_northbridge *nb = this_leaf->priv; if (!capable(CAP_SYS_ADMIN)) return -EPERM; - if (!this_leaf->base.nb || !amd_nb_has_feature(AMD_NB_L3_INDEX_DISABLE)) - return -EINVAL; - - cpu = cpumask_first(to_cpumask(this_leaf->shared_cpu_map)); + cpu = cpumask_first(&this_leaf->shared_cpu_map); if (kstrtoul(buf, 10, &val) < 0) return -EINVAL; - err = amd_set_l3_disable_slot(this_leaf->base.nb, cpu, slot, val); + err = amd_set_l3_disable_slot(nb, cpu, slot, val); if (err) { if (err == -EEXIST) pr_warning("L3 slot %d in use/index already disabled!\n", @@ -476,41 +453,36 @@ static ssize_t store_cache_disable(struct _cpuid4_info *this_leaf, #define STORE_CACHE_DISABLE(slot) \ static ssize_t \ -store_cache_disable_##slot(struct _cpuid4_info *this_leaf, \ - const char *buf, size_t count, \ - unsigned int cpu) \ +cache_disable_##slot##_store(struct device *dev, \ + struct device_attribute *attr, \ + const char *buf, size_t count) \ { \ + struct cacheinfo *this_leaf = dev_get_drvdata(dev); \ return store_cache_disable(this_leaf, buf, count, slot); \ } STORE_CACHE_DISABLE(0) STORE_CACHE_DISABLE(1) -static struct _cache_attr cache_disable_0 = __ATTR(cache_disable_0, 0644, - show_cache_disable_0, store_cache_disable_0); -static struct _cache_attr cache_disable_1 = __ATTR(cache_disable_1, 0644, - show_cache_disable_1, store_cache_disable_1); - -static ssize_t -show_subcaches(struct _cpuid4_info *this_leaf, char *buf, unsigned int cpu) +static ssize_t subcaches_show(struct device *dev, + struct device_attribute *attr, char *buf) { - if (!this_leaf->base.nb || !amd_nb_has_feature(AMD_NB_L3_PARTITIONING)) - return -EINVAL; + struct cacheinfo *this_leaf = dev_get_drvdata(dev); + int cpu = cpumask_first(&this_leaf->shared_cpu_map); return sprintf(buf, "%x\n", amd_get_subcaches(cpu)); } -static ssize_t -store_subcaches(struct _cpuid4_info *this_leaf, const char *buf, size_t count, - unsigned int cpu) +static ssize_t subcaches_store(struct device *dev, + struct device_attribute *attr, + const char *buf, size_t count) { + struct cacheinfo *this_leaf = dev_get_drvdata(dev); + int cpu = cpumask_first(&this_leaf->shared_cpu_map); unsigned long val; if (!capable(CAP_SYS_ADMIN)) return -EPERM; - if (!this_leaf->base.nb || !amd_nb_has_feature(AMD_NB_L3_PARTITIONING)) - return -EINVAL; - if (kstrtoul(buf, 16, &val) < 0) return -EINVAL; @@ -520,9 +492,92 @@ store_subcaches(struct _cpuid4_info *this_leaf, const char *buf, size_t count, return count; } -static struct _cache_attr subcaches = - __ATTR(subcaches, 0644, show_subcaches, store_subcaches); +static DEVICE_ATTR_RW(cache_disable_0); +static DEVICE_ATTR_RW(cache_disable_1); +static DEVICE_ATTR_RW(subcaches); + +static umode_t +cache_private_attrs_is_visible(struct kobject *kobj, + struct attribute *attr, int unused) +{ + struct device *dev = kobj_to_dev(kobj); + struct cacheinfo *this_leaf = dev_get_drvdata(dev); + umode_t mode = attr->mode; + + if (!this_leaf->priv) + return 0; + + if ((attr == &dev_attr_subcaches.attr) && + amd_nb_has_feature(AMD_NB_L3_PARTITIONING)) + return mode; + + if ((attr == &dev_attr_cache_disable_0.attr || + attr == &dev_attr_cache_disable_1.attr) && + amd_nb_has_feature(AMD_NB_L3_INDEX_DISABLE)) + return mode; + + return 0; +} + +static struct attribute_group cache_private_group = { + .is_visible = cache_private_attrs_is_visible, +}; + +static void init_amd_l3_attrs(void) +{ + int n = 1; + static struct attribute **amd_l3_attrs; + + if (amd_l3_attrs) /* already initialized */ + return; + + if (amd_nb_has_feature(AMD_NB_L3_INDEX_DISABLE)) + n += 2; + if (amd_nb_has_feature(AMD_NB_L3_PARTITIONING)) + n += 1; + + amd_l3_attrs = kcalloc(n, sizeof(*amd_l3_attrs), GFP_KERNEL); + if (!amd_l3_attrs) + return; + + n = 0; + if (amd_nb_has_feature(AMD_NB_L3_INDEX_DISABLE)) { + amd_l3_attrs[n++] = &dev_attr_cache_disable_0.attr; + amd_l3_attrs[n++] = &dev_attr_cache_disable_1.attr; + } + if (amd_nb_has_feature(AMD_NB_L3_PARTITIONING)) + amd_l3_attrs[n++] = &dev_attr_subcaches.attr; + cache_private_group.attrs = amd_l3_attrs; +} + +const struct attribute_group * +cache_get_priv_group(struct cacheinfo *this_leaf) +{ + struct amd_northbridge *nb = this_leaf->priv; + + if (this_leaf->level < 3 || !nb) + return NULL; + + if (nb && nb->l3_cache.indices) + init_amd_l3_attrs(); + + return &cache_private_group; +} + +static void amd_init_l3_cache(struct _cpuid4_info_regs *this_leaf, int index) +{ + int node; + + /* only for L3, and not in virtualized environments */ + if (index < 3) + return; + + node = amd_get_nb_id(smp_processor_id()); + this_leaf->nb = node_to_amd_nb(node); + if (this_leaf->nb && !this_leaf->nb->l3_cache.indices) + amd_calc_l3_indices(this_leaf->nb); +} #else #define amd_init_l3_cache(x, y) #endif /* CONFIG_AMD_NB && CONFIG_SYSFS */ @@ -546,7 +601,7 @@ cpuid4_cache_lookup_regs(int index, struct _cpuid4_info_regs *this_leaf) cpuid_count(4, index, &eax.full, &ebx.full, &ecx.full, &edx); } - if (eax.split.type == CACHE_TYPE_NULL) + if (eax.split.type == CTYPE_NULL) return -EIO; /* better error ? */ this_leaf->eax = eax; @@ -575,7 +630,7 @@ static int find_num_cache_leaves(struct cpuinfo_x86 *c) /* Do cpuid(op) loop to find out num_cache_leaves */ cpuid_count(op, i, &eax, &ebx, &ecx, &edx); cache_eax.full = eax; - } while (cache_eax.split.type != CACHE_TYPE_NULL); + } while (cache_eax.split.type != CTYPE_NULL); return i; } @@ -626,9 +681,9 @@ unsigned int init_intel_cacheinfo(struct cpuinfo_x86 *c) switch (this_leaf.eax.split.level) { case 1: - if (this_leaf.eax.split.type == CACHE_TYPE_DATA) + if (this_leaf.eax.split.type == CTYPE_DATA) new_l1d = this_leaf.size/1024; - else if (this_leaf.eax.split.type == CACHE_TYPE_INST) + else if (this_leaf.eax.split.type == CTYPE_INST) new_l1i = this_leaf.size/1024; break; case 2: @@ -747,55 +802,52 @@ unsigned int init_intel_cacheinfo(struct cpuinfo_x86 *c) return l2; } -#ifdef CONFIG_SYSFS - -/* pointer to _cpuid4_info array (for each cache leaf) */ -static DEFINE_PER_CPU(struct _cpuid4_info *, ici_cpuid4_info); -#define CPUID4_INFO_IDX(x, y) (&((per_cpu(ici_cpuid4_info, x))[y])) - -#ifdef CONFIG_SMP - -static int cache_shared_amd_cpu_map_setup(unsigned int cpu, int index) +static int __cache_amd_cpumap_setup(unsigned int cpu, int index, + struct _cpuid4_info_regs *base) { - struct _cpuid4_info *this_leaf; + struct cpu_cacheinfo *this_cpu_ci = get_cpu_cacheinfo(cpu); + struct cacheinfo *this_leaf; int i, sibling; if (cpu_has_topoext) { unsigned int apicid, nshared, first, last; - if (!per_cpu(ici_cpuid4_info, cpu)) - return 0; - - this_leaf = CPUID4_INFO_IDX(cpu, index); - nshared = this_leaf->base.eax.split.num_threads_sharing + 1; + this_leaf = this_cpu_ci->info_list + index; + nshared = base->eax.split.num_threads_sharing + 1; apicid = cpu_data(cpu).apicid; first = apicid - (apicid % nshared); last = first + nshared - 1; for_each_online_cpu(i) { + this_cpu_ci = get_cpu_cacheinfo(i); + if (!this_cpu_ci->info_list) + continue; + apicid = cpu_data(i).apicid; if ((apicid < first) || (apicid > last)) continue; - if (!per_cpu(ici_cpuid4_info, i)) - continue; - this_leaf = CPUID4_INFO_IDX(i, index); + + this_leaf = this_cpu_ci->info_list + index; for_each_online_cpu(sibling) { apicid = cpu_data(sibling).apicid; if ((apicid < first) || (apicid > last)) continue; - set_bit(sibling, this_leaf->shared_cpu_map); + cpumask_set_cpu(sibling, + &this_leaf->shared_cpu_map); } } } else if (index == 3) { for_each_cpu(i, cpu_llc_shared_mask(cpu)) { - if (!per_cpu(ici_cpuid4_info, i)) + this_cpu_ci = get_cpu_cacheinfo(i); + if (!this_cpu_ci->info_list) continue; - this_leaf = CPUID4_INFO_IDX(i, index); + this_leaf = this_cpu_ci->info_list + index; for_each_cpu(sibling, cpu_llc_shared_mask(cpu)) { if (!cpu_online(sibling)) continue; - set_bit(sibling, this_leaf->shared_cpu_map); + cpumask_set_cpu(sibling, + &this_leaf->shared_cpu_map); } } } else @@ -804,457 +856,86 @@ static int cache_shared_amd_cpu_map_setup(unsigned int cpu, int index) return 1; } -static void cache_shared_cpu_map_setup(unsigned int cpu, int index) +static void __cache_cpumap_setup(unsigned int cpu, int index, + struct _cpuid4_info_regs *base) { - struct _cpuid4_info *this_leaf, *sibling_leaf; + struct cpu_cacheinfo *this_cpu_ci = get_cpu_cacheinfo(cpu); + struct cacheinfo *this_leaf, *sibling_leaf; unsigned long num_threads_sharing; int index_msb, i; struct cpuinfo_x86 *c = &cpu_data(cpu); if (c->x86_vendor == X86_VENDOR_AMD) { - if (cache_shared_amd_cpu_map_setup(cpu, index)) + if (__cache_amd_cpumap_setup(cpu, index, base)) return; } - this_leaf = CPUID4_INFO_IDX(cpu, index); - num_threads_sharing = 1 + this_leaf->base.eax.split.num_threads_sharing; + this_leaf = this_cpu_ci->info_list + index; + num_threads_sharing = 1 + base->eax.split.num_threads_sharing; + cpumask_set_cpu(cpu, &this_leaf->shared_cpu_map); if (num_threads_sharing == 1) - cpumask_set_cpu(cpu, to_cpumask(this_leaf->shared_cpu_map)); - else { - index_msb = get_count_order(num_threads_sharing); - - for_each_online_cpu(i) { - if (cpu_data(i).apicid >> index_msb == - c->apicid >> index_msb) { - cpumask_set_cpu(i, - to_cpumask(this_leaf->shared_cpu_map)); - if (i != cpu && per_cpu(ici_cpuid4_info, i)) { - sibling_leaf = - CPUID4_INFO_IDX(i, index); - cpumask_set_cpu(cpu, to_cpumask( - sibling_leaf->shared_cpu_map)); - } - } - } - } -} -static void cache_remove_shared_cpu_map(unsigned int cpu, int index) -{ - struct _cpuid4_info *this_leaf, *sibling_leaf; - int sibling; - - this_leaf = CPUID4_INFO_IDX(cpu, index); - for_each_cpu(sibling, to_cpumask(this_leaf->shared_cpu_map)) { - sibling_leaf = CPUID4_INFO_IDX(sibling, index); - cpumask_clear_cpu(cpu, - to_cpumask(sibling_leaf->shared_cpu_map)); - } -} -#else -static void cache_shared_cpu_map_setup(unsigned int cpu, int index) -{ -} - -static void cache_remove_shared_cpu_map(unsigned int cpu, int index) -{ -} -#endif - -static void free_cache_attributes(unsigned int cpu) -{ - int i; - - for (i = 0; i < num_cache_leaves; i++) - cache_remove_shared_cpu_map(cpu, i); - - kfree(per_cpu(ici_cpuid4_info, cpu)); - per_cpu(ici_cpuid4_info, cpu) = NULL; -} - -static void get_cpu_leaves(void *_retval) -{ - int j, *retval = _retval, cpu = smp_processor_id(); + return; - /* Do cpuid and store the results */ - for (j = 0; j < num_cache_leaves; j++) { - struct _cpuid4_info *this_leaf = CPUID4_INFO_IDX(cpu, j); + index_msb = get_count_order(num_threads_sharing); - *retval = cpuid4_cache_lookup_regs(j, &this_leaf->base); - if (unlikely(*retval < 0)) { - int i; + for_each_online_cpu(i) + if (cpu_data(i).apicid >> index_msb == c->apicid >> index_msb) { + struct cpu_cacheinfo *sib_cpu_ci = get_cpu_cacheinfo(i); - for (i = 0; i < j; i++) - cache_remove_shared_cpu_map(cpu, i); - break; + if (i == cpu || !sib_cpu_ci->info_list) + continue;/* skip if itself or no cacheinfo */ + sibling_leaf = sib_cpu_ci->info_list + index; + cpumask_set_cpu(i, &this_leaf->shared_cpu_map); + cpumask_set_cpu(cpu, &sibling_leaf->shared_cpu_map); } - cache_shared_cpu_map_setup(cpu, j); - } } -static int detect_cache_attributes(unsigned int cpu) +static void ci_leaf_init(struct cacheinfo *this_leaf, + struct _cpuid4_info_regs *base) { - int retval; - - if (num_cache_leaves == 0) - return -ENOENT; - - per_cpu(ici_cpuid4_info, cpu) = kzalloc( - sizeof(struct _cpuid4_info) * num_cache_leaves, GFP_KERNEL); - if (per_cpu(ici_cpuid4_info, cpu) == NULL) - return -ENOMEM; - - smp_call_function_single(cpu, get_cpu_leaves, &retval, true); - if (retval) { - kfree(per_cpu(ici_cpuid4_info, cpu)); - per_cpu(ici_cpuid4_info, cpu) = NULL; - } - - return retval; + this_leaf->level = base->eax.split.level; + this_leaf->type = cache_type_map[base->eax.split.type]; + this_leaf->coherency_line_size = + base->ebx.split.coherency_line_size + 1; + this_leaf->ways_of_associativity = + base->ebx.split.ways_of_associativity + 1; + this_leaf->size = base->size; + this_leaf->number_of_sets = base->ecx.split.number_of_sets + 1; + this_leaf->physical_line_partition = + base->ebx.split.physical_line_partition + 1; + this_leaf->priv = base->nb; } -#include <linux/kobject.h> -#include <linux/sysfs.h> -#include <linux/cpu.h> - -/* pointer to kobject for cpuX/cache */ -static DEFINE_PER_CPU(struct kobject *, ici_cache_kobject); - -struct _index_kobject { - struct kobject kobj; - unsigned int cpu; - unsigned short index; -}; - -/* pointer to array of kobjects for cpuX/cache/indexY */ -static DEFINE_PER_CPU(struct _index_kobject *, ici_index_kobject); -#define INDEX_KOBJECT_PTR(x, y) (&((per_cpu(ici_index_kobject, x))[y])) - -#define show_one_plus(file_name, object, val) \ -static ssize_t show_##file_name(struct _cpuid4_info *this_leaf, char *buf, \ - unsigned int cpu) \ -{ \ - return sprintf(buf, "%lu\n", (unsigned long)this_leaf->object + val); \ -} - -show_one_plus(level, base.eax.split.level, 0); -show_one_plus(coherency_line_size, base.ebx.split.coherency_line_size, 1); -show_one_plus(physical_line_partition, base.ebx.split.physical_line_partition, 1); -show_one_plus(ways_of_associativity, base.ebx.split.ways_of_associativity, 1); -show_one_plus(number_of_sets, base.ecx.split.number_of_sets, 1); - -static ssize_t show_size(struct _cpuid4_info *this_leaf, char *buf, - unsigned int cpu) -{ - return sprintf(buf, "%luK\n", this_leaf->base.size / 1024); -} - -static ssize_t show_shared_cpu_map_func(struct _cpuid4_info *this_leaf, - int type, char *buf) -{ - const struct cpumask *mask = to_cpumask(this_leaf->shared_cpu_map); - int ret; - - if (type) - ret = scnprintf(buf, PAGE_SIZE - 1, "%*pbl", - cpumask_pr_args(mask)); - else - ret = scnprintf(buf, PAGE_SIZE - 1, "%*pb", - cpumask_pr_args(mask)); - buf[ret++] = '\n'; - buf[ret] = '\0'; - return ret; -} - -static inline ssize_t show_shared_cpu_map(struct _cpuid4_info *leaf, char *buf, - unsigned int cpu) +static int __init_cache_level(unsigned int cpu) { - return show_shared_cpu_map_func(leaf, 0, buf); -} - -static inline ssize_t show_shared_cpu_list(struct _cpuid4_info *leaf, char *buf, - unsigned int cpu) -{ - return show_shared_cpu_map_func(leaf, 1, buf); -} + struct cpu_cacheinfo *this_cpu_ci = get_cpu_cacheinfo(cpu); -static ssize_t show_type(struct _cpuid4_info *this_leaf, char *buf, - unsigned int cpu) -{ - switch (this_leaf->base.eax.split.type) { - case CACHE_TYPE_DATA: - return sprintf(buf, "Data\n"); - case CACHE_TYPE_INST: - return sprintf(buf, "Instruction\n"); - case CACHE_TYPE_UNIFIED: - return sprintf(buf, "Unified\n"); - default: - return sprintf(buf, "Unknown\n"); - } -} - -#define to_object(k) container_of(k, struct _index_kobject, kobj) -#define to_attr(a) container_of(a, struct _cache_attr, attr) - -#define define_one_ro(_name) \ -static struct _cache_attr _name = \ - __ATTR(_name, 0444, show_##_name, NULL) - -define_one_ro(level); -define_one_ro(type); -define_one_ro(coherency_line_size); -define_one_ro(physical_line_partition); -define_one_ro(ways_of_associativity); -define_one_ro(number_of_sets); -define_one_ro(size); -define_one_ro(shared_cpu_map); -define_one_ro(shared_cpu_list); - -static struct attribute *default_attrs[] = { - &type.attr, - &level.attr, - &coherency_line_size.attr, - &physical_line_partition.attr, - &ways_of_associativity.attr, - &number_of_sets.attr, - &size.attr, - &shared_cpu_map.attr, - &shared_cpu_list.attr, - NULL -}; - -#ifdef CONFIG_AMD_NB -static struct attribute **amd_l3_attrs(void) -{ - static struct attribute **attrs; - int n; - - if (attrs) - return attrs; - - n = ARRAY_SIZE(default_attrs); - - if (amd_nb_has_feature(AMD_NB_L3_INDEX_DISABLE)) - n += 2; - - if (amd_nb_has_feature(AMD_NB_L3_PARTITIONING)) - n += 1; - - attrs = kzalloc(n * sizeof (struct attribute *), GFP_KERNEL); - if (attrs == NULL) - return attrs = default_attrs; - - for (n = 0; default_attrs[n]; n++) - attrs[n] = default_attrs[n]; - - if (amd_nb_has_feature(AMD_NB_L3_INDEX_DISABLE)) { - attrs[n++] = &cache_disable_0.attr; - attrs[n++] = &cache_disable_1.attr; - } - - if (amd_nb_has_feature(AMD_NB_L3_PARTITIONING)) - attrs[n++] = &subcaches.attr; - - return attrs; -} -#endif - -static ssize_t show(struct kobject *kobj, struct attribute *attr, char *buf) -{ - struct _cache_attr *fattr = to_attr(attr); - struct _index_kobject *this_leaf = to_object(kobj); - ssize_t ret; - - ret = fattr->show ? - fattr->show(CPUID4_INFO_IDX(this_leaf->cpu, this_leaf->index), - buf, this_leaf->cpu) : - 0; - return ret; -} - -static ssize_t store(struct kobject *kobj, struct attribute *attr, - const char *buf, size_t count) -{ - struct _cache_attr *fattr = to_attr(attr); - struct _index_kobject *this_leaf = to_object(kobj); - ssize_t ret; - - ret = fattr->store ? - fattr->store(CPUID4_INFO_IDX(this_leaf->cpu, this_leaf->index), - buf, count, this_leaf->cpu) : - 0; - return ret; -} - -static const struct sysfs_ops sysfs_ops = { - .show = show, - .store = store, -}; - -static struct kobj_type ktype_cache = { - .sysfs_ops = &sysfs_ops, - .default_attrs = default_attrs, -}; - -static struct kobj_type ktype_percpu_entry = { - .sysfs_ops = &sysfs_ops, -}; - -static void cpuid4_cache_sysfs_exit(unsigned int cpu) -{ - kfree(per_cpu(ici_cache_kobject, cpu)); - kfree(per_cpu(ici_index_kobject, cpu)); - per_cpu(ici_cache_kobject, cpu) = NULL; - per_cpu(ici_index_kobject, cpu) = NULL; - free_cache_attributes(cpu); -} - -static int cpuid4_cache_sysfs_init(unsigned int cpu) -{ - int err; - - if (num_cache_leaves == 0) + if (!num_cache_leaves) return -ENOENT; - - err = detect_cache_attributes(cpu); - if (err) - return err; - - /* Allocate all required memory */ - per_cpu(ici_cache_kobject, cpu) = - kzalloc(sizeof(struct kobject), GFP_KERNEL); - if (unlikely(per_cpu(ici_cache_kobject, cpu) == NULL)) - goto err_out; - - per_cpu(ici_index_kobject, cpu) = kzalloc( - sizeof(struct _index_kobject) * num_cache_leaves, GFP_KERNEL); - if (unlikely(per_cpu(ici_index_kobject, cpu) == NULL)) - goto err_out; - + if (!this_cpu_ci) + return -EINVAL; + this_cpu_ci->num_levels = 3; + this_cpu_ci->num_leaves = num_cache_leaves; return 0; - -err_out: - cpuid4_cache_sysfs_exit(cpu); - return -ENOMEM; } -static DECLARE_BITMAP(cache_dev_map, NR_CPUS); - -/* Add/Remove cache interface for CPU device */ -static int cache_add_dev(struct device *dev) +static int __populate_cache_leaves(unsigned int cpu) { - unsigned int cpu = dev->id; - unsigned long i, j; - struct _index_kobject *this_object; - struct _cpuid4_info *this_leaf; - int retval; - - retval = cpuid4_cache_sysfs_init(cpu); - if (unlikely(retval < 0)) - return retval; - - retval = kobject_init_and_add(per_cpu(ici_cache_kobject, cpu), - &ktype_percpu_entry, - &dev->kobj, "%s", "cache"); - if (retval < 0) { - cpuid4_cache_sysfs_exit(cpu); - return retval; - } + unsigned int idx, ret; + struct cpu_cacheinfo *this_cpu_ci = get_cpu_cacheinfo(cpu); + struct cacheinfo *this_leaf = this_cpu_ci->info_list; + struct _cpuid4_info_regs id4_regs = {}; - for (i = 0; i < num_cache_leaves; i++) { - this_object = INDEX_KOBJECT_PTR(cpu, i); - this_object->cpu = cpu; - this_object->index = i; - - this_leaf = CPUID4_INFO_IDX(cpu, i); - - ktype_cache.default_attrs = default_attrs; -#ifdef CONFIG_AMD_NB - if (this_leaf->base.nb) - ktype_cache.default_attrs = amd_l3_attrs(); -#endif - retval = kobject_init_and_add(&(this_object->kobj), - &ktype_cache, - per_cpu(ici_cache_kobject, cpu), - "index%1lu", i); - if (unlikely(retval)) { - for (j = 0; j < i; j++) - kobject_put(&(INDEX_KOBJECT_PTR(cpu, j)->kobj)); - kobject_put(per_cpu(ici_cache_kobject, cpu)); - cpuid4_cache_sysfs_exit(cpu); - return retval; - } - kobject_uevent(&(this_object->kobj), KOBJ_ADD); + for (idx = 0; idx < this_cpu_ci->num_leaves; idx++) { + ret = cpuid4_cache_lookup_regs(idx, &id4_regs); + if (ret) + return ret; + ci_leaf_init(this_leaf++, &id4_regs); + __cache_cpumap_setup(cpu, idx, &id4_regs); } - cpumask_set_cpu(cpu, to_cpumask(cache_dev_map)); - - kobject_uevent(per_cpu(ici_cache_kobject, cpu), KOBJ_ADD); return 0; } -static void cache_remove_dev(struct device *dev) -{ - unsigned int cpu = dev->id; - unsigned long i; - - if (per_cpu(ici_cpuid4_info, cpu) == NULL) - return; - if (!cpumask_test_cpu(cpu, to_cpumask(cache_dev_map))) - return; - cpumask_clear_cpu(cpu, to_cpumask(cache_dev_map)); - - for (i = 0; i < num_cache_leaves; i++) - kobject_put(&(INDEX_KOBJECT_PTR(cpu, i)->kobj)); - kobject_put(per_cpu(ici_cache_kobject, cpu)); - cpuid4_cache_sysfs_exit(cpu); -} - -static int cacheinfo_cpu_callback(struct notifier_block *nfb, - unsigned long action, void *hcpu) -{ - unsigned int cpu = (unsigned long)hcpu; - struct device *dev; - - dev = get_cpu_device(cpu); - switch (action) { - case CPU_ONLINE: - case CPU_ONLINE_FROZEN: - cache_add_dev(dev); - break; - case CPU_DEAD: - case CPU_DEAD_FROZEN: - cache_remove_dev(dev); - break; - } - return NOTIFY_OK; -} - -static struct notifier_block cacheinfo_cpu_notifier = { - .notifier_call = cacheinfo_cpu_callback, -}; - -static int __init cache_sysfs_init(void) -{ - int i, err = 0; - - if (num_cache_leaves == 0) - return 0; - - cpu_notifier_register_begin(); - for_each_online_cpu(i) { - struct device *dev = get_cpu_device(i); - - err = cache_add_dev(dev); - if (err) - goto out; - } - __register_hotcpu_notifier(&cacheinfo_cpu_notifier); - -out: - cpu_notifier_register_done(); - return err; -} - -device_initcall(cache_sysfs_init); - -#endif +DEFINE_SMP_CALL_CACHE_FUNCTION(init_cache_level) +DEFINE_SMP_CALL_CACHE_FUNCTION(populate_cache_leaves) diff --git a/arch/x86/kernel/cpu/mcheck/mce-internal.h b/arch/x86/kernel/cpu/mcheck/mce-internal.h index 10b46906767f..fe32074b865b 100644 --- a/arch/x86/kernel/cpu/mcheck/mce-internal.h +++ b/arch/x86/kernel/cpu/mcheck/mce-internal.h @@ -14,6 +14,7 @@ enum severity_level { }; #define ATTR_LEN 16 +#define INITIAL_CHECK_INTERVAL 5 * 60 /* 5 minutes */ /* One object for each MCE bank, shared by all CPUs */ struct mce_bank { @@ -23,20 +24,20 @@ struct mce_bank { char attrname[ATTR_LEN]; /* attribute name */ }; -int mce_severity(struct mce *a, int tolerant, char **msg, bool is_excp); +extern int (*mce_severity)(struct mce *a, int tolerant, char **msg, bool is_excp); struct dentry *mce_get_debugfs_dir(void); extern struct mce_bank *mce_banks; extern mce_banks_t mce_banks_ce_disabled; #ifdef CONFIG_X86_MCE_INTEL -unsigned long mce_intel_adjust_timer(unsigned long interval); -void mce_intel_cmci_poll(void); +unsigned long cmci_intel_adjust_timer(unsigned long interval); +bool mce_intel_cmci_poll(void); void mce_intel_hcpu_update(unsigned long cpu); void cmci_disable_bank(int bank); #else -# define mce_intel_adjust_timer mce_adjust_timer_default -static inline void mce_intel_cmci_poll(void) { } +# define cmci_intel_adjust_timer mce_adjust_timer_default +static inline bool mce_intel_cmci_poll(void) { return false; } static inline void mce_intel_hcpu_update(unsigned long cpu) { } static inline void cmci_disable_bank(int bank) { } #endif diff --git a/arch/x86/kernel/cpu/mcheck/mce-severity.c b/arch/x86/kernel/cpu/mcheck/mce-severity.c index 8bb433043a7f..9c682c222071 100644 --- a/arch/x86/kernel/cpu/mcheck/mce-severity.c +++ b/arch/x86/kernel/cpu/mcheck/mce-severity.c @@ -186,7 +186,61 @@ static int error_context(struct mce *m) return ((m->cs & 3) == 3) ? IN_USER : IN_KERNEL; } -int mce_severity(struct mce *m, int tolerant, char **msg, bool is_excp) +/* + * See AMD Error Scope Hierarchy table in a newer BKDG. For example + * 49125_15h_Models_30h-3Fh_BKDG.pdf, section "RAS Features" + */ +static int mce_severity_amd(struct mce *m, int tolerant, char **msg, bool is_excp) +{ + enum context ctx = error_context(m); + + /* Processor Context Corrupt, no need to fumble too much, die! */ + if (m->status & MCI_STATUS_PCC) + return MCE_PANIC_SEVERITY; + + if (m->status & MCI_STATUS_UC) { + + /* + * On older systems where overflow_recov flag is not present, we + * should simply panic if an error overflow occurs. If + * overflow_recov flag is present and set, then software can try + * to at least kill process to prolong system operation. + */ + if (mce_flags.overflow_recov) { + /* software can try to contain */ + if (!(m->mcgstatus & MCG_STATUS_RIPV) && (ctx == IN_KERNEL)) + return MCE_PANIC_SEVERITY; + + /* kill current process */ + return MCE_AR_SEVERITY; + } else { + /* at least one error was not logged */ + if (m->status & MCI_STATUS_OVER) + return MCE_PANIC_SEVERITY; + } + + /* + * For any other case, return MCE_UC_SEVERITY so that we log the + * error and exit #MC handler. + */ + return MCE_UC_SEVERITY; + } + + /* + * deferred error: poll handler catches these and adds to mce_ring so + * memory-failure can take recovery actions. + */ + if (m->status & MCI_STATUS_DEFERRED) + return MCE_DEFERRED_SEVERITY; + + /* + * corrected error: poll handler catches these and passes responsibility + * of decoding the error to EDAC + */ + return MCE_KEEP_SEVERITY; +} + +static int mce_severity_intel(struct mce *m, int tolerant, char **msg, bool is_excp) { enum exception excp = (is_excp ? EXCP_CONTEXT : NO_EXCP); enum context ctx = error_context(m); @@ -216,6 +270,16 @@ int mce_severity(struct mce *m, int tolerant, char **msg, bool is_excp) } } +/* Default to mce_severity_intel */ +int (*mce_severity)(struct mce *m, int tolerant, char **msg, bool is_excp) = + mce_severity_intel; + +void __init mcheck_vendor_init_severity(void) +{ + if (boot_cpu_data.x86_vendor == X86_VENDOR_AMD) + mce_severity = mce_severity_amd; +} + #ifdef CONFIG_DEBUG_FS static void *s_start(struct seq_file *f, loff_t *pos) { diff --git a/arch/x86/kernel/cpu/mcheck/mce.c b/arch/x86/kernel/cpu/mcheck/mce.c index 3c036cb4a370..e535533d5ab8 100644 --- a/arch/x86/kernel/cpu/mcheck/mce.c +++ b/arch/x86/kernel/cpu/mcheck/mce.c @@ -60,11 +60,12 @@ static DEFINE_MUTEX(mce_chrdev_read_mutex); #define CREATE_TRACE_POINTS #include <trace/events/mce.h> -#define SPINUNIT 100 /* 100ns */ +#define SPINUNIT 100 /* 100ns */ DEFINE_PER_CPU(unsigned, mce_exception_count); struct mce_bank *mce_banks __read_mostly; +struct mce_vendor_flags mce_flags __read_mostly; struct mca_config mca_cfg __read_mostly = { .bootlog = -1, @@ -89,9 +90,6 @@ static DECLARE_WAIT_QUEUE_HEAD(mce_chrdev_wait); static DEFINE_PER_CPU(struct mce, mces_seen); static int cpu_missing; -/* CMCI storm detection filter */ -static DEFINE_PER_CPU(unsigned long, mce_polled_error); - /* * MCA banks polled by the period polling timer for corrected events. * With Intel CMCI, this only has MCA banks which do not support CMCI (if any). @@ -622,8 +620,9 @@ DEFINE_PER_CPU(unsigned, mce_poll_count); * is already totally * confused. In this case it's likely it will * not fully execute the machine check handler either. */ -void machine_check_poll(enum mcp_flags flags, mce_banks_t *b) +bool machine_check_poll(enum mcp_flags flags, mce_banks_t *b) { + bool error_logged = false; struct mce m; int severity; int i; @@ -646,7 +645,7 @@ void machine_check_poll(enum mcp_flags flags, mce_banks_t *b) if (!(m.status & MCI_STATUS_VAL)) continue; - this_cpu_write(mce_polled_error, 1); + /* * Uncorrected or signalled events are handled by the exception * handler when it is enabled, so don't process those here. @@ -679,8 +678,10 @@ void machine_check_poll(enum mcp_flags flags, mce_banks_t *b) * Don't get the IP here because it's unlikely to * have anything to do with the actual error location. */ - if (!(flags & MCP_DONTLOG) && !mca_cfg.dont_log_ce) + if (!(flags & MCP_DONTLOG) && !mca_cfg.dont_log_ce) { + error_logged = true; mce_log(&m); + } /* * Clear state for this bank. @@ -694,6 +695,8 @@ void machine_check_poll(enum mcp_flags flags, mce_banks_t *b) */ sync_core(); + + return error_logged; } EXPORT_SYMBOL_GPL(machine_check_poll); @@ -813,7 +816,7 @@ static void mce_reign(void) * other CPUs. */ if (m && global_worst >= MCE_PANIC_SEVERITY && mca_cfg.tolerant < 3) - mce_panic("Fatal Machine check", m, msg); + mce_panic("Fatal machine check", m, msg); /* * For UC somewhere we let the CPU who detects it handle it. @@ -826,7 +829,7 @@ static void mce_reign(void) * source or one CPU is hung. Panic. */ if (global_worst <= MCE_KEEP_SEVERITY && mca_cfg.tolerant < 3) - mce_panic("Machine check from unknown source", NULL, NULL); + mce_panic("Fatal machine check from unknown source", NULL, NULL); /* * Now clear all the mces_seen so that they don't reappear on @@ -1258,7 +1261,7 @@ void mce_log_therm_throt_event(__u64 status) * poller finds an MCE, poll 2x faster. When the poller finds no more * errors, poll 2x slower (up to check_interval seconds). */ -static unsigned long check_interval = 5 * 60; /* 5 minutes */ +static unsigned long check_interval = INITIAL_CHECK_INTERVAL; static DEFINE_PER_CPU(unsigned long, mce_next_interval); /* in jiffies */ static DEFINE_PER_CPU(struct timer_list, mce_timer); @@ -1268,49 +1271,57 @@ static unsigned long mce_adjust_timer_default(unsigned long interval) return interval; } -static unsigned long (*mce_adjust_timer)(unsigned long interval) = - mce_adjust_timer_default; +static unsigned long (*mce_adjust_timer)(unsigned long interval) = mce_adjust_timer_default; -static int cmc_error_seen(void) +static void __restart_timer(struct timer_list *t, unsigned long interval) { - unsigned long *v = this_cpu_ptr(&mce_polled_error); + unsigned long when = jiffies + interval; + unsigned long flags; + + local_irq_save(flags); - return test_and_clear_bit(0, v); + if (timer_pending(t)) { + if (time_before(when, t->expires)) + mod_timer_pinned(t, when); + } else { + t->expires = round_jiffies(when); + add_timer_on(t, smp_processor_id()); + } + + local_irq_restore(flags); } static void mce_timer_fn(unsigned long data) { struct timer_list *t = this_cpu_ptr(&mce_timer); + int cpu = smp_processor_id(); unsigned long iv; - int notify; - WARN_ON(smp_processor_id() != data); + WARN_ON(cpu != data); + + iv = __this_cpu_read(mce_next_interval); if (mce_available(this_cpu_ptr(&cpu_info))) { - machine_check_poll(MCP_TIMESTAMP, - this_cpu_ptr(&mce_poll_banks)); - mce_intel_cmci_poll(); + machine_check_poll(MCP_TIMESTAMP, this_cpu_ptr(&mce_poll_banks)); + + if (mce_intel_cmci_poll()) { + iv = mce_adjust_timer(iv); + goto done; + } } /* - * Alert userspace if needed. If we logged an MCE, reduce the - * polling interval, otherwise increase the polling interval. + * Alert userspace if needed. If we logged an MCE, reduce the polling + * interval, otherwise increase the polling interval. */ - iv = __this_cpu_read(mce_next_interval); - notify = mce_notify_irq(); - notify |= cmc_error_seen(); - if (notify) { + if (mce_notify_irq()) iv = max(iv / 2, (unsigned long) HZ/100); - } else { + else iv = min(iv * 2, round_jiffies_relative(check_interval * HZ)); - iv = mce_adjust_timer(iv); - } + +done: __this_cpu_write(mce_next_interval, iv); - /* Might have become 0 after CMCI storm subsided */ - if (iv) { - t->expires = jiffies + iv; - add_timer_on(t, smp_processor_id()); - } + __restart_timer(t, iv); } /* @@ -1319,16 +1330,10 @@ static void mce_timer_fn(unsigned long data) void mce_timer_kick(unsigned long interval) { struct timer_list *t = this_cpu_ptr(&mce_timer); - unsigned long when = jiffies + interval; unsigned long iv = __this_cpu_read(mce_next_interval); - if (timer_pending(t)) { - if (time_before(when, t->expires)) - mod_timer_pinned(t, when); - } else { - t->expires = round_jiffies(when); - add_timer_on(t, smp_processor_id()); - } + __restart_timer(t, interval); + if (interval < iv) __this_cpu_write(mce_next_interval, interval); } @@ -1525,45 +1530,46 @@ static int __mcheck_cpu_apply_quirks(struct cpuinfo_x86 *c) * Various K7s with broken bank 0 around. Always disable * by default. */ - if (c->x86 == 6 && cfg->banks > 0) + if (c->x86 == 6 && cfg->banks > 0) mce_banks[0].ctl = 0; - /* - * Turn off MC4_MISC thresholding banks on those models since - * they're not supported there. - */ - if (c->x86 == 0x15 && - (c->x86_model >= 0x10 && c->x86_model <= 0x1f)) { - int i; - u64 val, hwcr; - bool need_toggle; - u32 msrs[] = { + /* + * overflow_recov is supported for F15h Models 00h-0fh + * even though we don't have a CPUID bit for it. + */ + if (c->x86 == 0x15 && c->x86_model <= 0xf) + mce_flags.overflow_recov = 1; + + /* + * Turn off MC4_MISC thresholding banks on those models since + * they're not supported there. + */ + if (c->x86 == 0x15 && + (c->x86_model >= 0x10 && c->x86_model <= 0x1f)) { + int i; + u64 hwcr; + bool need_toggle; + u32 msrs[] = { 0x00000413, /* MC4_MISC0 */ 0xc0000408, /* MC4_MISC1 */ - }; + }; - rdmsrl(MSR_K7_HWCR, hwcr); + rdmsrl(MSR_K7_HWCR, hwcr); - /* McStatusWrEn has to be set */ - need_toggle = !(hwcr & BIT(18)); + /* McStatusWrEn has to be set */ + need_toggle = !(hwcr & BIT(18)); - if (need_toggle) - wrmsrl(MSR_K7_HWCR, hwcr | BIT(18)); + if (need_toggle) + wrmsrl(MSR_K7_HWCR, hwcr | BIT(18)); - for (i = 0; i < ARRAY_SIZE(msrs); i++) { - rdmsrl(msrs[i], val); + /* Clear CntP bit safely */ + for (i = 0; i < ARRAY_SIZE(msrs); i++) + msr_clear_bit(msrs[i], 62); - /* CntP bit set? */ - if (val & BIT_64(62)) { - val &= ~BIT_64(62); - wrmsrl(msrs[i], val); - } - } - - /* restore old settings */ - if (need_toggle) - wrmsrl(MSR_K7_HWCR, hwcr); - } + /* restore old settings */ + if (need_toggle) + wrmsrl(MSR_K7_HWCR, hwcr); + } } if (c->x86_vendor == X86_VENDOR_INTEL) { @@ -1629,10 +1635,11 @@ static void __mcheck_cpu_init_vendor(struct cpuinfo_x86 *c) switch (c->x86_vendor) { case X86_VENDOR_INTEL: mce_intel_feature_init(c); - mce_adjust_timer = mce_intel_adjust_timer; + mce_adjust_timer = cmci_intel_adjust_timer; break; case X86_VENDOR_AMD: mce_amd_feature_init(c); + mce_flags.overflow_recov = cpuid_ebx(0x80000007) & 0x1; break; default: break; @@ -2017,6 +2024,7 @@ __setup("mce", mcheck_enable); int __init mcheck_init(void) { mcheck_intel_therm_init(); + mcheck_vendor_init_severity(); return 0; } diff --git a/arch/x86/kernel/cpu/mcheck/mce_amd.c b/arch/x86/kernel/cpu/mcheck/mce_amd.c index f1c3769bbd64..55ad9b37cae8 100644 --- a/arch/x86/kernel/cpu/mcheck/mce_amd.c +++ b/arch/x86/kernel/cpu/mcheck/mce_amd.c @@ -79,7 +79,7 @@ static inline bool is_shared_bank(int bank) return (bank == 4); } -static const char * const bank4_names(struct threshold_block *b) +static const char *bank4_names(const struct threshold_block *b) { switch (b->address) { /* MSR4_MISC0 */ @@ -250,6 +250,7 @@ void mce_amd_feature_init(struct cpuinfo_x86 *c) if (!b.interrupt_capable) goto init; + b.interrupt_enable = 1; new = (high & MASK_LVTOFF_HI) >> 20; offset = setup_APIC_mce(offset, new); @@ -322,6 +323,8 @@ static void amd_threshold_interrupt(void) log: mce_setup(&m); rdmsrl(MSR_IA32_MCx_STATUS(bank), m.status); + if (!(m.status & MCI_STATUS_VAL)) + return; m.misc = ((u64)high << 32) | low; m.bank = bank; mce_log(&m); @@ -497,10 +500,12 @@ static int allocate_threshold_blocks(unsigned int cpu, unsigned int bank, b->interrupt_capable = lvt_interrupt_supported(bank, high); b->threshold_limit = THRESHOLD_MAX; - if (b->interrupt_capable) + if (b->interrupt_capable) { threshold_ktype.default_attrs[2] = &interrupt_enable.attr; - else + b->interrupt_enable = 1; + } else { threshold_ktype.default_attrs[2] = NULL; + } INIT_LIST_HEAD(&b->miscj); diff --git a/arch/x86/kernel/cpu/mcheck/mce_intel.c b/arch/x86/kernel/cpu/mcheck/mce_intel.c index b3c97bafc123..b4a41cf030ed 100644 --- a/arch/x86/kernel/cpu/mcheck/mce_intel.c +++ b/arch/x86/kernel/cpu/mcheck/mce_intel.c @@ -39,6 +39,15 @@ static DEFINE_PER_CPU(mce_banks_t, mce_banks_owned); /* + * CMCI storm detection backoff counter + * + * During storm, we reset this counter to INITIAL_CHECK_INTERVAL in case we've + * encountered an error. If not, we decrement it by one. We signal the end of + * the CMCI storm when it reaches 0. + */ +static DEFINE_PER_CPU(int, cmci_backoff_cnt); + +/* * cmci_discover_lock protects against parallel discovery attempts * which could race against each other. */ @@ -46,7 +55,7 @@ static DEFINE_RAW_SPINLOCK(cmci_discover_lock); #define CMCI_THRESHOLD 1 #define CMCI_POLL_INTERVAL (30 * HZ) -#define CMCI_STORM_INTERVAL (1 * HZ) +#define CMCI_STORM_INTERVAL (HZ) #define CMCI_STORM_THRESHOLD 15 static DEFINE_PER_CPU(unsigned long, cmci_time_stamp); @@ -82,11 +91,21 @@ static int cmci_supported(int *banks) return !!(cap & MCG_CMCI_P); } -void mce_intel_cmci_poll(void) +bool mce_intel_cmci_poll(void) { if (__this_cpu_read(cmci_storm_state) == CMCI_STORM_NONE) - return; - machine_check_poll(MCP_TIMESTAMP, this_cpu_ptr(&mce_banks_owned)); + return false; + + /* + * Reset the counter if we've logged an error in the last poll + * during the storm. + */ + if (machine_check_poll(MCP_TIMESTAMP, this_cpu_ptr(&mce_banks_owned))) + this_cpu_write(cmci_backoff_cnt, INITIAL_CHECK_INTERVAL); + else + this_cpu_dec(cmci_backoff_cnt); + + return true; } void mce_intel_hcpu_update(unsigned long cpu) @@ -97,31 +116,32 @@ void mce_intel_hcpu_update(unsigned long cpu) per_cpu(cmci_storm_state, cpu) = CMCI_STORM_NONE; } -unsigned long mce_intel_adjust_timer(unsigned long interval) +unsigned long cmci_intel_adjust_timer(unsigned long interval) { - int r; - - if (interval < CMCI_POLL_INTERVAL) - return interval; + if ((this_cpu_read(cmci_backoff_cnt) > 0) && + (__this_cpu_read(cmci_storm_state) == CMCI_STORM_ACTIVE)) { + mce_notify_irq(); + return CMCI_STORM_INTERVAL; + } switch (__this_cpu_read(cmci_storm_state)) { case CMCI_STORM_ACTIVE: + /* * We switch back to interrupt mode once the poll timer has - * silenced itself. That means no events recorded and the - * timer interval is back to our poll interval. + * silenced itself. That means no events recorded and the timer + * interval is back to our poll interval. */ __this_cpu_write(cmci_storm_state, CMCI_STORM_SUBSIDED); - r = atomic_sub_return(1, &cmci_storm_on_cpus); - if (r == 0) + if (!atomic_sub_return(1, &cmci_storm_on_cpus)) pr_notice("CMCI storm subsided: switching to interrupt mode\n"); + /* FALLTHROUGH */ case CMCI_STORM_SUBSIDED: /* - * We wait for all cpus to go back to SUBSIDED - * state. When that happens we switch back to - * interrupt mode. + * We wait for all CPUs to go back to SUBSIDED state. When that + * happens we switch back to interrupt mode. */ if (!atomic_read(&cmci_storm_on_cpus)) { __this_cpu_write(cmci_storm_state, CMCI_STORM_NONE); @@ -130,10 +150,8 @@ unsigned long mce_intel_adjust_timer(unsigned long interval) } return CMCI_POLL_INTERVAL; default: - /* - * We have shiny weather. Let the poll do whatever it - * thinks. - */ + + /* We have shiny weather. Let the poll do whatever it thinks. */ return interval; } } @@ -178,7 +196,8 @@ static bool cmci_storm_detect(void) cmci_storm_disable_banks(); __this_cpu_write(cmci_storm_state, CMCI_STORM_ACTIVE); r = atomic_add_return(1, &cmci_storm_on_cpus); - mce_timer_kick(CMCI_POLL_INTERVAL); + mce_timer_kick(CMCI_STORM_INTERVAL); + this_cpu_write(cmci_backoff_cnt, INITIAL_CHECK_INTERVAL); if (r == 1) pr_notice("CMCI storm detected: switching to poll mode\n"); @@ -195,6 +214,7 @@ static void intel_threshold_interrupt(void) { if (cmci_storm_detect()) return; + machine_check_poll(MCP_TIMESTAMP, this_cpu_ptr(&mce_banks_owned)); mce_notify_irq(); } @@ -286,6 +306,7 @@ void cmci_recheck(void) if (!mce_available(raw_cpu_ptr(&cpu_info)) || !cmci_supported(&banks)) return; + local_irq_save(flags); machine_check_poll(MCP_TIMESTAMP, this_cpu_ptr(&mce_banks_owned)); local_irq_restore(flags); diff --git a/arch/x86/kernel/cpu/microcode/amd.c b/arch/x86/kernel/cpu/microcode/amd.c index bfbbe6195e2d..12829c3ced3c 100644 --- a/arch/x86/kernel/cpu/microcode/amd.c +++ b/arch/x86/kernel/cpu/microcode/amd.c @@ -21,7 +21,6 @@ #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt #include <linux/firmware.h> -#include <linux/pci_ids.h> #include <linux/uaccess.h> #include <linux/vmalloc.h> #include <linux/kernel.h> diff --git a/arch/x86/kernel/cpu/microcode/core_early.c b/arch/x86/kernel/cpu/microcode/core_early.c index d45df4bd16ab..a413a69cbd74 100644 --- a/arch/x86/kernel/cpu/microcode/core_early.c +++ b/arch/x86/kernel/cpu/microcode/core_early.c @@ -23,57 +23,6 @@ #include <asm/processor.h> #include <asm/cmdline.h> -#define QCHAR(a, b, c, d) ((a) + ((b) << 8) + ((c) << 16) + ((d) << 24)) -#define CPUID_INTEL1 QCHAR('G', 'e', 'n', 'u') -#define CPUID_INTEL2 QCHAR('i', 'n', 'e', 'I') -#define CPUID_INTEL3 QCHAR('n', 't', 'e', 'l') -#define CPUID_AMD1 QCHAR('A', 'u', 't', 'h') -#define CPUID_AMD2 QCHAR('e', 'n', 't', 'i') -#define CPUID_AMD3 QCHAR('c', 'A', 'M', 'D') - -#define CPUID_IS(a, b, c, ebx, ecx, edx) \ - (!((ebx ^ (a))|(edx ^ (b))|(ecx ^ (c)))) - -/* - * In early loading microcode phase on BSP, boot_cpu_data is not set up yet. - * x86_vendor() gets vendor id for BSP. - * - * In 32 bit AP case, accessing boot_cpu_data needs linear address. To simplify - * coding, we still use x86_vendor() to get vendor id for AP. - * - * x86_vendor() gets vendor information directly through cpuid. - */ -static int x86_vendor(void) -{ - u32 eax = 0x00000000; - u32 ebx, ecx = 0, edx; - - native_cpuid(&eax, &ebx, &ecx, &edx); - - if (CPUID_IS(CPUID_INTEL1, CPUID_INTEL2, CPUID_INTEL3, ebx, ecx, edx)) - return X86_VENDOR_INTEL; - - if (CPUID_IS(CPUID_AMD1, CPUID_AMD2, CPUID_AMD3, ebx, ecx, edx)) - return X86_VENDOR_AMD; - - return X86_VENDOR_UNKNOWN; -} - -static int x86_family(void) -{ - u32 eax = 0x00000001; - u32 ebx, ecx = 0, edx; - int x86; - - native_cpuid(&eax, &ebx, &ecx, &edx); - - x86 = (eax >> 8) & 0xf; - if (x86 == 15) - x86 += (eax >> 20) & 0xff; - - return x86; -} - static bool __init check_loader_disabled_bsp(void) { #ifdef CONFIG_X86_32 @@ -96,7 +45,7 @@ static bool __init check_loader_disabled_bsp(void) void __init load_ucode_bsp(void) { - int vendor, x86; + int vendor, family; if (check_loader_disabled_bsp()) return; @@ -105,15 +54,15 @@ void __init load_ucode_bsp(void) return; vendor = x86_vendor(); - x86 = x86_family(); + family = x86_family(); switch (vendor) { case X86_VENDOR_INTEL: - if (x86 >= 6) + if (family >= 6) load_ucode_intel_bsp(); break; case X86_VENDOR_AMD: - if (x86 >= 0x10) + if (family >= 0x10) load_ucode_amd_bsp(); break; default: @@ -132,7 +81,7 @@ static bool check_loader_disabled_ap(void) void load_ucode_ap(void) { - int vendor, x86; + int vendor, family; if (check_loader_disabled_ap()) return; @@ -141,15 +90,15 @@ void load_ucode_ap(void) return; vendor = x86_vendor(); - x86 = x86_family(); + family = x86_family(); switch (vendor) { case X86_VENDOR_INTEL: - if (x86 >= 6) + if (family >= 6) load_ucode_intel_ap(); break; case X86_VENDOR_AMD: - if (x86 >= 0x10) + if (family >= 0x10) load_ucode_amd_ap(); break; default: @@ -179,18 +128,18 @@ int __init save_microcode_in_initrd(void) void reload_early_microcode(void) { - int vendor, x86; + int vendor, family; vendor = x86_vendor(); - x86 = x86_family(); + family = x86_family(); switch (vendor) { case X86_VENDOR_INTEL: - if (x86 >= 6) + if (family >= 6) reload_ucode_intel(); break; case X86_VENDOR_AMD: - if (x86 >= 0x10) + if (family >= 0x10) reload_ucode_amd(); break; default: diff --git a/arch/x86/kernel/cpu/microcode/intel.c b/arch/x86/kernel/cpu/microcode/intel.c index 746e7fd08aad..a41beadb3db9 100644 --- a/arch/x86/kernel/cpu/microcode/intel.c +++ b/arch/x86/kernel/cpu/microcode/intel.c @@ -124,7 +124,7 @@ static int get_matching_mc(struct microcode_intel *mc_intel, int cpu) cpf = cpu_sig.pf; crev = cpu_sig.rev; - return get_matching_microcode(csig, cpf, mc_intel, crev); + return get_matching_microcode(csig, cpf, crev, mc_intel); } static int apply_microcode_intel(int cpu) @@ -226,7 +226,7 @@ static enum ucode_state generic_load_microcode(int cpu, void *data, size_t size, csig = uci->cpu_sig.sig; cpf = uci->cpu_sig.pf; - if (get_matching_microcode(csig, cpf, mc, new_rev)) { + if (get_matching_microcode(csig, cpf, new_rev, mc)) { vfree(new_mc); new_rev = mc_header.rev; new_mc = mc; diff --git a/arch/x86/kernel/cpu/microcode/intel_early.c b/arch/x86/kernel/cpu/microcode/intel_early.c index 420eb933189c..2f49ab4ac0ae 100644 --- a/arch/x86/kernel/cpu/microcode/intel_early.c +++ b/arch/x86/kernel/cpu/microcode/intel_early.c @@ -16,6 +16,14 @@ * as published by the Free Software Foundation; either version * 2 of the License, or (at your option) any later version. */ + +/* + * This needs to be before all headers so that pr_debug in printk.h doesn't turn + * printk calls into no_printk(). + * + *#define DEBUG + */ + #include <linux/module.h> #include <linux/mm.h> #include <linux/slab.h> @@ -28,6 +36,9 @@ #include <asm/tlbflush.h> #include <asm/setup.h> +#undef pr_fmt +#define pr_fmt(fmt) "microcode: " fmt + static unsigned long mc_saved_in_initrd[MAX_UCODE_COUNT]; static struct mc_saved_data { unsigned int mc_saved_count; @@ -35,50 +46,45 @@ static struct mc_saved_data { } mc_saved_data; static enum ucode_state -generic_load_microcode_early(struct microcode_intel **mc_saved_p, - unsigned int mc_saved_count, - struct ucode_cpu_info *uci) +load_microcode_early(struct microcode_intel **saved, + unsigned int num_saved, struct ucode_cpu_info *uci) { struct microcode_intel *ucode_ptr, *new_mc = NULL; - int new_rev = uci->cpu_sig.rev; - enum ucode_state state = UCODE_OK; - unsigned int mc_size; - struct microcode_header_intel *mc_header; - unsigned int csig = uci->cpu_sig.sig; - unsigned int cpf = uci->cpu_sig.pf; - int i; + struct microcode_header_intel *mc_hdr; + int new_rev, ret, i; - for (i = 0; i < mc_saved_count; i++) { - ucode_ptr = mc_saved_p[i]; + new_rev = uci->cpu_sig.rev; - mc_header = (struct microcode_header_intel *)ucode_ptr; - mc_size = get_totalsize(mc_header); - if (get_matching_microcode(csig, cpf, ucode_ptr, new_rev)) { - new_rev = mc_header->rev; - new_mc = ucode_ptr; - } - } + for (i = 0; i < num_saved; i++) { + ucode_ptr = saved[i]; + mc_hdr = (struct microcode_header_intel *)ucode_ptr; - if (!new_mc) { - state = UCODE_NFOUND; - goto out; + ret = get_matching_microcode(uci->cpu_sig.sig, + uci->cpu_sig.pf, + new_rev, + ucode_ptr); + if (!ret) + continue; + + new_rev = mc_hdr->rev; + new_mc = ucode_ptr; } + if (!new_mc) + return UCODE_NFOUND; + uci->mc = (struct microcode_intel *)new_mc; -out: - return state; + return UCODE_OK; } -static void -microcode_pointer(struct microcode_intel **mc_saved, - unsigned long *mc_saved_in_initrd, - unsigned long initrd_start, int mc_saved_count) +static inline void +copy_initrd_ptrs(struct microcode_intel **mc_saved, unsigned long *initrd, + unsigned long off, int num_saved) { int i; - for (i = 0; i < mc_saved_count; i++) - mc_saved[i] = (struct microcode_intel *) - (mc_saved_in_initrd[i] + initrd_start); + for (i = 0; i < num_saved; i++) + mc_saved[i] = (struct microcode_intel *)(initrd[i] + off); } #ifdef CONFIG_X86_32 @@ -102,55 +108,27 @@ microcode_phys(struct microcode_intel **mc_saved_tmp, #endif static enum ucode_state -load_microcode(struct mc_saved_data *mc_saved_data, - unsigned long *mc_saved_in_initrd, - unsigned long initrd_start, - struct ucode_cpu_info *uci) +load_microcode(struct mc_saved_data *mc_saved_data, unsigned long *initrd, + unsigned long initrd_start, struct ucode_cpu_info *uci) { struct microcode_intel *mc_saved_tmp[MAX_UCODE_COUNT]; unsigned int count = mc_saved_data->mc_saved_count; if (!mc_saved_data->mc_saved) { - microcode_pointer(mc_saved_tmp, mc_saved_in_initrd, - initrd_start, count); + copy_initrd_ptrs(mc_saved_tmp, initrd, initrd_start, count); - return generic_load_microcode_early(mc_saved_tmp, count, uci); + return load_microcode_early(mc_saved_tmp, count, uci); } else { #ifdef CONFIG_X86_32 microcode_phys(mc_saved_tmp, mc_saved_data); - return generic_load_microcode_early(mc_saved_tmp, count, uci); + return load_microcode_early(mc_saved_tmp, count, uci); #else - return generic_load_microcode_early(mc_saved_data->mc_saved, + return load_microcode_early(mc_saved_data->mc_saved, count, uci); #endif } } -static u8 get_x86_family(unsigned long sig) -{ - u8 x86; - - x86 = (sig >> 8) & 0xf; - - if (x86 == 0xf) - x86 += (sig >> 20) & 0xff; - - return x86; -} - -static u8 get_x86_model(unsigned long sig) -{ - u8 x86, x86_model; - - x86 = get_x86_family(sig); - x86_model = (sig >> 4) & 0xf; - - if (x86 == 0x6 || x86 == 0xf) - x86_model += ((sig >> 16) & 0xf) << 4; - - return x86_model; -} - /* * Given CPU signature and a microcode patch, this function finds if the * microcode patch has matching family and model with the CPU. @@ -159,42 +137,40 @@ static enum ucode_state matching_model_microcode(struct microcode_header_intel *mc_header, unsigned long sig) { - u8 x86, x86_model; - u8 x86_ucode, x86_model_ucode; + unsigned int fam, model; + unsigned int fam_ucode, model_ucode; struct extended_sigtable *ext_header; unsigned long total_size = get_totalsize(mc_header); unsigned long data_size = get_datasize(mc_header); int ext_sigcount, i; struct extended_signature *ext_sig; - x86 = get_x86_family(sig); - x86_model = get_x86_model(sig); + fam = __x86_family(sig); + model = x86_model(sig); - x86_ucode = get_x86_family(mc_header->sig); - x86_model_ucode = get_x86_model(mc_header->sig); + fam_ucode = __x86_family(mc_header->sig); + model_ucode = x86_model(mc_header->sig); - if (x86 == x86_ucode && x86_model == x86_model_ucode) + if (fam == fam_ucode && model == model_ucode) return UCODE_OK; /* Look for ext. headers: */ if (total_size <= data_size + MC_HEADER_SIZE) return UCODE_NFOUND; - ext_header = (struct extended_sigtable *) - mc_header + data_size + MC_HEADER_SIZE; + ext_header = (void *) mc_header + data_size + MC_HEADER_SIZE; + ext_sig = (void *)ext_header + EXT_HEADER_SIZE; ext_sigcount = ext_header->count; - ext_sig = (void *)ext_header + EXT_HEADER_SIZE; for (i = 0; i < ext_sigcount; i++) { - x86_ucode = get_x86_family(ext_sig->sig); - x86_model_ucode = get_x86_model(ext_sig->sig); + fam_ucode = __x86_family(ext_sig->sig); + model_ucode = x86_model(ext_sig->sig); - if (x86 == x86_ucode && x86_model == x86_model_ucode) + if (fam == fam_ucode && model == model_ucode) return UCODE_OK; ext_sig++; } - return UCODE_NFOUND; } @@ -204,7 +180,7 @@ save_microcode(struct mc_saved_data *mc_saved_data, unsigned int mc_saved_count) { int i, j; - struct microcode_intel **mc_saved_p; + struct microcode_intel **saved_ptr; int ret; if (!mc_saved_count) @@ -213,39 +189,45 @@ save_microcode(struct mc_saved_data *mc_saved_data, /* * Copy new microcode data. */ - mc_saved_p = kmalloc(mc_saved_count*sizeof(struct microcode_intel *), - GFP_KERNEL); - if (!mc_saved_p) + saved_ptr = kcalloc(mc_saved_count, sizeof(struct microcode_intel *), GFP_KERNEL); + if (!saved_ptr) return -ENOMEM; for (i = 0; i < mc_saved_count; i++) { - struct microcode_intel *mc = mc_saved_src[i]; - struct microcode_header_intel *mc_header = &mc->hdr; - unsigned long mc_size = get_totalsize(mc_header); - mc_saved_p[i] = kmalloc(mc_size, GFP_KERNEL); - if (!mc_saved_p[i]) { - ret = -ENOMEM; - goto err; - } + struct microcode_header_intel *mc_hdr; + struct microcode_intel *mc; + unsigned long size; + if (!mc_saved_src[i]) { ret = -EINVAL; goto err; } - memcpy(mc_saved_p[i], mc, mc_size); + + mc = mc_saved_src[i]; + mc_hdr = &mc->hdr; + size = get_totalsize(mc_hdr); + + saved_ptr[i] = kmalloc(size, GFP_KERNEL); + if (!saved_ptr[i]) { + ret = -ENOMEM; + goto err; + } + + memcpy(saved_ptr[i], mc, size); } /* * Point to newly saved microcode. */ - mc_saved_data->mc_saved = mc_saved_p; + mc_saved_data->mc_saved = saved_ptr; mc_saved_data->mc_saved_count = mc_saved_count; return 0; err: for (j = 0; j <= i; j++) - kfree(mc_saved_p[j]); - kfree(mc_saved_p); + kfree(saved_ptr[j]); + kfree(saved_ptr); return ret; } @@ -257,48 +239,45 @@ err: * - or if it is a newly discovered microcode patch. * * The microcode patch should have matching model with CPU. + * + * Returns: The updated number @num_saved of saved microcode patches. */ -static void _save_mc(struct microcode_intel **mc_saved, u8 *ucode_ptr, - unsigned int *mc_saved_count_p) +static unsigned int _save_mc(struct microcode_intel **mc_saved, + u8 *ucode_ptr, unsigned int num_saved) { - int i; - int found = 0; - unsigned int mc_saved_count = *mc_saved_count_p; - struct microcode_header_intel *mc_header; + struct microcode_header_intel *mc_hdr, *mc_saved_hdr; + unsigned int sig, pf, new_rev; + int found = 0, i; + + mc_hdr = (struct microcode_header_intel *)ucode_ptr; + + for (i = 0; i < num_saved; i++) { + mc_saved_hdr = (struct microcode_header_intel *)mc_saved[i]; + sig = mc_saved_hdr->sig; + pf = mc_saved_hdr->pf; + new_rev = mc_hdr->rev; + + if (!get_matching_sig(sig, pf, new_rev, ucode_ptr)) + continue; + + found = 1; + + if (!revision_is_newer(mc_hdr, new_rev)) + continue; - mc_header = (struct microcode_header_intel *)ucode_ptr; - for (i = 0; i < mc_saved_count; i++) { - unsigned int sig, pf; - unsigned int new_rev; - struct microcode_header_intel *mc_saved_header = - (struct microcode_header_intel *)mc_saved[i]; - sig = mc_saved_header->sig; - pf = mc_saved_header->pf; - new_rev = mc_header->rev; - - if (get_matching_sig(sig, pf, ucode_ptr, new_rev)) { - found = 1; - if (update_match_revision(mc_header, new_rev)) { - /* - * Found an older ucode saved before. - * Replace the older one with this newer - * one. - */ - mc_saved[i] = - (struct microcode_intel *)ucode_ptr; - break; - } - } - } - if (i >= mc_saved_count && !found) /* - * This ucode is first time discovered in ucode file. - * Save it to memory. + * Found an older ucode saved earlier. Replace it with + * this newer one. */ - mc_saved[mc_saved_count++] = - (struct microcode_intel *)ucode_ptr; + mc_saved[i] = (struct microcode_intel *)ucode_ptr; + break; + } + + /* Newly detected microcode, save it to memory. */ + if (i >= num_saved && !found) + mc_saved[num_saved++] = (struct microcode_intel *)ucode_ptr; - *mc_saved_count_p = mc_saved_count; + return num_saved; } /* @@ -346,7 +325,7 @@ get_matching_model_microcode(int cpu, unsigned long start, continue; } - _save_mc(mc_saved_tmp, ucode_ptr, &mc_saved_count); + mc_saved_count = _save_mc(mc_saved_tmp, ucode_ptr, mc_saved_count); ucode_ptr += mc_size; } @@ -372,7 +351,7 @@ out: static int collect_cpu_info_early(struct ucode_cpu_info *uci) { unsigned int val[2]; - u8 x86, x86_model; + unsigned int family, model; struct cpu_signature csig; unsigned int eax, ebx, ecx, edx; @@ -387,10 +366,10 @@ static int collect_cpu_info_early(struct ucode_cpu_info *uci) native_cpuid(&eax, &ebx, &ecx, &edx); csig.sig = eax; - x86 = get_x86_family(csig.sig); - x86_model = get_x86_model(csig.sig); + family = __x86_family(csig.sig); + model = x86_model(csig.sig); - if ((x86_model >= 5) || (x86 > 6)) { + if ((model >= 5) || (family > 6)) { /* get processor flags from MSR 0x17 */ native_rdmsr(MSR_IA32_PLATFORM_ID, val[0], val[1]); csig.pf = 1 << ((val[1] >> 18) & 7); @@ -429,8 +408,7 @@ static void __ref show_saved_mc(void) sig = uci.cpu_sig.sig; pf = uci.cpu_sig.pf; rev = uci.cpu_sig.rev; - pr_debug("CPU%d: sig=0x%x, pf=0x%x, rev=0x%x\n", - smp_processor_id(), sig, pf, rev); + pr_debug("CPU: sig=0x%x, pf=0x%x, rev=0x%x\n", sig, pf, rev); for (i = 0; i < mc_saved_data.mc_saved_count; i++) { struct microcode_header_intel *mc_saved_header; @@ -457,8 +435,7 @@ static void __ref show_saved_mc(void) if (total_size <= data_size + MC_HEADER_SIZE) continue; - ext_header = (struct extended_sigtable *) - mc_saved_header + data_size + MC_HEADER_SIZE; + ext_header = (void *) mc_saved_header + data_size + MC_HEADER_SIZE; ext_sigcount = ext_header->count; ext_sig = (void *)ext_header + EXT_HEADER_SIZE; @@ -515,8 +492,7 @@ int save_mc_for_early(u8 *mc) * Save the microcode patch mc in mc_save_tmp structure if it's a newer * version. */ - - _save_mc(mc_saved_tmp, mc, &mc_saved_count); + mc_saved_count = _save_mc(mc_saved_tmp, mc, mc_saved_count); /* * Save the mc_save_tmp in global mc_saved_data. @@ -548,12 +524,10 @@ EXPORT_SYMBOL_GPL(save_mc_for_early); static __initdata char ucode_name[] = "kernel/x86/microcode/GenuineIntel.bin"; static __init enum ucode_state -scan_microcode(unsigned long start, unsigned long end, - struct mc_saved_data *mc_saved_data, - unsigned long *mc_saved_in_initrd, - struct ucode_cpu_info *uci) +scan_microcode(struct mc_saved_data *mc_saved_data, unsigned long *initrd, + unsigned long start, unsigned long size, + struct ucode_cpu_info *uci) { - unsigned int size = end - start + 1; struct cpio_data cd; long offset = 0; #ifdef CONFIG_X86_32 @@ -569,10 +543,8 @@ scan_microcode(unsigned long start, unsigned long end, if (!cd.data) return UCODE_ERROR; - return get_matching_model_microcode(0, start, cd.data, cd.size, - mc_saved_data, mc_saved_in_initrd, - uci); + mc_saved_data, initrd, uci); } /* @@ -704,7 +676,7 @@ int __init save_microcode_in_initrd_intel(void) if (count == 0) return ret; - microcode_pointer(mc_saved, mc_saved_in_initrd, initrd_start, count); + copy_initrd_ptrs(mc_saved, mc_saved_in_initrd, initrd_start, count); ret = save_microcode(&mc_saved_data, mc_saved, count); if (ret) pr_err("Cannot save microcode patches from initrd.\n"); @@ -716,52 +688,44 @@ int __init save_microcode_in_initrd_intel(void) static void __init _load_ucode_intel_bsp(struct mc_saved_data *mc_saved_data, - unsigned long *mc_saved_in_initrd, - unsigned long initrd_start_early, - unsigned long initrd_end_early, - struct ucode_cpu_info *uci) + unsigned long *initrd, + unsigned long start, unsigned long size) { + struct ucode_cpu_info uci; enum ucode_state ret; - collect_cpu_info_early(uci); - scan_microcode(initrd_start_early, initrd_end_early, mc_saved_data, - mc_saved_in_initrd, uci); + collect_cpu_info_early(&uci); - ret = load_microcode(mc_saved_data, mc_saved_in_initrd, - initrd_start_early, uci); + ret = scan_microcode(mc_saved_data, initrd, start, size, &uci); + if (ret != UCODE_OK) + return; - if (ret == UCODE_OK) - apply_microcode_early(uci, true); + ret = load_microcode(mc_saved_data, initrd, start, &uci); + if (ret != UCODE_OK) + return; + + apply_microcode_early(&uci, true); } -void __init -load_ucode_intel_bsp(void) +void __init load_ucode_intel_bsp(void) { - u64 ramdisk_image, ramdisk_size; - unsigned long initrd_start_early, initrd_end_early; - struct ucode_cpu_info uci; + u64 start, size; #ifdef CONFIG_X86_32 - struct boot_params *boot_params_p; + struct boot_params *p; - boot_params_p = (struct boot_params *)__pa_nodebug(&boot_params); - ramdisk_image = boot_params_p->hdr.ramdisk_image; - ramdisk_size = boot_params_p->hdr.ramdisk_size; - initrd_start_early = ramdisk_image; - initrd_end_early = initrd_start_early + ramdisk_size; + p = (struct boot_params *)__pa_nodebug(&boot_params); + start = p->hdr.ramdisk_image; + size = p->hdr.ramdisk_size; _load_ucode_intel_bsp( - (struct mc_saved_data *)__pa_nodebug(&mc_saved_data), - (unsigned long *)__pa_nodebug(&mc_saved_in_initrd), - initrd_start_early, initrd_end_early, &uci); + (struct mc_saved_data *)__pa_nodebug(&mc_saved_data), + (unsigned long *)__pa_nodebug(&mc_saved_in_initrd), + start, size); #else - ramdisk_image = boot_params.hdr.ramdisk_image; - ramdisk_size = boot_params.hdr.ramdisk_size; - initrd_start_early = ramdisk_image + PAGE_OFFSET; - initrd_end_early = initrd_start_early + ramdisk_size; - - _load_ucode_intel_bsp(&mc_saved_data, mc_saved_in_initrd, - initrd_start_early, initrd_end_early, - &uci); + start = boot_params.hdr.ramdisk_image + PAGE_OFFSET; + size = boot_params.hdr.ramdisk_size; + + _load_ucode_intel_bsp(&mc_saved_data, mc_saved_in_initrd, start, size); #endif } @@ -771,6 +735,7 @@ void load_ucode_intel_ap(void) struct ucode_cpu_info uci; unsigned long *mc_saved_in_initrd_p; unsigned long initrd_start_addr; + enum ucode_state ret; #ifdef CONFIG_X86_32 unsigned long *initrd_start_p; @@ -793,8 +758,12 @@ void load_ucode_intel_ap(void) return; collect_cpu_info_early(&uci); - load_microcode(mc_saved_data_p, mc_saved_in_initrd_p, - initrd_start_addr, &uci); + ret = load_microcode(mc_saved_data_p, mc_saved_in_initrd_p, + initrd_start_addr, &uci); + + if (ret != UCODE_OK) + return; + apply_microcode_early(&uci, true); } @@ -808,8 +777,8 @@ void reload_ucode_intel(void) collect_cpu_info_early(&uci); - ret = generic_load_microcode_early(mc_saved_data.mc_saved, - mc_saved_data.mc_saved_count, &uci); + ret = load_microcode_early(mc_saved_data.mc_saved, + mc_saved_data.mc_saved_count, &uci); if (ret != UCODE_OK) return; diff --git a/arch/x86/kernel/cpu/microcode/intel_lib.c b/arch/x86/kernel/cpu/microcode/intel_lib.c index ce69320d0179..cd47a510a3f1 100644 --- a/arch/x86/kernel/cpu/microcode/intel_lib.c +++ b/arch/x86/kernel/cpu/microcode/intel_lib.c @@ -38,12 +38,6 @@ update_match_cpu(unsigned int csig, unsigned int cpf, return (!sigmatch(sig, csig, pf, cpf)) ? 0 : 1; } -int -update_match_revision(struct microcode_header_intel *mc_header, int rev) -{ - return (mc_header->rev <= rev) ? 0 : 1; -} - int microcode_sanity_check(void *mc, int print_err) { unsigned long total_size, data_size, ext_table_size; @@ -128,10 +122,9 @@ int microcode_sanity_check(void *mc, int print_err) EXPORT_SYMBOL_GPL(microcode_sanity_check); /* - * return 0 - no update found - * return 1 - found update + * Returns 1 if update has been found, 0 otherwise. */ -int get_matching_sig(unsigned int csig, int cpf, void *mc, int rev) +int get_matching_sig(unsigned int csig, int cpf, int rev, void *mc) { struct microcode_header_intel *mc_header = mc; struct extended_sigtable *ext_header; @@ -159,16 +152,15 @@ int get_matching_sig(unsigned int csig, int cpf, void *mc, int rev) } /* - * return 0 - no update found - * return 1 - found update + * Returns 1 if update has been found, 0 otherwise. */ -int get_matching_microcode(unsigned int csig, int cpf, void *mc, int rev) +int get_matching_microcode(unsigned int csig, int cpf, int rev, void *mc) { - struct microcode_header_intel *mc_header = mc; + struct microcode_header_intel *mc_hdr = mc; - if (!update_match_revision(mc_header, rev)) + if (!revision_is_newer(mc_hdr, rev)) return 0; - return get_matching_sig(csig, cpf, mc, rev); + return get_matching_sig(csig, cpf, rev, mc); } EXPORT_SYMBOL_GPL(get_matching_microcode); diff --git a/arch/x86/kernel/cpu/mkcapflags.sh b/arch/x86/kernel/cpu/mkcapflags.sh index 36d99a337b49..3f20710a5b23 100644 --- a/arch/x86/kernel/cpu/mkcapflags.sh +++ b/arch/x86/kernel/cpu/mkcapflags.sh @@ -6,7 +6,7 @@ IN=$1 OUT=$2 -function dump_array() +dump_array() { ARRAY=$1 SIZE=$2 diff --git a/arch/x86/kernel/cpu/perf_event.c b/arch/x86/kernel/cpu/perf_event.c index b71a7f86d68a..e2888a3ad1e3 100644 --- a/arch/x86/kernel/cpu/perf_event.c +++ b/arch/x86/kernel/cpu/perf_event.c @@ -2147,24 +2147,24 @@ perf_callchain_user(struct perf_callchain_entry *entry, struct pt_regs *regs) static unsigned long code_segment_base(struct pt_regs *regs) { /* + * For IA32 we look at the GDT/LDT segment base to convert the + * effective IP to a linear address. + */ + +#ifdef CONFIG_X86_32 + /* * If we are in VM86 mode, add the segment offset to convert to a * linear address. */ if (regs->flags & X86_VM_MASK) return 0x10 * regs->cs; - /* - * For IA32 we look at the GDT/LDT segment base to convert the - * effective IP to a linear address. - */ -#ifdef CONFIG_X86_32 if (user_mode(regs) && regs->cs != __USER_CS) return get_segment_base(regs->cs); #else - if (test_thread_flag(TIF_IA32)) { - if (user_mode(regs) && regs->cs != __USER32_CS) - return get_segment_base(regs->cs); - } + if (user_mode(regs) && !user_64bit_mode(regs) && + regs->cs != __USER32_CS) + return get_segment_base(regs->cs); #endif return 0; } diff --git a/arch/x86/kernel/cpu/perf_event_intel.c b/arch/x86/kernel/cpu/perf_event_intel.c index 498b6d967138..258990688a5e 100644 --- a/arch/x86/kernel/cpu/perf_event_intel.c +++ b/arch/x86/kernel/cpu/perf_event_intel.c @@ -212,11 +212,11 @@ static struct event_constraint intel_hsw_event_constraints[] = { INTEL_UEVENT_CONSTRAINT(0x01c0, 0x2), /* INST_RETIRED.PREC_DIST */ INTEL_EVENT_CONSTRAINT(0xcd, 0x8), /* MEM_TRANS_RETIRED.LOAD_LATENCY */ /* CYCLE_ACTIVITY.CYCLES_L1D_PENDING */ - INTEL_EVENT_CONSTRAINT(0x08a3, 0x4), + INTEL_UEVENT_CONSTRAINT(0x08a3, 0x4), /* CYCLE_ACTIVITY.STALLS_L1D_PENDING */ - INTEL_EVENT_CONSTRAINT(0x0ca3, 0x4), + INTEL_UEVENT_CONSTRAINT(0x0ca3, 0x4), /* CYCLE_ACTIVITY.CYCLES_NO_EXECUTE */ - INTEL_EVENT_CONSTRAINT(0x04a3, 0xf), + INTEL_UEVENT_CONSTRAINT(0x04a3, 0xf), EVENT_CONSTRAINT_END }; @@ -1649,11 +1649,11 @@ intel_get_event_constraints(struct cpu_hw_events *cpuc, struct perf_event *event if (c) return c; - c = intel_pebs_constraints(event); + c = intel_shared_regs_constraints(cpuc, event); if (c) return c; - c = intel_shared_regs_constraints(cpuc, event); + c = intel_pebs_constraints(event); if (c) return c; diff --git a/arch/x86/kernel/crash.c b/arch/x86/kernel/crash.c index aceb2f90c716..c76d3e37c6e1 100644 --- a/arch/x86/kernel/crash.c +++ b/arch/x86/kernel/crash.c @@ -105,7 +105,7 @@ static void kdump_nmi_callback(int cpu, struct pt_regs *regs) #ifdef CONFIG_X86_32 struct pt_regs fixed_regs; - if (!user_mode_vm(regs)) { + if (!user_mode(regs)) { crash_fixup_ss_esp(&fixed_regs, regs); regs = &fixed_regs; } diff --git a/arch/x86/kernel/devicetree.c b/arch/x86/kernel/devicetree.c index 3d3503351242..6367a780cc8c 100644 --- a/arch/x86/kernel/devicetree.c +++ b/arch/x86/kernel/devicetree.c @@ -286,13 +286,13 @@ static void __init x86_flattree_get_config(void) initial_boot_params = dt = early_memremap(initial_dtb, map_len); size = of_get_flat_dt_size(); if (map_len < size) { - early_iounmap(dt, map_len); + early_memunmap(dt, map_len); initial_boot_params = dt = early_memremap(initial_dtb, size); map_len = size; } unflatten_and_copy_device_tree(); - early_iounmap(dt, map_len); + early_memunmap(dt, map_len); } #else static inline void x86_flattree_get_config(void) { } diff --git a/arch/x86/kernel/dumpstack.c b/arch/x86/kernel/dumpstack.c index cf3df1d8d039..9c30acfadae2 100644 --- a/arch/x86/kernel/dumpstack.c +++ b/arch/x86/kernel/dumpstack.c @@ -25,10 +25,12 @@ unsigned int code_bytes = 64; int kstack_depth_to_print = 3 * STACKSLOTS_PER_LINE; static int die_counter; -static void printk_stack_address(unsigned long address, int reliable) +static void printk_stack_address(unsigned long address, int reliable, + void *data) { - pr_cont(" [<%p>] %s%pB\n", - (void *)address, reliable ? "" : "? ", (void *)address); + printk("%s [<%p>] %s%pB\n", + (char *)data, (void *)address, reliable ? "" : "? ", + (void *)address); } void printk_address(unsigned long address) @@ -155,8 +157,7 @@ static int print_trace_stack(void *data, char *name) static void print_trace_address(void *data, unsigned long addr, int reliable) { touch_nmi_watchdog(); - printk(data); - printk_stack_address(addr, reliable); + printk_stack_address(addr, reliable, data); } static const struct stacktrace_ops print_trace_ops = { @@ -278,7 +279,7 @@ int __die(const char *str, struct pt_regs *regs, long err) print_modules(); show_regs(regs); #ifdef CONFIG_X86_32 - if (user_mode_vm(regs)) { + if (user_mode(regs)) { sp = regs->sp; ss = regs->ss & 0xffff; } else { @@ -307,7 +308,7 @@ void die(const char *str, struct pt_regs *regs, long err) unsigned long flags = oops_begin(); int sig = SIGSEGV; - if (!user_mode_vm(regs)) + if (!user_mode(regs)) report_bug(regs->ip, regs); if (__die(str, regs, err)) diff --git a/arch/x86/kernel/dumpstack_32.c b/arch/x86/kernel/dumpstack_32.c index 5abd4cd4230c..464ffd69b92e 100644 --- a/arch/x86/kernel/dumpstack_32.c +++ b/arch/x86/kernel/dumpstack_32.c @@ -108,9 +108,12 @@ show_stack_log_lvl(struct task_struct *task, struct pt_regs *regs, for (i = 0; i < kstack_depth_to_print; i++) { if (kstack_end(stack)) break; - if (i && ((i % STACKSLOTS_PER_LINE) == 0)) - pr_cont("\n"); - pr_cont(" %08lx", *stack++); + if ((i % STACKSLOTS_PER_LINE) == 0) { + if (i != 0) + pr_cont("\n"); + printk("%s %08lx", log_lvl, *stack++); + } else + pr_cont(" %08lx", *stack++); touch_nmi_watchdog(); } pr_cont("\n"); @@ -123,13 +126,13 @@ void show_regs(struct pt_regs *regs) int i; show_regs_print_info(KERN_EMERG); - __show_regs(regs, !user_mode_vm(regs)); + __show_regs(regs, !user_mode(regs)); /* * When in-kernel, we also print out the stack and code at the * time of the fault.. */ - if (!user_mode_vm(regs)) { + if (!user_mode(regs)) { unsigned int code_prologue = code_bytes * 43 / 64; unsigned int code_len = code_bytes; unsigned char c; diff --git a/arch/x86/kernel/dumpstack_64.c b/arch/x86/kernel/dumpstack_64.c index ff86f19b5758..5f1c6266eb30 100644 --- a/arch/x86/kernel/dumpstack_64.c +++ b/arch/x86/kernel/dumpstack_64.c @@ -280,12 +280,15 @@ show_stack_log_lvl(struct task_struct *task, struct pt_regs *regs, pr_cont(" <EOI> "); } } else { - if (((long) stack & (THREAD_SIZE-1)) == 0) + if (kstack_end(stack)) break; } - if (i && ((i % STACKSLOTS_PER_LINE) == 0)) - pr_cont("\n"); - pr_cont(" %016lx", *stack++); + if ((i % STACKSLOTS_PER_LINE) == 0) { + if (i != 0) + pr_cont("\n"); + printk("%s %016lx", log_lvl, *stack++); + } else + pr_cont(" %016lx", *stack++); touch_nmi_watchdog(); } preempt_enable(); diff --git a/arch/x86/kernel/e820.c b/arch/x86/kernel/e820.c index 46201deee923..7d46bb260334 100644 --- a/arch/x86/kernel/e820.c +++ b/arch/x86/kernel/e820.c @@ -661,7 +661,7 @@ void __init parse_e820_ext(u64 phys_addr, u32 data_len) extmap = (struct e820entry *)(sdata->data); __append_e820_map(extmap, entries); sanitize_e820_map(e820.map, ARRAY_SIZE(e820.map), &e820.nr_map); - early_iounmap(sdata, data_len); + early_memunmap(sdata, data_len); printk(KERN_INFO "e820: extended physical RAM map:\n"); e820_print_map("extended"); } diff --git a/arch/x86/kernel/early_printk.c b/arch/x86/kernel/early_printk.c index a62536a1be88..49ff55ef9b26 100644 --- a/arch/x86/kernel/early_printk.c +++ b/arch/x86/kernel/early_printk.c @@ -95,20 +95,6 @@ static unsigned long early_serial_base = 0x3f8; /* ttyS0 */ #define DLL 0 /* Divisor Latch Low */ #define DLH 1 /* Divisor latch High */ -static void mem32_serial_out(unsigned long addr, int offset, int value) -{ - uint32_t *vaddr = (uint32_t *)addr; - /* shift implied by pointer type */ - writel(value, vaddr + offset); -} - -static unsigned int mem32_serial_in(unsigned long addr, int offset) -{ - uint32_t *vaddr = (uint32_t *)addr; - /* shift implied by pointer type */ - return readl(vaddr + offset); -} - static unsigned int io_serial_in(unsigned long addr, int offset) { return inb(addr + offset); @@ -205,6 +191,20 @@ static __init void early_serial_init(char *s) } #ifdef CONFIG_PCI +static void mem32_serial_out(unsigned long addr, int offset, int value) +{ + u32 *vaddr = (u32 *)addr; + /* shift implied by pointer type */ + writel(value, vaddr + offset); +} + +static unsigned int mem32_serial_in(unsigned long addr, int offset) +{ + u32 *vaddr = (u32 *)addr; + /* shift implied by pointer type */ + return readl(vaddr + offset); +} + /* * early_pci_serial_init() * @@ -217,8 +217,8 @@ static __init void early_pci_serial_init(char *s) unsigned divisor; unsigned long baud = DEFAULT_BAUD; u8 bus, slot, func; - uint32_t classcode, bar0; - uint16_t cmdreg; + u32 classcode, bar0; + u16 cmdreg; char *e; diff --git a/arch/x86/kernel/entry_32.S b/arch/x86/kernel/entry_32.S index 000d4199b03e..1c309763e321 100644 --- a/arch/x86/kernel/entry_32.S +++ b/arch/x86/kernel/entry_32.S @@ -395,10 +395,13 @@ sysenter_past_esp: /*CFI_REL_OFFSET cs, 0*/ /* * Push current_thread_info()->sysenter_return to the stack. - * A tiny bit of offset fixup is necessary - 4*4 means the 4 words - * pushed above; +8 corresponds to copy_thread's esp0 setting. + * A tiny bit of offset fixup is necessary: TI_sysenter_return + * is relative to thread_info, which is at the bottom of the + * kernel stack page. 4*4 means the 4 words pushed above; + * TOP_OF_KERNEL_STACK_PADDING takes us to the top of the stack; + * and THREAD_SIZE takes us to the bottom. */ - pushl_cfi ((TI_sysenter_return)-THREAD_SIZE+8+4*4)(%esp) + pushl_cfi ((TI_sysenter_return) - THREAD_SIZE + TOP_OF_KERNEL_STACK_PADDING + 4*4)(%esp) CFI_REL_OFFSET eip, 0 pushl_cfi %eax @@ -432,7 +435,7 @@ sysenter_after_call: TRACE_IRQS_OFF movl TI_flags(%ebp), %ecx testl $_TIF_ALLWORK_MASK, %ecx - jne sysexit_audit + jnz sysexit_audit sysenter_exit: /* if something modifies registers it must also disable sysexit */ movl PT_EIP(%esp), %edx @@ -460,7 +463,7 @@ sysenter_audit: sysexit_audit: testl $(_TIF_ALLWORK_MASK & ~_TIF_SYSCALL_AUDIT), %ecx - jne syscall_exit_work + jnz syscall_exit_work TRACE_IRQS_ON ENABLE_INTERRUPTS(CLBR_ANY) movl %eax,%edx /* second arg, syscall return value */ @@ -472,7 +475,7 @@ sysexit_audit: TRACE_IRQS_OFF movl TI_flags(%ebp), %ecx testl $(_TIF_ALLWORK_MASK & ~_TIF_SYSCALL_AUDIT), %ecx - jne syscall_exit_work + jnz syscall_exit_work movl PT_EAX(%esp),%eax /* reload syscall return value */ jmp sysenter_exit #endif @@ -510,7 +513,7 @@ syscall_exit: TRACE_IRQS_OFF movl TI_flags(%ebp), %ecx testl $_TIF_ALLWORK_MASK, %ecx # current->work - jne syscall_exit_work + jnz syscall_exit_work restore_all: TRACE_IRQS_IRET @@ -612,7 +615,7 @@ work_notifysig: # deal with pending signals and #ifdef CONFIG_VM86 testl $X86_EFLAGS_VM, PT_EFLAGS(%esp) movl %esp, %eax - jne work_notifysig_v86 # returning to kernel-space or + jnz work_notifysig_v86 # returning to kernel-space or # vm86-space 1: #else @@ -720,43 +723,22 @@ END(sysenter_badsys) .endm /* - * Build the entry stubs and pointer table with some assembler magic. - * We pack 7 stubs into a single 32-byte chunk, which will fit in a - * single cache line on all modern x86 implementations. + * Build the entry stubs with some assembler magic. + * We pack 1 stub into every 8-byte block. */ -.section .init.rodata,"a" -ENTRY(interrupt) -.section .entry.text, "ax" - .p2align 5 - .p2align CONFIG_X86_L1_CACHE_SHIFT + .align 8 ENTRY(irq_entries_start) RING0_INT_FRAME -vector=FIRST_EXTERNAL_VECTOR -.rept (FIRST_SYSTEM_VECTOR-FIRST_EXTERNAL_VECTOR+6)/7 - .balign 32 - .rept 7 - .if vector < FIRST_SYSTEM_VECTOR - .if vector <> FIRST_EXTERNAL_VECTOR + vector=FIRST_EXTERNAL_VECTOR + .rept (FIRST_SYSTEM_VECTOR - FIRST_EXTERNAL_VECTOR) + pushl_cfi $(~vector+0x80) /* Note: always in signed byte range */ + vector=vector+1 + jmp common_interrupt CFI_ADJUST_CFA_OFFSET -4 - .endif -1: pushl_cfi $(~vector+0x80) /* Note: always in signed byte range */ - .if ((vector-FIRST_EXTERNAL_VECTOR)%7) <> 6 - jmp 2f - .endif - .previous - .long 1b - .section .entry.text, "ax" -vector=vector+1 - .endif - .endr -2: jmp common_interrupt -.endr + .align 8 + .endr END(irq_entries_start) -.previous -END(interrupt) -.previous - /* * the CPU automatically disables interrupts when executing an IRQ vector, * so IRQ-flags tracing has to follow that: @@ -816,15 +798,9 @@ ENTRY(simd_coprocessor_error) pushl_cfi $0 #ifdef CONFIG_X86_INVD_BUG /* AMD 486 bug: invd from userspace calls exception 19 instead of #GP */ -661: pushl_cfi $do_general_protection -662: -.section .altinstructions,"a" - altinstruction_entry 661b, 663f, X86_FEATURE_XMM, 662b-661b, 664f-663f -.previous -.section .altinstr_replacement,"ax" -663: pushl $do_simd_coprocessor_error -664: -.previous + ALTERNATIVE "pushl_cfi $do_general_protection", \ + "pushl $do_simd_coprocessor_error", \ + X86_FEATURE_XMM #else pushl_cfi $do_simd_coprocessor_error #endif @@ -982,6 +958,9 @@ ENTRY(xen_hypervisor_callback) ENTRY(xen_do_upcall) 1: mov %esp, %eax call xen_evtchn_do_upcall +#ifndef CONFIG_PREEMPT + call xen_maybe_preempt_hcall +#endif jmp ret_from_intr CFI_ENDPROC ENDPROC(xen_hypervisor_callback) @@ -1237,20 +1216,13 @@ error_code: /*CFI_REL_OFFSET es, 0*/ pushl_cfi %ds /*CFI_REL_OFFSET ds, 0*/ - pushl_cfi %eax - CFI_REL_OFFSET eax, 0 - pushl_cfi %ebp - CFI_REL_OFFSET ebp, 0 - pushl_cfi %edi - CFI_REL_OFFSET edi, 0 - pushl_cfi %esi - CFI_REL_OFFSET esi, 0 - pushl_cfi %edx - CFI_REL_OFFSET edx, 0 - pushl_cfi %ecx - CFI_REL_OFFSET ecx, 0 - pushl_cfi %ebx - CFI_REL_OFFSET ebx, 0 + pushl_cfi_reg eax + pushl_cfi_reg ebp + pushl_cfi_reg edi + pushl_cfi_reg esi + pushl_cfi_reg edx + pushl_cfi_reg ecx + pushl_cfi_reg ebx cld movl $(__KERNEL_PERCPU), %ecx movl %ecx, %fs diff --git a/arch/x86/kernel/entry_64.S b/arch/x86/kernel/entry_64.S index db13655c3a2a..c7b238494b31 100644 --- a/arch/x86/kernel/entry_64.S +++ b/arch/x86/kernel/entry_64.S @@ -14,27 +14,14 @@ * NOTE: This code handles signal-recognition, which happens every time * after an interrupt and after each system call. * - * Normal syscalls and interrupts don't save a full stack frame, this is - * only done for syscall tracing, signals or fork/exec et.al. - * * A note on terminology: - * - top of stack: Architecture defined interrupt frame from SS to RIP + * - iret frame: Architecture defined interrupt frame from SS to RIP * at the top of the kernel process stack. - * - partial stack frame: partially saved registers up to R11. - * - full stack frame: Like partial stack frame, but all register saved. * * Some macro usage: * - CFI macros are used to generate dwarf2 unwind information for better * backtraces. They don't change any code. - * - SAVE_ALL/RESTORE_ALL - Save/restore all registers - * - SAVE_ARGS/RESTORE_ARGS - Save/restore registers that C functions modify. - * There are unfortunately lots of special cases where some registers - * not touched. The macro is a big mess that should be cleaned up. - * - SAVE_REST/RESTORE_REST - Handle the registers not saved by SAVE_ARGS. - * Gives a full stack frame. * - ENTRY/END Define functions in the symbol table. - * - FIXUP_TOP_OF_STACK/RESTORE_TOP_OF_STACK - Fix up the hardware stack - * frame that is otherwise undefined after a SYSCALL * - TRACE_IRQ_* - Trace hard interrupt state for lock debugging. * - idtentry - Define exception entry points. */ @@ -70,10 +57,6 @@ .section .entry.text, "ax" -#ifndef CONFIG_PREEMPT -#define retint_kernel retint_restore_args -#endif - #ifdef CONFIG_PARAVIRT ENTRY(native_usergs_sysret64) swapgs @@ -82,9 +65,9 @@ ENDPROC(native_usergs_sysret64) #endif /* CONFIG_PARAVIRT */ -.macro TRACE_IRQS_IRETQ offset=ARGOFFSET +.macro TRACE_IRQS_IRETQ #ifdef CONFIG_TRACE_IRQFLAGS - bt $9,EFLAGS-\offset(%rsp) /* interrupts off? */ + bt $9,EFLAGS(%rsp) /* interrupts off? */ jnc 1f TRACE_IRQS_ON 1: @@ -116,8 +99,8 @@ ENDPROC(native_usergs_sysret64) call debug_stack_reset .endm -.macro TRACE_IRQS_IRETQ_DEBUG offset=ARGOFFSET - bt $9,EFLAGS-\offset(%rsp) /* interrupts off? */ +.macro TRACE_IRQS_IRETQ_DEBUG + bt $9,EFLAGS(%rsp) /* interrupts off? */ jnc 1f TRACE_IRQS_ON_DEBUG 1: @@ -130,34 +113,7 @@ ENDPROC(native_usergs_sysret64) #endif /* - * C code is not supposed to know about undefined top of stack. Every time - * a C function with an pt_regs argument is called from the SYSCALL based - * fast path FIXUP_TOP_OF_STACK is needed. - * RESTORE_TOP_OF_STACK syncs the syscall state after any possible ptregs - * manipulation. - */ - - /* %rsp:at FRAMEEND */ - .macro FIXUP_TOP_OF_STACK tmp offset=0 - movq PER_CPU_VAR(old_rsp),\tmp - movq \tmp,RSP+\offset(%rsp) - movq $__USER_DS,SS+\offset(%rsp) - movq $__USER_CS,CS+\offset(%rsp) - movq RIP+\offset(%rsp),\tmp /* get rip */ - movq \tmp,RCX+\offset(%rsp) /* copy it to rcx as sysret would do */ - movq R11+\offset(%rsp),\tmp /* get eflags */ - movq \tmp,EFLAGS+\offset(%rsp) - .endm - - .macro RESTORE_TOP_OF_STACK tmp offset=0 - movq RSP+\offset(%rsp),\tmp - movq \tmp,PER_CPU_VAR(old_rsp) - movq EFLAGS+\offset(%rsp),\tmp - movq \tmp,R11+\offset(%rsp) - .endm - -/* - * initial frame state for interrupts (and exceptions without error code) + * empty frame */ .macro EMPTY_FRAME start=1 offset=0 .if \start @@ -173,12 +129,12 @@ ENDPROC(native_usergs_sysret64) * initial frame state for interrupts (and exceptions without error code) */ .macro INTR_FRAME start=1 offset=0 - EMPTY_FRAME \start, SS+8+\offset-RIP - /*CFI_REL_OFFSET ss, SS+\offset-RIP*/ - CFI_REL_OFFSET rsp, RSP+\offset-RIP - /*CFI_REL_OFFSET rflags, EFLAGS+\offset-RIP*/ - /*CFI_REL_OFFSET cs, CS+\offset-RIP*/ - CFI_REL_OFFSET rip, RIP+\offset-RIP + EMPTY_FRAME \start, 5*8+\offset + /*CFI_REL_OFFSET ss, 4*8+\offset*/ + CFI_REL_OFFSET rsp, 3*8+\offset + /*CFI_REL_OFFSET rflags, 2*8+\offset*/ + /*CFI_REL_OFFSET cs, 1*8+\offset*/ + CFI_REL_OFFSET rip, 0*8+\offset .endm /* @@ -186,30 +142,23 @@ ENDPROC(native_usergs_sysret64) * with vector already pushed) */ .macro XCPT_FRAME start=1 offset=0 - INTR_FRAME \start, RIP+\offset-ORIG_RAX - .endm - -/* - * frame that enables calling into C. - */ - .macro PARTIAL_FRAME start=1 offset=0 - XCPT_FRAME \start, ORIG_RAX+\offset-ARGOFFSET - CFI_REL_OFFSET rdi, RDI+\offset-ARGOFFSET - CFI_REL_OFFSET rsi, RSI+\offset-ARGOFFSET - CFI_REL_OFFSET rdx, RDX+\offset-ARGOFFSET - CFI_REL_OFFSET rcx, RCX+\offset-ARGOFFSET - CFI_REL_OFFSET rax, RAX+\offset-ARGOFFSET - CFI_REL_OFFSET r8, R8+\offset-ARGOFFSET - CFI_REL_OFFSET r9, R9+\offset-ARGOFFSET - CFI_REL_OFFSET r10, R10+\offset-ARGOFFSET - CFI_REL_OFFSET r11, R11+\offset-ARGOFFSET + INTR_FRAME \start, 1*8+\offset .endm /* * frame that enables passing a complete pt_regs to a C function. */ .macro DEFAULT_FRAME start=1 offset=0 - PARTIAL_FRAME \start, R11+\offset-R15 + XCPT_FRAME \start, ORIG_RAX+\offset + CFI_REL_OFFSET rdi, RDI+\offset + CFI_REL_OFFSET rsi, RSI+\offset + CFI_REL_OFFSET rdx, RDX+\offset + CFI_REL_OFFSET rcx, RCX+\offset + CFI_REL_OFFSET rax, RAX+\offset + CFI_REL_OFFSET r8, R8+\offset + CFI_REL_OFFSET r9, R9+\offset + CFI_REL_OFFSET r10, R10+\offset + CFI_REL_OFFSET r11, R11+\offset CFI_REL_OFFSET rbx, RBX+\offset CFI_REL_OFFSET rbp, RBP+\offset CFI_REL_OFFSET r12, R12+\offset @@ -218,102 +167,30 @@ ENDPROC(native_usergs_sysret64) CFI_REL_OFFSET r15, R15+\offset .endm -ENTRY(save_paranoid) - XCPT_FRAME 1 RDI+8 - cld - movq %rdi, RDI+8(%rsp) - movq %rsi, RSI+8(%rsp) - movq_cfi rdx, RDX+8 - movq_cfi rcx, RCX+8 - movq_cfi rax, RAX+8 - movq %r8, R8+8(%rsp) - movq %r9, R9+8(%rsp) - movq %r10, R10+8(%rsp) - movq %r11, R11+8(%rsp) - movq_cfi rbx, RBX+8 - movq %rbp, RBP+8(%rsp) - movq %r12, R12+8(%rsp) - movq %r13, R13+8(%rsp) - movq %r14, R14+8(%rsp) - movq %r15, R15+8(%rsp) - movl $1,%ebx - movl $MSR_GS_BASE,%ecx - rdmsr - testl %edx,%edx - js 1f /* negative -> in kernel */ - SWAPGS - xorl %ebx,%ebx -1: ret - CFI_ENDPROC -END(save_paranoid) - /* - * A newly forked process directly context switches into this address. + * 64bit SYSCALL instruction entry. Up to 6 arguments in registers. * - * rdi: prev task we switched from - */ -ENTRY(ret_from_fork) - DEFAULT_FRAME - - LOCK ; btr $TIF_FORK,TI_flags(%r8) - - pushq_cfi $0x0002 - popfq_cfi # reset kernel eflags - - call schedule_tail # rdi: 'prev' task parameter - - GET_THREAD_INFO(%rcx) - - RESTORE_REST - - testl $3, CS-ARGOFFSET(%rsp) # from kernel_thread? - jz 1f - - testl $_TIF_IA32, TI_flags(%rcx) # 32-bit compat task needs IRET - jnz int_ret_from_sys_call - - RESTORE_TOP_OF_STACK %rdi, -ARGOFFSET - jmp ret_from_sys_call # go to the SYSRET fastpath - -1: - subq $REST_SKIP, %rsp # leave space for volatiles - CFI_ADJUST_CFA_OFFSET REST_SKIP - movq %rbp, %rdi - call *%rbx - movl $0, RAX(%rsp) - RESTORE_REST - jmp int_ret_from_sys_call - CFI_ENDPROC -END(ret_from_fork) - -/* - * System call entry. Up to 6 arguments in registers are supported. + * 64bit SYSCALL saves rip to rcx, clears rflags.RF, then saves rflags to r11, + * then loads new ss, cs, and rip from previously programmed MSRs. + * rflags gets masked by a value from another MSR (so CLD and CLAC + * are not needed). SYSCALL does not save anything on the stack + * and does not change rsp. * - * SYSCALL does not save anything on the stack and does not change the - * stack pointer. However, it does mask the flags register for us, so - * CLD and CLAC are not needed. - */ - -/* - * Register setup: + * Registers on entry: * rax system call number + * rcx return address + * r11 saved rflags (note: r11 is callee-clobbered register in C ABI) * rdi arg0 - * rcx return address for syscall/sysret, C arg3 * rsi arg1 * rdx arg2 - * r10 arg3 (--> moved to rcx for C) + * r10 arg3 (needs to be moved to rcx to conform to C ABI) * r8 arg4 * r9 arg5 - * r11 eflags for syscall/sysret, temporary for C - * r12-r15,rbp,rbx saved by C code, not touched. + * (note: r12-r15,rbp,rbx are callee-preserved in C ABI) * - * Interrupts are off on entry. * Only called from user space. * - * XXX if we had a free scratch register we could save the RSP into the stack frame - * and report it properly in ps. Unfortunately we haven't. - * - * When user can change the frames always force IRET. That is because + * When user can change pt_regs->foo always force IRET. That is because * it deals with uncanonical addresses better. SYSRET has trouble * with them due to bugs in both AMD and Intel CPUs. */ @@ -321,9 +198,15 @@ END(ret_from_fork) ENTRY(system_call) CFI_STARTPROC simple CFI_SIGNAL_FRAME - CFI_DEF_CFA rsp,KERNEL_STACK_OFFSET + CFI_DEF_CFA rsp,0 CFI_REGISTER rip,rcx /*CFI_REGISTER rflags,r11*/ + + /* + * Interrupts are off on entry. + * We do not frame this tiny irq-off block with TRACE_IRQS_OFF/ON, + * it is too small to ever cause noticeable irq latency. + */ SWAPGS_UNSAFE_STACK /* * A hypervisor implementation might want to use a label @@ -332,18 +215,38 @@ ENTRY(system_call) */ GLOBAL(system_call_after_swapgs) - movq %rsp,PER_CPU_VAR(old_rsp) + movq %rsp,PER_CPU_VAR(rsp_scratch) movq PER_CPU_VAR(kernel_stack),%rsp + + /* Construct struct pt_regs on stack */ + pushq_cfi $__USER_DS /* pt_regs->ss */ + pushq_cfi PER_CPU_VAR(rsp_scratch) /* pt_regs->sp */ /* - * No need to follow this irqs off/on section - it's straight - * and short: + * Re-enable interrupts. + * We use 'rsp_scratch' as a scratch space, hence irq-off block above + * must execute atomically in the face of possible interrupt-driven + * task preemption. We must enable interrupts only after we're done + * with using rsp_scratch: */ ENABLE_INTERRUPTS(CLBR_NONE) - SAVE_ARGS 8, 0, rax_enosys=1 - movq_cfi rax,(ORIG_RAX-ARGOFFSET) - movq %rcx,RIP-ARGOFFSET(%rsp) - CFI_REL_OFFSET rip,RIP-ARGOFFSET - testl $_TIF_WORK_SYSCALL_ENTRY,TI_flags+THREAD_INFO(%rsp,RIP-ARGOFFSET) + pushq_cfi %r11 /* pt_regs->flags */ + pushq_cfi $__USER_CS /* pt_regs->cs */ + pushq_cfi %rcx /* pt_regs->ip */ + CFI_REL_OFFSET rip,0 + pushq_cfi_reg rax /* pt_regs->orig_ax */ + pushq_cfi_reg rdi /* pt_regs->di */ + pushq_cfi_reg rsi /* pt_regs->si */ + pushq_cfi_reg rdx /* pt_regs->dx */ + pushq_cfi_reg rcx /* pt_regs->cx */ + pushq_cfi $-ENOSYS /* pt_regs->ax */ + pushq_cfi_reg r8 /* pt_regs->r8 */ + pushq_cfi_reg r9 /* pt_regs->r9 */ + pushq_cfi_reg r10 /* pt_regs->r10 */ + pushq_cfi_reg r11 /* pt_regs->r11 */ + sub $(6*8),%rsp /* pt_regs->bp,bx,r12-15 not saved */ + CFI_ADJUST_CFA_OFFSET 6*8 + + testl $_TIF_WORK_SYSCALL_ENTRY, ASM_THREAD_INFO(TI_flags, %rsp, SIZEOF_PTREGS) jnz tracesys system_call_fastpath: #if __SYSCALL_MASK == ~0 @@ -352,82 +255,96 @@ system_call_fastpath: andl $__SYSCALL_MASK,%eax cmpl $__NR_syscall_max,%eax #endif - ja ret_from_sys_call /* and return regs->ax */ + ja 1f /* return -ENOSYS (already in pt_regs->ax) */ movq %r10,%rcx - call *sys_call_table(,%rax,8) # XXX: rip relative - movq %rax,RAX-ARGOFFSET(%rsp) + call *sys_call_table(,%rax,8) + movq %rax,RAX(%rsp) +1: /* - * Syscall return path ending with SYSRET (fast path) - * Has incomplete stack frame and undefined top of stack. + * Syscall return path ending with SYSRET (fast path). + * Has incompletely filled pt_regs. */ -ret_from_sys_call: - testl $_TIF_ALLWORK_MASK,TI_flags+THREAD_INFO(%rsp,RIP-ARGOFFSET) - jnz int_ret_from_sys_call_fixup /* Go the the slow path */ - LOCKDEP_SYS_EXIT + /* + * We do not frame this tiny irq-off block with TRACE_IRQS_OFF/ON, + * it is too small to ever cause noticeable irq latency. + */ DISABLE_INTERRUPTS(CLBR_NONE) - TRACE_IRQS_OFF - CFI_REMEMBER_STATE + /* - * sysretq will re-enable interrupts: + * We must check ti flags with interrupts (or at least preemption) + * off because we must *never* return to userspace without + * processing exit work that is enqueued if we're preempted here. + * In particular, returning to userspace with any of the one-shot + * flags (TIF_NOTIFY_RESUME, TIF_USER_RETURN_NOTIFY, etc) set is + * very bad. */ - TRACE_IRQS_ON - movq RIP-ARGOFFSET(%rsp),%rcx + testl $_TIF_ALLWORK_MASK, ASM_THREAD_INFO(TI_flags, %rsp, SIZEOF_PTREGS) + jnz int_ret_from_sys_call_irqs_off /* Go to the slow path */ + + CFI_REMEMBER_STATE + + RESTORE_C_REGS_EXCEPT_RCX_R11 + movq RIP(%rsp),%rcx CFI_REGISTER rip,rcx - RESTORE_ARGS 1,-ARG_SKIP,0 + movq EFLAGS(%rsp),%r11 /*CFI_REGISTER rflags,r11*/ - movq PER_CPU_VAR(old_rsp), %rsp + movq RSP(%rsp),%rsp + /* + * 64bit SYSRET restores rip from rcx, + * rflags from r11 (but RF and VM bits are forced to 0), + * cs and ss are loaded from MSRs. + * Restoration of rflags re-enables interrupts. + */ USERGS_SYSRET64 CFI_RESTORE_STATE -int_ret_from_sys_call_fixup: - FIXUP_TOP_OF_STACK %r11, -ARGOFFSET - jmp int_ret_from_sys_call - - /* Do syscall tracing */ + /* Do syscall entry tracing */ tracesys: - leaq -REST_SKIP(%rsp), %rdi - movq $AUDIT_ARCH_X86_64, %rsi + movq %rsp, %rdi + movl $AUDIT_ARCH_X86_64, %esi call syscall_trace_enter_phase1 test %rax, %rax jnz tracesys_phase2 /* if needed, run the slow path */ - LOAD_ARGS 0 /* else restore clobbered regs */ + RESTORE_C_REGS_EXCEPT_RAX /* else restore clobbered regs */ + movq ORIG_RAX(%rsp), %rax jmp system_call_fastpath /* and return to the fast path */ tracesys_phase2: - SAVE_REST - FIXUP_TOP_OF_STACK %rdi + SAVE_EXTRA_REGS movq %rsp, %rdi - movq $AUDIT_ARCH_X86_64, %rsi + movl $AUDIT_ARCH_X86_64, %esi movq %rax,%rdx call syscall_trace_enter_phase2 /* - * Reload arg registers from stack in case ptrace changed them. + * Reload registers from stack in case ptrace changed them. * We don't reload %rax because syscall_trace_entry_phase2() returned * the value it wants us to use in the table lookup. */ - LOAD_ARGS ARGOFFSET, 1 - RESTORE_REST + RESTORE_C_REGS_EXCEPT_RAX + RESTORE_EXTRA_REGS #if __SYSCALL_MASK == ~0 cmpq $__NR_syscall_max,%rax #else andl $__SYSCALL_MASK,%eax cmpl $__NR_syscall_max,%eax #endif - ja int_ret_from_sys_call /* RAX(%rsp) is already set */ + ja 1f /* return -ENOSYS (already in pt_regs->ax) */ movq %r10,%rcx /* fixup for C */ call *sys_call_table(,%rax,8) - movq %rax,RAX-ARGOFFSET(%rsp) - /* Use IRET because user could have changed frame */ + movq %rax,RAX(%rsp) +1: + /* Use IRET because user could have changed pt_regs->foo */ /* * Syscall return path ending with IRET. - * Has correct top of stack, but partial stack frame. + * Has correct iret frame. */ GLOBAL(int_ret_from_sys_call) DISABLE_INTERRUPTS(CLBR_NONE) +int_ret_from_sys_call_irqs_off: /* jumps come here from the irqs-off SYSRET path */ TRACE_IRQS_OFF movl $_TIF_ALLWORK_MASK,%edi /* edi: mask to check */ @@ -437,8 +354,8 @@ GLOBAL(int_with_check) movl TI_flags(%rcx),%edx andl %edi,%edx jnz int_careful - andl $~TS_COMPAT,TI_status(%rcx) - jmp retint_swapgs + andl $~TS_COMPAT,TI_status(%rcx) + jmp syscall_return /* Either reschedule or signal or syscall exit tracking needed. */ /* First do a reschedule test. */ @@ -455,12 +372,11 @@ int_careful: TRACE_IRQS_OFF jmp int_with_check - /* handle signals and tracing -- both require a full stack frame */ + /* handle signals and tracing -- both require a full pt_regs */ int_very_careful: TRACE_IRQS_ON ENABLE_INTERRUPTS(CLBR_NONE) -int_check_syscall_exit_work: - SAVE_REST + SAVE_EXTRA_REGS /* Check for syscall exit trace */ testl $_TIF_WORK_SYSCALL_EXIT,%edx jz int_signal @@ -479,86 +395,192 @@ int_signal: call do_notify_resume 1: movl $_TIF_WORK_MASK,%edi int_restore_rest: - RESTORE_REST + RESTORE_EXTRA_REGS DISABLE_INTERRUPTS(CLBR_NONE) TRACE_IRQS_OFF jmp int_with_check + +syscall_return: + /* The IRETQ could re-enable interrupts: */ + DISABLE_INTERRUPTS(CLBR_ANY) + TRACE_IRQS_IRETQ + + /* + * Try to use SYSRET instead of IRET if we're returning to + * a completely clean 64-bit userspace context. + */ + movq RCX(%rsp),%rcx + cmpq %rcx,RIP(%rsp) /* RCX == RIP */ + jne opportunistic_sysret_failed + + /* + * On Intel CPUs, SYSRET with non-canonical RCX/RIP will #GP + * in kernel space. This essentially lets the user take over + * the kernel, since userspace controls RSP. It's not worth + * testing for canonicalness exactly -- this check detects any + * of the 17 high bits set, which is true for non-canonical + * or kernel addresses. (This will pessimize vsyscall=native. + * Big deal.) + * + * If virtual addresses ever become wider, this will need + * to be updated to remain correct on both old and new CPUs. + */ + .ifne __VIRTUAL_MASK_SHIFT - 47 + .error "virtual address width changed -- SYSRET checks need update" + .endif + shr $__VIRTUAL_MASK_SHIFT, %rcx + jnz opportunistic_sysret_failed + + cmpq $__USER_CS,CS(%rsp) /* CS must match SYSRET */ + jne opportunistic_sysret_failed + + movq R11(%rsp),%r11 + cmpq %r11,EFLAGS(%rsp) /* R11 == RFLAGS */ + jne opportunistic_sysret_failed + + /* + * SYSRET can't restore RF. SYSRET can restore TF, but unlike IRET, + * restoring TF results in a trap from userspace immediately after + * SYSRET. This would cause an infinite loop whenever #DB happens + * with register state that satisfies the opportunistic SYSRET + * conditions. For example, single-stepping this user code: + * + * movq $stuck_here,%rcx + * pushfq + * popq %r11 + * stuck_here: + * + * would never get past 'stuck_here'. + */ + testq $(X86_EFLAGS_RF|X86_EFLAGS_TF), %r11 + jnz opportunistic_sysret_failed + + /* nothing to check for RSP */ + + cmpq $__USER_DS,SS(%rsp) /* SS must match SYSRET */ + jne opportunistic_sysret_failed + + /* + * We win! This label is here just for ease of understanding + * perf profiles. Nothing jumps here. + */ +syscall_return_via_sysret: + CFI_REMEMBER_STATE + /* r11 is already restored (see code above) */ + RESTORE_C_REGS_EXCEPT_R11 + movq RSP(%rsp),%rsp + USERGS_SYSRET64 + CFI_RESTORE_STATE + +opportunistic_sysret_failed: + SWAPGS + jmp restore_c_regs_and_iret CFI_ENDPROC END(system_call) + .macro FORK_LIKE func ENTRY(stub_\func) CFI_STARTPROC - popq %r11 /* save return address */ - PARTIAL_FRAME 0 - SAVE_REST - pushq %r11 /* put it back on stack */ - FIXUP_TOP_OF_STACK %r11, 8 - DEFAULT_FRAME 0 8 /* offset 8: return address */ - call sys_\func - RESTORE_TOP_OF_STACK %r11, 8 - ret $REST_SKIP /* pop extended registers */ + DEFAULT_FRAME 0, 8 /* offset 8: return address */ + SAVE_EXTRA_REGS 8 + jmp sys_\func CFI_ENDPROC END(stub_\func) .endm - .macro FIXED_FRAME label,func -ENTRY(\label) - CFI_STARTPROC - PARTIAL_FRAME 0 8 /* offset 8: return address */ - FIXUP_TOP_OF_STACK %r11, 8-ARGOFFSET - call \func - RESTORE_TOP_OF_STACK %r11, 8-ARGOFFSET - ret - CFI_ENDPROC -END(\label) - .endm - FORK_LIKE clone FORK_LIKE fork FORK_LIKE vfork - FIXED_FRAME stub_iopl, sys_iopl ENTRY(stub_execve) CFI_STARTPROC - addq $8, %rsp - PARTIAL_FRAME 0 - SAVE_REST - FIXUP_TOP_OF_STACK %r11 - call sys_execve - movq %rax,RAX(%rsp) - RESTORE_REST - jmp int_ret_from_sys_call + DEFAULT_FRAME 0, 8 + call sys_execve +return_from_execve: + testl %eax, %eax + jz 1f + /* exec failed, can use fast SYSRET code path in this case */ + ret +1: + /* must use IRET code path (pt_regs->cs may have changed) */ + addq $8, %rsp + CFI_ADJUST_CFA_OFFSET -8 + ZERO_EXTRA_REGS + movq %rax,RAX(%rsp) + jmp int_ret_from_sys_call CFI_ENDPROC END(stub_execve) - -ENTRY(stub_execveat) +/* + * Remaining execve stubs are only 7 bytes long. + * ENTRY() often aligns to 16 bytes, which in this case has no benefits. + */ + .align 8 +GLOBAL(stub_execveat) CFI_STARTPROC - addq $8, %rsp - PARTIAL_FRAME 0 - SAVE_REST - FIXUP_TOP_OF_STACK %r11 - call sys_execveat - RESTORE_TOP_OF_STACK %r11 - movq %rax,RAX(%rsp) - RESTORE_REST - jmp int_ret_from_sys_call + DEFAULT_FRAME 0, 8 + call sys_execveat + jmp return_from_execve CFI_ENDPROC END(stub_execveat) +#ifdef CONFIG_X86_X32_ABI + .align 8 +GLOBAL(stub_x32_execve) + CFI_STARTPROC + DEFAULT_FRAME 0, 8 + call compat_sys_execve + jmp return_from_execve + CFI_ENDPROC +END(stub_x32_execve) + .align 8 +GLOBAL(stub_x32_execveat) + CFI_STARTPROC + DEFAULT_FRAME 0, 8 + call compat_sys_execveat + jmp return_from_execve + CFI_ENDPROC +END(stub_x32_execveat) +#endif + +#ifdef CONFIG_IA32_EMULATION + .align 8 +GLOBAL(stub32_execve) + CFI_STARTPROC + call compat_sys_execve + jmp return_from_execve + CFI_ENDPROC +END(stub32_execve) + .align 8 +GLOBAL(stub32_execveat) + CFI_STARTPROC + call compat_sys_execveat + jmp return_from_execve + CFI_ENDPROC +END(stub32_execveat) +#endif + /* * sigreturn is special because it needs to restore all registers on return. * This cannot be done with SYSRET, so use the IRET return path instead. */ ENTRY(stub_rt_sigreturn) CFI_STARTPROC - addq $8, %rsp - PARTIAL_FRAME 0 - SAVE_REST - FIXUP_TOP_OF_STACK %r11 + DEFAULT_FRAME 0, 8 + /* + * SAVE_EXTRA_REGS result is not normally needed: + * sigreturn overwrites all pt_regs->GPREGS. + * But sigreturn can fail (!), and there is no easy way to detect that. + * To make sure RESTORE_EXTRA_REGS doesn't restore garbage on error, + * we SAVE_EXTRA_REGS here. + */ + SAVE_EXTRA_REGS 8 call sys_rt_sigreturn - movq %rax,RAX(%rsp) # fixme, this could be done at the higher layer - RESTORE_REST +return_from_stub: + addq $8, %rsp + CFI_ADJUST_CFA_OFFSET -8 + RESTORE_EXTRA_REGS + movq %rax,RAX(%rsp) jmp int_ret_from_sys_call CFI_ENDPROC END(stub_rt_sigreturn) @@ -566,86 +588,70 @@ END(stub_rt_sigreturn) #ifdef CONFIG_X86_X32_ABI ENTRY(stub_x32_rt_sigreturn) CFI_STARTPROC - addq $8, %rsp - PARTIAL_FRAME 0 - SAVE_REST - FIXUP_TOP_OF_STACK %r11 + DEFAULT_FRAME 0, 8 + SAVE_EXTRA_REGS 8 call sys32_x32_rt_sigreturn - movq %rax,RAX(%rsp) # fixme, this could be done at the higher layer - RESTORE_REST - jmp int_ret_from_sys_call + jmp return_from_stub CFI_ENDPROC END(stub_x32_rt_sigreturn) +#endif -ENTRY(stub_x32_execve) - CFI_STARTPROC - addq $8, %rsp - PARTIAL_FRAME 0 - SAVE_REST - FIXUP_TOP_OF_STACK %r11 - call compat_sys_execve - RESTORE_TOP_OF_STACK %r11 - movq %rax,RAX(%rsp) - RESTORE_REST - jmp int_ret_from_sys_call - CFI_ENDPROC -END(stub_x32_execve) +/* + * A newly forked process directly context switches into this address. + * + * rdi: prev task we switched from + */ +ENTRY(ret_from_fork) + DEFAULT_FRAME -ENTRY(stub_x32_execveat) - CFI_STARTPROC - addq $8, %rsp - PARTIAL_FRAME 0 - SAVE_REST - FIXUP_TOP_OF_STACK %r11 - call compat_sys_execveat - RESTORE_TOP_OF_STACK %r11 - movq %rax,RAX(%rsp) - RESTORE_REST + LOCK ; btr $TIF_FORK,TI_flags(%r8) + + pushq_cfi $0x0002 + popfq_cfi # reset kernel eflags + + call schedule_tail # rdi: 'prev' task parameter + + RESTORE_EXTRA_REGS + + testl $3,CS(%rsp) # from kernel_thread? + + /* + * By the time we get here, we have no idea whether our pt_regs, + * ti flags, and ti status came from the 64-bit SYSCALL fast path, + * the slow path, or one of the ia32entry paths. + * Use IRET code path to return, since it can safely handle + * all of the above. + */ + jnz int_ret_from_sys_call + + /* We came from kernel_thread */ + /* nb: we depend on RESTORE_EXTRA_REGS above */ + movq %rbp, %rdi + call *%rbx + movl $0, RAX(%rsp) + RESTORE_EXTRA_REGS jmp int_ret_from_sys_call CFI_ENDPROC -END(stub_x32_execveat) - -#endif +END(ret_from_fork) /* - * Build the entry stubs and pointer table with some assembler magic. - * We pack 7 stubs into a single 32-byte chunk, which will fit in a - * single cache line on all modern x86 implementations. + * Build the entry stubs with some assembler magic. + * We pack 1 stub into every 8-byte block. */ - .section .init.rodata,"a" -ENTRY(interrupt) - .section .entry.text - .p2align 5 - .p2align CONFIG_X86_L1_CACHE_SHIFT + .align 8 ENTRY(irq_entries_start) INTR_FRAME -vector=FIRST_EXTERNAL_VECTOR -.rept (FIRST_SYSTEM_VECTOR-FIRST_EXTERNAL_VECTOR+6)/7 - .balign 32 - .rept 7 - .if vector < FIRST_SYSTEM_VECTOR - .if vector <> FIRST_EXTERNAL_VECTOR + vector=FIRST_EXTERNAL_VECTOR + .rept (FIRST_SYSTEM_VECTOR - FIRST_EXTERNAL_VECTOR) + pushq_cfi $(~vector+0x80) /* Note: always in signed byte range */ + vector=vector+1 + jmp common_interrupt CFI_ADJUST_CFA_OFFSET -8 - .endif -1: pushq_cfi $(~vector+0x80) /* Note: always in signed byte range */ - .if ((vector-FIRST_EXTERNAL_VECTOR)%7) <> 6 - jmp 2f - .endif - .previous - .quad 1b - .section .entry.text -vector=vector+1 - .endif - .endr -2: jmp common_interrupt -.endr + .align 8 + .endr CFI_ENDPROC END(irq_entries_start) -.previous -END(interrupt) -.previous - /* * Interrupt entry/exit. * @@ -656,47 +662,45 @@ END(interrupt) /* 0(%rsp): ~(interrupt number) */ .macro interrupt func - /* reserve pt_regs for scratch regs and rbp */ - subq $ORIG_RAX-RBP, %rsp - CFI_ADJUST_CFA_OFFSET ORIG_RAX-RBP cld - /* start from rbp in pt_regs and jump over */ - movq_cfi rdi, (RDI-RBP) - movq_cfi rsi, (RSI-RBP) - movq_cfi rdx, (RDX-RBP) - movq_cfi rcx, (RCX-RBP) - movq_cfi rax, (RAX-RBP) - movq_cfi r8, (R8-RBP) - movq_cfi r9, (R9-RBP) - movq_cfi r10, (R10-RBP) - movq_cfi r11, (R11-RBP) - - /* Save rbp so that we can unwind from get_irq_regs() */ - movq_cfi rbp, 0 - - /* Save previous stack value */ - movq %rsp, %rsi + /* + * Since nothing in interrupt handling code touches r12...r15 members + * of "struct pt_regs", and since interrupts can nest, we can save + * four stack slots and simultaneously provide + * an unwind-friendly stack layout by saving "truncated" pt_regs + * exactly up to rbp slot, without these members. + */ + ALLOC_PT_GPREGS_ON_STACK -RBP + SAVE_C_REGS -RBP + /* this goes to 0(%rsp) for unwinder, not for saving the value: */ + SAVE_EXTRA_REGS_RBP -RBP + + leaq -RBP(%rsp),%rdi /* arg1 for \func (pointer to pt_regs) */ - leaq -RBP(%rsp),%rdi /* arg1 for handler */ - testl $3, CS-RBP(%rsi) + testl $3, CS-RBP(%rsp) je 1f SWAPGS +1: /* + * Save previous stack pointer, optionally switch to interrupt stack. * irq_count is used to check if a CPU is already on an interrupt stack * or not. While this is essentially redundant with preempt_count it is * a little cheaper to use a separate counter in the PDA (short of * moving irq_enter into assembly, which would be too much work) */ -1: incl PER_CPU_VAR(irq_count) + movq %rsp, %rsi + incl PER_CPU_VAR(irq_count) cmovzq PER_CPU_VAR(irq_stack_ptr),%rsp CFI_DEF_CFA_REGISTER rsi - - /* Store previous stack value */ pushq %rsi + /* + * For debugger: + * "CFA (Current Frame Address) is the value on stack + offset" + */ CFI_ESCAPE 0x0f /* DW_CFA_def_cfa_expression */, 6, \ - 0x77 /* DW_OP_breg7 */, 0, \ + 0x77 /* DW_OP_breg7 (rsp) */, 0, \ 0x06 /* DW_OP_deref */, \ - 0x08 /* DW_OP_const1u */, SS+8-RBP, \ + 0x08 /* DW_OP_const1u */, SIZEOF_PTREGS-RBP, \ 0x22 /* DW_OP_plus */ /* We entered an interrupt context - irqs are off: */ TRACE_IRQS_OFF @@ -714,7 +718,7 @@ common_interrupt: ASM_CLAC addq $-0x80,(%rsp) /* Adjust vector to [-256,-1] range */ interrupt do_IRQ - /* 0(%rsp): old_rsp-ARGOFFSET */ + /* 0(%rsp): old RSP */ ret_from_intr: DISABLE_INTERRUPTS(CLBR_NONE) TRACE_IRQS_OFF @@ -722,19 +726,18 @@ ret_from_intr: /* Restore saved previous stack */ popq %rsi - CFI_DEF_CFA rsi,SS+8-RBP /* reg/off reset after def_cfa_expr */ - leaq ARGOFFSET-RBP(%rsi), %rsp + CFI_DEF_CFA rsi,SIZEOF_PTREGS-RBP /* reg/off reset after def_cfa_expr */ + /* return code expects complete pt_regs - adjust rsp accordingly: */ + leaq -RBP(%rsi),%rsp CFI_DEF_CFA_REGISTER rsp - CFI_ADJUST_CFA_OFFSET RBP-ARGOFFSET + CFI_ADJUST_CFA_OFFSET RBP -exit_intr: - GET_THREAD_INFO(%rcx) - testl $3,CS-ARGOFFSET(%rsp) + testl $3,CS(%rsp) je retint_kernel - /* Interrupt came from user space */ + + GET_THREAD_INFO(%rcx) /* - * Has a correct top of stack, but a partial stack frame * %rcx: thread info. Interrupts off. */ retint_with_reschedule: @@ -753,70 +756,34 @@ retint_swapgs: /* return to user-space */ DISABLE_INTERRUPTS(CLBR_ANY) TRACE_IRQS_IRETQ - /* - * Try to use SYSRET instead of IRET if we're returning to - * a completely clean 64-bit userspace context. - */ - movq (RCX-R11)(%rsp), %rcx - cmpq %rcx,(RIP-R11)(%rsp) /* RCX == RIP */ - jne opportunistic_sysret_failed - - /* - * On Intel CPUs, sysret with non-canonical RCX/RIP will #GP - * in kernel space. This essentially lets the user take over - * the kernel, since userspace controls RSP. It's not worth - * testing for canonicalness exactly -- this check detects any - * of the 17 high bits set, which is true for non-canonical - * or kernel addresses. (This will pessimize vsyscall=native. - * Big deal.) - * - * If virtual addresses ever become wider, this will need - * to be updated to remain correct on both old and new CPUs. - */ - .ifne __VIRTUAL_MASK_SHIFT - 47 - .error "virtual address width changed -- sysret checks need update" - .endif - shr $__VIRTUAL_MASK_SHIFT, %rcx - jnz opportunistic_sysret_failed - - cmpq $__USER_CS,(CS-R11)(%rsp) /* CS must match SYSRET */ - jne opportunistic_sysret_failed - - movq (R11-ARGOFFSET)(%rsp), %r11 - cmpq %r11,(EFLAGS-ARGOFFSET)(%rsp) /* R11 == RFLAGS */ - jne opportunistic_sysret_failed - - testq $X86_EFLAGS_RF,%r11 /* sysret can't restore RF */ - jnz opportunistic_sysret_failed - - /* nothing to check for RSP */ - - cmpq $__USER_DS,(SS-ARGOFFSET)(%rsp) /* SS must match SYSRET */ - jne opportunistic_sysret_failed - - /* - * We win! This label is here just for ease of understanding - * perf profiles. Nothing jumps here. - */ -irq_return_via_sysret: - CFI_REMEMBER_STATE - RESTORE_ARGS 1,8,1 - movq (RSP-RIP)(%rsp),%rsp - USERGS_SYSRET64 - CFI_RESTORE_STATE - -opportunistic_sysret_failed: SWAPGS - jmp restore_args + jmp restore_c_regs_and_iret -retint_restore_args: /* return to kernel space */ - DISABLE_INTERRUPTS(CLBR_ANY) +/* Returning to kernel space */ +retint_kernel: +#ifdef CONFIG_PREEMPT + /* Interrupts are off */ + /* Check if we need preemption */ + bt $9,EFLAGS(%rsp) /* interrupts were off? */ + jnc 1f +0: cmpl $0,PER_CPU_VAR(__preempt_count) + jnz 1f + call preempt_schedule_irq + jmp 0b +1: +#endif /* * The iretq could re-enable interrupts: */ TRACE_IRQS_IRETQ -restore_args: - RESTORE_ARGS 1,8,1 + +/* + * At this label, code paths which return to kernel and to user, + * which come from interrupts/exception and from syscalls, merge. + */ +restore_c_regs_and_iret: + RESTORE_C_REGS + REMOVE_PT_GPREGS_FROM_STACK 8 irq_return: INTERRUPT_RETURN @@ -887,28 +854,17 @@ retint_signal: jz retint_swapgs TRACE_IRQS_ON ENABLE_INTERRUPTS(CLBR_NONE) - SAVE_REST + SAVE_EXTRA_REGS movq $-1,ORIG_RAX(%rsp) xorl %esi,%esi # oldset movq %rsp,%rdi # &pt_regs call do_notify_resume - RESTORE_REST + RESTORE_EXTRA_REGS DISABLE_INTERRUPTS(CLBR_NONE) TRACE_IRQS_OFF GET_THREAD_INFO(%rcx) jmp retint_with_reschedule -#ifdef CONFIG_PREEMPT - /* Returning to kernel space. Check if we need preemption */ - /* rcx: threadinfo. interrupts off. */ -ENTRY(retint_kernel) - cmpl $0,PER_CPU_VAR(__preempt_count) - jnz retint_restore_args - bt $9,EFLAGS-ARGOFFSET(%rsp) /* interrupts off? */ - jnc retint_restore_args - call preempt_schedule_irq - jmp exit_intr -#endif CFI_ENDPROC END(common_interrupt) @@ -997,7 +953,7 @@ apicinterrupt IRQ_WORK_VECTOR \ /* * Exception entry points. */ -#define INIT_TSS_IST(x) PER_CPU_VAR(init_tss) + (TSS_ist + ((x) - 1) * 8) +#define CPU_TSS_IST(x) PER_CPU_VAR(cpu_tss) + (TSS_ist + ((x) - 1) * 8) .macro idtentry sym do_sym has_error_code:req paranoid=0 shift_ist=-1 ENTRY(\sym) @@ -1019,8 +975,7 @@ ENTRY(\sym) pushq_cfi $-1 /* ORIG_RAX: no syscall to restart */ .endif - subq $ORIG_RAX-R15, %rsp - CFI_ADJUST_CFA_OFFSET ORIG_RAX-R15 + ALLOC_PT_GPREGS_ON_STACK .if \paranoid .if \paranoid == 1 @@ -1028,10 +983,11 @@ ENTRY(\sym) testl $3, CS(%rsp) /* If coming from userspace, switch */ jnz 1f /* stacks. */ .endif - call save_paranoid + call paranoid_entry .else call error_entry .endif + /* returned flag: ebx=0: need swapgs on exit, ebx=1: don't need it */ DEFAULT_FRAME 0 @@ -1053,19 +1009,20 @@ ENTRY(\sym) .endif .if \shift_ist != -1 - subq $EXCEPTION_STKSZ, INIT_TSS_IST(\shift_ist) + subq $EXCEPTION_STKSZ, CPU_TSS_IST(\shift_ist) .endif call \do_sym .if \shift_ist != -1 - addq $EXCEPTION_STKSZ, INIT_TSS_IST(\shift_ist) + addq $EXCEPTION_STKSZ, CPU_TSS_IST(\shift_ist) .endif + /* these procedures expect "no swapgs" flag in ebx */ .if \paranoid - jmp paranoid_exit /* %ebx: no swapgs flag */ + jmp paranoid_exit .else - jmp error_exit /* %ebx: no swapgs flag */ + jmp error_exit .endif .if \paranoid == 1 @@ -1208,6 +1165,9 @@ ENTRY(xen_do_hypervisor_callback) # do_hypervisor_callback(struct *pt_regs) popq %rsp CFI_DEF_CFA_REGISTER rsp decl PER_CPU_VAR(irq_count) +#ifndef CONFIG_PREEMPT + call xen_maybe_preempt_hcall +#endif jmp error_exit CFI_ENDPROC END(xen_do_hypervisor_callback) @@ -1266,7 +1226,9 @@ ENTRY(xen_failsafe_callback) addq $0x30,%rsp CFI_ADJUST_CFA_OFFSET -0x30 pushq_cfi $-1 /* orig_ax = -1 => not a system call */ - SAVE_ALL + ALLOC_PT_GPREGS_ON_STACK + SAVE_C_REGS + SAVE_EXTRA_REGS jmp error_exit CFI_ENDPROC END(xen_failsafe_callback) @@ -1298,59 +1260,66 @@ idtentry async_page_fault do_async_page_fault has_error_code=1 idtentry machine_check has_error_code=0 paranoid=1 do_sym=*machine_check_vector(%rip) #endif - /* - * "Paranoid" exit path from exception stack. This is invoked - * only on return from non-NMI IST interrupts that came - * from kernel space. - * - * We may be returning to very strange contexts (e.g. very early - * in syscall entry), so checking for preemption here would - * be complicated. Fortunately, we there's no good reason - * to try to handle preemption here. - */ +/* + * Save all registers in pt_regs, and switch gs if needed. + * Use slow, but surefire "are we in kernel?" check. + * Return: ebx=0: need swapgs on exit, ebx=1: otherwise + */ +ENTRY(paranoid_entry) + XCPT_FRAME 1 15*8 + cld + SAVE_C_REGS 8 + SAVE_EXTRA_REGS 8 + movl $1,%ebx + movl $MSR_GS_BASE,%ecx + rdmsr + testl %edx,%edx + js 1f /* negative -> in kernel */ + SWAPGS + xorl %ebx,%ebx +1: ret + CFI_ENDPROC +END(paranoid_entry) - /* ebx: no swapgs flag */ +/* + * "Paranoid" exit path from exception stack. This is invoked + * only on return from non-NMI IST interrupts that came + * from kernel space. + * + * We may be returning to very strange contexts (e.g. very early + * in syscall entry), so checking for preemption here would + * be complicated. Fortunately, we there's no good reason + * to try to handle preemption here. + */ +/* On entry, ebx is "no swapgs" flag (1: don't need swapgs, 0: need it) */ ENTRY(paranoid_exit) DEFAULT_FRAME DISABLE_INTERRUPTS(CLBR_NONE) TRACE_IRQS_OFF_DEBUG testl %ebx,%ebx /* swapgs needed? */ - jnz paranoid_restore - TRACE_IRQS_IRETQ 0 + jnz paranoid_exit_no_swapgs + TRACE_IRQS_IRETQ SWAPGS_UNSAFE_STACK - RESTORE_ALL 8 - INTERRUPT_RETURN -paranoid_restore: - TRACE_IRQS_IRETQ_DEBUG 0 - RESTORE_ALL 8 + jmp paranoid_exit_restore +paranoid_exit_no_swapgs: + TRACE_IRQS_IRETQ_DEBUG +paranoid_exit_restore: + RESTORE_EXTRA_REGS + RESTORE_C_REGS + REMOVE_PT_GPREGS_FROM_STACK 8 INTERRUPT_RETURN CFI_ENDPROC END(paranoid_exit) /* - * Exception entry point. This expects an error code/orig_rax on the stack. - * returns in "no swapgs flag" in %ebx. + * Save all registers in pt_regs, and switch gs if needed. + * Return: ebx=0: need swapgs on exit, ebx=1: otherwise */ ENTRY(error_entry) - XCPT_FRAME - CFI_ADJUST_CFA_OFFSET 15*8 - /* oldrax contains error code */ + XCPT_FRAME 1 15*8 cld - movq %rdi, RDI+8(%rsp) - movq %rsi, RSI+8(%rsp) - movq %rdx, RDX+8(%rsp) - movq %rcx, RCX+8(%rsp) - movq %rax, RAX+8(%rsp) - movq %r8, R8+8(%rsp) - movq %r9, R9+8(%rsp) - movq %r10, R10+8(%rsp) - movq %r11, R11+8(%rsp) - movq_cfi rbx, RBX+8 - movq %rbp, RBP+8(%rsp) - movq %r12, R12+8(%rsp) - movq %r13, R13+8(%rsp) - movq %r14, R14+8(%rsp) - movq %r15, R15+8(%rsp) + SAVE_C_REGS 8 + SAVE_EXTRA_REGS 8 xorl %ebx,%ebx testl $3,CS+8(%rsp) je error_kernelspace @@ -1360,12 +1329,12 @@ error_sti: TRACE_IRQS_OFF ret -/* - * There are two places in the kernel that can potentially fault with - * usergs. Handle them here. B stepping K8s sometimes report a - * truncated RIP for IRET exceptions returning to compat mode. Check - * for these here too. - */ + /* + * There are two places in the kernel that can potentially fault with + * usergs. Handle them here. B stepping K8s sometimes report a + * truncated RIP for IRET exceptions returning to compat mode. Check + * for these here too. + */ error_kernelspace: CFI_REL_OFFSET rcx, RCX+8 incl %ebx @@ -1395,11 +1364,11 @@ error_bad_iret: END(error_entry) -/* ebx: no swapgs flag (1: don't need swapgs, 0: need it) */ +/* On entry, ebx is "no swapgs" flag (1: don't need swapgs, 0: need it) */ ENTRY(error_exit) DEFAULT_FRAME movl %ebx,%eax - RESTORE_REST + RESTORE_EXTRA_REGS DISABLE_INTERRUPTS(CLBR_NONE) TRACE_IRQS_OFF GET_THREAD_INFO(%rcx) @@ -1414,19 +1383,7 @@ ENTRY(error_exit) CFI_ENDPROC END(error_exit) -/* - * Test if a given stack is an NMI stack or not. - */ - .macro test_in_nmi reg stack nmi_ret normal_ret - cmpq %\reg, \stack - ja \normal_ret - subq $EXCEPTION_STKSZ, %\reg - cmpq %\reg, \stack - jb \normal_ret - jmp \nmi_ret - .endm - - /* runs on exception stack */ +/* Runs on exception stack */ ENTRY(nmi) INTR_FRAME PARAVIRT_ADJUST_EXCEPTION_FRAME @@ -1462,7 +1419,7 @@ ENTRY(nmi) * NMI. */ - /* Use %rdx as out temp variable throughout */ + /* Use %rdx as our temp variable throughout */ pushq_cfi %rdx CFI_REL_OFFSET rdx, 0 @@ -1487,8 +1444,17 @@ ENTRY(nmi) * We check the variable because the first NMI could be in a * breakpoint routine using a breakpoint stack. */ - lea 6*8(%rsp), %rdx - test_in_nmi rdx, 4*8(%rsp), nested_nmi, first_nmi + lea 6*8(%rsp), %rdx + /* Compare the NMI stack (rdx) with the stack we came from (4*8(%rsp)) */ + cmpq %rdx, 4*8(%rsp) + /* If the stack pointer is above the NMI stack, this is a normal NMI */ + ja first_nmi + subq $EXCEPTION_STKSZ, %rdx + cmpq %rdx, 4*8(%rsp) + /* If it is below the NMI stack, it is a normal NMI */ + jb first_nmi + /* Ah, it is within the NMI stack, treat it as nested */ + CFI_REMEMBER_STATE nested_nmi: @@ -1581,7 +1547,7 @@ first_nmi: .rept 5 pushq_cfi 11*8(%rsp) .endr - CFI_DEF_CFA_OFFSET SS+8-RIP + CFI_DEF_CFA_OFFSET 5*8 /* Everything up to here is safe from nested NMIs */ @@ -1609,7 +1575,7 @@ repeat_nmi: pushq_cfi -6*8(%rsp) .endr subq $(5*8), %rsp - CFI_DEF_CFA_OFFSET SS+8-RIP + CFI_DEF_CFA_OFFSET 5*8 end_repeat_nmi: /* @@ -1618,16 +1584,16 @@ end_repeat_nmi: * so that we repeat another NMI. */ pushq_cfi $-1 /* ORIG_RAX: no syscall to restart */ - subq $ORIG_RAX-R15, %rsp - CFI_ADJUST_CFA_OFFSET ORIG_RAX-R15 + ALLOC_PT_GPREGS_ON_STACK + /* - * Use save_paranoid to handle SWAPGS, but no need to use paranoid_exit + * Use paranoid_entry to handle SWAPGS, but no need to use paranoid_exit * as we should not be calling schedule in NMI context. * Even with normal interrupts enabled. An NMI should not be * setting NEED_RESCHED or anything that normal interrupts and * exceptions might do. */ - call save_paranoid + call paranoid_entry DEFAULT_FRAME 0 /* @@ -1658,8 +1624,10 @@ end_repeat_nmi: nmi_swapgs: SWAPGS_UNSAFE_STACK nmi_restore: + RESTORE_EXTRA_REGS + RESTORE_C_REGS /* Pop the extra iret frame at once */ - RESTORE_ALL 6*8 + REMOVE_PT_GPREGS_FROM_STACK 6*8 /* Clear the NMI executing stack variable */ movq $0, 5*8(%rsp) diff --git a/arch/x86/kernel/head64.c b/arch/x86/kernel/head64.c index c4f8d4659070..2b55ee6db053 100644 --- a/arch/x86/kernel/head64.c +++ b/arch/x86/kernel/head64.c @@ -177,9 +177,6 @@ asmlinkage __visible void __init x86_64_start_kernel(char * real_mode_data) */ load_ucode_bsp(); - if (console_loglevel >= CONSOLE_LOGLEVEL_DEBUG) - early_printk("Kernel alive\n"); - clear_page(init_level4_pgt); /* set init_level4_pgt kernel high mapping*/ init_level4_pgt[511] = early_level4_pgt[511]; diff --git a/arch/x86/kernel/head_32.S b/arch/x86/kernel/head_32.S index f36bd42d6f0c..d031bad9e07e 100644 --- a/arch/x86/kernel/head_32.S +++ b/arch/x86/kernel/head_32.S @@ -22,6 +22,7 @@ #include <asm/cpufeature.h> #include <asm/percpu.h> #include <asm/nops.h> +#include <asm/bootparam.h> /* Physical address */ #define pa(X) ((X) - __PAGE_OFFSET) @@ -90,7 +91,7 @@ ENTRY(startup_32) /* test KEEP_SEGMENTS flag to see if the bootloader is asking us to not reload segments */ - testb $(1<<6), BP_loadflags(%esi) + testb $KEEP_SEGMENTS, BP_loadflags(%esi) jnz 2f /* diff --git a/arch/x86/kernel/head_64.S b/arch/x86/kernel/head_64.S index 6fd514d9f69a..ae6588b301c2 100644 --- a/arch/x86/kernel/head_64.S +++ b/arch/x86/kernel/head_64.S @@ -1,5 +1,5 @@ /* - * linux/arch/x86_64/kernel/head.S -- start in 32bit and switch to 64bit + * linux/arch/x86/kernel/head_64.S -- start in 32bit and switch to 64bit * * Copyright (C) 2000 Andrea Arcangeli <andrea@suse.de> SuSE * Copyright (C) 2000 Pavel Machek <pavel@suse.cz> @@ -56,7 +56,7 @@ startup_64: * %rsi holds a physical pointer to real_mode_data. * * We come here either directly from a 64bit bootloader, or from - * arch/x86_64/boot/compressed/head.S. + * arch/x86/boot/compressed/head_64.S. * * We only come here initially at boot nothing else comes here. * @@ -146,7 +146,7 @@ startup_64: leaq level2_kernel_pgt(%rip), %rdi leaq 4096(%rdi), %r8 /* See if it is a valid page table entry */ -1: testq $1, 0(%rdi) +1: testb $1, 0(%rdi) jz 2f addq %rbp, 0(%rdi) /* Go to the next page */ diff --git a/arch/x86/kernel/i387.c b/arch/x86/kernel/i387.c index d5651fce0b71..367f39d35e9c 100644 --- a/arch/x86/kernel/i387.c +++ b/arch/x86/kernel/i387.c @@ -42,8 +42,8 @@ void kernel_fpu_enable(void) * be set (so that the clts/stts pair does nothing that is * visible in the interrupted kernel thread). * - * Except for the eagerfpu case when we return 1 unless we've already - * been eager and saved the state in kernel_fpu_begin(). + * Except for the eagerfpu case when we return true; in the likely case + * the thread has FPU but we are not going to set/clear TS. */ static inline bool interrupted_kernel_fpu_idle(void) { @@ -51,7 +51,7 @@ static inline bool interrupted_kernel_fpu_idle(void) return false; if (use_eager_fpu()) - return __thread_has_fpu(current); + return true; return !__thread_has_fpu(current) && (read_cr0() & X86_CR0_TS); @@ -68,7 +68,7 @@ static inline bool interrupted_kernel_fpu_idle(void) static inline bool interrupted_user_mode(void) { struct pt_regs *regs = get_irq_regs(); - return regs && user_mode_vm(regs); + return regs && user_mode(regs); } /* @@ -94,9 +94,10 @@ void __kernel_fpu_begin(void) if (__thread_has_fpu(me)) { __save_init_fpu(me); - } else if (!use_eager_fpu()) { + } else { this_cpu_write(fpu_owner_task, NULL); - clts(); + if (!use_eager_fpu()) + clts(); } } EXPORT_SYMBOL(__kernel_fpu_begin); @@ -107,7 +108,7 @@ void __kernel_fpu_end(void) if (__thread_has_fpu(me)) { if (WARN_ON(restore_fpu_checking(me))) - drop_init_fpu(me); + fpu_reset_state(me); } else if (!use_eager_fpu()) { stts(); } @@ -120,10 +121,13 @@ void unlazy_fpu(struct task_struct *tsk) { preempt_disable(); if (__thread_has_fpu(tsk)) { - __save_init_fpu(tsk); - __thread_fpu_end(tsk); - } else - tsk->thread.fpu_counter = 0; + if (use_eager_fpu()) { + __save_fpu(tsk); + } else { + __save_init_fpu(tsk); + __thread_fpu_end(tsk); + } + } preempt_enable(); } EXPORT_SYMBOL(unlazy_fpu); @@ -221,11 +225,12 @@ void fpu_finit(struct fpu *fpu) return; } + memset(fpu->state, 0, xstate_size); + if (cpu_has_fxsr) { fx_finit(&fpu->state->fxsave); } else { struct i387_fsave_struct *fp = &fpu->state->fsave; - memset(fp, 0, xstate_size); fp->cwd = 0xffff037fu; fp->swd = 0xffff0000u; fp->twd = 0xffffffffu; @@ -247,7 +252,7 @@ int init_fpu(struct task_struct *tsk) if (tsk_used_math(tsk)) { if (cpu_has_fpu && tsk == current) unlazy_fpu(tsk); - tsk->thread.fpu.last_cpu = ~0; + task_disable_lazy_fpu_restore(tsk); return 0; } @@ -336,6 +341,7 @@ int xstateregs_get(struct task_struct *target, const struct user_regset *regset, unsigned int pos, unsigned int count, void *kbuf, void __user *ubuf) { + struct xsave_struct *xsave = &target->thread.fpu.state->xsave; int ret; if (!cpu_has_xsave) @@ -350,14 +356,12 @@ int xstateregs_get(struct task_struct *target, const struct user_regset *regset, * memory layout in the thread struct, so that we can copy the entire * xstateregs to the user using one user_regset_copyout(). */ - memcpy(&target->thread.fpu.state->fxsave.sw_reserved, - xstate_fx_sw_bytes, sizeof(xstate_fx_sw_bytes)); - + memcpy(&xsave->i387.sw_reserved, + xstate_fx_sw_bytes, sizeof(xstate_fx_sw_bytes)); /* * Copy the xstate memory layout. */ - ret = user_regset_copyout(&pos, &count, &kbuf, &ubuf, - &target->thread.fpu.state->xsave, 0, -1); + ret = user_regset_copyout(&pos, &count, &kbuf, &ubuf, xsave, 0, -1); return ret; } @@ -365,8 +369,8 @@ int xstateregs_set(struct task_struct *target, const struct user_regset *regset, unsigned int pos, unsigned int count, const void *kbuf, const void __user *ubuf) { + struct xsave_struct *xsave = &target->thread.fpu.state->xsave; int ret; - struct xsave_hdr_struct *xsave_hdr; if (!cpu_has_xsave) return -ENODEV; @@ -375,22 +379,16 @@ int xstateregs_set(struct task_struct *target, const struct user_regset *regset, if (ret) return ret; - ret = user_regset_copyin(&pos, &count, &kbuf, &ubuf, - &target->thread.fpu.state->xsave, 0, -1); - + ret = user_regset_copyin(&pos, &count, &kbuf, &ubuf, xsave, 0, -1); /* * mxcsr reserved bits must be masked to zero for security reasons. */ - target->thread.fpu.state->fxsave.mxcsr &= mxcsr_feature_mask; - - xsave_hdr = &target->thread.fpu.state->xsave.xsave_hdr; - - xsave_hdr->xstate_bv &= pcntxt_mask; + xsave->i387.mxcsr &= mxcsr_feature_mask; + xsave->xsave_hdr.xstate_bv &= pcntxt_mask; /* * These bits must be zero. */ - memset(xsave_hdr->reserved, 0, 48); - + memset(&xsave->xsave_hdr.reserved, 0, 48); return ret; } diff --git a/arch/x86/kernel/ioport.c b/arch/x86/kernel/ioport.c index 4ddaf66ea35f..37dae792dbbe 100644 --- a/arch/x86/kernel/ioport.c +++ b/arch/x86/kernel/ioport.c @@ -54,7 +54,7 @@ asmlinkage long sys_ioperm(unsigned long from, unsigned long num, int turn_on) * because the ->io_bitmap_max value must match the bitmap * contents: */ - tss = &per_cpu(init_tss, get_cpu()); + tss = &per_cpu(cpu_tss, get_cpu()); if (turn_on) bitmap_clear(t->io_bitmap_ptr, from, num); diff --git a/arch/x86/kernel/irq.c b/arch/x86/kernel/irq.c index 67b1cbe0093a..e5952c225532 100644 --- a/arch/x86/kernel/irq.c +++ b/arch/x86/kernel/irq.c @@ -295,7 +295,7 @@ int check_irq_vectors_for_cpu_disable(void) this_cpu = smp_processor_id(); cpumask_copy(&online_new, cpu_online_mask); - cpu_clear(this_cpu, online_new); + cpumask_clear_cpu(this_cpu, &online_new); this_count = 0; for (vector = FIRST_EXTERNAL_VECTOR; vector < NR_VECTORS; vector++) { @@ -307,7 +307,7 @@ int check_irq_vectors_for_cpu_disable(void) data = irq_desc_get_irq_data(desc); cpumask_copy(&affinity_new, data->affinity); - cpu_clear(this_cpu, affinity_new); + cpumask_clear_cpu(this_cpu, &affinity_new); /* Do not count inactive or per-cpu irqs. */ if (!irq_has_action(irq) || irqd_is_per_cpu(data)) diff --git a/arch/x86/kernel/irq_32.c b/arch/x86/kernel/irq_32.c index 28d28f5eb8f4..f9fd86a7fcc7 100644 --- a/arch/x86/kernel/irq_32.c +++ b/arch/x86/kernel/irq_32.c @@ -165,7 +165,7 @@ bool handle_irq(unsigned irq, struct pt_regs *regs) if (unlikely(!desc)) return false; - if (user_mode_vm(regs) || !execute_on_irq_stack(overflow, desc, irq)) { + if (user_mode(regs) || !execute_on_irq_stack(overflow, desc, irq)) { if (unlikely(overflow)) print_stack_overflow(); desc->handle_irq(irq, desc); diff --git a/arch/x86/kernel/irq_64.c b/arch/x86/kernel/irq_64.c index e4b503d5558c..394e643d7830 100644 --- a/arch/x86/kernel/irq_64.c +++ b/arch/x86/kernel/irq_64.c @@ -44,7 +44,7 @@ static inline void stack_overflow_check(struct pt_regs *regs) u64 estack_top, estack_bottom; u64 curbase = (u64)task_stack_page(current); - if (user_mode_vm(regs)) + if (user_mode(regs)) return; if (regs->sp >= curbase + sizeof(struct thread_info) + diff --git a/arch/x86/kernel/irqinit.c b/arch/x86/kernel/irqinit.c index 70e181ea1eac..cd10a6437264 100644 --- a/arch/x86/kernel/irqinit.c +++ b/arch/x86/kernel/irqinit.c @@ -178,7 +178,8 @@ void __init native_init_IRQ(void) #endif for_each_clear_bit_from(i, used_vectors, first_system_vector) { /* IA32_SYSCALL_VECTOR could be used in trap_init already. */ - set_intr_gate(i, interrupt[i - FIRST_EXTERNAL_VECTOR]); + set_intr_gate(i, irq_entries_start + + 8 * (i - FIRST_EXTERNAL_VECTOR)); } #ifdef CONFIG_X86_LOCAL_APIC for_each_clear_bit_from(i, used_vectors, NR_VECTORS) diff --git a/arch/x86/kernel/kgdb.c b/arch/x86/kernel/kgdb.c index 7ec1d5f8d283..d6178d9791db 100644 --- a/arch/x86/kernel/kgdb.c +++ b/arch/x86/kernel/kgdb.c @@ -72,7 +72,7 @@ struct dbg_reg_def_t dbg_reg_def[DBG_MAX_REG_NUM] = { "bx", 8, offsetof(struct pt_regs, bx) }, { "cx", 8, offsetof(struct pt_regs, cx) }, { "dx", 8, offsetof(struct pt_regs, dx) }, - { "si", 8, offsetof(struct pt_regs, dx) }, + { "si", 8, offsetof(struct pt_regs, si) }, { "di", 8, offsetof(struct pt_regs, di) }, { "bp", 8, offsetof(struct pt_regs, bp) }, { "sp", 8, offsetof(struct pt_regs, sp) }, @@ -126,11 +126,11 @@ char *dbg_get_reg(int regno, void *mem, struct pt_regs *regs) #ifdef CONFIG_X86_32 switch (regno) { case GDB_SS: - if (!user_mode_vm(regs)) + if (!user_mode(regs)) *(unsigned long *)mem = __KERNEL_DS; break; case GDB_SP: - if (!user_mode_vm(regs)) + if (!user_mode(regs)) *(unsigned long *)mem = kernel_stack_pointer(regs); break; case GDB_GS: diff --git a/arch/x86/kernel/kprobes/core.c b/arch/x86/kernel/kprobes/core.c index 6a1146ea4d4d..24d079604fd5 100644 --- a/arch/x86/kernel/kprobes/core.c +++ b/arch/x86/kernel/kprobes/core.c @@ -223,27 +223,48 @@ static unsigned long __recover_probed_insn(kprobe_opcode_t *buf, unsigned long addr) { struct kprobe *kp; + unsigned long faddr; kp = get_kprobe((void *)addr); - /* There is no probe, return original address */ - if (!kp) + faddr = ftrace_location(addr); + /* + * Addresses inside the ftrace location are refused by + * arch_check_ftrace_location(). Something went terribly wrong + * if such an address is checked here. + */ + if (WARN_ON(faddr && faddr != addr)) + return 0UL; + /* + * Use the current code if it is not modified by Kprobe + * and it cannot be modified by ftrace. + */ + if (!kp && !faddr) return addr; /* - * Basically, kp->ainsn.insn has an original instruction. - * However, RIP-relative instruction can not do single-stepping - * at different place, __copy_instruction() tweaks the displacement of - * that instruction. In that case, we can't recover the instruction - * from the kp->ainsn.insn. + * Basically, kp->ainsn.insn has an original instruction. + * However, RIP-relative instruction can not do single-stepping + * at different place, __copy_instruction() tweaks the displacement of + * that instruction. In that case, we can't recover the instruction + * from the kp->ainsn.insn. * - * On the other hand, kp->opcode has a copy of the first byte of - * the probed instruction, which is overwritten by int3. And - * the instruction at kp->addr is not modified by kprobes except - * for the first byte, we can recover the original instruction - * from it and kp->opcode. + * On the other hand, in case on normal Kprobe, kp->opcode has a copy + * of the first byte of the probed instruction, which is overwritten + * by int3. And the instruction at kp->addr is not modified by kprobes + * except for the first byte, we can recover the original instruction + * from it and kp->opcode. + * + * In case of Kprobes using ftrace, we do not have a copy of + * the original instruction. In fact, the ftrace location might + * be modified at anytime and even could be in an inconsistent state. + * Fortunately, we know that the original code is the ideal 5-byte + * long NOP. */ - memcpy(buf, kp->addr, MAX_INSN_SIZE * sizeof(kprobe_opcode_t)); - buf[0] = kp->opcode; + memcpy(buf, (void *)addr, MAX_INSN_SIZE * sizeof(kprobe_opcode_t)); + if (faddr) + memcpy(buf, ideal_nops[NOP_ATOMIC5], 5); + else + buf[0] = kp->opcode; return (unsigned long)buf; } @@ -251,6 +272,7 @@ __recover_probed_insn(kprobe_opcode_t *buf, unsigned long addr) * Recover the probed instruction at addr for further analysis. * Caller must lock kprobes by kprobe_mutex, or disable preemption * for preventing to release referencing kprobes. + * Returns zero if the instruction can not get recovered. */ unsigned long recover_probed_instruction(kprobe_opcode_t *buf, unsigned long addr) { @@ -285,6 +307,8 @@ static int can_probe(unsigned long paddr) * normally used, we just go through if there is no kprobe. */ __addr = recover_probed_instruction(buf, addr); + if (!__addr) + return 0; kernel_insn_init(&insn, (void *)__addr, MAX_INSN_SIZE); insn_get_length(&insn); @@ -333,6 +357,8 @@ int __copy_instruction(u8 *dest, u8 *src) unsigned long recovered_insn = recover_probed_instruction(buf, (unsigned long)src); + if (!recovered_insn) + return 0; kernel_insn_init(&insn, (void *)recovered_insn, MAX_INSN_SIZE); insn_get_length(&insn); /* Another subsystem puts a breakpoint, failed to recover */ @@ -576,7 +602,7 @@ int kprobe_int3_handler(struct pt_regs *regs) struct kprobe *p; struct kprobe_ctlblk *kcb; - if (user_mode_vm(regs)) + if (user_mode(regs)) return 0; addr = (kprobe_opcode_t *)(regs->ip - sizeof(kprobe_opcode_t)); @@ -981,7 +1007,7 @@ int kprobe_exceptions_notify(struct notifier_block *self, unsigned long val, struct die_args *args = data; int ret = NOTIFY_DONE; - if (args->regs && user_mode_vm(args->regs)) + if (args->regs && user_mode(args->regs)) return ret; if (val == DIE_GPF) { diff --git a/arch/x86/kernel/kprobes/opt.c b/arch/x86/kernel/kprobes/opt.c index 0dd8d089c315..7b3b9d15c47a 100644 --- a/arch/x86/kernel/kprobes/opt.c +++ b/arch/x86/kernel/kprobes/opt.c @@ -259,6 +259,8 @@ static int can_optimize(unsigned long paddr) */ return 0; recovered_insn = recover_probed_instruction(buf, addr); + if (!recovered_insn) + return 0; kernel_insn_init(&insn, (void *)recovered_insn, MAX_INSN_SIZE); insn_get_length(&insn); /* Another subsystem puts a breakpoint */ diff --git a/arch/x86/kernel/module.c b/arch/x86/kernel/module.c index 9bbb9b35c144..005c03e93fc5 100644 --- a/arch/x86/kernel/module.c +++ b/arch/x86/kernel/module.c @@ -33,6 +33,7 @@ #include <asm/page.h> #include <asm/pgtable.h> +#include <asm/setup.h> #if 0 #define DEBUGP(fmt, ...) \ @@ -53,7 +54,7 @@ static DEFINE_MUTEX(module_kaslr_mutex); static unsigned long int get_module_load_offset(void) { - if (kaslr_enabled) { + if (kaslr_enabled()) { mutex_lock(&module_kaslr_mutex); /* * Calculate the module_load_offset the first time this diff --git a/arch/x86/kernel/perf_regs.c b/arch/x86/kernel/perf_regs.c index 781861cc5ee8..da8cb987b973 100644 --- a/arch/x86/kernel/perf_regs.c +++ b/arch/x86/kernel/perf_regs.c @@ -131,10 +131,11 @@ void perf_get_regs_user(struct perf_regs *regs_user, } /* - * RIP, flags, and the argument registers are usually saved. - * orig_ax is probably okay, too. + * These registers are always saved on 64-bit syscall entry. + * On 32-bit entry points, they are saved too except r8..r11. */ regs_user_copy->ip = user_regs->ip; + regs_user_copy->ax = user_regs->ax; regs_user_copy->cx = user_regs->cx; regs_user_copy->dx = user_regs->dx; regs_user_copy->si = user_regs->si; @@ -145,9 +146,12 @@ void perf_get_regs_user(struct perf_regs *regs_user, regs_user_copy->r11 = user_regs->r11; regs_user_copy->orig_ax = user_regs->orig_ax; regs_user_copy->flags = user_regs->flags; + regs_user_copy->sp = user_regs->sp; + regs_user_copy->cs = user_regs->cs; + regs_user_copy->ss = user_regs->ss; /* - * Don't even try to report the "rest" regs. + * Most system calls don't save these registers, don't report them. */ regs_user_copy->bx = -1; regs_user_copy->bp = -1; @@ -158,37 +162,13 @@ void perf_get_regs_user(struct perf_regs *regs_user, /* * For this to be at all useful, we need a reasonable guess for - * sp and the ABI. Be careful: we're in NMI context, and we're + * the ABI. Be careful: we're in NMI context, and we're * considering current to be the current task, so we should * be careful not to look at any other percpu variables that might * change during context switches. */ - if (IS_ENABLED(CONFIG_IA32_EMULATION) && - task_thread_info(current)->status & TS_COMPAT) { - /* Easy case: we're in a compat syscall. */ - regs_user->abi = PERF_SAMPLE_REGS_ABI_32; - regs_user_copy->sp = user_regs->sp; - regs_user_copy->cs = user_regs->cs; - regs_user_copy->ss = user_regs->ss; - } else if (user_regs->orig_ax != -1) { - /* - * We're probably in a 64-bit syscall. - * Warning: this code is severely racy. At least it's better - * than just blindly copying user_regs. - */ - regs_user->abi = PERF_SAMPLE_REGS_ABI_64; - regs_user_copy->sp = this_cpu_read(old_rsp); - regs_user_copy->cs = __USER_CS; - regs_user_copy->ss = __USER_DS; - regs_user_copy->cx = -1; /* usually contains garbage */ - } else { - /* We're probably in an interrupt or exception. */ - regs_user->abi = user_64bit_mode(user_regs) ? - PERF_SAMPLE_REGS_ABI_64 : PERF_SAMPLE_REGS_ABI_32; - regs_user_copy->sp = user_regs->sp; - regs_user_copy->cs = user_regs->cs; - regs_user_copy->ss = user_regs->ss; - } + regs_user->abi = user_64bit_mode(user_regs) ? + PERF_SAMPLE_REGS_ABI_64 : PERF_SAMPLE_REGS_ABI_32; regs_user->regs = regs_user_copy; } diff --git a/arch/x86/kernel/process.c b/arch/x86/kernel/process.c index 046e2d620bbe..8213da62b1b7 100644 --- a/arch/x86/kernel/process.c +++ b/arch/x86/kernel/process.c @@ -9,7 +9,7 @@ #include <linux/sched.h> #include <linux/module.h> #include <linux/pm.h> -#include <linux/clockchips.h> +#include <linux/tick.h> #include <linux/random.h> #include <linux/user-return-notifier.h> #include <linux/dmi.h> @@ -24,6 +24,7 @@ #include <asm/syscalls.h> #include <asm/idle.h> #include <asm/uaccess.h> +#include <asm/mwait.h> #include <asm/i387.h> #include <asm/fpu-internal.h> #include <asm/debugreg.h> @@ -37,7 +38,26 @@ * section. Since TSS's are completely CPU-local, we want them * on exact cacheline boundaries, to eliminate cacheline ping-pong. */ -__visible DEFINE_PER_CPU_SHARED_ALIGNED(struct tss_struct, init_tss) = INIT_TSS; +__visible DEFINE_PER_CPU_SHARED_ALIGNED(struct tss_struct, cpu_tss) = { + .x86_tss = { + .sp0 = TOP_OF_INIT_STACK, +#ifdef CONFIG_X86_32 + .ss0 = __KERNEL_DS, + .ss1 = __KERNEL_CS, + .io_bitmap_base = INVALID_IO_BITMAP_OFFSET, +#endif + }, +#ifdef CONFIG_X86_32 + /* + * Note that the .io_bitmap member must be extra-big. This is because + * the CPU will access an additional byte beyond the end of the IO + * permission bitmap. The extra byte must be all 1 bits, and must + * be within the limit. + */ + .io_bitmap = { [0 ... IO_BITMAP_LONGS] = ~0 }, +#endif +}; +EXPORT_PER_CPU_SYMBOL_GPL(cpu_tss); #ifdef CONFIG_X86_64 static DEFINE_PER_CPU(unsigned char, is_idle); @@ -69,8 +89,8 @@ int arch_dup_task_struct(struct task_struct *dst, struct task_struct *src) dst->thread.fpu_counter = 0; dst->thread.fpu.has_fpu = 0; - dst->thread.fpu.last_cpu = ~0; dst->thread.fpu.state = NULL; + task_disable_lazy_fpu_restore(dst); if (tsk_used_math(src)) { int err = fpu_alloc(&dst->thread.fpu); if (err) @@ -109,7 +129,7 @@ void exit_thread(void) unsigned long *bp = t->io_bitmap_ptr; if (bp) { - struct tss_struct *tss = &per_cpu(init_tss, get_cpu()); + struct tss_struct *tss = &per_cpu(cpu_tss, get_cpu()); t->io_bitmap_ptr = NULL; clear_thread_flag(TIF_IO_BITMAP); @@ -131,13 +151,18 @@ void flush_thread(void) flush_ptrace_hw_breakpoint(tsk); memset(tsk->thread.tls_array, 0, sizeof(tsk->thread.tls_array)); - drop_init_fpu(tsk); - /* - * Free the FPU state for non xsave platforms. They get reallocated - * lazily at the first use. - */ - if (!use_eager_fpu()) + + if (!use_eager_fpu()) { + /* FPU state will be reallocated lazily at the first use. */ + drop_fpu(tsk); free_thread_xstate(tsk); + } else if (!used_math()) { + /* kthread execs. TODO: cleanup this horror. */ + if (WARN_ON(init_fpu(tsk))) + force_sig(SIGKILL, tsk); + user_fpu_begin(); + restore_init_xstate(); + } } static void hard_disable_TSC(void) @@ -377,14 +402,11 @@ static void amd_e400_idle(void) if (!cpumask_test_cpu(cpu, amd_e400_c1e_mask)) { cpumask_set_cpu(cpu, amd_e400_c1e_mask); - /* - * Force broadcast so ACPI can not interfere. - */ - clockevents_notify(CLOCK_EVT_NOTIFY_BROADCAST_FORCE, - &cpu); + /* Force broadcast so ACPI can not interfere. */ + tick_broadcast_force(); pr_info("Switch to broadcast mode on CPU%d\n", cpu); } - clockevents_notify(CLOCK_EVT_NOTIFY_BROADCAST_ENTER, &cpu); + tick_broadcast_enter(); default_idle(); @@ -393,12 +415,59 @@ static void amd_e400_idle(void) * called with interrupts disabled. */ local_irq_disable(); - clockevents_notify(CLOCK_EVT_NOTIFY_BROADCAST_EXIT, &cpu); + tick_broadcast_exit(); local_irq_enable(); } else default_idle(); } +/* + * Intel Core2 and older machines prefer MWAIT over HALT for C1. + * We can't rely on cpuidle installing MWAIT, because it will not load + * on systems that support only C1 -- so the boot default must be MWAIT. + * + * Some AMD machines are the opposite, they depend on using HALT. + * + * So for default C1, which is used during boot until cpuidle loads, + * use MWAIT-C1 on Intel HW that has it, else use HALT. + */ +static int prefer_mwait_c1_over_halt(const struct cpuinfo_x86 *c) +{ + if (c->x86_vendor != X86_VENDOR_INTEL) + return 0; + + if (!cpu_has(c, X86_FEATURE_MWAIT)) + return 0; + + return 1; +} + +/* + * MONITOR/MWAIT with no hints, used for default default C1 state. + * This invokes MWAIT with interrutps enabled and no flags, + * which is backwards compatible with the original MWAIT implementation. + */ + +static void mwait_idle(void) +{ + if (!current_set_polling_and_test()) { + if (this_cpu_has(X86_BUG_CLFLUSH_MONITOR)) { + smp_mb(); /* quirk */ + clflush((void *)¤t_thread_info()->flags); + smp_mb(); /* quirk */ + } + + __monitor((void *)¤t_thread_info()->flags, 0, 0); + if (!need_resched()) + __sti_mwait(0, 0); + else + local_irq_enable(); + } else { + local_irq_enable(); + } + __current_clr_polling(); +} + void select_idle_routine(const struct cpuinfo_x86 *c) { #ifdef CONFIG_SMP @@ -412,6 +481,9 @@ void select_idle_routine(const struct cpuinfo_x86 *c) /* E400: APIC timer interrupt does not wake up CPU from C1e */ pr_info("using AMD E400 aware idle routine\n"); x86_idle = amd_e400_idle; + } else if (prefer_mwait_c1_over_halt(c)) { + pr_info("using mwait in idle threads\n"); + x86_idle = mwait_idle; } else x86_idle = default_idle; } diff --git a/arch/x86/kernel/process_32.c b/arch/x86/kernel/process_32.c index 603c4f99cb5a..8ed2106b06da 100644 --- a/arch/x86/kernel/process_32.c +++ b/arch/x86/kernel/process_32.c @@ -73,7 +73,7 @@ void __show_regs(struct pt_regs *regs, int all) unsigned long sp; unsigned short ss, gs; - if (user_mode_vm(regs)) { + if (user_mode(regs)) { sp = regs->sp; ss = regs->ss & 0xffff; gs = get_user_gs(regs); @@ -206,11 +206,7 @@ start_thread(struct pt_regs *regs, unsigned long new_ip, unsigned long new_sp) regs->ip = new_ip; regs->sp = new_sp; regs->flags = X86_EFLAGS_IF; - /* - * force it to the iret return path by making it look as if there was - * some work pending. - */ - set_thread_flag(TIF_NOTIFY_RESUME); + force_iret(); } EXPORT_SYMBOL_GPL(start_thread); @@ -248,7 +244,7 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p) struct thread_struct *prev = &prev_p->thread, *next = &next_p->thread; int cpu = smp_processor_id(); - struct tss_struct *tss = &per_cpu(init_tss, cpu); + struct tss_struct *tss = &per_cpu(cpu_tss, cpu); fpu_switch_t fpu; /* never put a printk in __switch_to... printk() calls wake_up*() indirectly */ @@ -256,11 +252,6 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p) fpu = switch_fpu_prepare(prev_p, next_p, cpu); /* - * Reload esp0. - */ - load_sp0(tss, next); - - /* * Save away %gs. No need to save %fs, as it was saved on the * stack on entry. No need to save %es and %ds, as those are * always kernel segments while inside the kernel. Doing this @@ -310,9 +301,17 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p) */ arch_end_context_switch(next_p); + /* + * Reload esp0, kernel_stack, and current_top_of_stack. This changes + * current_thread_info(). + */ + load_sp0(tss, next); this_cpu_write(kernel_stack, - (unsigned long)task_stack_page(next_p) + - THREAD_SIZE - KERNEL_STACK_OFFSET); + (unsigned long)task_stack_page(next_p) + + THREAD_SIZE); + this_cpu_write(cpu_current_top_of_stack, + (unsigned long)task_stack_page(next_p) + + THREAD_SIZE); /* * Restore %gs if needed (which is common) diff --git a/arch/x86/kernel/process_64.c b/arch/x86/kernel/process_64.c index 67fcc43577d2..4baaa972f52a 100644 --- a/arch/x86/kernel/process_64.c +++ b/arch/x86/kernel/process_64.c @@ -52,7 +52,7 @@ asmlinkage extern void ret_from_fork(void); -__visible DEFINE_PER_CPU(unsigned long, old_rsp); +__visible DEFINE_PER_CPU(unsigned long, rsp_scratch); /* Prints also some state that isn't saved in the pt_regs */ void __show_regs(struct pt_regs *regs, int all) @@ -161,7 +161,6 @@ int copy_thread(unsigned long clone_flags, unsigned long sp, p->thread.sp0 = (unsigned long)task_stack_page(p) + THREAD_SIZE; childregs = task_pt_regs(p); p->thread.sp = (unsigned long) childregs; - p->thread.usersp = me->thread.usersp; set_tsk_thread_flag(p, TIF_FORK); p->thread.io_bitmap_ptr = NULL; @@ -207,7 +206,7 @@ int copy_thread(unsigned long clone_flags, unsigned long sp, */ if (clone_flags & CLONE_SETTLS) { #ifdef CONFIG_IA32_EMULATION - if (test_thread_flag(TIF_IA32)) + if (is_ia32_task()) err = do_set_thread_area(p, -1, (struct user_desc __user *)childregs->si, 0); else @@ -235,13 +234,12 @@ start_thread_common(struct pt_regs *regs, unsigned long new_ip, loadsegment(es, _ds); loadsegment(ds, _ds); load_gs_index(0); - current->thread.usersp = new_sp; regs->ip = new_ip; regs->sp = new_sp; - this_cpu_write(old_rsp, new_sp); regs->cs = _cs; regs->ss = _ss; regs->flags = X86_EFLAGS_IF; + force_iret(); } void @@ -277,15 +275,12 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p) struct thread_struct *prev = &prev_p->thread; struct thread_struct *next = &next_p->thread; int cpu = smp_processor_id(); - struct tss_struct *tss = &per_cpu(init_tss, cpu); + struct tss_struct *tss = &per_cpu(cpu_tss, cpu); unsigned fsindex, gsindex; fpu_switch_t fpu; fpu = switch_fpu_prepare(prev_p, next_p, cpu); - /* Reload esp0 and ss1. */ - load_sp0(tss, next); - /* We must save %fs and %gs before load_TLS() because * %fs and %gs may be cleared by load_TLS(). * @@ -401,8 +396,6 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p) /* * Switch the PDA and FPU contexts. */ - prev->usersp = this_cpu_read(old_rsp); - this_cpu_write(old_rsp, next->usersp); this_cpu_write(current_task, next_p); /* @@ -413,9 +406,11 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p) task_thread_info(prev_p)->saved_preempt_count = this_cpu_read(__preempt_count); this_cpu_write(__preempt_count, task_thread_info(next_p)->saved_preempt_count); + /* Reload esp0 and ss1. This changes current_thread_info(). */ + load_sp0(tss, next); + this_cpu_write(kernel_stack, - (unsigned long)task_stack_page(next_p) + - THREAD_SIZE - KERNEL_STACK_OFFSET); + (unsigned long)task_stack_page(next_p) + THREAD_SIZE); /* * Now maybe reload the debug registers and handle I/O bitmaps @@ -602,6 +597,5 @@ long sys_arch_prctl(int code, unsigned long addr) unsigned long KSTK_ESP(struct task_struct *task) { - return (test_tsk_thread_flag(task, TIF_IA32)) ? - (task_pt_regs(task)->sp) : ((task)->thread.usersp); + return task_pt_regs(task)->sp; } diff --git a/arch/x86/kernel/ptrace.c b/arch/x86/kernel/ptrace.c index e510618b2e91..a7bc79480719 100644 --- a/arch/x86/kernel/ptrace.c +++ b/arch/x86/kernel/ptrace.c @@ -364,18 +364,12 @@ static int set_segment_reg(struct task_struct *task, case offsetof(struct user_regs_struct,cs): if (unlikely(value == 0)) return -EIO; -#ifdef CONFIG_IA32_EMULATION - if (test_tsk_thread_flag(task, TIF_IA32)) - task_pt_regs(task)->cs = value; -#endif + task_pt_regs(task)->cs = value; break; case offsetof(struct user_regs_struct,ss): if (unlikely(value == 0)) return -EIO; -#ifdef CONFIG_IA32_EMULATION - if (test_tsk_thread_flag(task, TIF_IA32)) - task_pt_regs(task)->ss = value; -#endif + task_pt_regs(task)->ss = value; break; } @@ -1421,7 +1415,7 @@ static void fill_sigtrap_info(struct task_struct *tsk, memset(info, 0, sizeof(*info)); info->si_signo = SIGTRAP; info->si_code = si_code; - info->si_addr = user_mode_vm(regs) ? (void __user *)regs->ip : NULL; + info->si_addr = user_mode(regs) ? (void __user *)regs->ip : NULL; } void user_single_step_siginfo(struct task_struct *tsk, diff --git a/arch/x86/kernel/pvclock.c b/arch/x86/kernel/pvclock.c index 2f355d229a58..e5ecd20e72dd 100644 --- a/arch/x86/kernel/pvclock.c +++ b/arch/x86/kernel/pvclock.c @@ -141,7 +141,46 @@ void pvclock_read_wallclock(struct pvclock_wall_clock *wall_clock, set_normalized_timespec(ts, now.tv_sec, now.tv_nsec); } +static struct pvclock_vsyscall_time_info *pvclock_vdso_info; + +static struct pvclock_vsyscall_time_info * +pvclock_get_vsyscall_user_time_info(int cpu) +{ + if (!pvclock_vdso_info) { + BUG(); + return NULL; + } + + return &pvclock_vdso_info[cpu]; +} + +struct pvclock_vcpu_time_info *pvclock_get_vsyscall_time_info(int cpu) +{ + return &pvclock_get_vsyscall_user_time_info(cpu)->pvti; +} + #ifdef CONFIG_X86_64 +static int pvclock_task_migrate(struct notifier_block *nb, unsigned long l, + void *v) +{ + struct task_migration_notifier *mn = v; + struct pvclock_vsyscall_time_info *pvti; + + pvti = pvclock_get_vsyscall_user_time_info(mn->from_cpu); + + /* this is NULL when pvclock vsyscall is not initialized */ + if (unlikely(pvti == NULL)) + return NOTIFY_DONE; + + pvti->migrate_count++; + + return NOTIFY_DONE; +} + +static struct notifier_block pvclock_migrate = { + .notifier_call = pvclock_task_migrate, +}; + /* * Initialize the generic pvclock vsyscall state. This will allocate * a/some page(s) for the per-vcpu pvclock information, set up a @@ -155,12 +194,17 @@ int __init pvclock_init_vsyscall(struct pvclock_vsyscall_time_info *i, WARN_ON (size != PVCLOCK_VSYSCALL_NR_PAGES*PAGE_SIZE); + pvclock_vdso_info = i; + for (idx = 0; idx <= (PVCLOCK_FIXMAP_END-PVCLOCK_FIXMAP_BEGIN); idx++) { __set_fixmap(PVCLOCK_FIXMAP_BEGIN + idx, __pa(i) + (idx*PAGE_SIZE), PAGE_KERNEL_VVAR); } + + register_task_migration_notifier(&pvclock_migrate); + return 0; } #endif diff --git a/arch/x86/kernel/reboot.c b/arch/x86/kernel/reboot.c index bae6c609888e..86db4bcd7ce5 100644 --- a/arch/x86/kernel/reboot.c +++ b/arch/x86/kernel/reboot.c @@ -183,6 +183,16 @@ static struct dmi_system_id __initdata reboot_dmi_table[] = { }, }, + /* ASRock */ + { /* Handle problems with rebooting on ASRock Q1900DC-ITX */ + .callback = set_pci_reboot, + .ident = "ASRock Q1900DC-ITX", + .matches = { + DMI_MATCH(DMI_BOARD_VENDOR, "ASRock"), + DMI_MATCH(DMI_BOARD_NAME, "Q1900DC-ITX"), + }, + }, + /* ASUS */ { /* Handle problems with rebooting on ASUS P4S800 */ .callback = set_bios_reboot, diff --git a/arch/x86/kernel/relocate_kernel_32.S b/arch/x86/kernel/relocate_kernel_32.S index e13f8e7c22a6..77630d57e7bf 100644 --- a/arch/x86/kernel/relocate_kernel_32.S +++ b/arch/x86/kernel/relocate_kernel_32.S @@ -226,23 +226,23 @@ swap_pages: movl (%ebx), %ecx addl $4, %ebx 1: - testl $0x1, %ecx /* is it a destination page */ + testb $0x1, %cl /* is it a destination page */ jz 2f movl %ecx, %edi andl $0xfffff000, %edi jmp 0b 2: - testl $0x2, %ecx /* is it an indirection page */ + testb $0x2, %cl /* is it an indirection page */ jz 2f movl %ecx, %ebx andl $0xfffff000, %ebx jmp 0b 2: - testl $0x4, %ecx /* is it the done indicator */ + testb $0x4, %cl /* is it the done indicator */ jz 2f jmp 3f 2: - testl $0x8, %ecx /* is it the source indicator */ + testb $0x8, %cl /* is it the source indicator */ jz 0b /* Ignore it otherwise */ movl %ecx, %esi /* For every source page do a copy */ andl $0xfffff000, %esi diff --git a/arch/x86/kernel/relocate_kernel_64.S b/arch/x86/kernel/relocate_kernel_64.S index 3fd2c693e475..98111b38ebfd 100644 --- a/arch/x86/kernel/relocate_kernel_64.S +++ b/arch/x86/kernel/relocate_kernel_64.S @@ -123,7 +123,7 @@ identity_mapped: * Set cr4 to a known state: * - physical address extension enabled */ - movq $X86_CR4_PAE, %rax + movl $X86_CR4_PAE, %eax movq %rax, %cr4 jmp 1f @@ -221,23 +221,23 @@ swap_pages: movq (%rbx), %rcx addq $8, %rbx 1: - testq $0x1, %rcx /* is it a destination page? */ + testb $0x1, %cl /* is it a destination page? */ jz 2f movq %rcx, %rdi andq $0xfffffffffffff000, %rdi jmp 0b 2: - testq $0x2, %rcx /* is it an indirection page? */ + testb $0x2, %cl /* is it an indirection page? */ jz 2f movq %rcx, %rbx andq $0xfffffffffffff000, %rbx jmp 0b 2: - testq $0x4, %rcx /* is it the done indicator? */ + testb $0x4, %cl /* is it the done indicator? */ jz 2f jmp 3f 2: - testq $0x8, %rcx /* is it the source indicator? */ + testb $0x8, %cl /* is it the source indicator? */ jz 0b /* Ignore it otherwise */ movq %rcx, %rsi /* For ever source page do a copy */ andq $0xfffffffffffff000, %rsi @@ -246,17 +246,17 @@ swap_pages: movq %rsi, %rax movq %r10, %rdi - movq $512, %rcx + movl $512, %ecx rep ; movsq movq %rax, %rdi movq %rdx, %rsi - movq $512, %rcx + movl $512, %ecx rep ; movsq movq %rdx, %rdi movq %r10, %rsi - movq $512, %rcx + movl $512, %ecx rep ; movsq lea PAGE_SIZE(%rax), %rsi diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c index 98dc9317286e..d74ac33290ae 100644 --- a/arch/x86/kernel/setup.c +++ b/arch/x86/kernel/setup.c @@ -122,8 +122,6 @@ unsigned long max_low_pfn_mapped; unsigned long max_pfn_mapped; -bool __read_mostly kaslr_enabled = false; - #ifdef CONFIG_DMI RESERVE_BRK(dmi_alloc, 65536); #endif @@ -356,7 +354,7 @@ static void __init relocate_initrd(void) mapaddr = ramdisk_image & PAGE_MASK; p = early_memremap(mapaddr, clen+slop); memcpy(q, p+slop, clen); - early_iounmap(p, clen+slop); + early_memunmap(p, clen+slop); q += clen; ramdisk_image += clen; ramdisk_size -= clen; @@ -427,11 +425,6 @@ static void __init reserve_initrd(void) } #endif /* CONFIG_BLK_DEV_INITRD */ -static void __init parse_kaslr_setup(u64 pa_data, u32 data_len) -{ - kaslr_enabled = (bool)(pa_data + sizeof(struct setup_data)); -} - static void __init parse_setup_data(void) { struct setup_data *data; @@ -445,7 +438,7 @@ static void __init parse_setup_data(void) data_len = data->len + sizeof(struct setup_data); data_type = data->type; pa_next = data->next; - early_iounmap(data, sizeof(*data)); + early_memunmap(data, sizeof(*data)); switch (data_type) { case SETUP_E820_EXT: @@ -457,9 +450,6 @@ static void __init parse_setup_data(void) case SETUP_EFI: parse_efi_setup(pa_data, data_len); break; - case SETUP_KASLR: - parse_kaslr_setup(pa_data, data_len); - break; default: break; } @@ -480,7 +470,7 @@ static void __init e820_reserve_setup_data(void) E820_RAM, E820_RESERVED_KERN); found = 1; pa_data = data->next; - early_iounmap(data, sizeof(*data)); + early_memunmap(data, sizeof(*data)); } if (!found) return; @@ -501,7 +491,7 @@ static void __init memblock_x86_reserve_range_setup_data(void) data = early_memremap(pa_data, sizeof(*data)); memblock_reserve(pa_data, sizeof(*data) + data->len); pa_data = data->next; - early_iounmap(data, sizeof(*data)); + early_memunmap(data, sizeof(*data)); } } @@ -842,14 +832,15 @@ static void __init trim_low_memory_range(void) static int dump_kernel_offset(struct notifier_block *self, unsigned long v, void *p) { - if (kaslr_enabled) + if (kaslr_enabled()) { pr_emerg("Kernel Offset: 0x%lx from 0x%lx (relocation range: 0x%lx-0x%lx)\n", (unsigned long)&_text - __START_KERNEL, __START_KERNEL, __START_KERNEL_map, MODULES_VADDR-1); - else + } else { pr_emerg("Kernel Offset: disabled\n"); + } return 0; } diff --git a/arch/x86/kernel/signal.c b/arch/x86/kernel/signal.c index e5042463c1bc..3e581865c8e2 100644 --- a/arch/x86/kernel/signal.c +++ b/arch/x86/kernel/signal.c @@ -61,8 +61,7 @@ regs->seg = GET_SEG(seg) | 3; \ } while (0) -int restore_sigcontext(struct pt_regs *regs, struct sigcontext __user *sc, - unsigned long *pax) +int restore_sigcontext(struct pt_regs *regs, struct sigcontext __user *sc) { void __user *buf; unsigned int tmpflags; @@ -81,7 +80,7 @@ int restore_sigcontext(struct pt_regs *regs, struct sigcontext __user *sc, #endif /* CONFIG_X86_32 */ COPY(di); COPY(si); COPY(bp); COPY(sp); COPY(bx); - COPY(dx); COPY(cx); COPY(ip); + COPY(dx); COPY(cx); COPY(ip); COPY(ax); #ifdef CONFIG_X86_64 COPY(r8); @@ -94,27 +93,20 @@ int restore_sigcontext(struct pt_regs *regs, struct sigcontext __user *sc, COPY(r15); #endif /* CONFIG_X86_64 */ -#ifdef CONFIG_X86_32 COPY_SEG_CPL3(cs); COPY_SEG_CPL3(ss); -#else /* !CONFIG_X86_32 */ - /* Kernel saves and restores only the CS segment register on signals, - * which is the bare minimum needed to allow mixed 32/64-bit code. - * App's signal handler can save/restore other segments if needed. */ - COPY_SEG_CPL3(cs); -#endif /* CONFIG_X86_32 */ get_user_ex(tmpflags, &sc->flags); regs->flags = (regs->flags & ~FIX_EFLAGS) | (tmpflags & FIX_EFLAGS); regs->orig_ax = -1; /* disable syscall checks */ get_user_ex(buf, &sc->fpstate); - - get_user_ex(*pax, &sc->ax); } get_user_catch(err); err |= restore_xstate_sig(buf, config_enabled(CONFIG_X86_32)); + force_iret(); + return err; } @@ -162,8 +154,9 @@ int setup_sigcontext(struct sigcontext __user *sc, void __user *fpstate, #else /* !CONFIG_X86_32 */ put_user_ex(regs->flags, &sc->flags); put_user_ex(regs->cs, &sc->cs); - put_user_ex(0, &sc->gs); - put_user_ex(0, &sc->fs); + put_user_ex(0, &sc->__pad2); + put_user_ex(0, &sc->__pad1); + put_user_ex(regs->ss, &sc->ss); #endif /* CONFIG_X86_32 */ put_user_ex(fpstate, &sc->fpstate); @@ -457,9 +450,19 @@ static int __setup_rt_frame(int sig, struct ksignal *ksig, regs->sp = (unsigned long)frame; - /* Set up the CS register to run signal handlers in 64-bit mode, - even if the handler happens to be interrupting 32-bit code. */ + /* + * Set up the CS and SS registers to run signal handlers in + * 64-bit mode, even if the handler happens to be interrupting + * 32-bit or 16-bit code. + * + * SS is subtle. In 64-bit mode, we don't need any particular + * SS descriptor, but we do need SS to be valid. It's possible + * that the old SS is entirely bogus -- this can happen if the + * signal we're trying to deliver is #GP or #SS caused by a bad + * SS value. + */ regs->cs = __USER_CS; + regs->ss = __USER_DS; return 0; } @@ -539,7 +542,6 @@ asmlinkage unsigned long sys_sigreturn(void) { struct pt_regs *regs = current_pt_regs(); struct sigframe __user *frame; - unsigned long ax; sigset_t set; frame = (struct sigframe __user *)(regs->sp - 8); @@ -553,9 +555,9 @@ asmlinkage unsigned long sys_sigreturn(void) set_current_blocked(&set); - if (restore_sigcontext(regs, &frame->sc, &ax)) + if (restore_sigcontext(regs, &frame->sc)) goto badframe; - return ax; + return regs->ax; badframe: signal_fault(regs, frame, "sigreturn"); @@ -568,7 +570,6 @@ asmlinkage long sys_rt_sigreturn(void) { struct pt_regs *regs = current_pt_regs(); struct rt_sigframe __user *frame; - unsigned long ax; sigset_t set; frame = (struct rt_sigframe __user *)(regs->sp - sizeof(long)); @@ -579,13 +580,13 @@ asmlinkage long sys_rt_sigreturn(void) set_current_blocked(&set); - if (restore_sigcontext(regs, &frame->uc.uc_mcontext, &ax)) + if (restore_sigcontext(regs, &frame->uc.uc_mcontext)) goto badframe; if (restore_altstack(&frame->uc.uc_stack)) goto badframe; - return ax; + return regs->ax; badframe: signal_fault(regs, frame, "rt_sigreturn"); @@ -679,7 +680,7 @@ handle_signal(struct ksignal *ksig, struct pt_regs *regs) * Ensure the signal handler starts with the new fpu state. */ if (used_math()) - drop_init_fpu(current); + fpu_reset_state(current); } signal_setup_done(failed, ksig, test_thread_flag(TIF_SINGLESTEP)); } @@ -780,7 +781,6 @@ asmlinkage long sys32_x32_rt_sigreturn(void) struct pt_regs *regs = current_pt_regs(); struct rt_sigframe_x32 __user *frame; sigset_t set; - unsigned long ax; frame = (struct rt_sigframe_x32 __user *)(regs->sp - 8); @@ -791,13 +791,13 @@ asmlinkage long sys32_x32_rt_sigreturn(void) set_current_blocked(&set); - if (restore_sigcontext(regs, &frame->uc.uc_mcontext, &ax)) + if (restore_sigcontext(regs, &frame->uc.uc_mcontext)) goto badframe; if (compat_restore_altstack(&frame->uc.uc_stack)) goto badframe; - return ax; + return regs->ax; badframe: signal_fault(regs, frame, "x32 rt_sigreturn"); diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c index febc6aabc72e..7035f6b21c3f 100644 --- a/arch/x86/kernel/smpboot.c +++ b/arch/x86/kernel/smpboot.c @@ -779,6 +779,26 @@ out: return boot_error; } +void common_cpu_up(unsigned int cpu, struct task_struct *idle) +{ + /* Just in case we booted with a single CPU. */ + alternatives_enable_smp(); + + per_cpu(current_task, cpu) = idle; + +#ifdef CONFIG_X86_32 + /* Stack for startup_32 can be just as for start_secondary onwards */ + irq_ctx_init(cpu); + per_cpu(cpu_current_top_of_stack, cpu) = + (unsigned long)task_stack_page(idle) + THREAD_SIZE; +#else + clear_tsk_thread_flag(idle, TIF_FORK); + initial_gs = per_cpu_offset(cpu); +#endif + per_cpu(kernel_stack, cpu) = + (unsigned long)task_stack_page(idle) + THREAD_SIZE; +} + /* * NOTE - on most systems this is a PHYSICAL apic ID, but on multiquad * (ie clustered apic addressing mode), this is a LOGICAL apic ID. @@ -796,23 +816,9 @@ static int do_boot_cpu(int apicid, int cpu, struct task_struct *idle) int cpu0_nmi_registered = 0; unsigned long timeout; - /* Just in case we booted with a single CPU. */ - alternatives_enable_smp(); - idle->thread.sp = (unsigned long) (((struct pt_regs *) (THREAD_SIZE + task_stack_page(idle))) - 1); - per_cpu(current_task, cpu) = idle; -#ifdef CONFIG_X86_32 - /* Stack for startup_32 can be just as for start_secondary onwards */ - irq_ctx_init(cpu); -#else - clear_tsk_thread_flag(idle, TIF_FORK); - initial_gs = per_cpu_offset(cpu); -#endif - per_cpu(kernel_stack, cpu) = - (unsigned long)task_stack_page(idle) - - KERNEL_STACK_OFFSET + THREAD_SIZE; early_gdt_descr.address = (unsigned long)get_cpu_gdt_table(cpu); initial_code = (unsigned long)start_secondary; stack_start = idle->thread.sp; @@ -953,6 +959,8 @@ int native_cpu_up(unsigned int cpu, struct task_struct *tidle) /* the FPU context is blank, nobody can own it */ __cpu_disable_lazy_restore(cpu); + common_cpu_up(cpu, tidle); + err = do_boot_cpu(apicid, cpu, tidle); if (err) { pr_err("do_boot_cpu failed(%d) to wakeup CPU#%u\n", err, cpu); @@ -1086,8 +1094,6 @@ static int __init smp_sanity_check(unsigned max_cpus) return SMP_NO_APIC; } - verify_local_APIC(); - /* * If SMP should be disabled, then really disable it! */ diff --git a/arch/x86/kernel/sys_x86_64.c b/arch/x86/kernel/sys_x86_64.c index 30277e27431a..10e0272d789a 100644 --- a/arch/x86/kernel/sys_x86_64.c +++ b/arch/x86/kernel/sys_x86_64.c @@ -34,10 +34,26 @@ static unsigned long get_align_mask(void) return va_align.mask; } +/* + * To avoid aliasing in the I$ on AMD F15h, the bits defined by the + * va_align.bits, [12:upper_bit), are set to a random value instead of + * zeroing them. This random value is computed once per boot. This form + * of ASLR is known as "per-boot ASLR". + * + * To achieve this, the random value is added to the info.align_offset + * value before calling vm_unmapped_area() or ORed directly to the + * address. + */ +static unsigned long get_align_bits(void) +{ + return va_align.bits & get_align_mask(); +} + unsigned long align_vdso_addr(unsigned long addr) { unsigned long align_mask = get_align_mask(); - return (addr + align_mask) & ~align_mask; + addr = (addr + align_mask) & ~align_mask; + return addr | get_align_bits(); } static int __init control_va_addr_alignment(char *str) @@ -135,8 +151,12 @@ arch_get_unmapped_area(struct file *filp, unsigned long addr, info.length = len; info.low_limit = begin; info.high_limit = end; - info.align_mask = filp ? get_align_mask() : 0; + info.align_mask = 0; info.align_offset = pgoff << PAGE_SHIFT; + if (filp) { + info.align_mask = get_align_mask(); + info.align_offset += get_align_bits(); + } return vm_unmapped_area(&info); } @@ -174,8 +194,12 @@ arch_get_unmapped_area_topdown(struct file *filp, const unsigned long addr0, info.length = len; info.low_limit = PAGE_SIZE; info.high_limit = mm->mmap_base; - info.align_mask = filp ? get_align_mask() : 0; + info.align_mask = 0; info.align_offset = pgoff << PAGE_SHIFT; + if (filp) { + info.align_mask = get_align_mask(); + info.align_offset += get_align_bits(); + } addr = vm_unmapped_area(&info); if (!(addr & ~PAGE_MASK)) return addr; diff --git a/arch/x86/kernel/syscall_32.c b/arch/x86/kernel/syscall_32.c index e9bcd57d8a9e..3777189c4a19 100644 --- a/arch/x86/kernel/syscall_32.c +++ b/arch/x86/kernel/syscall_32.c @@ -5,21 +5,29 @@ #include <linux/cache.h> #include <asm/asm-offsets.h> -#define __SYSCALL_I386(nr, sym, compat) extern asmlinkage void sym(void) ; +#ifdef CONFIG_IA32_EMULATION +#define SYM(sym, compat) compat +#else +#define SYM(sym, compat) sym +#define ia32_sys_call_table sys_call_table +#define __NR_ia32_syscall_max __NR_syscall_max +#endif + +#define __SYSCALL_I386(nr, sym, compat) extern asmlinkage void SYM(sym, compat)(void) ; #include <asm/syscalls_32.h> #undef __SYSCALL_I386 -#define __SYSCALL_I386(nr, sym, compat) [nr] = sym, +#define __SYSCALL_I386(nr, sym, compat) [nr] = SYM(sym, compat), typedef asmlinkage void (*sys_call_ptr_t)(void); extern asmlinkage void sys_ni_syscall(void); -__visible const sys_call_ptr_t sys_call_table[__NR_syscall_max+1] = { +__visible const sys_call_ptr_t ia32_sys_call_table[__NR_ia32_syscall_max+1] = { /* * Smells like a compiler bug -- it doesn't work * when the & below is removed. */ - [0 ... __NR_syscall_max] = &sys_ni_syscall, + [0 ... __NR_ia32_syscall_max] = &sys_ni_syscall, #include <asm/syscalls_32.h> }; diff --git a/arch/x86/kernel/time.c b/arch/x86/kernel/time.c index 25adc0e16eaa..d39c09119db6 100644 --- a/arch/x86/kernel/time.c +++ b/arch/x86/kernel/time.c @@ -30,7 +30,7 @@ unsigned long profile_pc(struct pt_regs *regs) { unsigned long pc = instruction_pointer(regs); - if (!user_mode_vm(regs) && in_lock_functions(pc)) { + if (!user_mode(regs) && in_lock_functions(pc)) { #ifdef CONFIG_FRAME_POINTER return *(unsigned long *)(regs->bp + sizeof(long)); #else diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c index 9d2073e2ecc9..f4fa991406cd 100644 --- a/arch/x86/kernel/traps.c +++ b/arch/x86/kernel/traps.c @@ -112,7 +112,7 @@ enum ctx_state ist_enter(struct pt_regs *regs) { enum ctx_state prev_state; - if (user_mode_vm(regs)) { + if (user_mode(regs)) { /* Other than that, we're just an exception. */ prev_state = exception_enter(); } else { @@ -146,7 +146,7 @@ void ist_exit(struct pt_regs *regs, enum ctx_state prev_state) /* Must be before exception_exit. */ preempt_count_sub(HARDIRQ_OFFSET); - if (user_mode_vm(regs)) + if (user_mode(regs)) return exception_exit(prev_state); else rcu_nmi_exit(); @@ -158,7 +158,7 @@ void ist_exit(struct pt_regs *regs, enum ctx_state prev_state) * * IST exception handlers normally cannot schedule. As a special * exception, if the exception interrupted userspace code (i.e. - * user_mode_vm(regs) would return true) and the exception was not + * user_mode(regs) would return true) and the exception was not * a double fault, it can be safe to schedule. ist_begin_non_atomic() * begins a non-atomic section within an ist_enter()/ist_exit() region. * Callers are responsible for enabling interrupts themselves inside @@ -167,15 +167,15 @@ void ist_exit(struct pt_regs *regs, enum ctx_state prev_state) */ void ist_begin_non_atomic(struct pt_regs *regs) { - BUG_ON(!user_mode_vm(regs)); + BUG_ON(!user_mode(regs)); /* * Sanity check: we need to be on the normal thread stack. This * will catch asm bugs and any attempt to use ist_preempt_enable * from double_fault. */ - BUG_ON(((current_stack_pointer() ^ this_cpu_read_stable(kernel_stack)) - & ~(THREAD_SIZE - 1)) != 0); + BUG_ON((unsigned long)(current_top_of_stack() - + current_stack_pointer()) >= THREAD_SIZE); preempt_count_sub(HARDIRQ_OFFSET); } @@ -194,8 +194,7 @@ static nokprobe_inline int do_trap_no_signal(struct task_struct *tsk, int trapnr, char *str, struct pt_regs *regs, long error_code) { -#ifdef CONFIG_X86_32 - if (regs->flags & X86_VM_MASK) { + if (v8086_mode(regs)) { /* * Traps 0, 1, 3, 4, and 5 should be forwarded to vm86. * On nmi (interrupt 2), do_trap should not be called. @@ -207,7 +206,7 @@ do_trap_no_signal(struct task_struct *tsk, int trapnr, char *str, } return -1; } -#endif + if (!user_mode(regs)) { if (!fixup_exception(regs)) { tsk->thread.error_code = error_code; @@ -462,13 +461,11 @@ do_general_protection(struct pt_regs *regs, long error_code) prev_state = exception_enter(); conditional_sti(regs); -#ifdef CONFIG_X86_32 - if (regs->flags & X86_VM_MASK) { + if (v8086_mode(regs)) { local_irq_enable(); handle_vm86_fault((struct kernel_vm86_regs *) regs, error_code); goto exit; } -#endif tsk = current; if (!user_mode(regs)) { @@ -587,7 +584,7 @@ struct bad_iret_stack *fixup_bad_iret(struct bad_iret_stack *s) /* Copy the remainder of the stack from the current stack. */ memmove(new_stack, s, offsetof(struct bad_iret_stack, regs.ip)); - BUG_ON(!user_mode_vm(&new_stack->regs)); + BUG_ON(!user_mode(&new_stack->regs)); return new_stack; } NOKPROBE_SYMBOL(fixup_bad_iret); @@ -673,7 +670,7 @@ dotraplinkage void do_debug(struct pt_regs *regs, long error_code) /* It's safe to allow irq's after DR6 has been saved */ preempt_conditional_sti(regs); - if (regs->flags & X86_VM_MASK) { + if (v8086_mode(regs)) { handle_vm86_trap((struct kernel_vm86_regs *) regs, error_code, X86_TRAP_DB); preempt_conditional_cli(regs); @@ -721,7 +718,7 @@ static void math_error(struct pt_regs *regs, int error_code, int trapnr) return; conditional_sti(regs); - if (!user_mode_vm(regs)) + if (!user_mode(regs)) { if (!fixup_exception(regs)) { task->thread.error_code = error_code; @@ -734,7 +731,7 @@ static void math_error(struct pt_regs *regs, int error_code, int trapnr) /* * Save the info for the exception handler and clear the error. */ - save_init_fpu(task); + unlazy_fpu(task); task->thread.trap_nr = trapnr; task->thread.error_code = error_code; info.si_signo = SIGFPE; @@ -863,7 +860,7 @@ void math_state_restore(void) kernel_fpu_disable(); __thread_fpu_begin(tsk); if (unlikely(restore_fpu_checking(tsk))) { - drop_init_fpu(tsk); + fpu_reset_state(tsk); force_sig_info(SIGSEGV, SEND_SIG_PRIV, tsk); } else { tsk->thread.fpu_counter++; @@ -925,9 +922,21 @@ dotraplinkage void do_iret_error(struct pt_regs *regs, long error_code) /* Set of traps needed for early debugging. */ void __init early_trap_init(void) { - set_intr_gate_ist(X86_TRAP_DB, &debug, DEBUG_STACK); + /* + * Don't use IST to set DEBUG_STACK as it doesn't work until TSS + * is ready in cpu_init() <-- trap_init(). Before trap_init(), + * CPU runs at ring 0 so it is impossible to hit an invalid + * stack. Using the original stack works well enough at this + * early stage. DEBUG_STACK will be equipped after cpu_init() in + * trap_init(). + * + * We don't need to set trace_idt_table like set_intr_gate(), + * since we don't have trace_debug and it will be reset to + * 'debug' in trap_init() by set_intr_gate_ist(). + */ + set_intr_gate_notrace(X86_TRAP_DB, debug); /* int3 can be called from all */ - set_system_intr_gate_ist(X86_TRAP_BP, &int3, DEBUG_STACK); + set_system_intr_gate(X86_TRAP_BP, &int3); #ifdef CONFIG_X86_32 set_intr_gate(X86_TRAP_PF, page_fault); #endif @@ -1005,6 +1014,15 @@ void __init trap_init(void) */ cpu_init(); + /* + * X86_TRAP_DB and X86_TRAP_BP have been set + * in early_trap_init(). However, ITS works only after + * cpu_init() loads TSS. See comments in early_trap_init(). + */ + set_intr_gate_ist(X86_TRAP_DB, &debug, DEBUG_STACK); + /* int3 can be called from all */ + set_system_intr_gate_ist(X86_TRAP_BP, &int3, DEBUG_STACK); + x86_init.irqs.trap_init(); #ifdef CONFIG_X86_64 diff --git a/arch/x86/kernel/uprobes.c b/arch/x86/kernel/uprobes.c index 81f8adb0679e..0b81ad67da07 100644 --- a/arch/x86/kernel/uprobes.c +++ b/arch/x86/kernel/uprobes.c @@ -912,7 +912,7 @@ int arch_uprobe_exception_notify(struct notifier_block *self, unsigned long val, int ret = NOTIFY_DONE; /* We are only interested in userspace traps */ - if (regs && !user_mode_vm(regs)) + if (regs && !user_mode(regs)) return NOTIFY_DONE; switch (val) { diff --git a/arch/x86/kernel/vm86_32.c b/arch/x86/kernel/vm86_32.c index e8edcf52e069..fc9db6ef2a95 100644 --- a/arch/x86/kernel/vm86_32.c +++ b/arch/x86/kernel/vm86_32.c @@ -150,7 +150,7 @@ struct pt_regs *save_v86_state(struct kernel_vm86_regs *regs) do_exit(SIGSEGV); } - tss = &per_cpu(init_tss, get_cpu()); + tss = &per_cpu(cpu_tss, get_cpu()); current->thread.sp0 = current->thread.saved_sp0; current->thread.sysenter_cs = __KERNEL_CS; load_sp0(tss, ¤t->thread); @@ -318,7 +318,7 @@ static void do_sys_vm86(struct kernel_vm86_struct *info, struct task_struct *tsk tsk->thread.saved_fs = info->regs32->fs; tsk->thread.saved_gs = get_user_gs(info->regs32); - tss = &per_cpu(init_tss, get_cpu()); + tss = &per_cpu(cpu_tss, get_cpu()); tsk->thread.sp0 = (unsigned long) &info->VM86_TSS_ESP0; if (cpu_has_sep) tsk->thread.sysenter_cs = 0; diff --git a/arch/x86/kernel/vsyscall_gtod.c b/arch/x86/kernel/vsyscall_gtod.c index c7d791f32b98..51e330416995 100644 --- a/arch/x86/kernel/vsyscall_gtod.c +++ b/arch/x86/kernel/vsyscall_gtod.c @@ -31,30 +31,30 @@ void update_vsyscall(struct timekeeper *tk) gtod_write_begin(vdata); /* copy vsyscall data */ - vdata->vclock_mode = tk->tkr.clock->archdata.vclock_mode; - vdata->cycle_last = tk->tkr.cycle_last; - vdata->mask = tk->tkr.mask; - vdata->mult = tk->tkr.mult; - vdata->shift = tk->tkr.shift; + vdata->vclock_mode = tk->tkr_mono.clock->archdata.vclock_mode; + vdata->cycle_last = tk->tkr_mono.cycle_last; + vdata->mask = tk->tkr_mono.mask; + vdata->mult = tk->tkr_mono.mult; + vdata->shift = tk->tkr_mono.shift; vdata->wall_time_sec = tk->xtime_sec; - vdata->wall_time_snsec = tk->tkr.xtime_nsec; + vdata->wall_time_snsec = tk->tkr_mono.xtime_nsec; vdata->monotonic_time_sec = tk->xtime_sec + tk->wall_to_monotonic.tv_sec; - vdata->monotonic_time_snsec = tk->tkr.xtime_nsec + vdata->monotonic_time_snsec = tk->tkr_mono.xtime_nsec + ((u64)tk->wall_to_monotonic.tv_nsec - << tk->tkr.shift); + << tk->tkr_mono.shift); while (vdata->monotonic_time_snsec >= - (((u64)NSEC_PER_SEC) << tk->tkr.shift)) { + (((u64)NSEC_PER_SEC) << tk->tkr_mono.shift)) { vdata->monotonic_time_snsec -= - ((u64)NSEC_PER_SEC) << tk->tkr.shift; + ((u64)NSEC_PER_SEC) << tk->tkr_mono.shift; vdata->monotonic_time_sec++; } vdata->wall_time_coarse_sec = tk->xtime_sec; - vdata->wall_time_coarse_nsec = (long)(tk->tkr.xtime_nsec >> - tk->tkr.shift); + vdata->wall_time_coarse_nsec = (long)(tk->tkr_mono.xtime_nsec >> + tk->tkr_mono.shift); vdata->monotonic_time_coarse_sec = vdata->wall_time_coarse_sec + tk->wall_to_monotonic.tv_sec; diff --git a/arch/x86/kernel/xsave.c b/arch/x86/kernel/xsave.c index 34f66e58a896..87a815b85f3e 100644 --- a/arch/x86/kernel/xsave.c +++ b/arch/x86/kernel/xsave.c @@ -342,7 +342,7 @@ int __restore_xstate_sig(void __user *buf, void __user *buf_fx, int size) config_enabled(CONFIG_IA32_EMULATION)); if (!buf) { - drop_init_fpu(tsk); + fpu_reset_state(tsk); return 0; } @@ -379,7 +379,7 @@ int __restore_xstate_sig(void __user *buf, void __user *buf_fx, int size) * thread's fpu state, reconstruct fxstate from the fsave * header. Sanitize the copied state etc. */ - struct xsave_struct *xsave = &tsk->thread.fpu.state->xsave; + struct fpu *fpu = &tsk->thread.fpu; struct user_i387_ia32_struct env; int err = 0; @@ -393,14 +393,15 @@ int __restore_xstate_sig(void __user *buf, void __user *buf_fx, int size) */ drop_fpu(tsk); - if (__copy_from_user(xsave, buf_fx, state_size) || + if (__copy_from_user(&fpu->state->xsave, buf_fx, state_size) || __copy_from_user(&env, buf, sizeof(env))) { + fpu_finit(fpu); err = -1; } else { sanitize_restored_xstate(tsk, &env, xstate_bv, fx_only); - set_used_math(); } + set_used_math(); if (use_eager_fpu()) { preempt_disable(); math_state_restore(); @@ -415,7 +416,7 @@ int __restore_xstate_sig(void __user *buf, void __user *buf_fx, int size) */ user_fpu_begin(); if (restore_user_xstate(buf_fx, xstate_bv, fx_only)) { - drop_init_fpu(tsk); + fpu_reset_state(tsk); return -1; } } @@ -677,19 +678,13 @@ void xsave_init(void) this_func(); } -static inline void __init eager_fpu_init_bp(void) -{ - current->thread.fpu.state = - alloc_bootmem_align(xstate_size, __alignof__(struct xsave_struct)); - if (!init_xstate_buf) - setup_init_fpu_buf(); -} - -void eager_fpu_init(void) +/* + * setup_init_fpu_buf() is __init and it is OK to call it here because + * init_xstate_buf will be unset only once during boot. + */ +void __init_refok eager_fpu_init(void) { - static __refdata void (*boot_func)(void) = eager_fpu_init_bp; - - clear_used_math(); + WARN_ON(used_math()); current_thread_info()->status = 0; if (eagerfpu == ENABLE) @@ -700,21 +695,8 @@ void eager_fpu_init(void) return; } - if (boot_func) { - boot_func(); - boot_func = NULL; - } - - /* - * This is same as math_state_restore(). But use_xsave() is - * not yet patched to use math_state_restore(). - */ - init_fpu(current); - __thread_fpu_begin(current); - if (cpu_has_xsave) - xrstor_state(init_xstate_buf, -1); - else - fxrstor_checking(&init_xstate_buf->i387); + if (!init_xstate_buf) + setup_init_fpu_buf(); } /* diff --git a/arch/x86/kvm/Makefile b/arch/x86/kvm/Makefile index 08f790dfadc9..16e8f962eaad 100644 --- a/arch/x86/kvm/Makefile +++ b/arch/x86/kvm/Makefile @@ -1,5 +1,5 @@ -ccflags-y += -Ivirt/kvm -Iarch/x86/kvm +ccflags-y += -Iarch/x86/kvm CFLAGS_x86.o := -I. CFLAGS_svm.o := -I. diff --git a/arch/x86/kvm/cpuid.c b/arch/x86/kvm/cpuid.c index 8a80737ee6e6..59b69f6a2844 100644 --- a/arch/x86/kvm/cpuid.c +++ b/arch/x86/kvm/cpuid.c @@ -104,6 +104,9 @@ int kvm_update_cpuid(struct kvm_vcpu *vcpu) ((best->eax & 0xff00) >> 8) != 0) return -EINVAL; + /* Update physical-address width */ + vcpu->arch.maxphyaddr = cpuid_query_maxphyaddr(vcpu); + kvm_pmu_cpuid_update(vcpu); return 0; } @@ -135,6 +138,21 @@ static void cpuid_fix_nx_cap(struct kvm_vcpu *vcpu) } } +int cpuid_query_maxphyaddr(struct kvm_vcpu *vcpu) +{ + struct kvm_cpuid_entry2 *best; + + best = kvm_find_cpuid_entry(vcpu, 0x80000000, 0); + if (!best || best->eax < 0x80000008) + goto not_found; + best = kvm_find_cpuid_entry(vcpu, 0x80000008, 0); + if (best) + return best->eax & 0xff; +not_found: + return 36; +} +EXPORT_SYMBOL_GPL(cpuid_query_maxphyaddr); + /* when an old userspace process fills a new kernel module */ int kvm_vcpu_ioctl_set_cpuid(struct kvm_vcpu *vcpu, struct kvm_cpuid *cpuid, @@ -757,21 +775,6 @@ struct kvm_cpuid_entry2 *kvm_find_cpuid_entry(struct kvm_vcpu *vcpu, } EXPORT_SYMBOL_GPL(kvm_find_cpuid_entry); -int cpuid_maxphyaddr(struct kvm_vcpu *vcpu) -{ - struct kvm_cpuid_entry2 *best; - - best = kvm_find_cpuid_entry(vcpu, 0x80000000, 0); - if (!best || best->eax < 0x80000008) - goto not_found; - best = kvm_find_cpuid_entry(vcpu, 0x80000008, 0); - if (best) - return best->eax & 0xff; -not_found: - return 36; -} -EXPORT_SYMBOL_GPL(cpuid_maxphyaddr); - /* * If no match is found, check whether we exceed the vCPU's limit * and return the content of the highest valid _standard_ leaf instead. diff --git a/arch/x86/kvm/cpuid.h b/arch/x86/kvm/cpuid.h index 4452eedfaedd..c3b1ad9fca81 100644 --- a/arch/x86/kvm/cpuid.h +++ b/arch/x86/kvm/cpuid.h @@ -20,13 +20,19 @@ int kvm_vcpu_ioctl_get_cpuid2(struct kvm_vcpu *vcpu, struct kvm_cpuid_entry2 __user *entries); void kvm_cpuid(struct kvm_vcpu *vcpu, u32 *eax, u32 *ebx, u32 *ecx, u32 *edx); +int cpuid_query_maxphyaddr(struct kvm_vcpu *vcpu); + +static inline int cpuid_maxphyaddr(struct kvm_vcpu *vcpu) +{ + return vcpu->arch.maxphyaddr; +} static inline bool guest_cpuid_has_xsave(struct kvm_vcpu *vcpu) { struct kvm_cpuid_entry2 *best; if (!static_cpu_has(X86_FEATURE_XSAVE)) - return 0; + return false; best = kvm_find_cpuid_entry(vcpu, 1, 0); return best && (best->ecx & bit(X86_FEATURE_XSAVE)); diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c index e0b794a84c35..630bcb0d7a04 100644 --- a/arch/x86/kvm/emulate.c +++ b/arch/x86/kvm/emulate.c @@ -248,27 +248,7 @@ struct mode_dual { struct opcode mode64; }; -/* EFLAGS bit definitions. */ -#define EFLG_ID (1<<21) -#define EFLG_VIP (1<<20) -#define EFLG_VIF (1<<19) -#define EFLG_AC (1<<18) -#define EFLG_VM (1<<17) -#define EFLG_RF (1<<16) -#define EFLG_IOPL (3<<12) -#define EFLG_NT (1<<14) -#define EFLG_OF (1<<11) -#define EFLG_DF (1<<10) -#define EFLG_IF (1<<9) -#define EFLG_TF (1<<8) -#define EFLG_SF (1<<7) -#define EFLG_ZF (1<<6) -#define EFLG_AF (1<<4) -#define EFLG_PF (1<<2) -#define EFLG_CF (1<<0) - #define EFLG_RESERVED_ZEROS_MASK 0xffc0802a -#define EFLG_RESERVED_ONE_MASK 2 enum x86_transfer_type { X86_TRANSFER_NONE, @@ -317,7 +297,8 @@ static void invalidate_registers(struct x86_emulate_ctxt *ctxt) * These EFLAGS bits are restored from saved value during emulation, and * any changes are written back to the saved value after emulation. */ -#define EFLAGS_MASK (EFLG_OF|EFLG_SF|EFLG_ZF|EFLG_AF|EFLG_PF|EFLG_CF) +#define EFLAGS_MASK (X86_EFLAGS_OF|X86_EFLAGS_SF|X86_EFLAGS_ZF|X86_EFLAGS_AF|\ + X86_EFLAGS_PF|X86_EFLAGS_CF) #ifdef CONFIG_X86_64 #define ON64(x) x @@ -478,6 +459,25 @@ static void assign_masked(ulong *dest, ulong src, ulong mask) *dest = (*dest & ~mask) | (src & mask); } +static void assign_register(unsigned long *reg, u64 val, int bytes) +{ + /* The 4-byte case *is* correct: in 64-bit mode we zero-extend. */ + switch (bytes) { + case 1: + *(u8 *)reg = (u8)val; + break; + case 2: + *(u16 *)reg = (u16)val; + break; + case 4: + *reg = (u32)val; + break; /* 64b: zero-extend */ + case 8: + *reg = val; + break; + } +} + static inline unsigned long ad_mask(struct x86_emulate_ctxt *ctxt) { return (1UL << (ctxt->ad_bytes << 3)) - 1; @@ -943,6 +943,22 @@ FASTOP2(xadd); FASTOP2R(cmp, cmp_r); +static int em_bsf_c(struct x86_emulate_ctxt *ctxt) +{ + /* If src is zero, do not writeback, but update flags */ + if (ctxt->src.val == 0) + ctxt->dst.type = OP_NONE; + return fastop(ctxt, em_bsf); +} + +static int em_bsr_c(struct x86_emulate_ctxt *ctxt) +{ + /* If src is zero, do not writeback, but update flags */ + if (ctxt->src.val == 0) + ctxt->dst.type = OP_NONE; + return fastop(ctxt, em_bsr); +} + static u8 test_cc(unsigned int condition, unsigned long flags) { u8 rc; @@ -1399,7 +1415,7 @@ static int pio_in_emulated(struct x86_emulate_ctxt *ctxt, unsigned int in_page, n; unsigned int count = ctxt->rep_prefix ? address_mask(ctxt, reg_read(ctxt, VCPU_REGS_RCX)) : 1; - in_page = (ctxt->eflags & EFLG_DF) ? + in_page = (ctxt->eflags & X86_EFLAGS_DF) ? offset_in_page(reg_read(ctxt, VCPU_REGS_RDI)) : PAGE_SIZE - offset_in_page(reg_read(ctxt, VCPU_REGS_RDI)); n = min3(in_page, (unsigned int)sizeof(rc->data) / size, count); @@ -1412,7 +1428,7 @@ static int pio_in_emulated(struct x86_emulate_ctxt *ctxt, } if (ctxt->rep_prefix && (ctxt->d & String) && - !(ctxt->eflags & EFLG_DF)) { + !(ctxt->eflags & X86_EFLAGS_DF)) { ctxt->dst.data = rc->data + rc->pos; ctxt->dst.type = OP_MEM_STR; ctxt->dst.count = (rc->end - rc->pos) / size; @@ -1691,21 +1707,7 @@ static int load_segment_descriptor(struct x86_emulate_ctxt *ctxt, static void write_register_operand(struct operand *op) { - /* The 4-byte case *is* correct: in 64-bit mode we zero-extend. */ - switch (op->bytes) { - case 1: - *(u8 *)op->addr.reg = (u8)op->val; - break; - case 2: - *(u16 *)op->addr.reg = (u16)op->val; - break; - case 4: - *op->addr.reg = (u32)op->val; - break; /* 64b: zero-extend */ - case 8: - *op->addr.reg = op->val; - break; - } + return assign_register(op->addr.reg, op->val, op->bytes); } static int writeback(struct x86_emulate_ctxt *ctxt, struct operand *op) @@ -1792,32 +1794,34 @@ static int emulate_popf(struct x86_emulate_ctxt *ctxt, { int rc; unsigned long val, change_mask; - int iopl = (ctxt->eflags & X86_EFLAGS_IOPL) >> IOPL_SHIFT; + int iopl = (ctxt->eflags & X86_EFLAGS_IOPL) >> X86_EFLAGS_IOPL_BIT; int cpl = ctxt->ops->cpl(ctxt); rc = emulate_pop(ctxt, &val, len); if (rc != X86EMUL_CONTINUE) return rc; - change_mask = EFLG_CF | EFLG_PF | EFLG_AF | EFLG_ZF | EFLG_SF | EFLG_OF - | EFLG_TF | EFLG_DF | EFLG_NT | EFLG_AC | EFLG_ID; + change_mask = X86_EFLAGS_CF | X86_EFLAGS_PF | X86_EFLAGS_AF | + X86_EFLAGS_ZF | X86_EFLAGS_SF | X86_EFLAGS_OF | + X86_EFLAGS_TF | X86_EFLAGS_DF | X86_EFLAGS_NT | + X86_EFLAGS_AC | X86_EFLAGS_ID; switch(ctxt->mode) { case X86EMUL_MODE_PROT64: case X86EMUL_MODE_PROT32: case X86EMUL_MODE_PROT16: if (cpl == 0) - change_mask |= EFLG_IOPL; + change_mask |= X86_EFLAGS_IOPL; if (cpl <= iopl) - change_mask |= EFLG_IF; + change_mask |= X86_EFLAGS_IF; break; case X86EMUL_MODE_VM86: if (iopl < 3) return emulate_gp(ctxt, 0); - change_mask |= EFLG_IF; + change_mask |= X86_EFLAGS_IF; break; default: /* real mode */ - change_mask |= (EFLG_IOPL | EFLG_IF); + change_mask |= (X86_EFLAGS_IOPL | X86_EFLAGS_IF); break; } @@ -1918,7 +1922,7 @@ static int em_pusha(struct x86_emulate_ctxt *ctxt) static int em_pushf(struct x86_emulate_ctxt *ctxt) { - ctxt->src.val = (unsigned long)ctxt->eflags & ~EFLG_VM; + ctxt->src.val = (unsigned long)ctxt->eflags & ~X86_EFLAGS_VM; return em_push(ctxt); } @@ -1926,6 +1930,7 @@ static int em_popa(struct x86_emulate_ctxt *ctxt) { int rc = X86EMUL_CONTINUE; int reg = VCPU_REGS_RDI; + u32 val; while (reg >= VCPU_REGS_RAX) { if (reg == VCPU_REGS_RSP) { @@ -1933,9 +1938,10 @@ static int em_popa(struct x86_emulate_ctxt *ctxt) --reg; } - rc = emulate_pop(ctxt, reg_rmw(ctxt, reg), ctxt->op_bytes); + rc = emulate_pop(ctxt, &val, ctxt->op_bytes); if (rc != X86EMUL_CONTINUE) break; + assign_register(reg_rmw(ctxt, reg), val, ctxt->op_bytes); --reg; } return rc; @@ -1956,7 +1962,7 @@ static int __emulate_int_real(struct x86_emulate_ctxt *ctxt, int irq) if (rc != X86EMUL_CONTINUE) return rc; - ctxt->eflags &= ~(EFLG_IF | EFLG_TF | EFLG_AC); + ctxt->eflags &= ~(X86_EFLAGS_IF | X86_EFLAGS_TF | X86_EFLAGS_AC); ctxt->src.val = get_segment_selector(ctxt, VCPU_SREG_CS); rc = em_push(ctxt); @@ -2022,10 +2028,14 @@ static int emulate_iret_real(struct x86_emulate_ctxt *ctxt) unsigned long temp_eip = 0; unsigned long temp_eflags = 0; unsigned long cs = 0; - unsigned long mask = EFLG_CF | EFLG_PF | EFLG_AF | EFLG_ZF | EFLG_SF | EFLG_TF | - EFLG_IF | EFLG_DF | EFLG_OF | EFLG_IOPL | EFLG_NT | EFLG_RF | - EFLG_AC | EFLG_ID | (1 << 1); /* Last one is the reserved bit */ - unsigned long vm86_mask = EFLG_VM | EFLG_VIF | EFLG_VIP; + unsigned long mask = X86_EFLAGS_CF | X86_EFLAGS_PF | X86_EFLAGS_AF | + X86_EFLAGS_ZF | X86_EFLAGS_SF | X86_EFLAGS_TF | + X86_EFLAGS_IF | X86_EFLAGS_DF | X86_EFLAGS_OF | + X86_EFLAGS_IOPL | X86_EFLAGS_NT | X86_EFLAGS_RF | + X86_EFLAGS_AC | X86_EFLAGS_ID | + X86_EFLAGS_FIXED; + unsigned long vm86_mask = X86_EFLAGS_VM | X86_EFLAGS_VIF | + X86_EFLAGS_VIP; /* TODO: Add stack limit check */ @@ -2054,7 +2064,6 @@ static int emulate_iret_real(struct x86_emulate_ctxt *ctxt) ctxt->_eip = temp_eip; - if (ctxt->op_bytes == 4) ctxt->eflags = ((temp_eflags & mask) | (ctxt->eflags & vm86_mask)); else if (ctxt->op_bytes == 2) { @@ -2063,7 +2072,7 @@ static int emulate_iret_real(struct x86_emulate_ctxt *ctxt) } ctxt->eflags &= ~EFLG_RESERVED_ZEROS_MASK; /* Clear reserved zeros */ - ctxt->eflags |= EFLG_RESERVED_ONE_MASK; + ctxt->eflags |= X86_EFLAGS_FIXED; ctxt->ops->set_nmi_mask(ctxt, false); return rc; @@ -2145,12 +2154,12 @@ static int em_cmpxchg8b(struct x86_emulate_ctxt *ctxt) ((u32) (old >> 32) != (u32) reg_read(ctxt, VCPU_REGS_RDX))) { *reg_write(ctxt, VCPU_REGS_RAX) = (u32) (old >> 0); *reg_write(ctxt, VCPU_REGS_RDX) = (u32) (old >> 32); - ctxt->eflags &= ~EFLG_ZF; + ctxt->eflags &= ~X86_EFLAGS_ZF; } else { ctxt->dst.val64 = ((u64)reg_read(ctxt, VCPU_REGS_RCX) << 32) | (u32) reg_read(ctxt, VCPU_REGS_RBX); - ctxt->eflags |= EFLG_ZF; + ctxt->eflags |= X86_EFLAGS_ZF; } return X86EMUL_CONTINUE; } @@ -2222,7 +2231,7 @@ static int em_cmpxchg(struct x86_emulate_ctxt *ctxt) ctxt->src.val = ctxt->dst.orig_val; fastop(ctxt, em_cmp); - if (ctxt->eflags & EFLG_ZF) { + if (ctxt->eflags & X86_EFLAGS_ZF) { /* Success: write back to memory; no update of EAX */ ctxt->src.type = OP_NONE; ctxt->dst.val = ctxt->src.orig_val; @@ -2381,14 +2390,14 @@ static int em_syscall(struct x86_emulate_ctxt *ctxt) ops->get_msr(ctxt, MSR_SYSCALL_MASK, &msr_data); ctxt->eflags &= ~msr_data; - ctxt->eflags |= EFLG_RESERVED_ONE_MASK; + ctxt->eflags |= X86_EFLAGS_FIXED; #endif } else { /* legacy mode */ ops->get_msr(ctxt, MSR_STAR, &msr_data); ctxt->_eip = (u32)msr_data; - ctxt->eflags &= ~(EFLG_VM | EFLG_IF); + ctxt->eflags &= ~(X86_EFLAGS_VM | X86_EFLAGS_IF); } return X86EMUL_CONTINUE; @@ -2425,8 +2434,8 @@ static int em_sysenter(struct x86_emulate_ctxt *ctxt) if ((msr_data & 0xfffc) == 0x0) return emulate_gp(ctxt, 0); - ctxt->eflags &= ~(EFLG_VM | EFLG_IF); - cs_sel = (u16)msr_data & ~SELECTOR_RPL_MASK; + ctxt->eflags &= ~(X86_EFLAGS_VM | X86_EFLAGS_IF); + cs_sel = (u16)msr_data & ~SEGMENT_RPL_MASK; ss_sel = cs_sel + 8; if (efer & EFER_LMA) { cs.d = 0; @@ -2493,8 +2502,8 @@ static int em_sysexit(struct x86_emulate_ctxt *ctxt) return emulate_gp(ctxt, 0); break; } - cs_sel |= SELECTOR_RPL_MASK; - ss_sel |= SELECTOR_RPL_MASK; + cs_sel |= SEGMENT_RPL_MASK; + ss_sel |= SEGMENT_RPL_MASK; ops->set_segment(ctxt, cs_sel, &cs, 0, VCPU_SREG_CS); ops->set_segment(ctxt, ss_sel, &ss, 0, VCPU_SREG_SS); @@ -2512,7 +2521,7 @@ static bool emulator_bad_iopl(struct x86_emulate_ctxt *ctxt) return false; if (ctxt->mode == X86EMUL_MODE_VM86) return true; - iopl = (ctxt->eflags & X86_EFLAGS_IOPL) >> IOPL_SHIFT; + iopl = (ctxt->eflags & X86_EFLAGS_IOPL) >> X86_EFLAGS_IOPL_BIT; return ctxt->ops->cpl(ctxt) > iopl; } @@ -2782,10 +2791,8 @@ static int load_state_from_tss32(struct x86_emulate_ctxt *ctxt, return ret; ret = __load_segment_descriptor(ctxt, tss->gs, VCPU_SREG_GS, cpl, X86_TRANSFER_TASK_SWITCH, NULL); - if (ret != X86EMUL_CONTINUE) - return ret; - return X86EMUL_CONTINUE; + return ret; } static int task_switch_32(struct x86_emulate_ctxt *ctxt, @@ -2954,7 +2961,7 @@ int emulator_task_switch(struct x86_emulate_ctxt *ctxt, static void string_addr_inc(struct x86_emulate_ctxt *ctxt, int reg, struct operand *op) { - int df = (ctxt->eflags & EFLG_DF) ? -op->count : op->count; + int df = (ctxt->eflags & X86_EFLAGS_DF) ? -op->count : op->count; register_address_increment(ctxt, reg, df * op->bytes); op->addr.mem.ea = register_address(ctxt, reg); @@ -3323,7 +3330,7 @@ static int em_clts(struct x86_emulate_ctxt *ctxt) return X86EMUL_CONTINUE; } -static int em_vmcall(struct x86_emulate_ctxt *ctxt) +static int em_hypercall(struct x86_emulate_ctxt *ctxt) { int rc = ctxt->ops->fix_hypercall(ctxt); @@ -3395,17 +3402,6 @@ static int em_lgdt(struct x86_emulate_ctxt *ctxt) return em_lgdt_lidt(ctxt, true); } -static int em_vmmcall(struct x86_emulate_ctxt *ctxt) -{ - int rc; - - rc = ctxt->ops->fix_hypercall(ctxt); - - /* Disable writeback. */ - ctxt->dst.type = OP_NONE; - return rc; -} - static int em_lidt(struct x86_emulate_ctxt *ctxt) { return em_lgdt_lidt(ctxt, false); @@ -3504,7 +3500,8 @@ static int em_sahf(struct x86_emulate_ctxt *ctxt) { u32 flags; - flags = EFLG_CF | EFLG_PF | EFLG_AF | EFLG_ZF | EFLG_SF; + flags = X86_EFLAGS_CF | X86_EFLAGS_PF | X86_EFLAGS_AF | X86_EFLAGS_ZF | + X86_EFLAGS_SF; flags &= *reg_rmw(ctxt, VCPU_REGS_RAX) >> 8; ctxt->eflags &= ~0xffUL; @@ -3769,7 +3766,7 @@ static int check_perm_out(struct x86_emulate_ctxt *ctxt) static const struct opcode group7_rm0[] = { N, - I(SrcNone | Priv | EmulateOnUD, em_vmcall), + I(SrcNone | Priv | EmulateOnUD, em_hypercall), N, N, N, N, N, N, }; @@ -3781,7 +3778,7 @@ static const struct opcode group7_rm1[] = { static const struct opcode group7_rm3[] = { DIP(SrcNone | Prot | Priv, vmrun, check_svme_pa), - II(SrcNone | Prot | EmulateOnUD, em_vmmcall, vmmcall), + II(SrcNone | Prot | EmulateOnUD, em_hypercall, vmmcall), DIP(SrcNone | Prot | Priv, vmload, check_svme_pa), DIP(SrcNone | Prot | Priv, vmsave, check_svme_pa), DIP(SrcNone | Prot | Priv, stgi, check_svme), @@ -4192,7 +4189,8 @@ static const struct opcode twobyte_table[256] = { N, N, G(BitOp, group8), F(DstMem | SrcReg | ModRM | BitOp | Lock | PageTable, em_btc), - F(DstReg | SrcMem | ModRM, em_bsf), F(DstReg | SrcMem | ModRM, em_bsr), + I(DstReg | SrcMem | ModRM, em_bsf_c), + I(DstReg | SrcMem | ModRM, em_bsr_c), D(DstReg | SrcMem8 | ModRM | Mov), D(DstReg | SrcMem16 | ModRM | Mov), /* 0xC0 - 0xC7 */ F2bv(DstMem | SrcReg | ModRM | SrcWrite | Lock, em_xadd), @@ -4759,9 +4757,9 @@ static bool string_insn_completed(struct x86_emulate_ctxt *ctxt) if (((ctxt->b == 0xa6) || (ctxt->b == 0xa7) || (ctxt->b == 0xae) || (ctxt->b == 0xaf)) && (((ctxt->rep_prefix == REPE_PREFIX) && - ((ctxt->eflags & EFLG_ZF) == 0)) + ((ctxt->eflags & X86_EFLAGS_ZF) == 0)) || ((ctxt->rep_prefix == REPNE_PREFIX) && - ((ctxt->eflags & EFLG_ZF) == EFLG_ZF)))) + ((ctxt->eflags & X86_EFLAGS_ZF) == X86_EFLAGS_ZF)))) return true; return false; @@ -4913,7 +4911,7 @@ int x86_emulate_insn(struct x86_emulate_ctxt *ctxt) /* All REP prefixes have the same first termination condition */ if (address_mask(ctxt, reg_read(ctxt, VCPU_REGS_RCX)) == 0) { ctxt->eip = ctxt->_eip; - ctxt->eflags &= ~EFLG_RF; + ctxt->eflags &= ~X86_EFLAGS_RF; goto done; } } @@ -4950,7 +4948,8 @@ int x86_emulate_insn(struct x86_emulate_ctxt *ctxt) goto done; } } - ctxt->dst.orig_val = ctxt->dst.val; + /* Copy full 64-bit value for CMPXCHG8B. */ + ctxt->dst.orig_val64 = ctxt->dst.val64; special_insn: @@ -4962,9 +4961,9 @@ special_insn: } if (ctxt->rep_prefix && (ctxt->d & String)) - ctxt->eflags |= EFLG_RF; + ctxt->eflags |= X86_EFLAGS_RF; else - ctxt->eflags &= ~EFLG_RF; + ctxt->eflags &= ~X86_EFLAGS_RF; if (ctxt->execute) { if (ctxt->d & Fastop) { @@ -5013,7 +5012,7 @@ special_insn: rc = emulate_int(ctxt, ctxt->src.val); break; case 0xce: /* into */ - if (ctxt->eflags & EFLG_OF) + if (ctxt->eflags & X86_EFLAGS_OF) rc = emulate_int(ctxt, 4); break; case 0xe9: /* jmp rel */ @@ -5026,19 +5025,19 @@ special_insn: break; case 0xf5: /* cmc */ /* complement carry flag from eflags reg */ - ctxt->eflags ^= EFLG_CF; + ctxt->eflags ^= X86_EFLAGS_CF; break; case 0xf8: /* clc */ - ctxt->eflags &= ~EFLG_CF; + ctxt->eflags &= ~X86_EFLAGS_CF; break; case 0xf9: /* stc */ - ctxt->eflags |= EFLG_CF; + ctxt->eflags |= X86_EFLAGS_CF; break; case 0xfc: /* cld */ - ctxt->eflags &= ~EFLG_DF; + ctxt->eflags &= ~X86_EFLAGS_DF; break; case 0xfd: /* std */ - ctxt->eflags |= EFLG_DF; + ctxt->eflags |= X86_EFLAGS_DF; break; default: goto cannot_emulate; @@ -5099,7 +5098,7 @@ writeback: } goto done; /* skip rip writeback */ } - ctxt->eflags &= ~EFLG_RF; + ctxt->eflags &= ~X86_EFLAGS_RF; } ctxt->eip = ctxt->_eip; @@ -5136,8 +5135,7 @@ twobyte_insn: case 0x40 ... 0x4f: /* cmov */ if (test_cc(ctxt->b, ctxt->eflags)) ctxt->dst.val = ctxt->src.val; - else if (ctxt->mode != X86EMUL_MODE_PROT64 || - ctxt->op_bytes != 4) + else if (ctxt->op_bytes != 4) ctxt->dst.type = OP_NONE; /* no writeback */ break; case 0x80 ... 0x8f: /* jnz rel, etc*/ diff --git a/arch/x86/kvm/i8254.c b/arch/x86/kvm/i8254.c index 298781d4cfb4..4dce6f8b6129 100644 --- a/arch/x86/kvm/i8254.c +++ b/arch/x86/kvm/i8254.c @@ -443,7 +443,8 @@ static inline int pit_in_range(gpa_t addr) (addr < KVM_PIT_BASE_ADDRESS + KVM_PIT_MEM_LENGTH)); } -static int pit_ioport_write(struct kvm_io_device *this, +static int pit_ioport_write(struct kvm_vcpu *vcpu, + struct kvm_io_device *this, gpa_t addr, int len, const void *data) { struct kvm_pit *pit = dev_to_pit(this); @@ -519,7 +520,8 @@ static int pit_ioport_write(struct kvm_io_device *this, return 0; } -static int pit_ioport_read(struct kvm_io_device *this, +static int pit_ioport_read(struct kvm_vcpu *vcpu, + struct kvm_io_device *this, gpa_t addr, int len, void *data) { struct kvm_pit *pit = dev_to_pit(this); @@ -589,7 +591,8 @@ static int pit_ioport_read(struct kvm_io_device *this, return 0; } -static int speaker_ioport_write(struct kvm_io_device *this, +static int speaker_ioport_write(struct kvm_vcpu *vcpu, + struct kvm_io_device *this, gpa_t addr, int len, const void *data) { struct kvm_pit *pit = speaker_to_pit(this); @@ -606,8 +609,9 @@ static int speaker_ioport_write(struct kvm_io_device *this, return 0; } -static int speaker_ioport_read(struct kvm_io_device *this, - gpa_t addr, int len, void *data) +static int speaker_ioport_read(struct kvm_vcpu *vcpu, + struct kvm_io_device *this, + gpa_t addr, int len, void *data) { struct kvm_pit *pit = speaker_to_pit(this); struct kvm_kpit_state *pit_state = &pit->pit_state; diff --git a/arch/x86/kvm/i8254.h b/arch/x86/kvm/i8254.h index dd1b16b611b0..c84990b42b5b 100644 --- a/arch/x86/kvm/i8254.h +++ b/arch/x86/kvm/i8254.h @@ -3,7 +3,7 @@ #include <linux/kthread.h> -#include "iodev.h" +#include <kvm/iodev.h> struct kvm_kpit_channel_state { u32 count; /* can be 65536 */ diff --git a/arch/x86/kvm/i8259.c b/arch/x86/kvm/i8259.c index cc31f7c06d3d..fef922ff2635 100644 --- a/arch/x86/kvm/i8259.c +++ b/arch/x86/kvm/i8259.c @@ -507,6 +507,7 @@ static int picdev_read(struct kvm_pic *s, return -EOPNOTSUPP; if (len != 1) { + memset(val, 0, len); pr_pic_unimpl("non byte read\n"); return 0; } @@ -528,42 +529,42 @@ static int picdev_read(struct kvm_pic *s, return 0; } -static int picdev_master_write(struct kvm_io_device *dev, +static int picdev_master_write(struct kvm_vcpu *vcpu, struct kvm_io_device *dev, gpa_t addr, int len, const void *val) { return picdev_write(container_of(dev, struct kvm_pic, dev_master), addr, len, val); } -static int picdev_master_read(struct kvm_io_device *dev, +static int picdev_master_read(struct kvm_vcpu *vcpu, struct kvm_io_device *dev, gpa_t addr, int len, void *val) { return picdev_read(container_of(dev, struct kvm_pic, dev_master), addr, len, val); } -static int picdev_slave_write(struct kvm_io_device *dev, +static int picdev_slave_write(struct kvm_vcpu *vcpu, struct kvm_io_device *dev, gpa_t addr, int len, const void *val) { return picdev_write(container_of(dev, struct kvm_pic, dev_slave), addr, len, val); } -static int picdev_slave_read(struct kvm_io_device *dev, +static int picdev_slave_read(struct kvm_vcpu *vcpu, struct kvm_io_device *dev, gpa_t addr, int len, void *val) { return picdev_read(container_of(dev, struct kvm_pic, dev_slave), addr, len, val); } -static int picdev_eclr_write(struct kvm_io_device *dev, +static int picdev_eclr_write(struct kvm_vcpu *vcpu, struct kvm_io_device *dev, gpa_t addr, int len, const void *val) { return picdev_write(container_of(dev, struct kvm_pic, dev_eclr), addr, len, val); } -static int picdev_eclr_read(struct kvm_io_device *dev, +static int picdev_eclr_read(struct kvm_vcpu *vcpu, struct kvm_io_device *dev, gpa_t addr, int len, void *val) { return picdev_read(container_of(dev, struct kvm_pic, dev_eclr), diff --git a/arch/x86/kvm/ioapic.c b/arch/x86/kvm/ioapic.c index b1947e0f3e10..28146f03c514 100644 --- a/arch/x86/kvm/ioapic.c +++ b/arch/x86/kvm/ioapic.c @@ -206,6 +206,8 @@ static int ioapic_set_irq(struct kvm_ioapic *ioapic, unsigned int irq, old_irr = ioapic->irr; ioapic->irr |= mask; + if (edge) + ioapic->irr_delivered &= ~mask; if ((edge && old_irr == ioapic->irr) || (!edge && entry.fields.remote_irr)) { ret = 0; @@ -349,7 +351,7 @@ static int ioapic_service(struct kvm_ioapic *ioapic, int irq, bool line_status) irqe.shorthand = 0; if (irqe.trig_mode == IOAPIC_EDGE_TRIG) - ioapic->irr &= ~(1 << irq); + ioapic->irr_delivered |= 1 << irq; if (irq == RTC_GSI && line_status) { /* @@ -422,6 +424,7 @@ static void __kvm_ioapic_update_eoi(struct kvm_vcpu *vcpu, struct kvm_ioapic *ioapic, int vector, int trigger_mode) { int i; + struct kvm_lapic *apic = vcpu->arch.apic; for (i = 0; i < IOAPIC_NUM_PINS; i++) { union kvm_ioapic_redirect_entry *ent = &ioapic->redirtbl[i]; @@ -443,7 +446,8 @@ static void __kvm_ioapic_update_eoi(struct kvm_vcpu *vcpu, kvm_notify_acked_irq(ioapic->kvm, KVM_IRQCHIP_IOAPIC, i); spin_lock(&ioapic->lock); - if (trigger_mode != IOAPIC_LEVEL_TRIG) + if (trigger_mode != IOAPIC_LEVEL_TRIG || + kvm_apic_get_reg(apic, APIC_SPIV) & APIC_SPIV_DIRECTED_EOI) continue; ASSERT(ent->fields.trig_mode == IOAPIC_LEVEL_TRIG); @@ -471,13 +475,6 @@ static void __kvm_ioapic_update_eoi(struct kvm_vcpu *vcpu, } } -bool kvm_ioapic_handles_vector(struct kvm *kvm, int vector) -{ - struct kvm_ioapic *ioapic = kvm->arch.vioapic; - smp_rmb(); - return test_bit(vector, ioapic->handled_vectors); -} - void kvm_ioapic_update_eoi(struct kvm_vcpu *vcpu, int vector, int trigger_mode) { struct kvm_ioapic *ioapic = vcpu->kvm->arch.vioapic; @@ -498,8 +495,8 @@ static inline int ioapic_in_range(struct kvm_ioapic *ioapic, gpa_t addr) (addr < ioapic->base_address + IOAPIC_MEM_LENGTH))); } -static int ioapic_mmio_read(struct kvm_io_device *this, gpa_t addr, int len, - void *val) +static int ioapic_mmio_read(struct kvm_vcpu *vcpu, struct kvm_io_device *this, + gpa_t addr, int len, void *val) { struct kvm_ioapic *ioapic = to_ioapic(this); u32 result; @@ -541,8 +538,8 @@ static int ioapic_mmio_read(struct kvm_io_device *this, gpa_t addr, int len, return 0; } -static int ioapic_mmio_write(struct kvm_io_device *this, gpa_t addr, int len, - const void *val) +static int ioapic_mmio_write(struct kvm_vcpu *vcpu, struct kvm_io_device *this, + gpa_t addr, int len, const void *val) { struct kvm_ioapic *ioapic = to_ioapic(this); u32 data; @@ -597,6 +594,7 @@ static void kvm_ioapic_reset(struct kvm_ioapic *ioapic) ioapic->base_address = IOAPIC_DEFAULT_BASE_ADDRESS; ioapic->ioregsel = 0; ioapic->irr = 0; + ioapic->irr_delivered = 0; ioapic->id = 0; memset(ioapic->irq_eoi, 0x00, IOAPIC_NUM_PINS); rtc_irq_eoi_tracking_reset(ioapic); @@ -654,6 +652,7 @@ int kvm_get_ioapic(struct kvm *kvm, struct kvm_ioapic_state *state) spin_lock(&ioapic->lock); memcpy(state, ioapic, sizeof(struct kvm_ioapic_state)); + state->irr &= ~ioapic->irr_delivered; spin_unlock(&ioapic->lock); return 0; } @@ -667,6 +666,7 @@ int kvm_set_ioapic(struct kvm *kvm, struct kvm_ioapic_state *state) spin_lock(&ioapic->lock); memcpy(ioapic, state, sizeof(struct kvm_ioapic_state)); ioapic->irr = 0; + ioapic->irr_delivered = 0; update_handled_vectors(ioapic); kvm_vcpu_request_scan_ioapic(kvm); kvm_ioapic_inject_all(ioapic, state->irr); diff --git a/arch/x86/kvm/ioapic.h b/arch/x86/kvm/ioapic.h index c2e36d934af4..ca0b0b4e6256 100644 --- a/arch/x86/kvm/ioapic.h +++ b/arch/x86/kvm/ioapic.h @@ -3,7 +3,7 @@ #include <linux/kvm_host.h> -#include "iodev.h" +#include <kvm/iodev.h> struct kvm; struct kvm_vcpu; @@ -77,6 +77,7 @@ struct kvm_ioapic { struct rtc_status rtc_status; struct delayed_work eoi_inject; u32 irq_eoi[IOAPIC_NUM_PINS]; + u32 irr_delivered; }; #ifdef DEBUG @@ -97,13 +98,19 @@ static inline struct kvm_ioapic *ioapic_irqchip(struct kvm *kvm) return kvm->arch.vioapic; } +static inline bool kvm_ioapic_handles_vector(struct kvm *kvm, int vector) +{ + struct kvm_ioapic *ioapic = kvm->arch.vioapic; + smp_rmb(); + return test_bit(vector, ioapic->handled_vectors); +} + void kvm_rtc_eoi_tracking_restore_one(struct kvm_vcpu *vcpu); bool kvm_apic_match_dest(struct kvm_vcpu *vcpu, struct kvm_lapic *source, int short_hand, unsigned int dest, int dest_mode); int kvm_apic_compare_prio(struct kvm_vcpu *vcpu1, struct kvm_vcpu *vcpu2); void kvm_ioapic_update_eoi(struct kvm_vcpu *vcpu, int vector, int trigger_mode); -bool kvm_ioapic_handles_vector(struct kvm *kvm, int vector); int kvm_ioapic_init(struct kvm *kvm); void kvm_ioapic_destroy(struct kvm *kvm); int kvm_ioapic_set_irq(struct kvm_ioapic *ioapic, int irq, int irq_source_id, diff --git a/arch/x86/kvm/irq.h b/arch/x86/kvm/irq.h index 2d03568e9498..ad68c73008c5 100644 --- a/arch/x86/kvm/irq.h +++ b/arch/x86/kvm/irq.h @@ -27,7 +27,7 @@ #include <linux/kvm_host.h> #include <linux/spinlock.h> -#include "iodev.h" +#include <kvm/iodev.h> #include "ioapic.h" #include "lapic.h" diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c index e55b5fc344eb..d67206a7b99a 100644 --- a/arch/x86/kvm/lapic.c +++ b/arch/x86/kvm/lapic.c @@ -133,6 +133,28 @@ static inline int kvm_apic_id(struct kvm_lapic *apic) return (kvm_apic_get_reg(apic, APIC_ID) >> 24) & 0xff; } +/* The logical map is definitely wrong if we have multiple + * modes at the same time. (Physical map is always right.) + */ +static inline bool kvm_apic_logical_map_valid(struct kvm_apic_map *map) +{ + return !(map->mode & (map->mode - 1)); +} + +static inline void +apic_logical_id(struct kvm_apic_map *map, u32 dest_id, u16 *cid, u16 *lid) +{ + unsigned lid_bits; + + BUILD_BUG_ON(KVM_APIC_MODE_XAPIC_CLUSTER != 4); + BUILD_BUG_ON(KVM_APIC_MODE_XAPIC_FLAT != 8); + BUILD_BUG_ON(KVM_APIC_MODE_X2APIC != 16); + lid_bits = map->mode; + + *cid = dest_id >> lid_bits; + *lid = dest_id & ((1 << lid_bits) - 1); +} + static void recalculate_apic_map(struct kvm *kvm) { struct kvm_apic_map *new, *old = NULL; @@ -146,48 +168,6 @@ static void recalculate_apic_map(struct kvm *kvm) if (!new) goto out; - new->ldr_bits = 8; - /* flat mode is default */ - new->cid_shift = 8; - new->cid_mask = 0; - new->lid_mask = 0xff; - new->broadcast = APIC_BROADCAST; - - kvm_for_each_vcpu(i, vcpu, kvm) { - struct kvm_lapic *apic = vcpu->arch.apic; - - if (!kvm_apic_present(vcpu)) - continue; - - if (apic_x2apic_mode(apic)) { - new->ldr_bits = 32; - new->cid_shift = 16; - new->cid_mask = new->lid_mask = 0xffff; - new->broadcast = X2APIC_BROADCAST; - } else if (kvm_apic_get_reg(apic, APIC_LDR)) { - if (kvm_apic_get_reg(apic, APIC_DFR) == - APIC_DFR_CLUSTER) { - new->cid_shift = 4; - new->cid_mask = 0xf; - new->lid_mask = 0xf; - } else { - new->cid_shift = 8; - new->cid_mask = 0; - new->lid_mask = 0xff; - } - } - - /* - * All APICs have to be configured in the same mode by an OS. - * We take advatage of this while building logical id loockup - * table. After reset APICs are in software disabled mode, so if - * we find apic with different setting we assume this is the mode - * OS wants all apics to be in; build lookup table accordingly. - */ - if (kvm_apic_sw_enabled(apic)) - break; - } - kvm_for_each_vcpu(i, vcpu, kvm) { struct kvm_lapic *apic = vcpu->arch.apic; u16 cid, lid; @@ -198,11 +178,25 @@ static void recalculate_apic_map(struct kvm *kvm) aid = kvm_apic_id(apic); ldr = kvm_apic_get_reg(apic, APIC_LDR); - cid = apic_cluster_id(new, ldr); - lid = apic_logical_id(new, ldr); if (aid < ARRAY_SIZE(new->phys_map)) new->phys_map[aid] = apic; + + if (apic_x2apic_mode(apic)) { + new->mode |= KVM_APIC_MODE_X2APIC; + } else if (ldr) { + ldr = GET_APIC_LOGICAL_ID(ldr); + if (kvm_apic_get_reg(apic, APIC_DFR) == APIC_DFR_FLAT) + new->mode |= KVM_APIC_MODE_XAPIC_FLAT; + else + new->mode |= KVM_APIC_MODE_XAPIC_CLUSTER; + } + + if (!kvm_apic_logical_map_valid(new)) + continue; + + apic_logical_id(new, ldr, &cid, &lid); + if (lid && cid < ARRAY_SIZE(new->logical_map)) new->logical_map[cid][ffs(lid) - 1] = apic; } @@ -588,15 +582,23 @@ static void apic_set_tpr(struct kvm_lapic *apic, u32 tpr) apic_update_ppr(apic); } -static bool kvm_apic_broadcast(struct kvm_lapic *apic, u32 dest) +static bool kvm_apic_broadcast(struct kvm_lapic *apic, u32 mda) { - return dest == (apic_x2apic_mode(apic) ? - X2APIC_BROADCAST : APIC_BROADCAST); + if (apic_x2apic_mode(apic)) + return mda == X2APIC_BROADCAST; + + return GET_APIC_DEST_FIELD(mda) == APIC_BROADCAST; } -static bool kvm_apic_match_physical_addr(struct kvm_lapic *apic, u32 dest) +static bool kvm_apic_match_physical_addr(struct kvm_lapic *apic, u32 mda) { - return kvm_apic_id(apic) == dest || kvm_apic_broadcast(apic, dest); + if (kvm_apic_broadcast(apic, mda)) + return true; + + if (apic_x2apic_mode(apic)) + return mda == kvm_apic_id(apic); + + return mda == SET_APIC_DEST_FIELD(kvm_apic_id(apic)); } static bool kvm_apic_match_logical_addr(struct kvm_lapic *apic, u32 mda) @@ -613,6 +615,7 @@ static bool kvm_apic_match_logical_addr(struct kvm_lapic *apic, u32 mda) && (logical_id & mda & 0xffff) != 0; logical_id = GET_APIC_LOGICAL_ID(logical_id); + mda = GET_APIC_DEST_FIELD(mda); switch (kvm_apic_get_reg(apic, APIC_DFR)) { case APIC_DFR_FLAT: @@ -627,10 +630,27 @@ static bool kvm_apic_match_logical_addr(struct kvm_lapic *apic, u32 mda) } } +/* KVM APIC implementation has two quirks + * - dest always begins at 0 while xAPIC MDA has offset 24, + * - IOxAPIC messages have to be delivered (directly) to x2APIC. + */ +static u32 kvm_apic_mda(unsigned int dest_id, struct kvm_lapic *source, + struct kvm_lapic *target) +{ + bool ipi = source != NULL; + bool x2apic_mda = apic_x2apic_mode(ipi ? source : target); + + if (!ipi && dest_id == APIC_BROADCAST && x2apic_mda) + return X2APIC_BROADCAST; + + return x2apic_mda ? dest_id : SET_APIC_DEST_FIELD(dest_id); +} + bool kvm_apic_match_dest(struct kvm_vcpu *vcpu, struct kvm_lapic *source, int short_hand, unsigned int dest, int dest_mode) { struct kvm_lapic *target = vcpu->arch.apic; + u32 mda = kvm_apic_mda(dest, source, target); apic_debug("target %p, source %p, dest 0x%x, " "dest_mode 0x%x, short_hand 0x%x\n", @@ -640,9 +660,9 @@ bool kvm_apic_match_dest(struct kvm_vcpu *vcpu, struct kvm_lapic *source, switch (short_hand) { case APIC_DEST_NOSHORT: if (dest_mode == APIC_DEST_PHYSICAL) - return kvm_apic_match_physical_addr(target, dest); + return kvm_apic_match_physical_addr(target, mda); else - return kvm_apic_match_logical_addr(target, dest); + return kvm_apic_match_logical_addr(target, mda); case APIC_DEST_SELF: return target == source; case APIC_DEST_ALLINC: @@ -664,6 +684,7 @@ bool kvm_irq_delivery_to_apic_fast(struct kvm *kvm, struct kvm_lapic *src, struct kvm_lapic **dst; int i; bool ret = false; + bool x2apic_ipi = src && apic_x2apic_mode(src); *r = -1; @@ -675,15 +696,15 @@ bool kvm_irq_delivery_to_apic_fast(struct kvm *kvm, struct kvm_lapic *src, if (irq->shorthand) return false; + if (irq->dest_id == (x2apic_ipi ? X2APIC_BROADCAST : APIC_BROADCAST)) + return false; + rcu_read_lock(); map = rcu_dereference(kvm->arch.apic_map); if (!map) goto out; - if (irq->dest_id == map->broadcast) - goto out; - ret = true; if (irq->dest_mode == APIC_DEST_PHYSICAL) { @@ -692,16 +713,20 @@ bool kvm_irq_delivery_to_apic_fast(struct kvm *kvm, struct kvm_lapic *src, dst = &map->phys_map[irq->dest_id]; } else { - u32 mda = irq->dest_id << (32 - map->ldr_bits); - u16 cid = apic_cluster_id(map, mda); + u16 cid; + + if (!kvm_apic_logical_map_valid(map)) { + ret = false; + goto out; + } + + apic_logical_id(map, irq->dest_id, &cid, (u16 *)&bitmap); if (cid >= ARRAY_SIZE(map->logical_map)) goto out; dst = map->logical_map[cid]; - bitmap = apic_logical_id(map, mda); - if (irq->delivery_mode == APIC_DM_LOWEST) { int l = -1; for_each_set_bit(i, &bitmap, 16) { @@ -833,8 +858,7 @@ int kvm_apic_compare_prio(struct kvm_vcpu *vcpu1, struct kvm_vcpu *vcpu2) static void kvm_ioapic_send_eoi(struct kvm_lapic *apic, int vector) { - if (!(kvm_apic_get_reg(apic, APIC_SPIV) & APIC_SPIV_DIRECTED_EOI) && - kvm_ioapic_handles_vector(apic->vcpu->kvm, vector)) { + if (kvm_ioapic_handles_vector(apic->vcpu->kvm, vector)) { int trigger_mode; if (apic_test_vector(vector, apic->regs + APIC_TMR)) trigger_mode = IOAPIC_LEVEL_TRIG; @@ -1038,7 +1062,7 @@ static int apic_mmio_in_range(struct kvm_lapic *apic, gpa_t addr) addr < apic->base_address + LAPIC_MMIO_LENGTH; } -static int apic_mmio_read(struct kvm_io_device *this, +static int apic_mmio_read(struct kvm_vcpu *vcpu, struct kvm_io_device *this, gpa_t address, int len, void *data) { struct kvm_lapic *apic = to_lapic(this); @@ -1358,7 +1382,7 @@ static int apic_reg_write(struct kvm_lapic *apic, u32 reg, u32 val) return ret; } -static int apic_mmio_write(struct kvm_io_device *this, +static int apic_mmio_write(struct kvm_vcpu *vcpu, struct kvm_io_device *this, gpa_t address, int len, const void *data) { struct kvm_lapic *apic = to_lapic(this); @@ -1498,8 +1522,6 @@ void kvm_lapic_set_base(struct kvm_vcpu *vcpu, u64 value) return; } - if (!kvm_vcpu_is_bsp(apic->vcpu)) - value &= ~MSR_IA32_APICBASE_BSP; vcpu->arch.apic_base = value; /* update jump label if enable bit changes */ @@ -1572,7 +1594,7 @@ void kvm_lapic_reset(struct kvm_vcpu *vcpu) apic_set_reg(apic, APIC_TMR + 0x10 * i, 0); } apic->irr_pending = kvm_apic_vid_enabled(vcpu->kvm); - apic->isr_count = kvm_apic_vid_enabled(vcpu->kvm); + apic->isr_count = kvm_x86_ops->hwapic_isr_update ? 1 : 0; apic->highest_isr_cache = -1; update_divide_count(apic); atomic_set(&apic->lapic_timer.pending, 0); @@ -1782,7 +1804,7 @@ void kvm_apic_post_state_restore(struct kvm_vcpu *vcpu, update_divide_count(apic); start_apic_timer(apic); apic->irr_pending = true; - apic->isr_count = kvm_apic_vid_enabled(vcpu->kvm) ? + apic->isr_count = kvm_x86_ops->hwapic_isr_update ? 1 : count_vectors(apic->regs + APIC_ISR); apic->highest_isr_cache = -1; if (kvm_x86_ops->hwapic_irr_update) diff --git a/arch/x86/kvm/lapic.h b/arch/x86/kvm/lapic.h index 0bc6c656625b..9d28383fc1e7 100644 --- a/arch/x86/kvm/lapic.h +++ b/arch/x86/kvm/lapic.h @@ -1,7 +1,7 @@ #ifndef __KVM_X86_LAPIC_H #define __KVM_X86_LAPIC_H -#include "iodev.h" +#include <kvm/iodev.h> #include <linux/kvm_host.h> @@ -148,21 +148,6 @@ static inline bool kvm_apic_vid_enabled(struct kvm *kvm) return kvm_x86_ops->vm_has_apicv(kvm); } -static inline u16 apic_cluster_id(struct kvm_apic_map *map, u32 ldr) -{ - u16 cid; - ldr >>= 32 - map->ldr_bits; - cid = (ldr >> map->cid_shift) & map->cid_mask; - - return cid; -} - -static inline u16 apic_logical_id(struct kvm_apic_map *map, u32 ldr) -{ - ldr >>= (32 - map->ldr_bits); - return ldr & map->lid_mask; -} - static inline bool kvm_apic_has_events(struct kvm_vcpu *vcpu) { return vcpu->arch.apic->pending_events; diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index cee759299a35..146f295ee322 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -4465,6 +4465,79 @@ void kvm_mmu_slot_remove_write_access(struct kvm *kvm, kvm_flush_remote_tlbs(kvm); } +static bool kvm_mmu_zap_collapsible_spte(struct kvm *kvm, + unsigned long *rmapp) +{ + u64 *sptep; + struct rmap_iterator iter; + int need_tlb_flush = 0; + pfn_t pfn; + struct kvm_mmu_page *sp; + + for (sptep = rmap_get_first(*rmapp, &iter); sptep;) { + BUG_ON(!(*sptep & PT_PRESENT_MASK)); + + sp = page_header(__pa(sptep)); + pfn = spte_to_pfn(*sptep); + + /* + * Only EPT supported for now; otherwise, one would need to + * find out efficiently whether the guest page tables are + * also using huge pages. + */ + if (sp->role.direct && + !kvm_is_reserved_pfn(pfn) && + PageTransCompound(pfn_to_page(pfn))) { + drop_spte(kvm, sptep); + sptep = rmap_get_first(*rmapp, &iter); + need_tlb_flush = 1; + } else + sptep = rmap_get_next(&iter); + } + + return need_tlb_flush; +} + +void kvm_mmu_zap_collapsible_sptes(struct kvm *kvm, + struct kvm_memory_slot *memslot) +{ + bool flush = false; + unsigned long *rmapp; + unsigned long last_index, index; + gfn_t gfn_start, gfn_end; + + spin_lock(&kvm->mmu_lock); + + gfn_start = memslot->base_gfn; + gfn_end = memslot->base_gfn + memslot->npages - 1; + + if (gfn_start >= gfn_end) + goto out; + + rmapp = memslot->arch.rmap[0]; + last_index = gfn_to_index(gfn_end, memslot->base_gfn, + PT_PAGE_TABLE_LEVEL); + + for (index = 0; index <= last_index; ++index, ++rmapp) { + if (*rmapp) + flush |= kvm_mmu_zap_collapsible_spte(kvm, rmapp); + + if (need_resched() || spin_needbreak(&kvm->mmu_lock)) { + if (flush) { + kvm_flush_remote_tlbs(kvm); + flush = false; + } + cond_resched_lock(&kvm->mmu_lock); + } + } + + if (flush) + kvm_flush_remote_tlbs(kvm); + +out: + spin_unlock(&kvm->mmu_lock); +} + void kvm_mmu_slot_leaf_clear_dirty(struct kvm *kvm, struct kvm_memory_slot *memslot) { diff --git a/arch/x86/kvm/pmu.c b/arch/x86/kvm/pmu.c index 8e6b7d869d2f..29fbf9dfdc54 100644 --- a/arch/x86/kvm/pmu.c +++ b/arch/x86/kvm/pmu.c @@ -38,7 +38,7 @@ static struct kvm_arch_event_perf_mapping { }; /* mapping between fixed pmc index and arch_events array */ -int fixed_pmc_events[] = {1, 0, 7}; +static int fixed_pmc_events[] = {1, 0, 7}; static bool pmc_is_gp(struct kvm_pmc *pmc) { diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c index d319e0c24758..ce741b8650f6 100644 --- a/arch/x86/kvm/svm.c +++ b/arch/x86/kvm/svm.c @@ -1261,7 +1261,7 @@ static struct kvm_vcpu *svm_create_vcpu(struct kvm *kvm, unsigned int id) svm->vcpu.arch.apic_base = APIC_DEFAULT_PHYS_BASE | MSR_IA32_APICBASE_ENABLE; - if (kvm_vcpu_is_bsp(&svm->vcpu)) + if (kvm_vcpu_is_reset_bsp(&svm->vcpu)) svm->vcpu.arch.apic_base |= MSR_IA32_APICBASE_BSP; svm_init_osvw(&svm->vcpu); @@ -1929,14 +1929,12 @@ static int nop_on_interception(struct vcpu_svm *svm) static int halt_interception(struct vcpu_svm *svm) { svm->next_rip = kvm_rip_read(&svm->vcpu) + 1; - skip_emulated_instruction(&svm->vcpu); return kvm_emulate_halt(&svm->vcpu); } static int vmmcall_interception(struct vcpu_svm *svm) { svm->next_rip = kvm_rip_read(&svm->vcpu) + 3; - skip_emulated_instruction(&svm->vcpu); kvm_emulate_hypercall(&svm->vcpu); return 1; } @@ -2757,11 +2755,11 @@ static int invlpga_interception(struct vcpu_svm *svm) { struct kvm_vcpu *vcpu = &svm->vcpu; - trace_kvm_invlpga(svm->vmcb->save.rip, vcpu->arch.regs[VCPU_REGS_RCX], - vcpu->arch.regs[VCPU_REGS_RAX]); + trace_kvm_invlpga(svm->vmcb->save.rip, kvm_register_read(&svm->vcpu, VCPU_REGS_RCX), + kvm_register_read(&svm->vcpu, VCPU_REGS_RAX)); /* Let's treat INVLPGA the same as INVLPG (can be optimized!) */ - kvm_mmu_invlpg(vcpu, vcpu->arch.regs[VCPU_REGS_RAX]); + kvm_mmu_invlpg(vcpu, kvm_register_read(&svm->vcpu, VCPU_REGS_RAX)); svm->next_rip = kvm_rip_read(&svm->vcpu) + 3; skip_emulated_instruction(&svm->vcpu); @@ -2770,12 +2768,18 @@ static int invlpga_interception(struct vcpu_svm *svm) static int skinit_interception(struct vcpu_svm *svm) { - trace_kvm_skinit(svm->vmcb->save.rip, svm->vcpu.arch.regs[VCPU_REGS_RAX]); + trace_kvm_skinit(svm->vmcb->save.rip, kvm_register_read(&svm->vcpu, VCPU_REGS_RAX)); kvm_queue_exception(&svm->vcpu, UD_VECTOR); return 1; } +static int wbinvd_interception(struct vcpu_svm *svm) +{ + kvm_emulate_wbinvd(&svm->vcpu); + return 1; +} + static int xsetbv_interception(struct vcpu_svm *svm) { u64 new_bv = kvm_read_edx_eax(&svm->vcpu); @@ -2902,7 +2906,8 @@ static int rdpmc_interception(struct vcpu_svm *svm) return 1; } -bool check_selective_cr0_intercepted(struct vcpu_svm *svm, unsigned long val) +static bool check_selective_cr0_intercepted(struct vcpu_svm *svm, + unsigned long val) { unsigned long cr0 = svm->vcpu.arch.cr0; bool ret = false; @@ -2940,7 +2945,10 @@ static int cr_interception(struct vcpu_svm *svm) return emulate_on_interception(svm); reg = svm->vmcb->control.exit_info_1 & SVM_EXITINFO_REG_MASK; - cr = svm->vmcb->control.exit_code - SVM_EXIT_READ_CR0; + if (svm->vmcb->control.exit_code == SVM_EXIT_CR0_SEL_WRITE) + cr = SVM_EXIT_WRITE_CR0 - SVM_EXIT_READ_CR0; + else + cr = svm->vmcb->control.exit_code - SVM_EXIT_READ_CR0; err = 0; if (cr >= 16) { /* mov to cr */ @@ -3133,7 +3141,7 @@ static int svm_get_msr(struct kvm_vcpu *vcpu, unsigned ecx, u64 *data) static int rdmsr_interception(struct vcpu_svm *svm) { - u32 ecx = svm->vcpu.arch.regs[VCPU_REGS_RCX]; + u32 ecx = kvm_register_read(&svm->vcpu, VCPU_REGS_RCX); u64 data; if (svm_get_msr(&svm->vcpu, ecx, &data)) { @@ -3142,8 +3150,8 @@ static int rdmsr_interception(struct vcpu_svm *svm) } else { trace_kvm_msr_read(ecx, data); - svm->vcpu.arch.regs[VCPU_REGS_RAX] = data & 0xffffffff; - svm->vcpu.arch.regs[VCPU_REGS_RDX] = data >> 32; + kvm_register_write(&svm->vcpu, VCPU_REGS_RAX, data & 0xffffffff); + kvm_register_write(&svm->vcpu, VCPU_REGS_RDX, data >> 32); svm->next_rip = kvm_rip_read(&svm->vcpu) + 2; skip_emulated_instruction(&svm->vcpu); } @@ -3246,9 +3254,8 @@ static int svm_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr) static int wrmsr_interception(struct vcpu_svm *svm) { struct msr_data msr; - u32 ecx = svm->vcpu.arch.regs[VCPU_REGS_RCX]; - u64 data = (svm->vcpu.arch.regs[VCPU_REGS_RAX] & -1u) - | ((u64)(svm->vcpu.arch.regs[VCPU_REGS_RDX] & -1u) << 32); + u32 ecx = kvm_register_read(&svm->vcpu, VCPU_REGS_RCX); + u64 data = kvm_read_edx_eax(&svm->vcpu); msr.data = data; msr.index = ecx; @@ -3325,7 +3332,7 @@ static int (*const svm_exit_handlers[])(struct vcpu_svm *svm) = { [SVM_EXIT_READ_CR3] = cr_interception, [SVM_EXIT_READ_CR4] = cr_interception, [SVM_EXIT_READ_CR8] = cr_interception, - [SVM_EXIT_CR0_SEL_WRITE] = emulate_on_interception, + [SVM_EXIT_CR0_SEL_WRITE] = cr_interception, [SVM_EXIT_WRITE_CR0] = cr_interception, [SVM_EXIT_WRITE_CR3] = cr_interception, [SVM_EXIT_WRITE_CR4] = cr_interception, @@ -3376,7 +3383,7 @@ static int (*const svm_exit_handlers[])(struct vcpu_svm *svm) = { [SVM_EXIT_STGI] = stgi_interception, [SVM_EXIT_CLGI] = clgi_interception, [SVM_EXIT_SKINIT] = skinit_interception, - [SVM_EXIT_WBINVD] = emulate_on_interception, + [SVM_EXIT_WBINVD] = wbinvd_interception, [SVM_EXIT_MONITOR] = monitor_interception, [SVM_EXIT_MWAIT] = mwait_interception, [SVM_EXIT_XSETBV] = xsetbv_interception, @@ -3555,7 +3562,7 @@ static int handle_exit(struct kvm_vcpu *vcpu) if (exit_code >= ARRAY_SIZE(svm_exit_handlers) || !svm_exit_handlers[exit_code]) { - WARN_ONCE(1, "vmx: unexpected exit reason 0x%x\n", exit_code); + WARN_ONCE(1, "svm: unexpected exit reason 0x%x\n", exit_code); kvm_queue_exception(vcpu, UD_VECTOR); return 1; } @@ -3649,11 +3656,6 @@ static void svm_load_eoi_exitmap(struct kvm_vcpu *vcpu, u64 *eoi_exit_bitmap) return; } -static void svm_hwapic_isr_update(struct kvm *kvm, int isr) -{ - return; -} - static void svm_sync_pir_to_irr(struct kvm_vcpu *vcpu) { return; @@ -4403,7 +4405,6 @@ static struct kvm_x86_ops svm_x86_ops = { .set_virtual_x2apic_mode = svm_set_virtual_x2apic_mode, .vm_has_apicv = svm_vm_has_apicv, .load_eoi_exitmap = svm_load_eoi_exitmap, - .hwapic_isr_update = svm_hwapic_isr_update, .sync_pir_to_irr = svm_sync_pir_to_irr, .set_tss_addr = svm_set_tss_addr, diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index 14c1a18d206a..f5e8dce8046c 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -2168,7 +2168,10 @@ static void vmx_set_msr_bitmap(struct kvm_vcpu *vcpu) { unsigned long *msr_bitmap; - if (irqchip_in_kernel(vcpu->kvm) && apic_x2apic_mode(vcpu->arch.apic)) { + if (is_guest_mode(vcpu)) + msr_bitmap = vmx_msr_bitmap_nested; + else if (irqchip_in_kernel(vcpu->kvm) && + apic_x2apic_mode(vcpu->arch.apic)) { if (is_long_mode(vcpu)) msr_bitmap = vmx_msr_bitmap_longmode_x2apic; else @@ -2467,6 +2470,7 @@ static void nested_vmx_setup_ctls_msrs(struct vcpu_vmx *vmx) vmx->nested.nested_vmx_secondary_ctls_low = 0; vmx->nested.nested_vmx_secondary_ctls_high &= SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES | + SECONDARY_EXEC_RDTSCP | SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE | SECONDARY_EXEC_APIC_REGISTER_VIRT | SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY | @@ -2476,8 +2480,7 @@ static void nested_vmx_setup_ctls_msrs(struct vcpu_vmx *vmx) if (enable_ept) { /* nested EPT: emulate EPT also to L1 */ vmx->nested.nested_vmx_secondary_ctls_high |= - SECONDARY_EXEC_ENABLE_EPT | - SECONDARY_EXEC_UNRESTRICTED_GUEST; + SECONDARY_EXEC_ENABLE_EPT; vmx->nested.nested_vmx_ept_caps = VMX_EPT_PAGE_WALK_4_BIT | VMX_EPTP_WB_BIT | VMX_EPT_2MB_PAGE_BIT | VMX_EPT_INVEPT_BIT; @@ -2491,6 +2494,10 @@ static void nested_vmx_setup_ctls_msrs(struct vcpu_vmx *vmx) } else vmx->nested.nested_vmx_ept_caps = 0; + if (enable_unrestricted_guest) + vmx->nested.nested_vmx_secondary_ctls_high |= + SECONDARY_EXEC_UNRESTRICTED_GUEST; + /* miscellaneous data */ rdmsr(MSR_IA32_VMX_MISC, vmx->nested.nested_vmx_misc_low, @@ -3262,8 +3269,8 @@ static void fix_pmode_seg(struct kvm_vcpu *vcpu, int seg, * default value. */ if (seg == VCPU_SREG_CS || seg == VCPU_SREG_SS) - save->selector &= ~SELECTOR_RPL_MASK; - save->dpl = save->selector & SELECTOR_RPL_MASK; + save->selector &= ~SEGMENT_RPL_MASK; + save->dpl = save->selector & SEGMENT_RPL_MASK; save->s = 1; } vmx_set_segment(vcpu, save, seg); @@ -3836,7 +3843,7 @@ static bool code_segment_valid(struct kvm_vcpu *vcpu) unsigned int cs_rpl; vmx_get_segment(vcpu, &cs, VCPU_SREG_CS); - cs_rpl = cs.selector & SELECTOR_RPL_MASK; + cs_rpl = cs.selector & SEGMENT_RPL_MASK; if (cs.unusable) return false; @@ -3864,7 +3871,7 @@ static bool stack_segment_valid(struct kvm_vcpu *vcpu) unsigned int ss_rpl; vmx_get_segment(vcpu, &ss, VCPU_SREG_SS); - ss_rpl = ss.selector & SELECTOR_RPL_MASK; + ss_rpl = ss.selector & SEGMENT_RPL_MASK; if (ss.unusable) return true; @@ -3886,7 +3893,7 @@ static bool data_segment_valid(struct kvm_vcpu *vcpu, int seg) unsigned int rpl; vmx_get_segment(vcpu, &var, seg); - rpl = var.selector & SELECTOR_RPL_MASK; + rpl = var.selector & SEGMENT_RPL_MASK; if (var.unusable) return true; @@ -3913,7 +3920,7 @@ static bool tr_valid(struct kvm_vcpu *vcpu) if (tr.unusable) return false; - if (tr.selector & SELECTOR_TI_MASK) /* TI = 1 */ + if (tr.selector & SEGMENT_TI_MASK) /* TI = 1 */ return false; if (tr.type != 3 && tr.type != 11) /* TODO: Check if guest is in IA32e mode */ return false; @@ -3931,7 +3938,7 @@ static bool ldtr_valid(struct kvm_vcpu *vcpu) if (ldtr.unusable) return true; - if (ldtr.selector & SELECTOR_TI_MASK) /* TI = 1 */ + if (ldtr.selector & SEGMENT_TI_MASK) /* TI = 1 */ return false; if (ldtr.type != 2) return false; @@ -3948,8 +3955,8 @@ static bool cs_ss_rpl_check(struct kvm_vcpu *vcpu) vmx_get_segment(vcpu, &cs, VCPU_SREG_CS); vmx_get_segment(vcpu, &ss, VCPU_SREG_SS); - return ((cs.selector & SELECTOR_RPL_MASK) == - (ss.selector & SELECTOR_RPL_MASK)); + return ((cs.selector & SEGMENT_RPL_MASK) == + (ss.selector & SEGMENT_RPL_MASK)); } /* @@ -4367,6 +4374,18 @@ static int vmx_complete_nested_posted_interrupt(struct kvm_vcpu *vcpu) return 0; } +static inline bool kvm_vcpu_trigger_posted_interrupt(struct kvm_vcpu *vcpu) +{ +#ifdef CONFIG_SMP + if (vcpu->mode == IN_GUEST_MODE) { + apic->send_IPI_mask(get_cpu_mask(vcpu->cpu), + POSTED_INTR_VECTOR); + return true; + } +#endif + return false; +} + static int vmx_deliver_nested_posted_interrupt(struct kvm_vcpu *vcpu, int vector) { @@ -4375,9 +4394,7 @@ static int vmx_deliver_nested_posted_interrupt(struct kvm_vcpu *vcpu, if (is_guest_mode(vcpu) && vector == vmx->nested.posted_intr_nv) { /* the PIR and ON have been set by L1. */ - if (vcpu->mode == IN_GUEST_MODE) - apic->send_IPI_mask(get_cpu_mask(vcpu->cpu), - POSTED_INTR_VECTOR); + kvm_vcpu_trigger_posted_interrupt(vcpu); /* * If a posted intr is not recognized by hardware, * we will accomplish it in the next vmentry. @@ -4409,12 +4426,7 @@ static void vmx_deliver_posted_interrupt(struct kvm_vcpu *vcpu, int vector) r = pi_test_and_set_on(&vmx->pi_desc); kvm_make_request(KVM_REQ_EVENT, vcpu); -#ifdef CONFIG_SMP - if (!r && (vcpu->mode == IN_GUEST_MODE)) - apic->send_IPI_mask(get_cpu_mask(vcpu->cpu), - POSTED_INTR_VECTOR); - else -#endif + if (r || !kvm_vcpu_trigger_posted_interrupt(vcpu)) kvm_vcpu_kick(vcpu); } @@ -4700,7 +4712,7 @@ static void vmx_vcpu_reset(struct kvm_vcpu *vcpu) vmx->vcpu.arch.regs[VCPU_REGS_RDX] = get_rdx_init_val(); kvm_set_cr8(&vmx->vcpu, 0); apic_base_msr.data = APIC_DEFAULT_PHYS_BASE | MSR_IA32_APICBASE_ENABLE; - if (kvm_vcpu_is_bsp(&vmx->vcpu)) + if (kvm_vcpu_is_reset_bsp(&vmx->vcpu)) apic_base_msr.data |= MSR_IA32_APICBASE_BSP; apic_base_msr.host_initiated = true; kvm_set_apic_base(&vmx->vcpu, &apic_base_msr); @@ -4995,7 +5007,7 @@ static int handle_rmode_exception(struct kvm_vcpu *vcpu, if (emulate_instruction(vcpu, 0) == EMULATE_DONE) { if (vcpu->arch.halt_request) { vcpu->arch.halt_request = 0; - return kvm_emulate_halt(vcpu); + return kvm_vcpu_halt(vcpu); } return 1; } @@ -5060,6 +5072,10 @@ static int handle_exception(struct kvm_vcpu *vcpu) } if (is_invalid_opcode(intr_info)) { + if (is_guest_mode(vcpu)) { + kvm_queue_exception(vcpu, UD_VECTOR); + return 1; + } er = emulate_instruction(vcpu, EMULTYPE_TRAP_UD); if (er != EMULATE_DONE) kvm_queue_exception(vcpu, UD_VECTOR); @@ -5079,9 +5095,10 @@ static int handle_exception(struct kvm_vcpu *vcpu) !(is_page_fault(intr_info) && !(error_code & PFERR_RSVD_MASK))) { vcpu->run->exit_reason = KVM_EXIT_INTERNAL_ERROR; vcpu->run->internal.suberror = KVM_INTERNAL_ERROR_SIMUL_EX; - vcpu->run->internal.ndata = 2; + vcpu->run->internal.ndata = 3; vcpu->run->internal.data[0] = vect_info; vcpu->run->internal.data[1] = intr_info; + vcpu->run->internal.data[2] = error_code; return 0; } @@ -5522,13 +5539,11 @@ static int handle_interrupt_window(struct kvm_vcpu *vcpu) static int handle_halt(struct kvm_vcpu *vcpu) { - skip_emulated_instruction(vcpu); return kvm_emulate_halt(vcpu); } static int handle_vmcall(struct kvm_vcpu *vcpu) { - skip_emulated_instruction(vcpu); kvm_emulate_hypercall(vcpu); return 1; } @@ -5559,7 +5574,6 @@ static int handle_rdpmc(struct kvm_vcpu *vcpu) static int handle_wbinvd(struct kvm_vcpu *vcpu) { - skip_emulated_instruction(vcpu); kvm_emulate_wbinvd(vcpu); return 1; } @@ -5817,7 +5831,7 @@ static int handle_ept_misconfig(struct kvm_vcpu *vcpu) gpa_t gpa; gpa = vmcs_read64(GUEST_PHYSICAL_ADDRESS); - if (!kvm_io_bus_write(vcpu->kvm, KVM_FAST_MMIO_BUS, gpa, 0, NULL)) { + if (!kvm_io_bus_write(vcpu, KVM_FAST_MMIO_BUS, gpa, 0, NULL)) { skip_emulated_instruction(vcpu); return 1; } @@ -5898,7 +5912,7 @@ static int handle_invalid_guest_state(struct kvm_vcpu *vcpu) if (vcpu->arch.halt_request) { vcpu->arch.halt_request = 0; - ret = kvm_emulate_halt(vcpu); + ret = kvm_vcpu_halt(vcpu); goto out; } @@ -7307,21 +7321,21 @@ static bool nested_vmx_exit_handled_io(struct kvm_vcpu *vcpu, else if (port < 0x10000) bitmap = vmcs12->io_bitmap_b; else - return 1; + return true; bitmap += (port & 0x7fff) / 8; if (last_bitmap != bitmap) if (kvm_read_guest(vcpu->kvm, bitmap, &b, 1)) - return 1; + return true; if (b & (1 << (port & 7))) - return 1; + return true; port++; size--; last_bitmap = bitmap; } - return 0; + return false; } /* @@ -7337,7 +7351,7 @@ static bool nested_vmx_exit_handled_msr(struct kvm_vcpu *vcpu, gpa_t bitmap; if (!nested_cpu_has(vmcs12, CPU_BASED_USE_MSR_BITMAPS)) - return 1; + return true; /* * The MSR_BITMAP page is divided into four 1024-byte bitmaps, @@ -7356,10 +7370,10 @@ static bool nested_vmx_exit_handled_msr(struct kvm_vcpu *vcpu, if (msr_index < 1024*8) { unsigned char b; if (kvm_read_guest(vcpu->kvm, bitmap + msr_index/8, &b, 1)) - return 1; + return true; return 1 & (b >> (msr_index & 7)); } else - return 1; /* let L1 handle the wrong parameter */ + return true; /* let L1 handle the wrong parameter */ } /* @@ -7381,7 +7395,7 @@ static bool nested_vmx_exit_handled_cr(struct kvm_vcpu *vcpu, case 0: if (vmcs12->cr0_guest_host_mask & (val ^ vmcs12->cr0_read_shadow)) - return 1; + return true; break; case 3: if ((vmcs12->cr3_target_count >= 1 && @@ -7392,37 +7406,37 @@ static bool nested_vmx_exit_handled_cr(struct kvm_vcpu *vcpu, vmcs12->cr3_target_value2 == val) || (vmcs12->cr3_target_count >= 4 && vmcs12->cr3_target_value3 == val)) - return 0; + return false; if (nested_cpu_has(vmcs12, CPU_BASED_CR3_LOAD_EXITING)) - return 1; + return true; break; case 4: if (vmcs12->cr4_guest_host_mask & (vmcs12->cr4_read_shadow ^ val)) - return 1; + return true; break; case 8: if (nested_cpu_has(vmcs12, CPU_BASED_CR8_LOAD_EXITING)) - return 1; + return true; break; } break; case 2: /* clts */ if ((vmcs12->cr0_guest_host_mask & X86_CR0_TS) && (vmcs12->cr0_read_shadow & X86_CR0_TS)) - return 1; + return true; break; case 1: /* mov from cr */ switch (cr) { case 3: if (vmcs12->cpu_based_vm_exec_control & CPU_BASED_CR3_STORE_EXITING) - return 1; + return true; break; case 8: if (vmcs12->cpu_based_vm_exec_control & CPU_BASED_CR8_STORE_EXITING) - return 1; + return true; break; } break; @@ -7433,14 +7447,14 @@ static bool nested_vmx_exit_handled_cr(struct kvm_vcpu *vcpu, */ if (vmcs12->cr0_guest_host_mask & 0xe & (val ^ vmcs12->cr0_read_shadow)) - return 1; + return true; if ((vmcs12->cr0_guest_host_mask & 0x1) && !(vmcs12->cr0_read_shadow & 0x1) && (val & 0x1)) - return 1; + return true; break; } - return 0; + return false; } /* @@ -7463,48 +7477,48 @@ static bool nested_vmx_exit_handled(struct kvm_vcpu *vcpu) KVM_ISA_VMX); if (vmx->nested.nested_run_pending) - return 0; + return false; if (unlikely(vmx->fail)) { pr_info_ratelimited("%s failed vm entry %x\n", __func__, vmcs_read32(VM_INSTRUCTION_ERROR)); - return 1; + return true; } switch (exit_reason) { case EXIT_REASON_EXCEPTION_NMI: if (!is_exception(intr_info)) - return 0; + return false; else if (is_page_fault(intr_info)) return enable_ept; else if (is_no_device(intr_info) && !(vmcs12->guest_cr0 & X86_CR0_TS)) - return 0; + return false; return vmcs12->exception_bitmap & (1u << (intr_info & INTR_INFO_VECTOR_MASK)); case EXIT_REASON_EXTERNAL_INTERRUPT: - return 0; + return false; case EXIT_REASON_TRIPLE_FAULT: - return 1; + return true; case EXIT_REASON_PENDING_INTERRUPT: return nested_cpu_has(vmcs12, CPU_BASED_VIRTUAL_INTR_PENDING); case EXIT_REASON_NMI_WINDOW: return nested_cpu_has(vmcs12, CPU_BASED_VIRTUAL_NMI_PENDING); case EXIT_REASON_TASK_SWITCH: - return 1; + return true; case EXIT_REASON_CPUID: if (kvm_register_read(vcpu, VCPU_REGS_RAX) == 0xa) - return 0; - return 1; + return false; + return true; case EXIT_REASON_HLT: return nested_cpu_has(vmcs12, CPU_BASED_HLT_EXITING); case EXIT_REASON_INVD: - return 1; + return true; case EXIT_REASON_INVLPG: return nested_cpu_has(vmcs12, CPU_BASED_INVLPG_EXITING); case EXIT_REASON_RDPMC: return nested_cpu_has(vmcs12, CPU_BASED_RDPMC_EXITING); - case EXIT_REASON_RDTSC: + case EXIT_REASON_RDTSC: case EXIT_REASON_RDTSCP: return nested_cpu_has(vmcs12, CPU_BASED_RDTSC_EXITING); case EXIT_REASON_VMCALL: case EXIT_REASON_VMCLEAR: case EXIT_REASON_VMLAUNCH: case EXIT_REASON_VMPTRLD: @@ -7516,7 +7530,7 @@ static bool nested_vmx_exit_handled(struct kvm_vcpu *vcpu) * VMX instructions trap unconditionally. This allows L1 to * emulate them for its L2 guest, i.e., allows 3-level nesting! */ - return 1; + return true; case EXIT_REASON_CR_ACCESS: return nested_vmx_exit_handled_cr(vcpu, vmcs12); case EXIT_REASON_DR_ACCESS: @@ -7527,7 +7541,7 @@ static bool nested_vmx_exit_handled(struct kvm_vcpu *vcpu) case EXIT_REASON_MSR_WRITE: return nested_vmx_exit_handled_msr(vcpu, vmcs12, exit_reason); case EXIT_REASON_INVALID_STATE: - return 1; + return true; case EXIT_REASON_MWAIT_INSTRUCTION: return nested_cpu_has(vmcs12, CPU_BASED_MWAIT_EXITING); case EXIT_REASON_MONITOR_INSTRUCTION: @@ -7537,7 +7551,7 @@ static bool nested_vmx_exit_handled(struct kvm_vcpu *vcpu) nested_cpu_has2(vmcs12, SECONDARY_EXEC_PAUSE_LOOP_EXITING); case EXIT_REASON_MCE_DURING_VMENTRY: - return 0; + return false; case EXIT_REASON_TPR_BELOW_THRESHOLD: return nested_cpu_has(vmcs12, CPU_BASED_TPR_SHADOW); case EXIT_REASON_APIC_ACCESS: @@ -7546,7 +7560,7 @@ static bool nested_vmx_exit_handled(struct kvm_vcpu *vcpu) case EXIT_REASON_APIC_WRITE: case EXIT_REASON_EOI_INDUCED: /* apic_write and eoi_induced should exit unconditionally. */ - return 1; + return true; case EXIT_REASON_EPT_VIOLATION: /* * L0 always deals with the EPT violation. If nested EPT is @@ -7554,7 +7568,7 @@ static bool nested_vmx_exit_handled(struct kvm_vcpu *vcpu) * missing in the guest EPT table (EPT12), the EPT violation * will be injected with nested_ept_inject_page_fault() */ - return 0; + return false; case EXIT_REASON_EPT_MISCONFIG: /* * L2 never uses directly L1's EPT, but rather L0's own EPT @@ -7562,11 +7576,11 @@ static bool nested_vmx_exit_handled(struct kvm_vcpu *vcpu) * (EPT on EPT). So any problems with the structure of the * table is L0's fault. */ - return 0; + return false; case EXIT_REASON_WBINVD: return nested_cpu_has2(vmcs12, SECONDARY_EXEC_WBINVD_EXITING); case EXIT_REASON_XSETBV: - return 1; + return true; case EXIT_REASON_XSAVES: case EXIT_REASON_XRSTORS: /* * This should never happen, since it is not possible to @@ -7576,7 +7590,7 @@ static bool nested_vmx_exit_handled(struct kvm_vcpu *vcpu) */ return nested_cpu_has2(vmcs12, SECONDARY_EXEC_XSAVES); default: - return 1; + return true; } } @@ -8511,6 +8525,9 @@ static void vmx_cpuid_update(struct kvm_vcpu *vcpu) exec_control); } } + if (nested && !vmx->rdtscp_enabled) + vmx->nested.nested_vmx_secondary_ctls_high &= + ~SECONDARY_EXEC_RDTSCP; } /* Exposing INVPCID only when PCID is exposed */ @@ -8611,10 +8628,11 @@ static bool nested_get_vmcs12_pages(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12) { struct vcpu_vmx *vmx = to_vmx(vcpu); + int maxphyaddr = cpuid_maxphyaddr(vcpu); if (nested_cpu_has2(vmcs12, SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES)) { - /* TODO: Also verify bits beyond physical address width are 0 */ - if (!PAGE_ALIGNED(vmcs12->apic_access_addr)) + if (!PAGE_ALIGNED(vmcs12->apic_access_addr) || + vmcs12->apic_access_addr >> maxphyaddr) return false; /* @@ -8630,8 +8648,8 @@ static bool nested_get_vmcs12_pages(struct kvm_vcpu *vcpu, } if (nested_cpu_has(vmcs12, CPU_BASED_TPR_SHADOW)) { - /* TODO: Also verify bits beyond physical address width are 0 */ - if (!PAGE_ALIGNED(vmcs12->virtual_apic_page_addr)) + if (!PAGE_ALIGNED(vmcs12->virtual_apic_page_addr) || + vmcs12->virtual_apic_page_addr >> maxphyaddr) return false; if (vmx->nested.virtual_apic_page) /* shouldn't happen */ @@ -8654,7 +8672,8 @@ static bool nested_get_vmcs12_pages(struct kvm_vcpu *vcpu, } if (nested_cpu_has_posted_intr(vmcs12)) { - if (!IS_ALIGNED(vmcs12->posted_intr_desc_addr, 64)) + if (!IS_ALIGNED(vmcs12->posted_intr_desc_addr, 64) || + vmcs12->posted_intr_desc_addr >> maxphyaddr) return false; if (vmx->nested.pi_desc_page) { /* shouldn't happen */ @@ -8853,9 +8872,9 @@ static int nested_vmx_check_apicv_controls(struct kvm_vcpu *vcpu, static int nested_vmx_check_msr_switch(struct kvm_vcpu *vcpu, unsigned long count_field, - unsigned long addr_field, - int maxphyaddr) + unsigned long addr_field) { + int maxphyaddr; u64 count, addr; if (vmcs12_read_any(vcpu, count_field, &count) || @@ -8865,6 +8884,7 @@ static int nested_vmx_check_msr_switch(struct kvm_vcpu *vcpu, } if (count == 0) return 0; + maxphyaddr = cpuid_maxphyaddr(vcpu); if (!IS_ALIGNED(addr, 16) || addr >> maxphyaddr || (addr + count * sizeof(struct vmx_msr_entry) - 1) >> maxphyaddr) { pr_warn_ratelimited( @@ -8878,19 +8898,16 @@ static int nested_vmx_check_msr_switch(struct kvm_vcpu *vcpu, static int nested_vmx_check_msr_switch_controls(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12) { - int maxphyaddr; - if (vmcs12->vm_exit_msr_load_count == 0 && vmcs12->vm_exit_msr_store_count == 0 && vmcs12->vm_entry_msr_load_count == 0) return 0; /* Fast path */ - maxphyaddr = cpuid_maxphyaddr(vcpu); if (nested_vmx_check_msr_switch(vcpu, VM_EXIT_MSR_LOAD_COUNT, - VM_EXIT_MSR_LOAD_ADDR, maxphyaddr) || + VM_EXIT_MSR_LOAD_ADDR) || nested_vmx_check_msr_switch(vcpu, VM_EXIT_MSR_STORE_COUNT, - VM_EXIT_MSR_STORE_ADDR, maxphyaddr) || + VM_EXIT_MSR_STORE_ADDR) || nested_vmx_check_msr_switch(vcpu, VM_ENTRY_MSR_LOAD_COUNT, - VM_ENTRY_MSR_LOAD_ADDR, maxphyaddr)) + VM_ENTRY_MSR_LOAD_ADDR)) return -EINVAL; return 0; } @@ -9140,8 +9157,9 @@ static void prepare_vmcs02(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12) exec_control &= ~SECONDARY_EXEC_RDTSCP; /* Take the following fields only from vmcs12 */ exec_control &= ~(SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES | + SECONDARY_EXEC_RDTSCP | SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY | - SECONDARY_EXEC_APIC_REGISTER_VIRT); + SECONDARY_EXEC_APIC_REGISTER_VIRT); if (nested_cpu_has(vmcs12, CPU_BASED_ACTIVATE_SECONDARY_CONTROLS)) exec_control |= vmcs12->secondary_vm_exec_control; @@ -9213,9 +9231,9 @@ static void prepare_vmcs02(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12) } if (cpu_has_vmx_msr_bitmap() && - exec_control & CPU_BASED_USE_MSR_BITMAPS && - nested_vmx_merge_msr_bitmap(vcpu, vmcs12)) { - vmcs_write64(MSR_BITMAP, __pa(vmx_msr_bitmap_nested)); + exec_control & CPU_BASED_USE_MSR_BITMAPS) { + nested_vmx_merge_msr_bitmap(vcpu, vmcs12); + /* MSR_BITMAP will be set by following vmx_set_efer. */ } else exec_control &= ~CPU_BASED_USE_MSR_BITMAPS; @@ -9374,7 +9392,6 @@ static int nested_vmx_run(struct kvm_vcpu *vcpu, bool launch) } if (!nested_get_vmcs12_pages(vcpu, vmcs12)) { - /*TODO: Also verify bits beyond physical address width are 0*/ nested_vmx_failValid(vcpu, VMXERR_ENTRY_INVALID_CONTROL_FIELD); return 1; } @@ -9513,7 +9530,7 @@ static int nested_vmx_run(struct kvm_vcpu *vcpu, bool launch) vmcs12->launch_state = 1; if (vmcs12->guest_activity_state == GUEST_ACTIVITY_HLT) - return kvm_emulate_halt(vcpu); + return kvm_vcpu_halt(vcpu); vmx->nested.nested_run_pending = 1; diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index bd7a70be41b3..e1a81267f3f6 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -801,6 +801,17 @@ unsigned long kvm_get_cr8(struct kvm_vcpu *vcpu) } EXPORT_SYMBOL_GPL(kvm_get_cr8); +static void kvm_update_dr0123(struct kvm_vcpu *vcpu) +{ + int i; + + if (!(vcpu->guest_debug & KVM_GUESTDBG_USE_HW_BP)) { + for (i = 0; i < KVM_NR_DB_REGS; i++) + vcpu->arch.eff_db[i] = vcpu->arch.db[i]; + vcpu->arch.switch_db_regs |= KVM_DEBUGREG_RELOAD; + } +} + static void kvm_update_dr6(struct kvm_vcpu *vcpu) { if (!(vcpu->guest_debug & KVM_GUESTDBG_USE_HW_BP)) @@ -1070,19 +1081,19 @@ static void update_pvclock_gtod(struct timekeeper *tk) struct pvclock_gtod_data *vdata = &pvclock_gtod_data; u64 boot_ns; - boot_ns = ktime_to_ns(ktime_add(tk->tkr.base_mono, tk->offs_boot)); + boot_ns = ktime_to_ns(ktime_add(tk->tkr_mono.base, tk->offs_boot)); write_seqcount_begin(&vdata->seq); /* copy pvclock gtod data */ - vdata->clock.vclock_mode = tk->tkr.clock->archdata.vclock_mode; - vdata->clock.cycle_last = tk->tkr.cycle_last; - vdata->clock.mask = tk->tkr.mask; - vdata->clock.mult = tk->tkr.mult; - vdata->clock.shift = tk->tkr.shift; + vdata->clock.vclock_mode = tk->tkr_mono.clock->archdata.vclock_mode; + vdata->clock.cycle_last = tk->tkr_mono.cycle_last; + vdata->clock.mask = tk->tkr_mono.mask; + vdata->clock.mult = tk->tkr_mono.mult; + vdata->clock.shift = tk->tkr_mono.shift; vdata->boot_ns = boot_ns; - vdata->nsec_base = tk->tkr.xtime_nsec; + vdata->nsec_base = tk->tkr_mono.xtime_nsec; write_seqcount_end(&vdata->seq); } @@ -2744,7 +2755,6 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext) case KVM_CAP_USER_NMI: case KVM_CAP_REINJECT_CONTROL: case KVM_CAP_IRQ_INJECT_STATUS: - case KVM_CAP_IRQFD: case KVM_CAP_IOEVENTFD: case KVM_CAP_IOEVENTFD_NO_LENGTH: case KVM_CAP_PIT2: @@ -3150,6 +3160,7 @@ static int kvm_vcpu_ioctl_x86_set_debugregs(struct kvm_vcpu *vcpu, return -EINVAL; memcpy(vcpu->arch.db, dbgregs->db, sizeof(vcpu->arch.db)); + kvm_update_dr0123(vcpu); vcpu->arch.dr6 = dbgregs->dr6; kvm_update_dr6(vcpu); vcpu->arch.dr7 = dbgregs->dr7; @@ -4115,8 +4126,8 @@ static int vcpu_mmio_write(struct kvm_vcpu *vcpu, gpa_t addr, int len, do { n = min(len, 8); if (!(vcpu->arch.apic && - !kvm_iodevice_write(&vcpu->arch.apic->dev, addr, n, v)) - && kvm_io_bus_write(vcpu->kvm, KVM_MMIO_BUS, addr, n, v)) + !kvm_iodevice_write(vcpu, &vcpu->arch.apic->dev, addr, n, v)) + && kvm_io_bus_write(vcpu, KVM_MMIO_BUS, addr, n, v)) break; handled += n; addr += n; @@ -4135,8 +4146,9 @@ static int vcpu_mmio_read(struct kvm_vcpu *vcpu, gpa_t addr, int len, void *v) do { n = min(len, 8); if (!(vcpu->arch.apic && - !kvm_iodevice_read(&vcpu->arch.apic->dev, addr, n, v)) - && kvm_io_bus_read(vcpu->kvm, KVM_MMIO_BUS, addr, n, v)) + !kvm_iodevice_read(vcpu, &vcpu->arch.apic->dev, + addr, n, v)) + && kvm_io_bus_read(vcpu, KVM_MMIO_BUS, addr, n, v)) break; trace_kvm_mmio(KVM_TRACE_MMIO_READ, n, addr, *(u64 *)v); handled += n; @@ -4476,7 +4488,8 @@ mmio: return X86EMUL_CONTINUE; } -int emulator_read_write(struct x86_emulate_ctxt *ctxt, unsigned long addr, +static int emulator_read_write(struct x86_emulate_ctxt *ctxt, + unsigned long addr, void *val, unsigned int bytes, struct x86_exception *exception, const struct read_write_emulator_ops *ops) @@ -4539,7 +4552,7 @@ static int emulator_read_emulated(struct x86_emulate_ctxt *ctxt, exception, &read_emultor); } -int emulator_write_emulated(struct x86_emulate_ctxt *ctxt, +static int emulator_write_emulated(struct x86_emulate_ctxt *ctxt, unsigned long addr, const void *val, unsigned int bytes, @@ -4630,10 +4643,10 @@ static int kernel_pio(struct kvm_vcpu *vcpu, void *pd) int r; if (vcpu->arch.pio.in) - r = kvm_io_bus_read(vcpu->kvm, KVM_PIO_BUS, vcpu->arch.pio.port, + r = kvm_io_bus_read(vcpu, KVM_PIO_BUS, vcpu->arch.pio.port, vcpu->arch.pio.size, pd); else - r = kvm_io_bus_write(vcpu->kvm, KVM_PIO_BUS, + r = kvm_io_bus_write(vcpu, KVM_PIO_BUS, vcpu->arch.pio.port, vcpu->arch.pio.size, pd); return r; @@ -4706,7 +4719,7 @@ static void emulator_invlpg(struct x86_emulate_ctxt *ctxt, ulong address) kvm_mmu_invlpg(emul_to_vcpu(ctxt), address); } -int kvm_emulate_wbinvd(struct kvm_vcpu *vcpu) +int kvm_emulate_wbinvd_noskip(struct kvm_vcpu *vcpu) { if (!need_emulate_wbinvd(vcpu)) return X86EMUL_CONTINUE; @@ -4723,19 +4736,29 @@ int kvm_emulate_wbinvd(struct kvm_vcpu *vcpu) wbinvd(); return X86EMUL_CONTINUE; } + +int kvm_emulate_wbinvd(struct kvm_vcpu *vcpu) +{ + kvm_x86_ops->skip_emulated_instruction(vcpu); + return kvm_emulate_wbinvd_noskip(vcpu); +} EXPORT_SYMBOL_GPL(kvm_emulate_wbinvd); + + static void emulator_wbinvd(struct x86_emulate_ctxt *ctxt) { - kvm_emulate_wbinvd(emul_to_vcpu(ctxt)); + kvm_emulate_wbinvd_noskip(emul_to_vcpu(ctxt)); } -int emulator_get_dr(struct x86_emulate_ctxt *ctxt, int dr, unsigned long *dest) +static int emulator_get_dr(struct x86_emulate_ctxt *ctxt, int dr, + unsigned long *dest) { return kvm_get_dr(emul_to_vcpu(ctxt), dr, dest); } -int emulator_set_dr(struct x86_emulate_ctxt *ctxt, int dr, unsigned long value) +static int emulator_set_dr(struct x86_emulate_ctxt *ctxt, int dr, + unsigned long value) { return __kvm_set_dr(emul_to_vcpu(ctxt), dr, value); @@ -5817,7 +5840,7 @@ void kvm_arch_exit(void) free_percpu(shared_msrs); } -int kvm_emulate_halt(struct kvm_vcpu *vcpu) +int kvm_vcpu_halt(struct kvm_vcpu *vcpu) { ++vcpu->stat.halt_exits; if (irqchip_in_kernel(vcpu->kvm)) { @@ -5828,6 +5851,13 @@ int kvm_emulate_halt(struct kvm_vcpu *vcpu) return 0; } } +EXPORT_SYMBOL_GPL(kvm_vcpu_halt); + +int kvm_emulate_halt(struct kvm_vcpu *vcpu) +{ + kvm_x86_ops->skip_emulated_instruction(vcpu); + return kvm_vcpu_halt(vcpu); +} EXPORT_SYMBOL_GPL(kvm_emulate_halt); int kvm_hv_hypercall(struct kvm_vcpu *vcpu) @@ -5904,7 +5934,7 @@ static void kvm_pv_kick_cpu_op(struct kvm *kvm, unsigned long flags, int apicid) lapic_irq.dest_id = apicid; lapic_irq.delivery_mode = APIC_DM_REMRD; - kvm_irq_delivery_to_apic(kvm, 0, &lapic_irq, NULL); + kvm_irq_delivery_to_apic(kvm, NULL, &lapic_irq, NULL); } int kvm_emulate_hypercall(struct kvm_vcpu *vcpu) @@ -5912,6 +5942,8 @@ int kvm_emulate_hypercall(struct kvm_vcpu *vcpu) unsigned long nr, a0, a1, a2, a3, ret; int op_64_bit, r = 1; + kvm_x86_ops->skip_emulated_instruction(vcpu); + if (kvm_hv_hypercall_enabled(vcpu->kvm)) return kvm_hv_hypercall(vcpu); @@ -6165,7 +6197,7 @@ void kvm_arch_mmu_notifier_invalidate_page(struct kvm *kvm, } /* - * Returns 1 to let __vcpu_run() continue the guest execution loop without + * Returns 1 to let vcpu_run() continue the guest execution loop without * exiting to the userspace. Otherwise, the value will be returned to the * userspace. */ @@ -6302,6 +6334,7 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu) set_debugreg(vcpu->arch.eff_db[2], 2); set_debugreg(vcpu->arch.eff_db[3], 3); set_debugreg(vcpu->arch.dr6, 6); + vcpu->arch.switch_db_regs &= ~KVM_DEBUGREG_RELOAD; } trace_kvm_entry(vcpu->vcpu_id); @@ -6383,42 +6416,47 @@ out: return r; } +static inline int vcpu_block(struct kvm *kvm, struct kvm_vcpu *vcpu) +{ + if (!kvm_arch_vcpu_runnable(vcpu)) { + srcu_read_unlock(&kvm->srcu, vcpu->srcu_idx); + kvm_vcpu_block(vcpu); + vcpu->srcu_idx = srcu_read_lock(&kvm->srcu); + if (!kvm_check_request(KVM_REQ_UNHALT, vcpu)) + return 1; + } + + kvm_apic_accept_events(vcpu); + switch(vcpu->arch.mp_state) { + case KVM_MP_STATE_HALTED: + vcpu->arch.pv.pv_unhalted = false; + vcpu->arch.mp_state = + KVM_MP_STATE_RUNNABLE; + case KVM_MP_STATE_RUNNABLE: + vcpu->arch.apf.halted = false; + break; + case KVM_MP_STATE_INIT_RECEIVED: + break; + default: + return -EINTR; + break; + } + return 1; +} -static int __vcpu_run(struct kvm_vcpu *vcpu) +static int vcpu_run(struct kvm_vcpu *vcpu) { int r; struct kvm *kvm = vcpu->kvm; vcpu->srcu_idx = srcu_read_lock(&kvm->srcu); - r = 1; - while (r > 0) { + for (;;) { if (vcpu->arch.mp_state == KVM_MP_STATE_RUNNABLE && !vcpu->arch.apf.halted) r = vcpu_enter_guest(vcpu); - else { - srcu_read_unlock(&kvm->srcu, vcpu->srcu_idx); - kvm_vcpu_block(vcpu); - vcpu->srcu_idx = srcu_read_lock(&kvm->srcu); - if (kvm_check_request(KVM_REQ_UNHALT, vcpu)) { - kvm_apic_accept_events(vcpu); - switch(vcpu->arch.mp_state) { - case KVM_MP_STATE_HALTED: - vcpu->arch.pv.pv_unhalted = false; - vcpu->arch.mp_state = - KVM_MP_STATE_RUNNABLE; - case KVM_MP_STATE_RUNNABLE: - vcpu->arch.apf.halted = false; - break; - case KVM_MP_STATE_INIT_RECEIVED: - break; - default: - r = -EINTR; - break; - } - } - } - + else + r = vcpu_block(kvm, vcpu); if (r <= 0) break; @@ -6430,6 +6468,7 @@ static int __vcpu_run(struct kvm_vcpu *vcpu) r = -EINTR; vcpu->run->exit_reason = KVM_EXIT_INTR; ++vcpu->stat.request_irq_exits; + break; } kvm_check_async_pf_completion(vcpu); @@ -6438,6 +6477,7 @@ static int __vcpu_run(struct kvm_vcpu *vcpu) r = -EINTR; vcpu->run->exit_reason = KVM_EXIT_INTR; ++vcpu->stat.signal_exits; + break; } if (need_resched()) { srcu_read_unlock(&kvm->srcu, vcpu->srcu_idx); @@ -6569,7 +6609,7 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) } else WARN_ON(vcpu->arch.pio.count || vcpu->mmio_needed); - r = __vcpu_run(vcpu); + r = vcpu_run(vcpu); out: post_kvm_run_save(vcpu); @@ -7076,11 +7116,14 @@ void kvm_vcpu_reset(struct kvm_vcpu *vcpu) kvm_clear_exception_queue(vcpu); memset(vcpu->arch.db, 0, sizeof(vcpu->arch.db)); + kvm_update_dr0123(vcpu); vcpu->arch.dr6 = DR6_INIT; kvm_update_dr6(vcpu); vcpu->arch.dr7 = DR7_FIXED_1; kvm_update_dr7(vcpu); + vcpu->arch.cr2 = 0; + kvm_make_request(KVM_REQ_EVENT, vcpu); vcpu->arch.apf.msr_val = 0; vcpu->arch.st.msr_val = 0; @@ -7241,7 +7284,7 @@ int kvm_arch_vcpu_init(struct kvm_vcpu *vcpu) vcpu->arch.pv.pv_unhalted = false; vcpu->arch.emulate_ctxt.ops = &emulate_ops; - if (!irqchip_in_kernel(kvm) || kvm_vcpu_is_bsp(vcpu)) + if (!irqchip_in_kernel(kvm) || kvm_vcpu_is_reset_bsp(vcpu)) vcpu->arch.mp_state = KVM_MP_STATE_RUNNABLE; else vcpu->arch.mp_state = KVM_MP_STATE_UNINITIALIZED; @@ -7289,6 +7332,8 @@ int kvm_arch_vcpu_init(struct kvm_vcpu *vcpu) vcpu->arch.guest_supported_xcr0 = 0; vcpu->arch.guest_xstate_size = XSAVE_HDR_SIZE + XSAVE_HDR_OFFSET; + vcpu->arch.maxphyaddr = cpuid_query_maxphyaddr(vcpu); + kvm_async_pf_hash_reset(vcpu); kvm_pmu_init(vcpu); @@ -7429,7 +7474,7 @@ void kvm_arch_free_memslot(struct kvm *kvm, struct kvm_memory_slot *free, for (i = 0; i < KVM_NR_PAGE_SIZES; ++i) { if (!dont || free->arch.rmap[i] != dont->arch.rmap[i]) { - kvm_kvfree(free->arch.rmap[i]); + kvfree(free->arch.rmap[i]); free->arch.rmap[i] = NULL; } if (i == 0) @@ -7437,7 +7482,7 @@ void kvm_arch_free_memslot(struct kvm *kvm, struct kvm_memory_slot *free, if (!dont || free->arch.lpage_info[i - 1] != dont->arch.lpage_info[i - 1]) { - kvm_kvfree(free->arch.lpage_info[i - 1]); + kvfree(free->arch.lpage_info[i - 1]); free->arch.lpage_info[i - 1] = NULL; } } @@ -7491,12 +7536,12 @@ int kvm_arch_create_memslot(struct kvm *kvm, struct kvm_memory_slot *slot, out_free: for (i = 0; i < KVM_NR_PAGE_SIZES; ++i) { - kvm_kvfree(slot->arch.rmap[i]); + kvfree(slot->arch.rmap[i]); slot->arch.rmap[i] = NULL; if (i == 0) continue; - kvm_kvfree(slot->arch.lpage_info[i - 1]); + kvfree(slot->arch.lpage_info[i - 1]); slot->arch.lpage_info[i - 1] = NULL; } return -ENOMEM; @@ -7619,6 +7664,23 @@ void kvm_arch_commit_memory_region(struct kvm *kvm, new = id_to_memslot(kvm->memslots, mem->slot); /* + * Dirty logging tracks sptes in 4k granularity, meaning that large + * sptes have to be split. If live migration is successful, the guest + * in the source machine will be destroyed and large sptes will be + * created in the destination. However, if the guest continues to run + * in the source machine (for example if live migration fails), small + * sptes will remain around and cause bad performance. + * + * Scan sptes if dirty logging has been stopped, dropping those + * which can be collapsed into a single large-page spte. Later + * page faults will create the large-page sptes. + */ + if ((change != KVM_MR_DELETE) && + (old->flags & KVM_MEM_LOG_DIRTY_PAGES) && + !(new->flags & KVM_MEM_LOG_DIRTY_PAGES)) + kvm_mmu_zap_collapsible_sptes(kvm, new); + + /* * Set up write protection and/or dirty logging for the new slot. * * For KVM_MR_DELETE and KVM_MR_MOVE, the shadow pages of old slot have diff --git a/arch/x86/lguest/Kconfig b/arch/x86/lguest/Kconfig index 4a0890f815c4..08f41caada45 100644 --- a/arch/x86/lguest/Kconfig +++ b/arch/x86/lguest/Kconfig @@ -1,6 +1,6 @@ config LGUEST_GUEST bool "Lguest guest support" - depends on X86_32 && PARAVIRT + depends on X86_32 && PARAVIRT && PCI select TTY select VIRTUALIZATION select VIRTIO @@ -8,7 +8,7 @@ config LGUEST_GUEST help Lguest is a tiny in-kernel hypervisor. Selecting this will allow your kernel to boot under lguest. This option will increase - your kernel size by about 6k. If in doubt, say N. + your kernel size by about 10k. If in doubt, say N. If you say Y here, make sure you say Y (or M) to the virtio block and net drivers which lguest needs. diff --git a/arch/x86/lguest/boot.c b/arch/x86/lguest/boot.c index ac4453d8520e..717908b16037 100644 --- a/arch/x86/lguest/boot.c +++ b/arch/x86/lguest/boot.c @@ -868,7 +868,8 @@ static void __init lguest_init_IRQ(void) /* Some systems map "vectors" to interrupts weirdly. Not us! */ __this_cpu_write(vector_irq[i], i - FIRST_EXTERNAL_VECTOR); if (i != SYSCALL_VECTOR) - set_intr_gate(i, interrupt[i - FIRST_EXTERNAL_VECTOR]); + set_intr_gate(i, irq_entries_start + + 8 * (i - FIRST_EXTERNAL_VECTOR)); } /* @@ -1076,6 +1077,7 @@ static void lguest_load_sp0(struct tss_struct *tss, { lazy_hcall3(LHCALL_SET_STACK, __KERNEL_DS | 0x1, thread->sp0, THREAD_SIZE / PAGE_SIZE); + tss->x86_tss.sp0 = thread->sp0; } /* Let's just say, I wouldn't do debugging under a Guest. */ diff --git a/arch/x86/lib/atomic64_cx8_32.S b/arch/x86/lib/atomic64_cx8_32.S index f5cc9eb1d51b..082a85167a5b 100644 --- a/arch/x86/lib/atomic64_cx8_32.S +++ b/arch/x86/lib/atomic64_cx8_32.S @@ -13,16 +13,6 @@ #include <asm/alternative-asm.h> #include <asm/dwarf2.h> -.macro SAVE reg - pushl_cfi %\reg - CFI_REL_OFFSET \reg, 0 -.endm - -.macro RESTORE reg - popl_cfi %\reg - CFI_RESTORE \reg -.endm - .macro read64 reg movl %ebx, %eax movl %ecx, %edx @@ -67,10 +57,10 @@ ENDPROC(atomic64_xchg_cx8) .macro addsub_return func ins insc ENTRY(atomic64_\func\()_return_cx8) CFI_STARTPROC - SAVE ebp - SAVE ebx - SAVE esi - SAVE edi + pushl_cfi_reg ebp + pushl_cfi_reg ebx + pushl_cfi_reg esi + pushl_cfi_reg edi movl %eax, %esi movl %edx, %edi @@ -89,10 +79,10 @@ ENTRY(atomic64_\func\()_return_cx8) 10: movl %ebx, %eax movl %ecx, %edx - RESTORE edi - RESTORE esi - RESTORE ebx - RESTORE ebp + popl_cfi_reg edi + popl_cfi_reg esi + popl_cfi_reg ebx + popl_cfi_reg ebp ret CFI_ENDPROC ENDPROC(atomic64_\func\()_return_cx8) @@ -104,7 +94,7 @@ addsub_return sub sub sbb .macro incdec_return func ins insc ENTRY(atomic64_\func\()_return_cx8) CFI_STARTPROC - SAVE ebx + pushl_cfi_reg ebx read64 %esi 1: @@ -119,7 +109,7 @@ ENTRY(atomic64_\func\()_return_cx8) 10: movl %ebx, %eax movl %ecx, %edx - RESTORE ebx + popl_cfi_reg ebx ret CFI_ENDPROC ENDPROC(atomic64_\func\()_return_cx8) @@ -130,7 +120,7 @@ incdec_return dec sub sbb ENTRY(atomic64_dec_if_positive_cx8) CFI_STARTPROC - SAVE ebx + pushl_cfi_reg ebx read64 %esi 1: @@ -146,18 +136,18 @@ ENTRY(atomic64_dec_if_positive_cx8) 2: movl %ebx, %eax movl %ecx, %edx - RESTORE ebx + popl_cfi_reg ebx ret CFI_ENDPROC ENDPROC(atomic64_dec_if_positive_cx8) ENTRY(atomic64_add_unless_cx8) CFI_STARTPROC - SAVE ebp - SAVE ebx + pushl_cfi_reg ebp + pushl_cfi_reg ebx /* these just push these two parameters on the stack */ - SAVE edi - SAVE ecx + pushl_cfi_reg edi + pushl_cfi_reg ecx movl %eax, %ebp movl %edx, %edi @@ -179,8 +169,8 @@ ENTRY(atomic64_add_unless_cx8) 3: addl $8, %esp CFI_ADJUST_CFA_OFFSET -8 - RESTORE ebx - RESTORE ebp + popl_cfi_reg ebx + popl_cfi_reg ebp ret 4: cmpl %edx, 4(%esp) @@ -192,7 +182,7 @@ ENDPROC(atomic64_add_unless_cx8) ENTRY(atomic64_inc_not_zero_cx8) CFI_STARTPROC - SAVE ebx + pushl_cfi_reg ebx read64 %esi 1: @@ -209,7 +199,7 @@ ENTRY(atomic64_inc_not_zero_cx8) movl $1, %eax 3: - RESTORE ebx + popl_cfi_reg ebx ret CFI_ENDPROC ENDPROC(atomic64_inc_not_zero_cx8) diff --git a/arch/x86/lib/checksum_32.S b/arch/x86/lib/checksum_32.S index e78b8eee6615..9bc944a91274 100644 --- a/arch/x86/lib/checksum_32.S +++ b/arch/x86/lib/checksum_32.S @@ -51,10 +51,8 @@ unsigned int csum_partial(const unsigned char * buff, int len, unsigned int sum) */ ENTRY(csum_partial) CFI_STARTPROC - pushl_cfi %esi - CFI_REL_OFFSET esi, 0 - pushl_cfi %ebx - CFI_REL_OFFSET ebx, 0 + pushl_cfi_reg esi + pushl_cfi_reg ebx movl 20(%esp),%eax # Function arg: unsigned int sum movl 16(%esp),%ecx # Function arg: int len movl 12(%esp),%esi # Function arg: unsigned char *buff @@ -127,14 +125,12 @@ ENTRY(csum_partial) 6: addl %ecx,%eax adcl $0, %eax 7: - testl $1, 12(%esp) + testb $1, 12(%esp) jz 8f roll $8, %eax 8: - popl_cfi %ebx - CFI_RESTORE ebx - popl_cfi %esi - CFI_RESTORE esi + popl_cfi_reg ebx + popl_cfi_reg esi ret CFI_ENDPROC ENDPROC(csum_partial) @@ -145,10 +141,8 @@ ENDPROC(csum_partial) ENTRY(csum_partial) CFI_STARTPROC - pushl_cfi %esi - CFI_REL_OFFSET esi, 0 - pushl_cfi %ebx - CFI_REL_OFFSET ebx, 0 + pushl_cfi_reg esi + pushl_cfi_reg ebx movl 20(%esp),%eax # Function arg: unsigned int sum movl 16(%esp),%ecx # Function arg: int len movl 12(%esp),%esi # Function arg: const unsigned char *buf @@ -251,14 +245,12 @@ ENTRY(csum_partial) addl %ebx,%eax adcl $0,%eax 80: - testl $1, 12(%esp) + testb $1, 12(%esp) jz 90f roll $8, %eax 90: - popl_cfi %ebx - CFI_RESTORE ebx - popl_cfi %esi - CFI_RESTORE esi + popl_cfi_reg ebx + popl_cfi_reg esi ret CFI_ENDPROC ENDPROC(csum_partial) @@ -298,12 +290,9 @@ ENTRY(csum_partial_copy_generic) CFI_STARTPROC subl $4,%esp CFI_ADJUST_CFA_OFFSET 4 - pushl_cfi %edi - CFI_REL_OFFSET edi, 0 - pushl_cfi %esi - CFI_REL_OFFSET esi, 0 - pushl_cfi %ebx - CFI_REL_OFFSET ebx, 0 + pushl_cfi_reg edi + pushl_cfi_reg esi + pushl_cfi_reg ebx movl ARGBASE+16(%esp),%eax # sum movl ARGBASE+12(%esp),%ecx # len movl ARGBASE+4(%esp),%esi # src @@ -412,12 +401,9 @@ DST( movb %cl, (%edi) ) .previous - popl_cfi %ebx - CFI_RESTORE ebx - popl_cfi %esi - CFI_RESTORE esi - popl_cfi %edi - CFI_RESTORE edi + popl_cfi_reg ebx + popl_cfi_reg esi + popl_cfi_reg edi popl_cfi %ecx # equivalent to addl $4,%esp ret CFI_ENDPROC @@ -441,12 +427,9 @@ ENDPROC(csum_partial_copy_generic) ENTRY(csum_partial_copy_generic) CFI_STARTPROC - pushl_cfi %ebx - CFI_REL_OFFSET ebx, 0 - pushl_cfi %edi - CFI_REL_OFFSET edi, 0 - pushl_cfi %esi - CFI_REL_OFFSET esi, 0 + pushl_cfi_reg ebx + pushl_cfi_reg edi + pushl_cfi_reg esi movl ARGBASE+4(%esp),%esi #src movl ARGBASE+8(%esp),%edi #dst movl ARGBASE+12(%esp),%ecx #len @@ -506,12 +489,9 @@ DST( movb %dl, (%edi) ) jmp 7b .previous - popl_cfi %esi - CFI_RESTORE esi - popl_cfi %edi - CFI_RESTORE edi - popl_cfi %ebx - CFI_RESTORE ebx + popl_cfi_reg esi + popl_cfi_reg edi + popl_cfi_reg ebx ret CFI_ENDPROC ENDPROC(csum_partial_copy_generic) diff --git a/arch/x86/lib/clear_page_64.S b/arch/x86/lib/clear_page_64.S index f2145cfa12a6..e67e579c93bd 100644 --- a/arch/x86/lib/clear_page_64.S +++ b/arch/x86/lib/clear_page_64.S @@ -1,31 +1,35 @@ #include <linux/linkage.h> #include <asm/dwarf2.h> +#include <asm/cpufeature.h> #include <asm/alternative-asm.h> /* - * Zero a page. - * rdi page - */ -ENTRY(clear_page_c) + * Most CPUs support enhanced REP MOVSB/STOSB instructions. It is + * recommended to use this when possible and we do use them by default. + * If enhanced REP MOVSB/STOSB is not available, try to use fast string. + * Otherwise, use original. + */ + +/* + * Zero a page. + * %rdi - page + */ +ENTRY(clear_page) CFI_STARTPROC + + ALTERNATIVE_2 "jmp clear_page_orig", "", X86_FEATURE_REP_GOOD, \ + "jmp clear_page_c_e", X86_FEATURE_ERMS + movl $4096/8,%ecx xorl %eax,%eax rep stosq ret CFI_ENDPROC -ENDPROC(clear_page_c) +ENDPROC(clear_page) -ENTRY(clear_page_c_e) +ENTRY(clear_page_orig) CFI_STARTPROC - movl $4096,%ecx - xorl %eax,%eax - rep stosb - ret - CFI_ENDPROC -ENDPROC(clear_page_c_e) -ENTRY(clear_page) - CFI_STARTPROC xorl %eax,%eax movl $4096/64,%ecx .p2align 4 @@ -45,29 +49,13 @@ ENTRY(clear_page) nop ret CFI_ENDPROC -.Lclear_page_end: -ENDPROC(clear_page) - - /* - * Some CPUs support enhanced REP MOVSB/STOSB instructions. - * It is recommended to use this when possible. - * If enhanced REP MOVSB/STOSB is not available, try to use fast string. - * Otherwise, use original function. - * - */ +ENDPROC(clear_page_orig) -#include <asm/cpufeature.h> - - .section .altinstr_replacement,"ax" -1: .byte 0xeb /* jmp <disp8> */ - .byte (clear_page_c - clear_page) - (2f - 1b) /* offset */ -2: .byte 0xeb /* jmp <disp8> */ - .byte (clear_page_c_e - clear_page) - (3f - 2b) /* offset */ -3: - .previous - .section .altinstructions,"a" - altinstruction_entry clear_page,1b,X86_FEATURE_REP_GOOD,\ - .Lclear_page_end-clear_page, 2b-1b - altinstruction_entry clear_page,2b,X86_FEATURE_ERMS, \ - .Lclear_page_end-clear_page,3b-2b - .previous +ENTRY(clear_page_c_e) + CFI_STARTPROC + movl $4096,%ecx + xorl %eax,%eax + rep stosb + ret + CFI_ENDPROC +ENDPROC(clear_page_c_e) diff --git a/arch/x86/lib/copy_page_64.S b/arch/x86/lib/copy_page_64.S index 176cca67212b..8239dbcbf984 100644 --- a/arch/x86/lib/copy_page_64.S +++ b/arch/x86/lib/copy_page_64.S @@ -2,23 +2,26 @@ #include <linux/linkage.h> #include <asm/dwarf2.h> +#include <asm/cpufeature.h> #include <asm/alternative-asm.h> +/* + * Some CPUs run faster using the string copy instructions (sane microcode). + * It is also a lot simpler. Use this when possible. But, don't use streaming + * copy unless the CPU indicates X86_FEATURE_REP_GOOD. Could vary the + * prefetch distance based on SMP/UP. + */ ALIGN -copy_page_rep: +ENTRY(copy_page) CFI_STARTPROC + ALTERNATIVE "jmp copy_page_regs", "", X86_FEATURE_REP_GOOD movl $4096/8, %ecx rep movsq ret CFI_ENDPROC -ENDPROC(copy_page_rep) - -/* - * Don't use streaming copy unless the CPU indicates X86_FEATURE_REP_GOOD. - * Could vary the prefetch distance based on SMP/UP. -*/ +ENDPROC(copy_page) -ENTRY(copy_page) +ENTRY(copy_page_regs) CFI_STARTPROC subq $2*8, %rsp CFI_ADJUST_CFA_OFFSET 2*8 @@ -90,21 +93,5 @@ ENTRY(copy_page) addq $2*8, %rsp CFI_ADJUST_CFA_OFFSET -2*8 ret -.Lcopy_page_end: CFI_ENDPROC -ENDPROC(copy_page) - - /* Some CPUs run faster using the string copy instructions. - It is also a lot simpler. Use this when possible */ - -#include <asm/cpufeature.h> - - .section .altinstr_replacement,"ax" -1: .byte 0xeb /* jmp <disp8> */ - .byte (copy_page_rep - copy_page) - (2f - 1b) /* offset */ -2: - .previous - .section .altinstructions,"a" - altinstruction_entry copy_page, 1b, X86_FEATURE_REP_GOOD, \ - .Lcopy_page_end-copy_page, 2b-1b - .previous +ENDPROC(copy_page_regs) diff --git a/arch/x86/lib/copy_user_64.S b/arch/x86/lib/copy_user_64.S index dee945d55594..fa997dfaef24 100644 --- a/arch/x86/lib/copy_user_64.S +++ b/arch/x86/lib/copy_user_64.S @@ -8,9 +8,6 @@ #include <linux/linkage.h> #include <asm/dwarf2.h> - -#define FIX_ALIGNMENT 1 - #include <asm/current.h> #include <asm/asm-offsets.h> #include <asm/thread_info.h> @@ -19,33 +16,7 @@ #include <asm/asm.h> #include <asm/smap.h> -/* - * By placing feature2 after feature1 in altinstructions section, we logically - * implement: - * If CPU has feature2, jmp to alt2 is used - * else if CPU has feature1, jmp to alt1 is used - * else jmp to orig is used. - */ - .macro ALTERNATIVE_JUMP feature1,feature2,orig,alt1,alt2 -0: - .byte 0xe9 /* 32bit jump */ - .long \orig-1f /* by default jump to orig */ -1: - .section .altinstr_replacement,"ax" -2: .byte 0xe9 /* near jump with 32bit immediate */ - .long \alt1-1b /* offset */ /* or alternatively to alt1 */ -3: .byte 0xe9 /* near jump with 32bit immediate */ - .long \alt2-1b /* offset */ /* or alternatively to alt2 */ - .previous - - .section .altinstructions,"a" - altinstruction_entry 0b,2b,\feature1,5,5 - altinstruction_entry 0b,3b,\feature2,5,5 - .previous - .endm - .macro ALIGN_DESTINATION -#ifdef FIX_ALIGNMENT /* check for bad alignment of destination */ movl %edi,%ecx andl $7,%ecx @@ -67,7 +38,6 @@ _ASM_EXTABLE(100b,103b) _ASM_EXTABLE(101b,103b) -#endif .endm /* Standard copy_to_user with segment limit checking */ @@ -79,9 +49,11 @@ ENTRY(_copy_to_user) jc bad_to_user cmpq TI_addr_limit(%rax),%rcx ja bad_to_user - ALTERNATIVE_JUMP X86_FEATURE_REP_GOOD,X86_FEATURE_ERMS, \ - copy_user_generic_unrolled,copy_user_generic_string, \ - copy_user_enhanced_fast_string + ALTERNATIVE_2 "jmp copy_user_generic_unrolled", \ + "jmp copy_user_generic_string", \ + X86_FEATURE_REP_GOOD, \ + "jmp copy_user_enhanced_fast_string", \ + X86_FEATURE_ERMS CFI_ENDPROC ENDPROC(_copy_to_user) @@ -94,9 +66,11 @@ ENTRY(_copy_from_user) jc bad_from_user cmpq TI_addr_limit(%rax),%rcx ja bad_from_user - ALTERNATIVE_JUMP X86_FEATURE_REP_GOOD,X86_FEATURE_ERMS, \ - copy_user_generic_unrolled,copy_user_generic_string, \ - copy_user_enhanced_fast_string + ALTERNATIVE_2 "jmp copy_user_generic_unrolled", \ + "jmp copy_user_generic_string", \ + X86_FEATURE_REP_GOOD, \ + "jmp copy_user_enhanced_fast_string", \ + X86_FEATURE_ERMS CFI_ENDPROC ENDPROC(_copy_from_user) diff --git a/arch/x86/lib/csum-copy_64.S b/arch/x86/lib/csum-copy_64.S index 2419d5fefae3..9734182966f3 100644 --- a/arch/x86/lib/csum-copy_64.S +++ b/arch/x86/lib/csum-copy_64.S @@ -196,7 +196,7 @@ ENTRY(csum_partial_copy_generic) /* handle last odd byte */ .Lhandle_1: - testl $1, %r10d + testb $1, %r10b jz .Lende xorl %ebx, %ebx source diff --git a/arch/x86/lib/insn.c b/arch/x86/lib/insn.c index 1313ae6b478b..8f72b334aea0 100644 --- a/arch/x86/lib/insn.c +++ b/arch/x86/lib/insn.c @@ -52,6 +52,13 @@ */ void insn_init(struct insn *insn, const void *kaddr, int buf_len, int x86_64) { + /* + * Instructions longer than MAX_INSN_SIZE (15 bytes) are invalid + * even if the input buffer is long enough to hold them. + */ + if (buf_len > MAX_INSN_SIZE) + buf_len = MAX_INSN_SIZE; + memset(insn, 0, sizeof(*insn)); insn->kaddr = kaddr; insn->end_kaddr = kaddr + buf_len; @@ -164,6 +171,12 @@ found: /* VEX.W overrides opnd_size */ insn->opnd_bytes = 8; } else { + /* + * For VEX2, fake VEX3-like byte#2. + * Makes it easier to decode vex.W, vex.vvvv, + * vex.L and vex.pp. Masking with 0x7f sets vex.W == 0. + */ + insn->vex_prefix.bytes[2] = b2 & 0x7f; insn->vex_prefix.nbytes = 2; insn->next_byte += 2; } diff --git a/arch/x86/lib/memcpy_64.S b/arch/x86/lib/memcpy_64.S index 89b53c9968e7..b046664f5a1c 100644 --- a/arch/x86/lib/memcpy_64.S +++ b/arch/x86/lib/memcpy_64.S @@ -1,12 +1,20 @@ /* Copyright 2002 Andi Kleen */ #include <linux/linkage.h> - #include <asm/cpufeature.h> #include <asm/dwarf2.h> #include <asm/alternative-asm.h> /* + * We build a jump to memcpy_orig by default which gets NOPped out on + * the majority of x86 CPUs which set REP_GOOD. In addition, CPUs which + * have the enhanced REP MOVSB/STOSB feature (ERMS), change those NOPs + * to a jmp to memcpy_erms which does the REP; MOVSB mem copy. + */ + +.weak memcpy + +/* * memcpy - Copy a memory block. * * Input: @@ -17,15 +25,11 @@ * Output: * rax original destination */ +ENTRY(__memcpy) +ENTRY(memcpy) + ALTERNATIVE_2 "jmp memcpy_orig", "", X86_FEATURE_REP_GOOD, \ + "jmp memcpy_erms", X86_FEATURE_ERMS -/* - * memcpy_c() - fast string ops (REP MOVSQ) based variant. - * - * This gets patched over the unrolled variant (below) via the - * alternative instructions framework: - */ - .section .altinstr_replacement, "ax", @progbits -.Lmemcpy_c: movq %rdi, %rax movq %rdx, %rcx shrq $3, %rcx @@ -34,29 +38,21 @@ movl %edx, %ecx rep movsb ret -.Lmemcpy_e: - .previous +ENDPROC(memcpy) +ENDPROC(__memcpy) /* - * memcpy_c_e() - enhanced fast string memcpy. This is faster and simpler than - * memcpy_c. Use memcpy_c_e when possible. - * - * This gets patched over the unrolled variant (below) via the - * alternative instructions framework: + * memcpy_erms() - enhanced fast string memcpy. This is faster and + * simpler than memcpy. Use memcpy_erms when possible. */ - .section .altinstr_replacement, "ax", @progbits -.Lmemcpy_c_e: +ENTRY(memcpy_erms) movq %rdi, %rax movq %rdx, %rcx rep movsb ret -.Lmemcpy_e_e: - .previous - -.weak memcpy +ENDPROC(memcpy_erms) -ENTRY(__memcpy) -ENTRY(memcpy) +ENTRY(memcpy_orig) CFI_STARTPROC movq %rdi, %rax @@ -183,26 +179,4 @@ ENTRY(memcpy) .Lend: retq CFI_ENDPROC -ENDPROC(memcpy) -ENDPROC(__memcpy) - - /* - * Some CPUs are adding enhanced REP MOVSB/STOSB feature - * If the feature is supported, memcpy_c_e() is the first choice. - * If enhanced rep movsb copy is not available, use fast string copy - * memcpy_c() when possible. This is faster and code is simpler than - * original memcpy(). - * Otherwise, original memcpy() is used. - * In .altinstructions section, ERMS feature is placed after REG_GOOD - * feature to implement the right patch order. - * - * Replace only beginning, memcpy is used to apply alternatives, - * so it is silly to overwrite itself with nops - reboot is the - * only outcome... - */ - .section .altinstructions, "a" - altinstruction_entry __memcpy,.Lmemcpy_c,X86_FEATURE_REP_GOOD,\ - .Lmemcpy_e-.Lmemcpy_c,.Lmemcpy_e-.Lmemcpy_c - altinstruction_entry __memcpy,.Lmemcpy_c_e,X86_FEATURE_ERMS, \ - .Lmemcpy_e_e-.Lmemcpy_c_e,.Lmemcpy_e_e-.Lmemcpy_c_e - .previous +ENDPROC(memcpy_orig) diff --git a/arch/x86/lib/memmove_64.S b/arch/x86/lib/memmove_64.S index 9c4b530575da..0f8a0d0331b9 100644 --- a/arch/x86/lib/memmove_64.S +++ b/arch/x86/lib/memmove_64.S @@ -5,7 +5,6 @@ * This assembly file is re-written from memmove_64.c file. * - Copyright 2011 Fenghua Yu <fenghua.yu@intel.com> */ -#define _STRING_C #include <linux/linkage.h> #include <asm/dwarf2.h> #include <asm/cpufeature.h> @@ -44,6 +43,8 @@ ENTRY(__memmove) jg 2f .Lmemmove_begin_forward: + ALTERNATIVE "", "movq %rdx, %rcx; rep movsb; retq", X86_FEATURE_ERMS + /* * movsq instruction have many startup latency * so we handle small size by general register. @@ -207,21 +208,5 @@ ENTRY(__memmove) 13: retq CFI_ENDPROC - - .section .altinstr_replacement,"ax" -.Lmemmove_begin_forward_efs: - /* Forward moving data. */ - movq %rdx, %rcx - rep movsb - retq -.Lmemmove_end_forward_efs: - .previous - - .section .altinstructions,"a" - altinstruction_entry .Lmemmove_begin_forward, \ - .Lmemmove_begin_forward_efs,X86_FEATURE_ERMS, \ - .Lmemmove_end_forward-.Lmemmove_begin_forward, \ - .Lmemmove_end_forward_efs-.Lmemmove_begin_forward_efs - .previous ENDPROC(__memmove) ENDPROC(memmove) diff --git a/arch/x86/lib/memset_64.S b/arch/x86/lib/memset_64.S index 6f44935c6a60..93118fb23976 100644 --- a/arch/x86/lib/memset_64.S +++ b/arch/x86/lib/memset_64.S @@ -5,19 +5,30 @@ #include <asm/cpufeature.h> #include <asm/alternative-asm.h> +.weak memset + /* * ISO C memset - set a memory block to a byte value. This function uses fast * string to get better performance than the original function. The code is * simpler and shorter than the orignal function as well. - * + * * rdi destination - * rsi value (char) - * rdx count (bytes) - * + * rsi value (char) + * rdx count (bytes) + * * rax original destination - */ - .section .altinstr_replacement, "ax", @progbits -.Lmemset_c: + */ +ENTRY(memset) +ENTRY(__memset) + /* + * Some CPUs support enhanced REP MOVSB/STOSB feature. It is recommended + * to use it when possible. If not available, use fast string instructions. + * + * Otherwise, use original memset function. + */ + ALTERNATIVE_2 "jmp memset_orig", "", X86_FEATURE_REP_GOOD, \ + "jmp memset_erms", X86_FEATURE_ERMS + movq %rdi,%r9 movq %rdx,%rcx andl $7,%edx @@ -31,8 +42,8 @@ rep stosb movq %r9,%rax ret -.Lmemset_e: - .previous +ENDPROC(memset) +ENDPROC(__memset) /* * ISO C memset - set a memory block to a byte value. This function uses @@ -45,21 +56,16 @@ * * rax original destination */ - .section .altinstr_replacement, "ax", @progbits -.Lmemset_c_e: +ENTRY(memset_erms) movq %rdi,%r9 movb %sil,%al movq %rdx,%rcx rep stosb movq %r9,%rax ret -.Lmemset_e_e: - .previous - -.weak memset +ENDPROC(memset_erms) -ENTRY(memset) -ENTRY(__memset) +ENTRY(memset_orig) CFI_STARTPROC movq %rdi,%r10 @@ -134,23 +140,4 @@ ENTRY(__memset) jmp .Lafter_bad_alignment .Lfinal: CFI_ENDPROC -ENDPROC(memset) -ENDPROC(__memset) - - /* Some CPUs support enhanced REP MOVSB/STOSB feature. - * It is recommended to use this when possible. - * - * If enhanced REP MOVSB/STOSB feature is not available, use fast string - * instructions. - * - * Otherwise, use original memset function. - * - * In .altinstructions section, ERMS feature is placed after REG_GOOD - * feature to implement the right patch order. - */ - .section .altinstructions,"a" - altinstruction_entry __memset,.Lmemset_c,X86_FEATURE_REP_GOOD,\ - .Lfinal-__memset,.Lmemset_e-.Lmemset_c - altinstruction_entry __memset,.Lmemset_c_e,X86_FEATURE_ERMS, \ - .Lfinal-__memset,.Lmemset_e_e-.Lmemset_c_e - .previous +ENDPROC(memset_orig) diff --git a/arch/x86/lib/msr-reg.S b/arch/x86/lib/msr-reg.S index f6d13eefad10..3ca5218fbece 100644 --- a/arch/x86/lib/msr-reg.S +++ b/arch/x86/lib/msr-reg.S @@ -14,8 +14,8 @@ .macro op_safe_regs op ENTRY(\op\()_safe_regs) CFI_STARTPROC - pushq_cfi %rbx - pushq_cfi %rbp + pushq_cfi_reg rbx + pushq_cfi_reg rbp movq %rdi, %r10 /* Save pointer */ xorl %r11d, %r11d /* Return value */ movl (%rdi), %eax @@ -35,8 +35,8 @@ ENTRY(\op\()_safe_regs) movl %ebp, 20(%r10) movl %esi, 24(%r10) movl %edi, 28(%r10) - popq_cfi %rbp - popq_cfi %rbx + popq_cfi_reg rbp + popq_cfi_reg rbx ret 3: CFI_RESTORE_STATE @@ -53,10 +53,10 @@ ENDPROC(\op\()_safe_regs) .macro op_safe_regs op ENTRY(\op\()_safe_regs) CFI_STARTPROC - pushl_cfi %ebx - pushl_cfi %ebp - pushl_cfi %esi - pushl_cfi %edi + pushl_cfi_reg ebx + pushl_cfi_reg ebp + pushl_cfi_reg esi + pushl_cfi_reg edi pushl_cfi $0 /* Return value */ pushl_cfi %eax movl 4(%eax), %ecx @@ -80,10 +80,10 @@ ENTRY(\op\()_safe_regs) movl %esi, 24(%eax) movl %edi, 28(%eax) popl_cfi %eax - popl_cfi %edi - popl_cfi %esi - popl_cfi %ebp - popl_cfi %ebx + popl_cfi_reg edi + popl_cfi_reg esi + popl_cfi_reg ebp + popl_cfi_reg ebx ret 3: CFI_RESTORE_STATE diff --git a/arch/x86/lib/rwsem.S b/arch/x86/lib/rwsem.S index 5dff5f042468..2322abe4da3b 100644 --- a/arch/x86/lib/rwsem.S +++ b/arch/x86/lib/rwsem.S @@ -34,10 +34,10 @@ */ #define save_common_regs \ - pushl_cfi %ecx; CFI_REL_OFFSET ecx, 0 + pushl_cfi_reg ecx #define restore_common_regs \ - popl_cfi %ecx; CFI_RESTORE ecx + popl_cfi_reg ecx /* Avoid uglifying the argument copying x86-64 needs to do. */ .macro movq src, dst @@ -64,22 +64,22 @@ */ #define save_common_regs \ - pushq_cfi %rdi; CFI_REL_OFFSET rdi, 0; \ - pushq_cfi %rsi; CFI_REL_OFFSET rsi, 0; \ - pushq_cfi %rcx; CFI_REL_OFFSET rcx, 0; \ - pushq_cfi %r8; CFI_REL_OFFSET r8, 0; \ - pushq_cfi %r9; CFI_REL_OFFSET r9, 0; \ - pushq_cfi %r10; CFI_REL_OFFSET r10, 0; \ - pushq_cfi %r11; CFI_REL_OFFSET r11, 0 + pushq_cfi_reg rdi; \ + pushq_cfi_reg rsi; \ + pushq_cfi_reg rcx; \ + pushq_cfi_reg r8; \ + pushq_cfi_reg r9; \ + pushq_cfi_reg r10; \ + pushq_cfi_reg r11 #define restore_common_regs \ - popq_cfi %r11; CFI_RESTORE r11; \ - popq_cfi %r10; CFI_RESTORE r10; \ - popq_cfi %r9; CFI_RESTORE r9; \ - popq_cfi %r8; CFI_RESTORE r8; \ - popq_cfi %rcx; CFI_RESTORE rcx; \ - popq_cfi %rsi; CFI_RESTORE rsi; \ - popq_cfi %rdi; CFI_RESTORE rdi + popq_cfi_reg r11; \ + popq_cfi_reg r10; \ + popq_cfi_reg r9; \ + popq_cfi_reg r8; \ + popq_cfi_reg rcx; \ + popq_cfi_reg rsi; \ + popq_cfi_reg rdi #endif @@ -87,12 +87,10 @@ ENTRY(call_rwsem_down_read_failed) CFI_STARTPROC save_common_regs - __ASM_SIZE(push,_cfi) %__ASM_REG(dx) - CFI_REL_OFFSET __ASM_REG(dx), 0 + __ASM_SIZE(push,_cfi_reg) __ASM_REG(dx) movq %rax,%rdi call rwsem_down_read_failed - __ASM_SIZE(pop,_cfi) %__ASM_REG(dx) - CFI_RESTORE __ASM_REG(dx) + __ASM_SIZE(pop,_cfi_reg) __ASM_REG(dx) restore_common_regs ret CFI_ENDPROC @@ -124,12 +122,10 @@ ENDPROC(call_rwsem_wake) ENTRY(call_rwsem_downgrade_wake) CFI_STARTPROC save_common_regs - __ASM_SIZE(push,_cfi) %__ASM_REG(dx) - CFI_REL_OFFSET __ASM_REG(dx), 0 + __ASM_SIZE(push,_cfi_reg) __ASM_REG(dx) movq %rax,%rdi call rwsem_downgrade_wake - __ASM_SIZE(pop,_cfi) %__ASM_REG(dx) - CFI_RESTORE __ASM_REG(dx) + __ASM_SIZE(pop,_cfi_reg) __ASM_REG(dx) restore_common_regs ret CFI_ENDPROC diff --git a/arch/x86/lib/thunk_32.S b/arch/x86/lib/thunk_32.S index e28cdaf5ac2c..5eb715087b80 100644 --- a/arch/x86/lib/thunk_32.S +++ b/arch/x86/lib/thunk_32.S @@ -13,12 +13,9 @@ .globl \name \name: CFI_STARTPROC - pushl_cfi %eax - CFI_REL_OFFSET eax, 0 - pushl_cfi %ecx - CFI_REL_OFFSET ecx, 0 - pushl_cfi %edx - CFI_REL_OFFSET edx, 0 + pushl_cfi_reg eax + pushl_cfi_reg ecx + pushl_cfi_reg edx .if \put_ret_addr_in_eax /* Place EIP in the arg1 */ @@ -26,12 +23,9 @@ .endif call \func - popl_cfi %edx - CFI_RESTORE edx - popl_cfi %ecx - CFI_RESTORE ecx - popl_cfi %eax - CFI_RESTORE eax + popl_cfi_reg edx + popl_cfi_reg ecx + popl_cfi_reg eax ret CFI_ENDPROC _ASM_NOKPROBE(\name) diff --git a/arch/x86/lib/thunk_64.S b/arch/x86/lib/thunk_64.S index b30b5ebd614a..f89ba4e93025 100644 --- a/arch/x86/lib/thunk_64.S +++ b/arch/x86/lib/thunk_64.S @@ -17,9 +17,18 @@ CFI_STARTPROC /* this one pushes 9 elems, the next one would be %rIP */ - SAVE_ARGS + pushq_cfi_reg rdi + pushq_cfi_reg rsi + pushq_cfi_reg rdx + pushq_cfi_reg rcx + pushq_cfi_reg rax + pushq_cfi_reg r8 + pushq_cfi_reg r9 + pushq_cfi_reg r10 + pushq_cfi_reg r11 .if \put_ret_addr_in_rdi + /* 9*8(%rsp) is return addr on stack */ movq_cfi_restore 9*8, rdi .endif @@ -45,11 +54,22 @@ #endif #endif - /* SAVE_ARGS below is used only for the .cfi directives it contains. */ +#if defined(CONFIG_TRACE_IRQFLAGS) \ + || defined(CONFIG_DEBUG_LOCK_ALLOC) \ + || defined(CONFIG_PREEMPT) CFI_STARTPROC - SAVE_ARGS + CFI_ADJUST_CFA_OFFSET 9*8 restore: - RESTORE_ARGS + popq_cfi_reg r11 + popq_cfi_reg r10 + popq_cfi_reg r9 + popq_cfi_reg r8 + popq_cfi_reg rax + popq_cfi_reg rcx + popq_cfi_reg rdx + popq_cfi_reg rsi + popq_cfi_reg rdi ret CFI_ENDPROC _ASM_NOKPROBE(restore) +#endif diff --git a/arch/x86/lib/usercopy_64.c b/arch/x86/lib/usercopy_64.c index c905e89e19fe..1f33b3d1fd68 100644 --- a/arch/x86/lib/usercopy_64.c +++ b/arch/x86/lib/usercopy_64.c @@ -69,21 +69,20 @@ EXPORT_SYMBOL(copy_in_user); * it is not necessary to optimize tail handling. */ __visible unsigned long -copy_user_handle_tail(char *to, char *from, unsigned len, unsigned zerorest) +copy_user_handle_tail(char *to, char *from, unsigned len) { - char c; - unsigned zero_len; - for (; len; --len, to++) { + char c; + if (__get_user_nocheck(c, from++, sizeof(char))) break; if (__put_user_nocheck(c, to, sizeof(char))) break; } - - for (c = 0, zero_len = len; zerorest && zero_len; --zero_len) - if (__put_user_nocheck(c, to++, sizeof(char))) - break; clac(); + + /* If the destination is a kernel buffer, we always clear the end */ + if ((unsigned long)to >= TASK_SIZE_MAX) + memset(to, 0, len); return len; } diff --git a/arch/x86/lib/x86-opcode-map.txt b/arch/x86/lib/x86-opcode-map.txt index 1a2be7c6895d..816488c0b97e 100644 --- a/arch/x86/lib/x86-opcode-map.txt +++ b/arch/x86/lib/x86-opcode-map.txt @@ -273,6 +273,9 @@ dd: ESC de: ESC df: ESC # 0xe0 - 0xef +# Note: "forced64" is Intel CPU behavior: they ignore 0x66 prefix +# in 64-bit mode. AMD CPUs accept 0x66 prefix, it causes RIP truncation +# to 16 bits. In 32-bit mode, 0x66 is accepted by both Intel and AMD. e0: LOOPNE/LOOPNZ Jb (f64) e1: LOOPE/LOOPZ Jb (f64) e2: LOOP Jb (f64) @@ -281,6 +284,10 @@ e4: IN AL,Ib e5: IN eAX,Ib e6: OUT Ib,AL e7: OUT Ib,eAX +# With 0x66 prefix in 64-bit mode, for AMD CPUs immediate offset +# in "near" jumps and calls is 16-bit. For CALL, +# push of return address is 16-bit wide, RSP is decremented by 2 +# but is not truncated to 16 bits, unlike RIP. e8: CALL Jz (f64) e9: JMP-near Jz (f64) ea: JMP-far Ap (i64) @@ -456,6 +463,7 @@ AVXcode: 1 7e: movd/q Ey,Pd | vmovd/q Ey,Vy (66),(v1) | vmovq Vq,Wq (F3),(v1) 7f: movq Qq,Pq | vmovdqa Wx,Vx (66) | vmovdqu Wx,Vx (F3) # 0x0f 0x80-0x8f +# Note: "forced64" is Intel CPU behavior (see comment about CALL insn). 80: JO Jz (f64) 81: JNO Jz (f64) 82: JB/JC/JNAE Jz (f64) @@ -842,6 +850,7 @@ EndTable GrpTable: Grp5 0: INC Ev 1: DEC Ev +# Note: "forced64" is Intel CPU behavior (see comment about CALL insn). 2: CALLN Ev (f64) 3: CALLF Ep 4: JMPN Ev (f64) diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c index ede025fb46f1..181c53bac3a7 100644 --- a/arch/x86/mm/fault.c +++ b/arch/x86/mm/fault.c @@ -59,7 +59,7 @@ static nokprobe_inline int kprobes_fault(struct pt_regs *regs) int ret = 0; /* kprobe_running() needs smp_processor_id() */ - if (kprobes_built_in() && !user_mode_vm(regs)) { + if (kprobes_built_in() && !user_mode(regs)) { preempt_disable(); if (kprobe_running() && kprobe_fault_handler(regs, 14)) ret = 1; @@ -148,7 +148,7 @@ is_prefetch(struct pt_regs *regs, unsigned long error_code, unsigned long addr) instr = (void *)convert_ip_to_linear(current, regs); max_instr = instr + 15; - if (user_mode(regs) && instr >= (unsigned char *)TASK_SIZE) + if (user_mode(regs) && instr >= (unsigned char *)TASK_SIZE_MAX) return 0; while (instr < max_instr) { @@ -1035,7 +1035,7 @@ static inline bool smap_violation(int error_code, struct pt_regs *regs) if (error_code & PF_USER) return false; - if (!user_mode_vm(regs) && (regs->flags & X86_EFLAGS_AC)) + if (!user_mode(regs) && (regs->flags & X86_EFLAGS_AC)) return false; return true; @@ -1140,7 +1140,7 @@ __do_page_fault(struct pt_regs *regs, unsigned long error_code, * User-mode registers count as a user access even for any * potential system fault or CPU buglet: */ - if (user_mode_vm(regs)) { + if (user_mode(regs)) { local_irq_enable(); error_code |= PF_USER; flags |= FAULT_FLAG_USER; diff --git a/arch/x86/mm/init.c b/arch/x86/mm/init.c index a110efca6d06..1d553186c434 100644 --- a/arch/x86/mm/init.c +++ b/arch/x86/mm/init.c @@ -29,29 +29,33 @@ /* * Tables translating between page_cache_type_t and pte encoding. - * Minimal supported modes are defined statically, modified if more supported - * cache modes are available. - * Index into __cachemode2pte_tbl is the cachemode. - * Index into __pte2cachemode_tbl are the caching attribute bits of the pte - * (_PAGE_PWT, _PAGE_PCD, _PAGE_PAT) at index bit positions 0, 1, 2. + * + * Minimal supported modes are defined statically, they are modified + * during bootup if more supported cache modes are available. + * + * Index into __cachemode2pte_tbl[] is the cachemode. + * + * Index into __pte2cachemode_tbl[] are the caching attribute bits of the pte + * (_PAGE_PWT, _PAGE_PCD, _PAGE_PAT) at index bit positions 0, 1, 2. */ uint16_t __cachemode2pte_tbl[_PAGE_CACHE_MODE_NUM] = { - [_PAGE_CACHE_MODE_WB] = 0, - [_PAGE_CACHE_MODE_WC] = _PAGE_PWT, - [_PAGE_CACHE_MODE_UC_MINUS] = _PAGE_PCD, - [_PAGE_CACHE_MODE_UC] = _PAGE_PCD | _PAGE_PWT, - [_PAGE_CACHE_MODE_WT] = _PAGE_PCD, - [_PAGE_CACHE_MODE_WP] = _PAGE_PCD, + [_PAGE_CACHE_MODE_WB ] = 0 | 0 , + [_PAGE_CACHE_MODE_WC ] = _PAGE_PWT | 0 , + [_PAGE_CACHE_MODE_UC_MINUS] = 0 | _PAGE_PCD, + [_PAGE_CACHE_MODE_UC ] = _PAGE_PWT | _PAGE_PCD, + [_PAGE_CACHE_MODE_WT ] = 0 | _PAGE_PCD, + [_PAGE_CACHE_MODE_WP ] = 0 | _PAGE_PCD, }; EXPORT_SYMBOL(__cachemode2pte_tbl); + uint8_t __pte2cachemode_tbl[8] = { - [__pte2cm_idx(0)] = _PAGE_CACHE_MODE_WB, - [__pte2cm_idx(_PAGE_PWT)] = _PAGE_CACHE_MODE_WC, - [__pte2cm_idx(_PAGE_PCD)] = _PAGE_CACHE_MODE_UC_MINUS, - [__pte2cm_idx(_PAGE_PWT | _PAGE_PCD)] = _PAGE_CACHE_MODE_UC, - [__pte2cm_idx(_PAGE_PAT)] = _PAGE_CACHE_MODE_WB, - [__pte2cm_idx(_PAGE_PWT | _PAGE_PAT)] = _PAGE_CACHE_MODE_WC, - [__pte2cm_idx(_PAGE_PCD | _PAGE_PAT)] = _PAGE_CACHE_MODE_UC_MINUS, + [__pte2cm_idx( 0 | 0 | 0 )] = _PAGE_CACHE_MODE_WB, + [__pte2cm_idx(_PAGE_PWT | 0 | 0 )] = _PAGE_CACHE_MODE_WC, + [__pte2cm_idx( 0 | _PAGE_PCD | 0 )] = _PAGE_CACHE_MODE_UC_MINUS, + [__pte2cm_idx(_PAGE_PWT | _PAGE_PCD | 0 )] = _PAGE_CACHE_MODE_UC, + [__pte2cm_idx( 0 | 0 | _PAGE_PAT)] = _PAGE_CACHE_MODE_WB, + [__pte2cm_idx(_PAGE_PWT | 0 | _PAGE_PAT)] = _PAGE_CACHE_MODE_WC, + [__pte2cm_idx(0 | _PAGE_PCD | _PAGE_PAT)] = _PAGE_CACHE_MODE_UC_MINUS, [__pte2cm_idx(_PAGE_PWT | _PAGE_PCD | _PAGE_PAT)] = _PAGE_CACHE_MODE_UC, }; EXPORT_SYMBOL(__pte2cachemode_tbl); @@ -131,21 +135,7 @@ void __init early_alloc_pgt_buf(void) int after_bootmem; -int direct_gbpages -#ifdef CONFIG_DIRECT_GBPAGES - = 1 -#endif -; - -static void __init init_gbpages(void) -{ -#ifdef CONFIG_X86_64 - if (direct_gbpages && cpu_has_gbpages) - printk(KERN_INFO "Using GB pages for direct mapping\n"); - else - direct_gbpages = 0; -#endif -} +early_param_on_off("gbpages", "nogbpages", direct_gbpages, CONFIG_X86_DIRECT_GBPAGES); struct map_range { unsigned long start; @@ -157,16 +147,12 @@ static int page_size_mask; static void __init probe_page_size_mask(void) { - init_gbpages(); - #if !defined(CONFIG_DEBUG_PAGEALLOC) && !defined(CONFIG_KMEMCHECK) /* * For CONFIG_DEBUG_PAGEALLOC, identity mapping will use small pages. * This will simplify cpa(), which otherwise needs to support splitting * large pages into small in interrupt context, etc. */ - if (direct_gbpages) - page_size_mask |= 1 << PG_LEVEL_1G; if (cpu_has_pse) page_size_mask |= 1 << PG_LEVEL_2M; #endif @@ -179,6 +165,15 @@ static void __init probe_page_size_mask(void) if (cpu_has_pge) { cr4_set_bits_and_update_boot(X86_CR4_PGE); __supported_pte_mask |= _PAGE_GLOBAL; + } else + __supported_pte_mask &= ~_PAGE_GLOBAL; + + /* Enable 1 GB linear kernel mappings if available: */ + if (direct_gbpages && cpu_has_gbpages) { + printk(KERN_INFO "Using GB pages for direct mapping\n"); + page_size_mask |= 1 << PG_LEVEL_1G; + } else { + direct_gbpages = 0; } } diff --git a/arch/x86/mm/init_64.c b/arch/x86/mm/init_64.c index 30eb05ae7061..3fba623e3ba5 100644 --- a/arch/x86/mm/init_64.c +++ b/arch/x86/mm/init_64.c @@ -130,20 +130,6 @@ int kernel_ident_mapping_init(struct x86_mapping_info *info, pgd_t *pgd_page, return 0; } -static int __init parse_direct_gbpages_off(char *arg) -{ - direct_gbpages = 0; - return 0; -} -early_param("nogbpages", parse_direct_gbpages_off); - -static int __init parse_direct_gbpages_on(char *arg) -{ - direct_gbpages = 1; - return 0; -} -early_param("gbpages", parse_direct_gbpages_on); - /* * NOTE: pagetable_init alloc all the fixmap pagetables contiguous on the * physical space so we can cache the place of the first one and move diff --git a/arch/x86/mm/numa.c b/arch/x86/mm/numa.c index cd4785bbacb9..4053bb58bf92 100644 --- a/arch/x86/mm/numa.c +++ b/arch/x86/mm/numa.c @@ -482,9 +482,16 @@ static void __init numa_clear_kernel_node_hotplug(void) &memblock.reserved, mb->nid); } - /* Mark all kernel nodes. */ + /* + * Mark all kernel nodes. + * + * When booting with mem=nn[kMG] or in a kdump kernel, numa_meminfo + * may not include all the memblock.reserved memory ranges because + * trim_snb_memory() reserves specific pages for Sandy Bridge graphics. + */ for_each_memblock(reserved, r) - node_set(r->nid, numa_kernel_nodes); + if (r->nid != MAX_NUMNODES) + node_set(r->nid, numa_kernel_nodes); /* Clear MEMBLOCK_HOTPLUG flag for memory in kernel nodes. */ for (i = 0; i < numa_meminfo.nr_blks; i++) { diff --git a/arch/x86/mm/pageattr.c b/arch/x86/mm/pageattr.c index 536ea2fb6e33..89af288ec674 100644 --- a/arch/x86/mm/pageattr.c +++ b/arch/x86/mm/pageattr.c @@ -81,11 +81,9 @@ void arch_report_meminfo(struct seq_file *m) seq_printf(m, "DirectMap4M: %8lu kB\n", direct_pages_count[PG_LEVEL_2M] << 12); #endif -#ifdef CONFIG_X86_64 if (direct_gbpages) seq_printf(m, "DirectMap1G: %8lu kB\n", direct_pages_count[PG_LEVEL_1G] << 20); -#endif } #else static inline void split_page_count(int level) { } @@ -1654,13 +1652,11 @@ int set_memory_ro(unsigned long addr, int numpages) { return change_page_attr_clear(&addr, numpages, __pgprot(_PAGE_RW), 0); } -EXPORT_SYMBOL_GPL(set_memory_ro); int set_memory_rw(unsigned long addr, int numpages) { return change_page_attr_set(&addr, numpages, __pgprot(_PAGE_RW), 0); } -EXPORT_SYMBOL_GPL(set_memory_rw); int set_memory_np(unsigned long addr, int numpages) { diff --git a/arch/x86/mm/pat.c b/arch/x86/mm/pat.c index 7ac68698406c..35af6771a95a 100644 --- a/arch/x86/mm/pat.c +++ b/arch/x86/mm/pat.c @@ -610,7 +610,7 @@ pgprot_t phys_mem_access_prot(struct file *file, unsigned long pfn, } #ifdef CONFIG_STRICT_DEVMEM -/* This check is done in drivers/char/mem.c in case of STRICT_DEVMEM*/ +/* This check is done in drivers/char/mem.c in case of STRICT_DEVMEM */ static inline int range_is_allowed(unsigned long pfn, unsigned long size) { return 1; @@ -628,8 +628,8 @@ static inline int range_is_allowed(unsigned long pfn, unsigned long size) while (cursor < to) { if (!devmem_is_allowed(pfn)) { - printk(KERN_INFO "Program %s tried to access /dev/mem between [mem %#010Lx-%#010Lx]\n", - current->comm, from, to - 1); + printk(KERN_INFO "Program %s tried to access /dev/mem between [mem %#010Lx-%#010Lx], PAT prevents it\n", + current->comm, from, to - 1); return 0; } cursor += PAGE_SIZE; diff --git a/arch/x86/mm/pgtable.c b/arch/x86/mm/pgtable.c index 7b22adaad4f1..5a7e5252c878 100644 --- a/arch/x86/mm/pgtable.c +++ b/arch/x86/mm/pgtable.c @@ -275,12 +275,87 @@ static void pgd_prepopulate_pmd(struct mm_struct *mm, pgd_t *pgd, pmd_t *pmds[]) } } +/* + * Xen paravirt assumes pgd table should be in one page. 64 bit kernel also + * assumes that pgd should be in one page. + * + * But kernel with PAE paging that is not running as a Xen domain + * only needs to allocate 32 bytes for pgd instead of one page. + */ +#ifdef CONFIG_X86_PAE + +#include <linux/slab.h> + +#define PGD_SIZE (PTRS_PER_PGD * sizeof(pgd_t)) +#define PGD_ALIGN 32 + +static struct kmem_cache *pgd_cache; + +static int __init pgd_cache_init(void) +{ + /* + * When PAE kernel is running as a Xen domain, it does not use + * shared kernel pmd. And this requires a whole page for pgd. + */ + if (!SHARED_KERNEL_PMD) + return 0; + + /* + * when PAE kernel is not running as a Xen domain, it uses + * shared kernel pmd. Shared kernel pmd does not require a whole + * page for pgd. We are able to just allocate a 32-byte for pgd. + * During boot time, we create a 32-byte slab for pgd table allocation. + */ + pgd_cache = kmem_cache_create("pgd_cache", PGD_SIZE, PGD_ALIGN, + SLAB_PANIC, NULL); + if (!pgd_cache) + return -ENOMEM; + + return 0; +} +core_initcall(pgd_cache_init); + +static inline pgd_t *_pgd_alloc(void) +{ + /* + * If no SHARED_KERNEL_PMD, PAE kernel is running as a Xen domain. + * We allocate one page for pgd. + */ + if (!SHARED_KERNEL_PMD) + return (pgd_t *)__get_free_page(PGALLOC_GFP); + + /* + * Now PAE kernel is not running as a Xen domain. We can allocate + * a 32-byte slab for pgd to save memory space. + */ + return kmem_cache_alloc(pgd_cache, PGALLOC_GFP); +} + +static inline void _pgd_free(pgd_t *pgd) +{ + if (!SHARED_KERNEL_PMD) + free_page((unsigned long)pgd); + else + kmem_cache_free(pgd_cache, pgd); +} +#else +static inline pgd_t *_pgd_alloc(void) +{ + return (pgd_t *)__get_free_page(PGALLOC_GFP); +} + +static inline void _pgd_free(pgd_t *pgd) +{ + free_page((unsigned long)pgd); +} +#endif /* CONFIG_X86_PAE */ + pgd_t *pgd_alloc(struct mm_struct *mm) { pgd_t *pgd; pmd_t *pmds[PREALLOCATED_PMDS]; - pgd = (pgd_t *)__get_free_page(PGALLOC_GFP); + pgd = _pgd_alloc(); if (pgd == NULL) goto out; @@ -310,7 +385,7 @@ pgd_t *pgd_alloc(struct mm_struct *mm) out_free_pmds: free_pmds(mm, pmds); out_free_pgd: - free_page((unsigned long)pgd); + _pgd_free(pgd); out: return NULL; } @@ -320,7 +395,7 @@ void pgd_free(struct mm_struct *mm, pgd_t *pgd) pgd_mop_up_pmds(mm, pgd); pgd_dtor(pgd); paravirt_pgd_free(mm, pgd); - free_page((unsigned long)pgd); + _pgd_free(pgd); } /* diff --git a/arch/x86/oprofile/backtrace.c b/arch/x86/oprofile/backtrace.c index 5d04be5efb64..4e664bdb535a 100644 --- a/arch/x86/oprofile/backtrace.c +++ b/arch/x86/oprofile/backtrace.c @@ -111,7 +111,7 @@ x86_backtrace(struct pt_regs * const regs, unsigned int depth) { struct stack_frame *head = (struct stack_frame *)frame_pointer(regs); - if (!user_mode_vm(regs)) { + if (!user_mode(regs)) { unsigned long stack = kernel_stack_pointer(regs); if (depth) dump_trace(NULL, regs, (unsigned long *)stack, 0, diff --git a/arch/x86/pci/acpi.c b/arch/x86/pci/acpi.c index 6ac273832f28..e4695985f9de 100644 --- a/arch/x86/pci/acpi.c +++ b/arch/x86/pci/acpi.c @@ -331,7 +331,7 @@ static void probe_pci_root_info(struct pci_root_info *info, struct list_head *list) { int ret; - struct resource_entry *entry; + struct resource_entry *entry, *tmp; sprintf(info->name, "PCI Bus %04x:%02x", domain, busnum); info->bridge = device; @@ -345,8 +345,13 @@ static void probe_pci_root_info(struct pci_root_info *info, dev_dbg(&device->dev, "no IO and memory resources present in _CRS\n"); else - resource_list_for_each_entry(entry, list) - entry->res->name = info->name; + resource_list_for_each_entry_safe(entry, tmp, list) { + if ((entry->res->flags & IORESOURCE_WINDOW) == 0 || + (entry->res->flags & IORESOURCE_DISABLED)) + resource_list_destroy_entry(entry); + else + entry->res->name = info->name; + } } struct pci_bus *pci_acpi_scan_root(struct acpi_pci_root *root) diff --git a/arch/x86/pci/common.c b/arch/x86/pci/common.c index 3d2612b68694..2fb384724ebb 100644 --- a/arch/x86/pci/common.c +++ b/arch/x86/pci/common.c @@ -513,31 +513,6 @@ void __init pcibios_set_cache_line_size(void) } } -/* - * Some device drivers assume dev->irq won't change after calling - * pci_disable_device(). So delay releasing of IRQ resource to driver - * unbinding time. Otherwise it will break PM subsystem and drivers - * like xen-pciback etc. - */ -static int pci_irq_notifier(struct notifier_block *nb, unsigned long action, - void *data) -{ - struct pci_dev *dev = to_pci_dev(data); - - if (action != BUS_NOTIFY_UNBOUND_DRIVER) - return NOTIFY_DONE; - - if (pcibios_disable_irq) - pcibios_disable_irq(dev); - - return NOTIFY_OK; -} - -static struct notifier_block pci_irq_nb = { - .notifier_call = pci_irq_notifier, - .priority = INT_MIN, -}; - int __init pcibios_init(void) { if (!raw_pci_ops) { @@ -550,9 +525,6 @@ int __init pcibios_init(void) if (pci_bf_sort >= pci_force_bf) pci_sort_breadthfirst(); - - bus_register_notifier(&pci_bus_type, &pci_irq_nb); - return 0; } @@ -711,6 +683,12 @@ int pcibios_enable_device(struct pci_dev *dev, int mask) return 0; } +void pcibios_disable_device (struct pci_dev *dev) +{ + if (!pci_dev_msi_enabled(dev) && pcibios_disable_irq) + pcibios_disable_irq(dev); +} + int pci_ext_cfg_avail(void) { if (raw_pci_ext_ops) diff --git a/arch/x86/pci/intel_mid_pci.c b/arch/x86/pci/intel_mid_pci.c index efb849323c74..852aa4c92da0 100644 --- a/arch/x86/pci/intel_mid_pci.c +++ b/arch/x86/pci/intel_mid_pci.c @@ -234,10 +234,10 @@ static int intel_mid_pci_irq_enable(struct pci_dev *dev) static void intel_mid_pci_irq_disable(struct pci_dev *dev) { - if (dev->irq_managed && dev->irq > 0) { + if (!mp_should_keep_irq(&dev->dev) && dev->irq_managed && + dev->irq > 0) { mp_unmap_irq(dev->irq); dev->irq_managed = 0; - dev->irq = 0; } } diff --git a/arch/x86/pci/irq.c b/arch/x86/pci/irq.c index e71b3dbd87b8..5dc6ca5e1741 100644 --- a/arch/x86/pci/irq.c +++ b/arch/x86/pci/irq.c @@ -1256,9 +1256,22 @@ static int pirq_enable_irq(struct pci_dev *dev) return 0; } +bool mp_should_keep_irq(struct device *dev) +{ + if (dev->power.is_prepared) + return true; +#ifdef CONFIG_PM + if (dev->power.runtime_status == RPM_SUSPENDING) + return true; +#endif + + return false; +} + static void pirq_disable_irq(struct pci_dev *dev) { - if (io_apic_assign_pci_irqs && dev->irq_managed && dev->irq) { + if (io_apic_assign_pci_irqs && !mp_should_keep_irq(&dev->dev) && + dev->irq_managed && dev->irq) { mp_unmap_irq(dev->irq); dev->irq = 0; dev->irq_managed = 0; diff --git a/arch/x86/platform/efi/efi-bgrt.c b/arch/x86/platform/efi/efi-bgrt.c index d143d216d52b..d7f997f7c26d 100644 --- a/arch/x86/platform/efi/efi-bgrt.c +++ b/arch/x86/platform/efi/efi-bgrt.c @@ -67,7 +67,7 @@ void __init efi_bgrt_init(void) image = efi_lookup_mapped_addr(bgrt_tab->image_address); if (!image) { - image = early_memremap(bgrt_tab->image_address, + image = early_ioremap(bgrt_tab->image_address, sizeof(bmp_header)); ioremapped = true; if (!image) { @@ -89,7 +89,7 @@ void __init efi_bgrt_init(void) } if (ioremapped) { - image = early_memremap(bgrt_tab->image_address, + image = early_ioremap(bgrt_tab->image_address, bmp_header.size); if (!image) { pr_err("Ignoring BGRT: failed to map image memory\n"); diff --git a/arch/x86/platform/efi/efi.c b/arch/x86/platform/efi/efi.c index dbc8627a5cdf..02744df576d5 100644 --- a/arch/x86/platform/efi/efi.c +++ b/arch/x86/platform/efi/efi.c @@ -85,12 +85,20 @@ static efi_status_t __init phys_efi_set_virtual_address_map( efi_memory_desc_t *virtual_map) { efi_status_t status; + unsigned long flags; + pgd_t *save_pgd; - efi_call_phys_prolog(); + save_pgd = efi_call_phys_prolog(); + + /* Disable interrupts around EFI calls: */ + local_irq_save(flags); status = efi_call_phys(efi_phys.set_virtual_address_map, memory_map_size, descriptor_size, descriptor_version, virtual_map); - efi_call_phys_epilog(); + local_irq_restore(flags); + + efi_call_phys_epilog(save_pgd); + return status; } @@ -491,7 +499,8 @@ void __init efi_init(void) if (efi_memmap_init()) return; - print_efi_memmap(); + if (efi_enabled(EFI_DBG)) + print_efi_memmap(); } void __init efi_late_init(void) @@ -939,6 +948,8 @@ static int __init arch_parse_efi_cmdline(char *str) { if (parse_option_str(str, "old_map")) set_bit(EFI_OLD_MEMMAP, &efi.flags); + if (parse_option_str(str, "debug")) + set_bit(EFI_DBG, &efi.flags); return 0; } diff --git a/arch/x86/platform/efi/efi_32.c b/arch/x86/platform/efi/efi_32.c index 40e7cda52936..ed5b67338294 100644 --- a/arch/x86/platform/efi/efi_32.c +++ b/arch/x86/platform/efi/efi_32.c @@ -33,11 +33,10 @@ /* * To make EFI call EFI runtime service in physical addressing mode we need - * prolog/epilog before/after the invocation to disable interrupt, to - * claim EFI runtime service handler exclusively and to duplicate a memory in - * low memory space say 0 - 3G. + * prolog/epilog before/after the invocation to claim the EFI runtime service + * handler exclusively and to duplicate a memory mapping in low memory space, + * say 0 - 3G. */ -static unsigned long efi_rt_eflags; void efi_sync_low_kernel_mappings(void) {} void __init efi_dump_pagetable(void) {} @@ -57,21 +56,24 @@ void __init efi_map_region(efi_memory_desc_t *md) void __init efi_map_region_fixed(efi_memory_desc_t *md) {} void __init parse_efi_setup(u64 phys_addr, u32 data_len) {} -void __init efi_call_phys_prolog(void) +pgd_t * __init efi_call_phys_prolog(void) { struct desc_ptr gdt_descr; + pgd_t *save_pgd; - local_irq_save(efi_rt_eflags); - + /* Current pgd is swapper_pg_dir, we'll restore it later: */ + save_pgd = swapper_pg_dir; load_cr3(initial_page_table); __flush_tlb_all(); gdt_descr.address = __pa(get_cpu_gdt_table(0)); gdt_descr.size = GDT_SIZE - 1; load_gdt(&gdt_descr); + + return save_pgd; } -void __init efi_call_phys_epilog(void) +void __init efi_call_phys_epilog(pgd_t *save_pgd) { struct desc_ptr gdt_descr; @@ -79,10 +81,8 @@ void __init efi_call_phys_epilog(void) gdt_descr.size = GDT_SIZE - 1; load_gdt(&gdt_descr); - load_cr3(swapper_pg_dir); + load_cr3(save_pgd); __flush_tlb_all(); - - local_irq_restore(efi_rt_eflags); } void __init efi_runtime_mkexec(void) diff --git a/arch/x86/platform/efi/efi_64.c b/arch/x86/platform/efi/efi_64.c index 17e80d829df0..a0ac0f9c307f 100644 --- a/arch/x86/platform/efi/efi_64.c +++ b/arch/x86/platform/efi/efi_64.c @@ -41,9 +41,6 @@ #include <asm/realmode.h> #include <asm/time.h> -static pgd_t *save_pgd __initdata; -static unsigned long efi_flags __initdata; - /* * We allocate runtime services regions bottom-up, starting from -4G, i.e. * 0xffff_ffff_0000_0000 and limit EFI VA mapping space to 64G. @@ -78,17 +75,18 @@ static void __init early_code_mapping_set_exec(int executable) } } -void __init efi_call_phys_prolog(void) +pgd_t * __init efi_call_phys_prolog(void) { unsigned long vaddress; + pgd_t *save_pgd; + int pgd; int n_pgds; if (!efi_enabled(EFI_OLD_MEMMAP)) - return; + return NULL; early_code_mapping_set_exec(1); - local_irq_save(efi_flags); n_pgds = DIV_ROUND_UP((max_pfn << PAGE_SHIFT), PGDIR_SIZE); save_pgd = kmalloc(n_pgds * sizeof(pgd_t), GFP_KERNEL); @@ -99,24 +97,29 @@ void __init efi_call_phys_prolog(void) set_pgd(pgd_offset_k(pgd * PGDIR_SIZE), *pgd_offset_k(vaddress)); } __flush_tlb_all(); + + return save_pgd; } -void __init efi_call_phys_epilog(void) +void __init efi_call_phys_epilog(pgd_t *save_pgd) { /* * After the lock is released, the original page table is restored. */ - int pgd; - int n_pgds = DIV_ROUND_UP((max_pfn << PAGE_SHIFT) , PGDIR_SIZE); + int pgd_idx; + int nr_pgds; - if (!efi_enabled(EFI_OLD_MEMMAP)) + if (!save_pgd) return; - for (pgd = 0; pgd < n_pgds; pgd++) - set_pgd(pgd_offset_k(pgd * PGDIR_SIZE), save_pgd[pgd]); + nr_pgds = DIV_ROUND_UP((max_pfn << PAGE_SHIFT) , PGDIR_SIZE); + + for (pgd_idx = 0; pgd_idx < nr_pgds; pgd_idx++) + set_pgd(pgd_offset_k(pgd_idx * PGDIR_SIZE), save_pgd[pgd_idx]); + kfree(save_pgd); + __flush_tlb_all(); - local_irq_restore(efi_flags); early_code_mapping_set_exec(0); } diff --git a/arch/x86/platform/intel-mid/intel-mid.c b/arch/x86/platform/intel-mid/intel-mid.c index 1bbedc4b0f88..3005f0c89f2e 100644 --- a/arch/x86/platform/intel-mid/intel-mid.c +++ b/arch/x86/platform/intel-mid/intel-mid.c @@ -130,7 +130,7 @@ static void intel_mid_arch_setup(void) intel_mid_ops = get_intel_mid_ops[__intel_mid_cpu_chip](); else { intel_mid_ops = get_intel_mid_ops[INTEL_MID_CPU_CHIP_PENWELL](); - pr_info("ARCH: Uknown SoC, assuming PENWELL!\n"); + pr_info("ARCH: Unknown SoC, assuming PENWELL!\n"); } out: diff --git a/arch/x86/platform/intel-quark/imr_selftest.c b/arch/x86/platform/intel-quark/imr_selftest.c index c9a0838890e2..278e4da4222f 100644 --- a/arch/x86/platform/intel-quark/imr_selftest.c +++ b/arch/x86/platform/intel-quark/imr_selftest.c @@ -11,6 +11,7 @@ */ #include <asm-generic/sections.h> +#include <asm/cpu_device_id.h> #include <asm/imr.h> #include <linux/init.h> #include <linux/mm.h> @@ -101,6 +102,12 @@ static void __init imr_self_test(void) } } +static const struct x86_cpu_id imr_ids[] __initconst = { + { X86_VENDOR_INTEL, 5, 9 }, /* Intel Quark SoC X1000. */ + {} +}; +MODULE_DEVICE_TABLE(x86cpu, imr_ids); + /** * imr_self_test_init - entry point for IMR driver. * @@ -108,7 +115,8 @@ static void __init imr_self_test(void) */ static int __init imr_self_test_init(void) { - imr_self_test(); + if (x86_match_cpu(imr_ids)) + imr_self_test(); return 0; } diff --git a/arch/x86/platform/uv/tlb_uv.c b/arch/x86/platform/uv/tlb_uv.c index 994798548b1a..3b6ec42718e4 100644 --- a/arch/x86/platform/uv/tlb_uv.c +++ b/arch/x86/platform/uv/tlb_uv.c @@ -415,7 +415,7 @@ static void reset_with_ipi(struct pnmask *distribution, struct bau_control *bcp) struct reset_args reset_args; reset_args.sender = sender; - cpus_clear(*mask); + cpumask_clear(mask); /* find a single cpu for each uvhub in this distribution mask */ maskbits = sizeof(struct pnmask) * BITSPERBYTE; /* each bit is a pnode relative to the partition base pnode */ @@ -425,7 +425,7 @@ static void reset_with_ipi(struct pnmask *distribution, struct bau_control *bcp) continue; apnode = pnode + bcp->partition_base_pnode; cpu = pnode_to_first_cpu(apnode, smaster); - cpu_set(cpu, *mask); + cpumask_set_cpu(cpu, mask); } /* IPI all cpus; preemption is already disabled */ @@ -1126,7 +1126,7 @@ const struct cpumask *uv_flush_tlb_others(const struct cpumask *cpumask, /* don't actually do a shootdown of the local cpu */ cpumask_andnot(flush_mask, cpumask, cpumask_of(cpu)); - if (cpu_isset(cpu, *cpumask)) + if (cpumask_test_cpu(cpu, cpumask)) stat->s_ntargself++; bau_desc = bcp->descriptor_base; diff --git a/arch/x86/power/cpu.c b/arch/x86/power/cpu.c index 3e32ed5648a0..757678fb26e1 100644 --- a/arch/x86/power/cpu.c +++ b/arch/x86/power/cpu.c @@ -134,7 +134,7 @@ static void do_fpu_end(void) static void fix_processor_context(void) { int cpu = smp_processor_id(); - struct tss_struct *t = &per_cpu(init_tss, cpu); + struct tss_struct *t = &per_cpu(cpu_tss, cpu); #ifdef CONFIG_X86_64 struct desc_struct *desc = get_cpu_gdt_table(cpu); tss_desc tss; diff --git a/arch/x86/syscalls/syscall_32.tbl b/arch/x86/syscalls/syscall_32.tbl index b3560ece1c9f..ef8187f9d28d 100644 --- a/arch/x86/syscalls/syscall_32.tbl +++ b/arch/x86/syscalls/syscall_32.tbl @@ -119,7 +119,7 @@ 110 i386 iopl sys_iopl 111 i386 vhangup sys_vhangup 112 i386 idle -113 i386 vm86old sys_vm86old sys32_vm86_warning +113 i386 vm86old sys_vm86old sys_ni_syscall 114 i386 wait4 sys_wait4 compat_sys_wait4 115 i386 swapoff sys_swapoff 116 i386 sysinfo sys_sysinfo compat_sys_sysinfo @@ -172,7 +172,7 @@ 163 i386 mremap sys_mremap 164 i386 setresuid sys_setresuid16 165 i386 getresuid sys_getresuid16 -166 i386 vm86 sys_vm86 sys32_vm86_warning +166 i386 vm86 sys_vm86 sys_ni_syscall 167 i386 query_module 168 i386 poll sys_poll 169 i386 nfsservctl diff --git a/arch/x86/syscalls/syscall_64.tbl b/arch/x86/syscalls/syscall_64.tbl index 8d656fbb57aa..9ef32d5f1b19 100644 --- a/arch/x86/syscalls/syscall_64.tbl +++ b/arch/x86/syscalls/syscall_64.tbl @@ -178,7 +178,7 @@ 169 common reboot sys_reboot 170 common sethostname sys_sethostname 171 common setdomainname sys_setdomainname -172 common iopl stub_iopl +172 common iopl sys_iopl 173 common ioperm sys_ioperm 174 64 create_module 175 common init_module sys_init_module diff --git a/arch/x86/um/asm/barrier.h b/arch/x86/um/asm/barrier.h index 2d7d9a1f5b53..8ffd2146fa6a 100644 --- a/arch/x86/um/asm/barrier.h +++ b/arch/x86/um/asm/barrier.h @@ -64,8 +64,8 @@ */ static inline void rdtsc_barrier(void) { - alternative(ASM_NOP3, "mfence", X86_FEATURE_MFENCE_RDTSC); - alternative(ASM_NOP3, "lfence", X86_FEATURE_LFENCE_RDTSC); + alternative_2("", "mfence", X86_FEATURE_MFENCE_RDTSC, + "lfence", X86_FEATURE_LFENCE_RDTSC); } #endif diff --git a/arch/x86/um/sys_call_table_64.c b/arch/x86/um/sys_call_table_64.c index 5cdfa9db2217..a75d8700472a 100644 --- a/arch/x86/um/sys_call_table_64.c +++ b/arch/x86/um/sys_call_table_64.c @@ -16,7 +16,7 @@ */ /* Not going to be implemented by UML, since we have no hardware. */ -#define stub_iopl sys_ni_syscall +#define sys_iopl sys_ni_syscall #define sys_ioperm sys_ni_syscall /* diff --git a/arch/x86/vdso/Makefile b/arch/x86/vdso/Makefile index 7b9be9822724..275a3a8b78af 100644 --- a/arch/x86/vdso/Makefile +++ b/arch/x86/vdso/Makefile @@ -51,7 +51,7 @@ VDSO_LDFLAGS_vdso.lds = -m64 -Wl,-soname=linux-vdso.so.1 \ $(obj)/vdso64.so.dbg: $(src)/vdso.lds $(vobjs) FORCE $(call if_changed,vdso) -HOST_EXTRACFLAGS += -I$(srctree)/tools/include +HOST_EXTRACFLAGS += -I$(srctree)/tools/include -I$(srctree)/include/uapi hostprogs-y += vdso2c quiet_cmd_vdso2c = VDSO2C $@ @@ -206,4 +206,4 @@ $(vdso_img_insttargets): install_%: $(obj)/%.dbg $(MODLIB)/vdso FORCE PHONY += vdso_install $(vdso_img_insttargets) vdso_install: $(vdso_img_insttargets) FORCE -clean-files := vdso32-syscall* vdso32-sysenter* vdso32-int80* vdso64* +clean-files := vdso32-syscall* vdso32-sysenter* vdso32-int80* vdso64* vdso-image-*.c vdsox32.so* diff --git a/arch/x86/vdso/vclock_gettime.c b/arch/x86/vdso/vclock_gettime.c index 9793322751e0..40d2473836c9 100644 --- a/arch/x86/vdso/vclock_gettime.c +++ b/arch/x86/vdso/vclock_gettime.c @@ -82,18 +82,15 @@ static notrace cycle_t vread_pvclock(int *mode) cycle_t ret; u64 last; u32 version; + u32 migrate_count; u8 flags; unsigned cpu, cpu1; /* - * Note: hypervisor must guarantee that: - * 1. cpu ID number maps 1:1 to per-CPU pvclock time info. - * 2. that per-CPU pvclock time info is updated if the - * underlying CPU changes. - * 3. that version is increased whenever underlying CPU - * changes. - * + * When looping to get a consistent (time-info, tsc) pair, we + * also need to deal with the possibility we can switch vcpus, + * so make sure we always re-fetch time-info for the current vcpu. */ do { cpu = __getcpu() & VGETCPU_CPU_MASK; @@ -102,20 +99,27 @@ static notrace cycle_t vread_pvclock(int *mode) * __getcpu() calls (Gleb). */ - pvti = get_pvti(cpu); + /* Make sure migrate_count will change if we leave the VCPU. */ + do { + pvti = get_pvti(cpu); + migrate_count = pvti->migrate_count; + + cpu1 = cpu; + cpu = __getcpu() & VGETCPU_CPU_MASK; + } while (unlikely(cpu != cpu1)); version = __pvclock_read_cycles(&pvti->pvti, &ret, &flags); /* * Test we're still on the cpu as well as the version. - * We could have been migrated just after the first - * vgetcpu but before fetching the version, so we - * wouldn't notice a version change. + * - We must read TSC of pvti's VCPU. + * - KVM doesn't follow the versioning protocol, so data could + * change before version if we left the VCPU. */ - cpu1 = __getcpu() & VGETCPU_CPU_MASK; - } while (unlikely(cpu != cpu1 || - (pvti->pvti.version & 1) || - pvti->pvti.version != version)); + smp_rmb(); + } while (unlikely((pvti->pvti.version & 1) || + pvti->pvti.version != version || + pvti->migrate_count != migrate_count)); if (unlikely(!(flags & PVCLOCK_TSC_STABLE_BIT))) *mode = VCLOCK_NONE; diff --git a/arch/x86/vdso/vdso32/sigreturn.S b/arch/x86/vdso/vdso32/sigreturn.S index 31776d0efc8c..d7ec4e251c0a 100644 --- a/arch/x86/vdso/vdso32/sigreturn.S +++ b/arch/x86/vdso/vdso32/sigreturn.S @@ -17,6 +17,7 @@ .text .globl __kernel_sigreturn .type __kernel_sigreturn,@function + nop /* this guy is needed for .LSTARTFDEDLSI1 below (watch for HACK) */ ALIGN __kernel_sigreturn: .LSTART_sigreturn: diff --git a/arch/x86/vdso/vdso32/syscall.S b/arch/x86/vdso/vdso32/syscall.S index 5415b5613d55..6b286bb5251c 100644 --- a/arch/x86/vdso/vdso32/syscall.S +++ b/arch/x86/vdso/vdso32/syscall.S @@ -19,8 +19,6 @@ __kernel_vsyscall: .Lpush_ebp: movl %ecx, %ebp syscall - movl $__USER32_DS, %ecx - movl %ecx, %ss movl %ebp, %ecx popl %ebp .Lpop_ebp: diff --git a/arch/x86/xen/enlighten.c b/arch/x86/xen/enlighten.c index bd8b8459c3d0..81665c9f2132 100644 --- a/arch/x86/xen/enlighten.c +++ b/arch/x86/xen/enlighten.c @@ -912,6 +912,7 @@ static void xen_load_sp0(struct tss_struct *tss, mcs = xen_mc_entry(0); MULTI_stack_switch(mcs.mc, __KERNEL_DS, thread->sp0); xen_mc_issue(PARAVIRT_LAZY_CPU); + tss->x86_tss.sp0 = thread->sp0; } static void xen_set_iopl_mask(unsigned mask) @@ -1070,6 +1071,23 @@ static inline void xen_write_cr8(unsigned long val) BUG_ON(val); } #endif + +static u64 xen_read_msr_safe(unsigned int msr, int *err) +{ + u64 val; + + val = native_read_msr_safe(msr, err); + switch (msr) { + case MSR_IA32_APICBASE: +#ifdef CONFIG_X86_X2APIC + if (!(cpuid_ecx(1) & (1 << (X86_FEATURE_X2APIC & 31)))) +#endif + val &= ~X2APIC_ENABLE; + break; + } + return val; +} + static int xen_write_msr_safe(unsigned int msr, unsigned low, unsigned high) { int ret; @@ -1240,7 +1258,7 @@ static const struct pv_cpu_ops xen_cpu_ops __initconst = { .wbinvd = native_wbinvd, - .read_msr = native_read_msr_safe, + .read_msr = xen_read_msr_safe, .write_msr = xen_write_msr_safe, .read_tsc = native_read_tsc, @@ -1741,6 +1759,7 @@ asmlinkage __visible void __init xen_start_kernel(void) #ifdef CONFIG_X86_32 i386_start_kernel(); #else + cr4_init_shadow(); /* 32b kernel does this in i386_start_kernel() */ x86_64_start_reservations((char *)__pa_symbol(&boot_params)); #endif } diff --git a/arch/x86/xen/p2m.c b/arch/x86/xen/p2m.c index 740ae3026a14..b47124d4cd67 100644 --- a/arch/x86/xen/p2m.c +++ b/arch/x86/xen/p2m.c @@ -91,6 +91,12 @@ EXPORT_SYMBOL_GPL(xen_p2m_size); unsigned long xen_max_p2m_pfn __read_mostly; EXPORT_SYMBOL_GPL(xen_max_p2m_pfn); +#ifdef CONFIG_XEN_BALLOON_MEMORY_HOTPLUG_LIMIT +#define P2M_LIMIT CONFIG_XEN_BALLOON_MEMORY_HOTPLUG_LIMIT +#else +#define P2M_LIMIT 0 +#endif + static DEFINE_SPINLOCK(p2m_update_lock); static unsigned long *p2m_mid_missing_mfn; @@ -385,9 +391,11 @@ static void __init xen_rebuild_p2m_list(unsigned long *p2m) void __init xen_vmalloc_p2m_tree(void) { static struct vm_struct vm; + unsigned long p2m_limit; + p2m_limit = (phys_addr_t)P2M_LIMIT * 1024 * 1024 * 1024 / PAGE_SIZE; vm.flags = VM_ALLOC; - vm.size = ALIGN(sizeof(unsigned long) * xen_max_p2m_pfn, + vm.size = ALIGN(sizeof(unsigned long) * max(xen_max_p2m_pfn, p2m_limit), PMD_SIZE * PMDS_PER_MID_PAGE); vm_area_register_early(&vm, PMD_SIZE * PMDS_PER_MID_PAGE); pr_notice("p2m virtual area at %p, size is %lx\n", vm.addr, vm.size); @@ -563,7 +571,7 @@ static bool alloc_p2m(unsigned long pfn) if (p2m_pfn == PFN_DOWN(__pa(p2m_missing))) p2m_init(p2m); else - p2m_init_identity(p2m, pfn); + p2m_init_identity(p2m, pfn & ~(P2M_PER_PAGE - 1)); spin_lock_irqsave(&p2m_update_lock, flags); diff --git a/arch/x86/xen/smp.c b/arch/x86/xen/smp.c index 08e8489c47f1..7413ee3706d0 100644 --- a/arch/x86/xen/smp.c +++ b/arch/x86/xen/smp.c @@ -445,15 +445,7 @@ static int xen_cpu_up(unsigned int cpu, struct task_struct *idle) { int rc; - per_cpu(current_task, cpu) = idle; -#ifdef CONFIG_X86_32 - irq_ctx_init(cpu); -#else - clear_tsk_thread_flag(idle, TIF_FORK); -#endif - per_cpu(kernel_stack, cpu) = - (unsigned long)task_stack_page(idle) - - KERNEL_STACK_OFFSET + THREAD_SIZE; + common_cpu_up(cpu, idle); xen_setup_runstate_info(cpu); xen_setup_timer(cpu); @@ -468,10 +460,6 @@ static int xen_cpu_up(unsigned int cpu, struct task_struct *idle) if (rc) return rc; - if (num_online_cpus() == 1) - /* Just in case we booted with a single CPU. */ - alternatives_enable_smp(); - rc = xen_smp_intr_init(cpu); if (rc) return rc; diff --git a/arch/x86/xen/suspend.c b/arch/x86/xen/suspend.c index c4df9dbd63b7..d9497698645a 100644 --- a/arch/x86/xen/suspend.c +++ b/arch/x86/xen/suspend.c @@ -1,5 +1,5 @@ #include <linux/types.h> -#include <linux/clockchips.h> +#include <linux/tick.h> #include <xen/interface/xen.h> #include <xen/grant_table.h> @@ -81,17 +81,14 @@ void xen_arch_post_suspend(int cancelled) static void xen_vcpu_notify_restore(void *data) { - unsigned long reason = (unsigned long)data; - /* Boot processor notified via generic timekeeping_resume() */ - if ( smp_processor_id() == 0) + if (smp_processor_id() == 0) return; - clockevents_notify(reason, NULL); + tick_resume_local(); } void xen_arch_resume(void) { - on_each_cpu(xen_vcpu_notify_restore, - (void *)CLOCK_EVT_NOTIFY_RESUME, 1); + on_each_cpu(xen_vcpu_notify_restore, NULL, 1); } diff --git a/arch/x86/xen/xen-asm_64.S b/arch/x86/xen/xen-asm_64.S index 53adefda4275..985fc3ee0973 100644 --- a/arch/x86/xen/xen-asm_64.S +++ b/arch/x86/xen/xen-asm_64.S @@ -68,11 +68,11 @@ ENTRY(xen_sysret64) * We're already on the usermode stack at this point, but * still with the kernel gs, so we can easily switch back */ - movq %rsp, PER_CPU_VAR(old_rsp) + movq %rsp, PER_CPU_VAR(rsp_scratch) movq PER_CPU_VAR(kernel_stack), %rsp pushq $__USER_DS - pushq PER_CPU_VAR(old_rsp) + pushq PER_CPU_VAR(rsp_scratch) pushq %r11 pushq $__USER_CS pushq %rcx @@ -87,11 +87,11 @@ ENTRY(xen_sysret32) * We're already on the usermode stack at this point, but * still with the kernel gs, so we can easily switch back */ - movq %rsp, PER_CPU_VAR(old_rsp) + movq %rsp, PER_CPU_VAR(rsp_scratch) movq PER_CPU_VAR(kernel_stack), %rsp pushq $__USER32_DS - pushq PER_CPU_VAR(old_rsp) + pushq PER_CPU_VAR(rsp_scratch) pushq %r11 pushq $__USER32_CS pushq %rcx |