diff options
Diffstat (limited to 'arch/x86')
50 files changed, 1233 insertions, 1015 deletions
diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index 0dc9d0144a27..5e28e2be3a41 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -138,6 +138,7 @@ config X86 select HAVE_ACPI_APEI_NMI if ACPI select ACPI_LEGACY_TABLES_LOOKUP if ACPI select X86_FEATURE_NAMES if PROC_FS + select SRCU config INSTRUCTION_DECODER def_bool y @@ -855,6 +856,10 @@ config SCHED_MC source "kernel/Kconfig.preempt" +config UP_LATE_INIT + def_bool y + depends on !SMP && X86_LOCAL_APIC + config X86_UP_APIC bool "Local APIC support on uniprocessors" depends on X86_32 && !SMP && !X86_32_NON_STANDARD diff --git a/arch/x86/boot/compressed/Makefile b/arch/x86/boot/compressed/Makefile index ad754b4411f7..8bd44e8ee6e2 100644 --- a/arch/x86/boot/compressed/Makefile +++ b/arch/x86/boot/compressed/Makefile @@ -49,6 +49,7 @@ $(obj)/eboot.o: KBUILD_CFLAGS += -fshort-wchar -mno-red-zone vmlinux-objs-$(CONFIG_EFI_STUB) += $(obj)/eboot.o $(obj)/efi_stub_$(BITS).o \ $(objtree)/drivers/firmware/efi/libstub/lib.a +vmlinux-objs-$(CONFIG_EFI_MIXED) += $(obj)/efi_thunk_$(BITS).o $(obj)/vmlinux: $(vmlinux-objs-y) FORCE $(call if_changed,ld) diff --git a/arch/x86/boot/compressed/efi_stub_64.S b/arch/x86/boot/compressed/efi_stub_64.S index 7ff3632806b1..99494dff2113 100644 --- a/arch/x86/boot/compressed/efi_stub_64.S +++ b/arch/x86/boot/compressed/efi_stub_64.S @@ -3,28 +3,3 @@ #include <asm/processor-flags.h> #include "../../platform/efi/efi_stub_64.S" - -#ifdef CONFIG_EFI_MIXED - .code64 - .text -ENTRY(efi64_thunk) - push %rbp - push %rbx - - subq $16, %rsp - leaq efi_exit32(%rip), %rax - movl %eax, 8(%rsp) - leaq efi_gdt64(%rip), %rax - movl %eax, 4(%rsp) - movl %eax, 2(%rax) /* Fixup the gdt base address */ - leaq efi32_boot_gdt(%rip), %rax - movl %eax, (%rsp) - - call __efi64_thunk - - addq $16, %rsp - pop %rbx - pop %rbp - ret -ENDPROC(efi64_thunk) -#endif /* CONFIG_EFI_MIXED */ diff --git a/arch/x86/boot/compressed/efi_thunk_64.S b/arch/x86/boot/compressed/efi_thunk_64.S new file mode 100644 index 000000000000..630384a4c14a --- /dev/null +++ b/arch/x86/boot/compressed/efi_thunk_64.S @@ -0,0 +1,196 @@ +/* + * Copyright (C) 2014, 2015 Intel Corporation; author Matt Fleming + * + * Early support for invoking 32-bit EFI services from a 64-bit kernel. + * + * Because this thunking occurs before ExitBootServices() we have to + * restore the firmware's 32-bit GDT before we make EFI serivce calls, + * since the firmware's 32-bit IDT is still currently installed and it + * needs to be able to service interrupts. + * + * On the plus side, we don't have to worry about mangling 64-bit + * addresses into 32-bits because we're executing with an identify + * mapped pagetable and haven't transitioned to 64-bit virtual addresses + * yet. + */ + +#include <linux/linkage.h> +#include <asm/msr.h> +#include <asm/page_types.h> +#include <asm/processor-flags.h> +#include <asm/segment.h> + + .code64 + .text +ENTRY(efi64_thunk) + push %rbp + push %rbx + + subq $8, %rsp + leaq efi_exit32(%rip), %rax + movl %eax, 4(%rsp) + leaq efi_gdt64(%rip), %rax + movl %eax, (%rsp) + movl %eax, 2(%rax) /* Fixup the gdt base address */ + + movl %ds, %eax + push %rax + movl %es, %eax + push %rax + movl %ss, %eax + push %rax + + /* + * Convert x86-64 ABI params to i386 ABI + */ + subq $32, %rsp + movl %esi, 0x0(%rsp) + movl %edx, 0x4(%rsp) + movl %ecx, 0x8(%rsp) + movq %r8, %rsi + movl %esi, 0xc(%rsp) + movq %r9, %rsi + movl %esi, 0x10(%rsp) + + sgdt save_gdt(%rip) + + leaq 1f(%rip), %rbx + movq %rbx, func_rt_ptr(%rip) + + /* + * Switch to gdt with 32-bit segments. This is the firmware GDT + * that was installed when the kernel started executing. This + * pointer was saved at the EFI stub entry point in head_64.S. + */ + leaq efi32_boot_gdt(%rip), %rax + lgdt (%rax) + + pushq $__KERNEL_CS + leaq efi_enter32(%rip), %rax + pushq %rax + lretq + +1: addq $32, %rsp + + lgdt save_gdt(%rip) + + pop %rbx + movl %ebx, %ss + pop %rbx + movl %ebx, %es + pop %rbx + movl %ebx, %ds + + /* + * Convert 32-bit status code into 64-bit. + */ + test %rax, %rax + jz 1f + movl %eax, %ecx + andl $0x0fffffff, %ecx + andl $0xf0000000, %eax + shl $32, %rax + or %rcx, %rax +1: + addq $8, %rsp + pop %rbx + pop %rbp + ret +ENDPROC(efi64_thunk) + +ENTRY(efi_exit32) + movq func_rt_ptr(%rip), %rax + push %rax + mov %rdi, %rax + ret +ENDPROC(efi_exit32) + + .code32 +/* + * EFI service pointer must be in %edi. + * + * The stack should represent the 32-bit calling convention. + */ +ENTRY(efi_enter32) + movl $__KERNEL_DS, %eax + movl %eax, %ds + movl %eax, %es + movl %eax, %ss + + /* Reload pgtables */ + movl %cr3, %eax + movl %eax, %cr3 + + /* Disable paging */ + movl %cr0, %eax + btrl $X86_CR0_PG_BIT, %eax + movl %eax, %cr0 + + /* Disable long mode via EFER */ + movl $MSR_EFER, %ecx + rdmsr + btrl $_EFER_LME, %eax + wrmsr + + call *%edi + + /* We must preserve return value */ + movl %eax, %edi + + /* + * Some firmware will return with interrupts enabled. Be sure to + * disable them before we switch GDTs. + */ + cli + + movl 56(%esp), %eax + movl %eax, 2(%eax) + lgdtl (%eax) + + movl %cr4, %eax + btsl $(X86_CR4_PAE_BIT), %eax + movl %eax, %cr4 + + movl %cr3, %eax + movl %eax, %cr3 + + movl $MSR_EFER, %ecx + rdmsr + btsl $_EFER_LME, %eax + wrmsr + + xorl %eax, %eax + lldt %ax + + movl 60(%esp), %eax + pushl $__KERNEL_CS + pushl %eax + + /* Enable paging */ + movl %cr0, %eax + btsl $X86_CR0_PG_BIT, %eax + movl %eax, %cr0 + lret +ENDPROC(efi_enter32) + + .data + .balign 8 + .global efi32_boot_gdt +efi32_boot_gdt: .word 0 + .quad 0 + +save_gdt: .word 0 + .quad 0 +func_rt_ptr: .quad 0 + + .global efi_gdt64 +efi_gdt64: + .word efi_gdt64_end - efi_gdt64 + .long 0 /* Filled out by user */ + .word 0 + .quad 0x0000000000000000 /* NULL descriptor */ + .quad 0x00af9a000000ffff /* __KERNEL_CS */ + .quad 0x00cf92000000ffff /* __KERNEL_DS */ + .quad 0x0080890000000000 /* TS descriptor */ + .quad 0x0000000000000000 /* TS continued */ +efi_gdt64_end: diff --git a/arch/x86/boot/ctype.h b/arch/x86/boot/ctype.h index 25e13403193c..020f137df7a2 100644 --- a/arch/x86/boot/ctype.h +++ b/arch/x86/boot/ctype.h @@ -1,6 +1,5 @@ -#ifndef BOOT_ISDIGIT_H - -#define BOOT_ISDIGIT_H +#ifndef BOOT_CTYPE_H +#define BOOT_CTYPE_H static inline int isdigit(int ch) { diff --git a/arch/x86/boot/early_serial_console.c b/arch/x86/boot/early_serial_console.c index 5df2869c874b..45a07684bbab 100644 --- a/arch/x86/boot/early_serial_console.c +++ b/arch/x86/boot/early_serial_console.c @@ -2,8 +2,6 @@ #define DEFAULT_SERIAL_PORT 0x3f8 /* ttyS0 */ -#define XMTRDY 0x20 - #define DLAB 0x80 #define TXR 0 /* Transmit register (WRITE) */ @@ -74,8 +72,8 @@ static void parse_earlyprintk(void) static const int bases[] = { 0x3f8, 0x2f8 }; int idx = 0; - if (!strncmp(arg + pos, "ttyS", 4)) - pos += 4; + /* += strlen("ttyS"); */ + pos += 4; if (arg[pos++] == '1') idx = 1; diff --git a/arch/x86/ia32/ia32entry.S b/arch/x86/ia32/ia32entry.S index 82e8a1d44658..156ebcab4ada 100644 --- a/arch/x86/ia32/ia32entry.S +++ b/arch/x86/ia32/ia32entry.S @@ -179,8 +179,8 @@ sysenter_dispatch: sysexit_from_sys_call: andl $~TS_COMPAT,TI_status+THREAD_INFO(%rsp,RIP-ARGOFFSET) /* clear IF, that popfq doesn't enable interrupts early */ - andl $~0x200,EFLAGS-R11(%rsp) - movl RIP-R11(%rsp),%edx /* User %eip */ + andl $~0x200,EFLAGS-ARGOFFSET(%rsp) + movl RIP-ARGOFFSET(%rsp),%edx /* User %eip */ CFI_REGISTER rip,rdx RESTORE_ARGS 0,24,0,0,0,0 xorq %r8,%r8 diff --git a/arch/x86/include/asm/apic.h b/arch/x86/include/asm/apic.h index 465b309af254..efc3b22d896e 100644 --- a/arch/x86/include/asm/apic.h +++ b/arch/x86/include/asm/apic.h @@ -106,7 +106,14 @@ extern u32 native_safe_apic_wait_icr_idle(void); extern void native_apic_icr_write(u32 low, u32 id); extern u64 native_apic_icr_read(void); -extern int x2apic_mode; +static inline bool apic_is_x2apic_enabled(void) +{ + u64 msr; + + if (rdmsrl_safe(MSR_IA32_APICBASE, &msr)) + return false; + return msr & X2APIC_ENABLE; +} #ifdef CONFIG_X86_X2APIC /* @@ -169,48 +176,23 @@ static inline u64 native_x2apic_icr_read(void) return val; } +extern int x2apic_mode; extern int x2apic_phys; -extern int x2apic_preenabled; -extern void check_x2apic(void); -extern void enable_x2apic(void); +extern void __init check_x2apic(void); +extern void x2apic_setup(void); static inline int x2apic_enabled(void) { - u64 msr; - - if (!cpu_has_x2apic) - return 0; - - rdmsrl(MSR_IA32_APICBASE, msr); - if (msr & X2APIC_ENABLE) - return 1; - return 0; + return cpu_has_x2apic && apic_is_x2apic_enabled(); } #define x2apic_supported() (cpu_has_x2apic) -static inline void x2apic_force_phys(void) -{ - x2apic_phys = 1; -} #else -static inline void disable_x2apic(void) -{ -} -static inline void check_x2apic(void) -{ -} -static inline void enable_x2apic(void) -{ -} -static inline int x2apic_enabled(void) -{ - return 0; -} -static inline void x2apic_force_phys(void) -{ -} +static inline void check_x2apic(void) { } +static inline void x2apic_setup(void) { } +static inline int x2apic_enabled(void) { return 0; } -#define x2apic_preenabled 0 -#define x2apic_supported() 0 +#define x2apic_mode (0) +#define x2apic_supported() (0) #endif extern void enable_IR_x2apic(void); @@ -219,7 +201,6 @@ extern int get_physical_broadcast(void); extern int lapic_get_maxlvt(void); extern void clear_local_APIC(void); -extern void connect_bsp_APIC(void); extern void disconnect_bsp_APIC(int virt_wire_setup); extern void disable_local_APIC(void); extern void lapic_shutdown(void); @@ -227,14 +208,23 @@ extern int verify_local_APIC(void); extern void sync_Arb_IDs(void); extern void init_bsp_APIC(void); extern void setup_local_APIC(void); -extern void end_local_APIC_setup(void); -extern void bsp_end_local_APIC_setup(void); extern void init_apic_mappings(void); void register_lapic_address(unsigned long address); extern void setup_boot_APIC_clock(void); extern void setup_secondary_APIC_clock(void); extern int APIC_init_uniprocessor(void); + +#ifdef CONFIG_X86_64 +static inline int apic_force_enable(unsigned long addr) +{ + return -1; +} +#else extern int apic_force_enable(unsigned long addr); +#endif + +extern int apic_bsp_setup(bool upmode); +extern void apic_ap_setup(void); /* * On 32bit this is mach-xxx local diff --git a/arch/x86/include/asm/calling.h b/arch/x86/include/asm/calling.h index 76659b67fd11..1f1297b46f83 100644 --- a/arch/x86/include/asm/calling.h +++ b/arch/x86/include/asm/calling.h @@ -83,7 +83,6 @@ For 32-bit we have the following conventions - kernel is built with #define SS 160 #define ARGOFFSET R11 -#define SWFRAME ORIG_RAX .macro SAVE_ARGS addskip=0, save_rcx=1, save_r891011=1, rax_enosys=0 subq $9*8+\addskip, %rsp diff --git a/arch/x86/include/asm/cpufeature.h b/arch/x86/include/asm/cpufeature.h index aede2c347bde..90a54851aedc 100644 --- a/arch/x86/include/asm/cpufeature.h +++ b/arch/x86/include/asm/cpufeature.h @@ -174,6 +174,7 @@ #define X86_FEATURE_TOPOEXT ( 6*32+22) /* topology extensions CPUID leafs */ #define X86_FEATURE_PERFCTR_CORE ( 6*32+23) /* core performance counter extensions */ #define X86_FEATURE_PERFCTR_NB ( 6*32+24) /* NB performance counter extensions */ +#define X86_FEATURE_BPEXT (6*32+26) /* data breakpoint extension */ #define X86_FEATURE_PERFCTR_L2 ( 6*32+28) /* L2 performance counter extensions */ /* @@ -388,6 +389,7 @@ extern const char * const x86_bug_flags[NBUGINTS*32]; #define cpu_has_cx16 boot_cpu_has(X86_FEATURE_CX16) #define cpu_has_eager_fpu boot_cpu_has(X86_FEATURE_EAGER_FPU) #define cpu_has_topoext boot_cpu_has(X86_FEATURE_TOPOEXT) +#define cpu_has_bpext boot_cpu_has(X86_FEATURE_BPEXT) #if __GNUC__ >= 4 extern void warn_pre_alternatives(void); diff --git a/arch/x86/include/asm/debugreg.h b/arch/x86/include/asm/debugreg.h index 61fd18b83b6c..12cb66f6d3a5 100644 --- a/arch/x86/include/asm/debugreg.h +++ b/arch/x86/include/asm/debugreg.h @@ -114,5 +114,10 @@ static inline void debug_stack_usage_inc(void) { } static inline void debug_stack_usage_dec(void) { } #endif /* X86_64 */ +#ifdef CONFIG_CPU_SUP_AMD +extern void set_dr_addr_mask(unsigned long mask, int dr); +#else +static inline void set_dr_addr_mask(unsigned long mask, int dr) { } +#endif #endif /* _ASM_X86_DEBUGREG_H */ diff --git a/arch/x86/include/asm/fpu-internal.h b/arch/x86/include/asm/fpu-internal.h index e97622f57722..0dbc08282291 100644 --- a/arch/x86/include/asm/fpu-internal.h +++ b/arch/x86/include/asm/fpu-internal.h @@ -207,7 +207,7 @@ static inline void fpu_fxsave(struct fpu *fpu) if (config_enabled(CONFIG_X86_32)) asm volatile( "fxsave %[fx]" : [fx] "=m" (fpu->state->fxsave)); else if (config_enabled(CONFIG_AS_FXSAVEQ)) - asm volatile("fxsaveq %0" : "=m" (fpu->state->fxsave)); + asm volatile("fxsaveq %[fx]" : [fx] "=m" (fpu->state->fxsave)); else { /* Using "rex64; fxsave %0" is broken because, if the memory * operand uses any extended registers for addressing, a second @@ -290,9 +290,11 @@ static inline int fpu_restore_checking(struct fpu *fpu) static inline int restore_fpu_checking(struct task_struct *tsk) { - /* AMD K7/K8 CPUs don't save/restore FDP/FIP/FOP unless an exception - is pending. Clear the x87 state here by setting it to fixed - values. "m" is a random variable that should be in L1 */ + /* + * AMD K7/K8 CPUs don't save/restore FDP/FIP/FOP unless an exception is + * pending. Clear the x87 state here by setting it to fixed values. + * "m" is a random variable that should be in L1. + */ if (unlikely(static_cpu_has_bug_safe(X86_BUG_FXSAVE_LEAK))) { asm volatile( "fnclex\n\t" diff --git a/arch/x86/include/asm/hw_breakpoint.h b/arch/x86/include/asm/hw_breakpoint.h index ef1c4d2d41ec..6c98be864a75 100644 --- a/arch/x86/include/asm/hw_breakpoint.h +++ b/arch/x86/include/asm/hw_breakpoint.h @@ -12,6 +12,7 @@ */ struct arch_hw_breakpoint { unsigned long address; + unsigned long mask; u8 len; u8 type; }; diff --git a/arch/x86/include/asm/i387.h b/arch/x86/include/asm/i387.h index ed8089d69094..6eb6fcb83f63 100644 --- a/arch/x86/include/asm/i387.h +++ b/arch/x86/include/asm/i387.h @@ -40,8 +40,8 @@ extern void __kernel_fpu_end(void); static inline void kernel_fpu_begin(void) { - WARN_ON_ONCE(!irq_fpu_usable()); preempt_disable(); + WARN_ON_ONCE(!irq_fpu_usable()); __kernel_fpu_begin(); } @@ -51,6 +51,10 @@ static inline void kernel_fpu_end(void) preempt_enable(); } +/* Must be called with preempt disabled */ +extern void kernel_fpu_disable(void); +extern void kernel_fpu_enable(void); + /* * Some instructions like VIA's padlock instructions generate a spurious * DNA fault but don't modify SSE registers. And these instructions diff --git a/arch/x86/include/asm/io_apic.h b/arch/x86/include/asm/io_apic.h index bf006cce9418..2f91685fe1cd 100644 --- a/arch/x86/include/asm/io_apic.h +++ b/arch/x86/include/asm/io_apic.h @@ -279,6 +279,11 @@ static inline void disable_ioapic_support(void) { } #define native_ioapic_set_affinity NULL #define native_setup_ioapic_entry NULL #define native_eoi_ioapic_pin NULL + +static inline void setup_IO_APIC(void) { } +static inline void enable_IO_APIC(void) { } +static inline void setup_ioapic_dest(void) { } + #endif #endif /* _ASM_X86_IO_APIC_H */ diff --git a/arch/x86/include/asm/irq_remapping.h b/arch/x86/include/asm/irq_remapping.h index b7747c4c2cf2..6224d316c405 100644 --- a/arch/x86/include/asm/irq_remapping.h +++ b/arch/x86/include/asm/irq_remapping.h @@ -33,8 +33,6 @@ struct irq_cfg; #ifdef CONFIG_IRQ_REMAP -extern void setup_irq_remapping_ops(void); -extern int irq_remapping_supported(void); extern void set_irq_remapping_broken(void); extern int irq_remapping_prepare(void); extern int irq_remapping_enable(void); @@ -60,8 +58,6 @@ void irq_remap_modify_chip_defaults(struct irq_chip *chip); #else /* CONFIG_IRQ_REMAP */ -static inline void setup_irq_remapping_ops(void) { } -static inline int irq_remapping_supported(void) { return 0; } static inline void set_irq_remapping_broken(void) { } static inline int irq_remapping_prepare(void) { return -ENODEV; } static inline int irq_remapping_enable(void) { return -ENODEV; } diff --git a/arch/x86/include/asm/mce.h b/arch/x86/include/asm/mce.h index 51b26e895933..9b3de99dc004 100644 --- a/arch/x86/include/asm/mce.h +++ b/arch/x86/include/asm/mce.h @@ -190,7 +190,6 @@ enum mcp_flags { void machine_check_poll(enum mcp_flags flags, mce_banks_t *b); int mce_notify_irq(void); -void mce_notify_process(void); DECLARE_PER_CPU(struct mce, injectm); diff --git a/arch/x86/include/asm/pmc_atom.h b/arch/x86/include/asm/pmc_atom.h index fc7a17c05d35..bc0fc0866553 100644 --- a/arch/x86/include/asm/pmc_atom.h +++ b/arch/x86/include/asm/pmc_atom.h @@ -53,6 +53,28 @@ /* Sleep state counter is in units of of 32us */ #define PMC_TMR_SHIFT 5 +/* Power status of power islands */ +#define PMC_PSS 0x98 + +#define PMC_PSS_BIT_GBE BIT(0) +#define PMC_PSS_BIT_SATA BIT(1) +#define PMC_PSS_BIT_HDA BIT(2) +#define PMC_PSS_BIT_SEC BIT(3) +#define PMC_PSS_BIT_PCIE BIT(4) +#define PMC_PSS_BIT_LPSS BIT(5) +#define PMC_PSS_BIT_LPE BIT(6) +#define PMC_PSS_BIT_DFX BIT(7) +#define PMC_PSS_BIT_USH_CTRL BIT(8) +#define PMC_PSS_BIT_USH_SUS BIT(9) +#define PMC_PSS_BIT_USH_VCCS BIT(10) +#define PMC_PSS_BIT_USH_VCCA BIT(11) +#define PMC_PSS_BIT_OTG_CTRL BIT(12) +#define PMC_PSS_BIT_OTG_VCCS BIT(13) +#define PMC_PSS_BIT_OTG_VCCA_CLK BIT(14) +#define PMC_PSS_BIT_OTG_VCCA BIT(15) +#define PMC_PSS_BIT_USB BIT(16) +#define PMC_PSS_BIT_USB_SUS BIT(17) + /* These registers reflect D3 status of functions */ #define PMC_D3_STS_0 0xA0 diff --git a/arch/x86/include/asm/smpboot_hooks.h b/arch/x86/include/asm/smpboot_hooks.h deleted file mode 100644 index 0da7409f0bec..000000000000 --- a/arch/x86/include/asm/smpboot_hooks.h +++ /dev/null @@ -1,68 +0,0 @@ -/* two abstractions specific to kernel/smpboot.c, mainly to cater to visws - * which needs to alter them. */ - -static inline void smpboot_clear_io_apic_irqs(void) -{ -#ifdef CONFIG_X86_IO_APIC - io_apic_irqs = 0; -#endif -} - -static inline void smpboot_setup_warm_reset_vector(unsigned long start_eip) -{ - unsigned long flags; - - spin_lock_irqsave(&rtc_lock, flags); - CMOS_WRITE(0xa, 0xf); - spin_unlock_irqrestore(&rtc_lock, flags); - local_flush_tlb(); - pr_debug("1.\n"); - *((volatile unsigned short *)phys_to_virt(TRAMPOLINE_PHYS_HIGH)) = - start_eip >> 4; - pr_debug("2.\n"); - *((volatile unsigned short *)phys_to_virt(TRAMPOLINE_PHYS_LOW)) = - start_eip & 0xf; - pr_debug("3.\n"); -} - -static inline void smpboot_restore_warm_reset_vector(void) -{ - unsigned long flags; - - /* - * Install writable page 0 entry to set BIOS data area. - */ - local_flush_tlb(); - - /* - * Paranoid: Set warm reset code and vector here back - * to default values. - */ - spin_lock_irqsave(&rtc_lock, flags); - CMOS_WRITE(0, 0xf); - spin_unlock_irqrestore(&rtc_lock, flags); - - *((volatile u32 *)phys_to_virt(TRAMPOLINE_PHYS_LOW)) = 0; -} - -static inline void __init smpboot_setup_io_apic(void) -{ -#ifdef CONFIG_X86_IO_APIC - /* - * Here we can be sure that there is an IO-APIC in the system. Let's - * go and set it up: - */ - if (!skip_ioapic_setup && nr_ioapics) - setup_IO_APIC(); - else { - nr_ioapics = 0; - } -#endif -} - -static inline void smpboot_clear_io_apic(void) -{ -#ifdef CONFIG_X86_IO_APIC - nr_ioapics = 0; -#endif -} diff --git a/arch/x86/include/asm/thread_info.h b/arch/x86/include/asm/thread_info.h index 547e344a6dc6..e82e95abc92b 100644 --- a/arch/x86/include/asm/thread_info.h +++ b/arch/x86/include/asm/thread_info.h @@ -75,7 +75,6 @@ struct thread_info { #define TIF_SYSCALL_EMU 6 /* syscall emulation active */ #define TIF_SYSCALL_AUDIT 7 /* syscall auditing active */ #define TIF_SECCOMP 8 /* secure computing */ -#define TIF_MCE_NOTIFY 10 /* notify userspace of an MCE */ #define TIF_USER_RETURN_NOTIFY 11 /* notify kernel of userspace return */ #define TIF_UPROBE 12 /* breakpointed or singlestepping */ #define TIF_NOTSC 16 /* TSC is not accessible in userland */ @@ -100,7 +99,6 @@ struct thread_info { #define _TIF_SYSCALL_EMU (1 << TIF_SYSCALL_EMU) #define _TIF_SYSCALL_AUDIT (1 << TIF_SYSCALL_AUDIT) #define _TIF_SECCOMP (1 << TIF_SECCOMP) -#define _TIF_MCE_NOTIFY (1 << TIF_MCE_NOTIFY) #define _TIF_USER_RETURN_NOTIFY (1 << TIF_USER_RETURN_NOTIFY) #define _TIF_UPROBE (1 << TIF_UPROBE) #define _TIF_NOTSC (1 << TIF_NOTSC) @@ -140,7 +138,7 @@ struct thread_info { /* Only used for 64 bit */ #define _TIF_DO_NOTIFY_MASK \ - (_TIF_SIGPENDING | _TIF_MCE_NOTIFY | _TIF_NOTIFY_RESUME | \ + (_TIF_SIGPENDING | _TIF_NOTIFY_RESUME | \ _TIF_USER_RETURN_NOTIFY | _TIF_UPROBE) /* flags to check in __switch_to() */ @@ -170,6 +168,17 @@ static inline struct thread_info *current_thread_info(void) return ti; } +static inline unsigned long current_stack_pointer(void) +{ + unsigned long sp; +#ifdef CONFIG_X86_64 + asm("mov %%rsp,%0" : "=g" (sp)); +#else + asm("mov %%esp,%0" : "=g" (sp)); +#endif + return sp; +} + #else /* !__ASSEMBLY__ */ /* how to get the thread information struct from ASM */ diff --git a/arch/x86/include/asm/traps.h b/arch/x86/include/asm/traps.h index 707adc6549d8..4e49d7dff78e 100644 --- a/arch/x86/include/asm/traps.h +++ b/arch/x86/include/asm/traps.h @@ -1,6 +1,7 @@ #ifndef _ASM_X86_TRAPS_H #define _ASM_X86_TRAPS_H +#include <linux/context_tracking_state.h> #include <linux/kprobes.h> #include <asm/debugreg.h> @@ -110,6 +111,11 @@ asmlinkage void smp_thermal_interrupt(void); asmlinkage void mce_threshold_interrupt(void); #endif +extern enum ctx_state ist_enter(struct pt_regs *regs); +extern void ist_exit(struct pt_regs *regs, enum ctx_state prev_state); +extern void ist_begin_non_atomic(struct pt_regs *regs); +extern void ist_end_non_atomic(void); + /* Interrupts/Exceptions */ enum { X86_TRAP_DE = 0, /* 0, Divide-by-zero */ diff --git a/arch/x86/include/uapi/asm/msr-index.h b/arch/x86/include/uapi/asm/msr-index.h index c8aa65d56027..d979e5abae55 100644 --- a/arch/x86/include/uapi/asm/msr-index.h +++ b/arch/x86/include/uapi/asm/msr-index.h @@ -251,6 +251,10 @@ /* Fam 16h MSRs */ #define MSR_F16H_L2I_PERF_CTL 0xc0010230 #define MSR_F16H_L2I_PERF_CTR 0xc0010231 +#define MSR_F16H_DR1_ADDR_MASK 0xc0011019 +#define MSR_F16H_DR2_ADDR_MASK 0xc001101a +#define MSR_F16H_DR3_ADDR_MASK 0xc001101b +#define MSR_F16H_DR0_ADDR_MASK 0xc0011027 /* Fam 15h MSRs */ #define MSR_F15H_PERF_CTL 0xc0010200 diff --git a/arch/x86/kernel/acpi/boot.c b/arch/x86/kernel/acpi/boot.c index b9e30daa0881..a18fff361c7f 100644 --- a/arch/x86/kernel/acpi/boot.c +++ b/arch/x86/kernel/acpi/boot.c @@ -653,6 +653,7 @@ static int acpi_register_gsi_pic(struct device *dev, u32 gsi, return gsi; } +#ifdef CONFIG_X86_LOCAL_APIC static int acpi_register_gsi_ioapic(struct device *dev, u32 gsi, int trigger, int polarity) { @@ -675,6 +676,7 @@ static void acpi_unregister_gsi_ioapic(u32 gsi) mutex_unlock(&acpi_ioapic_lock); #endif } +#endif int (*__acpi_register_gsi)(struct device *dev, u32 gsi, int trigger, int polarity) = acpi_register_gsi_pic; diff --git a/arch/x86/kernel/apb_timer.c b/arch/x86/kernel/apb_timer.c index b708738d016e..6a7c23ff21d3 100644 --- a/arch/x86/kernel/apb_timer.c +++ b/arch/x86/kernel/apb_timer.c @@ -135,14 +135,6 @@ static inline void apbt_clear_mapping(void) apbt_virt_address = NULL; } -/* - * APBT timer interrupt enable / disable - */ -static inline int is_apbt_capable(void) -{ - return apbt_virt_address ? 1 : 0; -} - static int __init apbt_clockevent_register(void) { struct sfi_timer_table_entry *mtmr; diff --git a/arch/x86/kernel/apic/apic.c b/arch/x86/kernel/apic/apic.c index 29b5b18afa27..b665d241efad 100644 --- a/arch/x86/kernel/apic/apic.c +++ b/arch/x86/kernel/apic/apic.c @@ -134,9 +134,6 @@ static inline void imcr_apic_to_pic(void) */ static int force_enable_local_apic __initdata; -/* Control whether x2APIC mode is enabled or not */ -static bool nox2apic __initdata; - /* * APIC command line parameters */ @@ -161,33 +158,6 @@ static __init int setup_apicpmtimer(char *s) __setup("apicpmtimer", setup_apicpmtimer); #endif -int x2apic_mode; -#ifdef CONFIG_X86_X2APIC -/* x2apic enabled before OS handover */ -int x2apic_preenabled; -static int x2apic_disabled; -static int __init setup_nox2apic(char *str) -{ - if (x2apic_enabled()) { - int apicid = native_apic_msr_read(APIC_ID); - - if (apicid >= 255) { - pr_warning("Apicid: %08x, cannot enforce nox2apic\n", - apicid); - return 0; - } - - pr_warning("x2apic already enabled. will disable it\n"); - } else - setup_clear_cpu_cap(X86_FEATURE_X2APIC); - - nox2apic = true; - - return 0; -} -early_param("nox2apic", setup_nox2apic); -#endif - unsigned long mp_lapic_addr; int disable_apic; /* Disable local APIC timer from the kernel commandline or via dmi quirk */ @@ -1475,7 +1445,7 @@ void setup_local_APIC(void) #endif } -void end_local_APIC_setup(void) +static void end_local_APIC_setup(void) { lapic_setup_esr(); @@ -1492,116 +1462,184 @@ void end_local_APIC_setup(void) apic_pm_activate(); } -void __init bsp_end_local_APIC_setup(void) +/* + * APIC setup function for application processors. Called from smpboot.c + */ +void apic_ap_setup(void) { + setup_local_APIC(); end_local_APIC_setup(); - - /* - * Now that local APIC setup is completed for BP, configure the fault - * handling for interrupt remapping. - */ - irq_remap_enable_fault_handling(); - } #ifdef CONFIG_X86_X2APIC -/* - * Need to disable xapic and x2apic at the same time and then enable xapic mode - */ -static inline void __disable_x2apic(u64 msr) -{ - wrmsrl(MSR_IA32_APICBASE, - msr & ~(X2APIC_ENABLE | XAPIC_ENABLE)); - wrmsrl(MSR_IA32_APICBASE, msr & ~X2APIC_ENABLE); -} +int x2apic_mode; -static __init void disable_x2apic(void) +enum { + X2APIC_OFF, + X2APIC_ON, + X2APIC_DISABLED, +}; +static int x2apic_state; + +static inline void __x2apic_disable(void) { u64 msr; - if (!cpu_has_x2apic) + if (cpu_has_apic) return; rdmsrl(MSR_IA32_APICBASE, msr); - if (msr & X2APIC_ENABLE) { - u32 x2apic_id = read_apic_id(); - - if (x2apic_id >= 255) - panic("Cannot disable x2apic, id: %08x\n", x2apic_id); + if (!(msr & X2APIC_ENABLE)) + return; + /* Disable xapic and x2apic first and then reenable xapic mode */ + wrmsrl(MSR_IA32_APICBASE, msr & ~(X2APIC_ENABLE | XAPIC_ENABLE)); + wrmsrl(MSR_IA32_APICBASE, msr & ~X2APIC_ENABLE); + printk_once(KERN_INFO "x2apic disabled\n"); +} - pr_info("Disabling x2apic\n"); - __disable_x2apic(msr); +static inline void __x2apic_enable(void) +{ + u64 msr; - if (nox2apic) { - clear_cpu_cap(&cpu_data(0), X86_FEATURE_X2APIC); - setup_clear_cpu_cap(X86_FEATURE_X2APIC); - } + rdmsrl(MSR_IA32_APICBASE, msr); + if (msr & X2APIC_ENABLE) + return; + wrmsrl(MSR_IA32_APICBASE, msr | X2APIC_ENABLE); + printk_once(KERN_INFO "x2apic enabled\n"); +} - x2apic_disabled = 1; - x2apic_mode = 0; +static int __init setup_nox2apic(char *str) +{ + if (x2apic_enabled()) { + int apicid = native_apic_msr_read(APIC_ID); - register_lapic_address(mp_lapic_addr); + if (apicid >= 255) { + pr_warning("Apicid: %08x, cannot enforce nox2apic\n", + apicid); + return 0; + } + pr_warning("x2apic already enabled.\n"); + __x2apic_disable(); } + setup_clear_cpu_cap(X86_FEATURE_X2APIC); + x2apic_state = X2APIC_DISABLED; + x2apic_mode = 0; + return 0; } +early_param("nox2apic", setup_nox2apic); -void check_x2apic(void) +/* Called from cpu_init() to enable x2apic on (secondary) cpus */ +void x2apic_setup(void) { - if (x2apic_enabled()) { - pr_info("x2apic enabled by BIOS, switching to x2apic ops\n"); - x2apic_preenabled = x2apic_mode = 1; + /* + * If x2apic is not in ON state, disable it if already enabled + * from BIOS. + */ + if (x2apic_state != X2APIC_ON) { + __x2apic_disable(); + return; } + __x2apic_enable(); } -void enable_x2apic(void) +static __init void x2apic_disable(void) { - u64 msr; + u32 x2apic_id; - rdmsrl(MSR_IA32_APICBASE, msr); - if (x2apic_disabled) { - __disable_x2apic(msr); + if (x2apic_state != X2APIC_ON) + goto out; + + x2apic_id = read_apic_id(); + if (x2apic_id >= 255) + panic("Cannot disable x2apic, id: %08x\n", x2apic_id); + + __x2apic_disable(); + register_lapic_address(mp_lapic_addr); +out: + x2apic_state = X2APIC_DISABLED; + x2apic_mode = 0; +} + +static __init void x2apic_enable(void) +{ + if (x2apic_state != X2APIC_OFF) return; - } - if (!x2apic_mode) + x2apic_mode = 1; + x2apic_state = X2APIC_ON; + __x2apic_enable(); +} + +static __init void try_to_enable_x2apic(int remap_mode) +{ + if (x2apic_state == X2APIC_DISABLED) return; - if (!(msr & X2APIC_ENABLE)) { - printk_once(KERN_INFO "Enabling x2apic\n"); - wrmsrl(MSR_IA32_APICBASE, msr | X2APIC_ENABLE); + if (remap_mode != IRQ_REMAP_X2APIC_MODE) { + /* IR is required if there is APIC ID > 255 even when running + * under KVM + */ + if (max_physical_apicid > 255 || + (IS_ENABLED(CONFIG_HYPERVISOR_GUEST) && + !hypervisor_x2apic_available())) { + pr_info("x2apic: IRQ remapping doesn't support X2APIC mode\n"); + x2apic_disable(); + return; + } + + /* + * without IR all CPUs can be addressed by IOAPIC/MSI + * only in physical mode + */ + x2apic_phys = 1; } + x2apic_enable(); } -#endif /* CONFIG_X86_X2APIC */ -int __init enable_IR(void) +void __init check_x2apic(void) { -#ifdef CONFIG_IRQ_REMAP - if (!irq_remapping_supported()) { - pr_debug("intr-remapping not supported\n"); - return -1; + if (x2apic_enabled()) { + pr_info("x2apic: enabled by BIOS, switching to x2apic ops\n"); + x2apic_mode = 1; + x2apic_state = X2APIC_ON; + } else if (!cpu_has_x2apic) { + x2apic_state = X2APIC_DISABLED; } +} +#else /* CONFIG_X86_X2APIC */ +static int __init validate_x2apic(void) +{ + if (!apic_is_x2apic_enabled()) + return 0; + /* + * Checkme: Can we simply turn off x2apic here instead of panic? + */ + panic("BIOS has enabled x2apic but kernel doesn't support x2apic, please disable x2apic in BIOS.\n"); +} +early_initcall(validate_x2apic); - if (!x2apic_preenabled && skip_ioapic_setup) { - pr_info("Skipped enabling intr-remap because of skipping " - "io-apic setup\n"); +static inline void try_to_enable_x2apic(int remap_mode) { } +static inline void __x2apic_enable(void) { } +#endif /* !CONFIG_X86_X2APIC */ + +static int __init try_to_enable_IR(void) +{ +#ifdef CONFIG_X86_IO_APIC + if (!x2apic_enabled() && skip_ioapic_setup) { + pr_info("Not enabling interrupt remapping due to skipped IO-APIC setup\n"); return -1; } - - return irq_remapping_enable(); #endif - return -1; + return irq_remapping_enable(); } void __init enable_IR_x2apic(void) { unsigned long flags; - int ret, x2apic_enabled = 0; - int hardware_init_ret; - - /* Make sure irq_remap_ops are initialized */ - setup_irq_remapping_ops(); + int ret, ir_stat; - hardware_init_ret = irq_remapping_prepare(); - if (hardware_init_ret && !x2apic_supported()) + ir_stat = irq_remapping_prepare(); + if (ir_stat < 0 && !x2apic_supported()) return; ret = save_ioapic_entries(); @@ -1614,49 +1652,13 @@ void __init enable_IR_x2apic(void) legacy_pic->mask_all(); mask_ioapic_entries(); - if (x2apic_preenabled && nox2apic) - disable_x2apic(); - - if (hardware_init_ret) - ret = -1; - else - ret = enable_IR(); - - if (!x2apic_supported()) - goto skip_x2apic; + /* If irq_remapping_prepare() succeded, try to enable it */ + if (ir_stat >= 0) + ir_stat = try_to_enable_IR(); + /* ir_stat contains the remap mode or an error code */ + try_to_enable_x2apic(ir_stat); - if (ret < 0) { - /* IR is required if there is APIC ID > 255 even when running - * under KVM - */ - if (max_physical_apicid > 255 || - !hypervisor_x2apic_available()) { - if (x2apic_preenabled) - disable_x2apic(); - goto skip_x2apic; - } - /* - * without IR all CPUs can be addressed by IOAPIC/MSI - * only in physical mode - */ - x2apic_force_phys(); - } - - if (ret == IRQ_REMAP_XAPIC_MODE) { - pr_info("x2apic not enabled, IRQ remapping is in xapic mode\n"); - goto skip_x2apic; - } - - x2apic_enabled = 1; - - if (x2apic_supported() && !x2apic_mode) { - x2apic_mode = 1; - enable_x2apic(); - pr_info("Enabled x2apic\n"); - } - -skip_x2apic: - if (ret < 0) /* IR enabling failed */ + if (ir_stat < 0) restore_ioapic_entries(); legacy_pic->restore_mask(); local_irq_restore(flags); @@ -1847,82 +1849,8 @@ void __init register_lapic_address(unsigned long address) } } -/* - * This initializes the IO-APIC and APIC hardware if this is - * a UP kernel. - */ int apic_version[MAX_LOCAL_APIC]; -int __init APIC_init_uniprocessor(void) -{ - if (disable_apic) { - pr_info("Apic disabled\n"); - return -1; - } -#ifdef CONFIG_X86_64 - if (!cpu_has_apic) { - disable_apic = 1; - pr_info("Apic disabled by BIOS\n"); - return -1; - } -#else - if (!smp_found_config && !cpu_has_apic) - return -1; - - /* - * Complain if the BIOS pretends there is one. - */ - if (!cpu_has_apic && - APIC_INTEGRATED(apic_version[boot_cpu_physical_apicid])) { - pr_err("BIOS bug, local APIC 0x%x not detected!...\n", - boot_cpu_physical_apicid); - return -1; - } -#endif - - default_setup_apic_routing(); - - verify_local_APIC(); - connect_bsp_APIC(); - -#ifdef CONFIG_X86_64 - apic_write(APIC_ID, SET_APIC_ID(boot_cpu_physical_apicid)); -#else - /* - * Hack: In case of kdump, after a crash, kernel might be booting - * on a cpu with non-zero lapic id. But boot_cpu_physical_apicid - * might be zero if read from MP tables. Get it from LAPIC. - */ -# ifdef CONFIG_CRASH_DUMP - boot_cpu_physical_apicid = read_apic_id(); -# endif -#endif - physid_set_mask_of_physid(boot_cpu_physical_apicid, &phys_cpu_present_map); - setup_local_APIC(); - -#ifdef CONFIG_X86_IO_APIC - /* - * Now enable IO-APICs, actually call clear_IO_APIC - * We need clear_IO_APIC before enabling error vector - */ - if (!skip_ioapic_setup && nr_ioapics) - enable_IO_APIC(); -#endif - - bsp_end_local_APIC_setup(); - -#ifdef CONFIG_X86_IO_APIC - if (smp_found_config && !skip_ioapic_setup && nr_ioapics) - setup_IO_APIC(); - else { - nr_ioapics = 0; - } -#endif - - x86_init.timers.setup_percpu_clockev(); - return 0; -} - /* * Local APIC interrupts */ @@ -2027,7 +1955,7 @@ __visible void smp_trace_error_interrupt(struct pt_regs *regs) /** * connect_bsp_APIC - attach the APIC to the interrupt system */ -void __init connect_bsp_APIC(void) +static void __init connect_bsp_APIC(void) { #ifdef CONFIG_X86_32 if (pic_mode) { @@ -2274,6 +2202,100 @@ void __init apic_set_eoi_write(void (*eoi_write)(u32 reg, u32 v)) } } +static void __init apic_bsp_up_setup(void) +{ +#ifdef CONFIG_X86_64 + apic_write(APIC_ID, SET_APIC_ID(boot_cpu_physical_apicid)); +#else + /* + * Hack: In case of kdump, after a crash, kernel might be booting + * on a cpu with non-zero lapic id. But boot_cpu_physical_apicid + * might be zero if read from MP tables. Get it from LAPIC. + */ +# ifdef CONFIG_CRASH_DUMP + boot_cpu_physical_apicid = read_apic_id(); +# endif +#endif + physid_set_mask_of_physid(boot_cpu_physical_apicid, &phys_cpu_present_map); +} + +/** + * apic_bsp_setup - Setup function for local apic and io-apic + * @upmode: Force UP mode (for APIC_init_uniprocessor) + * + * Returns: + * apic_id of BSP APIC + */ +int __init apic_bsp_setup(bool upmode) +{ + int id; + + connect_bsp_APIC(); + if (upmode) + apic_bsp_up_setup(); + setup_local_APIC(); + + if (x2apic_mode) + id = apic_read(APIC_LDR); + else + id = GET_APIC_LOGICAL_ID(apic_read(APIC_LDR)); + + enable_IO_APIC(); + end_local_APIC_setup(); + irq_remap_enable_fault_handling(); + setup_IO_APIC(); + /* Setup local timer */ + x86_init.timers.setup_percpu_clockev(); + return id; +} + +/* + * This initializes the IO-APIC and APIC hardware if this is + * a UP kernel. + */ +int __init APIC_init_uniprocessor(void) +{ + if (disable_apic) { + pr_info("Apic disabled\n"); + return -1; + } +#ifdef CONFIG_X86_64 + if (!cpu_has_apic) { + disable_apic = 1; + pr_info("Apic disabled by BIOS\n"); + return -1; + } +#else + if (!smp_found_config && !cpu_has_apic) + return -1; + + /* + * Complain if the BIOS pretends there is one. + */ + if (!cpu_has_apic && + APIC_INTEGRATED(apic_version[boot_cpu_physical_apicid])) { + pr_err("BIOS bug, local APIC 0x%x not detected!...\n", + boot_cpu_physical_apicid); + return -1; + } +#endif + + if (!smp_found_config) + disable_ioapic_support(); + + default_setup_apic_routing(); + verify_local_APIC(); + apic_bsp_setup(true); + return 0; +} + +#ifdef CONFIG_UP_LATE_INIT +void __init up_late_init(void) +{ + APIC_init_uniprocessor(); +} +#endif + /* * Power management */ @@ -2359,9 +2381,9 @@ static void lapic_resume(void) mask_ioapic_entries(); legacy_pic->mask_all(); - if (x2apic_mode) - enable_x2apic(); - else { + if (x2apic_mode) { + __x2apic_enable(); + } else { /* * Make sure the APICBASE points to the right address * diff --git a/arch/x86/kernel/apic/io_apic.c b/arch/x86/kernel/apic/io_apic.c index 3f5f60406ab1..f4dc2462a1ac 100644 --- a/arch/x86/kernel/apic/io_apic.c +++ b/arch/x86/kernel/apic/io_apic.c @@ -1507,7 +1507,10 @@ void __init enable_IO_APIC(void) int i8259_apic, i8259_pin; int apic, pin; - if (!nr_legacy_irqs()) + if (skip_ioapic_setup) + nr_ioapics = 0; + + if (!nr_legacy_irqs() || !nr_ioapics) return; for_each_ioapic_pin(apic, pin) { @@ -2295,7 +2298,7 @@ static inline void __init check_timer(void) } local_irq_disable(); apic_printk(APIC_QUIET, KERN_INFO "..... failed :(.\n"); - if (x2apic_preenabled) + if (apic_is_x2apic_enabled()) apic_printk(APIC_QUIET, KERN_INFO "Perhaps problem with the pre-enabled x2apic mode\n" "Try booting with x2apic and interrupt-remapping disabled in the bios.\n"); @@ -2373,9 +2376,9 @@ void __init setup_IO_APIC(void) { int ioapic; - /* - * calling enable_IO_APIC() is moved to setup_local_APIC for BP - */ + if (skip_ioapic_setup || !nr_ioapics) + return; + io_apic_irqs = nr_legacy_irqs() ? ~PIC_IRQS : ~0UL; apic_printk(APIC_VERBOSE, "ENABLING IO-APIC IRQs\n"); diff --git a/arch/x86/kernel/cpu/amd.c b/arch/x86/kernel/cpu/amd.c index 15c5df92f74e..a220239cea65 100644 --- a/arch/x86/kernel/cpu/amd.c +++ b/arch/x86/kernel/cpu/amd.c @@ -869,3 +869,22 @@ static bool cpu_has_amd_erratum(struct cpuinfo_x86 *cpu, const int *erratum) return false; } + +void set_dr_addr_mask(unsigned long mask, int dr) +{ + if (!cpu_has_bpext) + return; + + switch (dr) { + case 0: + wrmsr(MSR_F16H_DR0_ADDR_MASK, mask, 0); + break; + case 1: + case 2: + case 3: + wrmsr(MSR_F16H_DR1_ADDR_MASK - 1 + dr, mask, 0); + break; + default: + break; + } +} diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c index c6049650c093..b15bffcaba6d 100644 --- a/arch/x86/kernel/cpu/common.c +++ b/arch/x86/kernel/cpu/common.c @@ -491,17 +491,18 @@ u16 __read_mostly tlb_lld_2m[NR_INFO]; u16 __read_mostly tlb_lld_4m[NR_INFO]; u16 __read_mostly tlb_lld_1g[NR_INFO]; -void cpu_detect_tlb(struct cpuinfo_x86 *c) +static void cpu_detect_tlb(struct cpuinfo_x86 *c) { if (this_cpu->c_detect_tlb) this_cpu->c_detect_tlb(c); - printk(KERN_INFO "Last level iTLB entries: 4KB %d, 2MB %d, 4MB %d\n" - "Last level dTLB entries: 4KB %d, 2MB %d, 4MB %d, 1GB %d\n", + pr_info("Last level iTLB entries: 4KB %d, 2MB %d, 4MB %d\n", tlb_lli_4k[ENTRIES], tlb_lli_2m[ENTRIES], - tlb_lli_4m[ENTRIES], tlb_lld_4k[ENTRIES], - tlb_lld_2m[ENTRIES], tlb_lld_4m[ENTRIES], - tlb_lld_1g[ENTRIES]); + tlb_lli_4m[ENTRIES]); + + pr_info("Last level dTLB entries: 4KB %d, 2MB %d, 4MB %d, 1GB %d\n", + tlb_lld_4k[ENTRIES], tlb_lld_2m[ENTRIES], + tlb_lld_4m[ENTRIES], tlb_lld_1g[ENTRIES]); } void detect_ht(struct cpuinfo_x86 *c) @@ -1332,7 +1333,7 @@ void cpu_init(void) barrier(); x86_configure_nx(); - enable_x2apic(); + x2apic_setup(); /* * set up and load the per-CPU TSS diff --git a/arch/x86/kernel/cpu/intel.c b/arch/x86/kernel/cpu/intel.c index 9cc6b6f25f42..94d7dcb12145 100644 --- a/arch/x86/kernel/cpu/intel.c +++ b/arch/x86/kernel/cpu/intel.c @@ -487,10 +487,8 @@ static void init_intel(struct cpuinfo_x86 *c) rdmsrl(MSR_IA32_ENERGY_PERF_BIAS, epb); if ((epb & 0xF) == ENERGY_PERF_BIAS_PERFORMANCE) { - printk_once(KERN_WARNING "ENERGY_PERF_BIAS:" - " Set to 'normal', was 'performance'\n" - "ENERGY_PERF_BIAS: View and update with" - " x86_energy_perf_policy(8)\n"); + pr_warn_once("ENERGY_PERF_BIAS: Set to 'normal', was 'performance'\n"); + pr_warn_once("ENERGY_PERF_BIAS: View and update with x86_energy_perf_policy(8)\n"); epb = (epb & ~0xF) | ENERGY_PERF_BIAS_NORMAL; wrmsrl(MSR_IA32_ENERGY_PERF_BIAS, epb); } diff --git a/arch/x86/kernel/cpu/mcheck/mce.c b/arch/x86/kernel/cpu/mcheck/mce.c index d2c611699cd9..cdfed7953963 100644 --- a/arch/x86/kernel/cpu/mcheck/mce.c +++ b/arch/x86/kernel/cpu/mcheck/mce.c @@ -43,6 +43,7 @@ #include <linux/export.h> #include <asm/processor.h> +#include <asm/traps.h> #include <asm/mce.h> #include <asm/msr.h> @@ -115,7 +116,7 @@ static void (*quirk_no_way_out)(int bank, struct mce *m, struct pt_regs *regs); * CPU/chipset specific EDAC code can register a notifier call here to print * MCE errors in a human-readable form. */ -ATOMIC_NOTIFIER_HEAD(x86_mce_decoder_chain); +static ATOMIC_NOTIFIER_HEAD(x86_mce_decoder_chain); /* Do initial initialization of a struct mce */ void mce_setup(struct mce *m) @@ -311,7 +312,7 @@ static void wait_for_panic(void) panic("Panicing machine check CPU died"); } -static void mce_panic(char *msg, struct mce *final, char *exp) +static void mce_panic(const char *msg, struct mce *final, char *exp) { int i, apei_err = 0; @@ -529,7 +530,7 @@ static void mce_schedule_work(void) schedule_work(this_cpu_ptr(&mce_work)); } -DEFINE_PER_CPU(struct irq_work, mce_irq_work); +static DEFINE_PER_CPU(struct irq_work, mce_irq_work); static void mce_irq_work_cb(struct irq_work *entry) { @@ -735,7 +736,7 @@ static atomic_t mce_callin; /* * Check if a timeout waiting for other CPUs happened. */ -static int mce_timed_out(u64 *t) +static int mce_timed_out(u64 *t, const char *msg) { /* * The others already did panic for some reason. @@ -750,8 +751,7 @@ static int mce_timed_out(u64 *t) goto out; if ((s64)*t < SPINUNIT) { if (mca_cfg.tolerant <= 1) - mce_panic("Timeout synchronizing machine check over CPUs", - NULL, NULL); + mce_panic(msg, NULL, NULL); cpu_missing = 1; return 1; } @@ -867,7 +867,8 @@ static int mce_start(int *no_way_out) * Wait for everyone. */ while (atomic_read(&mce_callin) != cpus) { - if (mce_timed_out(&timeout)) { + if (mce_timed_out(&timeout, + "Timeout: Not all CPUs entered broadcast exception handler")) { atomic_set(&global_nwo, 0); return -1; } @@ -892,7 +893,8 @@ static int mce_start(int *no_way_out) * only seen by one CPU before cleared, avoiding duplicates. */ while (atomic_read(&mce_executing) < order) { - if (mce_timed_out(&timeout)) { + if (mce_timed_out(&timeout, + "Timeout: Subject CPUs unable to finish machine check processing")) { atomic_set(&global_nwo, 0); return -1; } @@ -936,7 +938,8 @@ static int mce_end(int order) * loops. */ while (atomic_read(&mce_executing) <= cpus) { - if (mce_timed_out(&timeout)) + if (mce_timed_out(&timeout, + "Timeout: Monarch CPU unable to finish machine check processing")) goto reset; ndelay(SPINUNIT); } @@ -949,7 +952,8 @@ static int mce_end(int order) * Subject: Wait for Monarch to finish. */ while (atomic_read(&mce_executing) != 0) { - if (mce_timed_out(&timeout)) + if (mce_timed_out(&timeout, + "Timeout: Monarch CPU did not finish machine check processing")) goto reset; ndelay(SPINUNIT); } @@ -1003,51 +1007,6 @@ static void mce_clear_state(unsigned long *toclear) } /* - * Need to save faulting physical address associated with a process - * in the machine check handler some place where we can grab it back - * later in mce_notify_process() - */ -#define MCE_INFO_MAX 16 - -struct mce_info { - atomic_t inuse; - struct task_struct *t; - __u64 paddr; - int restartable; -} mce_info[MCE_INFO_MAX]; - -static void mce_save_info(__u64 addr, int c) -{ - struct mce_info *mi; - - for (mi = mce_info; mi < &mce_info[MCE_INFO_MAX]; mi++) { - if (atomic_cmpxchg(&mi->inuse, 0, 1) == 0) { - mi->t = current; - mi->paddr = addr; - mi->restartable = c; - return; - } - } - - mce_panic("Too many concurrent recoverable errors", NULL, NULL); -} - -static struct mce_info *mce_find_info(void) -{ - struct mce_info *mi; - - for (mi = mce_info; mi < &mce_info[MCE_INFO_MAX]; mi++) - if (atomic_read(&mi->inuse) && mi->t == current) - return mi; - return NULL; -} - -static void mce_clear_info(struct mce_info *mi) -{ - atomic_set(&mi->inuse, 0); -} - -/* * The actual machine check handler. This only handles real * exceptions when something got corrupted coming in through int 18. * @@ -1063,6 +1022,7 @@ void do_machine_check(struct pt_regs *regs, long error_code) { struct mca_config *cfg = &mca_cfg; struct mce m, *final; + enum ctx_state prev_state; int i; int worst = 0; int severity; @@ -1084,6 +1044,10 @@ void do_machine_check(struct pt_regs *regs, long error_code) DECLARE_BITMAP(toclear, MAX_NR_BANKS); DECLARE_BITMAP(valid_banks, MAX_NR_BANKS); char *msg = "Unknown"; + u64 recover_paddr = ~0ull; + int flags = MF_ACTION_REQUIRED; + + prev_state = ist_enter(regs); this_cpu_inc(mce_exception_count); @@ -1203,9 +1167,9 @@ void do_machine_check(struct pt_regs *regs, long error_code) if (no_way_out) mce_panic("Fatal machine check on current CPU", &m, msg); if (worst == MCE_AR_SEVERITY) { - /* schedule action before return to userland */ - mce_save_info(m.addr, m.mcgstatus & MCG_STATUS_RIPV); - set_thread_flag(TIF_MCE_NOTIFY); + recover_paddr = m.addr; + if (!(m.mcgstatus & MCG_STATUS_RIPV)) + flags |= MF_MUST_KILL; } else if (kill_it) { force_sig(SIGBUS, current); } @@ -1216,6 +1180,27 @@ void do_machine_check(struct pt_regs *regs, long error_code) mce_wrmsrl(MSR_IA32_MCG_STATUS, 0); out: sync_core(); + + if (recover_paddr == ~0ull) + goto done; + + pr_err("Uncorrected hardware memory error in user-access at %llx", + recover_paddr); + /* + * We must call memory_failure() here even if the current process is + * doomed. We still need to mark the page as poisoned and alert any + * other users of the page. + */ + ist_begin_non_atomic(regs); + local_irq_enable(); + if (memory_failure(recover_paddr >> PAGE_SHIFT, MCE_VECTOR, flags) < 0) { + pr_err("Memory error not recovered"); + force_sig(SIGBUS, current); + } + local_irq_disable(); + ist_end_non_atomic(); +done: + ist_exit(regs, prev_state); } EXPORT_SYMBOL_GPL(do_machine_check); @@ -1233,42 +1218,6 @@ int memory_failure(unsigned long pfn, int vector, int flags) #endif /* - * Called in process context that interrupted by MCE and marked with - * TIF_MCE_NOTIFY, just before returning to erroneous userland. - * This code is allowed to sleep. - * Attempt possible recovery such as calling the high level VM handler to - * process any corrupted pages, and kill/signal current process if required. - * Action required errors are handled here. - */ -void mce_notify_process(void) -{ - unsigned long pfn; - struct mce_info *mi = mce_find_info(); - int flags = MF_ACTION_REQUIRED; - - if (!mi) - mce_panic("Lost physical address for unconsumed uncorrectable error", NULL, NULL); - pfn = mi->paddr >> PAGE_SHIFT; - - clear_thread_flag(TIF_MCE_NOTIFY); - - pr_err("Uncorrected hardware memory error in user-access at %llx", - mi->paddr); - /* - * We must call memory_failure() here even if the current process is - * doomed. We still need to mark the page as poisoned and alert any - * other users of the page. - */ - if (!mi->restartable) - flags |= MF_MUST_KILL; - if (memory_failure(pfn, MCE_VECTOR, flags) < 0) { - pr_err("Memory error not recovered"); - force_sig(SIGBUS, current); - } - mce_clear_info(mi); -} - -/* * Action optional processing happens here (picking up * from the list of faulting pages that do_machine_check() * placed into the "ring"). diff --git a/arch/x86/kernel/cpu/mcheck/p5.c b/arch/x86/kernel/cpu/mcheck/p5.c index a3042989398c..ec2663a708e4 100644 --- a/arch/x86/kernel/cpu/mcheck/p5.c +++ b/arch/x86/kernel/cpu/mcheck/p5.c @@ -8,6 +8,7 @@ #include <linux/smp.h> #include <asm/processor.h> +#include <asm/traps.h> #include <asm/mce.h> #include <asm/msr.h> @@ -17,8 +18,11 @@ int mce_p5_enabled __read_mostly; /* Machine check handler for Pentium class Intel CPUs: */ static void pentium_machine_check(struct pt_regs *regs, long error_code) { + enum ctx_state prev_state; u32 loaddr, hi, lotype; + prev_state = ist_enter(regs); + rdmsr(MSR_IA32_P5_MC_ADDR, loaddr, hi); rdmsr(MSR_IA32_P5_MC_TYPE, lotype, hi); @@ -33,6 +37,8 @@ static void pentium_machine_check(struct pt_regs *regs, long error_code) } add_taint(TAINT_MACHINE_CHECK, LOCKDEP_NOW_UNRELIABLE); + + ist_exit(regs, prev_state); } /* Set up machine check reporting for processors with Intel style MCE: */ diff --git a/arch/x86/kernel/cpu/mcheck/winchip.c b/arch/x86/kernel/cpu/mcheck/winchip.c index 7dc5564d0cdf..bd5d46a32210 100644 --- a/arch/x86/kernel/cpu/mcheck/winchip.c +++ b/arch/x86/kernel/cpu/mcheck/winchip.c @@ -7,14 +7,19 @@ #include <linux/types.h> #include <asm/processor.h> +#include <asm/traps.h> #include <asm/mce.h> #include <asm/msr.h> /* Machine check handler for WinChip C6: */ static void winchip_machine_check(struct pt_regs *regs, long error_code) { + enum ctx_state prev_state = ist_enter(regs); + printk(KERN_EMERG "CPU0: Machine Check Exception.\n"); add_taint(TAINT_MACHINE_CHECK, LOCKDEP_NOW_UNRELIABLE); + + ist_exit(regs, prev_state); } /* Set up machine check reporting on the Winchip C6 series */ diff --git a/arch/x86/kernel/e820.c b/arch/x86/kernel/e820.c index dd2f07ae9d0c..46201deee923 100644 --- a/arch/x86/kernel/e820.c +++ b/arch/x86/kernel/e820.c @@ -184,9 +184,9 @@ void __init e820_print_map(char *who) * overwritten in the same location, starting at biosmap. * * The integer pointed to by pnr_map must be valid on entry (the - * current number of valid entries located at biosmap) and will - * be updated on return, with the new number of valid entries - * (something no more than max_nr_map.) + * current number of valid entries located at biosmap). If the + * sanitizing succeeds the *pnr_map will be updated with the new + * number of valid entries (something no more than max_nr_map). * * The return value from sanitize_e820_map() is zero if it * successfully 'sanitized' the map entries passed in, and is -1 @@ -561,23 +561,15 @@ u64 __init e820_remove_range(u64 start, u64 size, unsigned old_type, void __init update_e820(void) { - u32 nr_map; - - nr_map = e820.nr_map; - if (sanitize_e820_map(e820.map, ARRAY_SIZE(e820.map), &nr_map)) + if (sanitize_e820_map(e820.map, ARRAY_SIZE(e820.map), &e820.nr_map)) return; - e820.nr_map = nr_map; printk(KERN_INFO "e820: modified physical RAM map:\n"); e820_print_map("modified"); } static void __init update_e820_saved(void) { - u32 nr_map; - - nr_map = e820_saved.nr_map; - if (sanitize_e820_map(e820_saved.map, ARRAY_SIZE(e820_saved.map), &nr_map)) - return; - e820_saved.nr_map = nr_map; + sanitize_e820_map(e820_saved.map, ARRAY_SIZE(e820_saved.map), + &e820_saved.nr_map); } #define MAX_GAP_END 0x100000000ull /* @@ -898,11 +890,9 @@ early_param("memmap", parse_memmap_opt); void __init finish_e820_parsing(void) { if (userdef) { - u32 nr = e820.nr_map; - - if (sanitize_e820_map(e820.map, ARRAY_SIZE(e820.map), &nr) < 0) + if (sanitize_e820_map(e820.map, ARRAY_SIZE(e820.map), + &e820.nr_map) < 0) early_panic("Invalid user supplied memory map"); - e820.nr_map = nr; printk(KERN_INFO "e820: user-defined physical RAM map:\n"); e820_print_map("user"); diff --git a/arch/x86/kernel/entry_64.S b/arch/x86/kernel/entry_64.S index 9ebaf63ba182..db13655c3a2a 100644 --- a/arch/x86/kernel/entry_64.S +++ b/arch/x86/kernel/entry_64.S @@ -143,7 +143,8 @@ ENDPROC(native_usergs_sysret64) movq \tmp,RSP+\offset(%rsp) movq $__USER_DS,SS+\offset(%rsp) movq $__USER_CS,CS+\offset(%rsp) - movq $-1,RCX+\offset(%rsp) + movq RIP+\offset(%rsp),\tmp /* get rip */ + movq \tmp,RCX+\offset(%rsp) /* copy it to rcx as sysret would do */ movq R11+\offset(%rsp),\tmp /* get eflags */ movq \tmp,EFLAGS+\offset(%rsp) .endm @@ -155,27 +156,6 @@ ENDPROC(native_usergs_sysret64) movq \tmp,R11+\offset(%rsp) .endm - .macro FAKE_STACK_FRAME child_rip - /* push in order ss, rsp, eflags, cs, rip */ - xorl %eax, %eax - pushq_cfi $__KERNEL_DS /* ss */ - /*CFI_REL_OFFSET ss,0*/ - pushq_cfi %rax /* rsp */ - CFI_REL_OFFSET rsp,0 - pushq_cfi $(X86_EFLAGS_IF|X86_EFLAGS_FIXED) /* eflags - interrupts on */ - /*CFI_REL_OFFSET rflags,0*/ - pushq_cfi $__KERNEL_CS /* cs */ - /*CFI_REL_OFFSET cs,0*/ - pushq_cfi \child_rip /* rip */ - CFI_REL_OFFSET rip,0 - pushq_cfi %rax /* orig rax */ - .endm - - .macro UNFAKE_STACK_FRAME - addq $8*6, %rsp - CFI_ADJUST_CFA_OFFSET -(6*8) - .endm - /* * initial frame state for interrupts (and exceptions without error code) */ @@ -238,51 +218,6 @@ ENDPROC(native_usergs_sysret64) CFI_REL_OFFSET r15, R15+\offset .endm -/* save partial stack frame */ - .macro SAVE_ARGS_IRQ - cld - /* start from rbp in pt_regs and jump over */ - movq_cfi rdi, (RDI-RBP) - movq_cfi rsi, (RSI-RBP) - movq_cfi rdx, (RDX-RBP) - movq_cfi rcx, (RCX-RBP) - movq_cfi rax, (RAX-RBP) - movq_cfi r8, (R8-RBP) - movq_cfi r9, (R9-RBP) - movq_cfi r10, (R10-RBP) - movq_cfi r11, (R11-RBP) - - /* Save rbp so that we can unwind from get_irq_regs() */ - movq_cfi rbp, 0 - - /* Save previous stack value */ - movq %rsp, %rsi - - leaq -RBP(%rsp),%rdi /* arg1 for handler */ - testl $3, CS-RBP(%rsi) - je 1f - SWAPGS - /* - * irq_count is used to check if a CPU is already on an interrupt stack - * or not. While this is essentially redundant with preempt_count it is - * a little cheaper to use a separate counter in the PDA (short of - * moving irq_enter into assembly, which would be too much work) - */ -1: incl PER_CPU_VAR(irq_count) - cmovzq PER_CPU_VAR(irq_stack_ptr),%rsp - CFI_DEF_CFA_REGISTER rsi - - /* Store previous stack value */ - pushq %rsi - CFI_ESCAPE 0x0f /* DW_CFA_def_cfa_expression */, 6, \ - 0x77 /* DW_OP_breg7 */, 0, \ - 0x06 /* DW_OP_deref */, \ - 0x08 /* DW_OP_const1u */, SS+8-RBP, \ - 0x22 /* DW_OP_plus */ - /* We entered an interrupt context - irqs are off: */ - TRACE_IRQS_OFF - .endm - ENTRY(save_paranoid) XCPT_FRAME 1 RDI+8 cld @@ -426,15 +361,12 @@ system_call_fastpath: * Has incomplete stack frame and undefined top of stack. */ ret_from_sys_call: - movl $_TIF_ALLWORK_MASK,%edi - /* edi: flagmask */ -sysret_check: + testl $_TIF_ALLWORK_MASK,TI_flags+THREAD_INFO(%rsp,RIP-ARGOFFSET) + jnz int_ret_from_sys_call_fixup /* Go the the slow path */ + LOCKDEP_SYS_EXIT DISABLE_INTERRUPTS(CLBR_NONE) TRACE_IRQS_OFF - movl TI_flags+THREAD_INFO(%rsp,RIP-ARGOFFSET),%edx - andl %edi,%edx - jnz sysret_careful CFI_REMEMBER_STATE /* * sysretq will re-enable interrupts: @@ -448,49 +380,10 @@ sysret_check: USERGS_SYSRET64 CFI_RESTORE_STATE - /* Handle reschedules */ - /* edx: work, edi: workmask */ -sysret_careful: - bt $TIF_NEED_RESCHED,%edx - jnc sysret_signal - TRACE_IRQS_ON - ENABLE_INTERRUPTS(CLBR_NONE) - pushq_cfi %rdi - SCHEDULE_USER - popq_cfi %rdi - jmp sysret_check - /* Handle a signal */ -sysret_signal: - TRACE_IRQS_ON - ENABLE_INTERRUPTS(CLBR_NONE) -#ifdef CONFIG_AUDITSYSCALL - bt $TIF_SYSCALL_AUDIT,%edx - jc sysret_audit -#endif - /* - * We have a signal, or exit tracing or single-step. - * These all wind up with the iret return path anyway, - * so just join that path right now. - */ +int_ret_from_sys_call_fixup: FIXUP_TOP_OF_STACK %r11, -ARGOFFSET - jmp int_check_syscall_exit_work - -#ifdef CONFIG_AUDITSYSCALL - /* - * Return fast path for syscall audit. Call __audit_syscall_exit() - * directly and then jump back to the fast path with TIF_SYSCALL_AUDIT - * masked off. - */ -sysret_audit: - movq RAX-ARGOFFSET(%rsp),%rsi /* second arg, syscall return value */ - cmpq $-MAX_ERRNO,%rsi /* is it < -MAX_ERRNO? */ - setbe %al /* 1 if so, 0 if not */ - movzbl %al,%edi /* zero-extend that into %edi */ - call __audit_syscall_exit - movl $(_TIF_ALLWORK_MASK & ~_TIF_SYSCALL_AUDIT),%edi - jmp sysret_check -#endif /* CONFIG_AUDITSYSCALL */ + jmp int_ret_from_sys_call /* Do syscall tracing */ tracesys: @@ -626,19 +519,6 @@ END(\label) FORK_LIKE vfork FIXED_FRAME stub_iopl, sys_iopl -ENTRY(ptregscall_common) - DEFAULT_FRAME 1 8 /* offset 8: return address */ - RESTORE_TOP_OF_STACK %r11, 8 - movq_cfi_restore R15+8, r15 - movq_cfi_restore R14+8, r14 - movq_cfi_restore R13+8, r13 - movq_cfi_restore R12+8, r12 - movq_cfi_restore RBP+8, rbp - movq_cfi_restore RBX+8, rbx - ret $REST_SKIP /* pop extended registers */ - CFI_ENDPROC -END(ptregscall_common) - ENTRY(stub_execve) CFI_STARTPROC addq $8, %rsp @@ -779,7 +659,48 @@ END(interrupt) /* reserve pt_regs for scratch regs and rbp */ subq $ORIG_RAX-RBP, %rsp CFI_ADJUST_CFA_OFFSET ORIG_RAX-RBP - SAVE_ARGS_IRQ + cld + /* start from rbp in pt_regs and jump over */ + movq_cfi rdi, (RDI-RBP) + movq_cfi rsi, (RSI-RBP) + movq_cfi rdx, (RDX-RBP) + movq_cfi rcx, (RCX-RBP) + movq_cfi rax, (RAX-RBP) + movq_cfi r8, (R8-RBP) + movq_cfi r9, (R9-RBP) + movq_cfi r10, (R10-RBP) + movq_cfi r11, (R11-RBP) + + /* Save rbp so that we can unwind from get_irq_regs() */ + movq_cfi rbp, 0 + + /* Save previous stack value */ + movq %rsp, %rsi + + leaq -RBP(%rsp),%rdi /* arg1 for handler */ + testl $3, CS-RBP(%rsi) + je 1f + SWAPGS + /* + * irq_count is used to check if a CPU is already on an interrupt stack + * or not. While this is essentially redundant with preempt_count it is + * a little cheaper to use a separate counter in the PDA (short of + * moving irq_enter into assembly, which would be too much work) + */ +1: incl PER_CPU_VAR(irq_count) + cmovzq PER_CPU_VAR(irq_stack_ptr),%rsp + CFI_DEF_CFA_REGISTER rsi + + /* Store previous stack value */ + pushq %rsi + CFI_ESCAPE 0x0f /* DW_CFA_def_cfa_expression */, 6, \ + 0x77 /* DW_OP_breg7 */, 0, \ + 0x06 /* DW_OP_deref */, \ + 0x08 /* DW_OP_const1u */, SS+8-RBP, \ + 0x22 /* DW_OP_plus */ + /* We entered an interrupt context - irqs are off: */ + TRACE_IRQS_OFF + call \func .endm @@ -831,6 +752,60 @@ retint_swapgs: /* return to user-space */ */ DISABLE_INTERRUPTS(CLBR_ANY) TRACE_IRQS_IRETQ + + /* + * Try to use SYSRET instead of IRET if we're returning to + * a completely clean 64-bit userspace context. + */ + movq (RCX-R11)(%rsp), %rcx + cmpq %rcx,(RIP-R11)(%rsp) /* RCX == RIP */ + jne opportunistic_sysret_failed + + /* + * On Intel CPUs, sysret with non-canonical RCX/RIP will #GP + * in kernel space. This essentially lets the user take over + * the kernel, since userspace controls RSP. It's not worth + * testing for canonicalness exactly -- this check detects any + * of the 17 high bits set, which is true for non-canonical + * or kernel addresses. (This will pessimize vsyscall=native. + * Big deal.) + * + * If virtual addresses ever become wider, this will need + * to be updated to remain correct on both old and new CPUs. + */ + .ifne __VIRTUAL_MASK_SHIFT - 47 + .error "virtual address width changed -- sysret checks need update" + .endif + shr $__VIRTUAL_MASK_SHIFT, %rcx + jnz opportunistic_sysret_failed + + cmpq $__USER_CS,(CS-R11)(%rsp) /* CS must match SYSRET */ + jne opportunistic_sysret_failed + + movq (R11-ARGOFFSET)(%rsp), %r11 + cmpq %r11,(EFLAGS-ARGOFFSET)(%rsp) /* R11 == RFLAGS */ + jne opportunistic_sysret_failed + + testq $X86_EFLAGS_RF,%r11 /* sysret can't restore RF */ + jnz opportunistic_sysret_failed + + /* nothing to check for RSP */ + + cmpq $__USER_DS,(SS-ARGOFFSET)(%rsp) /* SS must match SYSRET */ + jne opportunistic_sysret_failed + + /* + * We win! This label is here just for ease of understanding + * perf profiles. Nothing jumps here. + */ +irq_return_via_sysret: + CFI_REMEMBER_STATE + RESTORE_ARGS 1,8,1 + movq (RSP-RIP)(%rsp),%rsp + USERGS_SYSRET64 + CFI_RESTORE_STATE + +opportunistic_sysret_failed: SWAPGS jmp restore_args @@ -1048,6 +1023,11 @@ ENTRY(\sym) CFI_ADJUST_CFA_OFFSET ORIG_RAX-R15 .if \paranoid + .if \paranoid == 1 + CFI_REMEMBER_STATE + testl $3, CS(%rsp) /* If coming from userspace, switch */ + jnz 1f /* stacks. */ + .endif call save_paranoid .else call error_entry @@ -1088,6 +1068,36 @@ ENTRY(\sym) jmp error_exit /* %ebx: no swapgs flag */ .endif + .if \paranoid == 1 + CFI_RESTORE_STATE + /* + * Paranoid entry from userspace. Switch stacks and treat it + * as a normal entry. This means that paranoid handlers + * run in real process context if user_mode(regs). + */ +1: + call error_entry + + DEFAULT_FRAME 0 + + movq %rsp,%rdi /* pt_regs pointer */ + call sync_regs + movq %rax,%rsp /* switch stack */ + + movq %rsp,%rdi /* pt_regs pointer */ + + .if \has_error_code + movq ORIG_RAX(%rsp),%rsi /* get error code */ + movq $-1,ORIG_RAX(%rsp) /* no syscall to restart */ + .else + xorl %esi,%esi /* no error code */ + .endif + + call \do_sym + + jmp error_exit /* %ebx: no swapgs flag */ + .endif + CFI_ENDPROC END(\sym) .endm @@ -1108,7 +1118,7 @@ idtentry overflow do_overflow has_error_code=0 idtentry bounds do_bounds has_error_code=0 idtentry invalid_op do_invalid_op has_error_code=0 idtentry device_not_available do_device_not_available has_error_code=0 -idtentry double_fault do_double_fault has_error_code=1 paranoid=1 +idtentry double_fault do_double_fault has_error_code=1 paranoid=2 idtentry coprocessor_segment_overrun do_coprocessor_segment_overrun has_error_code=0 idtentry invalid_TSS do_invalid_TSS has_error_code=1 idtentry segment_not_present do_segment_not_present has_error_code=1 @@ -1289,16 +1299,14 @@ idtentry machine_check has_error_code=0 paranoid=1 do_sym=*machine_check_vector( #endif /* - * "Paranoid" exit path from exception stack. - * Paranoid because this is used by NMIs and cannot take - * any kernel state for granted. - * We don't do kernel preemption checks here, because only - * NMI should be common and it does not enable IRQs and - * cannot get reschedule ticks. + * "Paranoid" exit path from exception stack. This is invoked + * only on return from non-NMI IST interrupts that came + * from kernel space. * - * "trace" is 0 for the NMI handler only, because irq-tracing - * is fundamentally NMI-unsafe. (we cannot change the soft and - * hard flags at once, atomically) + * We may be returning to very strange contexts (e.g. very early + * in syscall entry), so checking for preemption here would + * be complicated. Fortunately, we there's no good reason + * to try to handle preemption here. */ /* ebx: no swapgs flag */ @@ -1308,43 +1316,14 @@ ENTRY(paranoid_exit) TRACE_IRQS_OFF_DEBUG testl %ebx,%ebx /* swapgs needed? */ jnz paranoid_restore - testl $3,CS(%rsp) - jnz paranoid_userspace -paranoid_swapgs: TRACE_IRQS_IRETQ 0 SWAPGS_UNSAFE_STACK RESTORE_ALL 8 - jmp irq_return + INTERRUPT_RETURN paranoid_restore: TRACE_IRQS_IRETQ_DEBUG 0 RESTORE_ALL 8 - jmp irq_return -paranoid_userspace: - GET_THREAD_INFO(%rcx) - movl TI_flags(%rcx),%ebx - andl $_TIF_WORK_MASK,%ebx - jz paranoid_swapgs - movq %rsp,%rdi /* &pt_regs */ - call sync_regs - movq %rax,%rsp /* switch stack for scheduling */ - testl $_TIF_NEED_RESCHED,%ebx - jnz paranoid_schedule - movl %ebx,%edx /* arg3: thread flags */ - TRACE_IRQS_ON - ENABLE_INTERRUPTS(CLBR_NONE) - xorl %esi,%esi /* arg2: oldset */ - movq %rsp,%rdi /* arg1: &pt_regs */ - call do_notify_resume - DISABLE_INTERRUPTS(CLBR_NONE) - TRACE_IRQS_OFF - jmp paranoid_userspace -paranoid_schedule: - TRACE_IRQS_ON - ENABLE_INTERRUPTS(CLBR_ANY) - SCHEDULE_USER - DISABLE_INTERRUPTS(CLBR_ANY) - TRACE_IRQS_OFF - jmp paranoid_userspace + INTERRUPT_RETURN CFI_ENDPROC END(paranoid_exit) diff --git a/arch/x86/kernel/hw_breakpoint.c b/arch/x86/kernel/hw_breakpoint.c index 3d5fb509bdeb..7114ba220fd4 100644 --- a/arch/x86/kernel/hw_breakpoint.c +++ b/arch/x86/kernel/hw_breakpoint.c @@ -126,6 +126,8 @@ int arch_install_hw_breakpoint(struct perf_event *bp) *dr7 |= encode_dr7(i, info->len, info->type); set_debugreg(*dr7, 7); + if (info->mask) + set_dr_addr_mask(info->mask, i); return 0; } @@ -161,29 +163,8 @@ void arch_uninstall_hw_breakpoint(struct perf_event *bp) *dr7 &= ~__encode_dr7(i, info->len, info->type); set_debugreg(*dr7, 7); -} - -static int get_hbp_len(u8 hbp_len) -{ - unsigned int len_in_bytes = 0; - - switch (hbp_len) { - case X86_BREAKPOINT_LEN_1: - len_in_bytes = 1; - break; - case X86_BREAKPOINT_LEN_2: - len_in_bytes = 2; - break; - case X86_BREAKPOINT_LEN_4: - len_in_bytes = 4; - break; -#ifdef CONFIG_X86_64 - case X86_BREAKPOINT_LEN_8: - len_in_bytes = 8; - break; -#endif - } - return len_in_bytes; + if (info->mask) + set_dr_addr_mask(0, i); } /* @@ -196,7 +177,7 @@ int arch_check_bp_in_kernelspace(struct perf_event *bp) struct arch_hw_breakpoint *info = counter_arch_bp(bp); va = info->address; - len = get_hbp_len(info->len); + len = bp->attr.bp_len; return (va >= TASK_SIZE) && ((va + len - 1) >= TASK_SIZE); } @@ -277,6 +258,8 @@ static int arch_build_bp_info(struct perf_event *bp) } /* Len */ + info->mask = 0; + switch (bp->attr.bp_len) { case HW_BREAKPOINT_LEN_1: info->len = X86_BREAKPOINT_LEN_1; @@ -293,11 +276,17 @@ static int arch_build_bp_info(struct perf_event *bp) break; #endif default: - return -EINVAL; + if (!is_power_of_2(bp->attr.bp_len)) + return -EINVAL; + if (!cpu_has_bpext) + return -EOPNOTSUPP; + info->mask = bp->attr.bp_len - 1; + info->len = X86_BREAKPOINT_LEN_1; } return 0; } + /* * Validate the arch-specific HW Breakpoint register settings */ @@ -312,11 +301,11 @@ int arch_validate_hwbkpt_settings(struct perf_event *bp) if (ret) return ret; - ret = -EINVAL; - switch (info->len) { case X86_BREAKPOINT_LEN_1: align = 0; + if (info->mask) + align = info->mask; break; case X86_BREAKPOINT_LEN_2: align = 1; @@ -330,7 +319,7 @@ int arch_validate_hwbkpt_settings(struct perf_event *bp) break; #endif default: - return ret; + WARN_ON_ONCE(1); } /* diff --git a/arch/x86/kernel/i387.c b/arch/x86/kernel/i387.c index a9a4229f6161..81049ffab2d6 100644 --- a/arch/x86/kernel/i387.c +++ b/arch/x86/kernel/i387.c @@ -19,6 +19,19 @@ #include <asm/fpu-internal.h> #include <asm/user.h> +static DEFINE_PER_CPU(bool, in_kernel_fpu); + +void kernel_fpu_disable(void) +{ + WARN_ON(this_cpu_read(in_kernel_fpu)); + this_cpu_write(in_kernel_fpu, true); +} + +void kernel_fpu_enable(void) +{ + this_cpu_write(in_kernel_fpu, false); +} + /* * Were we in an interrupt that interrupted kernel mode? * @@ -33,6 +46,9 @@ */ static inline bool interrupted_kernel_fpu_idle(void) { + if (this_cpu_read(in_kernel_fpu)) + return false; + if (use_eager_fpu()) return __thread_has_fpu(current); @@ -73,10 +89,10 @@ void __kernel_fpu_begin(void) { struct task_struct *me = current; + this_cpu_write(in_kernel_fpu, true); + if (__thread_has_fpu(me)) { - __thread_clear_has_fpu(me); __save_init_fpu(me); - /* We do 'stts()' in __kernel_fpu_end() */ } else if (!use_eager_fpu()) { this_cpu_write(fpu_owner_task, NULL); clts(); @@ -86,19 +102,16 @@ EXPORT_SYMBOL(__kernel_fpu_begin); void __kernel_fpu_end(void) { - if (use_eager_fpu()) { - /* - * For eager fpu, most the time, tsk_used_math() is true. - * Restore the user math as we are done with the kernel usage. - * At few instances during thread exit, signal handling etc, - * tsk_used_math() is false. Those few places will take proper - * actions, so we don't need to restore the math here. - */ - if (likely(tsk_used_math(current))) - math_state_restore(); - } else { + struct task_struct *me = current; + + if (__thread_has_fpu(me)) { + if (WARN_ON(restore_fpu_checking(me))) + drop_init_fpu(me); + } else if (!use_eager_fpu()) { stts(); } + + this_cpu_write(in_kernel_fpu, false); } EXPORT_SYMBOL(__kernel_fpu_end); diff --git a/arch/x86/kernel/irq_32.c b/arch/x86/kernel/irq_32.c index 63ce838e5a54..28d28f5eb8f4 100644 --- a/arch/x86/kernel/irq_32.c +++ b/arch/x86/kernel/irq_32.c @@ -69,16 +69,9 @@ static void call_on_stack(void *func, void *stack) : "memory", "cc", "edx", "ecx", "eax"); } -/* how to get the current stack pointer from C */ -#define current_stack_pointer ({ \ - unsigned long sp; \ - asm("mov %%esp,%0" : "=g" (sp)); \ - sp; \ -}) - static inline void *current_stack(void) { - return (void *)(current_stack_pointer & ~(THREAD_SIZE - 1)); + return (void *)(current_stack_pointer() & ~(THREAD_SIZE - 1)); } static inline int @@ -103,7 +96,7 @@ execute_on_irq_stack(int overflow, struct irq_desc *desc, int irq) /* Save the next esp at the bottom of the stack */ prev_esp = (u32 *)irqstk; - *prev_esp = current_stack_pointer; + *prev_esp = current_stack_pointer(); if (unlikely(overflow)) call_on_stack(print_stack_overflow, isp); @@ -156,7 +149,7 @@ void do_softirq_own_stack(void) /* Push the previous esp onto the stack */ prev_esp = (u32 *)irqstk; - *prev_esp = current_stack_pointer; + *prev_esp = current_stack_pointer(); call_on_stack(__do_softirq, isp); } diff --git a/arch/x86/kernel/pmc_atom.c b/arch/x86/kernel/pmc_atom.c index 0ee5025e0fa4..d66a4fe6caee 100644 --- a/arch/x86/kernel/pmc_atom.c +++ b/arch/x86/kernel/pmc_atom.c @@ -25,8 +25,6 @@ #include <asm/pmc_atom.h> -#define DRIVER_NAME KBUILD_MODNAME - struct pmc_dev { u32 base_addr; void __iomem *regmap; @@ -38,12 +36,12 @@ struct pmc_dev { static struct pmc_dev pmc_device; static u32 acpi_base_addr; -struct pmc_dev_map { +struct pmc_bit_map { const char *name; u32 bit_mask; }; -static const struct pmc_dev_map dev_map[] = { +static const struct pmc_bit_map dev_map[] = { {"0 - LPSS1_F0_DMA", BIT_LPSS1_F0_DMA}, {"1 - LPSS1_F1_PWM1", BIT_LPSS1_F1_PWM1}, {"2 - LPSS1_F2_PWM2", BIT_LPSS1_F2_PWM2}, @@ -82,6 +80,27 @@ static const struct pmc_dev_map dev_map[] = { {"35 - DFX", BIT_DFX}, }; +static const struct pmc_bit_map pss_map[] = { + {"0 - GBE", PMC_PSS_BIT_GBE}, + {"1 - SATA", PMC_PSS_BIT_SATA}, + {"2 - HDA", PMC_PSS_BIT_HDA}, + {"3 - SEC", PMC_PSS_BIT_SEC}, + {"4 - PCIE", PMC_PSS_BIT_PCIE}, + {"5 - LPSS", PMC_PSS_BIT_LPSS}, + {"6 - LPE", PMC_PSS_BIT_LPE}, + {"7 - DFX", PMC_PSS_BIT_DFX}, + {"8 - USH_CTRL", PMC_PSS_BIT_USH_CTRL}, + {"9 - USH_SUS", PMC_PSS_BIT_USH_SUS}, + {"10 - USH_VCCS", PMC_PSS_BIT_USH_VCCS}, + {"11 - USH_VCCA", PMC_PSS_BIT_USH_VCCA}, + {"12 - OTG_CTRL", PMC_PSS_BIT_OTG_CTRL}, + {"13 - OTG_VCCS", PMC_PSS_BIT_OTG_VCCS}, + {"14 - OTG_VCCA_CLK", PMC_PSS_BIT_OTG_VCCA_CLK}, + {"15 - OTG_VCCA", PMC_PSS_BIT_OTG_VCCA}, + {"16 - USB", PMC_PSS_BIT_USB}, + {"17 - USB_SUS", PMC_PSS_BIT_USB_SUS}, +}; + static inline u32 pmc_reg_read(struct pmc_dev *pmc, int reg_offset) { return readl(pmc->regmap + reg_offset); @@ -169,6 +188,32 @@ static const struct file_operations pmc_dev_state_ops = { .release = single_release, }; +static int pmc_pss_state_show(struct seq_file *s, void *unused) +{ + struct pmc_dev *pmc = s->private; + u32 pss = pmc_reg_read(pmc, PMC_PSS); + int pss_index; + + for (pss_index = 0; pss_index < ARRAY_SIZE(pss_map); pss_index++) { + seq_printf(s, "Island: %-32s\tState: %s\n", + pss_map[pss_index].name, + pss_map[pss_index].bit_mask & pss ? "Off" : "On"); + } + return 0; +} + +static int pmc_pss_state_open(struct inode *inode, struct file *file) +{ + return single_open(file, pmc_pss_state_show, inode->i_private); +} + +static const struct file_operations pmc_pss_state_ops = { + .open = pmc_pss_state_open, + .read = seq_read, + .llseek = seq_lseek, + .release = single_release, +}; + static int pmc_sleep_tmr_show(struct seq_file *s, void *unused) { struct pmc_dev *pmc = s->private; @@ -202,11 +247,7 @@ static const struct file_operations pmc_sleep_tmr_ops = { static void pmc_dbgfs_unregister(struct pmc_dev *pmc) { - if (!pmc->dbgfs_dir) - return; - debugfs_remove_recursive(pmc->dbgfs_dir); - pmc->dbgfs_dir = NULL; } static int pmc_dbgfs_register(struct pmc_dev *pmc, struct pci_dev *pdev) @@ -217,19 +258,29 @@ static int pmc_dbgfs_register(struct pmc_dev *pmc, struct pci_dev *pdev) if (!dir) return -ENOMEM; + pmc->dbgfs_dir = dir; + f = debugfs_create_file("dev_state", S_IFREG | S_IRUGO, dir, pmc, &pmc_dev_state_ops); if (!f) { - dev_err(&pdev->dev, "dev_states register failed\n"); + dev_err(&pdev->dev, "dev_state register failed\n"); goto err; } + + f = debugfs_create_file("pss_state", S_IFREG | S_IRUGO, + dir, pmc, &pmc_pss_state_ops); + if (!f) { + dev_err(&pdev->dev, "pss_state register failed\n"); + goto err; + } + f = debugfs_create_file("sleep_state", S_IFREG | S_IRUGO, dir, pmc, &pmc_sleep_tmr_ops); if (!f) { dev_err(&pdev->dev, "sleep_state register failed\n"); goto err; } - pmc->dbgfs_dir = dir; + return 0; err: pmc_dbgfs_unregister(pmc); @@ -292,7 +343,6 @@ MODULE_DEVICE_TABLE(pci, pmc_pci_ids); static int __init pmc_atom_init(void) { - int err = -ENODEV; struct pci_dev *pdev = NULL; const struct pci_device_id *ent; @@ -306,14 +356,11 @@ static int __init pmc_atom_init(void) */ for_each_pci_dev(pdev) { ent = pci_match_id(pmc_pci_ids, pdev); - if (ent) { - err = pmc_setup_dev(pdev); - goto out; - } + if (ent) + return pmc_setup_dev(pdev); } /* Device not found. */ -out: - return err; + return -ENODEV; } module_init(pmc_atom_init); diff --git a/arch/x86/kernel/rtc.c b/arch/x86/kernel/rtc.c index ca9622a25e95..fe3dbfe0c4a5 100644 --- a/arch/x86/kernel/rtc.c +++ b/arch/x86/kernel/rtc.c @@ -170,7 +170,7 @@ static struct platform_device rtc_device = { static __init int add_rtc_cmos(void) { #ifdef CONFIG_PNP - static const char * const const ids[] __initconst = + static const char * const ids[] __initconst = { "PNP0b00", "PNP0b01", "PNP0b02", }; struct pnp_dev *dev; struct pnp_id *id; diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c index 16b6043cb073..0d8071d7addb 100644 --- a/arch/x86/kernel/setup.c +++ b/arch/x86/kernel/setup.c @@ -438,15 +438,13 @@ static void __init parse_setup_data(void) pa_data = boot_params.hdr.setup_data; while (pa_data) { - u32 data_len, map_len, data_type; + u32 data_len, data_type; - map_len = max(PAGE_SIZE - (pa_data & ~PAGE_MASK), - (u64)sizeof(struct setup_data)); - data = early_memremap(pa_data, map_len); + data = early_memremap(pa_data, sizeof(*data)); data_len = data->len + sizeof(struct setup_data); data_type = data->type; pa_next = data->next; - early_iounmap(data, map_len); + early_iounmap(data, sizeof(*data)); switch (data_type) { case SETUP_E820_EXT: diff --git a/arch/x86/kernel/signal.c b/arch/x86/kernel/signal.c index ed37a768d0fc..2a33c8f68319 100644 --- a/arch/x86/kernel/signal.c +++ b/arch/x86/kernel/signal.c @@ -740,12 +740,6 @@ do_notify_resume(struct pt_regs *regs, void *unused, __u32 thread_info_flags) { user_exit(); -#ifdef CONFIG_X86_MCE - /* notify userspace of pending MCEs */ - if (thread_info_flags & _TIF_MCE_NOTIFY) - mce_notify_process(); -#endif /* CONFIG_X86_64 && CONFIG_X86_MCE */ - if (thread_info_flags & _TIF_UPROBE) uprobe_notify_resume(regs); diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c index 6d7022c683e3..febc6aabc72e 100644 --- a/arch/x86/kernel/smpboot.c +++ b/arch/x86/kernel/smpboot.c @@ -73,7 +73,6 @@ #include <asm/setup.h> #include <asm/uv/uv.h> #include <linux/mc146818rtc.h> -#include <asm/smpboot_hooks.h> #include <asm/i8259.h> #include <asm/realmode.h> #include <asm/misc.h> @@ -104,6 +103,43 @@ EXPORT_PER_CPU_SYMBOL(cpu_info); atomic_t init_deasserted; +static inline void smpboot_setup_warm_reset_vector(unsigned long start_eip) +{ + unsigned long flags; + + spin_lock_irqsave(&rtc_lock, flags); + CMOS_WRITE(0xa, 0xf); + spin_unlock_irqrestore(&rtc_lock, flags); + local_flush_tlb(); + pr_debug("1.\n"); + *((volatile unsigned short *)phys_to_virt(TRAMPOLINE_PHYS_HIGH)) = + start_eip >> 4; + pr_debug("2.\n"); + *((volatile unsigned short *)phys_to_virt(TRAMPOLINE_PHYS_LOW)) = + start_eip & 0xf; + pr_debug("3.\n"); +} + +static inline void smpboot_restore_warm_reset_vector(void) +{ + unsigned long flags; + + /* + * Install writable page 0 entry to set BIOS data area. + */ + local_flush_tlb(); + + /* + * Paranoid: Set warm reset code and vector here back + * to default values. + */ + spin_lock_irqsave(&rtc_lock, flags); + CMOS_WRITE(0, 0xf); + spin_unlock_irqrestore(&rtc_lock, flags); + + *((volatile u32 *)phys_to_virt(TRAMPOLINE_PHYS_LOW)) = 0; +} + /* * Report back to the Boot Processor during boot time or to the caller processor * during CPU online. @@ -136,8 +172,7 @@ static void smp_callin(void) * CPU, first the APIC. (this is probably redundant on most * boards) */ - setup_local_APIC(); - end_local_APIC_setup(); + apic_ap_setup(); /* * Need to setup vector mappings before we enable interrupts. @@ -955,9 +990,12 @@ void arch_disable_smp_support(void) */ static __init void disable_smp(void) { + pr_info("SMP disabled\n"); + + disable_ioapic_support(); + init_cpu_present(cpumask_of(0)); init_cpu_possible(cpumask_of(0)); - smpboot_clear_io_apic_irqs(); if (smp_found_config) physid_set_mask_of_physid(boot_cpu_physical_apicid, &phys_cpu_present_map); @@ -967,6 +1005,13 @@ static __init void disable_smp(void) cpumask_set_cpu(0, cpu_core_mask(0)); } +enum { + SMP_OK, + SMP_NO_CONFIG, + SMP_NO_APIC, + SMP_FORCE_UP, +}; + /* * Various sanity checks. */ @@ -1014,10 +1059,7 @@ static int __init smp_sanity_check(unsigned max_cpus) if (!smp_found_config && !acpi_lapic) { preempt_enable(); pr_notice("SMP motherboard not detected\n"); - disable_smp(); - if (APIC_init_uniprocessor()) - pr_notice("Local APIC not detected. Using dummy APIC emulation.\n"); - return -1; + return SMP_NO_CONFIG; } /* @@ -1041,9 +1083,7 @@ static int __init smp_sanity_check(unsigned max_cpus) boot_cpu_physical_apicid); pr_err("... forcing use of dummy APIC emulation (tell your hw vendor)\n"); } - smpboot_clear_io_apic(); - disable_ioapic_support(); - return -1; + return SMP_NO_APIC; } verify_local_APIC(); @@ -1053,15 +1093,10 @@ static int __init smp_sanity_check(unsigned max_cpus) */ if (!max_cpus) { pr_info("SMP mode deactivated\n"); - smpboot_clear_io_apic(); - - connect_bsp_APIC(); - setup_local_APIC(); - bsp_end_local_APIC_setup(); - return -1; + return SMP_FORCE_UP; } - return 0; + return SMP_OK; } static void __init smp_cpu_index_default(void) @@ -1101,10 +1136,21 @@ void __init native_smp_prepare_cpus(unsigned int max_cpus) } set_cpu_sibling_map(0); - if (smp_sanity_check(max_cpus) < 0) { - pr_info("SMP disabled\n"); + switch (smp_sanity_check(max_cpus)) { + case SMP_NO_CONFIG: disable_smp(); + if (APIC_init_uniprocessor()) + pr_notice("Local APIC not detected. Using dummy APIC emulation.\n"); return; + case SMP_NO_APIC: + disable_smp(); + return; + case SMP_FORCE_UP: + disable_smp(); + apic_bsp_setup(false); + return; + case SMP_OK: + break; } default_setup_apic_routing(); @@ -1115,33 +1161,10 @@ void __init native_smp_prepare_cpus(unsigned int max_cpus) /* Or can we switch back to PIC here? */ } - connect_bsp_APIC(); - - /* - * Switch from PIC to APIC mode. - */ - setup_local_APIC(); - - if (x2apic_mode) - cpu0_logical_apicid = apic_read(APIC_LDR); - else - cpu0_logical_apicid = GET_APIC_LOGICAL_ID(apic_read(APIC_LDR)); - - /* - * Enable IO APIC before setting up error vector - */ - if (!skip_ioapic_setup && nr_ioapics) - enable_IO_APIC(); - - bsp_end_local_APIC_setup(); - smpboot_setup_io_apic(); - /* - * Set up local APIC timer on boot CPU. - */ + cpu0_logical_apicid = apic_bsp_setup(false); pr_info("CPU%d: ", 0); print_cpu_info(&cpu_data(0)); - x86_init.timers.setup_percpu_clockev(); if (is_uv_system()) uv_system_init(); @@ -1177,9 +1200,7 @@ void __init native_smp_cpus_done(unsigned int max_cpus) nmi_selftest(); impress_friends(); -#ifdef CONFIG_X86_IO_APIC setup_ioapic_dest(); -#endif mtrr_aps_init(); } diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c index 88900e288021..9d2073e2ecc9 100644 --- a/arch/x86/kernel/traps.c +++ b/arch/x86/kernel/traps.c @@ -108,6 +108,88 @@ static inline void preempt_conditional_cli(struct pt_regs *regs) preempt_count_dec(); } +enum ctx_state ist_enter(struct pt_regs *regs) +{ + enum ctx_state prev_state; + + if (user_mode_vm(regs)) { + /* Other than that, we're just an exception. */ + prev_state = exception_enter(); + } else { + /* + * We might have interrupted pretty much anything. In + * fact, if we're a machine check, we can even interrupt + * NMI processing. We don't want in_nmi() to return true, + * but we need to notify RCU. + */ + rcu_nmi_enter(); + prev_state = IN_KERNEL; /* the value is irrelevant. */ + } + + /* + * We are atomic because we're on the IST stack (or we're on x86_32, + * in which case we still shouldn't schedule). + * + * This must be after exception_enter(), because exception_enter() + * won't do anything if in_interrupt() returns true. + */ + preempt_count_add(HARDIRQ_OFFSET); + + /* This code is a bit fragile. Test it. */ + rcu_lockdep_assert(rcu_is_watching(), "ist_enter didn't work"); + + return prev_state; +} + +void ist_exit(struct pt_regs *regs, enum ctx_state prev_state) +{ + /* Must be before exception_exit. */ + preempt_count_sub(HARDIRQ_OFFSET); + + if (user_mode_vm(regs)) + return exception_exit(prev_state); + else + rcu_nmi_exit(); +} + +/** + * ist_begin_non_atomic() - begin a non-atomic section in an IST exception + * @regs: regs passed to the IST exception handler + * + * IST exception handlers normally cannot schedule. As a special + * exception, if the exception interrupted userspace code (i.e. + * user_mode_vm(regs) would return true) and the exception was not + * a double fault, it can be safe to schedule. ist_begin_non_atomic() + * begins a non-atomic section within an ist_enter()/ist_exit() region. + * Callers are responsible for enabling interrupts themselves inside + * the non-atomic section, and callers must call is_end_non_atomic() + * before ist_exit(). + */ +void ist_begin_non_atomic(struct pt_regs *regs) +{ + BUG_ON(!user_mode_vm(regs)); + + /* + * Sanity check: we need to be on the normal thread stack. This + * will catch asm bugs and any attempt to use ist_preempt_enable + * from double_fault. + */ + BUG_ON(((current_stack_pointer() ^ this_cpu_read_stable(kernel_stack)) + & ~(THREAD_SIZE - 1)) != 0); + + preempt_count_sub(HARDIRQ_OFFSET); +} + +/** + * ist_end_non_atomic() - begin a non-atomic section in an IST exception + * + * Ends a non-atomic section started with ist_begin_non_atomic(). + */ +void ist_end_non_atomic(void) +{ + preempt_count_add(HARDIRQ_OFFSET); +} + static nokprobe_inline int do_trap_no_signal(struct task_struct *tsk, int trapnr, char *str, struct pt_regs *regs, long error_code) @@ -251,6 +333,8 @@ dotraplinkage void do_double_fault(struct pt_regs *regs, long error_code) * end up promoting it to a doublefault. In that case, modify * the stack to make it look like we just entered the #GP * handler from user space, similar to bad_iret. + * + * No need for ist_enter here because we don't use RCU. */ if (((long)regs->sp >> PGDIR_SHIFT) == ESPFIX_PGD_ENTRY && regs->cs == __KERNEL_CS && @@ -263,12 +347,12 @@ dotraplinkage void do_double_fault(struct pt_regs *regs, long error_code) normal_regs->orig_ax = 0; /* Missing (lost) #GP error code */ regs->ip = (unsigned long)general_protection; regs->sp = (unsigned long)&normal_regs->orig_ax; + return; } #endif - exception_enter(); - /* Return not checked because double check cannot be ignored */ + ist_enter(regs); /* Discard prev_state because we won't return. */ notify_die(DIE_TRAP, str, regs, error_code, X86_TRAP_DF, SIGSEGV); tsk->thread.error_code = error_code; @@ -434,7 +518,7 @@ dotraplinkage void notrace do_int3(struct pt_regs *regs, long error_code) if (poke_int3_handler(regs)) return; - prev_state = exception_enter(); + prev_state = ist_enter(regs); #ifdef CONFIG_KGDB_LOW_LEVEL_TRAP if (kgdb_ll_trap(DIE_INT3, "int3", regs, error_code, X86_TRAP_BP, SIGTRAP) == NOTIFY_STOP) @@ -460,33 +544,20 @@ dotraplinkage void notrace do_int3(struct pt_regs *regs, long error_code) preempt_conditional_cli(regs); debug_stack_usage_dec(); exit: - exception_exit(prev_state); + ist_exit(regs, prev_state); } NOKPROBE_SYMBOL(do_int3); #ifdef CONFIG_X86_64 /* - * Help handler running on IST stack to switch back to user stack - * for scheduling or signal handling. The actual stack switch is done in - * entry.S + * Help handler running on IST stack to switch off the IST stack if the + * interrupted code was in user mode. The actual stack switch is done in + * entry_64.S */ asmlinkage __visible notrace struct pt_regs *sync_regs(struct pt_regs *eregs) { - struct pt_regs *regs = eregs; - /* Did already sync */ - if (eregs == (struct pt_regs *)eregs->sp) - ; - /* Exception from user space */ - else if (user_mode(eregs)) - regs = task_pt_regs(current); - /* - * Exception from kernel and interrupts are enabled. Move to - * kernel process stack. - */ - else if (eregs->flags & X86_EFLAGS_IF) - regs = (struct pt_regs *)(eregs->sp -= sizeof(struct pt_regs)); - if (eregs != regs) - *regs = *eregs; + struct pt_regs *regs = task_pt_regs(current); + *regs = *eregs; return regs; } NOKPROBE_SYMBOL(sync_regs); @@ -554,7 +625,7 @@ dotraplinkage void do_debug(struct pt_regs *regs, long error_code) unsigned long dr6; int si_code; - prev_state = exception_enter(); + prev_state = ist_enter(regs); get_debugreg(dr6, 6); @@ -629,7 +700,7 @@ dotraplinkage void do_debug(struct pt_regs *regs, long error_code) debug_stack_usage_dec(); exit: - exception_exit(prev_state); + ist_exit(regs, prev_state); } NOKPROBE_SYMBOL(do_debug); @@ -788,18 +859,16 @@ void math_state_restore(void) local_irq_disable(); } + /* Avoid __kernel_fpu_begin() right after __thread_fpu_begin() */ + kernel_fpu_disable(); __thread_fpu_begin(tsk); - - /* - * Paranoid restore. send a SIGSEGV if we fail to restore the state. - */ if (unlikely(restore_fpu_checking(tsk))) { drop_init_fpu(tsk); force_sig_info(SIGSEGV, SEND_SIG_PRIV, tsk); - return; + } else { + tsk->thread.fpu_counter++; } - - tsk->thread.fpu_counter++; + kernel_fpu_enable(); } EXPORT_SYMBOL_GPL(math_state_restore); diff --git a/arch/x86/kvm/Kconfig b/arch/x86/kvm/Kconfig index f9d16ff56c6b..7dc7ba577ecd 100644 --- a/arch/x86/kvm/Kconfig +++ b/arch/x86/kvm/Kconfig @@ -40,6 +40,7 @@ config KVM select HAVE_KVM_MSI select HAVE_KVM_CPU_RELAX_INTERCEPT select KVM_VFIO + select SRCU ---help--- Support hosting fully virtualized guest machines using hardware virtualization extensions. You will need a fairly recent diff --git a/arch/x86/mm/init.c b/arch/x86/mm/init.c index 079c3b6a3ff1..7ff24240d863 100644 --- a/arch/x86/mm/init.c +++ b/arch/x86/mm/init.c @@ -238,6 +238,31 @@ static void __init_refok adjust_range_page_size_mask(struct map_range *mr, } } +static const char *page_size_string(struct map_range *mr) +{ + static const char str_1g[] = "1G"; + static const char str_2m[] = "2M"; + static const char str_4m[] = "4M"; + static const char str_4k[] = "4k"; + + if (mr->page_size_mask & (1<<PG_LEVEL_1G)) + return str_1g; + /* + * 32-bit without PAE has a 4M large page size. + * PG_LEVEL_2M is misnamed, but we can at least + * print out the right size in the string. + */ + if (IS_ENABLED(CONFIG_X86_32) && + !IS_ENABLED(CONFIG_X86_PAE) && + mr->page_size_mask & (1<<PG_LEVEL_2M)) + return str_4m; + + if (mr->page_size_mask & (1<<PG_LEVEL_2M)) + return str_2m; + + return str_4k; +} + static int __meminit split_mem_range(struct map_range *mr, int nr_range, unsigned long start, unsigned long end) @@ -333,8 +358,7 @@ static int __meminit split_mem_range(struct map_range *mr, int nr_range, for (i = 0; i < nr_range; i++) printk(KERN_DEBUG " [mem %#010lx-%#010lx] page %s\n", mr[i].start, mr[i].end - 1, - (mr[i].page_size_mask & (1<<PG_LEVEL_1G))?"1G":( - (mr[i].page_size_mask & (1<<PG_LEVEL_2M))?"2M":"4k")); + page_size_string(&mr[i])); return nr_range; } diff --git a/arch/x86/pci/intel_mid_pci.c b/arch/x86/pci/intel_mid_pci.c index 44b9271580b5..852aa4c92da0 100644 --- a/arch/x86/pci/intel_mid_pci.c +++ b/arch/x86/pci/intel_mid_pci.c @@ -293,7 +293,6 @@ static void mrst_power_off_unused_dev(struct pci_dev *dev) DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_INTEL, 0x0801, mrst_power_off_unused_dev); DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_INTEL, 0x0809, mrst_power_off_unused_dev); DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_INTEL, 0x080C, mrst_power_off_unused_dev); -DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_INTEL, 0x0812, mrst_power_off_unused_dev); DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_INTEL, 0x0815, mrst_power_off_unused_dev); /* diff --git a/arch/x86/pci/mmconfig-shared.c b/arch/x86/pci/mmconfig-shared.c index 326198a4434e..676e5e04e4d4 100644 --- a/arch/x86/pci/mmconfig-shared.c +++ b/arch/x86/pci/mmconfig-shared.c @@ -610,6 +610,32 @@ static int __init pci_parse_mcfg(struct acpi_table_header *header) return 0; } +#ifdef CONFIG_ACPI_APEI +extern int (*arch_apei_filter_addr)(int (*func)(__u64 start, __u64 size, + void *data), void *data); + +static int pci_mmcfg_for_each_region(int (*func)(__u64 start, __u64 size, + void *data), void *data) +{ + struct pci_mmcfg_region *cfg; + int rc; + + if (list_empty(&pci_mmcfg_list)) + return 0; + + list_for_each_entry(cfg, &pci_mmcfg_list, list) { + rc = func(cfg->res.start, resource_size(&cfg->res), data); + if (rc) + return rc; + } + + return 0; +} +#define set_apei_filter() (arch_apei_filter_addr = pci_mmcfg_for_each_region) +#else +#define set_apei_filter() +#endif + static void __init __pci_mmcfg_init(int early) { pci_mmcfg_reject_broken(early); @@ -644,6 +670,8 @@ void __init pci_mmcfg_early_init(void) else acpi_sfi_table_parse(ACPI_SIG_MCFG, pci_parse_mcfg); __pci_mmcfg_init(1); + + set_apei_filter(); } } diff --git a/arch/x86/platform/efi/efi_stub_64.S b/arch/x86/platform/efi/efi_stub_64.S index 5fcda7272550..86d0f9e08dd9 100644 --- a/arch/x86/platform/efi/efi_stub_64.S +++ b/arch/x86/platform/efi/efi_stub_64.S @@ -91,167 +91,6 @@ ENTRY(efi_call) ret ENDPROC(efi_call) -#ifdef CONFIG_EFI_MIXED - -/* - * We run this function from the 1:1 mapping. - * - * This function must be invoked with a 1:1 mapped stack. - */ -ENTRY(__efi64_thunk) - movl %ds, %eax - push %rax - movl %es, %eax - push %rax - movl %ss, %eax - push %rax - - subq $32, %rsp - movl %esi, 0x0(%rsp) - movl %edx, 0x4(%rsp) - movl %ecx, 0x8(%rsp) - movq %r8, %rsi - movl %esi, 0xc(%rsp) - movq %r9, %rsi - movl %esi, 0x10(%rsp) - - sgdt save_gdt(%rip) - - leaq 1f(%rip), %rbx - movq %rbx, func_rt_ptr(%rip) - - /* Switch to gdt with 32-bit segments */ - movl 64(%rsp), %eax - lgdt (%rax) - - leaq efi_enter32(%rip), %rax - pushq $__KERNEL_CS - pushq %rax - lretq - -1: addq $32, %rsp - - lgdt save_gdt(%rip) - - pop %rbx - movl %ebx, %ss - pop %rbx - movl %ebx, %es - pop %rbx - movl %ebx, %ds - - /* - * Convert 32-bit status code into 64-bit. - */ - test %rax, %rax - jz 1f - movl %eax, %ecx - andl $0x0fffffff, %ecx - andl $0xf0000000, %eax - shl $32, %rax - or %rcx, %rax -1: - ret -ENDPROC(__efi64_thunk) - -ENTRY(efi_exit32) - movq func_rt_ptr(%rip), %rax - push %rax - mov %rdi, %rax - ret -ENDPROC(efi_exit32) - - .code32 -/* - * EFI service pointer must be in %edi. - * - * The stack should represent the 32-bit calling convention. - */ -ENTRY(efi_enter32) - movl $__KERNEL_DS, %eax - movl %eax, %ds - movl %eax, %es - movl %eax, %ss - - /* Reload pgtables */ - movl %cr3, %eax - movl %eax, %cr3 - - /* Disable paging */ - movl %cr0, %eax - btrl $X86_CR0_PG_BIT, %eax - movl %eax, %cr0 - - /* Disable long mode via EFER */ - movl $MSR_EFER, %ecx - rdmsr - btrl $_EFER_LME, %eax - wrmsr - - call *%edi - - /* We must preserve return value */ - movl %eax, %edi - - /* - * Some firmware will return with interrupts enabled. Be sure to - * disable them before we switch GDTs. - */ - cli - - movl 68(%esp), %eax - movl %eax, 2(%eax) - lgdtl (%eax) - - movl %cr4, %eax - btsl $(X86_CR4_PAE_BIT), %eax - movl %eax, %cr4 - - movl %cr3, %eax - movl %eax, %cr3 - - movl $MSR_EFER, %ecx - rdmsr - btsl $_EFER_LME, %eax - wrmsr - - xorl %eax, %eax - lldt %ax - - movl 72(%esp), %eax - pushl $__KERNEL_CS - pushl %eax - - /* Enable paging */ - movl %cr0, %eax - btsl $X86_CR0_PG_BIT, %eax - movl %eax, %cr0 - lret -ENDPROC(efi_enter32) - - .data - .balign 8 - .global efi32_boot_gdt -efi32_boot_gdt: .word 0 - .quad 0 - -save_gdt: .word 0 - .quad 0 -func_rt_ptr: .quad 0 - - .global efi_gdt64 -efi_gdt64: - .word efi_gdt64_end - efi_gdt64 - .long 0 /* Filled out by user */ - .word 0 - .quad 0x0000000000000000 /* NULL descriptor */ - .quad 0x00af9a000000ffff /* __KERNEL_CS */ - .quad 0x00cf92000000ffff /* __KERNEL_DS */ - .quad 0x0080890000000000 /* TS descriptor */ - .quad 0x0000000000000000 /* TS continued */ -efi_gdt64_end: -#endif /* CONFIG_EFI_MIXED */ - .data ENTRY(efi_scratch) .fill 3,8,0 diff --git a/arch/x86/platform/efi/efi_thunk_64.S b/arch/x86/platform/efi/efi_thunk_64.S index 8806fa73e6e6..ff85d28c50f2 100644 --- a/arch/x86/platform/efi/efi_thunk_64.S +++ b/arch/x86/platform/efi/efi_thunk_64.S @@ -1,9 +1,26 @@ /* * Copyright (C) 2014 Intel Corporation; author Matt Fleming + * + * Support for invoking 32-bit EFI runtime services from a 64-bit + * kernel. + * + * The below thunking functions are only used after ExitBootServices() + * has been called. This simplifies things considerably as compared with + * the early EFI thunking because we can leave all the kernel state + * intact (GDT, IDT, etc) and simply invoke the the 32-bit EFI runtime + * services from __KERNEL32_CS. This means we can continue to service + * interrupts across an EFI mixed mode call. + * + * We do however, need to handle the fact that we're running in a full + * 64-bit virtual address space. Things like the stack and instruction + * addresses need to be accessible by the 32-bit firmware, so we rely on + * using the identity mappings in the EFI page table to access the stack + * and kernel text (see efi_setup_page_tables()). */ #include <linux/linkage.h> #include <asm/page_types.h> +#include <asm/segment.h> .text .code64 @@ -33,14 +50,6 @@ ENTRY(efi64_thunk) leaq efi_exit32(%rip), %rbx subq %rax, %rbx movl %ebx, 8(%rsp) - leaq efi_gdt64(%rip), %rbx - subq %rax, %rbx - movl %ebx, 2(%ebx) - movl %ebx, 4(%rsp) - leaq efi_gdt32(%rip), %rbx - subq %rax, %rbx - movl %ebx, 2(%ebx) - movl %ebx, (%rsp) leaq __efi64_thunk(%rip), %rbx subq %rax, %rbx @@ -52,14 +61,92 @@ ENTRY(efi64_thunk) retq ENDPROC(efi64_thunk) - .data -efi_gdt32: - .word efi_gdt32_end - efi_gdt32 - .long 0 /* Filled out above */ - .word 0 - .quad 0x0000000000000000 /* NULL descriptor */ - .quad 0x00cf9a000000ffff /* __KERNEL_CS */ - .quad 0x00cf93000000ffff /* __KERNEL_DS */ -efi_gdt32_end: +/* + * We run this function from the 1:1 mapping. + * + * This function must be invoked with a 1:1 mapped stack. + */ +ENTRY(__efi64_thunk) + movl %ds, %eax + push %rax + movl %es, %eax + push %rax + movl %ss, %eax + push %rax + + subq $32, %rsp + movl %esi, 0x0(%rsp) + movl %edx, 0x4(%rsp) + movl %ecx, 0x8(%rsp) + movq %r8, %rsi + movl %esi, 0xc(%rsp) + movq %r9, %rsi + movl %esi, 0x10(%rsp) + + leaq 1f(%rip), %rbx + movq %rbx, func_rt_ptr(%rip) + + /* Switch to 32-bit descriptor */ + pushq $__KERNEL32_CS + leaq efi_enter32(%rip), %rax + pushq %rax + lretq + +1: addq $32, %rsp + + pop %rbx + movl %ebx, %ss + pop %rbx + movl %ebx, %es + pop %rbx + movl %ebx, %ds + /* + * Convert 32-bit status code into 64-bit. + */ + test %rax, %rax + jz 1f + movl %eax, %ecx + andl $0x0fffffff, %ecx + andl $0xf0000000, %eax + shl $32, %rax + or %rcx, %rax +1: + ret +ENDPROC(__efi64_thunk) + +ENTRY(efi_exit32) + movq func_rt_ptr(%rip), %rax + push %rax + mov %rdi, %rax + ret +ENDPROC(efi_exit32) + + .code32 +/* + * EFI service pointer must be in %edi. + * + * The stack should represent the 32-bit calling convention. + */ +ENTRY(efi_enter32) + movl $__KERNEL_DS, %eax + movl %eax, %ds + movl %eax, %es + movl %eax, %ss + + call *%edi + + /* We must preserve return value */ + movl %eax, %edi + + movl 72(%esp), %eax + pushl $__KERNEL_CS + pushl %eax + + lret +ENDPROC(efi_enter32) + + .data + .balign 8 +func_rt_ptr: .quad 0 efi_saved_sp: .quad 0 diff --git a/arch/x86/vdso/Makefile b/arch/x86/vdso/Makefile index 5a4affe025e8..09297c8e1fcd 100644 --- a/arch/x86/vdso/Makefile +++ b/arch/x86/vdso/Makefile @@ -205,4 +205,4 @@ $(vdso_img_insttargets): install_%: $(obj)/%.dbg $(MODLIB)/vdso FORCE PHONY += vdso_install $(vdso_img_insttargets) vdso_install: $(vdso_img_insttargets) FORCE -clean-files := vdso32-syscall* vdso32-sysenter* vdso32-int80* +clean-files := vdso32-syscall* vdso32-sysenter* vdso32-int80* vdso64* |