diff options
Diffstat (limited to 'arch/x86')
52 files changed, 1330 insertions, 590 deletions
diff --git a/arch/x86/Kconfig.cpu b/arch/x86/Kconfig.cpu index 2c518fbc52ec..60a85768cfcb 100644 --- a/arch/x86/Kconfig.cpu +++ b/arch/x86/Kconfig.cpu @@ -382,14 +382,17 @@ config X86_OOSTORE # P6_NOPs are a relatively minor optimization that require a family >= # 6 processor, except that it is broken on certain VIA chips. # Furthermore, AMD chips prefer a totally different sequence of NOPs -# (which work on all CPUs). As a result, disallow these if we're -# compiling X86_GENERIC but not X86_64 (these NOPs do work on all -# x86-64 capable chips); the list of processors in the right-hand clause -# are the cores that benefit from this optimization. +# (which work on all CPUs). In addition, it looks like Virtual PC +# does not understand them. +# +# As a result, disallow these if we're not compiling for X86_64 (these +# NOPs do work on all x86-64 capable chips); the list of processors in +# the right-hand clause are the cores that benefit from this optimization. # config X86_P6_NOP def_bool y - depends on (X86_64 || !X86_GENERIC) && (M686 || MPENTIUMII || MPENTIUMIII || MPENTIUMM || MCORE2 || MPENTIUM4 || MPSC) + depends on X86_64 + depends on (MCORE2 || MPENTIUM4 || MPSC) config X86_TSC def_bool y @@ -415,3 +418,21 @@ config X86_MINIMUM_CPU_FAMILY config X86_DEBUGCTLMSR def_bool y depends on !(MK6 || MWINCHIPC6 || MWINCHIP2 || MWINCHIP3D || MCYRIXIII || M586MMX || M586TSC || M586 || M486 || M386) + +config X86_DS + bool "Debug Store support" + default y + help + Add support for Debug Store. + This allows the kernel to provide a memory buffer to the hardware + to store various profiling and tracing events. + +config X86_PTRACE_BTS + bool "ptrace interface to Branch Trace Store" + default y + depends on (X86_DS && X86_DEBUGCTLMSR) + help + Add a ptrace interface to allow collecting an execution trace + of the traced task. + This collects control flow changes in a (cyclic) buffer and allows + debuggers to fill in the gaps and show an execution trace of the debuggee. diff --git a/arch/x86/boot/compressed/misc.c b/arch/x86/boot/compressed/misc.c index 9fea73706479..aaf5a2131efc 100644 --- a/arch/x86/boot/compressed/misc.c +++ b/arch/x86/boot/compressed/misc.c @@ -16,7 +16,7 @@ */ #undef CONFIG_PARAVIRT #ifdef CONFIG_X86_32 -#define _ASM_DESC_H_ 1 +#define ASM_X86__DESC_H 1 #endif #ifdef CONFIG_X86_64 diff --git a/arch/x86/boot/compressed/relocs.c b/arch/x86/boot/compressed/relocs.c index a1310c52fc0c..857e492c571e 100644 --- a/arch/x86/boot/compressed/relocs.c +++ b/arch/x86/boot/compressed/relocs.c @@ -492,7 +492,7 @@ static void walk_relocs(void (*visit)(Elf32_Rel *rel, Elf32_Sym *sym)) continue; } sh_symtab = sec_symtab->symtab; - sym_strtab = sec->link->strtab; + sym_strtab = sec_symtab->link->strtab; for (j = 0; j < sec->shdr.sh_size/sizeof(Elf32_Rel); j++) { Elf32_Rel *rel; Elf32_Sym *sym; diff --git a/arch/x86/boot/cpucheck.c b/arch/x86/boot/cpucheck.c index 4b9ae7c56748..4d3ff037201f 100644 --- a/arch/x86/boot/cpucheck.c +++ b/arch/x86/boot/cpucheck.c @@ -38,12 +38,12 @@ static const u32 req_flags[NCAPINTS] = { REQUIRED_MASK0, REQUIRED_MASK1, - REQUIRED_MASK2, - REQUIRED_MASK3, + 0, /* REQUIRED_MASK2 not implemented in this file */ + 0, /* REQUIRED_MASK3 not implemented in this file */ REQUIRED_MASK4, - REQUIRED_MASK5, + 0, /* REQUIRED_MASK5 not implemented in this file */ REQUIRED_MASK6, - REQUIRED_MASK7, + 0, /* REQUIRED_MASK7 not implemented in this file */ }; #define A32(a, b, c, d) (((d) << 24)+((c) << 16)+((b) << 8)+(a)) diff --git a/arch/x86/kernel/alternative.c b/arch/x86/kernel/alternative.c index 2763cb37b553..65a0c1b48696 100644 --- a/arch/x86/kernel/alternative.c +++ b/arch/x86/kernel/alternative.c @@ -145,35 +145,25 @@ static const unsigned char *const p6_nops[ASM_NOP_MAX+1] = { extern char __vsyscall_0; const unsigned char *const *find_nop_table(void) { - return boot_cpu_data.x86_vendor != X86_VENDOR_INTEL || - boot_cpu_data.x86 < 6 ? k8_nops : p6_nops; + if (boot_cpu_data.x86_vendor == X86_VENDOR_INTEL && + boot_cpu_has(X86_FEATURE_NOPL)) + return p6_nops; + else + return k8_nops; } #else /* CONFIG_X86_64 */ -static const struct nop { - int cpuid; - const unsigned char *const *noptable; -} noptypes[] = { - { X86_FEATURE_K8, k8_nops }, - { X86_FEATURE_K7, k7_nops }, - { X86_FEATURE_P4, p6_nops }, - { X86_FEATURE_P3, p6_nops }, - { -1, NULL } -}; - const unsigned char *const *find_nop_table(void) { - const unsigned char *const *noptable = intel_nops; - int i; - - for (i = 0; noptypes[i].cpuid >= 0; i++) { - if (boot_cpu_has(noptypes[i].cpuid)) { - noptable = noptypes[i].noptable; - break; - } - } - return noptable; + if (boot_cpu_has(X86_FEATURE_K8)) + return k8_nops; + else if (boot_cpu_has(X86_FEATURE_K7)) + return k7_nops; + else if (boot_cpu_has(X86_FEATURE_NOPL)) + return p6_nops; + else + return intel_nops; } #endif /* CONFIG_X86_64 */ diff --git a/arch/x86/kernel/amd_iommu.c b/arch/x86/kernel/amd_iommu.c index 69b4d060b21c..042fdc27bc92 100644 --- a/arch/x86/kernel/amd_iommu.c +++ b/arch/x86/kernel/amd_iommu.c @@ -101,10 +101,10 @@ static int iommu_queue_command(struct amd_iommu *iommu, struct iommu_cmd *cmd) */ static int iommu_completion_wait(struct amd_iommu *iommu) { - int ret, ready = 0; + int ret = 0, ready = 0; unsigned status = 0; struct iommu_cmd cmd; - unsigned long i = 0; + unsigned long flags, i = 0; memset(&cmd, 0, sizeof(cmd)); cmd.data[0] = CMD_COMPL_WAIT_INT_MASK; @@ -112,10 +112,12 @@ static int iommu_completion_wait(struct amd_iommu *iommu) iommu->need_sync = 0; - ret = iommu_queue_command(iommu, &cmd); + spin_lock_irqsave(&iommu->lock, flags); + + ret = __iommu_queue_command(iommu, &cmd); if (ret) - return ret; + goto out; while (!ready && (i < EXIT_LOOP_COUNT)) { ++i; @@ -130,6 +132,8 @@ static int iommu_completion_wait(struct amd_iommu *iommu) if (unlikely((i == EXIT_LOOP_COUNT) && printk_ratelimit())) printk(KERN_WARNING "AMD IOMMU: Completion wait loop failed\n"); +out: + spin_unlock_irqrestore(&iommu->lock, flags); return 0; } @@ -140,6 +144,7 @@ static int iommu_completion_wait(struct amd_iommu *iommu) static int iommu_queue_inv_dev_entry(struct amd_iommu *iommu, u16 devid) { struct iommu_cmd cmd; + int ret; BUG_ON(iommu == NULL); @@ -147,9 +152,11 @@ static int iommu_queue_inv_dev_entry(struct amd_iommu *iommu, u16 devid) CMD_SET_TYPE(&cmd, CMD_INV_DEV_ENTRY); cmd.data[0] = devid; + ret = iommu_queue_command(iommu, &cmd); + iommu->need_sync = 1; - return iommu_queue_command(iommu, &cmd); + return ret; } /* @@ -159,6 +166,7 @@ static int iommu_queue_inv_iommu_pages(struct amd_iommu *iommu, u64 address, u16 domid, int pde, int s) { struct iommu_cmd cmd; + int ret; memset(&cmd, 0, sizeof(cmd)); address &= PAGE_MASK; @@ -171,9 +179,11 @@ static int iommu_queue_inv_iommu_pages(struct amd_iommu *iommu, if (pde) /* PDE bit - we wan't flush everything not only the PTEs */ cmd.data[2] |= CMD_INV_IOMMU_PAGES_PDE_MASK; + ret = iommu_queue_command(iommu, &cmd); + iommu->need_sync = 1; - return iommu_queue_command(iommu, &cmd); + return ret; } /* diff --git a/arch/x86/kernel/apm_32.c b/arch/x86/kernel/apm_32.c index 9ee24e6bc4b0..732d1f4e10ee 100644 --- a/arch/x86/kernel/apm_32.c +++ b/arch/x86/kernel/apm_32.c @@ -234,6 +234,7 @@ #include <asm/uaccess.h> #include <asm/desc.h> #include <asm/i8253.h> +#include <asm/olpc.h> #include <asm/paravirt.h> #include <asm/reboot.h> @@ -2217,7 +2218,7 @@ static int __init apm_init(void) dmi_check_system(apm_dmi_table); - if (apm_info.bios.version == 0 || paravirt_enabled()) { + if (apm_info.bios.version == 0 || paravirt_enabled() || machine_is_olpc()) { printk(KERN_INFO "apm: BIOS not found.\n"); return -ENODEV; } diff --git a/arch/x86/kernel/asm-offsets_64.c b/arch/x86/kernel/asm-offsets_64.c index aa89387006fe..505543a75a56 100644 --- a/arch/x86/kernel/asm-offsets_64.c +++ b/arch/x86/kernel/asm-offsets_64.c @@ -22,7 +22,7 @@ #define __NO_STUBS 1 #undef __SYSCALL -#undef _ASM_X86_64_UNISTD_H_ +#undef ASM_X86__UNISTD_64_H #define __SYSCALL(nr, sym) [nr] = 1, static char syscalls[] = { #include <asm/unistd.h> diff --git a/arch/x86/kernel/cpu/amd.c b/arch/x86/kernel/cpu/amd.c index cae9cabc3031..18514ed26104 100644 --- a/arch/x86/kernel/cpu/amd.c +++ b/arch/x86/kernel/cpu/amd.c @@ -31,6 +31,11 @@ static void __cpuinit early_init_amd(struct cpuinfo_x86 *c) if (c->x86_power & (1<<8)) set_cpu_cap(c, X86_FEATURE_CONSTANT_TSC); } + + /* Set MTRR capability flag if appropriate */ + if (c->x86_model == 13 || c->x86_model == 9 || + (c->x86_model == 8 && c->x86_mask >= 8)) + set_cpu_cap(c, X86_FEATURE_K6_MTRR); } static void __cpuinit init_amd(struct cpuinfo_x86 *c) @@ -166,10 +171,6 @@ static void __cpuinit init_amd(struct cpuinfo_x86 *c) mbytes); } - /* Set MTRR capability flag if appropriate */ - if (c->x86_model == 13 || c->x86_model == 9 || - (c->x86_model == 8 && c->x86_mask >= 8)) - set_cpu_cap(c, X86_FEATURE_K6_MTRR); break; } diff --git a/arch/x86/kernel/cpu/centaur.c b/arch/x86/kernel/cpu/centaur.c index e0f45edd6a55..a0534c04d38a 100644 --- a/arch/x86/kernel/cpu/centaur.c +++ b/arch/x86/kernel/cpu/centaur.c @@ -314,6 +314,16 @@ enum { EAMD3D = 1<<20, }; +static void __cpuinit early_init_centaur(struct cpuinfo_x86 *c) +{ + switch (c->x86) { + case 5: + /* Emulate MTRRs using Centaur's MCR. */ + set_cpu_cap(c, X86_FEATURE_CENTAUR_MCR); + break; + } +} + static void __cpuinit init_centaur(struct cpuinfo_x86 *c) { @@ -462,6 +472,7 @@ centaur_size_cache(struct cpuinfo_x86 *c, unsigned int size) static struct cpu_dev centaur_cpu_dev __cpuinitdata = { .c_vendor = "Centaur", .c_ident = { "CentaurHauls" }, + .c_early_init = early_init_centaur, .c_init = init_centaur, .c_size_cache = centaur_size_cache, }; diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c index 80ab20d4fa39..4e456bd955bb 100644 --- a/arch/x86/kernel/cpu/common.c +++ b/arch/x86/kernel/cpu/common.c @@ -13,6 +13,7 @@ #include <asm/mtrr.h> #include <asm/mce.h> #include <asm/pat.h> +#include <asm/asm.h> #ifdef CONFIG_X86_LOCAL_APIC #include <asm/mpspec.h> #include <asm/apic.h> @@ -334,11 +335,24 @@ static void __init early_cpu_detect(void) get_cpu_vendor(c, 1); + early_get_cap(c); + if (c->x86_vendor != X86_VENDOR_UNKNOWN && cpu_devs[c->x86_vendor]->c_early_init) cpu_devs[c->x86_vendor]->c_early_init(c); +} - early_get_cap(c); +/* + * The NOPL instruction is supposed to exist on all CPUs with + * family >= 6; unfortunately, that's not true in practice because + * of early VIA chips and (more importantly) broken virtualizers that + * are not easy to detect. In the latter case it doesn't even *fail* + * reliably, so probing for it doesn't even work. Disable it completely + * unless we can find a reliable way to detect all the broken cases. + */ +static void __cpuinit detect_nopl(struct cpuinfo_x86 *c) +{ + clear_cpu_cap(c, X86_FEATURE_NOPL); } static void __cpuinit generic_identify(struct cpuinfo_x86 *c) @@ -395,8 +409,8 @@ static void __cpuinit generic_identify(struct cpuinfo_x86 *c) } init_scattered_cpuid_features(c); + detect_nopl(c); } - } static void __cpuinit squash_the_stupid_serial_number(struct cpuinfo_x86 *c) diff --git a/arch/x86/kernel/cpu/common_64.c b/arch/x86/kernel/cpu/common_64.c index dd6e3f15017e..a11f5d4477cd 100644 --- a/arch/x86/kernel/cpu/common_64.c +++ b/arch/x86/kernel/cpu/common_64.c @@ -18,6 +18,7 @@ #include <asm/mtrr.h> #include <asm/mce.h> #include <asm/pat.h> +#include <asm/asm.h> #include <asm/numa.h> #ifdef CONFIG_X86_LOCAL_APIC #include <asm/mpspec.h> @@ -215,6 +216,39 @@ static void __init early_cpu_support_print(void) } } +/* + * The NOPL instruction is supposed to exist on all CPUs with + * family >= 6, unfortunately, that's not true in practice because + * of early VIA chips and (more importantly) broken virtualizers that + * are not easy to detect. Hence, probe for it based on first + * principles. + * + * Note: no 64-bit chip is known to lack these, but put the code here + * for consistency with 32 bits, and to make it utterly trivial to + * diagnose the problem should it ever surface. + */ +static void __cpuinit detect_nopl(struct cpuinfo_x86 *c) +{ + const u32 nopl_signature = 0x888c53b1; /* Random number */ + u32 has_nopl = nopl_signature; + + clear_cpu_cap(c, X86_FEATURE_NOPL); + if (c->x86 >= 6) { + asm volatile("\n" + "1: .byte 0x0f,0x1f,0xc0\n" /* nopl %eax */ + "2:\n" + " .section .fixup,\"ax\"\n" + "3: xor %0,%0\n" + " jmp 2b\n" + " .previous\n" + _ASM_EXTABLE(1b,3b) + : "+a" (has_nopl)); + + if (has_nopl == nopl_signature) + set_cpu_cap(c, X86_FEATURE_NOPL); + } +} + static void __cpuinit early_identify_cpu(struct cpuinfo_x86 *c); void __init early_cpu_init(void) @@ -313,6 +347,8 @@ static void __cpuinit early_identify_cpu(struct cpuinfo_x86 *c) c->x86_phys_bits = eax & 0xff; } + detect_nopl(c); + if (c->x86_vendor != X86_VENDOR_UNKNOWN && cpu_devs[c->x86_vendor]->c_early_init) cpu_devs[c->x86_vendor]->c_early_init(c); @@ -493,17 +529,20 @@ void pda_init(int cpu) /* others are initialized in smpboot.c */ pda->pcurrent = &init_task; pda->irqstackptr = boot_cpu_stack; + pda->irqstackptr += IRQSTACKSIZE - 64; } else { - pda->irqstackptr = (char *) - __get_free_pages(GFP_ATOMIC, IRQSTACK_ORDER); - if (!pda->irqstackptr) - panic("cannot allocate irqstack for cpu %d", cpu); + if (!pda->irqstackptr) { + pda->irqstackptr = (char *) + __get_free_pages(GFP_ATOMIC, IRQSTACK_ORDER); + if (!pda->irqstackptr) + panic("cannot allocate irqstack for cpu %d", + cpu); + pda->irqstackptr += IRQSTACKSIZE - 64; + } if (pda->nodenumber == 0 && cpu_to_node(cpu) != NUMA_NO_NODE) pda->nodenumber = cpu_to_node(cpu); } - - pda->irqstackptr += IRQSTACKSIZE-64; } char boot_exception_stacks[(N_EXCEPTION_STACKS - 1) * EXCEPTION_STKSZ + @@ -601,19 +640,22 @@ void __cpuinit cpu_init(void) /* * set up and load the per-CPU TSS */ - for (v = 0; v < N_EXCEPTION_STACKS; v++) { + if (!orig_ist->ist[0]) { static const unsigned int order[N_EXCEPTION_STACKS] = { - [0 ... N_EXCEPTION_STACKS - 1] = EXCEPTION_STACK_ORDER, - [DEBUG_STACK - 1] = DEBUG_STACK_ORDER + [0 ... N_EXCEPTION_STACKS - 1] = EXCEPTION_STACK_ORDER, + [DEBUG_STACK - 1] = DEBUG_STACK_ORDER }; - if (cpu) { - estacks = (char *)__get_free_pages(GFP_ATOMIC, order[v]); - if (!estacks) - panic("Cannot allocate exception stack %ld %d\n", - v, cpu); + for (v = 0; v < N_EXCEPTION_STACKS; v++) { + if (cpu) { + estacks = (char *)__get_free_pages(GFP_ATOMIC, order[v]); + if (!estacks) + panic("Cannot allocate exception " + "stack %ld %d\n", v, cpu); + } + estacks += PAGE_SIZE << order[v]; + orig_ist->ist[v] = t->x86_tss.ist[v] = + (unsigned long)estacks; } - estacks += PAGE_SIZE << order[v]; - orig_ist->ist[v] = t->x86_tss.ist[v] = (unsigned long)estacks; } t->x86_tss.io_bitmap_base = offsetof(struct tss_struct, io_bitmap); diff --git a/arch/x86/kernel/cpu/cyrix.c b/arch/x86/kernel/cpu/cyrix.c index e710a21bb6e8..898a5a2002ed 100644 --- a/arch/x86/kernel/cpu/cyrix.c +++ b/arch/x86/kernel/cpu/cyrix.c @@ -15,13 +15,11 @@ /* * Read NSC/Cyrix DEVID registers (DIR) to get more detailed info. about the CPU */ -static void __cpuinit do_cyrix_devid(unsigned char *dir0, unsigned char *dir1) +static void __cpuinit __do_cyrix_devid(unsigned char *dir0, unsigned char *dir1) { unsigned char ccr2, ccr3; - unsigned long flags; /* we test for DEVID by checking whether CCR3 is writable */ - local_irq_save(flags); ccr3 = getCx86(CX86_CCR3); setCx86(CX86_CCR3, ccr3 ^ 0x80); getCx86(0xc0); /* dummy to change bus */ @@ -44,9 +42,16 @@ static void __cpuinit do_cyrix_devid(unsigned char *dir0, unsigned char *dir1) *dir0 = getCx86(CX86_DIR0); *dir1 = getCx86(CX86_DIR1); } - local_irq_restore(flags); } +static void __cpuinit do_cyrix_devid(unsigned char *dir0, unsigned char *dir1) +{ + unsigned long flags; + + local_irq_save(flags); + __do_cyrix_devid(dir0, dir1); + local_irq_restore(flags); +} /* * Cx86_dir0_msb is a HACK needed by check_cx686_cpuid/slop in bugs.h in * order to identify the Cyrix CPU model after we're out of setup.c @@ -161,6 +166,24 @@ static void __cpuinit geode_configure(void) local_irq_restore(flags); } +static void __cpuinit early_init_cyrix(struct cpuinfo_x86 *c) +{ + unsigned char dir0, dir0_msn, dir1 = 0; + + __do_cyrix_devid(&dir0, &dir1); + dir0_msn = dir0 >> 4; /* identifies CPU "family" */ + + switch (dir0_msn) { + case 3: /* 6x86/6x86L */ + /* Emulate MTRRs using Cyrix's ARRs. */ + set_cpu_cap(c, X86_FEATURE_CYRIX_ARR); + break; + case 5: /* 6x86MX/M II */ + /* Emulate MTRRs using Cyrix's ARRs. */ + set_cpu_cap(c, X86_FEATURE_CYRIX_ARR); + break; + } +} static void __cpuinit init_cyrix(struct cpuinfo_x86 *c) { @@ -416,6 +439,7 @@ static void __cpuinit cyrix_identify(struct cpuinfo_x86 *c) static struct cpu_dev cyrix_cpu_dev __cpuinitdata = { .c_vendor = "Cyrix", .c_ident = { "CyrixInstead" }, + .c_early_init = early_init_cyrix, .c_init = init_cyrix, .c_identify = cyrix_identify, }; diff --git a/arch/x86/kernel/cpu/feature_names.c b/arch/x86/kernel/cpu/feature_names.c index e43ad4ad4cba..c9017799497c 100644 --- a/arch/x86/kernel/cpu/feature_names.c +++ b/arch/x86/kernel/cpu/feature_names.c @@ -39,7 +39,8 @@ const char * const x86_cap_flags[NCAPINTS*32] = { NULL, NULL, NULL, NULL, "constant_tsc", "up", NULL, "arch_perfmon", "pebs", "bts", NULL, NULL, - "rep_good", NULL, NULL, NULL, NULL, NULL, NULL, NULL, + "rep_good", NULL, NULL, NULL, + "nopl", NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, /* Intel-defined (#2) */ diff --git a/arch/x86/kernel/cpu/intel.c b/arch/x86/kernel/cpu/intel.c index b75f2569b8f8..f113ef4595f6 100644 --- a/arch/x86/kernel/cpu/intel.c +++ b/arch/x86/kernel/cpu/intel.c @@ -222,10 +222,11 @@ static void __cpuinit init_intel(struct cpuinfo_x86 *c) set_cpu_cap(c, X86_FEATURE_BTS); if (!(l1 & (1<<12))) set_cpu_cap(c, X86_FEATURE_PEBS); + ds_init_intel(c); } if (cpu_has_bts) - ds_init_intel(c); + ptrace_bts_init_intel(c); /* * See if we have a good local APIC by checking for buggy Pentia, diff --git a/arch/x86/kernel/cpu/mtrr/main.c b/arch/x86/kernel/cpu/mtrr/main.c index b117d7f8a564..5df16d818371 100644 --- a/arch/x86/kernel/cpu/mtrr/main.c +++ b/arch/x86/kernel/cpu/mtrr/main.c @@ -729,7 +729,7 @@ struct var_mtrr_range_state { mtrr_type type; }; -struct var_mtrr_range_state __initdata range_state[RANGE_NUM]; +static struct var_mtrr_range_state __initdata range_state[RANGE_NUM]; static int __initdata debug_print; static int __init @@ -834,7 +834,7 @@ static int __init enable_mtrr_cleanup_setup(char *str) enable_mtrr_cleanup = 1; return 0; } -early_param("enble_mtrr_cleanup", enable_mtrr_cleanup_setup); +early_param("enable_mtrr_cleanup", enable_mtrr_cleanup_setup); struct var_mtrr_state { unsigned long range_startk; diff --git a/arch/x86/kernel/ds.c b/arch/x86/kernel/ds.c index 11c11b8ec48d..2b69994fd3a8 100644 --- a/arch/x86/kernel/ds.c +++ b/arch/x86/kernel/ds.c @@ -2,26 +2,49 @@ * Debug Store support * * This provides a low-level interface to the hardware's Debug Store - * feature that is used for last branch recording (LBR) and + * feature that is used for branch trace store (BTS) and * precise-event based sampling (PEBS). * - * Different architectures use a different DS layout/pointer size. - * The below functions therefore work on a void*. + * It manages: + * - per-thread and per-cpu allocation of BTS and PEBS + * - buffer memory allocation (optional) + * - buffer overflow handling + * - buffer access * + * It assumes: + * - get_task_struct on all parameter tasks + * - current is allowed to trace parameter tasks * - * Since there is no user for PEBS, yet, only LBR (or branch - * trace store, BTS) is supported. * - * - * Copyright (C) 2007 Intel Corporation. - * Markus Metzger <markus.t.metzger@intel.com>, Dec 2007 + * Copyright (C) 2007-2008 Intel Corporation. + * Markus Metzger <markus.t.metzger@intel.com>, 2007-2008 */ + +#ifdef CONFIG_X86_DS + #include <asm/ds.h> #include <linux/errno.h> #include <linux/string.h> #include <linux/slab.h> +#include <linux/sched.h> +#include <linux/mm.h> + + +/* + * The configuration for a particular DS hardware implementation. + */ +struct ds_configuration { + /* the size of the DS structure in bytes */ + unsigned char sizeof_ds; + /* the size of one pointer-typed field in the DS structure in bytes; + this covers the first 8 fields related to buffer management. */ + unsigned char sizeof_field; + /* the size of a BTS/PEBS record in bytes */ + unsigned char sizeof_rec[2]; +}; +static struct ds_configuration ds_cfg; /* @@ -44,378 +67,747 @@ * (interrupt occurs when write pointer passes interrupt pointer) * - value to which counter is reset following counter overflow * - * On later architectures, the last branch recording hardware uses - * 64bit pointers even in 32bit mode. - * - * - * Branch Trace Store (BTS) records store information about control - * flow changes. They at least provide the following information: - * - source linear address - * - destination linear address + * Later architectures use 64bit pointers throughout, whereas earlier + * architectures use 32bit pointers in 32bit mode. * - * Netburst supported a predicated bit that had been dropped in later - * architectures. We do not suppor it. * + * We compute the base address for the first 8 fields based on: + * - the field size stored in the DS configuration + * - the relative field position + * - an offset giving the start of the respective region * - * In order to abstract from the actual DS and BTS layout, we describe - * the access to the relevant fields. - * Thanks to Andi Kleen for proposing this design. + * This offset is further used to index various arrays holding + * information for BTS and PEBS at the respective index. * - * The implementation, however, is not as general as it might seem. In - * order to stay somewhat simple and efficient, we assume an - * underlying unsigned type (mostly a pointer type) and we expect the - * field to be at least as big as that type. + * On later 32bit processors, we only access the lower 32bit of the + * 64bit pointer fields. The upper halves will be zeroed out. */ -/* - * A special from_ip address to indicate that the BTS record is an - * info record that needs to be interpreted or skipped. - */ -#define BTS_ESCAPE_ADDRESS (-1) +enum ds_field { + ds_buffer_base = 0, + ds_index, + ds_absolute_maximum, + ds_interrupt_threshold, +}; -/* - * A field access descriptor - */ -struct access_desc { - unsigned char offset; - unsigned char size; +enum ds_qualifier { + ds_bts = 0, + ds_pebs }; +static inline unsigned long ds_get(const unsigned char *base, + enum ds_qualifier qual, enum ds_field field) +{ + base += (ds_cfg.sizeof_field * (field + (4 * qual))); + return *(unsigned long *)base; +} + +static inline void ds_set(unsigned char *base, enum ds_qualifier qual, + enum ds_field field, unsigned long value) +{ + base += (ds_cfg.sizeof_field * (field + (4 * qual))); + (*(unsigned long *)base) = value; +} + + /* - * The configuration for a particular DS/BTS hardware implementation. + * Locking is done only for allocating BTS or PEBS resources and for + * guarding context and buffer memory allocation. + * + * Most functions require the current task to own the ds context part + * they are going to access. All the locking is done when validating + * access to the context. */ -struct ds_configuration { - /* the DS configuration */ - unsigned char sizeof_ds; - struct access_desc bts_buffer_base; - struct access_desc bts_index; - struct access_desc bts_absolute_maximum; - struct access_desc bts_interrupt_threshold; - /* the BTS configuration */ - unsigned char sizeof_bts; - struct access_desc from_ip; - struct access_desc to_ip; - /* BTS variants used to store additional information like - timestamps */ - struct access_desc info_type; - struct access_desc info_data; - unsigned long debugctl_mask; -}; +static spinlock_t ds_lock = __SPIN_LOCK_UNLOCKED(ds_lock); /* - * The global configuration used by the below accessor functions + * Validate that the current task is allowed to access the BTS/PEBS + * buffer of the parameter task. + * + * Returns 0, if access is granted; -Eerrno, otherwise. */ -static struct ds_configuration ds_cfg; +static inline int ds_validate_access(struct ds_context *context, + enum ds_qualifier qual) +{ + if (!context) + return -EPERM; + + if (context->owner[qual] == current) + return 0; + + return -EPERM; +} + /* - * Accessor functions for some DS and BTS fields using the above - * global ptrace_bts_cfg. + * We either support (system-wide) per-cpu or per-thread allocation. + * We distinguish the two based on the task_struct pointer, where a + * NULL pointer indicates per-cpu allocation for the current cpu. + * + * Allocations are use-counted. As soon as resources are allocated, + * further allocations must be of the same type (per-cpu or + * per-thread). We model this by counting allocations (i.e. the number + * of tracers of a certain type) for one type negatively: + * =0 no tracers + * >0 number of per-thread tracers + * <0 number of per-cpu tracers + * + * The below functions to get and put tracers and to check the + * allocation type require the ds_lock to be held by the caller. + * + * Tracers essentially gives the number of ds contexts for a certain + * type of allocation. */ -static inline unsigned long get_bts_buffer_base(char *base) +static long tracers; + +static inline void get_tracer(struct task_struct *task) { - return *(unsigned long *)(base + ds_cfg.bts_buffer_base.offset); + tracers += (task ? 1 : -1); } -static inline void set_bts_buffer_base(char *base, unsigned long value) + +static inline void put_tracer(struct task_struct *task) { - (*(unsigned long *)(base + ds_cfg.bts_buffer_base.offset)) = value; + tracers -= (task ? 1 : -1); } -static inline unsigned long get_bts_index(char *base) + +static inline int check_tracer(struct task_struct *task) { - return *(unsigned long *)(base + ds_cfg.bts_index.offset); + return (task ? (tracers >= 0) : (tracers <= 0)); } -static inline void set_bts_index(char *base, unsigned long value) + + +/* + * The DS context is either attached to a thread or to a cpu: + * - in the former case, the thread_struct contains a pointer to the + * attached context. + * - in the latter case, we use a static array of per-cpu context + * pointers. + * + * Contexts are use-counted. They are allocated on first access and + * deallocated when the last user puts the context. + * + * We distinguish between an allocating and a non-allocating get of a + * context: + * - the allocating get is used for requesting BTS/PEBS resources. It + * requires the caller to hold the global ds_lock. + * - the non-allocating get is used for all other cases. A + * non-existing context indicates an error. It acquires and releases + * the ds_lock itself for obtaining the context. + * + * A context and its DS configuration are allocated and deallocated + * together. A context always has a DS configuration of the + * appropriate size. + */ +static DEFINE_PER_CPU(struct ds_context *, system_context); + +#define this_system_context per_cpu(system_context, smp_processor_id()) + +/* + * Returns the pointer to the parameter task's context or to the + * system-wide context, if task is NULL. + * + * Increases the use count of the returned context, if not NULL. + */ +static inline struct ds_context *ds_get_context(struct task_struct *task) { - (*(unsigned long *)(base + ds_cfg.bts_index.offset)) = value; + struct ds_context *context; + + spin_lock(&ds_lock); + + context = (task ? task->thread.ds_ctx : this_system_context); + if (context) + context->count++; + + spin_unlock(&ds_lock); + + return context; } -static inline unsigned long get_bts_absolute_maximum(char *base) + +/* + * Same as ds_get_context, but allocates the context and it's DS + * structure, if necessary; returns NULL; if out of memory. + * + * pre: requires ds_lock to be held + */ +static inline struct ds_context *ds_alloc_context(struct task_struct *task) { - return *(unsigned long *)(base + ds_cfg.bts_absolute_maximum.offset); + struct ds_context **p_context = + (task ? &task->thread.ds_ctx : &this_system_context); + struct ds_context *context = *p_context; + + if (!context) { + context = kzalloc(sizeof(*context), GFP_KERNEL); + + if (!context) + return NULL; + + context->ds = kzalloc(ds_cfg.sizeof_ds, GFP_KERNEL); + if (!context->ds) { + kfree(context); + return NULL; + } + + *p_context = context; + + context->this = p_context; + context->task = task; + + if (task) + set_tsk_thread_flag(task, TIF_DS_AREA_MSR); + + if (!task || (task == current)) + wrmsr(MSR_IA32_DS_AREA, (unsigned long)context->ds, 0); + + get_tracer(task); + } + + context->count++; + + return context; } -static inline void set_bts_absolute_maximum(char *base, unsigned long value) + +/* + * Decreases the use count of the parameter context, if not NULL. + * Deallocates the context, if the use count reaches zero. + */ +static inline void ds_put_context(struct ds_context *context) { - (*(unsigned long *)(base + ds_cfg.bts_absolute_maximum.offset)) = value; + if (!context) + return; + + spin_lock(&ds_lock); + + if (--context->count) + goto out; + + *(context->this) = NULL; + + if (context->task) + clear_tsk_thread_flag(context->task, TIF_DS_AREA_MSR); + + if (!context->task || (context->task == current)) + wrmsrl(MSR_IA32_DS_AREA, 0); + + put_tracer(context->task); + + /* free any leftover buffers from tracers that did not + * deallocate them properly. */ + kfree(context->buffer[ds_bts]); + kfree(context->buffer[ds_pebs]); + kfree(context->ds); + kfree(context); + out: + spin_unlock(&ds_lock); } -static inline unsigned long get_bts_interrupt_threshold(char *base) + + +/* + * Handle a buffer overflow + * + * task: the task whose buffers are overflowing; + * NULL for a buffer overflow on the current cpu + * context: the ds context + * qual: the buffer type + */ +static void ds_overflow(struct task_struct *task, struct ds_context *context, + enum ds_qualifier qual) { - return *(unsigned long *)(base + ds_cfg.bts_interrupt_threshold.offset); + if (!context) + return; + + if (context->callback[qual]) + (*context->callback[qual])(task); + + /* todo: do some more overflow handling */ } -static inline void set_bts_interrupt_threshold(char *base, unsigned long value) + + +/* + * Allocate a non-pageable buffer of the parameter size. + * Checks the memory and the locked memory rlimit. + * + * Returns the buffer, if successful; + * NULL, if out of memory or rlimit exceeded. + * + * size: the requested buffer size in bytes + * pages (out): if not NULL, contains the number of pages reserved + */ +static inline void *ds_allocate_buffer(size_t size, unsigned int *pages) { - (*(unsigned long *)(base + ds_cfg.bts_interrupt_threshold.offset)) = value; + unsigned long rlim, vm, pgsz; + void *buffer; + + pgsz = PAGE_ALIGN(size) >> PAGE_SHIFT; + + rlim = current->signal->rlim[RLIMIT_AS].rlim_cur >> PAGE_SHIFT; + vm = current->mm->total_vm + pgsz; + if (rlim < vm) + return NULL; + + rlim = current->signal->rlim[RLIMIT_MEMLOCK].rlim_cur >> PAGE_SHIFT; + vm = current->mm->locked_vm + pgsz; + if (rlim < vm) + return NULL; + + buffer = kzalloc(size, GFP_KERNEL); + if (!buffer) + return NULL; + + current->mm->total_vm += pgsz; + current->mm->locked_vm += pgsz; + + if (pages) + *pages = pgsz; + + return buffer; } -static inline unsigned long get_from_ip(char *base) + +static int ds_request(struct task_struct *task, void *base, size_t size, + ds_ovfl_callback_t ovfl, enum ds_qualifier qual) { - return *(unsigned long *)(base + ds_cfg.from_ip.offset); + struct ds_context *context; + unsigned long buffer, adj; + const unsigned long alignment = (1 << 3); + int error = 0; + + if (!ds_cfg.sizeof_ds) + return -EOPNOTSUPP; + + /* we require some space to do alignment adjustments below */ + if (size < (alignment + ds_cfg.sizeof_rec[qual])) + return -EINVAL; + + /* buffer overflow notification is not yet implemented */ + if (ovfl) + return -EOPNOTSUPP; + + + spin_lock(&ds_lock); + + if (!check_tracer(task)) + return -EPERM; + + error = -ENOMEM; + context = ds_alloc_context(task); + if (!context) + goto out_unlock; + + error = -EALREADY; + if (context->owner[qual] == current) + goto out_unlock; + error = -EPERM; + if (context->owner[qual] != NULL) + goto out_unlock; + context->owner[qual] = current; + + spin_unlock(&ds_lock); + + + error = -ENOMEM; + if (!base) { + base = ds_allocate_buffer(size, &context->pages[qual]); + if (!base) + goto out_release; + + context->buffer[qual] = base; + } + error = 0; + + context->callback[qual] = ovfl; + + /* adjust the buffer address and size to meet alignment + * constraints: + * - buffer is double-word aligned + * - size is multiple of record size + * + * We checked the size at the very beginning; we have enough + * space to do the adjustment. + */ + buffer = (unsigned long)base; + + adj = ALIGN(buffer, alignment) - buffer; + buffer += adj; + size -= adj; + + size /= ds_cfg.sizeof_rec[qual]; + size *= ds_cfg.sizeof_rec[qual]; + + ds_set(context->ds, qual, ds_buffer_base, buffer); + ds_set(context->ds, qual, ds_index, buffer); + ds_set(context->ds, qual, ds_absolute_maximum, buffer + size); + + if (ovfl) { + /* todo: select a suitable interrupt threshold */ + } else + ds_set(context->ds, qual, + ds_interrupt_threshold, buffer + size + 1); + + /* we keep the context until ds_release */ + return error; + + out_release: + context->owner[qual] = NULL; + ds_put_context(context); + return error; + + out_unlock: + spin_unlock(&ds_lock); + ds_put_context(context); + return error; } -static inline void set_from_ip(char *base, unsigned long value) + +int ds_request_bts(struct task_struct *task, void *base, size_t size, + ds_ovfl_callback_t ovfl) { - (*(unsigned long *)(base + ds_cfg.from_ip.offset)) = value; + return ds_request(task, base, size, ovfl, ds_bts); } -static inline unsigned long get_to_ip(char *base) + +int ds_request_pebs(struct task_struct *task, void *base, size_t size, + ds_ovfl_callback_t ovfl) { - return *(unsigned long *)(base + ds_cfg.to_ip.offset); + return ds_request(task, base, size, ovfl, ds_pebs); } -static inline void set_to_ip(char *base, unsigned long value) + +static int ds_release(struct task_struct *task, enum ds_qualifier qual) { - (*(unsigned long *)(base + ds_cfg.to_ip.offset)) = value; + struct ds_context *context; + int error; + + context = ds_get_context(task); + error = ds_validate_access(context, qual); + if (error < 0) + goto out; + + kfree(context->buffer[qual]); + context->buffer[qual] = NULL; + + current->mm->total_vm -= context->pages[qual]; + current->mm->locked_vm -= context->pages[qual]; + context->pages[qual] = 0; + context->owner[qual] = NULL; + + /* + * we put the context twice: + * once for the ds_get_context + * once for the corresponding ds_request + */ + ds_put_context(context); + out: + ds_put_context(context); + return error; } -static inline unsigned char get_info_type(char *base) + +int ds_release_bts(struct task_struct *task) { - return *(unsigned char *)(base + ds_cfg.info_type.offset); + return ds_release(task, ds_bts); } -static inline void set_info_type(char *base, unsigned char value) + +int ds_release_pebs(struct task_struct *task) { - (*(unsigned char *)(base + ds_cfg.info_type.offset)) = value; + return ds_release(task, ds_pebs); } -static inline unsigned long get_info_data(char *base) + +static int ds_get_index(struct task_struct *task, size_t *pos, + enum ds_qualifier qual) { - return *(unsigned long *)(base + ds_cfg.info_data.offset); + struct ds_context *context; + unsigned long base, index; + int error; + + context = ds_get_context(task); + error = ds_validate_access(context, qual); + if (error < 0) + goto out; + + base = ds_get(context->ds, qual, ds_buffer_base); + index = ds_get(context->ds, qual, ds_index); + + error = ((index - base) / ds_cfg.sizeof_rec[qual]); + if (pos) + *pos = error; + out: + ds_put_context(context); + return error; } -static inline void set_info_data(char *base, unsigned long value) + +int ds_get_bts_index(struct task_struct *task, size_t *pos) { - (*(unsigned long *)(base + ds_cfg.info_data.offset)) = value; + return ds_get_index(task, pos, ds_bts); } +int ds_get_pebs_index(struct task_struct *task, size_t *pos) +{ + return ds_get_index(task, pos, ds_pebs); +} -int ds_allocate(void **dsp, size_t bts_size_in_bytes) +static int ds_get_end(struct task_struct *task, size_t *pos, + enum ds_qualifier qual) { - size_t bts_size_in_records; - unsigned long bts; - void *ds; + struct ds_context *context; + unsigned long base, end; + int error; + + context = ds_get_context(task); + error = ds_validate_access(context, qual); + if (error < 0) + goto out; + + base = ds_get(context->ds, qual, ds_buffer_base); + end = ds_get(context->ds, qual, ds_absolute_maximum); + + error = ((end - base) / ds_cfg.sizeof_rec[qual]); + if (pos) + *pos = error; + out: + ds_put_context(context); + return error; +} - if (!ds_cfg.sizeof_ds || !ds_cfg.sizeof_bts) - return -EOPNOTSUPP; +int ds_get_bts_end(struct task_struct *task, size_t *pos) +{ + return ds_get_end(task, pos, ds_bts); +} - if (bts_size_in_bytes < 0) - return -EINVAL; +int ds_get_pebs_end(struct task_struct *task, size_t *pos) +{ + return ds_get_end(task, pos, ds_pebs); +} - bts_size_in_records = - bts_size_in_bytes / ds_cfg.sizeof_bts; - bts_size_in_bytes = - bts_size_in_records * ds_cfg.sizeof_bts; +static int ds_access(struct task_struct *task, size_t index, + const void **record, enum ds_qualifier qual) +{ + struct ds_context *context; + unsigned long base, idx; + int error; - if (bts_size_in_bytes <= 0) + if (!record) return -EINVAL; - bts = (unsigned long)kzalloc(bts_size_in_bytes, GFP_KERNEL); - - if (!bts) - return -ENOMEM; + context = ds_get_context(task); + error = ds_validate_access(context, qual); + if (error < 0) + goto out; - ds = kzalloc(ds_cfg.sizeof_ds, GFP_KERNEL); + base = ds_get(context->ds, qual, ds_buffer_base); + idx = base + (index * ds_cfg.sizeof_rec[qual]); - if (!ds) { - kfree((void *)bts); - return -ENOMEM; - } - - set_bts_buffer_base(ds, bts); - set_bts_index(ds, bts); - set_bts_absolute_maximum(ds, bts + bts_size_in_bytes); - set_bts_interrupt_threshold(ds, bts + bts_size_in_bytes + 1); + error = -EINVAL; + if (idx > ds_get(context->ds, qual, ds_absolute_maximum)) + goto out; - *dsp = ds; - return 0; + *record = (const void *)idx; + error = ds_cfg.sizeof_rec[qual]; + out: + ds_put_context(context); + return error; } -int ds_free(void **dsp) +int ds_access_bts(struct task_struct *task, size_t index, const void **record) { - if (*dsp) { - kfree((void *)get_bts_buffer_base(*dsp)); - kfree(*dsp); - *dsp = NULL; - } - return 0; + return ds_access(task, index, record, ds_bts); } -int ds_get_bts_size(void *ds) +int ds_access_pebs(struct task_struct *task, size_t index, const void **record) { - int size_in_bytes; - - if (!ds_cfg.sizeof_ds || !ds_cfg.sizeof_bts) - return -EOPNOTSUPP; - - if (!ds) - return 0; - - size_in_bytes = - get_bts_absolute_maximum(ds) - - get_bts_buffer_base(ds); - return size_in_bytes; + return ds_access(task, index, record, ds_pebs); } -int ds_get_bts_end(void *ds) +static int ds_write(struct task_struct *task, const void *record, size_t size, + enum ds_qualifier qual, int force) { - int size_in_bytes = ds_get_bts_size(ds); - - if (size_in_bytes <= 0) - return size_in_bytes; + struct ds_context *context; + int error; - return size_in_bytes / ds_cfg.sizeof_bts; -} + if (!record) + return -EINVAL; -int ds_get_bts_index(void *ds) -{ - int index_offset_in_bytes; + error = -EPERM; + context = ds_get_context(task); + if (!context) + goto out; - if (!ds_cfg.sizeof_ds || !ds_cfg.sizeof_bts) - return -EOPNOTSUPP; + if (!force) { + error = ds_validate_access(context, qual); + if (error < 0) + goto out; + } - index_offset_in_bytes = - get_bts_index(ds) - - get_bts_buffer_base(ds); + error = 0; + while (size) { + unsigned long base, index, end, write_end, int_th; + unsigned long write_size, adj_write_size; + + /* + * write as much as possible without producing an + * overflow interrupt. + * + * interrupt_threshold must either be + * - bigger than absolute_maximum or + * - point to a record between buffer_base and absolute_maximum + * + * index points to a valid record. + */ + base = ds_get(context->ds, qual, ds_buffer_base); + index = ds_get(context->ds, qual, ds_index); + end = ds_get(context->ds, qual, ds_absolute_maximum); + int_th = ds_get(context->ds, qual, ds_interrupt_threshold); + + write_end = min(end, int_th); + + /* if we are already beyond the interrupt threshold, + * we fill the entire buffer */ + if (write_end <= index) + write_end = end; + + if (write_end <= index) + goto out; + + write_size = min((unsigned long) size, write_end - index); + memcpy((void *)index, record, write_size); + + record = (const char *)record + write_size; + size -= write_size; + error += write_size; + + adj_write_size = write_size / ds_cfg.sizeof_rec[qual]; + adj_write_size *= ds_cfg.sizeof_rec[qual]; + + /* zero out trailing bytes */ + memset((char *)index + write_size, 0, + adj_write_size - write_size); + index += adj_write_size; + + if (index >= end) + index = base; + ds_set(context->ds, qual, ds_index, index); + + if (index >= int_th) + ds_overflow(task, context, qual); + } - return index_offset_in_bytes / ds_cfg.sizeof_bts; + out: + ds_put_context(context); + return error; } -int ds_set_overflow(void *ds, int method) +int ds_write_bts(struct task_struct *task, const void *record, size_t size) { - switch (method) { - case DS_O_SIGNAL: - return -EOPNOTSUPP; - case DS_O_WRAP: - return 0; - default: - return -EINVAL; - } + return ds_write(task, record, size, ds_bts, /* force = */ 0); } -int ds_get_overflow(void *ds) +int ds_write_pebs(struct task_struct *task, const void *record, size_t size) { - return DS_O_WRAP; + return ds_write(task, record, size, ds_pebs, /* force = */ 0); } -int ds_clear(void *ds) +int ds_unchecked_write_bts(struct task_struct *task, + const void *record, size_t size) { - int bts_size = ds_get_bts_size(ds); - unsigned long bts_base; - - if (bts_size <= 0) - return bts_size; - - bts_base = get_bts_buffer_base(ds); - memset((void *)bts_base, 0, bts_size); - - set_bts_index(ds, bts_base); - return 0; + return ds_write(task, record, size, ds_bts, /* force = */ 1); } -int ds_read_bts(void *ds, int index, struct bts_struct *out) +int ds_unchecked_write_pebs(struct task_struct *task, + const void *record, size_t size) { - void *bts; + return ds_write(task, record, size, ds_pebs, /* force = */ 1); +} - if (!ds_cfg.sizeof_ds || !ds_cfg.sizeof_bts) - return -EOPNOTSUPP; +static int ds_reset_or_clear(struct task_struct *task, + enum ds_qualifier qual, int clear) +{ + struct ds_context *context; + unsigned long base, end; + int error; - if (index < 0) - return -EINVAL; + context = ds_get_context(task); + error = ds_validate_access(context, qual); + if (error < 0) + goto out; - if (index >= ds_get_bts_size(ds)) - return -EINVAL; + base = ds_get(context->ds, qual, ds_buffer_base); + end = ds_get(context->ds, qual, ds_absolute_maximum); - bts = (void *)(get_bts_buffer_base(ds) + (index * ds_cfg.sizeof_bts)); + if (clear) + memset((void *)base, 0, end - base); - memset(out, 0, sizeof(*out)); - if (get_from_ip(bts) == BTS_ESCAPE_ADDRESS) { - out->qualifier = get_info_type(bts); - out->variant.jiffies = get_info_data(bts); - } else { - out->qualifier = BTS_BRANCH; - out->variant.lbr.from_ip = get_from_ip(bts); - out->variant.lbr.to_ip = get_to_ip(bts); - } + ds_set(context->ds, qual, ds_index, base); - return sizeof(*out);; + error = 0; + out: + ds_put_context(context); + return error; } -int ds_write_bts(void *ds, const struct bts_struct *in) +int ds_reset_bts(struct task_struct *task) { - unsigned long bts; - - if (!ds_cfg.sizeof_ds || !ds_cfg.sizeof_bts) - return -EOPNOTSUPP; - - if (ds_get_bts_size(ds) <= 0) - return -ENXIO; + return ds_reset_or_clear(task, ds_bts, /* clear = */ 0); +} - bts = get_bts_index(ds); +int ds_reset_pebs(struct task_struct *task) +{ + return ds_reset_or_clear(task, ds_pebs, /* clear = */ 0); +} - memset((void *)bts, 0, ds_cfg.sizeof_bts); - switch (in->qualifier) { - case BTS_INVALID: - break; +int ds_clear_bts(struct task_struct *task) +{ + return ds_reset_or_clear(task, ds_bts, /* clear = */ 1); +} - case BTS_BRANCH: - set_from_ip((void *)bts, in->variant.lbr.from_ip); - set_to_ip((void *)bts, in->variant.lbr.to_ip); - break; +int ds_clear_pebs(struct task_struct *task) +{ + return ds_reset_or_clear(task, ds_pebs, /* clear = */ 1); +} - case BTS_TASK_ARRIVES: - case BTS_TASK_DEPARTS: - set_from_ip((void *)bts, BTS_ESCAPE_ADDRESS); - set_info_type((void *)bts, in->qualifier); - set_info_data((void *)bts, in->variant.jiffies); - break; +int ds_get_pebs_reset(struct task_struct *task, u64 *value) +{ + struct ds_context *context; + int error; - default: + if (!value) return -EINVAL; - } - bts = bts + ds_cfg.sizeof_bts; - if (bts >= get_bts_absolute_maximum(ds)) - bts = get_bts_buffer_base(ds); - set_bts_index(ds, bts); + context = ds_get_context(task); + error = ds_validate_access(context, ds_pebs); + if (error < 0) + goto out; - return ds_cfg.sizeof_bts; + *value = *(u64 *)(context->ds + (ds_cfg.sizeof_field * 8)); + + error = 0; + out: + ds_put_context(context); + return error; } -unsigned long ds_debugctl_mask(void) +int ds_set_pebs_reset(struct task_struct *task, u64 value) { - return ds_cfg.debugctl_mask; -} + struct ds_context *context; + int error; -#ifdef __i386__ -static const struct ds_configuration ds_cfg_netburst = { - .sizeof_ds = 9 * 4, - .bts_buffer_base = { 0, 4 }, - .bts_index = { 4, 4 }, - .bts_absolute_maximum = { 8, 4 }, - .bts_interrupt_threshold = { 12, 4 }, - .sizeof_bts = 3 * 4, - .from_ip = { 0, 4 }, - .to_ip = { 4, 4 }, - .info_type = { 4, 1 }, - .info_data = { 8, 4 }, - .debugctl_mask = (1<<2)|(1<<3) -}; + context = ds_get_context(task); + error = ds_validate_access(context, ds_pebs); + if (error < 0) + goto out; -static const struct ds_configuration ds_cfg_pentium_m = { - .sizeof_ds = 9 * 4, - .bts_buffer_base = { 0, 4 }, - .bts_index = { 4, 4 }, - .bts_absolute_maximum = { 8, 4 }, - .bts_interrupt_threshold = { 12, 4 }, - .sizeof_bts = 3 * 4, - .from_ip = { 0, 4 }, - .to_ip = { 4, 4 }, - .info_type = { 4, 1 }, - .info_data = { 8, 4 }, - .debugctl_mask = (1<<6)|(1<<7) + *(u64 *)(context->ds + (ds_cfg.sizeof_field * 8)) = value; + + error = 0; + out: + ds_put_context(context); + return error; +} + +static const struct ds_configuration ds_cfg_var = { + .sizeof_ds = sizeof(long) * 12, + .sizeof_field = sizeof(long), + .sizeof_rec[ds_bts] = sizeof(long) * 3, + .sizeof_rec[ds_pebs] = sizeof(long) * 10 }; -#endif /* _i386_ */ - -static const struct ds_configuration ds_cfg_core2 = { - .sizeof_ds = 9 * 8, - .bts_buffer_base = { 0, 8 }, - .bts_index = { 8, 8 }, - .bts_absolute_maximum = { 16, 8 }, - .bts_interrupt_threshold = { 24, 8 }, - .sizeof_bts = 3 * 8, - .from_ip = { 0, 8 }, - .to_ip = { 8, 8 }, - .info_type = { 8, 1 }, - .info_data = { 16, 8 }, - .debugctl_mask = (1<<6)|(1<<7)|(1<<9) +static const struct ds_configuration ds_cfg_64 = { + .sizeof_ds = 8 * 12, + .sizeof_field = 8, + .sizeof_rec[ds_bts] = 8 * 3, + .sizeof_rec[ds_pebs] = 8 * 10 }; static inline void @@ -429,14 +821,13 @@ void __cpuinit ds_init_intel(struct cpuinfo_x86 *c) switch (c->x86) { case 0x6: switch (c->x86_model) { -#ifdef __i386__ case 0xD: case 0xE: /* Pentium M */ - ds_configure(&ds_cfg_pentium_m); + ds_configure(&ds_cfg_var); break; -#endif /* _i386_ */ case 0xF: /* Core2 */ - ds_configure(&ds_cfg_core2); + case 0x1C: /* Atom */ + ds_configure(&ds_cfg_64); break; default: /* sorry, don't know about them */ @@ -445,13 +836,11 @@ void __cpuinit ds_init_intel(struct cpuinfo_x86 *c) break; case 0xF: switch (c->x86_model) { -#ifdef __i386__ case 0x0: case 0x1: case 0x2: /* Netburst */ - ds_configure(&ds_cfg_netburst); + ds_configure(&ds_cfg_var); break; -#endif /* _i386_ */ default: /* sorry, don't know about them */ break; @@ -462,3 +851,14 @@ void __cpuinit ds_init_intel(struct cpuinfo_x86 *c) break; } } + +void ds_free(struct ds_context *context) +{ + /* This is called when the task owning the parameter context + * is dying. There should not be any user of that context left + * to disturb us, anymore. */ + unsigned long leftovers = context->count; + while (leftovers--) + ds_put_context(context); +} +#endif /* CONFIG_X86_DS */ diff --git a/arch/x86/kernel/e820.c b/arch/x86/kernel/e820.c index 9af89078f7bb..66e48aa2dd1b 100644 --- a/arch/x86/kernel/e820.c +++ b/arch/x86/kernel/e820.c @@ -1203,7 +1203,7 @@ static int __init parse_memmap_opt(char *p) if (!p) return -EINVAL; - if (!strcmp(p, "exactmap")) { + if (!strncmp(p, "exactmap", 8)) { #ifdef CONFIG_CRASH_DUMP /* * If we are doing a crash dump, we still need to know diff --git a/arch/x86/kernel/hpet.c b/arch/x86/kernel/hpet.c index 59fd3b6b1303..73deaffadd03 100644 --- a/arch/x86/kernel/hpet.c +++ b/arch/x86/kernel/hpet.c @@ -210,8 +210,8 @@ static void hpet_legacy_clockevent_register(void) /* Calculate the min / max delta */ hpet_clockevent.max_delta_ns = clockevent_delta2ns(0x7FFFFFFF, &hpet_clockevent); - hpet_clockevent.min_delta_ns = clockevent_delta2ns(0x30, - &hpet_clockevent); + /* 5 usec minimum reprogramming delta. */ + hpet_clockevent.min_delta_ns = 5000; /* * Start hpet with the boot cpu mask and make it @@ -270,15 +270,22 @@ static void hpet_legacy_set_mode(enum clock_event_mode mode, } static int hpet_legacy_next_event(unsigned long delta, - struct clock_event_device *evt) + struct clock_event_device *evt) { - unsigned long cnt; + u32 cnt; cnt = hpet_readl(HPET_COUNTER); - cnt += delta; + cnt += (u32) delta; hpet_writel(cnt, HPET_T0_CMP); - return ((long)(hpet_readl(HPET_COUNTER) - cnt ) > 0) ? -ETIME : 0; + /* + * We need to read back the CMP register to make sure that + * what we wrote hit the chip before we compare it to the + * counter. + */ + WARN_ON((u32)hpet_readl(HPET_T0_CMP) != cnt); + + return (s32)((u32)hpet_readl(HPET_COUNTER) - cnt) >= 0 ? -ETIME : 0; } /* diff --git a/arch/x86/kernel/io_delay.c b/arch/x86/kernel/io_delay.c index 1c3a66a67f83..720d2607aacb 100644 --- a/arch/x86/kernel/io_delay.c +++ b/arch/x86/kernel/io_delay.c @@ -92,6 +92,14 @@ static struct dmi_system_id __initdata io_delay_0xed_port_dmi_table[] = { DMI_MATCH(DMI_BOARD_NAME, "30BF") } }, + { + .callback = dmi_io_delay_0xed_port, + .ident = "Presario F700", + .matches = { + DMI_MATCH(DMI_BOARD_VENDOR, "Quanta"), + DMI_MATCH(DMI_BOARD_NAME, "30D3") + } + }, { } }; diff --git a/arch/x86/kernel/ioport.c b/arch/x86/kernel/ioport.c index 50e5e4a31c85..191914302744 100644 --- a/arch/x86/kernel/ioport.c +++ b/arch/x86/kernel/ioport.c @@ -14,6 +14,7 @@ #include <linux/slab.h> #include <linux/thread_info.h> #include <linux/syscalls.h> +#include <asm/syscalls.h> /* Set EXTENT bits starting at BASE in BITMAP to value TURN_ON. */ static void set_bitmap(unsigned long *bitmap, unsigned int base, diff --git a/arch/x86/kernel/ipi.c b/arch/x86/kernel/ipi.c index 3f7537b669d3..f1c688e46f35 100644 --- a/arch/x86/kernel/ipi.c +++ b/arch/x86/kernel/ipi.c @@ -20,6 +20,8 @@ #ifdef CONFIG_X86_32 #include <mach_apic.h> +#include <mach_ipi.h> + /* * the following functions deal with sending IPIs between CPUs. * @@ -147,7 +149,6 @@ void send_IPI_mask_sequence(cpumask_t mask, int vector) } /* must come after the send_IPI functions above for inlining */ -#include <mach_ipi.h> static int convert_apicid_to_cpu(int apic_id) { int i; diff --git a/arch/x86/kernel/kdebugfs.c b/arch/x86/kernel/kdebugfs.c index f2d43bc75514..ff7d3b0124f1 100644 --- a/arch/x86/kernel/kdebugfs.c +++ b/arch/x86/kernel/kdebugfs.c @@ -139,6 +139,7 @@ static int __init create_setup_data_nodes(struct dentry *parent) if (PageHighMem(pg)) { data = ioremap_cache(pa_data, sizeof(*data)); if (!data) { + kfree(node); error = -ENXIO; goto err_dir; } diff --git a/arch/x86/kernel/kgdb.c b/arch/x86/kernel/kgdb.c index f47f0eb886b8..8282a2139681 100644 --- a/arch/x86/kernel/kgdb.c +++ b/arch/x86/kernel/kgdb.c @@ -69,6 +69,9 @@ static int gdb_x86vector = -1; */ void pt_regs_to_gdb_regs(unsigned long *gdb_regs, struct pt_regs *regs) { +#ifndef CONFIG_X86_32 + u32 *gdb_regs32 = (u32 *)gdb_regs; +#endif gdb_regs[GDB_AX] = regs->ax; gdb_regs[GDB_BX] = regs->bx; gdb_regs[GDB_CX] = regs->cx; @@ -76,9 +79,9 @@ void pt_regs_to_gdb_regs(unsigned long *gdb_regs, struct pt_regs *regs) gdb_regs[GDB_SI] = regs->si; gdb_regs[GDB_DI] = regs->di; gdb_regs[GDB_BP] = regs->bp; - gdb_regs[GDB_PS] = regs->flags; gdb_regs[GDB_PC] = regs->ip; #ifdef CONFIG_X86_32 + gdb_regs[GDB_PS] = regs->flags; gdb_regs[GDB_DS] = regs->ds; gdb_regs[GDB_ES] = regs->es; gdb_regs[GDB_CS] = regs->cs; @@ -94,6 +97,9 @@ void pt_regs_to_gdb_regs(unsigned long *gdb_regs, struct pt_regs *regs) gdb_regs[GDB_R13] = regs->r13; gdb_regs[GDB_R14] = regs->r14; gdb_regs[GDB_R15] = regs->r15; + gdb_regs32[GDB_PS] = regs->flags; + gdb_regs32[GDB_CS] = regs->cs; + gdb_regs32[GDB_SS] = regs->ss; #endif gdb_regs[GDB_SP] = regs->sp; } @@ -112,6 +118,9 @@ void pt_regs_to_gdb_regs(unsigned long *gdb_regs, struct pt_regs *regs) */ void sleeping_thread_to_gdb_regs(unsigned long *gdb_regs, struct task_struct *p) { +#ifndef CONFIG_X86_32 + u32 *gdb_regs32 = (u32 *)gdb_regs; +#endif gdb_regs[GDB_AX] = 0; gdb_regs[GDB_BX] = 0; gdb_regs[GDB_CX] = 0; @@ -129,8 +138,10 @@ void sleeping_thread_to_gdb_regs(unsigned long *gdb_regs, struct task_struct *p) gdb_regs[GDB_FS] = 0xFFFF; gdb_regs[GDB_GS] = 0xFFFF; #else - gdb_regs[GDB_PS] = *(unsigned long *)(p->thread.sp + 8); - gdb_regs[GDB_PC] = 0; + gdb_regs32[GDB_PS] = *(unsigned long *)(p->thread.sp + 8); + gdb_regs32[GDB_CS] = __KERNEL_CS; + gdb_regs32[GDB_SS] = __KERNEL_DS; + gdb_regs[GDB_PC] = p->thread.ip; gdb_regs[GDB_R8] = 0; gdb_regs[GDB_R9] = 0; gdb_regs[GDB_R10] = 0; @@ -153,6 +164,9 @@ void sleeping_thread_to_gdb_regs(unsigned long *gdb_regs, struct task_struct *p) */ void gdb_regs_to_pt_regs(unsigned long *gdb_regs, struct pt_regs *regs) { +#ifndef CONFIG_X86_32 + u32 *gdb_regs32 = (u32 *)gdb_regs; +#endif regs->ax = gdb_regs[GDB_AX]; regs->bx = gdb_regs[GDB_BX]; regs->cx = gdb_regs[GDB_CX]; @@ -160,9 +174,9 @@ void gdb_regs_to_pt_regs(unsigned long *gdb_regs, struct pt_regs *regs) regs->si = gdb_regs[GDB_SI]; regs->di = gdb_regs[GDB_DI]; regs->bp = gdb_regs[GDB_BP]; - regs->flags = gdb_regs[GDB_PS]; regs->ip = gdb_regs[GDB_PC]; #ifdef CONFIG_X86_32 + regs->flags = gdb_regs[GDB_PS]; regs->ds = gdb_regs[GDB_DS]; regs->es = gdb_regs[GDB_ES]; regs->cs = gdb_regs[GDB_CS]; @@ -175,6 +189,9 @@ void gdb_regs_to_pt_regs(unsigned long *gdb_regs, struct pt_regs *regs) regs->r13 = gdb_regs[GDB_R13]; regs->r14 = gdb_regs[GDB_R14]; regs->r15 = gdb_regs[GDB_R15]; + regs->flags = gdb_regs32[GDB_PS]; + regs->cs = gdb_regs32[GDB_CS]; + regs->ss = gdb_regs32[GDB_SS]; #endif } @@ -378,10 +395,8 @@ int kgdb_arch_handle_exception(int e_vector, int signo, int err_code, if (remcomInBuffer[0] == 's') { linux_regs->flags |= X86_EFLAGS_TF; kgdb_single_step = 1; - if (kgdb_contthread) { - atomic_set(&kgdb_cpu_doing_single_step, - raw_smp_processor_id()); - } + atomic_set(&kgdb_cpu_doing_single_step, + raw_smp_processor_id()); } get_debugreg(dr6, 6); @@ -466,9 +481,15 @@ static int __kgdb_notify(struct die_args *args, unsigned long cmd) case DIE_DEBUG: if (atomic_read(&kgdb_cpu_doing_single_step) == - raw_smp_processor_id() && - user_mode(regs)) - return single_step_cont(regs, args); + raw_smp_processor_id()) { + if (user_mode(regs)) + return single_step_cont(regs, args); + break; + } else if (test_thread_flag(TIF_SINGLESTEP)) + /* This means a user thread is single stepping + * a system call which should be ignored + */ + return NOTIFY_DONE; /* fall through */ default: if (user_mode(regs)) diff --git a/arch/x86/kernel/ldt.c b/arch/x86/kernel/ldt.c index b68e21f06f4f..0ed5f939b905 100644 --- a/arch/x86/kernel/ldt.c +++ b/arch/x86/kernel/ldt.c @@ -18,6 +18,7 @@ #include <asm/ldt.h> #include <asm/desc.h> #include <asm/mmu_context.h> +#include <asm/syscalls.h> #ifdef CONFIG_SMP static void flush_ldt(void *current_mm) diff --git a/arch/x86/kernel/process.c b/arch/x86/kernel/process.c index 7fc4d5b0a6a0..876e91890777 100644 --- a/arch/x86/kernel/process.c +++ b/arch/x86/kernel/process.c @@ -246,6 +246,14 @@ static int __cpuinit check_c1e_idle(const struct cpuinfo_x86 *c) return 1; } +static cpumask_t c1e_mask = CPU_MASK_NONE; +static int c1e_detected; + +void c1e_remove_cpu(int cpu) +{ + cpu_clear(cpu, c1e_mask); +} + /* * C1E aware idle routine. We check for C1E active in the interrupt * pending message MSR. If we detect C1E, then we handle it the same @@ -253,9 +261,6 @@ static int __cpuinit check_c1e_idle(const struct cpuinfo_x86 *c) */ static void c1e_idle(void) { - static cpumask_t c1e_mask = CPU_MASK_NONE; - static int c1e_detected; - if (need_resched()) return; @@ -265,8 +270,10 @@ static void c1e_idle(void) rdmsr(MSR_K8_INT_PENDING_MSG, lo, hi); if (lo & K8_INTP_C1E_ACTIVE_MASK) { c1e_detected = 1; - mark_tsc_unstable("TSC halt in C1E"); - printk(KERN_INFO "System has C1E enabled\n"); + if (!boot_cpu_has(X86_FEATURE_CONSTANT_TSC)) + mark_tsc_unstable("TSC halt in AMD C1E"); + printk(KERN_INFO "System has AMD C1E enabled\n"); + set_cpu_cap(&boot_cpu_data, X86_FEATURE_AMDC1E); } } diff --git a/arch/x86/kernel/process_32.c b/arch/x86/kernel/process_32.c index 3b7a1ddcc0bc..c8609dea443f 100644 --- a/arch/x86/kernel/process_32.c +++ b/arch/x86/kernel/process_32.c @@ -55,6 +55,9 @@ #include <asm/tlbflush.h> #include <asm/cpu.h> #include <asm/kdebug.h> +#include <asm/idle.h> +#include <asm/syscalls.h> +#include <asm/smp.h> asmlinkage void ret_from_fork(void) __asm__("ret_from_fork"); @@ -88,6 +91,7 @@ static void cpu_exit_clear(void) cpu_clear(cpu, cpu_callin_map); numa_remove_cpu(cpu); + c1e_remove_cpu(cpu); } /* We don't actually take CPU down, just spin without interrupts. */ @@ -275,6 +279,14 @@ void exit_thread(void) tss->x86_tss.io_bitmap_base = INVALID_IO_BITMAP_OFFSET; put_cpu(); } +#ifdef CONFIG_X86_DS + /* Free any DS contexts that have not been properly released. */ + if (unlikely(current->thread.ds_ctx)) { + /* we clear debugctl to make sure DS is not used. */ + update_debugctlmsr(0); + ds_free(current->thread.ds_ctx); + } +#endif /* CONFIG_X86_DS */ } void flush_thread(void) @@ -436,6 +448,35 @@ int set_tsc_mode(unsigned int val) return 0; } +#ifdef CONFIG_X86_DS +static int update_debugctl(struct thread_struct *prev, + struct thread_struct *next, unsigned long debugctl) +{ + unsigned long ds_prev = 0; + unsigned long ds_next = 0; + + if (prev->ds_ctx) + ds_prev = (unsigned long)prev->ds_ctx->ds; + if (next->ds_ctx) + ds_next = (unsigned long)next->ds_ctx->ds; + + if (ds_next != ds_prev) { + /* we clear debugctl to make sure DS + * is not in use when we change it */ + debugctl = 0; + update_debugctlmsr(0); + wrmsr(MSR_IA32_DS_AREA, ds_next, 0); + } + return debugctl; +} +#else +static int update_debugctl(struct thread_struct *prev, + struct thread_struct *next, unsigned long debugctl) +{ + return debugctl; +} +#endif /* CONFIG_X86_DS */ + static noinline void __switch_to_xtra(struct task_struct *prev_p, struct task_struct *next_p, struct tss_struct *tss) @@ -446,14 +487,7 @@ __switch_to_xtra(struct task_struct *prev_p, struct task_struct *next_p, prev = &prev_p->thread; next = &next_p->thread; - debugctl = prev->debugctlmsr; - if (next->ds_area_msr != prev->ds_area_msr) { - /* we clear debugctl to make sure DS - * is not in use when we change it */ - debugctl = 0; - update_debugctlmsr(0); - wrmsr(MSR_IA32_DS_AREA, next->ds_area_msr, 0); - } + debugctl = update_debugctl(prev, next, prev->debugctlmsr); if (next->debugctlmsr != debugctl) update_debugctlmsr(next->debugctlmsr); @@ -477,13 +511,13 @@ __switch_to_xtra(struct task_struct *prev_p, struct task_struct *next_p, hard_enable_TSC(); } -#ifdef X86_BTS +#ifdef CONFIG_X86_PTRACE_BTS if (test_tsk_thread_flag(prev_p, TIF_BTS_TRACE_TS)) ptrace_bts_take_timestamp(prev_p, BTS_TASK_DEPARTS); if (test_tsk_thread_flag(next_p, TIF_BTS_TRACE_TS)) ptrace_bts_take_timestamp(next_p, BTS_TASK_ARRIVES); -#endif +#endif /* CONFIG_X86_PTRACE_BTS */ if (!test_tsk_thread_flag(next_p, TIF_IO_BITMAP)) { diff --git a/arch/x86/kernel/process_64.c b/arch/x86/kernel/process_64.c index 71553b664e2a..79e3e173ab40 100644 --- a/arch/x86/kernel/process_64.c +++ b/arch/x86/kernel/process_64.c @@ -51,6 +51,7 @@ #include <asm/proto.h> #include <asm/ia32.h> #include <asm/idle.h> +#include <asm/syscalls.h> asmlinkage extern void ret_from_fork(void); @@ -93,6 +94,8 @@ DECLARE_PER_CPU(int, cpu_state); static inline void play_dead(void) { idle_task_exit(); + c1e_remove_cpu(raw_smp_processor_id()); + mb(); /* Ack it */ __get_cpu_var(cpu_state) = CPU_DEAD; @@ -238,6 +241,14 @@ void exit_thread(void) t->io_bitmap_max = 0; put_cpu(); } +#ifdef CONFIG_X86_DS + /* Free any DS contexts that have not been properly released. */ + if (unlikely(t->ds_ctx)) { + /* we clear debugctl to make sure DS is not used. */ + update_debugctlmsr(0); + ds_free(t->ds_ctx); + } +#endif /* CONFIG_X86_DS */ } void flush_thread(void) @@ -471,13 +482,27 @@ static inline void __switch_to_xtra(struct task_struct *prev_p, next = &next_p->thread; debugctl = prev->debugctlmsr; - if (next->ds_area_msr != prev->ds_area_msr) { - /* we clear debugctl to make sure DS - * is not in use when we change it */ - debugctl = 0; - update_debugctlmsr(0); - wrmsrl(MSR_IA32_DS_AREA, next->ds_area_msr); + +#ifdef CONFIG_X86_DS + { + unsigned long ds_prev = 0, ds_next = 0; + + if (prev->ds_ctx) + ds_prev = (unsigned long)prev->ds_ctx->ds; + if (next->ds_ctx) + ds_next = (unsigned long)next->ds_ctx->ds; + + if (ds_next != ds_prev) { + /* + * We clear debugctl to make sure DS + * is not in use when we change it: + */ + debugctl = 0; + update_debugctlmsr(0); + wrmsrl(MSR_IA32_DS_AREA, ds_next); + } } +#endif /* CONFIG_X86_DS */ if (next->debugctlmsr != debugctl) update_debugctlmsr(next->debugctlmsr); @@ -515,13 +540,13 @@ static inline void __switch_to_xtra(struct task_struct *prev_p, memset(tss->io_bitmap, 0xff, prev->io_bitmap_max); } -#ifdef X86_BTS +#ifdef CONFIG_X86_PTRACE_BTS if (test_tsk_thread_flag(prev_p, TIF_BTS_TRACE_TS)) ptrace_bts_take_timestamp(prev_p, BTS_TASK_DEPARTS); if (test_tsk_thread_flag(next_p, TIF_BTS_TRACE_TS)) ptrace_bts_take_timestamp(next_p, BTS_TASK_ARRIVES); -#endif +#endif /* CONFIG_X86_PTRACE_BTS */ } /* diff --git a/arch/x86/kernel/ptrace.c b/arch/x86/kernel/ptrace.c index 19a7d2c40560..e375b658efc3 100644 --- a/arch/x86/kernel/ptrace.c +++ b/arch/x86/kernel/ptrace.c @@ -70,7 +70,7 @@ static inline bool invalid_selector(u16 value) #define FLAG_MASK FLAG_MASK_32 -static long *pt_regs_access(struct pt_regs *regs, unsigned long regno) +static unsigned long *pt_regs_access(struct pt_regs *regs, unsigned long regno) { BUILD_BUG_ON(offsetof(struct pt_regs, bx) != 0); regno >>= 2; @@ -555,45 +555,115 @@ static int ptrace_set_debugreg(struct task_struct *child, return 0; } -#ifdef X86_BTS +#ifdef CONFIG_X86_PTRACE_BTS +/* + * The configuration for a particular BTS hardware implementation. + */ +struct bts_configuration { + /* the size of a BTS record in bytes; at most BTS_MAX_RECORD_SIZE */ + unsigned char sizeof_bts; + /* the size of a field in the BTS record in bytes */ + unsigned char sizeof_field; + /* a bitmask to enable/disable BTS in DEBUGCTL MSR */ + unsigned long debugctl_mask; +}; +static struct bts_configuration bts_cfg; + +#define BTS_MAX_RECORD_SIZE (8 * 3) + + +/* + * Branch Trace Store (BTS) uses the following format. Different + * architectures vary in the size of those fields. + * - source linear address + * - destination linear address + * - flags + * + * Later architectures use 64bit pointers throughout, whereas earlier + * architectures use 32bit pointers in 32bit mode. + * + * We compute the base address for the first 8 fields based on: + * - the field size stored in the DS configuration + * - the relative field position + * + * In order to store additional information in the BTS buffer, we use + * a special source address to indicate that the record requires + * special interpretation. + * + * Netburst indicated via a bit in the flags field whether the branch + * was predicted; this is ignored. + */ + +enum bts_field { + bts_from = 0, + bts_to, + bts_flags, + + bts_escape = (unsigned long)-1, + bts_qual = bts_to, + bts_jiffies = bts_flags +}; -static int ptrace_bts_get_size(struct task_struct *child) +static inline unsigned long bts_get(const char *base, enum bts_field field) { - if (!child->thread.ds_area_msr) - return -ENXIO; + base += (bts_cfg.sizeof_field * field); + return *(unsigned long *)base; +} + +static inline void bts_set(char *base, enum bts_field field, unsigned long val) +{ + base += (bts_cfg.sizeof_field * field);; + (*(unsigned long *)base) = val; +} - return ds_get_bts_index((void *)child->thread.ds_area_msr); +/* + * Translate a BTS record from the raw format into the bts_struct format + * + * out (out): bts_struct interpretation + * raw: raw BTS record + */ +static void ptrace_bts_translate_record(struct bts_struct *out, const void *raw) +{ + memset(out, 0, sizeof(*out)); + if (bts_get(raw, bts_from) == bts_escape) { + out->qualifier = bts_get(raw, bts_qual); + out->variant.jiffies = bts_get(raw, bts_jiffies); + } else { + out->qualifier = BTS_BRANCH; + out->variant.lbr.from_ip = bts_get(raw, bts_from); + out->variant.lbr.to_ip = bts_get(raw, bts_to); + } } -static int ptrace_bts_read_record(struct task_struct *child, - long index, +static int ptrace_bts_read_record(struct task_struct *child, size_t index, struct bts_struct __user *out) { struct bts_struct ret; - int retval; - int bts_end; - int bts_index; + const void *bts_record; + size_t bts_index, bts_end; + int error; - if (!child->thread.ds_area_msr) - return -ENXIO; + error = ds_get_bts_end(child, &bts_end); + if (error < 0) + return error; - if (index < 0) - return -EINVAL; - - bts_end = ds_get_bts_end((void *)child->thread.ds_area_msr); if (bts_end <= index) return -EINVAL; + error = ds_get_bts_index(child, &bts_index); + if (error < 0) + return error; + /* translate the ptrace bts index into the ds bts index */ - bts_index = ds_get_bts_index((void *)child->thread.ds_area_msr); - bts_index -= (index + 1); - if (bts_index < 0) - bts_index += bts_end; + bts_index += bts_end - (index + 1); + if (bts_end <= bts_index) + bts_index -= bts_end; + + error = ds_access_bts(child, bts_index, &bts_record); + if (error < 0) + return error; - retval = ds_read_bts((void *)child->thread.ds_area_msr, - bts_index, &ret); - if (retval < 0) - return retval; + ptrace_bts_translate_record(&ret, bts_record); if (copy_to_user(out, &ret, sizeof(ret))) return -EFAULT; @@ -601,101 +671,106 @@ static int ptrace_bts_read_record(struct task_struct *child, return sizeof(ret); } -static int ptrace_bts_clear(struct task_struct *child) -{ - if (!child->thread.ds_area_msr) - return -ENXIO; - - return ds_clear((void *)child->thread.ds_area_msr); -} - static int ptrace_bts_drain(struct task_struct *child, long size, struct bts_struct __user *out) { - int end, i; - void *ds = (void *)child->thread.ds_area_msr; - - if (!ds) - return -ENXIO; + struct bts_struct ret; + const unsigned char *raw; + size_t end, i; + int error; - end = ds_get_bts_index(ds); - if (end <= 0) - return end; + error = ds_get_bts_index(child, &end); + if (error < 0) + return error; if (size < (end * sizeof(struct bts_struct))) return -EIO; - for (i = 0; i < end; i++, out++) { - struct bts_struct ret; - int retval; + error = ds_access_bts(child, 0, (const void **)&raw); + if (error < 0) + return error; - retval = ds_read_bts(ds, i, &ret); - if (retval < 0) - return retval; + for (i = 0; i < end; i++, out++, raw += bts_cfg.sizeof_bts) { + ptrace_bts_translate_record(&ret, raw); if (copy_to_user(out, &ret, sizeof(ret))) return -EFAULT; } - ds_clear(ds); + error = ds_clear_bts(child); + if (error < 0) + return error; return end; } +static void ptrace_bts_ovfl(struct task_struct *child) +{ + send_sig(child->thread.bts_ovfl_signal, child, 0); +} + static int ptrace_bts_config(struct task_struct *child, long cfg_size, const struct ptrace_bts_config __user *ucfg) { struct ptrace_bts_config cfg; - int bts_size, ret = 0; - void *ds; + int error = 0; + + error = -EOPNOTSUPP; + if (!bts_cfg.sizeof_bts) + goto errout; + error = -EIO; if (cfg_size < sizeof(cfg)) - return -EIO; + goto errout; + error = -EFAULT; if (copy_from_user(&cfg, ucfg, sizeof(cfg))) - return -EFAULT; + goto errout; - if ((int)cfg.size < 0) - return -EINVAL; + error = -EINVAL; + if ((cfg.flags & PTRACE_BTS_O_SIGNAL) && + !(cfg.flags & PTRACE_BTS_O_ALLOC)) + goto errout; - bts_size = 0; - ds = (void *)child->thread.ds_area_msr; - if (ds) { - bts_size = ds_get_bts_size(ds); - if (bts_size < 0) - return bts_size; - } - cfg.size = PAGE_ALIGN(cfg.size); + if (cfg.flags & PTRACE_BTS_O_ALLOC) { + ds_ovfl_callback_t ovfl = NULL; + unsigned int sig = 0; + + /* we ignore the error in case we were not tracing child */ + (void)ds_release_bts(child); + + if (cfg.flags & PTRACE_BTS_O_SIGNAL) { + if (!cfg.signal) + goto errout; + + sig = cfg.signal; + ovfl = ptrace_bts_ovfl; + } - if (bts_size != cfg.size) { - ret = ptrace_bts_realloc(child, cfg.size, - cfg.flags & PTRACE_BTS_O_CUT_SIZE); - if (ret < 0) + error = ds_request_bts(child, /* base = */ NULL, cfg.size, ovfl); + if (error < 0) goto errout; - ds = (void *)child->thread.ds_area_msr; + child->thread.bts_ovfl_signal = sig; } - if (cfg.flags & PTRACE_BTS_O_SIGNAL) - ret = ds_set_overflow(ds, DS_O_SIGNAL); - else - ret = ds_set_overflow(ds, DS_O_WRAP); - if (ret < 0) + error = -EINVAL; + if (!child->thread.ds_ctx && cfg.flags) goto errout; if (cfg.flags & PTRACE_BTS_O_TRACE) - child->thread.debugctlmsr |= ds_debugctl_mask(); + child->thread.debugctlmsr |= bts_cfg.debugctl_mask; else - child->thread.debugctlmsr &= ~ds_debugctl_mask(); + child->thread.debugctlmsr &= ~bts_cfg.debugctl_mask; if (cfg.flags & PTRACE_BTS_O_SCHED) set_tsk_thread_flag(child, TIF_BTS_TRACE_TS); else clear_tsk_thread_flag(child, TIF_BTS_TRACE_TS); - ret = sizeof(cfg); + error = sizeof(cfg); out: if (child->thread.debugctlmsr) @@ -703,10 +778,10 @@ out: else clear_tsk_thread_flag(child, TIF_DEBUGCTLMSR); - return ret; + return error; errout: - child->thread.debugctlmsr &= ~ds_debugctl_mask(); + child->thread.debugctlmsr &= ~bts_cfg.debugctl_mask; clear_tsk_thread_flag(child, TIF_BTS_TRACE_TS); goto out; } @@ -715,29 +790,40 @@ static int ptrace_bts_status(struct task_struct *child, long cfg_size, struct ptrace_bts_config __user *ucfg) { - void *ds = (void *)child->thread.ds_area_msr; struct ptrace_bts_config cfg; + size_t end; + const void *base, *max; + int error; if (cfg_size < sizeof(cfg)) return -EIO; - memset(&cfg, 0, sizeof(cfg)); + error = ds_get_bts_end(child, &end); + if (error < 0) + return error; - if (ds) { - cfg.size = ds_get_bts_size(ds); + error = ds_access_bts(child, /* index = */ 0, &base); + if (error < 0) + return error; - if (ds_get_overflow(ds) == DS_O_SIGNAL) - cfg.flags |= PTRACE_BTS_O_SIGNAL; + error = ds_access_bts(child, /* index = */ end, &max); + if (error < 0) + return error; - if (test_tsk_thread_flag(child, TIF_DEBUGCTLMSR) && - child->thread.debugctlmsr & ds_debugctl_mask()) - cfg.flags |= PTRACE_BTS_O_TRACE; + memset(&cfg, 0, sizeof(cfg)); + cfg.size = (max - base); + cfg.signal = child->thread.bts_ovfl_signal; + cfg.bts_size = sizeof(struct bts_struct); - if (test_tsk_thread_flag(child, TIF_BTS_TRACE_TS)) - cfg.flags |= PTRACE_BTS_O_SCHED; - } + if (cfg.signal) + cfg.flags |= PTRACE_BTS_O_SIGNAL; - cfg.bts_size = sizeof(struct bts_struct); + if (test_tsk_thread_flag(child, TIF_DEBUGCTLMSR) && + child->thread.debugctlmsr & bts_cfg.debugctl_mask) + cfg.flags |= PTRACE_BTS_O_TRACE; + + if (test_tsk_thread_flag(child, TIF_BTS_TRACE_TS)) + cfg.flags |= PTRACE_BTS_O_SCHED; if (copy_to_user(ucfg, &cfg, sizeof(cfg))) return -EFAULT; @@ -745,89 +831,38 @@ static int ptrace_bts_status(struct task_struct *child, return sizeof(cfg); } - static int ptrace_bts_write_record(struct task_struct *child, const struct bts_struct *in) { - int retval; + unsigned char bts_record[BTS_MAX_RECORD_SIZE]; - if (!child->thread.ds_area_msr) - return -ENXIO; + BUG_ON(BTS_MAX_RECORD_SIZE < bts_cfg.sizeof_bts); - retval = ds_write_bts((void *)child->thread.ds_area_msr, in); - if (retval) - return retval; + memset(bts_record, 0, bts_cfg.sizeof_bts); + switch (in->qualifier) { + case BTS_INVALID: + break; - return sizeof(*in); -} + case BTS_BRANCH: + bts_set(bts_record, bts_from, in->variant.lbr.from_ip); + bts_set(bts_record, bts_to, in->variant.lbr.to_ip); + break; -static int ptrace_bts_realloc(struct task_struct *child, - int size, int reduce_size) -{ - unsigned long rlim, vm; - int ret, old_size; + case BTS_TASK_ARRIVES: + case BTS_TASK_DEPARTS: + bts_set(bts_record, bts_from, bts_escape); + bts_set(bts_record, bts_qual, in->qualifier); + bts_set(bts_record, bts_jiffies, in->variant.jiffies); + break; - if (size < 0) + default: return -EINVAL; - - old_size = ds_get_bts_size((void *)child->thread.ds_area_msr); - if (old_size < 0) - return old_size; - - ret = ds_free((void **)&child->thread.ds_area_msr); - if (ret < 0) - goto out; - - size >>= PAGE_SHIFT; - old_size >>= PAGE_SHIFT; - - current->mm->total_vm -= old_size; - current->mm->locked_vm -= old_size; - - if (size == 0) - goto out; - - rlim = current->signal->rlim[RLIMIT_AS].rlim_cur >> PAGE_SHIFT; - vm = current->mm->total_vm + size; - if (rlim < vm) { - ret = -ENOMEM; - - if (!reduce_size) - goto out; - - size = rlim - current->mm->total_vm; - if (size <= 0) - goto out; } - rlim = current->signal->rlim[RLIMIT_MEMLOCK].rlim_cur >> PAGE_SHIFT; - vm = current->mm->locked_vm + size; - if (rlim < vm) { - ret = -ENOMEM; - - if (!reduce_size) - goto out; - - size = rlim - current->mm->locked_vm; - if (size <= 0) - goto out; - } - - ret = ds_allocate((void **)&child->thread.ds_area_msr, - size << PAGE_SHIFT); - if (ret < 0) - goto out; - - current->mm->total_vm += size; - current->mm->locked_vm += size; - -out: - if (child->thread.ds_area_msr) - set_tsk_thread_flag(child, TIF_DS_AREA_MSR); - else - clear_tsk_thread_flag(child, TIF_DS_AREA_MSR); - - return ret; + /* The writing task will be the switched-to task on a context + * switch. It needs to write into the switched-from task's BTS + * buffer. */ + return ds_unchecked_write_bts(child, bts_record, bts_cfg.sizeof_bts); } void ptrace_bts_take_timestamp(struct task_struct *tsk, @@ -840,7 +875,66 @@ void ptrace_bts_take_timestamp(struct task_struct *tsk, ptrace_bts_write_record(tsk, &rec); } -#endif /* X86_BTS */ + +static const struct bts_configuration bts_cfg_netburst = { + .sizeof_bts = sizeof(long) * 3, + .sizeof_field = sizeof(long), + .debugctl_mask = (1<<2)|(1<<3)|(1<<5) +}; + +static const struct bts_configuration bts_cfg_pentium_m = { + .sizeof_bts = sizeof(long) * 3, + .sizeof_field = sizeof(long), + .debugctl_mask = (1<<6)|(1<<7) +}; + +static const struct bts_configuration bts_cfg_core2 = { + .sizeof_bts = 8 * 3, + .sizeof_field = 8, + .debugctl_mask = (1<<6)|(1<<7)|(1<<9) +}; + +static inline void bts_configure(const struct bts_configuration *cfg) +{ + bts_cfg = *cfg; +} + +void __cpuinit ptrace_bts_init_intel(struct cpuinfo_x86 *c) +{ + switch (c->x86) { + case 0x6: + switch (c->x86_model) { + case 0xD: + case 0xE: /* Pentium M */ + bts_configure(&bts_cfg_pentium_m); + break; + case 0xF: /* Core2 */ + case 0x1C: /* Atom */ + bts_configure(&bts_cfg_core2); + break; + default: + /* sorry, don't know about them */ + break; + } + break; + case 0xF: + switch (c->x86_model) { + case 0x0: + case 0x1: + case 0x2: /* Netburst */ + bts_configure(&bts_cfg_netburst); + break; + default: + /* sorry, don't know about them */ + break; + } + break; + default: + /* sorry, don't know about them */ + break; + } +} +#endif /* CONFIG_X86_PTRACE_BTS */ /* * Called by kernel/ptrace.c when detaching.. @@ -853,15 +947,15 @@ void ptrace_disable(struct task_struct *child) #ifdef TIF_SYSCALL_EMU clear_tsk_thread_flag(child, TIF_SYSCALL_EMU); #endif - if (child->thread.ds_area_msr) { -#ifdef X86_BTS - ptrace_bts_realloc(child, 0, 0); -#endif - child->thread.debugctlmsr &= ~ds_debugctl_mask(); - if (!child->thread.debugctlmsr) - clear_tsk_thread_flag(child, TIF_DEBUGCTLMSR); - clear_tsk_thread_flag(child, TIF_BTS_TRACE_TS); - } +#ifdef CONFIG_X86_PTRACE_BTS + (void)ds_release_bts(child); + + child->thread.debugctlmsr &= ~bts_cfg.debugctl_mask; + if (!child->thread.debugctlmsr) + clear_tsk_thread_flag(child, TIF_DEBUGCTLMSR); + + clear_tsk_thread_flag(child, TIF_BTS_TRACE_TS); +#endif /* CONFIG_X86_PTRACE_BTS */ } #if defined CONFIG_X86_32 || defined CONFIG_IA32_EMULATION @@ -981,7 +1075,7 @@ long arch_ptrace(struct task_struct *child, long request, long addr, long data) /* * These bits need more cooking - not enabled yet: */ -#ifdef X86_BTS +#ifdef CONFIG_X86_PTRACE_BTS case PTRACE_BTS_CONFIG: ret = ptrace_bts_config (child, data, (struct ptrace_bts_config __user *)addr); @@ -993,7 +1087,7 @@ long arch_ptrace(struct task_struct *child, long request, long addr, long data) break; case PTRACE_BTS_SIZE: - ret = ptrace_bts_get_size(child); + ret = ds_get_bts_index(child, /* pos = */ NULL); break; case PTRACE_BTS_GET: @@ -1002,14 +1096,14 @@ long arch_ptrace(struct task_struct *child, long request, long addr, long data) break; case PTRACE_BTS_CLEAR: - ret = ptrace_bts_clear(child); + ret = ds_clear_bts(child); break; case PTRACE_BTS_DRAIN: ret = ptrace_bts_drain (child, data, (struct bts_struct __user *) addr); break; -#endif +#endif /* CONFIG_X86_PTRACE_BTS */ default: ret = ptrace_request(child, request, addr, data); diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c index 362d4e7f2d38..9838f2539dfc 100644 --- a/arch/x86/kernel/setup.c +++ b/arch/x86/kernel/setup.c @@ -670,6 +670,10 @@ void __init setup_arch(char **cmdline_p) parse_early_param(); +#ifdef CONFIG_X86_64 + check_efer(); +#endif + #if defined(CONFIG_VMI) && defined(CONFIG_X86_32) /* * Must be before kernel pagetables are setup @@ -738,7 +742,6 @@ void __init setup_arch(char **cmdline_p) #else num_physpages = max_pfn; - check_efer(); /* How many end-of-memory variables you have, grandma! */ /* need this before calling reserve_initrd */ diff --git a/arch/x86/kernel/signal_32.c b/arch/x86/kernel/signal_32.c index 4445d26efd47..2a2435d3037d 100644 --- a/arch/x86/kernel/signal_32.c +++ b/arch/x86/kernel/signal_32.c @@ -27,6 +27,7 @@ #include <asm/uaccess.h> #include <asm/i387.h> #include <asm/vdso.h> +#include <asm/syscalls.h> #include "sigframe.h" diff --git a/arch/x86/kernel/signal_64.c b/arch/x86/kernel/signal_64.c index 1e1933892b4f..4d32487805ef 100644 --- a/arch/x86/kernel/signal_64.c +++ b/arch/x86/kernel/signal_64.c @@ -28,6 +28,7 @@ #include <asm/ia32_unistd.h> #include <asm/mce.h> #include <asm/syscall.h> +#include <asm/syscalls.h> #include "sigframe.h" #define _BLOCKABLE (~(sigmask(SIGKILL) | sigmask(SIGSTOP))) diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c index 7985c5b3f916..a66c93550a0d 100644 --- a/arch/x86/kernel/smpboot.c +++ b/arch/x86/kernel/smpboot.c @@ -88,7 +88,7 @@ static DEFINE_PER_CPU(struct task_struct *, idle_thread_array); #define get_idle_for_cpu(x) (per_cpu(idle_thread_array, x)) #define set_idle_for_cpu(x, p) (per_cpu(idle_thread_array, x) = (p)) #else -struct task_struct *idle_thread_array[NR_CPUS] __cpuinitdata ; +static struct task_struct *idle_thread_array[NR_CPUS] __cpuinitdata ; #define get_idle_for_cpu(x) (idle_thread_array[(x)]) #define set_idle_for_cpu(x, p) (idle_thread_array[(x)] = (p)) #endif @@ -129,7 +129,7 @@ static int boot_cpu_logical_apicid; static cpumask_t cpu_sibling_setup_map; /* Set if we find a B stepping CPU */ -int __cpuinitdata smp_b_stepping; +static int __cpuinitdata smp_b_stepping; #if defined(CONFIG_NUMA) && defined(CONFIG_X86_32) diff --git a/arch/x86/kernel/sys_i386_32.c b/arch/x86/kernel/sys_i386_32.c index 7066cb855a60..1884a8d12bfa 100644 --- a/arch/x86/kernel/sys_i386_32.c +++ b/arch/x86/kernel/sys_i386_32.c @@ -22,6 +22,8 @@ #include <linux/uaccess.h> #include <linux/unistd.h> +#include <asm/syscalls.h> + asmlinkage long sys_mmap2(unsigned long addr, unsigned long len, unsigned long prot, unsigned long flags, unsigned long fd, unsigned long pgoff) diff --git a/arch/x86/kernel/sys_x86_64.c b/arch/x86/kernel/sys_x86_64.c index 3b360ef33817..c9288c883e20 100644 --- a/arch/x86/kernel/sys_x86_64.c +++ b/arch/x86/kernel/sys_x86_64.c @@ -16,6 +16,7 @@ #include <asm/uaccess.h> #include <asm/ia32.h> +#include <asm/syscalls.h> asmlinkage long sys_mmap(unsigned long addr, unsigned long len, unsigned long prot, unsigned long flags, unsigned long fd, unsigned long off) diff --git a/arch/x86/kernel/syscall_64.c b/arch/x86/kernel/syscall_64.c index 170d43c17487..3d1be4f0fac5 100644 --- a/arch/x86/kernel/syscall_64.c +++ b/arch/x86/kernel/syscall_64.c @@ -8,12 +8,12 @@ #define __NO_STUBS #define __SYSCALL(nr, sym) extern asmlinkage void sym(void) ; -#undef _ASM_X86_64_UNISTD_H_ +#undef ASM_X86__UNISTD_64_H #include <asm/unistd_64.h> #undef __SYSCALL #define __SYSCALL(nr, sym) [nr] = sym, -#undef _ASM_X86_64_UNISTD_H_ +#undef ASM_X86__UNISTD_64_H typedef void (*sys_call_ptr_t)(void); diff --git a/arch/x86/kernel/time_32.c b/arch/x86/kernel/time_32.c index ffe3c664afc0..bbecf8b6bf96 100644 --- a/arch/x86/kernel/time_32.c +++ b/arch/x86/kernel/time_32.c @@ -36,6 +36,7 @@ #include <asm/arch_hooks.h> #include <asm/hpet.h> #include <asm/time.h> +#include <asm/timer.h> #include "do_timer.h" diff --git a/arch/x86/kernel/tls.c b/arch/x86/kernel/tls.c index ab6bf375a307..6bb7b8579e70 100644 --- a/arch/x86/kernel/tls.c +++ b/arch/x86/kernel/tls.c @@ -10,6 +10,7 @@ #include <asm/ldt.h> #include <asm/processor.h> #include <asm/proto.h> +#include <asm/syscalls.h> #include "tls.h" diff --git a/arch/x86/kernel/vm86_32.c b/arch/x86/kernel/vm86_32.c index 38f566fa27d2..4eeb5cf9720d 100644 --- a/arch/x86/kernel/vm86_32.c +++ b/arch/x86/kernel/vm86_32.c @@ -46,6 +46,7 @@ #include <asm/io.h> #include <asm/tlbflush.h> #include <asm/irq.h> +#include <asm/syscalls.h> /* * Known problems: diff --git a/arch/x86/kernel/vmi_32.c b/arch/x86/kernel/vmi_32.c index 6ca515d6db54..edfb09f30479 100644 --- a/arch/x86/kernel/vmi_32.c +++ b/arch/x86/kernel/vmi_32.c @@ -235,7 +235,7 @@ static void vmi_write_ldt_entry(struct desc_struct *dt, int entry, const void *desc) { u32 *ldt_entry = (u32 *)desc; - vmi_ops.write_idt_entry(dt, entry, ldt_entry[0], ldt_entry[1]); + vmi_ops.write_ldt_entry(dt, entry, ldt_entry[0], ldt_entry[1]); } static void vmi_load_sp0(struct tss_struct *tss, diff --git a/arch/x86/kernel/vsmp_64.c b/arch/x86/kernel/vsmp_64.c index 0c029e8959c7..7766d36983fc 100644 --- a/arch/x86/kernel/vsmp_64.c +++ b/arch/x86/kernel/vsmp_64.c @@ -61,7 +61,7 @@ static void vsmp_irq_enable(void) native_restore_fl((flags | X86_EFLAGS_IF) & (~X86_EFLAGS_AC)); } -static unsigned __init vsmp_patch(u8 type, u16 clobbers, void *ibuf, +static unsigned __init_or_module vsmp_patch(u8 type, u16 clobbers, void *ibuf, unsigned long addr, unsigned len) { switch (type) { diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index 0bfe2bd305eb..3da2508eb22a 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -711,6 +711,10 @@ static int kvm_age_rmapp(struct kvm *kvm, unsigned long *rmapp) u64 *spte; int young = 0; + /* always return old for EPT */ + if (!shadow_accessed_mask) + return 0; + spte = rmap_next(kvm, rmapp, NULL); while (spte) { int _young; diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c index e2ee264740c7..8233b86c778c 100644 --- a/arch/x86/kvm/svm.c +++ b/arch/x86/kvm/svm.c @@ -62,6 +62,7 @@ static int npt = 1; module_param(npt, int, S_IRUGO); static void kvm_reput_irq(struct vcpu_svm *svm); +static void svm_flush_tlb(struct kvm_vcpu *vcpu); static inline struct vcpu_svm *to_svm(struct kvm_vcpu *vcpu) { @@ -878,6 +879,10 @@ set: static void svm_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4) { unsigned long host_cr4_mce = read_cr4() & X86_CR4_MCE; + unsigned long old_cr4 = to_svm(vcpu)->vmcb->save.cr4; + + if (npt_enabled && ((old_cr4 ^ cr4) & X86_CR4_PGE)) + force_new_asid(vcpu); vcpu->arch.cr4 = cr4; if (!npt_enabled) @@ -1027,6 +1032,13 @@ static int pf_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run) KVMTRACE_3D(TDP_FAULT, &svm->vcpu, error_code, (u32)fault_address, (u32)(fault_address >> 32), handler); + /* + * FIXME: Tis shouldn't be necessary here, but there is a flush + * missing in the MMU code. Until we find this bug, flush the + * complete TLB here on an NPF + */ + if (npt_enabled) + svm_flush_tlb(&svm->vcpu); if (event_injection) kvm_mmu_unprotect_page_virt(&svm->vcpu, fault_address); diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index 2a69773e3b26..7041cc52b562 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -3301,8 +3301,7 @@ static int __init vmx_init(void) kvm_mmu_set_base_ptes(VMX_EPT_READABLE_MASK | VMX_EPT_WRITABLE_MASK | VMX_EPT_DEFAULT_MT << VMX_EPT_MT_EPTE_SHIFT); - kvm_mmu_set_mask_ptes(0ull, VMX_EPT_FAKE_ACCESSED_MASK, - VMX_EPT_FAKE_DIRTY_MASK, 0ull, + kvm_mmu_set_mask_ptes(0ull, 0ull, 0ull, 0ull, VMX_EPT_EXECUTABLE_MASK); kvm_enable_tdp(); } else diff --git a/arch/x86/kvm/vmx.h b/arch/x86/kvm/vmx.h index 425a13436b3f..23e8373507ad 100644 --- a/arch/x86/kvm/vmx.h +++ b/arch/x86/kvm/vmx.h @@ -370,8 +370,6 @@ enum vmcs_field { #define VMX_EPT_READABLE_MASK 0x1ull #define VMX_EPT_WRITABLE_MASK 0x2ull #define VMX_EPT_EXECUTABLE_MASK 0x4ull -#define VMX_EPT_FAKE_ACCESSED_MASK (1ull << 62) -#define VMX_EPT_FAKE_DIRTY_MASK (1ull << 63) #define VMX_EPT_IDENTITY_PAGETABLE_ADDR 0xfffbc000ul diff --git a/arch/x86/mach-default/setup.c b/arch/x86/mach-default/setup.c index 3d317836be9e..3f2cf11f201a 100644 --- a/arch/x86/mach-default/setup.c +++ b/arch/x86/mach-default/setup.c @@ -10,13 +10,15 @@ #include <asm/e820.h> #include <asm/setup.h> +#include <mach_ipi.h> + #ifdef CONFIG_HOTPLUG_CPU #define DEFAULT_SEND_IPI (1) #else #define DEFAULT_SEND_IPI (0) #endif -int no_broadcast=DEFAULT_SEND_IPI; +int no_broadcast = DEFAULT_SEND_IPI; /** * pre_intr_init_hook - initialisation prior to setting up interrupt vectors diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c index 455f3fe67b42..8f92cac4e6db 100644 --- a/arch/x86/mm/fault.c +++ b/arch/x86/mm/fault.c @@ -35,6 +35,7 @@ #include <asm/tlbflush.h> #include <asm/proto.h> #include <asm-generic/sections.h> +#include <asm/traps.h> /* * Page fault error code bits @@ -357,8 +358,6 @@ static int is_errata100(struct pt_regs *regs, unsigned long address) return 0; } -void do_invalid_op(struct pt_regs *, unsigned long); - static int is_f00f_bug(struct pt_regs *regs, unsigned long address) { #ifdef CONFIG_X86_F00F_BUG diff --git a/arch/x86/mm/init_32.c b/arch/x86/mm/init_32.c index d37f29376b0c..6b9a9358b330 100644 --- a/arch/x86/mm/init_32.c +++ b/arch/x86/mm/init_32.c @@ -47,6 +47,7 @@ #include <asm/paravirt.h> #include <asm/setup.h> #include <asm/cacheflush.h> +#include <asm/smp.h> unsigned int __VMALLOC_RESERVE = 128 << 20; @@ -458,11 +459,7 @@ static void __init pagetable_init(void) { pgd_t *pgd_base = swapper_pg_dir; - paravirt_pagetable_setup_start(pgd_base); - permanent_kmaps_init(pgd_base); - - paravirt_pagetable_setup_done(pgd_base); } #ifdef CONFIG_ACPI_SLEEP diff --git a/arch/x86/mm/ioremap.c b/arch/x86/mm/ioremap.c index d4b6e6a29ae3..cac6da54203b 100644 --- a/arch/x86/mm/ioremap.c +++ b/arch/x86/mm/ioremap.c @@ -421,7 +421,7 @@ void unxlate_dev_mem_ptr(unsigned long phys, void *addr) return; } -int __initdata early_ioremap_debug; +static int __initdata early_ioremap_debug; static int __init early_ioremap_debug_setup(char *str) { @@ -547,7 +547,7 @@ static inline void __init early_clear_fixmap(enum fixed_addresses idx) } -int __initdata early_ioremap_nested; +static int __initdata early_ioremap_nested; static int __init check_early_ioremap_leak(void) { diff --git a/arch/x86/oprofile/nmi_int.c b/arch/x86/oprofile/nmi_int.c index 0227694f7dab..8a5f1614a3d5 100644 --- a/arch/x86/oprofile/nmi_int.c +++ b/arch/x86/oprofile/nmi_int.c @@ -295,10 +295,12 @@ static void nmi_cpu_shutdown(void *dummy) static void nmi_shutdown(void) { - struct op_msrs *msrs = &get_cpu_var(cpu_msrs); + struct op_msrs *msrs; + nmi_enabled = 0; on_each_cpu(nmi_cpu_shutdown, NULL, 1); unregister_die_notifier(&profile_exceptions_nb); + msrs = &get_cpu_var(cpu_msrs); model->shutdown(msrs); free_msrs(); put_cpu_var(cpu_msrs); diff --git a/arch/x86/xen/enlighten.c b/arch/x86/xen/enlighten.c index 9ff6e3cbf08f..a4e201b47f64 100644 --- a/arch/x86/xen/enlighten.c +++ b/arch/x86/xen/enlighten.c @@ -1324,7 +1324,7 @@ static const struct pv_mmu_ops xen_mmu_ops __initdata = { .ptep_modify_prot_commit = __ptep_modify_prot_commit, .pte_val = xen_pte_val, - .pte_flags = native_pte_val, + .pte_flags = native_pte_flags, .pgd_val = xen_pgd_val, .make_pte = xen_make_pte, diff --git a/arch/x86/xen/setup.c b/arch/x86/xen/setup.c index b6acc3a0af46..d67901083888 100644 --- a/arch/x86/xen/setup.c +++ b/arch/x86/xen/setup.c @@ -42,7 +42,7 @@ char * __init xen_memory_setup(void) e820.nr_map = 0; - e820_add_region(0, PFN_PHYS(max_pfn), E820_RAM); + e820_add_region(0, PFN_PHYS((u64)max_pfn), E820_RAM); /* * Even though this is normal, usable memory under Xen, reserve |