summaryrefslogtreecommitdiff
path: root/arch/x86
diff options
context:
space:
mode:
Diffstat (limited to 'arch/x86')
-rw-r--r--arch/x86/Kconfig53
-rw-r--r--arch/x86/Kconfig.cpu20
-rw-r--r--arch/x86/Kconfig.debug11
-rw-r--r--arch/x86/Makefile5
-rw-r--r--arch/x86/include/asm/alternative-asm.h4
-rw-r--r--arch/x86/include/asm/alternative.h11
-rw-r--r--arch/x86/include/asm/amd_iommu_types.h34
-rw-r--r--arch/x86/include/asm/apic.h13
-rw-r--r--arch/x86/include/asm/atomic.h25
-rw-r--r--arch/x86/include/asm/atomic64_32.h278
-rw-r--r--arch/x86/include/asm/atomic64_64.h25
-rw-r--r--arch/x86/include/asm/boot.h2
-rw-r--r--arch/x86/include/asm/cmpxchg_32.h3
-rw-r--r--arch/x86/include/asm/cpufeature.h1
-rw-r--r--arch/x86/include/asm/ds.h302
-rw-r--r--arch/x86/include/asm/dwarf2.h12
-rw-r--r--arch/x86/include/asm/hardirq.h2
-rw-r--r--arch/x86/include/asm/hw_breakpoint.h10
-rw-r--r--arch/x86/include/asm/hyperv.h11
-rw-r--r--arch/x86/include/asm/hypervisor.h27
-rw-r--r--arch/x86/include/asm/i8253.h2
-rw-r--r--arch/x86/include/asm/insn.h2
-rw-r--r--arch/x86/include/asm/io.h1
-rw-r--r--arch/x86/include/asm/k8.h5
-rw-r--r--arch/x86/include/asm/kprobes.h2
-rw-r--r--arch/x86/include/asm/mshyperv.h14
-rw-r--r--arch/x86/include/asm/msr-index.h15
-rw-r--r--arch/x86/include/asm/percpu.h24
-rw-r--r--arch/x86/include/asm/perf_event.h76
-rw-r--r--arch/x86/include/asm/perf_event_p4.h794
-rw-r--r--arch/x86/include/asm/processor.h39
-rw-r--r--arch/x86/include/asm/ptrace-abi.h57
-rw-r--r--arch/x86/include/asm/ptrace.h6
-rw-r--r--arch/x86/include/asm/thread_info.h8
-rw-r--r--arch/x86/include/asm/vmware.h27
-rw-r--r--arch/x86/kernel/Makefile2
-rw-r--r--arch/x86/kernel/alternative.c47
-rw-r--r--arch/x86/kernel/amd_iommu.c197
-rw-r--r--arch/x86/kernel/amd_iommu_init.c6
-rw-r--r--arch/x86/kernel/apic/io_apic.c3
-rw-r--r--arch/x86/kernel/apm_32.c4
-rw-r--r--arch/x86/kernel/cpu/Makefile2
-rw-r--r--arch/x86/kernel/cpu/addon_cpuid_features.c14
-rw-r--r--arch/x86/kernel/cpu/cpufreq/Makefile4
-rw-r--r--arch/x86/kernel/cpu/cpufreq/acpi-cpufreq.c44
-rw-r--r--arch/x86/kernel/cpu/cpufreq/mperf.c51
-rw-r--r--arch/x86/kernel/cpu/cpufreq/mperf.h9
-rw-r--r--arch/x86/kernel/cpu/cpufreq/powernow-k8.c172
-rw-r--r--arch/x86/kernel/cpu/cpufreq/powernow-k8.h2
-rw-r--r--arch/x86/kernel/cpu/hypervisor.c52
-rw-r--r--arch/x86/kernel/cpu/intel.c8
-rw-r--r--arch/x86/kernel/cpu/intel_cacheinfo.c181
-rw-r--r--arch/x86/kernel/cpu/mcheck/mce.c4
-rw-r--r--arch/x86/kernel/cpu/mshyperv.c55
-rw-r--r--arch/x86/kernel/cpu/perf_event.c815
-rw-r--r--arch/x86/kernel/cpu/perf_event_amd.c46
-rw-r--r--arch/x86/kernel/cpu/perf_event_intel.c357
-rw-r--r--arch/x86/kernel/cpu/perf_event_intel_ds.c641
-rw-r--r--arch/x86/kernel/cpu/perf_event_intel_lbr.c218
-rw-r--r--arch/x86/kernel/cpu/perf_event_p4.c857
-rw-r--r--arch/x86/kernel/cpu/perf_event_p6.c31
-rw-r--r--arch/x86/kernel/cpu/vmware.c38
-rw-r--r--arch/x86/kernel/ds.c1437
-rw-r--r--arch/x86/kernel/ds_selftest.c408
-rw-r--r--arch/x86/kernel/ds_selftest.h15
-rw-r--r--arch/x86/kernel/dumpstack.c5
-rw-r--r--arch/x86/kernel/hw_breakpoint.c41
-rw-r--r--arch/x86/kernel/i8253.c14
-rw-r--r--arch/x86/kernel/kprobes.c43
-rw-r--r--arch/x86/kernel/process.c29
-rw-r--r--arch/x86/kernel/process_32.c8
-rw-r--r--arch/x86/kernel/process_64.c8
-rw-r--r--arch/x86/kernel/ptrace.c384
-rw-r--r--arch/x86/kernel/step.c46
-rw-r--r--arch/x86/kernel/traps.c4
-rw-r--r--arch/x86/kernel/x8664_ksyms_64.c1
-rw-r--r--arch/x86/kvm/svm.c8
-rw-r--r--arch/x86/kvm/vmx.c8
-rw-r--r--arch/x86/kvm/x86.c54
-rw-r--r--arch/x86/kvm/x86.h3
-rw-r--r--arch/x86/lib/Makefile5
-rw-r--r--arch/x86/lib/atomic64_32.c273
-rw-r--r--arch/x86/lib/atomic64_386_32.S174
-rw-r--r--arch/x86/lib/atomic64_cx8_32.S224
-rw-r--r--arch/x86/lib/rwsem_64.S2
-rw-r--r--arch/x86/mm/ioremap.c14
-rw-r--r--arch/x86/mm/pgtable_32.c2
-rw-r--r--arch/x86/mm/srat_64.c3
-rw-r--r--arch/x86/oprofile/nmi_int.c199
-rw-r--r--arch/x86/oprofile/op_model_amd.c280
-rw-r--r--arch/x86/oprofile/op_model_p4.c52
-rw-r--r--arch/x86/oprofile/op_model_ppro.c81
-rw-r--r--arch/x86/oprofile/op_x86_model.h4
-rw-r--r--arch/x86/pci/mrst.c4
94 files changed, 5215 insertions, 4399 deletions
diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
index 9458685902bd..7b7838874552 100644
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -53,11 +53,15 @@ config X86
select HAVE_KERNEL_LZMA
select HAVE_KERNEL_LZO
select HAVE_HW_BREAKPOINT
+ select HAVE_MIXED_BREAKPOINTS_REGS
select PERF_EVENTS
select ANON_INODES
select HAVE_ARCH_KMEMCHECK
select HAVE_USER_RETURN_NOTIFIER
+config INSTRUCTION_DECODER
+ def_bool (KPROBES || PERF_EVENTS)
+
config OUTPUT_FORMAT
string
default "elf32-i386" if X86_32
@@ -197,20 +201,17 @@ config HAVE_INTEL_TXT
# Use the generic interrupt handling code in kernel/irq/:
config GENERIC_HARDIRQS
- bool
- default y
+ def_bool y
config GENERIC_HARDIRQS_NO__DO_IRQ
def_bool y
config GENERIC_IRQ_PROBE
- bool
- default y
+ def_bool y
config GENERIC_PENDING_IRQ
- bool
+ def_bool y
depends on GENERIC_HARDIRQS && SMP
- default y
config USE_GENERIC_SMP_HELPERS
def_bool y
@@ -225,14 +226,12 @@ config X86_64_SMP
depends on X86_64 && SMP
config X86_HT
- bool
+ def_bool y
depends on SMP
- default y
config X86_TRAMPOLINE
- bool
+ def_bool y
depends on SMP || (64BIT && ACPI_SLEEP)
- default y
config X86_32_LAZY_GS
def_bool y
@@ -447,7 +446,7 @@ config X86_NUMAQ
firmware with - send email to <Martin.Bligh@us.ibm.com>.
config X86_SUPPORTS_MEMORY_FAILURE
- bool
+ def_bool y
# MCE code calls memory_failure():
depends on X86_MCE
# On 32-bit this adds too big of NODES_SHIFT and we run out of page flags:
@@ -455,7 +454,6 @@ config X86_SUPPORTS_MEMORY_FAILURE
# On 32-bit SPARSEMEM adds too big of SECTIONS_WIDTH:
depends on X86_64 || !SPARSEMEM
select ARCH_SUPPORTS_MEMORY_FAILURE
- default y
config X86_VISWS
bool "SGI 320/540 (Visual Workstation)"
@@ -570,7 +568,6 @@ config PARAVIRT_SPINLOCKS
config PARAVIRT_CLOCK
bool
- default n
endif
@@ -749,7 +746,6 @@ config MAXSMP
bool "Configure Maximum number of SMP Processors and NUMA Nodes"
depends on X86_64 && SMP && DEBUG_KERNEL && EXPERIMENTAL
select CPUMASK_OFFSTACK
- default n
---help---
Configure maximum number of CPUS and NUMA Nodes for this architecture.
If unsure, say N.
@@ -829,7 +825,6 @@ config X86_VISWS_APIC
config X86_REROUTE_FOR_BROKEN_BOOT_IRQS
bool "Reroute for broken boot IRQs"
- default n
depends on X86_IO_APIC
---help---
This option enables a workaround that fixes a source of
@@ -876,9 +871,8 @@ config X86_MCE_AMD
the DRAM Error Threshold.
config X86_ANCIENT_MCE
- def_bool n
+ bool "Support for old Pentium 5 / WinChip machine checks"
depends on X86_32 && X86_MCE
- prompt "Support for old Pentium 5 / WinChip machine checks"
---help---
Include support for machine check handling on old Pentium 5 or WinChip
systems. These typically need to be enabled explicitely on the command
@@ -886,8 +880,7 @@ config X86_ANCIENT_MCE
config X86_MCE_THRESHOLD
depends on X86_MCE_AMD || X86_MCE_INTEL
- bool
- default y
+ def_bool y
config X86_MCE_INJECT
depends on X86_MCE
@@ -1026,8 +1019,8 @@ config X86_CPUID
choice
prompt "High Memory Support"
- default HIGHMEM4G if !X86_NUMAQ
default HIGHMEM64G if X86_NUMAQ
+ default HIGHMEM4G
depends on X86_32
config NOHIGHMEM
@@ -1285,7 +1278,7 @@ source "mm/Kconfig"
config HIGHPTE
bool "Allocate 3rd-level pagetables from highmem"
- depends on X86_32 && (HIGHMEM4G || HIGHMEM64G)
+ depends on HIGHMEM
---help---
The VM uses one page table entry for each page of physical memory.
For systems with a lot of RAM, this can be wasteful of precious
@@ -1369,8 +1362,7 @@ config MATH_EMULATION
kernel, it won't hurt.
config MTRR
- bool
- default y
+ def_bool y
prompt "MTRR (Memory Type Range Register) support" if EMBEDDED
---help---
On Intel P6 family processors (Pentium Pro, Pentium II and later)
@@ -1436,8 +1428,7 @@ config MTRR_SANITIZER_SPARE_REG_NR_DEFAULT
mtrr_spare_reg_nr=N on the kernel command line.
config X86_PAT
- bool
- default y
+ def_bool y
prompt "x86 PAT support" if EMBEDDED
depends on MTRR
---help---
@@ -1605,8 +1596,7 @@ config X86_NEED_RELOCS
depends on X86_32 && RELOCATABLE
config PHYSICAL_ALIGN
- hex
- prompt "Alignment value to which kernel should be aligned" if X86_32
+ hex "Alignment value to which kernel should be aligned" if X86_32
default "0x1000000"
range 0x2000 0x1000000
---help---
@@ -1653,7 +1643,6 @@ config COMPAT_VDSO
config CMDLINE_BOOL
bool "Built-in kernel command line"
- default n
---help---
Allow for specifying boot arguments to the kernel at
build time. On some systems (e.g. embedded ones), it is
@@ -1687,7 +1676,6 @@ config CMDLINE
config CMDLINE_OVERRIDE
bool "Built-in command line overrides boot loader arguments"
- default n
depends on CMDLINE_BOOL
---help---
Set this option to 'Y' to have the kernel ignore the boot loader
@@ -1723,8 +1711,7 @@ source "drivers/acpi/Kconfig"
source "drivers/sfi/Kconfig"
config X86_APM_BOOT
- bool
- default y
+ def_bool y
depends on APM || APM_MODULE
menuconfig APM
@@ -1953,8 +1940,7 @@ config DMAR_DEFAULT_ON
experimental.
config DMAR_BROKEN_GFX_WA
- def_bool n
- prompt "Workaround broken graphics drivers (going away soon)"
+ bool "Workaround broken graphics drivers (going away soon)"
depends on DMAR && BROKEN
---help---
Current Graphics drivers tend to use physical address
@@ -2052,7 +2038,6 @@ config SCx200HR_TIMER
config OLPC
bool "One Laptop Per Child support"
select GPIOLIB
- default n
---help---
Add support for detecting the unique features of the OLPC
XO hardware.
diff --git a/arch/x86/Kconfig.cpu b/arch/x86/Kconfig.cpu
index 6f6792c56015..2ac9069890cd 100644
--- a/arch/x86/Kconfig.cpu
+++ b/arch/x86/Kconfig.cpu
@@ -506,23 +506,3 @@ config CPU_SUP_UMC_32
CPU might render the kernel unbootable.
If unsure, say N.
-
-config X86_DS
- def_bool X86_PTRACE_BTS
- depends on X86_DEBUGCTLMSR
- select HAVE_HW_BRANCH_TRACER
-
-config X86_PTRACE_BTS
- bool "Branch Trace Store"
- default y
- depends on X86_DEBUGCTLMSR
- depends on BROKEN
- ---help---
- This adds a ptrace interface to the hardware's branch trace store.
-
- Debuggers may use it to collect an execution trace of the debugged
- application in order to answer the question 'how did I get here?'.
- Debuggers may trace user mode as well as kernel mode.
-
- Say Y unless there is no application development on this machine
- and you want to save a small amount of code size.
diff --git a/arch/x86/Kconfig.debug b/arch/x86/Kconfig.debug
index bc01e3ebfeb2..75085080b63e 100644
--- a/arch/x86/Kconfig.debug
+++ b/arch/x86/Kconfig.debug
@@ -45,7 +45,6 @@ config EARLY_PRINTK
config EARLY_PRINTK_DBGP
bool "Early printk via EHCI debug port"
- default n
depends on EARLY_PRINTK && PCI
---help---
Write kernel log output directly into the EHCI debug port.
@@ -76,7 +75,6 @@ config DEBUG_PER_CPU_MAPS
bool "Debug access to per_cpu maps"
depends on DEBUG_KERNEL
depends on SMP
- default n
---help---
Say Y to verify that the per_cpu map being accessed has
been setup. Adds a fair amount of code to kernel memory
@@ -174,15 +172,6 @@ config IOMMU_LEAK
Add a simple leak tracer to the IOMMU code. This is useful when you
are debugging a buggy device driver that leaks IOMMU mappings.
-config X86_DS_SELFTEST
- bool "DS selftest"
- default y
- depends on DEBUG_KERNEL
- depends on X86_DS
- ---help---
- Perform Debug Store selftests at boot time.
- If in doubt, say "N".
-
config HAVE_MMIOTRACE_SUPPORT
def_bool y
diff --git a/arch/x86/Makefile b/arch/x86/Makefile
index 0a43dc515e4c..8aa1b59b9074 100644
--- a/arch/x86/Makefile
+++ b/arch/x86/Makefile
@@ -95,8 +95,9 @@ sp-$(CONFIG_X86_64) := rsp
cfi := $(call as-instr,.cfi_startproc\n.cfi_rel_offset $(sp-y)$(comma)0\n.cfi_endproc,-DCONFIG_AS_CFI=1)
# is .cfi_signal_frame supported too?
cfi-sigframe := $(call as-instr,.cfi_startproc\n.cfi_signal_frame\n.cfi_endproc,-DCONFIG_AS_CFI_SIGNAL_FRAME=1)
-KBUILD_AFLAGS += $(cfi) $(cfi-sigframe)
-KBUILD_CFLAGS += $(cfi) $(cfi-sigframe)
+cfi-sections := $(call as-instr,.cfi_sections .debug_frame,-DCONFIG_AS_CFI_SECTIONS=1)
+KBUILD_AFLAGS += $(cfi) $(cfi-sigframe) $(cfi-sections)
+KBUILD_CFLAGS += $(cfi) $(cfi-sigframe) $(cfi-sections)
LDFLAGS := -m elf_$(UTS_MACHINE)
diff --git a/arch/x86/include/asm/alternative-asm.h b/arch/x86/include/asm/alternative-asm.h
index b97f786a48d5..a63a68be1cce 100644
--- a/arch/x86/include/asm/alternative-asm.h
+++ b/arch/x86/include/asm/alternative-asm.h
@@ -6,8 +6,8 @@
.macro LOCK_PREFIX
1: lock
.section .smp_locks,"a"
- _ASM_ALIGN
- _ASM_PTR 1b
+ .balign 4
+ .long 1b - .
.previous
.endm
#else
diff --git a/arch/x86/include/asm/alternative.h b/arch/x86/include/asm/alternative.h
index b09ec55650b3..92a9033c14d1 100644
--- a/arch/x86/include/asm/alternative.h
+++ b/arch/x86/include/asm/alternative.h
@@ -28,14 +28,17 @@
*/
#ifdef CONFIG_SMP
-#define LOCK_PREFIX \
+#define LOCK_PREFIX_HERE \
".section .smp_locks,\"a\"\n" \
- _ASM_ALIGN "\n" \
- _ASM_PTR "661f\n" /* address */ \
+ ".balign 4\n" \
+ ".long 671f - .\n" /* offset */ \
".previous\n" \
- "661:\n\tlock; "
+ "671:"
+
+#define LOCK_PREFIX LOCK_PREFIX_HERE "\n\tlock; "
#else /* ! CONFIG_SMP */
+#define LOCK_PREFIX_HERE ""
#define LOCK_PREFIX ""
#endif
diff --git a/arch/x86/include/asm/amd_iommu_types.h b/arch/x86/include/asm/amd_iommu_types.h
index 86a0ff0aeac7..7014e88bc779 100644
--- a/arch/x86/include/asm/amd_iommu_types.h
+++ b/arch/x86/include/asm/amd_iommu_types.h
@@ -174,6 +174,40 @@
(~((1ULL << (12 + ((lvl) * 9))) - 1)))
#define PM_ALIGNED(lvl, addr) ((PM_MAP_MASK(lvl) & (addr)) == (addr))
+/*
+ * Returns the page table level to use for a given page size
+ * Pagesize is expected to be a power-of-two
+ */
+#define PAGE_SIZE_LEVEL(pagesize) \
+ ((__ffs(pagesize) - 12) / 9)
+/*
+ * Returns the number of ptes to use for a given page size
+ * Pagesize is expected to be a power-of-two
+ */
+#define PAGE_SIZE_PTE_COUNT(pagesize) \
+ (1ULL << ((__ffs(pagesize) - 12) % 9))
+
+/*
+ * Aligns a given io-virtual address to a given page size
+ * Pagesize is expected to be a power-of-two
+ */
+#define PAGE_SIZE_ALIGN(address, pagesize) \
+ ((address) & ~((pagesize) - 1))
+/*
+ * Creates an IOMMU PTE for an address an a given pagesize
+ * The PTE has no permission bits set
+ * Pagesize is expected to be a power-of-two larger than 4096
+ */
+#define PAGE_SIZE_PTE(address, pagesize) \
+ (((address) | ((pagesize) - 1)) & \
+ (~(pagesize >> 1)) & PM_ADDR_MASK)
+
+/*
+ * Takes a PTE value with mode=0x07 and returns the page size it maps
+ */
+#define PTE_PAGE_SIZE(pte) \
+ (1ULL << (1 + ffz(((pte) | 0xfffULL))))
+
#define IOMMU_PTE_P (1ULL << 0)
#define IOMMU_PTE_TV (1ULL << 1)
#define IOMMU_PTE_U (1ULL << 59)
diff --git a/arch/x86/include/asm/apic.h b/arch/x86/include/asm/apic.h
index b4ac2cdcb64f..1fa03e04ae44 100644
--- a/arch/x86/include/asm/apic.h
+++ b/arch/x86/include/asm/apic.h
@@ -373,6 +373,7 @@ extern atomic_t init_deasserted;
extern int wakeup_secondary_cpu_via_nmi(int apicid, unsigned long start_eip);
#endif
+#ifdef CONFIG_X86_LOCAL_APIC
static inline u32 apic_read(u32 reg)
{
return apic->read(reg);
@@ -403,10 +404,19 @@ static inline u32 safe_apic_wait_icr_idle(void)
return apic->safe_wait_icr_idle();
}
+#else /* CONFIG_X86_LOCAL_APIC */
+
+static inline u32 apic_read(u32 reg) { return 0; }
+static inline void apic_write(u32 reg, u32 val) { }
+static inline u64 apic_icr_read(void) { return 0; }
+static inline void apic_icr_write(u32 low, u32 high) { }
+static inline void apic_wait_icr_idle(void) { }
+static inline u32 safe_apic_wait_icr_idle(void) { return 0; }
+
+#endif /* CONFIG_X86_LOCAL_APIC */
static inline void ack_APIC_irq(void)
{
-#ifdef CONFIG_X86_LOCAL_APIC
/*
* ack_APIC_irq() actually gets compiled as a single instruction
* ... yummie.
@@ -414,7 +424,6 @@ static inline void ack_APIC_irq(void)
/* Docs say use 0 for future compatibility */
apic_write(APIC_EOI, 0);
-#endif
}
static inline unsigned default_get_apic_id(unsigned long x)
diff --git a/arch/x86/include/asm/atomic.h b/arch/x86/include/asm/atomic.h
index 8f8217b9bdac..952a826ac4e5 100644
--- a/arch/x86/include/asm/atomic.h
+++ b/arch/x86/include/asm/atomic.h
@@ -22,7 +22,7 @@
*/
static inline int atomic_read(const atomic_t *v)
{
- return v->counter;
+ return (*(volatile int *)&(v)->counter);
}
/**
@@ -246,6 +246,29 @@ static inline int atomic_add_unless(atomic_t *v, int a, int u)
#define atomic_inc_not_zero(v) atomic_add_unless((v), 1, 0)
+/*
+ * atomic_dec_if_positive - decrement by 1 if old value positive
+ * @v: pointer of type atomic_t
+ *
+ * The function returns the old value of *v minus 1, even if
+ * the atomic variable, v, was not decremented.
+ */
+static inline int atomic_dec_if_positive(atomic_t *v)
+{
+ int c, old, dec;
+ c = atomic_read(v);
+ for (;;) {
+ dec = c - 1;
+ if (unlikely(dec < 0))
+ break;
+ old = atomic_cmpxchg((v), c, dec);
+ if (likely(old == c))
+ break;
+ c = old;
+ }
+ return dec;
+}
+
/**
* atomic_inc_short - increment of a short integer
* @v: pointer to type int
diff --git a/arch/x86/include/asm/atomic64_32.h b/arch/x86/include/asm/atomic64_32.h
index 03027bf28de5..2a934aa19a43 100644
--- a/arch/x86/include/asm/atomic64_32.h
+++ b/arch/x86/include/asm/atomic64_32.h
@@ -14,109 +14,193 @@ typedef struct {
#define ATOMIC64_INIT(val) { (val) }
-extern u64 atomic64_cmpxchg(atomic64_t *ptr, u64 old_val, u64 new_val);
+#ifdef CONFIG_X86_CMPXCHG64
+#define ATOMIC64_ALTERNATIVE_(f, g) "call atomic64_" #g "_cx8"
+#else
+#define ATOMIC64_ALTERNATIVE_(f, g) ALTERNATIVE("call atomic64_" #f "_386", "call atomic64_" #g "_cx8", X86_FEATURE_CX8)
+#endif
+
+#define ATOMIC64_ALTERNATIVE(f) ATOMIC64_ALTERNATIVE_(f, f)
+
+/**
+ * atomic64_cmpxchg - cmpxchg atomic64 variable
+ * @p: pointer to type atomic64_t
+ * @o: expected value
+ * @n: new value
+ *
+ * Atomically sets @v to @n if it was equal to @o and returns
+ * the old value.
+ */
+
+static inline long long atomic64_cmpxchg(atomic64_t *v, long long o, long long n)
+{
+ return cmpxchg64(&v->counter, o, n);
+}
/**
* atomic64_xchg - xchg atomic64 variable
- * @ptr: pointer to type atomic64_t
- * @new_val: value to assign
+ * @v: pointer to type atomic64_t
+ * @n: value to assign
*
- * Atomically xchgs the value of @ptr to @new_val and returns
+ * Atomically xchgs the value of @v to @n and returns
* the old value.
*/
-extern u64 atomic64_xchg(atomic64_t *ptr, u64 new_val);
+static inline long long atomic64_xchg(atomic64_t *v, long long n)
+{
+ long long o;
+ unsigned high = (unsigned)(n >> 32);
+ unsigned low = (unsigned)n;
+ asm volatile(ATOMIC64_ALTERNATIVE(xchg)
+ : "=A" (o), "+b" (low), "+c" (high)
+ : "S" (v)
+ : "memory"
+ );
+ return o;
+}
/**
* atomic64_set - set atomic64 variable
- * @ptr: pointer to type atomic64_t
- * @new_val: value to assign
+ * @v: pointer to type atomic64_t
+ * @n: value to assign
*
- * Atomically sets the value of @ptr to @new_val.
+ * Atomically sets the value of @v to @n.
*/
-extern void atomic64_set(atomic64_t *ptr, u64 new_val);
+static inline void atomic64_set(atomic64_t *v, long long i)
+{
+ unsigned high = (unsigned)(i >> 32);
+ unsigned low = (unsigned)i;
+ asm volatile(ATOMIC64_ALTERNATIVE(set)
+ : "+b" (low), "+c" (high)
+ : "S" (v)
+ : "eax", "edx", "memory"
+ );
+}
/**
* atomic64_read - read atomic64 variable
- * @ptr: pointer to type atomic64_t
+ * @v: pointer to type atomic64_t
*
- * Atomically reads the value of @ptr and returns it.
+ * Atomically reads the value of @v and returns it.
*/
-static inline u64 atomic64_read(atomic64_t *ptr)
+static inline long long atomic64_read(atomic64_t *v)
{
- u64 res;
-
- /*
- * Note, we inline this atomic64_t primitive because
- * it only clobbers EAX/EDX and leaves the others
- * untouched. We also (somewhat subtly) rely on the
- * fact that cmpxchg8b returns the current 64-bit value
- * of the memory location we are touching:
- */
- asm volatile(
- "mov %%ebx, %%eax\n\t"
- "mov %%ecx, %%edx\n\t"
- LOCK_PREFIX "cmpxchg8b %1\n"
- : "=&A" (res)
- : "m" (*ptr)
- );
-
- return res;
-}
-
-extern u64 atomic64_read(atomic64_t *ptr);
+ long long r;
+ asm volatile(ATOMIC64_ALTERNATIVE(read)
+ : "=A" (r), "+c" (v)
+ : : "memory"
+ );
+ return r;
+ }
/**
* atomic64_add_return - add and return
- * @delta: integer value to add
- * @ptr: pointer to type atomic64_t
+ * @i: integer value to add
+ * @v: pointer to type atomic64_t
*
- * Atomically adds @delta to @ptr and returns @delta + *@ptr
+ * Atomically adds @i to @v and returns @i + *@v
*/
-extern u64 atomic64_add_return(u64 delta, atomic64_t *ptr);
+static inline long long atomic64_add_return(long long i, atomic64_t *v)
+{
+ asm volatile(ATOMIC64_ALTERNATIVE(add_return)
+ : "+A" (i), "+c" (v)
+ : : "memory"
+ );
+ return i;
+}
/*
* Other variants with different arithmetic operators:
*/
-extern u64 atomic64_sub_return(u64 delta, atomic64_t *ptr);
-extern u64 atomic64_inc_return(atomic64_t *ptr);
-extern u64 atomic64_dec_return(atomic64_t *ptr);
+static inline long long atomic64_sub_return(long long i, atomic64_t *v)
+{
+ asm volatile(ATOMIC64_ALTERNATIVE(sub_return)
+ : "+A" (i), "+c" (v)
+ : : "memory"
+ );
+ return i;
+}
+
+static inline long long atomic64_inc_return(atomic64_t *v)
+{
+ long long a;
+ asm volatile(ATOMIC64_ALTERNATIVE(inc_return)
+ : "=A" (a)
+ : "S" (v)
+ : "memory", "ecx"
+ );
+ return a;
+}
+
+static inline long long atomic64_dec_return(atomic64_t *v)
+{
+ long long a;
+ asm volatile(ATOMIC64_ALTERNATIVE(dec_return)
+ : "=A" (a)
+ : "S" (v)
+ : "memory", "ecx"
+ );
+ return a;
+}
/**
* atomic64_add - add integer to atomic64 variable
- * @delta: integer value to add
- * @ptr: pointer to type atomic64_t
+ * @i: integer value to add
+ * @v: pointer to type atomic64_t
*
- * Atomically adds @delta to @ptr.
+ * Atomically adds @i to @v.
*/
-extern void atomic64_add(u64 delta, atomic64_t *ptr);
+static inline long long atomic64_add(long long i, atomic64_t *v)
+{
+ asm volatile(ATOMIC64_ALTERNATIVE_(add, add_return)
+ : "+A" (i), "+c" (v)
+ : : "memory"
+ );
+ return i;
+}
/**
* atomic64_sub - subtract the atomic64 variable
- * @delta: integer value to subtract
- * @ptr: pointer to type atomic64_t
+ * @i: integer value to subtract
+ * @v: pointer to type atomic64_t
*
- * Atomically subtracts @delta from @ptr.
+ * Atomically subtracts @i from @v.
*/
-extern void atomic64_sub(u64 delta, atomic64_t *ptr);
+static inline long long atomic64_sub(long long i, atomic64_t *v)
+{
+ asm volatile(ATOMIC64_ALTERNATIVE_(sub, sub_return)
+ : "+A" (i), "+c" (v)
+ : : "memory"
+ );
+ return i;
+}
/**
* atomic64_sub_and_test - subtract value from variable and test result
- * @delta: integer value to subtract
- * @ptr: pointer to type atomic64_t
- *
- * Atomically subtracts @delta from @ptr and returns
+ * @i: integer value to subtract
+ * @v: pointer to type atomic64_t
+ *
+ * Atomically subtracts @i from @v and returns
* true if the result is zero, or false for all
* other cases.
*/
-extern int atomic64_sub_and_test(u64 delta, atomic64_t *ptr);
+static inline int atomic64_sub_and_test(long long i, atomic64_t *v)
+{
+ return atomic64_sub_return(i, v) == 0;
+}
/**
* atomic64_inc - increment atomic64 variable
- * @ptr: pointer to type atomic64_t
+ * @v: pointer to type atomic64_t
*
- * Atomically increments @ptr by 1.
+ * Atomically increments @v by 1.
*/
-extern void atomic64_inc(atomic64_t *ptr);
+static inline void atomic64_inc(atomic64_t *v)
+{
+ asm volatile(ATOMIC64_ALTERNATIVE_(inc, inc_return)
+ : : "S" (v)
+ : "memory", "eax", "ecx", "edx"
+ );
+}
/**
* atomic64_dec - decrement atomic64 variable
@@ -124,37 +208,97 @@ extern void atomic64_inc(atomic64_t *ptr);
*
* Atomically decrements @ptr by 1.
*/
-extern void atomic64_dec(atomic64_t *ptr);
+static inline void atomic64_dec(atomic64_t *v)
+{
+ asm volatile(ATOMIC64_ALTERNATIVE_(dec, dec_return)
+ : : "S" (v)
+ : "memory", "eax", "ecx", "edx"
+ );
+}
/**
* atomic64_dec_and_test - decrement and test
- * @ptr: pointer to type atomic64_t
+ * @v: pointer to type atomic64_t
*
- * Atomically decrements @ptr by 1 and
+ * Atomically decrements @v by 1 and
* returns true if the result is 0, or false for all other
* cases.
*/
-extern int atomic64_dec_and_test(atomic64_t *ptr);
+static inline int atomic64_dec_and_test(atomic64_t *v)
+{
+ return atomic64_dec_return(v) == 0;
+}
/**
* atomic64_inc_and_test - increment and test
- * @ptr: pointer to type atomic64_t
+ * @v: pointer to type atomic64_t
*
- * Atomically increments @ptr by 1
+ * Atomically increments @v by 1
* and returns true if the result is zero, or false for all
* other cases.
*/
-extern int atomic64_inc_and_test(atomic64_t *ptr);
+static inline int atomic64_inc_and_test(atomic64_t *v)
+{
+ return atomic64_inc_return(v) == 0;
+}
/**
* atomic64_add_negative - add and test if negative
- * @delta: integer value to add
- * @ptr: pointer to type atomic64_t
+ * @i: integer value to add
+ * @v: pointer to type atomic64_t
*
- * Atomically adds @delta to @ptr and returns true
+ * Atomically adds @i to @v and returns true
* if the result is negative, or false when
* result is greater than or equal to zero.
*/
-extern int atomic64_add_negative(u64 delta, atomic64_t *ptr);
+static inline int atomic64_add_negative(long long i, atomic64_t *v)
+{
+ return atomic64_add_return(i, v) < 0;
+}
+
+/**
+ * atomic64_add_unless - add unless the number is a given value
+ * @v: pointer of type atomic64_t
+ * @a: the amount to add to v...
+ * @u: ...unless v is equal to u.
+ *
+ * Atomically adds @a to @v, so long as it was not @u.
+ * Returns non-zero if @v was not @u, and zero otherwise.
+ */
+static inline int atomic64_add_unless(atomic64_t *v, long long a, long long u)
+{
+ unsigned low = (unsigned)u;
+ unsigned high = (unsigned)(u >> 32);
+ asm volatile(ATOMIC64_ALTERNATIVE(add_unless) "\n\t"
+ : "+A" (a), "+c" (v), "+S" (low), "+D" (high)
+ : : "memory");
+ return (int)a;
+}
+
+
+static inline int atomic64_inc_not_zero(atomic64_t *v)
+{
+ int r;
+ asm volatile(ATOMIC64_ALTERNATIVE(inc_not_zero)
+ : "=a" (r)
+ : "S" (v)
+ : "ecx", "edx", "memory"
+ );
+ return r;
+}
+
+static inline long long atomic64_dec_if_positive(atomic64_t *v)
+{
+ long long r;
+ asm volatile(ATOMIC64_ALTERNATIVE(dec_if_positive)
+ : "=A" (r)
+ : "S" (v)
+ : "ecx", "memory"
+ );
+ return r;
+}
+
+#undef ATOMIC64_ALTERNATIVE
+#undef ATOMIC64_ALTERNATIVE_
#endif /* _ASM_X86_ATOMIC64_32_H */
diff --git a/arch/x86/include/asm/atomic64_64.h b/arch/x86/include/asm/atomic64_64.h
index 51c5b4056929..49fd1ea22951 100644
--- a/arch/x86/include/asm/atomic64_64.h
+++ b/arch/x86/include/asm/atomic64_64.h
@@ -18,7 +18,7 @@
*/
static inline long atomic64_read(const atomic64_t *v)
{
- return v->counter;
+ return (*(volatile long *)&(v)->counter);
}
/**
@@ -221,4 +221,27 @@ static inline int atomic64_add_unless(atomic64_t *v, long a, long u)
#define atomic64_inc_not_zero(v) atomic64_add_unless((v), 1, 0)
+/*
+ * atomic64_dec_if_positive - decrement by 1 if old value positive
+ * @v: pointer of type atomic_t
+ *
+ * The function returns the old value of *v minus 1, even if
+ * the atomic variable, v, was not decremented.
+ */
+static inline long atomic64_dec_if_positive(atomic64_t *v)
+{
+ long c, old, dec;
+ c = atomic64_read(v);
+ for (;;) {
+ dec = c - 1;
+ if (unlikely(dec < 0))
+ break;
+ old = atomic64_cmpxchg((v), c, dec);
+ if (likely(old == c))
+ break;
+ c = old;
+ }
+ return dec;
+}
+
#endif /* _ASM_X86_ATOMIC64_64_H */
diff --git a/arch/x86/include/asm/boot.h b/arch/x86/include/asm/boot.h
index 7a1065958ba9..3b62ab56c7a0 100644
--- a/arch/x86/include/asm/boot.h
+++ b/arch/x86/include/asm/boot.h
@@ -24,7 +24,7 @@
#define MIN_KERNEL_ALIGN (_AC(1, UL) << MIN_KERNEL_ALIGN_LG2)
#if (CONFIG_PHYSICAL_ALIGN & (CONFIG_PHYSICAL_ALIGN-1)) || \
- (CONFIG_PHYSICAL_ALIGN < (_AC(1, UL) << MIN_KERNEL_ALIGN_LG2))
+ (CONFIG_PHYSICAL_ALIGN < MIN_KERNEL_ALIGN)
#error "Invalid value for CONFIG_PHYSICAL_ALIGN"
#endif
diff --git a/arch/x86/include/asm/cmpxchg_32.h b/arch/x86/include/asm/cmpxchg_32.h
index ffb9bb6b6c37..8859e12dd3cf 100644
--- a/arch/x86/include/asm/cmpxchg_32.h
+++ b/arch/x86/include/asm/cmpxchg_32.h
@@ -271,7 +271,8 @@ extern unsigned long long cmpxchg_486_u64(volatile void *, u64, u64);
__typeof__(*(ptr)) __ret; \
__typeof__(*(ptr)) __old = (o); \
__typeof__(*(ptr)) __new = (n); \
- alternative_io("call cmpxchg8b_emu", \
+ alternative_io(LOCK_PREFIX_HERE \
+ "call cmpxchg8b_emu", \
"lock; cmpxchg8b (%%esi)" , \
X86_FEATURE_CX8, \
"=A" (__ret), \
diff --git a/arch/x86/include/asm/cpufeature.h b/arch/x86/include/asm/cpufeature.h
index 9b11a5cc6662..dca9c545f44e 100644
--- a/arch/x86/include/asm/cpufeature.h
+++ b/arch/x86/include/asm/cpufeature.h
@@ -161,6 +161,7 @@
*/
#define X86_FEATURE_IDA (7*32+ 0) /* Intel Dynamic Acceleration */
#define X86_FEATURE_ARAT (7*32+ 1) /* Always Running APIC Timer */
+#define X86_FEATURE_CPB (7*32+ 2) /* AMD Core Performance Boost */
/* Virtualization flags: Linux defined */
#define X86_FEATURE_TPR_SHADOW (8*32+ 0) /* Intel TPR Shadow */
diff --git a/arch/x86/include/asm/ds.h b/arch/x86/include/asm/ds.h
deleted file mode 100644
index 70dac199b093..000000000000
--- a/arch/x86/include/asm/ds.h
+++ /dev/null
@@ -1,302 +0,0 @@
-/*
- * Debug Store (DS) support
- *
- * This provides a low-level interface to the hardware's Debug Store
- * feature that is used for branch trace store (BTS) and
- * precise-event based sampling (PEBS).
- *
- * It manages:
- * - DS and BTS hardware configuration
- * - buffer overflow handling (to be done)
- * - buffer access
- *
- * It does not do:
- * - security checking (is the caller allowed to trace the task)
- * - buffer allocation (memory accounting)
- *
- *
- * Copyright (C) 2007-2009 Intel Corporation.
- * Markus Metzger <markus.t.metzger@intel.com>, 2007-2009
- */
-
-#ifndef _ASM_X86_DS_H
-#define _ASM_X86_DS_H
-
-
-#include <linux/types.h>
-#include <linux/init.h>
-#include <linux/err.h>
-
-
-#ifdef CONFIG_X86_DS
-
-struct task_struct;
-struct ds_context;
-struct ds_tracer;
-struct bts_tracer;
-struct pebs_tracer;
-
-typedef void (*bts_ovfl_callback_t)(struct bts_tracer *);
-typedef void (*pebs_ovfl_callback_t)(struct pebs_tracer *);
-
-
-/*
- * A list of features plus corresponding macros to talk about them in
- * the ds_request function's flags parameter.
- *
- * We use the enum to index an array of corresponding control bits;
- * we use the macro to index a flags bit-vector.
- */
-enum ds_feature {
- dsf_bts = 0,
- dsf_bts_kernel,
-#define BTS_KERNEL (1 << dsf_bts_kernel)
- /* trace kernel-mode branches */
-
- dsf_bts_user,
-#define BTS_USER (1 << dsf_bts_user)
- /* trace user-mode branches */
-
- dsf_bts_overflow,
- dsf_bts_max,
- dsf_pebs = dsf_bts_max,
-
- dsf_pebs_max,
- dsf_ctl_max = dsf_pebs_max,
- dsf_bts_timestamps = dsf_ctl_max,
-#define BTS_TIMESTAMPS (1 << dsf_bts_timestamps)
- /* add timestamps into BTS trace */
-
-#define BTS_USER_FLAGS (BTS_KERNEL | BTS_USER | BTS_TIMESTAMPS)
-};
-
-
-/*
- * Request BTS or PEBS
- *
- * Due to alignement constraints, the actual buffer may be slightly
- * smaller than the requested or provided buffer.
- *
- * Returns a pointer to a tracer structure on success, or
- * ERR_PTR(errcode) on failure.
- *
- * The interrupt threshold is independent from the overflow callback
- * to allow users to use their own overflow interrupt handling mechanism.
- *
- * The function might sleep.
- *
- * task: the task to request recording for
- * cpu: the cpu to request recording for
- * base: the base pointer for the (non-pageable) buffer;
- * size: the size of the provided buffer in bytes
- * ovfl: pointer to a function to be called on buffer overflow;
- * NULL if cyclic buffer requested
- * th: the interrupt threshold in records from the end of the buffer;
- * -1 if no interrupt threshold is requested.
- * flags: a bit-mask of the above flags
- */
-extern struct bts_tracer *ds_request_bts_task(struct task_struct *task,
- void *base, size_t size,
- bts_ovfl_callback_t ovfl,
- size_t th, unsigned int flags);
-extern struct bts_tracer *ds_request_bts_cpu(int cpu, void *base, size_t size,
- bts_ovfl_callback_t ovfl,
- size_t th, unsigned int flags);
-extern struct pebs_tracer *ds_request_pebs_task(struct task_struct *task,
- void *base, size_t size,
- pebs_ovfl_callback_t ovfl,
- size_t th, unsigned int flags);
-extern struct pebs_tracer *ds_request_pebs_cpu(int cpu,
- void *base, size_t size,
- pebs_ovfl_callback_t ovfl,
- size_t th, unsigned int flags);
-
-/*
- * Release BTS or PEBS resources
- * Suspend and resume BTS or PEBS tracing
- *
- * Must be called with irq's enabled.
- *
- * tracer: the tracer handle returned from ds_request_~()
- */
-extern void ds_release_bts(struct bts_tracer *tracer);
-extern void ds_suspend_bts(struct bts_tracer *tracer);
-extern void ds_resume_bts(struct bts_tracer *tracer);
-extern void ds_release_pebs(struct pebs_tracer *tracer);
-extern void ds_suspend_pebs(struct pebs_tracer *tracer);
-extern void ds_resume_pebs(struct pebs_tracer *tracer);
-
-/*
- * Release BTS or PEBS resources
- * Suspend and resume BTS or PEBS tracing
- *
- * Cpu tracers must call this on the traced cpu.
- * Task tracers must call ds_release_~_noirq() for themselves.
- *
- * May be called with irq's disabled.
- *
- * Returns 0 if successful;
- * -EPERM if the cpu tracer does not trace the current cpu.
- * -EPERM if the task tracer does not trace itself.
- *
- * tracer: the tracer handle returned from ds_request_~()
- */
-extern int ds_release_bts_noirq(struct bts_tracer *tracer);
-extern int ds_suspend_bts_noirq(struct bts_tracer *tracer);
-extern int ds_resume_bts_noirq(struct bts_tracer *tracer);
-extern int ds_release_pebs_noirq(struct pebs_tracer *tracer);
-extern int ds_suspend_pebs_noirq(struct pebs_tracer *tracer);
-extern int ds_resume_pebs_noirq(struct pebs_tracer *tracer);
-
-
-/*
- * The raw DS buffer state as it is used for BTS and PEBS recording.
- *
- * This is the low-level, arch-dependent interface for working
- * directly on the raw trace data.
- */
-struct ds_trace {
- /* the number of bts/pebs records */
- size_t n;
- /* the size of a bts/pebs record in bytes */
- size_t size;
- /* pointers into the raw buffer:
- - to the first entry */
- void *begin;
- /* - one beyond the last entry */
- void *end;
- /* - one beyond the newest entry */
- void *top;
- /* - the interrupt threshold */
- void *ith;
- /* flags given on ds_request() */
- unsigned int flags;
-};
-
-/*
- * An arch-independent view on branch trace data.
- */
-enum bts_qualifier {
- bts_invalid,
-#define BTS_INVALID bts_invalid
-
- bts_branch,
-#define BTS_BRANCH bts_branch
-
- bts_task_arrives,
-#define BTS_TASK_ARRIVES bts_task_arrives
-
- bts_task_departs,
-#define BTS_TASK_DEPARTS bts_task_departs
-
- bts_qual_bit_size = 4,
- bts_qual_max = (1 << bts_qual_bit_size),
-};
-
-struct bts_struct {
- __u64 qualifier;
- union {
- /* BTS_BRANCH */
- struct {
- __u64 from;
- __u64 to;
- } lbr;
- /* BTS_TASK_ARRIVES or BTS_TASK_DEPARTS */
- struct {
- __u64 clock;
- pid_t pid;
- } event;
- } variant;
-};
-
-
-/*
- * The BTS state.
- *
- * This gives access to the raw DS state and adds functions to provide
- * an arch-independent view of the BTS data.
- */
-struct bts_trace {
- struct ds_trace ds;
-
- int (*read)(struct bts_tracer *tracer, const void *at,
- struct bts_struct *out);
- int (*write)(struct bts_tracer *tracer, const struct bts_struct *in);
-};
-
-
-/*
- * The PEBS state.
- *
- * This gives access to the raw DS state and the PEBS-specific counter
- * reset value.
- */
-struct pebs_trace {
- struct ds_trace ds;
-
- /* the number of valid counters in the below array */
- unsigned int counters;
-
-#define MAX_PEBS_COUNTERS 4
- /* the counter reset value */
- unsigned long long counter_reset[MAX_PEBS_COUNTERS];
-};
-
-
-/*
- * Read the BTS or PEBS trace.
- *
- * Returns a view on the trace collected for the parameter tracer.
- *
- * The view remains valid as long as the traced task is not running or
- * the tracer is suspended.
- * Writes into the trace buffer are not reflected.
- *
- * tracer: the tracer handle returned from ds_request_~()
- */
-extern const struct bts_trace *ds_read_bts(struct bts_tracer *tracer);
-extern const struct pebs_trace *ds_read_pebs(struct pebs_tracer *tracer);
-
-
-/*
- * Reset the write pointer of the BTS/PEBS buffer.
- *
- * Returns 0 on success; -Eerrno on error
- *
- * tracer: the tracer handle returned from ds_request_~()
- */
-extern int ds_reset_bts(struct bts_tracer *tracer);
-extern int ds_reset_pebs(struct pebs_tracer *tracer);
-
-/*
- * Set the PEBS counter reset value.
- *
- * Returns 0 on success; -Eerrno on error
- *
- * tracer: the tracer handle returned from ds_request_pebs()
- * counter: the index of the counter
- * value: the new counter reset value
- */
-extern int ds_set_pebs_reset(struct pebs_tracer *tracer,
- unsigned int counter, u64 value);
-
-/*
- * Initialization
- */
-struct cpuinfo_x86;
-extern void __cpuinit ds_init_intel(struct cpuinfo_x86 *);
-
-/*
- * Context switch work
- */
-extern void ds_switch_to(struct task_struct *prev, struct task_struct *next);
-
-#else /* CONFIG_X86_DS */
-
-struct cpuinfo_x86;
-static inline void __cpuinit ds_init_intel(struct cpuinfo_x86 *ignored) {}
-static inline void ds_switch_to(struct task_struct *prev,
- struct task_struct *next) {}
-
-#endif /* CONFIG_X86_DS */
-#endif /* _ASM_X86_DS_H */
diff --git a/arch/x86/include/asm/dwarf2.h b/arch/x86/include/asm/dwarf2.h
index ae6253ab9029..733f7e91e7a9 100644
--- a/arch/x86/include/asm/dwarf2.h
+++ b/arch/x86/include/asm/dwarf2.h
@@ -34,6 +34,18 @@
#define CFI_SIGNAL_FRAME
#endif
+#if defined(CONFIG_AS_CFI_SECTIONS) && defined(__ASSEMBLY__)
+ /*
+ * Emit CFI data in .debug_frame sections, not .eh_frame sections.
+ * The latter we currently just discard since we don't do DWARF
+ * unwinding at runtime. So only the offline DWARF information is
+ * useful to anyone. Note we should not use this directive if this
+ * file is used in the vDSO assembly, or if vmlinux.lds.S gets
+ * changed so it doesn't discard .eh_frame.
+ */
+ .cfi_sections .debug_frame
+#endif
+
#else
/*
diff --git a/arch/x86/include/asm/hardirq.h b/arch/x86/include/asm/hardirq.h
index 0f8576427cfe..aeab29aee617 100644
--- a/arch/x86/include/asm/hardirq.h
+++ b/arch/x86/include/asm/hardirq.h
@@ -35,7 +35,7 @@ DECLARE_PER_CPU_SHARED_ALIGNED(irq_cpustat_t, irq_stat);
#define __ARCH_IRQ_STAT
-#define inc_irq_stat(member) percpu_add(irq_stat.member, 1)
+#define inc_irq_stat(member) percpu_inc(irq_stat.member)
#define local_softirq_pending() percpu_read(irq_stat.__softirq_pending)
diff --git a/arch/x86/include/asm/hw_breakpoint.h b/arch/x86/include/asm/hw_breakpoint.h
index 2a1bd8f4f23a..942255310e6a 100644
--- a/arch/x86/include/asm/hw_breakpoint.h
+++ b/arch/x86/include/asm/hw_breakpoint.h
@@ -41,12 +41,16 @@ struct arch_hw_breakpoint {
/* Total number of available HW breakpoint registers */
#define HBP_NUM 4
+static inline int hw_breakpoint_slots(int type)
+{
+ return HBP_NUM;
+}
+
struct perf_event;
struct pmu;
-extern int arch_check_va_in_userspace(unsigned long va, u8 hbp_len);
-extern int arch_validate_hwbkpt_settings(struct perf_event *bp,
- struct task_struct *tsk);
+extern int arch_check_bp_in_kernelspace(struct perf_event *bp);
+extern int arch_validate_hwbkpt_settings(struct perf_event *bp);
extern int hw_breakpoint_exceptions_notify(struct notifier_block *unused,
unsigned long val, void *data);
diff --git a/arch/x86/include/asm/hyperv.h b/arch/x86/include/asm/hyperv.h
index e153a2b3889a..5df477ac3af7 100644
--- a/arch/x86/include/asm/hyperv.h
+++ b/arch/x86/include/asm/hyperv.h
@@ -1,5 +1,5 @@
-#ifndef _ASM_X86_KVM_HYPERV_H
-#define _ASM_X86_KVM_HYPERV_H
+#ifndef _ASM_X86_HYPERV_H
+#define _ASM_X86_HYPERV_H
#include <linux/types.h>
@@ -14,6 +14,10 @@
#define HYPERV_CPUID_ENLIGHTMENT_INFO 0x40000004
#define HYPERV_CPUID_IMPLEMENT_LIMITS 0x40000005
+#define HYPERV_HYPERVISOR_PRESENT_BIT 0x80000000
+#define HYPERV_CPUID_MIN 0x40000005
+#define HYPERV_CPUID_MAX 0x4000ffff
+
/*
* Feature identification. EAX indicates which features are available
* to the partition based upon the current partition privileges.
@@ -129,6 +133,9 @@
/* MSR used to provide vcpu index */
#define HV_X64_MSR_VP_INDEX 0x40000002
+/* MSR used to read the per-partition time reference counter */
+#define HV_X64_MSR_TIME_REF_COUNT 0x40000020
+
/* Define the virtual APIC registers */
#define HV_X64_MSR_EOI 0x40000070
#define HV_X64_MSR_ICR 0x40000071
diff --git a/arch/x86/include/asm/hypervisor.h b/arch/x86/include/asm/hypervisor.h
index b78c0941e422..70abda7058c8 100644
--- a/arch/x86/include/asm/hypervisor.h
+++ b/arch/x86/include/asm/hypervisor.h
@@ -17,10 +17,33 @@
* Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
*
*/
-#ifndef ASM_X86__HYPERVISOR_H
-#define ASM_X86__HYPERVISOR_H
+#ifndef _ASM_X86_HYPERVISOR_H
+#define _ASM_X86_HYPERVISOR_H
extern void init_hypervisor(struct cpuinfo_x86 *c);
extern void init_hypervisor_platform(void);
+/*
+ * x86 hypervisor information
+ */
+struct hypervisor_x86 {
+ /* Hypervisor name */
+ const char *name;
+
+ /* Detection routine */
+ bool (*detect)(void);
+
+ /* Adjust CPU feature bits (run once per CPU) */
+ void (*set_cpu_features)(struct cpuinfo_x86 *);
+
+ /* Platform setup (run once per boot) */
+ void (*init_platform)(void);
+};
+
+extern const struct hypervisor_x86 *x86_hyper;
+
+/* Recognized hypervisors */
+extern const struct hypervisor_x86 x86_hyper_vmware;
+extern const struct hypervisor_x86 x86_hyper_ms_hyperv;
+
#endif
diff --git a/arch/x86/include/asm/i8253.h b/arch/x86/include/asm/i8253.h
index 1edbf89680fd..fc1f579fb965 100644
--- a/arch/x86/include/asm/i8253.h
+++ b/arch/x86/include/asm/i8253.h
@@ -6,7 +6,7 @@
#define PIT_CH0 0x40
#define PIT_CH2 0x42
-extern spinlock_t i8253_lock;
+extern raw_spinlock_t i8253_lock;
extern struct clock_event_device *global_clock_event;
diff --git a/arch/x86/include/asm/insn.h b/arch/x86/include/asm/insn.h
index 96c2e0ad04ca..88c765e16410 100644
--- a/arch/x86/include/asm/insn.h
+++ b/arch/x86/include/asm/insn.h
@@ -68,6 +68,8 @@ struct insn {
const insn_byte_t *next_byte;
};
+#define MAX_INSN_SIZE 16
+
#define X86_MODRM_MOD(modrm) (((modrm) & 0xc0) >> 6)
#define X86_MODRM_REG(modrm) (((modrm) & 0x38) >> 3)
#define X86_MODRM_RM(modrm) ((modrm) & 0x07)
diff --git a/arch/x86/include/asm/io.h b/arch/x86/include/asm/io.h
index a1dcfa3ab17d..30a3e9776123 100644
--- a/arch/x86/include/asm/io.h
+++ b/arch/x86/include/asm/io.h
@@ -347,6 +347,7 @@ extern void __iomem *early_ioremap(resource_size_t phys_addr,
extern void __iomem *early_memremap(resource_size_t phys_addr,
unsigned long size);
extern void early_iounmap(void __iomem *addr, unsigned long size);
+extern void fixup_early_ioremap(void);
#define IO_SPACE_LIMIT 0xffff
diff --git a/arch/x86/include/asm/k8.h b/arch/x86/include/asm/k8.h
index f70e60071fe8..af00bd1d2089 100644
--- a/arch/x86/include/asm/k8.h
+++ b/arch/x86/include/asm/k8.h
@@ -16,11 +16,16 @@ extern int k8_numa_init(unsigned long start_pfn, unsigned long end_pfn);
extern int k8_scan_nodes(void);
#ifdef CONFIG_K8_NB
+extern int num_k8_northbridges;
+
static inline struct pci_dev *node_to_k8_nb_misc(int node)
{
return (node < num_k8_northbridges) ? k8_northbridges[node] : NULL;
}
+
#else
+#define num_k8_northbridges 0
+
static inline struct pci_dev *node_to_k8_nb_misc(int node)
{
return NULL;
diff --git a/arch/x86/include/asm/kprobes.h b/arch/x86/include/asm/kprobes.h
index 4ffa345a8ccb..547882539157 100644
--- a/arch/x86/include/asm/kprobes.h
+++ b/arch/x86/include/asm/kprobes.h
@@ -24,6 +24,7 @@
#include <linux/types.h>
#include <linux/ptrace.h>
#include <linux/percpu.h>
+#include <asm/insn.h>
#define __ARCH_WANT_KPROBES_INSN_SLOT
@@ -36,7 +37,6 @@ typedef u8 kprobe_opcode_t;
#define RELATIVEJUMP_SIZE 5
#define RELATIVECALL_OPCODE 0xe8
#define RELATIVE_ADDR_SIZE 4
-#define MAX_INSN_SIZE 16
#define MAX_STACK_SIZE 64
#define MIN_STACK_SIZE(ADDR) \
(((MAX_STACK_SIZE) < (((unsigned long)current_thread_info()) + \
diff --git a/arch/x86/include/asm/mshyperv.h b/arch/x86/include/asm/mshyperv.h
new file mode 100644
index 000000000000..79ce5685ab64
--- /dev/null
+++ b/arch/x86/include/asm/mshyperv.h
@@ -0,0 +1,14 @@
+#ifndef _ASM_X86_MSHYPER_H
+#define _ASM_X86_MSHYPER_H
+
+#include <linux/types.h>
+#include <asm/hyperv.h>
+
+struct ms_hyperv_info {
+ u32 features;
+ u32 hints;
+};
+
+extern struct ms_hyperv_info ms_hyperv;
+
+#endif
diff --git a/arch/x86/include/asm/msr-index.h b/arch/x86/include/asm/msr-index.h
index 4604e6a54d36..bc473acfa7f9 100644
--- a/arch/x86/include/asm/msr-index.h
+++ b/arch/x86/include/asm/msr-index.h
@@ -71,11 +71,14 @@
#define MSR_IA32_LASTINTTOIP 0x000001de
/* DEBUGCTLMSR bits (others vary by model): */
-#define _DEBUGCTLMSR_LBR 0 /* last branch recording */
-#define _DEBUGCTLMSR_BTF 1 /* single-step on branches */
-
-#define DEBUGCTLMSR_LBR (1UL << _DEBUGCTLMSR_LBR)
-#define DEBUGCTLMSR_BTF (1UL << _DEBUGCTLMSR_BTF)
+#define DEBUGCTLMSR_LBR (1UL << 0) /* last branch recording */
+#define DEBUGCTLMSR_BTF (1UL << 1) /* single-step on branches */
+#define DEBUGCTLMSR_TR (1UL << 6)
+#define DEBUGCTLMSR_BTS (1UL << 7)
+#define DEBUGCTLMSR_BTINT (1UL << 8)
+#define DEBUGCTLMSR_BTS_OFF_OS (1UL << 9)
+#define DEBUGCTLMSR_BTS_OFF_USR (1UL << 10)
+#define DEBUGCTLMSR_FREEZE_LBRS_ON_PMI (1UL << 11)
#define MSR_IA32_MC0_CTL 0x00000400
#define MSR_IA32_MC0_STATUS 0x00000401
@@ -359,6 +362,8 @@
#define MSR_P4_U2L_ESCR0 0x000003b0
#define MSR_P4_U2L_ESCR1 0x000003b1
+#define MSR_P4_PEBS_MATRIX_VERT 0x000003f2
+
/* Intel Core-based CPU performance counters */
#define MSR_CORE_PERF_FIXED_CTR0 0x00000309
#define MSR_CORE_PERF_FIXED_CTR1 0x0000030a
diff --git a/arch/x86/include/asm/percpu.h b/arch/x86/include/asm/percpu.h
index 66a272dfd8b8..0ec6d12d84e6 100644
--- a/arch/x86/include/asm/percpu.h
+++ b/arch/x86/include/asm/percpu.h
@@ -190,6 +190,29 @@ do { \
pfo_ret__; \
})
+#define percpu_unary_op(op, var) \
+({ \
+ switch (sizeof(var)) { \
+ case 1: \
+ asm(op "b "__percpu_arg(0) \
+ : "+m" (var)); \
+ break; \
+ case 2: \
+ asm(op "w "__percpu_arg(0) \
+ : "+m" (var)); \
+ break; \
+ case 4: \
+ asm(op "l "__percpu_arg(0) \
+ : "+m" (var)); \
+ break; \
+ case 8: \
+ asm(op "q "__percpu_arg(0) \
+ : "+m" (var)); \
+ break; \
+ default: __bad_percpu_size(); \
+ } \
+})
+
/*
* percpu_read() makes gcc load the percpu variable every time it is
* accessed while percpu_read_stable() allows the value to be cached.
@@ -207,6 +230,7 @@ do { \
#define percpu_and(var, val) percpu_to_op("and", var, val)
#define percpu_or(var, val) percpu_to_op("or", var, val)
#define percpu_xor(var, val) percpu_to_op("xor", var, val)
+#define percpu_inc(var) percpu_unary_op("inc", var)
#define __this_cpu_read_1(pcp) percpu_from_op("mov", (pcp), "m"(pcp))
#define __this_cpu_read_2(pcp) percpu_from_op("mov", (pcp), "m"(pcp))
diff --git a/arch/x86/include/asm/perf_event.h b/arch/x86/include/asm/perf_event.h
index db6109a885a7..254883d0c7e0 100644
--- a/arch/x86/include/asm/perf_event.h
+++ b/arch/x86/include/asm/perf_event.h
@@ -5,7 +5,7 @@
* Performance event hw details:
*/
-#define X86_PMC_MAX_GENERIC 8
+#define X86_PMC_MAX_GENERIC 32
#define X86_PMC_MAX_FIXED 3
#define X86_PMC_IDX_GENERIC 0
@@ -18,39 +18,31 @@
#define MSR_ARCH_PERFMON_EVENTSEL0 0x186
#define MSR_ARCH_PERFMON_EVENTSEL1 0x187
-#define ARCH_PERFMON_EVENTSEL_ENABLE (1 << 22)
-#define ARCH_PERFMON_EVENTSEL_ANY (1 << 21)
-#define ARCH_PERFMON_EVENTSEL_INT (1 << 20)
-#define ARCH_PERFMON_EVENTSEL_OS (1 << 17)
-#define ARCH_PERFMON_EVENTSEL_USR (1 << 16)
-
-/*
- * Includes eventsel and unit mask as well:
- */
-
-
-#define INTEL_ARCH_EVTSEL_MASK 0x000000FFULL
-#define INTEL_ARCH_UNIT_MASK 0x0000FF00ULL
-#define INTEL_ARCH_EDGE_MASK 0x00040000ULL
-#define INTEL_ARCH_INV_MASK 0x00800000ULL
-#define INTEL_ARCH_CNT_MASK 0xFF000000ULL
-#define INTEL_ARCH_EVENT_MASK (INTEL_ARCH_UNIT_MASK|INTEL_ARCH_EVTSEL_MASK)
-
-/*
- * filter mask to validate fixed counter events.
- * the following filters disqualify for fixed counters:
- * - inv
- * - edge
- * - cnt-mask
- * The other filters are supported by fixed counters.
- * The any-thread option is supported starting with v3.
- */
-#define INTEL_ARCH_FIXED_MASK \
- (INTEL_ARCH_CNT_MASK| \
- INTEL_ARCH_INV_MASK| \
- INTEL_ARCH_EDGE_MASK|\
- INTEL_ARCH_UNIT_MASK|\
- INTEL_ARCH_EVENT_MASK)
+#define ARCH_PERFMON_EVENTSEL_EVENT 0x000000FFULL
+#define ARCH_PERFMON_EVENTSEL_UMASK 0x0000FF00ULL
+#define ARCH_PERFMON_EVENTSEL_USR (1ULL << 16)
+#define ARCH_PERFMON_EVENTSEL_OS (1ULL << 17)
+#define ARCH_PERFMON_EVENTSEL_EDGE (1ULL << 18)
+#define ARCH_PERFMON_EVENTSEL_INT (1ULL << 20)
+#define ARCH_PERFMON_EVENTSEL_ANY (1ULL << 21)
+#define ARCH_PERFMON_EVENTSEL_ENABLE (1ULL << 22)
+#define ARCH_PERFMON_EVENTSEL_INV (1ULL << 23)
+#define ARCH_PERFMON_EVENTSEL_CMASK 0xFF000000ULL
+
+#define AMD64_EVENTSEL_EVENT \
+ (ARCH_PERFMON_EVENTSEL_EVENT | (0x0FULL << 32))
+#define INTEL_ARCH_EVENT_MASK \
+ (ARCH_PERFMON_EVENTSEL_UMASK | ARCH_PERFMON_EVENTSEL_EVENT)
+
+#define X86_RAW_EVENT_MASK \
+ (ARCH_PERFMON_EVENTSEL_EVENT | \
+ ARCH_PERFMON_EVENTSEL_UMASK | \
+ ARCH_PERFMON_EVENTSEL_EDGE | \
+ ARCH_PERFMON_EVENTSEL_INV | \
+ ARCH_PERFMON_EVENTSEL_CMASK)
+#define AMD64_RAW_EVENT_MASK \
+ (X86_RAW_EVENT_MASK | \
+ AMD64_EVENTSEL_EVENT)
#define ARCH_PERFMON_UNHALTED_CORE_CYCLES_SEL 0x3c
#define ARCH_PERFMON_UNHALTED_CORE_CYCLES_UMASK (0x00 << 8)
@@ -67,7 +59,7 @@
union cpuid10_eax {
struct {
unsigned int version_id:8;
- unsigned int num_events:8;
+ unsigned int num_counters:8;
unsigned int bit_width:8;
unsigned int mask_length:8;
} split;
@@ -76,7 +68,7 @@ union cpuid10_eax {
union cpuid10_edx {
struct {
- unsigned int num_events_fixed:4;
+ unsigned int num_counters_fixed:4;
unsigned int reserved:28;
} split;
unsigned int full;
@@ -136,6 +128,18 @@ extern void perf_events_lapic_init(void);
#define PERF_EVENT_INDEX_OFFSET 0
+/*
+ * Abuse bit 3 of the cpu eflags register to indicate proper PEBS IP fixups.
+ * This flag is otherwise unused and ABI specified to be 0, so nobody should
+ * care what we do with it.
+ */
+#define PERF_EFLAGS_EXACT (1UL << 3)
+
+struct pt_regs;
+extern unsigned long perf_instruction_pointer(struct pt_regs *regs);
+extern unsigned long perf_misc_flags(struct pt_regs *regs);
+#define perf_misc_flags(regs) perf_misc_flags(regs)
+
#else
static inline void init_hw_perf_events(void) { }
static inline void perf_events_lapic_init(void) { }
diff --git a/arch/x86/include/asm/perf_event_p4.h b/arch/x86/include/asm/perf_event_p4.h
new file mode 100644
index 000000000000..b05400a542ff
--- /dev/null
+++ b/arch/x86/include/asm/perf_event_p4.h
@@ -0,0 +1,794 @@
+/*
+ * Netburst Perfomance Events (P4, old Xeon)
+ */
+
+#ifndef PERF_EVENT_P4_H
+#define PERF_EVENT_P4_H
+
+#include <linux/cpu.h>
+#include <linux/bitops.h>
+
+/*
+ * NetBurst has perfomance MSRs shared between
+ * threads if HT is turned on, ie for both logical
+ * processors (mem: in turn in Atom with HT support
+ * perf-MSRs are not shared and every thread has its
+ * own perf-MSRs set)
+ */
+#define ARCH_P4_TOTAL_ESCR (46)
+#define ARCH_P4_RESERVED_ESCR (2) /* IQ_ESCR(0,1) not always present */
+#define ARCH_P4_MAX_ESCR (ARCH_P4_TOTAL_ESCR - ARCH_P4_RESERVED_ESCR)
+#define ARCH_P4_MAX_CCCR (18)
+#define ARCH_P4_MAX_COUNTER (ARCH_P4_MAX_CCCR / 2)
+
+#define P4_ESCR_EVENT_MASK 0x7e000000U
+#define P4_ESCR_EVENT_SHIFT 25
+#define P4_ESCR_EVENTMASK_MASK 0x01fffe00U
+#define P4_ESCR_EVENTMASK_SHIFT 9
+#define P4_ESCR_TAG_MASK 0x000001e0U
+#define P4_ESCR_TAG_SHIFT 5
+#define P4_ESCR_TAG_ENABLE 0x00000010U
+#define P4_ESCR_T0_OS 0x00000008U
+#define P4_ESCR_T0_USR 0x00000004U
+#define P4_ESCR_T1_OS 0x00000002U
+#define P4_ESCR_T1_USR 0x00000001U
+
+#define P4_ESCR_EVENT(v) ((v) << P4_ESCR_EVENT_SHIFT)
+#define P4_ESCR_EMASK(v) ((v) << P4_ESCR_EVENTMASK_SHIFT)
+#define P4_ESCR_TAG(v) ((v) << P4_ESCR_TAG_SHIFT)
+
+/* Non HT mask */
+#define P4_ESCR_MASK \
+ (P4_ESCR_EVENT_MASK | \
+ P4_ESCR_EVENTMASK_MASK | \
+ P4_ESCR_TAG_MASK | \
+ P4_ESCR_TAG_ENABLE | \
+ P4_ESCR_T0_OS | \
+ P4_ESCR_T0_USR)
+
+/* HT mask */
+#define P4_ESCR_MASK_HT \
+ (P4_ESCR_MASK | P4_ESCR_T1_OS | P4_ESCR_T1_USR)
+
+#define P4_CCCR_OVF 0x80000000U
+#define P4_CCCR_CASCADE 0x40000000U
+#define P4_CCCR_OVF_PMI_T0 0x04000000U
+#define P4_CCCR_OVF_PMI_T1 0x08000000U
+#define P4_CCCR_FORCE_OVF 0x02000000U
+#define P4_CCCR_EDGE 0x01000000U
+#define P4_CCCR_THRESHOLD_MASK 0x00f00000U
+#define P4_CCCR_THRESHOLD_SHIFT 20
+#define P4_CCCR_COMPLEMENT 0x00080000U
+#define P4_CCCR_COMPARE 0x00040000U
+#define P4_CCCR_ESCR_SELECT_MASK 0x0000e000U
+#define P4_CCCR_ESCR_SELECT_SHIFT 13
+#define P4_CCCR_ENABLE 0x00001000U
+#define P4_CCCR_THREAD_SINGLE 0x00010000U
+#define P4_CCCR_THREAD_BOTH 0x00020000U
+#define P4_CCCR_THREAD_ANY 0x00030000U
+#define P4_CCCR_RESERVED 0x00000fffU
+
+#define P4_CCCR_THRESHOLD(v) ((v) << P4_CCCR_THRESHOLD_SHIFT)
+#define P4_CCCR_ESEL(v) ((v) << P4_CCCR_ESCR_SELECT_SHIFT)
+
+/* Custom bits in reerved CCCR area */
+#define P4_CCCR_CACHE_OPS_MASK 0x0000003fU
+
+
+/* Non HT mask */
+#define P4_CCCR_MASK \
+ (P4_CCCR_OVF | \
+ P4_CCCR_CASCADE | \
+ P4_CCCR_OVF_PMI_T0 | \
+ P4_CCCR_FORCE_OVF | \
+ P4_CCCR_EDGE | \
+ P4_CCCR_THRESHOLD_MASK | \
+ P4_CCCR_COMPLEMENT | \
+ P4_CCCR_COMPARE | \
+ P4_CCCR_ESCR_SELECT_MASK | \
+ P4_CCCR_ENABLE)
+
+/* HT mask */
+#define P4_CCCR_MASK_HT (P4_CCCR_MASK | P4_CCCR_THREAD_ANY)
+
+#define P4_GEN_ESCR_EMASK(class, name, bit) \
+ class##__##name = ((1 << bit) << P4_ESCR_EVENTMASK_SHIFT)
+#define P4_ESCR_EMASK_BIT(class, name) class##__##name
+
+/*
+ * config field is 64bit width and consists of
+ * HT << 63 | ESCR << 32 | CCCR
+ * where HT is HyperThreading bit (since ESCR
+ * has it reserved we may use it for own purpose)
+ *
+ * note that this is NOT the addresses of respective
+ * ESCR and CCCR but rather an only packed value should
+ * be unpacked and written to a proper addresses
+ *
+ * the base idea is to pack as much info as
+ * possible
+ */
+#define p4_config_pack_escr(v) (((u64)(v)) << 32)
+#define p4_config_pack_cccr(v) (((u64)(v)) & 0xffffffffULL)
+#define p4_config_unpack_escr(v) (((u64)(v)) >> 32)
+#define p4_config_unpack_cccr(v) (((u64)(v)) & 0xffffffffULL)
+
+#define p4_config_unpack_emask(v) \
+ ({ \
+ u32 t = p4_config_unpack_escr((v)); \
+ t = t & P4_ESCR_EVENTMASK_MASK; \
+ t = t >> P4_ESCR_EVENTMASK_SHIFT; \
+ t; \
+ })
+
+#define p4_config_unpack_event(v) \
+ ({ \
+ u32 t = p4_config_unpack_escr((v)); \
+ t = t & P4_ESCR_EVENT_MASK; \
+ t = t >> P4_ESCR_EVENT_SHIFT; \
+ t; \
+ })
+
+#define p4_config_unpack_cache_event(v) (((u64)(v)) & P4_CCCR_CACHE_OPS_MASK)
+
+#define P4_CONFIG_HT_SHIFT 63
+#define P4_CONFIG_HT (1ULL << P4_CONFIG_HT_SHIFT)
+
+static inline bool p4_is_event_cascaded(u64 config)
+{
+ u32 cccr = p4_config_unpack_cccr(config);
+ return !!(cccr & P4_CCCR_CASCADE);
+}
+
+static inline int p4_ht_config_thread(u64 config)
+{
+ return !!(config & P4_CONFIG_HT);
+}
+
+static inline u64 p4_set_ht_bit(u64 config)
+{
+ return config | P4_CONFIG_HT;
+}
+
+static inline u64 p4_clear_ht_bit(u64 config)
+{
+ return config & ~P4_CONFIG_HT;
+}
+
+static inline int p4_ht_active(void)
+{
+#ifdef CONFIG_SMP
+ return smp_num_siblings > 1;
+#endif
+ return 0;
+}
+
+static inline int p4_ht_thread(int cpu)
+{
+#ifdef CONFIG_SMP
+ if (smp_num_siblings == 2)
+ return cpu != cpumask_first(__get_cpu_var(cpu_sibling_map));
+#endif
+ return 0;
+}
+
+static inline int p4_should_swap_ts(u64 config, int cpu)
+{
+ return p4_ht_config_thread(config) ^ p4_ht_thread(cpu);
+}
+
+static inline u32 p4_default_cccr_conf(int cpu)
+{
+ /*
+ * Note that P4_CCCR_THREAD_ANY is "required" on
+ * non-HT machines (on HT machines we count TS events
+ * regardless the state of second logical processor
+ */
+ u32 cccr = P4_CCCR_THREAD_ANY;
+
+ if (!p4_ht_thread(cpu))
+ cccr |= P4_CCCR_OVF_PMI_T0;
+ else
+ cccr |= P4_CCCR_OVF_PMI_T1;
+
+ return cccr;
+}
+
+static inline u32 p4_default_escr_conf(int cpu, int exclude_os, int exclude_usr)
+{
+ u32 escr = 0;
+
+ if (!p4_ht_thread(cpu)) {
+ if (!exclude_os)
+ escr |= P4_ESCR_T0_OS;
+ if (!exclude_usr)
+ escr |= P4_ESCR_T0_USR;
+ } else {
+ if (!exclude_os)
+ escr |= P4_ESCR_T1_OS;
+ if (!exclude_usr)
+ escr |= P4_ESCR_T1_USR;
+ }
+
+ return escr;
+}
+
+enum P4_EVENTS {
+ P4_EVENT_TC_DELIVER_MODE,
+ P4_EVENT_BPU_FETCH_REQUEST,
+ P4_EVENT_ITLB_REFERENCE,
+ P4_EVENT_MEMORY_CANCEL,
+ P4_EVENT_MEMORY_COMPLETE,
+ P4_EVENT_LOAD_PORT_REPLAY,
+ P4_EVENT_STORE_PORT_REPLAY,
+ P4_EVENT_MOB_LOAD_REPLAY,
+ P4_EVENT_PAGE_WALK_TYPE,
+ P4_EVENT_BSQ_CACHE_REFERENCE,
+ P4_EVENT_IOQ_ALLOCATION,
+ P4_EVENT_IOQ_ACTIVE_ENTRIES,
+ P4_EVENT_FSB_DATA_ACTIVITY,
+ P4_EVENT_BSQ_ALLOCATION,
+ P4_EVENT_BSQ_ACTIVE_ENTRIES,
+ P4_EVENT_SSE_INPUT_ASSIST,
+ P4_EVENT_PACKED_SP_UOP,
+ P4_EVENT_PACKED_DP_UOP,
+ P4_EVENT_SCALAR_SP_UOP,
+ P4_EVENT_SCALAR_DP_UOP,
+ P4_EVENT_64BIT_MMX_UOP,
+ P4_EVENT_128BIT_MMX_UOP,
+ P4_EVENT_X87_FP_UOP,
+ P4_EVENT_TC_MISC,
+ P4_EVENT_GLOBAL_POWER_EVENTS,
+ P4_EVENT_TC_MS_XFER,
+ P4_EVENT_UOP_QUEUE_WRITES,
+ P4_EVENT_RETIRED_MISPRED_BRANCH_TYPE,
+ P4_EVENT_RETIRED_BRANCH_TYPE,
+ P4_EVENT_RESOURCE_STALL,
+ P4_EVENT_WC_BUFFER,
+ P4_EVENT_B2B_CYCLES,
+ P4_EVENT_BNR,
+ P4_EVENT_SNOOP,
+ P4_EVENT_RESPONSE,
+ P4_EVENT_FRONT_END_EVENT,
+ P4_EVENT_EXECUTION_EVENT,
+ P4_EVENT_REPLAY_EVENT,
+ P4_EVENT_INSTR_RETIRED,
+ P4_EVENT_UOPS_RETIRED,
+ P4_EVENT_UOP_TYPE,
+ P4_EVENT_BRANCH_RETIRED,
+ P4_EVENT_MISPRED_BRANCH_RETIRED,
+ P4_EVENT_X87_ASSIST,
+ P4_EVENT_MACHINE_CLEAR,
+ P4_EVENT_INSTR_COMPLETED,
+};
+
+#define P4_OPCODE(event) event##_OPCODE
+#define P4_OPCODE_ESEL(opcode) ((opcode & 0x00ff) >> 0)
+#define P4_OPCODE_EVNT(opcode) ((opcode & 0xff00) >> 8)
+#define P4_OPCODE_PACK(event, sel) (((event) << 8) | sel)
+
+/*
+ * Comments below the event represent ESCR restriction
+ * for this event and counter index per ESCR
+ *
+ * MSR_P4_IQ_ESCR0 and MSR_P4_IQ_ESCR1 are available only on early
+ * processor builds (family 0FH, models 01H-02H). These MSRs
+ * are not available on later versions, so that we don't use
+ * them completely
+ *
+ * Also note that CCCR1 do not have P4_CCCR_ENABLE bit properly
+ * working so that we should not use this CCCR and respective
+ * counter as result
+ */
+enum P4_EVENT_OPCODES {
+ P4_OPCODE(P4_EVENT_TC_DELIVER_MODE) = P4_OPCODE_PACK(0x01, 0x01),
+ /*
+ * MSR_P4_TC_ESCR0: 4, 5
+ * MSR_P4_TC_ESCR1: 6, 7
+ */
+
+ P4_OPCODE(P4_EVENT_BPU_FETCH_REQUEST) = P4_OPCODE_PACK(0x03, 0x00),
+ /*
+ * MSR_P4_BPU_ESCR0: 0, 1
+ * MSR_P4_BPU_ESCR1: 2, 3
+ */
+
+ P4_OPCODE(P4_EVENT_ITLB_REFERENCE) = P4_OPCODE_PACK(0x18, 0x03),
+ /*
+ * MSR_P4_ITLB_ESCR0: 0, 1
+ * MSR_P4_ITLB_ESCR1: 2, 3
+ */
+
+ P4_OPCODE(P4_EVENT_MEMORY_CANCEL) = P4_OPCODE_PACK(0x02, 0x05),
+ /*
+ * MSR_P4_DAC_ESCR0: 8, 9
+ * MSR_P4_DAC_ESCR1: 10, 11
+ */
+
+ P4_OPCODE(P4_EVENT_MEMORY_COMPLETE) = P4_OPCODE_PACK(0x08, 0x02),
+ /*
+ * MSR_P4_SAAT_ESCR0: 8, 9
+ * MSR_P4_SAAT_ESCR1: 10, 11
+ */
+
+ P4_OPCODE(P4_EVENT_LOAD_PORT_REPLAY) = P4_OPCODE_PACK(0x04, 0x02),
+ /*
+ * MSR_P4_SAAT_ESCR0: 8, 9
+ * MSR_P4_SAAT_ESCR1: 10, 11
+ */
+
+ P4_OPCODE(P4_EVENT_STORE_PORT_REPLAY) = P4_OPCODE_PACK(0x05, 0x02),
+ /*
+ * MSR_P4_SAAT_ESCR0: 8, 9
+ * MSR_P4_SAAT_ESCR1: 10, 11
+ */
+
+ P4_OPCODE(P4_EVENT_MOB_LOAD_REPLAY) = P4_OPCODE_PACK(0x03, 0x02),
+ /*
+ * MSR_P4_MOB_ESCR0: 0, 1
+ * MSR_P4_MOB_ESCR1: 2, 3
+ */
+
+ P4_OPCODE(P4_EVENT_PAGE_WALK_TYPE) = P4_OPCODE_PACK(0x01, 0x04),
+ /*
+ * MSR_P4_PMH_ESCR0: 0, 1
+ * MSR_P4_PMH_ESCR1: 2, 3
+ */
+
+ P4_OPCODE(P4_EVENT_BSQ_CACHE_REFERENCE) = P4_OPCODE_PACK(0x0c, 0x07),
+ /*
+ * MSR_P4_BSU_ESCR0: 0, 1
+ * MSR_P4_BSU_ESCR1: 2, 3
+ */
+
+ P4_OPCODE(P4_EVENT_IOQ_ALLOCATION) = P4_OPCODE_PACK(0x03, 0x06),
+ /*
+ * MSR_P4_FSB_ESCR0: 0, 1
+ * MSR_P4_FSB_ESCR1: 2, 3
+ */
+
+ P4_OPCODE(P4_EVENT_IOQ_ACTIVE_ENTRIES) = P4_OPCODE_PACK(0x1a, 0x06),
+ /*
+ * MSR_P4_FSB_ESCR1: 2, 3
+ */
+
+ P4_OPCODE(P4_EVENT_FSB_DATA_ACTIVITY) = P4_OPCODE_PACK(0x17, 0x06),
+ /*
+ * MSR_P4_FSB_ESCR0: 0, 1
+ * MSR_P4_FSB_ESCR1: 2, 3
+ */
+
+ P4_OPCODE(P4_EVENT_BSQ_ALLOCATION) = P4_OPCODE_PACK(0x05, 0x07),
+ /*
+ * MSR_P4_BSU_ESCR0: 0, 1
+ */
+
+ P4_OPCODE(P4_EVENT_BSQ_ACTIVE_ENTRIES) = P4_OPCODE_PACK(0x06, 0x07),
+ /*
+ * NOTE: no ESCR name in docs, it's guessed
+ * MSR_P4_BSU_ESCR1: 2, 3
+ */
+
+ P4_OPCODE(P4_EVENT_SSE_INPUT_ASSIST) = P4_OPCODE_PACK(0x34, 0x01),
+ /*
+ * MSR_P4_FIRM_ESCR0: 8, 9
+ * MSR_P4_FIRM_ESCR1: 10, 11
+ */
+
+ P4_OPCODE(P4_EVENT_PACKED_SP_UOP) = P4_OPCODE_PACK(0x08, 0x01),
+ /*
+ * MSR_P4_FIRM_ESCR0: 8, 9
+ * MSR_P4_FIRM_ESCR1: 10, 11
+ */
+
+ P4_OPCODE(P4_EVENT_PACKED_DP_UOP) = P4_OPCODE_PACK(0x0c, 0x01),
+ /*
+ * MSR_P4_FIRM_ESCR0: 8, 9
+ * MSR_P4_FIRM_ESCR1: 10, 11
+ */
+
+ P4_OPCODE(P4_EVENT_SCALAR_SP_UOP) = P4_OPCODE_PACK(0x0a, 0x01),
+ /*
+ * MSR_P4_FIRM_ESCR0: 8, 9
+ * MSR_P4_FIRM_ESCR1: 10, 11
+ */
+
+ P4_OPCODE(P4_EVENT_SCALAR_DP_UOP) = P4_OPCODE_PACK(0x0e, 0x01),
+ /*
+ * MSR_P4_FIRM_ESCR0: 8, 9
+ * MSR_P4_FIRM_ESCR1: 10, 11
+ */
+
+ P4_OPCODE(P4_EVENT_64BIT_MMX_UOP) = P4_OPCODE_PACK(0x02, 0x01),
+ /*
+ * MSR_P4_FIRM_ESCR0: 8, 9
+ * MSR_P4_FIRM_ESCR1: 10, 11
+ */
+
+ P4_OPCODE(P4_EVENT_128BIT_MMX_UOP) = P4_OPCODE_PACK(0x1a, 0x01),
+ /*
+ * MSR_P4_FIRM_ESCR0: 8, 9
+ * MSR_P4_FIRM_ESCR1: 10, 11
+ */
+
+ P4_OPCODE(P4_EVENT_X87_FP_UOP) = P4_OPCODE_PACK(0x04, 0x01),
+ /*
+ * MSR_P4_FIRM_ESCR0: 8, 9
+ * MSR_P4_FIRM_ESCR1: 10, 11
+ */
+
+ P4_OPCODE(P4_EVENT_TC_MISC) = P4_OPCODE_PACK(0x06, 0x01),
+ /*
+ * MSR_P4_TC_ESCR0: 4, 5
+ * MSR_P4_TC_ESCR1: 6, 7
+ */
+
+ P4_OPCODE(P4_EVENT_GLOBAL_POWER_EVENTS) = P4_OPCODE_PACK(0x13, 0x06),
+ /*
+ * MSR_P4_FSB_ESCR0: 0, 1
+ * MSR_P4_FSB_ESCR1: 2, 3
+ */
+
+ P4_OPCODE(P4_EVENT_TC_MS_XFER) = P4_OPCODE_PACK(0x05, 0x00),
+ /*
+ * MSR_P4_MS_ESCR0: 4, 5
+ * MSR_P4_MS_ESCR1: 6, 7
+ */
+
+ P4_OPCODE(P4_EVENT_UOP_QUEUE_WRITES) = P4_OPCODE_PACK(0x09, 0x00),
+ /*
+ * MSR_P4_MS_ESCR0: 4, 5
+ * MSR_P4_MS_ESCR1: 6, 7
+ */
+
+ P4_OPCODE(P4_EVENT_RETIRED_MISPRED_BRANCH_TYPE) = P4_OPCODE_PACK(0x05, 0x02),
+ /*
+ * MSR_P4_TBPU_ESCR0: 4, 5
+ * MSR_P4_TBPU_ESCR1: 6, 7
+ */
+
+ P4_OPCODE(P4_EVENT_RETIRED_BRANCH_TYPE) = P4_OPCODE_PACK(0x04, 0x02),
+ /*
+ * MSR_P4_TBPU_ESCR0: 4, 5
+ * MSR_P4_TBPU_ESCR1: 6, 7
+ */
+
+ P4_OPCODE(P4_EVENT_RESOURCE_STALL) = P4_OPCODE_PACK(0x01, 0x01),
+ /*
+ * MSR_P4_ALF_ESCR0: 12, 13, 16
+ * MSR_P4_ALF_ESCR1: 14, 15, 17
+ */
+
+ P4_OPCODE(P4_EVENT_WC_BUFFER) = P4_OPCODE_PACK(0x05, 0x05),
+ /*
+ * MSR_P4_DAC_ESCR0: 8, 9
+ * MSR_P4_DAC_ESCR1: 10, 11
+ */
+
+ P4_OPCODE(P4_EVENT_B2B_CYCLES) = P4_OPCODE_PACK(0x16, 0x03),
+ /*
+ * MSR_P4_FSB_ESCR0: 0, 1
+ * MSR_P4_FSB_ESCR1: 2, 3
+ */
+
+ P4_OPCODE(P4_EVENT_BNR) = P4_OPCODE_PACK(0x08, 0x03),
+ /*
+ * MSR_P4_FSB_ESCR0: 0, 1
+ * MSR_P4_FSB_ESCR1: 2, 3
+ */
+
+ P4_OPCODE(P4_EVENT_SNOOP) = P4_OPCODE_PACK(0x06, 0x03),
+ /*
+ * MSR_P4_FSB_ESCR0: 0, 1
+ * MSR_P4_FSB_ESCR1: 2, 3
+ */
+
+ P4_OPCODE(P4_EVENT_RESPONSE) = P4_OPCODE_PACK(0x04, 0x03),
+ /*
+ * MSR_P4_FSB_ESCR0: 0, 1
+ * MSR_P4_FSB_ESCR1: 2, 3
+ */
+
+ P4_OPCODE(P4_EVENT_FRONT_END_EVENT) = P4_OPCODE_PACK(0x08, 0x05),
+ /*
+ * MSR_P4_CRU_ESCR2: 12, 13, 16
+ * MSR_P4_CRU_ESCR3: 14, 15, 17
+ */
+
+ P4_OPCODE(P4_EVENT_EXECUTION_EVENT) = P4_OPCODE_PACK(0x0c, 0x05),
+ /*
+ * MSR_P4_CRU_ESCR2: 12, 13, 16
+ * MSR_P4_CRU_ESCR3: 14, 15, 17
+ */
+
+ P4_OPCODE(P4_EVENT_REPLAY_EVENT) = P4_OPCODE_PACK(0x09, 0x05),
+ /*
+ * MSR_P4_CRU_ESCR2: 12, 13, 16
+ * MSR_P4_CRU_ESCR3: 14, 15, 17
+ */
+
+ P4_OPCODE(P4_EVENT_INSTR_RETIRED) = P4_OPCODE_PACK(0x02, 0x04),
+ /*
+ * MSR_P4_CRU_ESCR0: 12, 13, 16
+ * MSR_P4_CRU_ESCR1: 14, 15, 17
+ */
+
+ P4_OPCODE(P4_EVENT_UOPS_RETIRED) = P4_OPCODE_PACK(0x01, 0x04),
+ /*
+ * MSR_P4_CRU_ESCR0: 12, 13, 16
+ * MSR_P4_CRU_ESCR1: 14, 15, 17
+ */
+
+ P4_OPCODE(P4_EVENT_UOP_TYPE) = P4_OPCODE_PACK(0x02, 0x02),
+ /*
+ * MSR_P4_RAT_ESCR0: 12, 13, 16
+ * MSR_P4_RAT_ESCR1: 14, 15, 17
+ */
+
+ P4_OPCODE(P4_EVENT_BRANCH_RETIRED) = P4_OPCODE_PACK(0x06, 0x05),
+ /*
+ * MSR_P4_CRU_ESCR2: 12, 13, 16
+ * MSR_P4_CRU_ESCR3: 14, 15, 17
+ */
+
+ P4_OPCODE(P4_EVENT_MISPRED_BRANCH_RETIRED) = P4_OPCODE_PACK(0x03, 0x04),
+ /*
+ * MSR_P4_CRU_ESCR0: 12, 13, 16
+ * MSR_P4_CRU_ESCR1: 14, 15, 17
+ */
+
+ P4_OPCODE(P4_EVENT_X87_ASSIST) = P4_OPCODE_PACK(0x03, 0x05),
+ /*
+ * MSR_P4_CRU_ESCR2: 12, 13, 16
+ * MSR_P4_CRU_ESCR3: 14, 15, 17
+ */
+
+ P4_OPCODE(P4_EVENT_MACHINE_CLEAR) = P4_OPCODE_PACK(0x02, 0x05),
+ /*
+ * MSR_P4_CRU_ESCR2: 12, 13, 16
+ * MSR_P4_CRU_ESCR3: 14, 15, 17
+ */
+
+ P4_OPCODE(P4_EVENT_INSTR_COMPLETED) = P4_OPCODE_PACK(0x07, 0x04),
+ /*
+ * MSR_P4_CRU_ESCR0: 12, 13, 16
+ * MSR_P4_CRU_ESCR1: 14, 15, 17
+ */
+};
+
+/*
+ * a caller should use P4_ESCR_EMASK_NAME helper to
+ * pick the EventMask needed, for example
+ *
+ * P4_ESCR_EMASK_NAME(P4_EVENT_TC_DELIVER_MODE, DD)
+ */
+enum P4_ESCR_EMASKS {
+ P4_GEN_ESCR_EMASK(P4_EVENT_TC_DELIVER_MODE, DD, 0),
+ P4_GEN_ESCR_EMASK(P4_EVENT_TC_DELIVER_MODE, DB, 1),
+ P4_GEN_ESCR_EMASK(P4_EVENT_TC_DELIVER_MODE, DI, 2),
+ P4_GEN_ESCR_EMASK(P4_EVENT_TC_DELIVER_MODE, BD, 3),
+ P4_GEN_ESCR_EMASK(P4_EVENT_TC_DELIVER_MODE, BB, 4),
+ P4_GEN_ESCR_EMASK(P4_EVENT_TC_DELIVER_MODE, BI, 5),
+ P4_GEN_ESCR_EMASK(P4_EVENT_TC_DELIVER_MODE, ID, 6),
+
+ P4_GEN_ESCR_EMASK(P4_EVENT_BPU_FETCH_REQUEST, TCMISS, 0),
+
+ P4_GEN_ESCR_EMASK(P4_EVENT_ITLB_REFERENCE, HIT, 0),
+ P4_GEN_ESCR_EMASK(P4_EVENT_ITLB_REFERENCE, MISS, 1),
+ P4_GEN_ESCR_EMASK(P4_EVENT_ITLB_REFERENCE, HIT_UK, 2),
+
+ P4_GEN_ESCR_EMASK(P4_EVENT_MEMORY_CANCEL, ST_RB_FULL, 2),
+ P4_GEN_ESCR_EMASK(P4_EVENT_MEMORY_CANCEL, 64K_CONF, 3),
+
+ P4_GEN_ESCR_EMASK(P4_EVENT_MEMORY_COMPLETE, LSC, 0),
+ P4_GEN_ESCR_EMASK(P4_EVENT_MEMORY_COMPLETE, SSC, 1),
+
+ P4_GEN_ESCR_EMASK(P4_EVENT_LOAD_PORT_REPLAY, SPLIT_LD, 1),
+
+ P4_GEN_ESCR_EMASK(P4_EVENT_STORE_PORT_REPLAY, SPLIT_ST, 1),
+
+ P4_GEN_ESCR_EMASK(P4_EVENT_MOB_LOAD_REPLAY, NO_STA, 1),
+ P4_GEN_ESCR_EMASK(P4_EVENT_MOB_LOAD_REPLAY, NO_STD, 3),
+ P4_GEN_ESCR_EMASK(P4_EVENT_MOB_LOAD_REPLAY, PARTIAL_DATA, 4),
+ P4_GEN_ESCR_EMASK(P4_EVENT_MOB_LOAD_REPLAY, UNALGN_ADDR, 5),
+
+ P4_GEN_ESCR_EMASK(P4_EVENT_PAGE_WALK_TYPE, DTMISS, 0),
+ P4_GEN_ESCR_EMASK(P4_EVENT_PAGE_WALK_TYPE, ITMISS, 1),
+
+ P4_GEN_ESCR_EMASK(P4_EVENT_BSQ_CACHE_REFERENCE, RD_2ndL_HITS, 0),
+ P4_GEN_ESCR_EMASK(P4_EVENT_BSQ_CACHE_REFERENCE, RD_2ndL_HITE, 1),
+ P4_GEN_ESCR_EMASK(P4_EVENT_BSQ_CACHE_REFERENCE, RD_2ndL_HITM, 2),
+ P4_GEN_ESCR_EMASK(P4_EVENT_BSQ_CACHE_REFERENCE, RD_3rdL_HITS, 3),
+ P4_GEN_ESCR_EMASK(P4_EVENT_BSQ_CACHE_REFERENCE, RD_3rdL_HITE, 4),
+ P4_GEN_ESCR_EMASK(P4_EVENT_BSQ_CACHE_REFERENCE, RD_3rdL_HITM, 5),
+ P4_GEN_ESCR_EMASK(P4_EVENT_BSQ_CACHE_REFERENCE, RD_2ndL_MISS, 8),
+ P4_GEN_ESCR_EMASK(P4_EVENT_BSQ_CACHE_REFERENCE, RD_3rdL_MISS, 9),
+ P4_GEN_ESCR_EMASK(P4_EVENT_BSQ_CACHE_REFERENCE, WR_2ndL_MISS, 10),
+
+ P4_GEN_ESCR_EMASK(P4_EVENT_IOQ_ALLOCATION, DEFAULT, 0),
+ P4_GEN_ESCR_EMASK(P4_EVENT_IOQ_ALLOCATION, ALL_READ, 5),
+ P4_GEN_ESCR_EMASK(P4_EVENT_IOQ_ALLOCATION, ALL_WRITE, 6),
+ P4_GEN_ESCR_EMASK(P4_EVENT_IOQ_ALLOCATION, MEM_UC, 7),
+ P4_GEN_ESCR_EMASK(P4_EVENT_IOQ_ALLOCATION, MEM_WC, 8),
+ P4_GEN_ESCR_EMASK(P4_EVENT_IOQ_ALLOCATION, MEM_WT, 9),
+ P4_GEN_ESCR_EMASK(P4_EVENT_IOQ_ALLOCATION, MEM_WP, 10),
+ P4_GEN_ESCR_EMASK(P4_EVENT_IOQ_ALLOCATION, MEM_WB, 11),
+ P4_GEN_ESCR_EMASK(P4_EVENT_IOQ_ALLOCATION, OWN, 13),
+ P4_GEN_ESCR_EMASK(P4_EVENT_IOQ_ALLOCATION, OTHER, 14),
+ P4_GEN_ESCR_EMASK(P4_EVENT_IOQ_ALLOCATION, PREFETCH, 15),
+
+ P4_GEN_ESCR_EMASK(P4_EVENT_IOQ_ACTIVE_ENTRIES, DEFAULT, 0),
+ P4_GEN_ESCR_EMASK(P4_EVENT_IOQ_ACTIVE_ENTRIES, ALL_READ, 5),
+ P4_GEN_ESCR_EMASK(P4_EVENT_IOQ_ACTIVE_ENTRIES, ALL_WRITE, 6),
+ P4_GEN_ESCR_EMASK(P4_EVENT_IOQ_ACTIVE_ENTRIES, MEM_UC, 7),
+ P4_GEN_ESCR_EMASK(P4_EVENT_IOQ_ACTIVE_ENTRIES, MEM_WC, 8),
+ P4_GEN_ESCR_EMASK(P4_EVENT_IOQ_ACTIVE_ENTRIES, MEM_WT, 9),
+ P4_GEN_ESCR_EMASK(P4_EVENT_IOQ_ACTIVE_ENTRIES, MEM_WP, 10),
+ P4_GEN_ESCR_EMASK(P4_EVENT_IOQ_ACTIVE_ENTRIES, MEM_WB, 11),
+ P4_GEN_ESCR_EMASK(P4_EVENT_IOQ_ACTIVE_ENTRIES, OWN, 13),
+ P4_GEN_ESCR_EMASK(P4_EVENT_IOQ_ACTIVE_ENTRIES, OTHER, 14),
+ P4_GEN_ESCR_EMASK(P4_EVENT_IOQ_ACTIVE_ENTRIES, PREFETCH, 15),
+
+ P4_GEN_ESCR_EMASK(P4_EVENT_FSB_DATA_ACTIVITY, DRDY_DRV, 0),
+ P4_GEN_ESCR_EMASK(P4_EVENT_FSB_DATA_ACTIVITY, DRDY_OWN, 1),
+ P4_GEN_ESCR_EMASK(P4_EVENT_FSB_DATA_ACTIVITY, DRDY_OTHER, 2),
+ P4_GEN_ESCR_EMASK(P4_EVENT_FSB_DATA_ACTIVITY, DBSY_DRV, 3),
+ P4_GEN_ESCR_EMASK(P4_EVENT_FSB_DATA_ACTIVITY, DBSY_OWN, 4),
+ P4_GEN_ESCR_EMASK(P4_EVENT_FSB_DATA_ACTIVITY, DBSY_OTHER, 5),
+
+ P4_GEN_ESCR_EMASK(P4_EVENT_BSQ_ALLOCATION, REQ_TYPE0, 0),
+ P4_GEN_ESCR_EMASK(P4_EVENT_BSQ_ALLOCATION, REQ_TYPE1, 1),
+ P4_GEN_ESCR_EMASK(P4_EVENT_BSQ_ALLOCATION, REQ_LEN0, 2),
+ P4_GEN_ESCR_EMASK(P4_EVENT_BSQ_ALLOCATION, REQ_LEN1, 3),
+ P4_GEN_ESCR_EMASK(P4_EVENT_BSQ_ALLOCATION, REQ_IO_TYPE, 5),
+ P4_GEN_ESCR_EMASK(P4_EVENT_BSQ_ALLOCATION, REQ_LOCK_TYPE, 6),
+ P4_GEN_ESCR_EMASK(P4_EVENT_BSQ_ALLOCATION, REQ_CACHE_TYPE, 7),
+ P4_GEN_ESCR_EMASK(P4_EVENT_BSQ_ALLOCATION, REQ_SPLIT_TYPE, 8),
+ P4_GEN_ESCR_EMASK(P4_EVENT_BSQ_ALLOCATION, REQ_DEM_TYPE, 9),
+ P4_GEN_ESCR_EMASK(P4_EVENT_BSQ_ALLOCATION, REQ_ORD_TYPE, 10),
+ P4_GEN_ESCR_EMASK(P4_EVENT_BSQ_ALLOCATION, MEM_TYPE0, 11),
+ P4_GEN_ESCR_EMASK(P4_EVENT_BSQ_ALLOCATION, MEM_TYPE1, 12),
+ P4_GEN_ESCR_EMASK(P4_EVENT_BSQ_ALLOCATION, MEM_TYPE2, 13),
+
+ P4_GEN_ESCR_EMASK(P4_EVENT_BSQ_ACTIVE_ENTRIES, REQ_TYPE0, 0),
+ P4_GEN_ESCR_EMASK(P4_EVENT_BSQ_ACTIVE_ENTRIES, REQ_TYPE1, 1),
+ P4_GEN_ESCR_EMASK(P4_EVENT_BSQ_ACTIVE_ENTRIES, REQ_LEN0, 2),
+ P4_GEN_ESCR_EMASK(P4_EVENT_BSQ_ACTIVE_ENTRIES, REQ_LEN1, 3),
+ P4_GEN_ESCR_EMASK(P4_EVENT_BSQ_ACTIVE_ENTRIES, REQ_IO_TYPE, 5),
+ P4_GEN_ESCR_EMASK(P4_EVENT_BSQ_ACTIVE_ENTRIES, REQ_LOCK_TYPE, 6),
+ P4_GEN_ESCR_EMASK(P4_EVENT_BSQ_ACTIVE_ENTRIES, REQ_CACHE_TYPE, 7),
+ P4_GEN_ESCR_EMASK(P4_EVENT_BSQ_ACTIVE_ENTRIES, REQ_SPLIT_TYPE, 8),
+ P4_GEN_ESCR_EMASK(P4_EVENT_BSQ_ACTIVE_ENTRIES, REQ_DEM_TYPE, 9),
+ P4_GEN_ESCR_EMASK(P4_EVENT_BSQ_ACTIVE_ENTRIES, REQ_ORD_TYPE, 10),
+ P4_GEN_ESCR_EMASK(P4_EVENT_BSQ_ACTIVE_ENTRIES, MEM_TYPE0, 11),
+ P4_GEN_ESCR_EMASK(P4_EVENT_BSQ_ACTIVE_ENTRIES, MEM_TYPE1, 12),
+ P4_GEN_ESCR_EMASK(P4_EVENT_BSQ_ACTIVE_ENTRIES, MEM_TYPE2, 13),
+
+ P4_GEN_ESCR_EMASK(P4_EVENT_SSE_INPUT_ASSIST, ALL, 15),
+
+ P4_GEN_ESCR_EMASK(P4_EVENT_PACKED_SP_UOP, ALL, 15),
+
+ P4_GEN_ESCR_EMASK(P4_EVENT_PACKED_DP_UOP, ALL, 15),
+
+ P4_GEN_ESCR_EMASK(P4_EVENT_SCALAR_SP_UOP, ALL, 15),
+
+ P4_GEN_ESCR_EMASK(P4_EVENT_SCALAR_DP_UOP, ALL, 15),
+
+ P4_GEN_ESCR_EMASK(P4_EVENT_64BIT_MMX_UOP, ALL, 15),
+
+ P4_GEN_ESCR_EMASK(P4_EVENT_128BIT_MMX_UOP, ALL, 15),
+
+ P4_GEN_ESCR_EMASK(P4_EVENT_X87_FP_UOP, ALL, 15),
+
+ P4_GEN_ESCR_EMASK(P4_EVENT_TC_MISC, FLUSH, 4),
+
+ P4_GEN_ESCR_EMASK(P4_EVENT_GLOBAL_POWER_EVENTS, RUNNING, 0),
+
+ P4_GEN_ESCR_EMASK(P4_EVENT_TC_MS_XFER, CISC, 0),
+
+ P4_GEN_ESCR_EMASK(P4_EVENT_UOP_QUEUE_WRITES, FROM_TC_BUILD, 0),
+ P4_GEN_ESCR_EMASK(P4_EVENT_UOP_QUEUE_WRITES, FROM_TC_DELIVER, 1),
+ P4_GEN_ESCR_EMASK(P4_EVENT_UOP_QUEUE_WRITES, FROM_ROM, 2),
+
+ P4_GEN_ESCR_EMASK(P4_EVENT_RETIRED_MISPRED_BRANCH_TYPE, CONDITIONAL, 1),
+ P4_GEN_ESCR_EMASK(P4_EVENT_RETIRED_MISPRED_BRANCH_TYPE, CALL, 2),
+ P4_GEN_ESCR_EMASK(P4_EVENT_RETIRED_MISPRED_BRANCH_TYPE, RETURN, 3),
+ P4_GEN_ESCR_EMASK(P4_EVENT_RETIRED_MISPRED_BRANCH_TYPE, INDIRECT, 4),
+
+ P4_GEN_ESCR_EMASK(P4_EVENT_RETIRED_BRANCH_TYPE, CONDITIONAL, 1),
+ P4_GEN_ESCR_EMASK(P4_EVENT_RETIRED_BRANCH_TYPE, CALL, 2),
+ P4_GEN_ESCR_EMASK(P4_EVENT_RETIRED_BRANCH_TYPE, RETURN, 3),
+ P4_GEN_ESCR_EMASK(P4_EVENT_RETIRED_BRANCH_TYPE, INDIRECT, 4),
+
+ P4_GEN_ESCR_EMASK(P4_EVENT_RESOURCE_STALL, SBFULL, 5),
+
+ P4_GEN_ESCR_EMASK(P4_EVENT_WC_BUFFER, WCB_EVICTS, 0),
+ P4_GEN_ESCR_EMASK(P4_EVENT_WC_BUFFER, WCB_FULL_EVICTS, 1),
+
+ P4_GEN_ESCR_EMASK(P4_EVENT_FRONT_END_EVENT, NBOGUS, 0),
+ P4_GEN_ESCR_EMASK(P4_EVENT_FRONT_END_EVENT, BOGUS, 1),
+
+ P4_GEN_ESCR_EMASK(P4_EVENT_EXECUTION_EVENT, NBOGUS0, 0),
+ P4_GEN_ESCR_EMASK(P4_EVENT_EXECUTION_EVENT, NBOGUS1, 1),
+ P4_GEN_ESCR_EMASK(P4_EVENT_EXECUTION_EVENT, NBOGUS2, 2),
+ P4_GEN_ESCR_EMASK(P4_EVENT_EXECUTION_EVENT, NBOGUS3, 3),
+ P4_GEN_ESCR_EMASK(P4_EVENT_EXECUTION_EVENT, BOGUS0, 4),
+ P4_GEN_ESCR_EMASK(P4_EVENT_EXECUTION_EVENT, BOGUS1, 5),
+ P4_GEN_ESCR_EMASK(P4_EVENT_EXECUTION_EVENT, BOGUS2, 6),
+ P4_GEN_ESCR_EMASK(P4_EVENT_EXECUTION_EVENT, BOGUS3, 7),
+
+ P4_GEN_ESCR_EMASK(P4_EVENT_REPLAY_EVENT, NBOGUS, 0),
+ P4_GEN_ESCR_EMASK(P4_EVENT_REPLAY_EVENT, BOGUS, 1),
+
+ P4_GEN_ESCR_EMASK(P4_EVENT_INSTR_RETIRED, NBOGUSNTAG, 0),
+ P4_GEN_ESCR_EMASK(P4_EVENT_INSTR_RETIRED, NBOGUSTAG, 1),
+ P4_GEN_ESCR_EMASK(P4_EVENT_INSTR_RETIRED, BOGUSNTAG, 2),
+ P4_GEN_ESCR_EMASK(P4_EVENT_INSTR_RETIRED, BOGUSTAG, 3),
+
+ P4_GEN_ESCR_EMASK(P4_EVENT_UOPS_RETIRED, NBOGUS, 0),
+ P4_GEN_ESCR_EMASK(P4_EVENT_UOPS_RETIRED, BOGUS, 1),
+
+ P4_GEN_ESCR_EMASK(P4_EVENT_UOP_TYPE, TAGLOADS, 1),
+ P4_GEN_ESCR_EMASK(P4_EVENT_UOP_TYPE, TAGSTORES, 2),
+
+ P4_GEN_ESCR_EMASK(P4_EVENT_BRANCH_RETIRED, MMNP, 0),
+ P4_GEN_ESCR_EMASK(P4_EVENT_BRANCH_RETIRED, MMNM, 1),
+ P4_GEN_ESCR_EMASK(P4_EVENT_BRANCH_RETIRED, MMTP, 2),
+ P4_GEN_ESCR_EMASK(P4_EVENT_BRANCH_RETIRED, MMTM, 3),
+
+ P4_GEN_ESCR_EMASK(P4_EVENT_MISPRED_BRANCH_RETIRED, NBOGUS, 0),
+
+ P4_GEN_ESCR_EMASK(P4_EVENT_X87_ASSIST, FPSU, 0),
+ P4_GEN_ESCR_EMASK(P4_EVENT_X87_ASSIST, FPSO, 1),
+ P4_GEN_ESCR_EMASK(P4_EVENT_X87_ASSIST, POAO, 2),
+ P4_GEN_ESCR_EMASK(P4_EVENT_X87_ASSIST, POAU, 3),
+ P4_GEN_ESCR_EMASK(P4_EVENT_X87_ASSIST, PREA, 4),
+
+ P4_GEN_ESCR_EMASK(P4_EVENT_MACHINE_CLEAR, CLEAR, 0),
+ P4_GEN_ESCR_EMASK(P4_EVENT_MACHINE_CLEAR, MOCLEAR, 1),
+ P4_GEN_ESCR_EMASK(P4_EVENT_MACHINE_CLEAR, SMCLEAR, 2),
+
+ P4_GEN_ESCR_EMASK(P4_EVENT_INSTR_COMPLETED, NBOGUS, 0),
+ P4_GEN_ESCR_EMASK(P4_EVENT_INSTR_COMPLETED, BOGUS, 1),
+};
+
+/* P4 PEBS: stale for a while */
+#define P4_PEBS_METRIC_MASK 0x00001fffU
+#define P4_PEBS_UOB_TAG 0x01000000U
+#define P4_PEBS_ENABLE 0x02000000U
+
+/* Replay metrics for MSR_IA32_PEBS_ENABLE and MSR_P4_PEBS_MATRIX_VERT */
+#define P4_PEBS__1stl_cache_load_miss_retired 0x3000001
+#define P4_PEBS__2ndl_cache_load_miss_retired 0x3000002
+#define P4_PEBS__dtlb_load_miss_retired 0x3000004
+#define P4_PEBS__dtlb_store_miss_retired 0x3000004
+#define P4_PEBS__dtlb_all_miss_retired 0x3000004
+#define P4_PEBS__tagged_mispred_branch 0x3018000
+#define P4_PEBS__mob_load_replay_retired 0x3000200
+#define P4_PEBS__split_load_retired 0x3000400
+#define P4_PEBS__split_store_retired 0x3000400
+
+#define P4_VERT__1stl_cache_load_miss_retired 0x0000001
+#define P4_VERT__2ndl_cache_load_miss_retired 0x0000001
+#define P4_VERT__dtlb_load_miss_retired 0x0000001
+#define P4_VERT__dtlb_store_miss_retired 0x0000002
+#define P4_VERT__dtlb_all_miss_retired 0x0000003
+#define P4_VERT__tagged_mispred_branch 0x0000010
+#define P4_VERT__mob_load_replay_retired 0x0000001
+#define P4_VERT__split_load_retired 0x0000001
+#define P4_VERT__split_store_retired 0x0000002
+
+enum P4_CACHE_EVENTS {
+ P4_CACHE__NONE,
+
+ P4_CACHE__1stl_cache_load_miss_retired,
+ P4_CACHE__2ndl_cache_load_miss_retired,
+ P4_CACHE__dtlb_load_miss_retired,
+ P4_CACHE__dtlb_store_miss_retired,
+ P4_CACHE__itlb_reference_hit,
+ P4_CACHE__itlb_reference_miss,
+
+ P4_CACHE__MAX
+};
+
+#endif /* PERF_EVENT_P4_H */
diff --git a/arch/x86/include/asm/processor.h b/arch/x86/include/asm/processor.h
index b684f587647c..5a51379dcbe4 100644
--- a/arch/x86/include/asm/processor.h
+++ b/arch/x86/include/asm/processor.h
@@ -21,7 +21,6 @@ struct mm_struct;
#include <asm/msr.h>
#include <asm/desc_defs.h>
#include <asm/nops.h>
-#include <asm/ds.h>
#include <linux/personality.h>
#include <linux/cpumask.h>
@@ -29,6 +28,7 @@ struct mm_struct;
#include <linux/threads.h>
#include <linux/math64.h>
#include <linux/init.h>
+#include <linux/err.h>
#define HBP_NUM 4
/*
@@ -113,7 +113,6 @@ struct cpuinfo_x86 {
/* Index into per_cpu list: */
u16 cpu_index;
#endif
- unsigned int x86_hyper_vendor;
} __attribute__((__aligned__(SMP_CACHE_BYTES)));
#define X86_VENDOR_INTEL 0
@@ -127,9 +126,6 @@ struct cpuinfo_x86 {
#define X86_VENDOR_UNKNOWN 0xff
-#define X86_HYPER_VENDOR_NONE 0
-#define X86_HYPER_VENDOR_VMWARE 1
-
/*
* capabilities of CPUs
*/
@@ -477,10 +473,6 @@ struct thread_struct {
unsigned long iopl;
/* Max allowed port in the bitmap, in bytes: */
unsigned io_bitmap_max;
-/* MSR_IA32_DEBUGCTLMSR value to switch in if TIF_DEBUGCTLMSR is set. */
- unsigned long debugctlmsr;
- /* Debug Store context; see asm/ds.h */
- struct ds_context *ds_ctx;
};
static inline unsigned long native_get_debugreg(int regno)
@@ -807,7 +799,7 @@ extern void cpu_init(void);
static inline unsigned long get_debugctlmsr(void)
{
- unsigned long debugctlmsr = 0;
+ unsigned long debugctlmsr = 0;
#ifndef CONFIG_X86_DEBUGCTLMSR
if (boot_cpu_data.x86 < 6)
@@ -815,21 +807,6 @@ static inline unsigned long get_debugctlmsr(void)
#endif
rdmsrl(MSR_IA32_DEBUGCTLMSR, debugctlmsr);
- return debugctlmsr;
-}
-
-static inline unsigned long get_debugctlmsr_on_cpu(int cpu)
-{
- u64 debugctlmsr = 0;
- u32 val1, val2;
-
-#ifndef CONFIG_X86_DEBUGCTLMSR
- if (boot_cpu_data.x86 < 6)
- return 0;
-#endif
- rdmsr_on_cpu(cpu, MSR_IA32_DEBUGCTLMSR, &val1, &val2);
- debugctlmsr = val1 | ((u64)val2 << 32);
-
return debugctlmsr;
}
@@ -842,18 +819,6 @@ static inline void update_debugctlmsr(unsigned long debugctlmsr)
wrmsrl(MSR_IA32_DEBUGCTLMSR, debugctlmsr);
}
-static inline void update_debugctlmsr_on_cpu(int cpu,
- unsigned long debugctlmsr)
-{
-#ifndef CONFIG_X86_DEBUGCTLMSR
- if (boot_cpu_data.x86 < 6)
- return;
-#endif
- wrmsr_on_cpu(cpu, MSR_IA32_DEBUGCTLMSR,
- (u32)((u64)debugctlmsr),
- (u32)((u64)debugctlmsr >> 32));
-}
-
/*
* from system description table in BIOS. Mostly for MCA use, but
* others may find it useful:
diff --git a/arch/x86/include/asm/ptrace-abi.h b/arch/x86/include/asm/ptrace-abi.h
index 86723035a515..52b098a6eebb 100644
--- a/arch/x86/include/asm/ptrace-abi.h
+++ b/arch/x86/include/asm/ptrace-abi.h
@@ -82,61 +82,6 @@
#ifndef __ASSEMBLY__
#include <linux/types.h>
-
-/* configuration/status structure used in PTRACE_BTS_CONFIG and
- PTRACE_BTS_STATUS commands.
-*/
-struct ptrace_bts_config {
- /* requested or actual size of BTS buffer in bytes */
- __u32 size;
- /* bitmask of below flags */
- __u32 flags;
- /* buffer overflow signal */
- __u32 signal;
- /* actual size of bts_struct in bytes */
- __u32 bts_size;
-};
-#endif /* __ASSEMBLY__ */
-
-#define PTRACE_BTS_O_TRACE 0x1 /* branch trace */
-#define PTRACE_BTS_O_SCHED 0x2 /* scheduling events w/ jiffies */
-#define PTRACE_BTS_O_SIGNAL 0x4 /* send SIG<signal> on buffer overflow
- instead of wrapping around */
-#define PTRACE_BTS_O_ALLOC 0x8 /* (re)allocate buffer */
-
-#define PTRACE_BTS_CONFIG 40
-/* Configure branch trace recording.
- ADDR points to a struct ptrace_bts_config.
- DATA gives the size of that buffer.
- A new buffer is allocated, if requested in the flags.
- An overflow signal may only be requested for new buffers.
- Returns the number of bytes read.
-*/
-#define PTRACE_BTS_STATUS 41
-/* Return the current configuration in a struct ptrace_bts_config
- pointed to by ADDR; DATA gives the size of that buffer.
- Returns the number of bytes written.
-*/
-#define PTRACE_BTS_SIZE 42
-/* Return the number of available BTS records for draining.
- DATA and ADDR are ignored.
-*/
-#define PTRACE_BTS_GET 43
-/* Get a single BTS record.
- DATA defines the index into the BTS array, where 0 is the newest
- entry, and higher indices refer to older entries.
- ADDR is pointing to struct bts_struct (see asm/ds.h).
-*/
-#define PTRACE_BTS_CLEAR 44
-/* Clear the BTS buffer.
- DATA and ADDR are ignored.
-*/
-#define PTRACE_BTS_DRAIN 45
-/* Read all available BTS records and clear the buffer.
- ADDR points to an array of struct bts_struct.
- DATA gives the size of that buffer.
- BTS records are read from oldest to newest.
- Returns number of BTS records drained.
-*/
+#endif
#endif /* _ASM_X86_PTRACE_ABI_H */
diff --git a/arch/x86/include/asm/ptrace.h b/arch/x86/include/asm/ptrace.h
index 69a686a7dff0..78cd1ea94500 100644
--- a/arch/x86/include/asm/ptrace.h
+++ b/arch/x86/include/asm/ptrace.h
@@ -289,12 +289,6 @@ extern int do_get_thread_area(struct task_struct *p, int idx,
extern int do_set_thread_area(struct task_struct *p, int idx,
struct user_desc __user *info, int can_allocate);
-#ifdef CONFIG_X86_PTRACE_BTS
-extern void ptrace_bts_untrace(struct task_struct *tsk);
-
-#define arch_ptrace_untrace(tsk) ptrace_bts_untrace(tsk)
-#endif /* CONFIG_X86_PTRACE_BTS */
-
#endif /* __KERNEL__ */
#endif /* !__ASSEMBLY__ */
diff --git a/arch/x86/include/asm/thread_info.h b/arch/x86/include/asm/thread_info.h
index e9e341505ab3..d4092fac226b 100644
--- a/arch/x86/include/asm/thread_info.h
+++ b/arch/x86/include/asm/thread_info.h
@@ -92,8 +92,7 @@ struct thread_info {
#define TIF_IO_BITMAP 22 /* uses I/O bitmap */
#define TIF_FREEZE 23 /* is freezing for suspend */
#define TIF_FORCED_TF 24 /* true if TF in eflags artificially */
-#define TIF_DEBUGCTLMSR 25 /* uses thread_struct.debugctlmsr */
-#define TIF_DS_AREA_MSR 26 /* uses thread_struct.ds_area_msr */
+#define TIF_BLOCKSTEP 25 /* set when we want DEBUGCTLMSR_BTF */
#define TIF_LAZY_MMU_UPDATES 27 /* task is updating the mmu lazily */
#define TIF_SYSCALL_TRACEPOINT 28 /* syscall tracepoint instrumentation */
@@ -115,8 +114,7 @@ struct thread_info {
#define _TIF_IO_BITMAP (1 << TIF_IO_BITMAP)
#define _TIF_FREEZE (1 << TIF_FREEZE)
#define _TIF_FORCED_TF (1 << TIF_FORCED_TF)
-#define _TIF_DEBUGCTLMSR (1 << TIF_DEBUGCTLMSR)
-#define _TIF_DS_AREA_MSR (1 << TIF_DS_AREA_MSR)
+#define _TIF_BLOCKSTEP (1 << TIF_BLOCKSTEP)
#define _TIF_LAZY_MMU_UPDATES (1 << TIF_LAZY_MMU_UPDATES)
#define _TIF_SYSCALL_TRACEPOINT (1 << TIF_SYSCALL_TRACEPOINT)
@@ -147,7 +145,7 @@ struct thread_info {
/* flags to check in __switch_to() */
#define _TIF_WORK_CTXSW \
- (_TIF_IO_BITMAP|_TIF_DEBUGCTLMSR|_TIF_DS_AREA_MSR|_TIF_NOTSC)
+ (_TIF_IO_BITMAP|_TIF_NOTSC|_TIF_BLOCKSTEP)
#define _TIF_WORK_CTXSW_PREV (_TIF_WORK_CTXSW|_TIF_USER_RETURN_NOTIFY)
#define _TIF_WORK_CTXSW_NEXT (_TIF_WORK_CTXSW|_TIF_DEBUG)
diff --git a/arch/x86/include/asm/vmware.h b/arch/x86/include/asm/vmware.h
deleted file mode 100644
index e49ed6d2fd4e..000000000000
--- a/arch/x86/include/asm/vmware.h
+++ /dev/null
@@ -1,27 +0,0 @@
-/*
- * Copyright (C) 2008, VMware, Inc.
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation; either version 2 of the License, or
- * (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful, but
- * WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or
- * NON INFRINGEMENT. See the GNU General Public License for more
- * details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
- *
- */
-#ifndef ASM_X86__VMWARE_H
-#define ASM_X86__VMWARE_H
-
-extern void vmware_platform_setup(void);
-extern int vmware_platform(void);
-extern void vmware_set_feature_bits(struct cpuinfo_x86 *c);
-
-#endif
diff --git a/arch/x86/kernel/Makefile b/arch/x86/kernel/Makefile
index 4c58352209e0..e77b22083721 100644
--- a/arch/x86/kernel/Makefile
+++ b/arch/x86/kernel/Makefile
@@ -47,8 +47,6 @@ obj-$(CONFIG_X86_TRAMPOLINE) += trampoline.o
obj-y += process.o
obj-y += i387.o xsave.o
obj-y += ptrace.o
-obj-$(CONFIG_X86_DS) += ds.o
-obj-$(CONFIG_X86_DS_SELFTEST) += ds_selftest.o
obj-$(CONFIG_X86_32) += tls.o
obj-$(CONFIG_IA32_EMULATION) += tls.o
obj-y += step.o
diff --git a/arch/x86/kernel/alternative.c b/arch/x86/kernel/alternative.c
index 1a160d5d44d0..70237732a6c7 100644
--- a/arch/x86/kernel/alternative.c
+++ b/arch/x86/kernel/alternative.c
@@ -194,7 +194,7 @@ static void __init_or_module add_nops(void *insns, unsigned int len)
}
extern struct alt_instr __alt_instructions[], __alt_instructions_end[];
-extern u8 *__smp_locks[], *__smp_locks_end[];
+extern s32 __smp_locks[], __smp_locks_end[];
static void *text_poke_early(void *addr, const void *opcode, size_t len);
/* Replace instructions with better alternatives for this CPU type.
@@ -235,37 +235,41 @@ void __init_or_module apply_alternatives(struct alt_instr *start,
#ifdef CONFIG_SMP
-static void alternatives_smp_lock(u8 **start, u8 **end, u8 *text, u8 *text_end)
+static void alternatives_smp_lock(const s32 *start, const s32 *end,
+ u8 *text, u8 *text_end)
{
- u8 **ptr;
+ const s32 *poff;
mutex_lock(&text_mutex);
- for (ptr = start; ptr < end; ptr++) {
- if (*ptr < text)
- continue;
- if (*ptr > text_end)
+ for (poff = start; poff < end; poff++) {
+ u8 *ptr = (u8 *)poff + *poff;
+
+ if (!*poff || ptr < text || ptr >= text_end)
continue;
/* turn DS segment override prefix into lock prefix */
- text_poke(*ptr, ((unsigned char []){0xf0}), 1);
+ if (*ptr == 0x3e)
+ text_poke(ptr, ((unsigned char []){0xf0}), 1);
};
mutex_unlock(&text_mutex);
}
-static void alternatives_smp_unlock(u8 **start, u8 **end, u8 *text, u8 *text_end)
+static void alternatives_smp_unlock(const s32 *start, const s32 *end,
+ u8 *text, u8 *text_end)
{
- u8 **ptr;
+ const s32 *poff;
if (noreplace_smp)
return;
mutex_lock(&text_mutex);
- for (ptr = start; ptr < end; ptr++) {
- if (*ptr < text)
- continue;
- if (*ptr > text_end)
+ for (poff = start; poff < end; poff++) {
+ u8 *ptr = (u8 *)poff + *poff;
+
+ if (!*poff || ptr < text || ptr >= text_end)
continue;
/* turn lock prefix into DS segment override prefix */
- text_poke(*ptr, ((unsigned char []){0x3E}), 1);
+ if (*ptr == 0xf0)
+ text_poke(ptr, ((unsigned char []){0x3E}), 1);
};
mutex_unlock(&text_mutex);
}
@@ -276,8 +280,8 @@ struct smp_alt_module {
char *name;
/* ptrs to lock prefixes */
- u8 **locks;
- u8 **locks_end;
+ const s32 *locks;
+ const s32 *locks_end;
/* .text segment, needed to avoid patching init code ;) */
u8 *text;
@@ -398,16 +402,19 @@ void alternatives_smp_switch(int smp)
int alternatives_text_reserved(void *start, void *end)
{
struct smp_alt_module *mod;
- u8 **ptr;
+ const s32 *poff;
u8 *text_start = start;
u8 *text_end = end;
list_for_each_entry(mod, &smp_alt_modules, next) {
if (mod->text > text_end || mod->text_end < text_start)
continue;
- for (ptr = mod->locks; ptr < mod->locks_end; ptr++)
- if (text_start <= *ptr && text_end >= *ptr)
+ for (poff = mod->locks; poff < mod->locks_end; poff++) {
+ const u8 *ptr = (const u8 *)poff + *poff;
+
+ if (text_start <= ptr && text_end > ptr)
return 1;
+ }
}
return 0;
diff --git a/arch/x86/kernel/amd_iommu.c b/arch/x86/kernel/amd_iommu.c
index f854d89b7edf..fa5a1474cd18 100644
--- a/arch/x86/kernel/amd_iommu.c
+++ b/arch/x86/kernel/amd_iommu.c
@@ -731,18 +731,22 @@ static bool increase_address_space(struct protection_domain *domain,
static u64 *alloc_pte(struct protection_domain *domain,
unsigned long address,
- int end_lvl,
+ unsigned long page_size,
u64 **pte_page,
gfp_t gfp)
{
+ int level, end_lvl;
u64 *pte, *page;
- int level;
+
+ BUG_ON(!is_power_of_2(page_size));
while (address > PM_LEVEL_SIZE(domain->mode))
increase_address_space(domain, gfp);
- level = domain->mode - 1;
- pte = &domain->pt_root[PM_LEVEL_INDEX(level, address)];
+ level = domain->mode - 1;
+ pte = &domain->pt_root[PM_LEVEL_INDEX(level, address)];
+ address = PAGE_SIZE_ALIGN(address, page_size);
+ end_lvl = PAGE_SIZE_LEVEL(page_size);
while (level > end_lvl) {
if (!IOMMU_PTE_PRESENT(*pte)) {
@@ -752,6 +756,10 @@ static u64 *alloc_pte(struct protection_domain *domain,
*pte = PM_LEVEL_PDE(level, virt_to_phys(page));
}
+ /* No level skipping support yet */
+ if (PM_PTE_LEVEL(*pte) != level)
+ return NULL;
+
level -= 1;
pte = IOMMU_PTE_PAGE(*pte);
@@ -769,28 +777,47 @@ static u64 *alloc_pte(struct protection_domain *domain,
* This function checks if there is a PTE for a given dma address. If
* there is one, it returns the pointer to it.
*/
-static u64 *fetch_pte(struct protection_domain *domain,
- unsigned long address, int map_size)
+static u64 *fetch_pte(struct protection_domain *domain, unsigned long address)
{
int level;
u64 *pte;
- level = domain->mode - 1;
- pte = &domain->pt_root[PM_LEVEL_INDEX(level, address)];
+ if (address > PM_LEVEL_SIZE(domain->mode))
+ return NULL;
+
+ level = domain->mode - 1;
+ pte = &domain->pt_root[PM_LEVEL_INDEX(level, address)];
- while (level > map_size) {
+ while (level > 0) {
+
+ /* Not Present */
if (!IOMMU_PTE_PRESENT(*pte))
return NULL;
+ /* Large PTE */
+ if (PM_PTE_LEVEL(*pte) == 0x07) {
+ unsigned long pte_mask, __pte;
+
+ /*
+ * If we have a series of large PTEs, make
+ * sure to return a pointer to the first one.
+ */
+ pte_mask = PTE_PAGE_SIZE(*pte);
+ pte_mask = ~((PAGE_SIZE_PTE_COUNT(pte_mask) << 3) - 1);
+ __pte = ((unsigned long)pte) & pte_mask;
+
+ return (u64 *)__pte;
+ }
+
+ /* No level skipping support yet */
+ if (PM_PTE_LEVEL(*pte) != level)
+ return NULL;
+
level -= 1;
+ /* Walk to the next level */
pte = IOMMU_PTE_PAGE(*pte);
pte = &pte[PM_LEVEL_INDEX(level, address)];
-
- if ((PM_PTE_LEVEL(*pte) == 0) && level != map_size) {
- pte = NULL;
- break;
- }
}
return pte;
@@ -807,44 +834,84 @@ static int iommu_map_page(struct protection_domain *dom,
unsigned long bus_addr,
unsigned long phys_addr,
int prot,
- int map_size)
+ unsigned long page_size)
{
u64 __pte, *pte;
-
- bus_addr = PAGE_ALIGN(bus_addr);
- phys_addr = PAGE_ALIGN(phys_addr);
-
- BUG_ON(!PM_ALIGNED(map_size, bus_addr));
- BUG_ON(!PM_ALIGNED(map_size, phys_addr));
+ int i, count;
if (!(prot & IOMMU_PROT_MASK))
return -EINVAL;
- pte = alloc_pte(dom, bus_addr, map_size, NULL, GFP_KERNEL);
+ bus_addr = PAGE_ALIGN(bus_addr);
+ phys_addr = PAGE_ALIGN(phys_addr);
+ count = PAGE_SIZE_PTE_COUNT(page_size);
+ pte = alloc_pte(dom, bus_addr, page_size, NULL, GFP_KERNEL);
+
+ for (i = 0; i < count; ++i)
+ if (IOMMU_PTE_PRESENT(pte[i]))
+ return -EBUSY;
- if (IOMMU_PTE_PRESENT(*pte))
- return -EBUSY;
+ if (page_size > PAGE_SIZE) {
+ __pte = PAGE_SIZE_PTE(phys_addr, page_size);
+ __pte |= PM_LEVEL_ENC(7) | IOMMU_PTE_P | IOMMU_PTE_FC;
+ } else
+ __pte = phys_addr | IOMMU_PTE_P | IOMMU_PTE_FC;
- __pte = phys_addr | IOMMU_PTE_P;
if (prot & IOMMU_PROT_IR)
__pte |= IOMMU_PTE_IR;
if (prot & IOMMU_PROT_IW)
__pte |= IOMMU_PTE_IW;
- *pte = __pte;
+ for (i = 0; i < count; ++i)
+ pte[i] = __pte;
update_domain(dom);
return 0;
}
-static void iommu_unmap_page(struct protection_domain *dom,
- unsigned long bus_addr, int map_size)
+static unsigned long iommu_unmap_page(struct protection_domain *dom,
+ unsigned long bus_addr,
+ unsigned long page_size)
{
- u64 *pte = fetch_pte(dom, bus_addr, map_size);
+ unsigned long long unmap_size, unmapped;
+ u64 *pte;
+
+ BUG_ON(!is_power_of_2(page_size));
+
+ unmapped = 0;
- if (pte)
- *pte = 0;
+ while (unmapped < page_size) {
+
+ pte = fetch_pte(dom, bus_addr);
+
+ if (!pte) {
+ /*
+ * No PTE for this address
+ * move forward in 4kb steps
+ */
+ unmap_size = PAGE_SIZE;
+ } else if (PM_PTE_LEVEL(*pte) == 0) {
+ /* 4kb PTE found for this address */
+ unmap_size = PAGE_SIZE;
+ *pte = 0ULL;
+ } else {
+ int count, i;
+
+ /* Large PTE found which maps this address */
+ unmap_size = PTE_PAGE_SIZE(*pte);
+ count = PAGE_SIZE_PTE_COUNT(unmap_size);
+ for (i = 0; i < count; i++)
+ pte[i] = 0ULL;
+ }
+
+ bus_addr = (bus_addr & ~(unmap_size - 1)) + unmap_size;
+ unmapped += unmap_size;
+ }
+
+ BUG_ON(!is_power_of_2(unmapped));
+
+ return unmapped;
}
/*
@@ -878,7 +945,7 @@ static int dma_ops_unity_map(struct dma_ops_domain *dma_dom,
for (addr = e->address_start; addr < e->address_end;
addr += PAGE_SIZE) {
ret = iommu_map_page(&dma_dom->domain, addr, addr, e->prot,
- PM_MAP_4k);
+ PAGE_SIZE);
if (ret)
return ret;
/*
@@ -1006,7 +1073,7 @@ static int alloc_new_range(struct dma_ops_domain *dma_dom,
u64 *pte, *pte_page;
for (i = 0; i < num_ptes; ++i) {
- pte = alloc_pte(&dma_dom->domain, address, PM_MAP_4k,
+ pte = alloc_pte(&dma_dom->domain, address, PAGE_SIZE,
&pte_page, gfp);
if (!pte)
goto out_free;
@@ -1042,7 +1109,7 @@ static int alloc_new_range(struct dma_ops_domain *dma_dom,
for (i = dma_dom->aperture[index]->offset;
i < dma_dom->aperture_size;
i += PAGE_SIZE) {
- u64 *pte = fetch_pte(&dma_dom->domain, i, PM_MAP_4k);
+ u64 *pte = fetch_pte(&dma_dom->domain, i);
if (!pte || !IOMMU_PTE_PRESENT(*pte))
continue;
@@ -1712,7 +1779,7 @@ static u64* dma_ops_get_pte(struct dma_ops_domain *dom,
pte = aperture->pte_pages[APERTURE_PAGE_INDEX(address)];
if (!pte) {
- pte = alloc_pte(&dom->domain, address, PM_MAP_4k, &pte_page,
+ pte = alloc_pte(&dom->domain, address, PAGE_SIZE, &pte_page,
GFP_ATOMIC);
aperture->pte_pages[APERTURE_PAGE_INDEX(address)] = pte_page;
} else
@@ -2439,12 +2506,11 @@ static int amd_iommu_attach_device(struct iommu_domain *dom,
return ret;
}
-static int amd_iommu_map_range(struct iommu_domain *dom,
- unsigned long iova, phys_addr_t paddr,
- size_t size, int iommu_prot)
+static int amd_iommu_map(struct iommu_domain *dom, unsigned long iova,
+ phys_addr_t paddr, int gfp_order, int iommu_prot)
{
+ unsigned long page_size = 0x1000UL << gfp_order;
struct protection_domain *domain = dom->priv;
- unsigned long i, npages = iommu_num_pages(paddr, size, PAGE_SIZE);
int prot = 0;
int ret;
@@ -2453,61 +2519,50 @@ static int amd_iommu_map_range(struct iommu_domain *dom,
if (iommu_prot & IOMMU_WRITE)
prot |= IOMMU_PROT_IW;
- iova &= PAGE_MASK;
- paddr &= PAGE_MASK;
-
mutex_lock(&domain->api_lock);
-
- for (i = 0; i < npages; ++i) {
- ret = iommu_map_page(domain, iova, paddr, prot, PM_MAP_4k);
- if (ret)
- return ret;
-
- iova += PAGE_SIZE;
- paddr += PAGE_SIZE;
- }
-
+ ret = iommu_map_page(domain, iova, paddr, prot, page_size);
mutex_unlock(&domain->api_lock);
- return 0;
+ return ret;
}
-static void amd_iommu_unmap_range(struct iommu_domain *dom,
- unsigned long iova, size_t size)
+static int amd_iommu_unmap(struct iommu_domain *dom, unsigned long iova,
+ int gfp_order)
{
-
struct protection_domain *domain = dom->priv;
- unsigned long i, npages = iommu_num_pages(iova, size, PAGE_SIZE);
+ unsigned long page_size, unmap_size;
- iova &= PAGE_MASK;
+ page_size = 0x1000UL << gfp_order;
mutex_lock(&domain->api_lock);
-
- for (i = 0; i < npages; ++i) {
- iommu_unmap_page(domain, iova, PM_MAP_4k);
- iova += PAGE_SIZE;
- }
+ unmap_size = iommu_unmap_page(domain, iova, page_size);
+ mutex_unlock(&domain->api_lock);
iommu_flush_tlb_pde(domain);
- mutex_unlock(&domain->api_lock);
+ return get_order(unmap_size);
}
static phys_addr_t amd_iommu_iova_to_phys(struct iommu_domain *dom,
unsigned long iova)
{
struct protection_domain *domain = dom->priv;
- unsigned long offset = iova & ~PAGE_MASK;
+ unsigned long offset_mask;
phys_addr_t paddr;
- u64 *pte;
+ u64 *pte, __pte;
- pte = fetch_pte(domain, iova, PM_MAP_4k);
+ pte = fetch_pte(domain, iova);
if (!pte || !IOMMU_PTE_PRESENT(*pte))
return 0;
- paddr = *pte & IOMMU_PAGE_MASK;
- paddr |= offset;
+ if (PM_PTE_LEVEL(*pte) == 0)
+ offset_mask = PAGE_SIZE - 1;
+ else
+ offset_mask = PTE_PAGE_SIZE(*pte) - 1;
+
+ __pte = *pte & PM_ADDR_MASK;
+ paddr = (__pte & ~offset_mask) | (iova & offset_mask);
return paddr;
}
@@ -2523,8 +2578,8 @@ static struct iommu_ops amd_iommu_ops = {
.domain_destroy = amd_iommu_domain_destroy,
.attach_dev = amd_iommu_attach_device,
.detach_dev = amd_iommu_detach_device,
- .map = amd_iommu_map_range,
- .unmap = amd_iommu_unmap_range,
+ .map = amd_iommu_map,
+ .unmap = amd_iommu_unmap,
.iova_to_phys = amd_iommu_iova_to_phys,
.domain_has_cap = amd_iommu_domain_has_cap,
};
diff --git a/arch/x86/kernel/amd_iommu_init.c b/arch/x86/kernel/amd_iommu_init.c
index 6360abf993d4..3bacb4d0844c 100644
--- a/arch/x86/kernel/amd_iommu_init.c
+++ b/arch/x86/kernel/amd_iommu_init.c
@@ -120,6 +120,7 @@ struct ivmd_header {
bool amd_iommu_dump;
static int __initdata amd_iommu_detected;
+static bool __initdata amd_iommu_disabled;
u16 amd_iommu_last_bdf; /* largest PCI device id we have
to handle */
@@ -1372,6 +1373,9 @@ void __init amd_iommu_detect(void)
if (no_iommu || (iommu_detected && !gart_iommu_aperture))
return;
+ if (amd_iommu_disabled)
+ return;
+
if (acpi_table_parse("IVRS", early_amd_iommu_detect) == 0) {
iommu_detected = 1;
amd_iommu_detected = 1;
@@ -1401,6 +1405,8 @@ static int __init parse_amd_iommu_options(char *str)
for (; *str; ++str) {
if (strncmp(str, "fullflush", 9) == 0)
amd_iommu_unmap_flush = true;
+ if (strncmp(str, "off", 3) == 0)
+ amd_iommu_disabled = true;
}
return 1;
diff --git a/arch/x86/kernel/apic/io_apic.c b/arch/x86/kernel/apic/io_apic.c
index 127b8718abfb..eb2789c3f721 100644
--- a/arch/x86/kernel/apic/io_apic.c
+++ b/arch/x86/kernel/apic/io_apic.c
@@ -2545,6 +2545,9 @@ void irq_force_complete_move(int irq)
struct irq_desc *desc = irq_to_desc(irq);
struct irq_cfg *cfg = desc->chip_data;
+ if (!cfg)
+ return;
+
__irq_complete_move(&desc, cfg->vector);
}
#else
diff --git a/arch/x86/kernel/apm_32.c b/arch/x86/kernel/apm_32.c
index 031aa887b0eb..c4f9182ca3ac 100644
--- a/arch/x86/kernel/apm_32.c
+++ b/arch/x86/kernel/apm_32.c
@@ -1224,7 +1224,7 @@ static void reinit_timer(void)
#ifdef INIT_TIMER_AFTER_SUSPEND
unsigned long flags;
- spin_lock_irqsave(&i8253_lock, flags);
+ raw_spin_lock_irqsave(&i8253_lock, flags);
/* set the clock to HZ */
outb_pit(0x34, PIT_MODE); /* binary, mode 2, LSB/MSB, ch 0 */
udelay(10);
@@ -1232,7 +1232,7 @@ static void reinit_timer(void)
udelay(10);
outb_pit(LATCH >> 8, PIT_CH0); /* MSB */
udelay(10);
- spin_unlock_irqrestore(&i8253_lock, flags);
+ raw_spin_unlock_irqrestore(&i8253_lock, flags);
#endif
}
diff --git a/arch/x86/kernel/cpu/Makefile b/arch/x86/kernel/cpu/Makefile
index c202b62f3671..3a785da34b6f 100644
--- a/arch/x86/kernel/cpu/Makefile
+++ b/arch/x86/kernel/cpu/Makefile
@@ -14,7 +14,7 @@ CFLAGS_common.o := $(nostackp)
obj-y := intel_cacheinfo.o addon_cpuid_features.o
obj-y += proc.o capflags.o powerflags.o common.o
-obj-y += vmware.o hypervisor.o sched.o
+obj-y += vmware.o hypervisor.o sched.o mshyperv.o
obj-$(CONFIG_X86_32) += bugs.o cmpxchg.o
obj-$(CONFIG_X86_64) += bugs_64.o
diff --git a/arch/x86/kernel/cpu/addon_cpuid_features.c b/arch/x86/kernel/cpu/addon_cpuid_features.c
index 97ad79cdf688..10fa5684a662 100644
--- a/arch/x86/kernel/cpu/addon_cpuid_features.c
+++ b/arch/x86/kernel/cpu/addon_cpuid_features.c
@@ -30,12 +30,14 @@ void __cpuinit init_scattered_cpuid_features(struct cpuinfo_x86 *c)
const struct cpuid_bit *cb;
static const struct cpuid_bit __cpuinitconst cpuid_bits[] = {
- { X86_FEATURE_IDA, CR_EAX, 1, 0x00000006 },
- { X86_FEATURE_ARAT, CR_EAX, 2, 0x00000006 },
- { X86_FEATURE_NPT, CR_EDX, 0, 0x8000000a },
- { X86_FEATURE_LBRV, CR_EDX, 1, 0x8000000a },
- { X86_FEATURE_SVML, CR_EDX, 2, 0x8000000a },
- { X86_FEATURE_NRIPS, CR_EDX, 3, 0x8000000a },
+ { X86_FEATURE_IDA, CR_EAX, 1, 0x00000006 },
+ { X86_FEATURE_ARAT, CR_EAX, 2, 0x00000006 },
+ { X86_FEATURE_APERFMPERF, CR_ECX, 0, 0x00000006 },
+ { X86_FEATURE_CPB, CR_EDX, 9, 0x80000007 },
+ { X86_FEATURE_NPT, CR_EDX, 0, 0x8000000a },
+ { X86_FEATURE_LBRV, CR_EDX, 1, 0x8000000a },
+ { X86_FEATURE_SVML, CR_EDX, 2, 0x8000000a },
+ { X86_FEATURE_NRIPS, CR_EDX, 3, 0x8000000a },
{ 0, 0, 0, 0 }
};
diff --git a/arch/x86/kernel/cpu/cpufreq/Makefile b/arch/x86/kernel/cpu/cpufreq/Makefile
index 1840c0a5170b..bd54bf67e6fb 100644
--- a/arch/x86/kernel/cpu/cpufreq/Makefile
+++ b/arch/x86/kernel/cpu/cpufreq/Makefile
@@ -2,8 +2,8 @@
# K8 systems. ACPI is preferred to all other hardware-specific drivers.
# speedstep-* is preferred over p4-clockmod.
-obj-$(CONFIG_X86_POWERNOW_K8) += powernow-k8.o
-obj-$(CONFIG_X86_ACPI_CPUFREQ) += acpi-cpufreq.o
+obj-$(CONFIG_X86_POWERNOW_K8) += powernow-k8.o mperf.o
+obj-$(CONFIG_X86_ACPI_CPUFREQ) += acpi-cpufreq.o mperf.o
obj-$(CONFIG_X86_PCC_CPUFREQ) += pcc-cpufreq.o
obj-$(CONFIG_X86_POWERNOW_K6) += powernow-k6.o
obj-$(CONFIG_X86_POWERNOW_K7) += powernow-k7.o
diff --git a/arch/x86/kernel/cpu/cpufreq/acpi-cpufreq.c b/arch/x86/kernel/cpu/cpufreq/acpi-cpufreq.c
index 459168083b77..1d3cddaa40ee 100644
--- a/arch/x86/kernel/cpu/cpufreq/acpi-cpufreq.c
+++ b/arch/x86/kernel/cpu/cpufreq/acpi-cpufreq.c
@@ -46,6 +46,7 @@
#include <asm/msr.h>
#include <asm/processor.h>
#include <asm/cpufeature.h>
+#include "mperf.h"
#define dprintk(msg...) cpufreq_debug_printk(CPUFREQ_DEBUG_DRIVER, \
"acpi-cpufreq", msg)
@@ -71,8 +72,6 @@ struct acpi_cpufreq_data {
static DEFINE_PER_CPU(struct acpi_cpufreq_data *, acfreq_data);
-static DEFINE_PER_CPU(struct aperfmperf, acfreq_old_perf);
-
/* acpi_perf_data is a pointer to percpu data. */
static struct acpi_processor_performance *acpi_perf_data;
@@ -240,45 +239,6 @@ static u32 get_cur_val(const struct cpumask *mask)
return cmd.val;
}
-/* Called via smp_call_function_single(), on the target CPU */
-static void read_measured_perf_ctrs(void *_cur)
-{
- struct aperfmperf *am = _cur;
-
- get_aperfmperf(am);
-}
-
-/*
- * Return the measured active (C0) frequency on this CPU since last call
- * to this function.
- * Input: cpu number
- * Return: Average CPU frequency in terms of max frequency (zero on error)
- *
- * We use IA32_MPERF and IA32_APERF MSRs to get the measured performance
- * over a period of time, while CPU is in C0 state.
- * IA32_MPERF counts at the rate of max advertised frequency
- * IA32_APERF counts at the rate of actual CPU frequency
- * Only IA32_APERF/IA32_MPERF ratio is architecturally defined and
- * no meaning should be associated with absolute values of these MSRs.
- */
-static unsigned int get_measured_perf(struct cpufreq_policy *policy,
- unsigned int cpu)
-{
- struct aperfmperf perf;
- unsigned long ratio;
- unsigned int retval;
-
- if (smp_call_function_single(cpu, read_measured_perf_ctrs, &perf, 1))
- return 0;
-
- ratio = calc_aperfmperf_ratio(&per_cpu(acfreq_old_perf, cpu), &perf);
- per_cpu(acfreq_old_perf, cpu) = perf;
-
- retval = (policy->cpuinfo.max_freq * ratio) >> APERFMPERF_SHIFT;
-
- return retval;
-}
-
static unsigned int get_cur_freq_on_cpu(unsigned int cpu)
{
struct acpi_cpufreq_data *data = per_cpu(acfreq_data, cpu);
@@ -702,7 +662,7 @@ static int acpi_cpufreq_cpu_init(struct cpufreq_policy *policy)
/* Check for APERF/MPERF support in hardware */
if (cpu_has(c, X86_FEATURE_APERFMPERF))
- acpi_cpufreq_driver.getavg = get_measured_perf;
+ acpi_cpufreq_driver.getavg = cpufreq_get_measured_perf;
dprintk("CPU%u - ACPI performance management activated.\n", cpu);
for (i = 0; i < perf->state_count; i++)
diff --git a/arch/x86/kernel/cpu/cpufreq/mperf.c b/arch/x86/kernel/cpu/cpufreq/mperf.c
new file mode 100644
index 000000000000..911e193018ae
--- /dev/null
+++ b/arch/x86/kernel/cpu/cpufreq/mperf.c
@@ -0,0 +1,51 @@
+#include <linux/kernel.h>
+#include <linux/smp.h>
+#include <linux/module.h>
+#include <linux/init.h>
+#include <linux/cpufreq.h>
+#include <linux/slab.h>
+
+#include "mperf.h"
+
+static DEFINE_PER_CPU(struct aperfmperf, acfreq_old_perf);
+
+/* Called via smp_call_function_single(), on the target CPU */
+static void read_measured_perf_ctrs(void *_cur)
+{
+ struct aperfmperf *am = _cur;
+
+ get_aperfmperf(am);
+}
+
+/*
+ * Return the measured active (C0) frequency on this CPU since last call
+ * to this function.
+ * Input: cpu number
+ * Return: Average CPU frequency in terms of max frequency (zero on error)
+ *
+ * We use IA32_MPERF and IA32_APERF MSRs to get the measured performance
+ * over a period of time, while CPU is in C0 state.
+ * IA32_MPERF counts at the rate of max advertised frequency
+ * IA32_APERF counts at the rate of actual CPU frequency
+ * Only IA32_APERF/IA32_MPERF ratio is architecturally defined and
+ * no meaning should be associated with absolute values of these MSRs.
+ */
+unsigned int cpufreq_get_measured_perf(struct cpufreq_policy *policy,
+ unsigned int cpu)
+{
+ struct aperfmperf perf;
+ unsigned long ratio;
+ unsigned int retval;
+
+ if (smp_call_function_single(cpu, read_measured_perf_ctrs, &perf, 1))
+ return 0;
+
+ ratio = calc_aperfmperf_ratio(&per_cpu(acfreq_old_perf, cpu), &perf);
+ per_cpu(acfreq_old_perf, cpu) = perf;
+
+ retval = (policy->cpuinfo.max_freq * ratio) >> APERFMPERF_SHIFT;
+
+ return retval;
+}
+EXPORT_SYMBOL_GPL(cpufreq_get_measured_perf);
+MODULE_LICENSE("GPL");
diff --git a/arch/x86/kernel/cpu/cpufreq/mperf.h b/arch/x86/kernel/cpu/cpufreq/mperf.h
new file mode 100644
index 000000000000..5dbf2950dc22
--- /dev/null
+++ b/arch/x86/kernel/cpu/cpufreq/mperf.h
@@ -0,0 +1,9 @@
+/*
+ * (c) 2010 Advanced Micro Devices, Inc.
+ * Your use of this code is subject to the terms and conditions of the
+ * GNU general public license version 2. See "COPYING" or
+ * http://www.gnu.org/licenses/gpl.html
+ */
+
+unsigned int cpufreq_get_measured_perf(struct cpufreq_policy *policy,
+ unsigned int cpu);
diff --git a/arch/x86/kernel/cpu/cpufreq/powernow-k8.c b/arch/x86/kernel/cpu/cpufreq/powernow-k8.c
index d360b56e9825..6f3dc8fbbfdc 100644
--- a/arch/x86/kernel/cpu/cpufreq/powernow-k8.c
+++ b/arch/x86/kernel/cpu/cpufreq/powernow-k8.c
@@ -1,6 +1,5 @@
-
/*
- * (c) 2003-2006 Advanced Micro Devices, Inc.
+ * (c) 2003-2010 Advanced Micro Devices, Inc.
* Your use of this code is subject to the terms and conditions of the
* GNU general public license version 2. See "COPYING" or
* http://www.gnu.org/licenses/gpl.html
@@ -46,6 +45,7 @@
#define PFX "powernow-k8: "
#define VERSION "version 2.20.00"
#include "powernow-k8.h"
+#include "mperf.h"
/* serialize freq changes */
static DEFINE_MUTEX(fidvid_mutex);
@@ -54,6 +54,12 @@ static DEFINE_PER_CPU(struct powernow_k8_data *, powernow_data);
static int cpu_family = CPU_OPTERON;
+/* core performance boost */
+static bool cpb_capable, cpb_enabled;
+static struct msr __percpu *msrs;
+
+static struct cpufreq_driver cpufreq_amd64_driver;
+
#ifndef CONFIG_SMP
static inline const struct cpumask *cpu_core_mask(int cpu)
{
@@ -929,7 +935,8 @@ static int fill_powernow_table_pstate(struct powernow_k8_data *data,
powernow_table[i].index = index;
/* Frequency may be rounded for these */
- if (boot_cpu_data.x86 == 0x10 || boot_cpu_data.x86 == 0x11) {
+ if ((boot_cpu_data.x86 == 0x10 && boot_cpu_data.x86_model < 10)
+ || boot_cpu_data.x86 == 0x11) {
powernow_table[i].frequency =
freq_from_fid_did(lo & 0x3f, (lo >> 6) & 7);
} else
@@ -1248,6 +1255,7 @@ static int __cpuinit powernowk8_cpu_init(struct cpufreq_policy *pol)
struct powernow_k8_data *data;
struct init_on_cpu init_on_cpu;
int rc;
+ struct cpuinfo_x86 *c = &cpu_data(pol->cpu);
if (!cpu_online(pol->cpu))
return -ENODEV;
@@ -1322,6 +1330,10 @@ static int __cpuinit powernowk8_cpu_init(struct cpufreq_policy *pol)
return -EINVAL;
}
+ /* Check for APERF/MPERF support in hardware */
+ if (cpu_has(c, X86_FEATURE_APERFMPERF))
+ cpufreq_amd64_driver.getavg = cpufreq_get_measured_perf;
+
cpufreq_frequency_table_get_attr(data->powernow_table, pol->cpu);
if (cpu_family == CPU_HW_PSTATE)
@@ -1393,8 +1405,77 @@ out:
return khz;
}
+static void _cpb_toggle_msrs(bool t)
+{
+ int cpu;
+
+ get_online_cpus();
+
+ rdmsr_on_cpus(cpu_online_mask, MSR_K7_HWCR, msrs);
+
+ for_each_cpu(cpu, cpu_online_mask) {
+ struct msr *reg = per_cpu_ptr(msrs, cpu);
+ if (t)
+ reg->l &= ~BIT(25);
+ else
+ reg->l |= BIT(25);
+ }
+ wrmsr_on_cpus(cpu_online_mask, MSR_K7_HWCR, msrs);
+
+ put_online_cpus();
+}
+
+/*
+ * Switch on/off core performance boosting.
+ *
+ * 0=disable
+ * 1=enable.
+ */
+static void cpb_toggle(bool t)
+{
+ if (!cpb_capable)
+ return;
+
+ if (t && !cpb_enabled) {
+ cpb_enabled = true;
+ _cpb_toggle_msrs(t);
+ printk(KERN_INFO PFX "Core Boosting enabled.\n");
+ } else if (!t && cpb_enabled) {
+ cpb_enabled = false;
+ _cpb_toggle_msrs(t);
+ printk(KERN_INFO PFX "Core Boosting disabled.\n");
+ }
+}
+
+static ssize_t store_cpb(struct cpufreq_policy *policy, const char *buf,
+ size_t count)
+{
+ int ret = -EINVAL;
+ unsigned long val = 0;
+
+ ret = strict_strtoul(buf, 10, &val);
+ if (!ret && (val == 0 || val == 1) && cpb_capable)
+ cpb_toggle(val);
+ else
+ return -EINVAL;
+
+ return count;
+}
+
+static ssize_t show_cpb(struct cpufreq_policy *policy, char *buf)
+{
+ return sprintf(buf, "%u\n", cpb_enabled);
+}
+
+#define define_one_rw(_name) \
+static struct freq_attr _name = \
+__ATTR(_name, 0644, show_##_name, store_##_name)
+
+define_one_rw(cpb);
+
static struct freq_attr *powernow_k8_attr[] = {
&cpufreq_freq_attr_scaling_available_freqs,
+ &cpb,
NULL,
};
@@ -1410,10 +1491,51 @@ static struct cpufreq_driver cpufreq_amd64_driver = {
.attr = powernow_k8_attr,
};
+/*
+ * Clear the boost-disable flag on the CPU_DOWN path so that this cpu
+ * cannot block the remaining ones from boosting. On the CPU_UP path we
+ * simply keep the boost-disable flag in sync with the current global
+ * state.
+ */
+static int __cpuinit cpb_notify(struct notifier_block *nb, unsigned long action,
+ void *hcpu)
+{
+ unsigned cpu = (long)hcpu;
+ u32 lo, hi;
+
+ switch (action) {
+ case CPU_UP_PREPARE:
+ case CPU_UP_PREPARE_FROZEN:
+
+ if (!cpb_enabled) {
+ rdmsr_on_cpu(cpu, MSR_K7_HWCR, &lo, &hi);
+ lo |= BIT(25);
+ wrmsr_on_cpu(cpu, MSR_K7_HWCR, lo, hi);
+ }
+ break;
+
+ case CPU_DOWN_PREPARE:
+ case CPU_DOWN_PREPARE_FROZEN:
+ rdmsr_on_cpu(cpu, MSR_K7_HWCR, &lo, &hi);
+ lo &= ~BIT(25);
+ wrmsr_on_cpu(cpu, MSR_K7_HWCR, lo, hi);
+ break;
+
+ default:
+ break;
+ }
+
+ return NOTIFY_OK;
+}
+
+static struct notifier_block __cpuinitdata cpb_nb = {
+ .notifier_call = cpb_notify,
+};
+
/* driver entry point for init */
static int __cpuinit powernowk8_init(void)
{
- unsigned int i, supported_cpus = 0;
+ unsigned int i, supported_cpus = 0, cpu;
for_each_online_cpu(i) {
int rc;
@@ -1422,15 +1544,36 @@ static int __cpuinit powernowk8_init(void)
supported_cpus++;
}
- if (supported_cpus == num_online_cpus()) {
- printk(KERN_INFO PFX "Found %d %s "
- "processors (%d cpu cores) (" VERSION ")\n",
- num_online_nodes(),
- boot_cpu_data.x86_model_id, supported_cpus);
- return cpufreq_register_driver(&cpufreq_amd64_driver);
+ if (supported_cpus != num_online_cpus())
+ return -ENODEV;
+
+ printk(KERN_INFO PFX "Found %d %s (%d cpu cores) (" VERSION ")\n",
+ num_online_nodes(), boot_cpu_data.x86_model_id, supported_cpus);
+
+ if (boot_cpu_has(X86_FEATURE_CPB)) {
+
+ cpb_capable = true;
+
+ register_cpu_notifier(&cpb_nb);
+
+ msrs = msrs_alloc();
+ if (!msrs) {
+ printk(KERN_ERR "%s: Error allocating msrs!\n", __func__);
+ return -ENOMEM;
+ }
+
+ rdmsr_on_cpus(cpu_online_mask, MSR_K7_HWCR, msrs);
+
+ for_each_cpu(cpu, cpu_online_mask) {
+ struct msr *reg = per_cpu_ptr(msrs, cpu);
+ cpb_enabled |= !(!!(reg->l & BIT(25)));
+ }
+
+ printk(KERN_INFO PFX "Core Performance Boosting: %s.\n",
+ (cpb_enabled ? "on" : "off"));
}
- return -ENODEV;
+ return cpufreq_register_driver(&cpufreq_amd64_driver);
}
/* driver entry point for term */
@@ -1438,6 +1581,13 @@ static void __exit powernowk8_exit(void)
{
dprintk("exit\n");
+ if (boot_cpu_has(X86_FEATURE_CPB)) {
+ msrs_free(msrs);
+ msrs = NULL;
+
+ unregister_cpu_notifier(&cpb_nb);
+ }
+
cpufreq_unregister_driver(&cpufreq_amd64_driver);
}
diff --git a/arch/x86/kernel/cpu/cpufreq/powernow-k8.h b/arch/x86/kernel/cpu/cpufreq/powernow-k8.h
index 02ce824073cb..df3529b1c02d 100644
--- a/arch/x86/kernel/cpu/cpufreq/powernow-k8.h
+++ b/arch/x86/kernel/cpu/cpufreq/powernow-k8.h
@@ -5,7 +5,6 @@
* http://www.gnu.org/licenses/gpl.html
*/
-
enum pstate {
HW_PSTATE_INVALID = 0xff,
HW_PSTATE_0 = 0,
@@ -55,7 +54,6 @@ struct powernow_k8_data {
struct cpumask *available_cores;
};
-
/* processor's cpuid instruction support */
#define CPUID_PROCESSOR_SIGNATURE 1 /* function 1 */
#define CPUID_XFAM 0x0ff00000 /* extended family */
diff --git a/arch/x86/kernel/cpu/hypervisor.c b/arch/x86/kernel/cpu/hypervisor.c
index 08be922de33a..dd531cc56a8f 100644
--- a/arch/x86/kernel/cpu/hypervisor.c
+++ b/arch/x86/kernel/cpu/hypervisor.c
@@ -21,37 +21,55 @@
*
*/
+#include <linux/module.h>
#include <asm/processor.h>
-#include <asm/vmware.h>
#include <asm/hypervisor.h>
-static inline void __cpuinit
-detect_hypervisor_vendor(struct cpuinfo_x86 *c)
+/*
+ * Hypervisor detect order. This is specified explicitly here because
+ * some hypervisors might implement compatibility modes for other
+ * hypervisors and therefore need to be detected in specific sequence.
+ */
+static const __initconst struct hypervisor_x86 * const hypervisors[] =
{
- if (vmware_platform())
- c->x86_hyper_vendor = X86_HYPER_VENDOR_VMWARE;
- else
- c->x86_hyper_vendor = X86_HYPER_VENDOR_NONE;
-}
+ &x86_hyper_vmware,
+ &x86_hyper_ms_hyperv,
+};
-static inline void __cpuinit
-hypervisor_set_feature_bits(struct cpuinfo_x86 *c)
+const struct hypervisor_x86 *x86_hyper;
+EXPORT_SYMBOL(x86_hyper);
+
+static inline void __init
+detect_hypervisor_vendor(void)
{
- if (boot_cpu_data.x86_hyper_vendor == X86_HYPER_VENDOR_VMWARE) {
- vmware_set_feature_bits(c);
- return;
+ const struct hypervisor_x86 *h, * const *p;
+
+ for (p = hypervisors; p < hypervisors + ARRAY_SIZE(hypervisors); p++) {
+ h = *p;
+ if (h->detect()) {
+ x86_hyper = h;
+ printk(KERN_INFO "Hypervisor detected: %s\n", h->name);
+ break;
+ }
}
}
void __cpuinit init_hypervisor(struct cpuinfo_x86 *c)
{
- detect_hypervisor_vendor(c);
- hypervisor_set_feature_bits(c);
+ if (x86_hyper && x86_hyper->set_cpu_features)
+ x86_hyper->set_cpu_features(c);
}
void __init init_hypervisor_platform(void)
{
+
+ detect_hypervisor_vendor();
+
+ if (!x86_hyper)
+ return;
+
init_hypervisor(&boot_cpu_data);
- if (boot_cpu_data.x86_hyper_vendor == X86_HYPER_VENDOR_VMWARE)
- vmware_platform_setup();
+
+ if (x86_hyper->init_platform)
+ x86_hyper->init_platform();
}
diff --git a/arch/x86/kernel/cpu/intel.c b/arch/x86/kernel/cpu/intel.c
index 1366c7cfd483..85f69cdeae10 100644
--- a/arch/x86/kernel/cpu/intel.c
+++ b/arch/x86/kernel/cpu/intel.c
@@ -12,7 +12,6 @@
#include <asm/processor.h>
#include <asm/pgtable.h>
#include <asm/msr.h>
-#include <asm/ds.h>
#include <asm/bugs.h>
#include <asm/cpu.h>
@@ -373,12 +372,6 @@ static void __cpuinit init_intel(struct cpuinfo_x86 *c)
set_cpu_cap(c, X86_FEATURE_ARCH_PERFMON);
}
- if (c->cpuid_level > 6) {
- unsigned ecx = cpuid_ecx(6);
- if (ecx & 0x01)
- set_cpu_cap(c, X86_FEATURE_APERFMPERF);
- }
-
if (cpu_has_xmm2)
set_cpu_cap(c, X86_FEATURE_LFENCE_RDTSC);
if (cpu_has_ds) {
@@ -388,7 +381,6 @@ static void __cpuinit init_intel(struct cpuinfo_x86 *c)
set_cpu_cap(c, X86_FEATURE_BTS);
if (!(l1 & (1<<12)))
set_cpu_cap(c, X86_FEATURE_PEBS);
- ds_init_intel(c);
}
if (c->x86 == 6 && c->x86_model == 29 && cpu_has_clflush)
diff --git a/arch/x86/kernel/cpu/intel_cacheinfo.c b/arch/x86/kernel/cpu/intel_cacheinfo.c
index b3eeb66c0a51..33eae2062cf5 100644
--- a/arch/x86/kernel/cpu/intel_cacheinfo.c
+++ b/arch/x86/kernel/cpu/intel_cacheinfo.c
@@ -148,13 +148,19 @@ union _cpuid4_leaf_ecx {
u32 full;
};
+struct amd_l3_cache {
+ struct pci_dev *dev;
+ bool can_disable;
+ unsigned indices;
+ u8 subcaches[4];
+};
+
struct _cpuid4_info {
union _cpuid4_leaf_eax eax;
union _cpuid4_leaf_ebx ebx;
union _cpuid4_leaf_ecx ecx;
unsigned long size;
- bool can_disable;
- unsigned int l3_indices;
+ struct amd_l3_cache *l3;
DECLARE_BITMAP(shared_cpu_map, NR_CPUS);
};
@@ -164,8 +170,7 @@ struct _cpuid4_info_regs {
union _cpuid4_leaf_ebx ebx;
union _cpuid4_leaf_ecx ecx;
unsigned long size;
- bool can_disable;
- unsigned int l3_indices;
+ struct amd_l3_cache *l3;
};
unsigned short num_cache_leaves;
@@ -302,87 +307,163 @@ struct _cache_attr {
};
#ifdef CONFIG_CPU_SUP_AMD
-static unsigned int __cpuinit amd_calc_l3_indices(void)
+
+/*
+ * L3 cache descriptors
+ */
+static struct amd_l3_cache **__cpuinitdata l3_caches;
+
+static void __cpuinit amd_calc_l3_indices(struct amd_l3_cache *l3)
{
- /*
- * We're called over smp_call_function_single() and therefore
- * are on the correct cpu.
- */
- int cpu = smp_processor_id();
- int node = cpu_to_node(cpu);
- struct pci_dev *dev = node_to_k8_nb_misc(node);
unsigned int sc0, sc1, sc2, sc3;
u32 val = 0;
- pci_read_config_dword(dev, 0x1C4, &val);
+ pci_read_config_dword(l3->dev, 0x1C4, &val);
/* calculate subcache sizes */
- sc0 = !(val & BIT(0));
- sc1 = !(val & BIT(4));
- sc2 = !(val & BIT(8)) + !(val & BIT(9));
- sc3 = !(val & BIT(12)) + !(val & BIT(13));
+ l3->subcaches[0] = sc0 = !(val & BIT(0));
+ l3->subcaches[1] = sc1 = !(val & BIT(4));
+ l3->subcaches[2] = sc2 = !(val & BIT(8)) + !(val & BIT(9));
+ l3->subcaches[3] = sc3 = !(val & BIT(12)) + !(val & BIT(13));
- return (max(max(max(sc0, sc1), sc2), sc3) << 10) - 1;
+ l3->indices = (max(max(max(sc0, sc1), sc2), sc3) << 10) - 1;
+}
+
+static struct amd_l3_cache * __cpuinit amd_init_l3_cache(int node)
+{
+ struct amd_l3_cache *l3;
+ struct pci_dev *dev = node_to_k8_nb_misc(node);
+
+ l3 = kzalloc(sizeof(struct amd_l3_cache), GFP_ATOMIC);
+ if (!l3) {
+ printk(KERN_WARNING "Error allocating L3 struct\n");
+ return NULL;
+ }
+
+ l3->dev = dev;
+
+ amd_calc_l3_indices(l3);
+
+ return l3;
}
static void __cpuinit
amd_check_l3_disable(int index, struct _cpuid4_info_regs *this_leaf)
{
- if (index < 3)
+ int node;
+
+ if (boot_cpu_data.x86 != 0x10)
return;
- if (boot_cpu_data.x86 == 0x11)
+ if (index < 3)
return;
/* see errata #382 and #388 */
- if ((boot_cpu_data.x86 == 0x10) &&
- ((boot_cpu_data.x86_model < 0x8) ||
- (boot_cpu_data.x86_mask < 0x1)))
+ if (boot_cpu_data.x86_model < 0x8)
+ return;
+
+ if ((boot_cpu_data.x86_model == 0x8 ||
+ boot_cpu_data.x86_model == 0x9)
+ &&
+ boot_cpu_data.x86_mask < 0x1)
+ return;
+
+ /* not in virtualized environments */
+ if (num_k8_northbridges == 0)
return;
- this_leaf->can_disable = true;
- this_leaf->l3_indices = amd_calc_l3_indices();
+ /*
+ * Strictly speaking, the amount in @size below is leaked since it is
+ * never freed but this is done only on shutdown so it doesn't matter.
+ */
+ if (!l3_caches) {
+ int size = num_k8_northbridges * sizeof(struct amd_l3_cache *);
+
+ l3_caches = kzalloc(size, GFP_ATOMIC);
+ if (!l3_caches)
+ return;
+ }
+
+ node = amd_get_nb_id(smp_processor_id());
+
+ if (!l3_caches[node]) {
+ l3_caches[node] = amd_init_l3_cache(node);
+ l3_caches[node]->can_disable = true;
+ }
+
+ WARN_ON(!l3_caches[node]);
+
+ this_leaf->l3 = l3_caches[node];
}
static ssize_t show_cache_disable(struct _cpuid4_info *this_leaf, char *buf,
- unsigned int index)
+ unsigned int slot)
{
- int cpu = cpumask_first(to_cpumask(this_leaf->shared_cpu_map));
- int node = amd_get_nb_id(cpu);
- struct pci_dev *dev = node_to_k8_nb_misc(node);
+ struct pci_dev *dev = this_leaf->l3->dev;
unsigned int reg = 0;
- if (!this_leaf->can_disable)
+ if (!this_leaf->l3 || !this_leaf->l3->can_disable)
return -EINVAL;
if (!dev)
return -EINVAL;
- pci_read_config_dword(dev, 0x1BC + index * 4, &reg);
+ pci_read_config_dword(dev, 0x1BC + slot * 4, &reg);
return sprintf(buf, "0x%08x\n", reg);
}
-#define SHOW_CACHE_DISABLE(index) \
+#define SHOW_CACHE_DISABLE(slot) \
static ssize_t \
-show_cache_disable_##index(struct _cpuid4_info *this_leaf, char *buf) \
+show_cache_disable_##slot(struct _cpuid4_info *this_leaf, char *buf) \
{ \
- return show_cache_disable(this_leaf, buf, index); \
+ return show_cache_disable(this_leaf, buf, slot); \
}
SHOW_CACHE_DISABLE(0)
SHOW_CACHE_DISABLE(1)
+static void amd_l3_disable_index(struct amd_l3_cache *l3, int cpu,
+ unsigned slot, unsigned long idx)
+{
+ int i;
+
+ idx |= BIT(30);
+
+ /*
+ * disable index in all 4 subcaches
+ */
+ for (i = 0; i < 4; i++) {
+ u32 reg = idx | (i << 20);
+
+ if (!l3->subcaches[i])
+ continue;
+
+ pci_write_config_dword(l3->dev, 0x1BC + slot * 4, reg);
+
+ /*
+ * We need to WBINVD on a core on the node containing the L3
+ * cache which indices we disable therefore a simple wbinvd()
+ * is not sufficient.
+ */
+ wbinvd_on_cpu(cpu);
+
+ reg |= BIT(31);
+ pci_write_config_dword(l3->dev, 0x1BC + slot * 4, reg);
+ }
+}
+
+
static ssize_t store_cache_disable(struct _cpuid4_info *this_leaf,
- const char *buf, size_t count, unsigned int index)
+ const char *buf, size_t count,
+ unsigned int slot)
{
+ struct pci_dev *dev = this_leaf->l3->dev;
int cpu = cpumask_first(to_cpumask(this_leaf->shared_cpu_map));
- int node = amd_get_nb_id(cpu);
- struct pci_dev *dev = node_to_k8_nb_misc(node);
unsigned long val = 0;
#define SUBCACHE_MASK (3UL << 20)
#define SUBCACHE_INDEX 0xfff
- if (!this_leaf->can_disable)
+ if (!this_leaf->l3 || !this_leaf->l3->can_disable)
return -EINVAL;
if (!capable(CAP_SYS_ADMIN))
@@ -396,26 +477,20 @@ static ssize_t store_cache_disable(struct _cpuid4_info *this_leaf,
/* do not allow writes outside of allowed bits */
if ((val & ~(SUBCACHE_MASK | SUBCACHE_INDEX)) ||
- ((val & SUBCACHE_INDEX) > this_leaf->l3_indices))
+ ((val & SUBCACHE_INDEX) > this_leaf->l3->indices))
return -EINVAL;
- val |= BIT(30);
- pci_write_config_dword(dev, 0x1BC + index * 4, val);
- /*
- * We need to WBINVD on a core on the node containing the L3 cache which
- * indices we disable therefore a simple wbinvd() is not sufficient.
- */
- wbinvd_on_cpu(cpu);
- pci_write_config_dword(dev, 0x1BC + index * 4, val | BIT(31));
+ amd_l3_disable_index(this_leaf->l3, cpu, slot, val);
+
return count;
}
-#define STORE_CACHE_DISABLE(index) \
+#define STORE_CACHE_DISABLE(slot) \
static ssize_t \
-store_cache_disable_##index(struct _cpuid4_info *this_leaf, \
+store_cache_disable_##slot(struct _cpuid4_info *this_leaf, \
const char *buf, size_t count) \
{ \
- return store_cache_disable(this_leaf, buf, count, index); \
+ return store_cache_disable(this_leaf, buf, count, slot); \
}
STORE_CACHE_DISABLE(0)
STORE_CACHE_DISABLE(1)
@@ -443,8 +518,7 @@ __cpuinit cpuid4_cache_lookup_regs(int index,
if (boot_cpu_data.x86_vendor == X86_VENDOR_AMD) {
amd_cpuid4(index, &eax, &ebx, &ecx);
- if (boot_cpu_data.x86 >= 0x10)
- amd_check_l3_disable(index, this_leaf);
+ amd_check_l3_disable(index, this_leaf);
} else {
cpuid_count(4, index, &eax.full, &ebx.full, &ecx.full, &edx);
}
@@ -701,6 +775,7 @@ static void __cpuinit free_cache_attributes(unsigned int cpu)
for (i = 0; i < num_cache_leaves; i++)
cache_remove_shared_cpu_map(cpu, i);
+ kfree(per_cpu(ici_cpuid4_info, cpu)->l3);
kfree(per_cpu(ici_cpuid4_info, cpu));
per_cpu(ici_cpuid4_info, cpu) = NULL;
}
@@ -985,7 +1060,7 @@ static int __cpuinit cache_add_dev(struct sys_device * sys_dev)
this_leaf = CPUID4_INFO_IDX(cpu, i);
- if (this_leaf->can_disable)
+ if (this_leaf->l3 && this_leaf->l3->can_disable)
ktype_cache.default_attrs = default_l3_attrs;
else
ktype_cache.default_attrs = default_attrs;
diff --git a/arch/x86/kernel/cpu/mcheck/mce.c b/arch/x86/kernel/cpu/mcheck/mce.c
index 8a6f0afa767e..7a355ddcc64b 100644
--- a/arch/x86/kernel/cpu/mcheck/mce.c
+++ b/arch/x86/kernel/cpu/mcheck/mce.c
@@ -539,7 +539,7 @@ void machine_check_poll(enum mcp_flags flags, mce_banks_t *b)
struct mce m;
int i;
- __get_cpu_var(mce_poll_count)++;
+ percpu_inc(mce_poll_count);
mce_setup(&m);
@@ -934,7 +934,7 @@ void do_machine_check(struct pt_regs *regs, long error_code)
atomic_inc(&mce_entry);
- __get_cpu_var(mce_exception_count)++;
+ percpu_inc(mce_exception_count);
if (notify_die(DIE_NMI, "machine check", regs, error_code,
18, SIGKILL) == NOTIFY_STOP)
diff --git a/arch/x86/kernel/cpu/mshyperv.c b/arch/x86/kernel/cpu/mshyperv.c
new file mode 100644
index 000000000000..16f41bbe46b6
--- /dev/null
+++ b/arch/x86/kernel/cpu/mshyperv.c
@@ -0,0 +1,55 @@
+/*
+ * HyperV Detection code.
+ *
+ * Copyright (C) 2010, Novell, Inc.
+ * Author : K. Y. Srinivasan <ksrinivasan@novell.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; version 2 of the License.
+ *
+ */
+
+#include <linux/types.h>
+#include <linux/module.h>
+#include <asm/processor.h>
+#include <asm/hypervisor.h>
+#include <asm/hyperv.h>
+#include <asm/mshyperv.h>
+
+struct ms_hyperv_info ms_hyperv;
+
+static bool __init ms_hyperv_platform(void)
+{
+ u32 eax;
+ u32 hyp_signature[3];
+
+ if (!boot_cpu_has(X86_FEATURE_HYPERVISOR))
+ return false;
+
+ cpuid(HYPERV_CPUID_VENDOR_AND_MAX_FUNCTIONS,
+ &eax, &hyp_signature[0], &hyp_signature[1], &hyp_signature[2]);
+
+ return eax >= HYPERV_CPUID_MIN &&
+ eax <= HYPERV_CPUID_MAX &&
+ !memcmp("Microsoft Hv", hyp_signature, 12);
+}
+
+static void __init ms_hyperv_init_platform(void)
+{
+ /*
+ * Extract the features and hints
+ */
+ ms_hyperv.features = cpuid_eax(HYPERV_CPUID_FEATURES);
+ ms_hyperv.hints = cpuid_eax(HYPERV_CPUID_ENLIGHTMENT_INFO);
+
+ printk(KERN_INFO "HyperV: features 0x%x, hints 0x%x\n",
+ ms_hyperv.features, ms_hyperv.hints);
+}
+
+const __refconst struct hypervisor_x86 x86_hyper_ms_hyperv = {
+ .name = "Microsoft HyperV",
+ .detect = ms_hyperv_platform,
+ .init_platform = ms_hyperv_init_platform,
+};
+EXPORT_SYMBOL(x86_hyper_ms_hyperv);
diff --git a/arch/x86/kernel/cpu/perf_event.c b/arch/x86/kernel/cpu/perf_event.c
index db5bdc8addf8..fd4db0db3708 100644
--- a/arch/x86/kernel/cpu/perf_event.c
+++ b/arch/x86/kernel/cpu/perf_event.c
@@ -31,46 +31,51 @@
#include <asm/nmi.h>
#include <asm/compat.h>
-static u64 perf_event_mask __read_mostly;
+#if 0
+#undef wrmsrl
+#define wrmsrl(msr, val) \
+do { \
+ trace_printk("wrmsrl(%lx, %lx)\n", (unsigned long)(msr),\
+ (unsigned long)(val)); \
+ native_write_msr((msr), (u32)((u64)(val)), \
+ (u32)((u64)(val) >> 32)); \
+} while (0)
+#endif
-/* The maximal number of PEBS events: */
-#define MAX_PEBS_EVENTS 4
+/*
+ * best effort, GUP based copy_from_user() that assumes IRQ or NMI context
+ */
+static unsigned long
+copy_from_user_nmi(void *to, const void __user *from, unsigned long n)
+{
+ unsigned long offset, addr = (unsigned long)from;
+ int type = in_nmi() ? KM_NMI : KM_IRQ0;
+ unsigned long size, len = 0;
+ struct page *page;
+ void *map;
+ int ret;
-/* The size of a BTS record in bytes: */
-#define BTS_RECORD_SIZE 24
+ do {
+ ret = __get_user_pages_fast(addr, 1, 0, &page);
+ if (!ret)
+ break;
-/* The size of a per-cpu BTS buffer in bytes: */
-#define BTS_BUFFER_SIZE (BTS_RECORD_SIZE * 2048)
+ offset = addr & (PAGE_SIZE - 1);
+ size = min(PAGE_SIZE - offset, n - len);
-/* The BTS overflow threshold in bytes from the end of the buffer: */
-#define BTS_OVFL_TH (BTS_RECORD_SIZE * 128)
+ map = kmap_atomic(page, type);
+ memcpy(to, map+offset, size);
+ kunmap_atomic(map, type);
+ put_page(page);
+ len += size;
+ to += size;
+ addr += size;
-/*
- * Bits in the debugctlmsr controlling branch tracing.
- */
-#define X86_DEBUGCTL_TR (1 << 6)
-#define X86_DEBUGCTL_BTS (1 << 7)
-#define X86_DEBUGCTL_BTINT (1 << 8)
-#define X86_DEBUGCTL_BTS_OFF_OS (1 << 9)
-#define X86_DEBUGCTL_BTS_OFF_USR (1 << 10)
+ } while (len < n);
-/*
- * A debug store configuration.
- *
- * We only support architectures that use 64bit fields.
- */
-struct debug_store {
- u64 bts_buffer_base;
- u64 bts_index;
- u64 bts_absolute_maximum;
- u64 bts_interrupt_threshold;
- u64 pebs_buffer_base;
- u64 pebs_index;
- u64 pebs_absolute_maximum;
- u64 pebs_interrupt_threshold;
- u64 pebs_event_reset[MAX_PEBS_EVENTS];
-};
+ return len;
+}
struct event_constraint {
union {
@@ -89,18 +94,41 @@ struct amd_nb {
struct event_constraint event_constraints[X86_PMC_IDX_MAX];
};
+#define MAX_LBR_ENTRIES 16
+
struct cpu_hw_events {
+ /*
+ * Generic x86 PMC bits
+ */
struct perf_event *events[X86_PMC_IDX_MAX]; /* in counter order */
unsigned long active_mask[BITS_TO_LONGS(X86_PMC_IDX_MAX)];
- unsigned long interrupts;
int enabled;
- struct debug_store *ds;
int n_events;
int n_added;
int assign[X86_PMC_IDX_MAX]; /* event to counter assignment */
u64 tags[X86_PMC_IDX_MAX];
struct perf_event *event_list[X86_PMC_IDX_MAX]; /* in enabled order */
+
+ unsigned int group_flag;
+
+ /*
+ * Intel DebugStore bits
+ */
+ struct debug_store *ds;
+ u64 pebs_enabled;
+
+ /*
+ * Intel LBR bits
+ */
+ int lbr_users;
+ void *lbr_context;
+ struct perf_branch_stack lbr_stack;
+ struct perf_branch_entry lbr_entries[MAX_LBR_ENTRIES];
+
+ /*
+ * AMD specific bits
+ */
struct amd_nb *amd_nb;
};
@@ -114,44 +142,75 @@ struct cpu_hw_events {
#define EVENT_CONSTRAINT(c, n, m) \
__EVENT_CONSTRAINT(c, n, m, HWEIGHT(n))
+/*
+ * Constraint on the Event code.
+ */
#define INTEL_EVENT_CONSTRAINT(c, n) \
- EVENT_CONSTRAINT(c, n, INTEL_ARCH_EVTSEL_MASK)
+ EVENT_CONSTRAINT(c, n, ARCH_PERFMON_EVENTSEL_EVENT)
+/*
+ * Constraint on the Event code + UMask + fixed-mask
+ *
+ * filter mask to validate fixed counter events.
+ * the following filters disqualify for fixed counters:
+ * - inv
+ * - edge
+ * - cnt-mask
+ * The other filters are supported by fixed counters.
+ * The any-thread option is supported starting with v3.
+ */
#define FIXED_EVENT_CONSTRAINT(c, n) \
- EVENT_CONSTRAINT(c, (1ULL << (32+n)), INTEL_ARCH_FIXED_MASK)
+ EVENT_CONSTRAINT(c, (1ULL << (32+n)), X86_RAW_EVENT_MASK)
+
+/*
+ * Constraint on the Event code + UMask
+ */
+#define PEBS_EVENT_CONSTRAINT(c, n) \
+ EVENT_CONSTRAINT(c, n, INTEL_ARCH_EVENT_MASK)
#define EVENT_CONSTRAINT_END \
EVENT_CONSTRAINT(0, 0, 0)
#define for_each_event_constraint(e, c) \
- for ((e) = (c); (e)->cmask; (e)++)
+ for ((e) = (c); (e)->weight; (e)++)
+
+union perf_capabilities {
+ struct {
+ u64 lbr_format : 6;
+ u64 pebs_trap : 1;
+ u64 pebs_arch_reg : 1;
+ u64 pebs_format : 4;
+ u64 smm_freeze : 1;
+ };
+ u64 capabilities;
+};
/*
* struct x86_pmu - generic x86 pmu
*/
struct x86_pmu {
+ /*
+ * Generic x86 PMC bits
+ */
const char *name;
int version;
int (*handle_irq)(struct pt_regs *);
void (*disable_all)(void);
- void (*enable_all)(void);
+ void (*enable_all)(int added);
void (*enable)(struct perf_event *);
void (*disable)(struct perf_event *);
+ int (*hw_config)(struct perf_event *event);
+ int (*schedule_events)(struct cpu_hw_events *cpuc, int n, int *assign);
unsigned eventsel;
unsigned perfctr;
u64 (*event_map)(int);
- u64 (*raw_event)(u64);
int max_events;
- int num_events;
- int num_events_fixed;
- int event_bits;
- u64 event_mask;
+ int num_counters;
+ int num_counters_fixed;
+ int cntval_bits;
+ u64 cntval_mask;
int apic;
u64 max_period;
- u64 intel_ctrl;
- void (*enable_bts)(u64 config);
- void (*disable_bts)(void);
-
struct event_constraint *
(*get_event_constraints)(struct cpu_hw_events *cpuc,
struct perf_event *event);
@@ -159,11 +218,32 @@ struct x86_pmu {
void (*put_event_constraints)(struct cpu_hw_events *cpuc,
struct perf_event *event);
struct event_constraint *event_constraints;
+ void (*quirks)(void);
int (*cpu_prepare)(int cpu);
void (*cpu_starting)(int cpu);
void (*cpu_dying)(int cpu);
void (*cpu_dead)(int cpu);
+
+ /*
+ * Intel Arch Perfmon v2+
+ */
+ u64 intel_ctrl;
+ union perf_capabilities intel_cap;
+
+ /*
+ * Intel DebugStore bits
+ */
+ int bts, pebs;
+ int pebs_record_size;
+ void (*drain_pebs)(struct pt_regs *regs);
+ struct event_constraint *pebs_constraints;
+
+ /*
+ * Intel LBR
+ */
+ unsigned long lbr_tos, lbr_from, lbr_to; /* MSR base regs */
+ int lbr_nr; /* hardware stack size */
};
static struct x86_pmu x86_pmu __read_mostly;
@@ -198,7 +278,7 @@ static u64
x86_perf_event_update(struct perf_event *event)
{
struct hw_perf_event *hwc = &event->hw;
- int shift = 64 - x86_pmu.event_bits;
+ int shift = 64 - x86_pmu.cntval_bits;
u64 prev_raw_count, new_raw_count;
int idx = hwc->idx;
s64 delta;
@@ -241,33 +321,32 @@ again:
static atomic_t active_events;
static DEFINE_MUTEX(pmc_reserve_mutex);
+#ifdef CONFIG_X86_LOCAL_APIC
+
static bool reserve_pmc_hardware(void)
{
-#ifdef CONFIG_X86_LOCAL_APIC
int i;
if (nmi_watchdog == NMI_LOCAL_APIC)
disable_lapic_nmi_watchdog();
- for (i = 0; i < x86_pmu.num_events; i++) {
+ for (i = 0; i < x86_pmu.num_counters; i++) {
if (!reserve_perfctr_nmi(x86_pmu.perfctr + i))
goto perfctr_fail;
}
- for (i = 0; i < x86_pmu.num_events; i++) {
+ for (i = 0; i < x86_pmu.num_counters; i++) {
if (!reserve_evntsel_nmi(x86_pmu.eventsel + i))
goto eventsel_fail;
}
-#endif
return true;
-#ifdef CONFIG_X86_LOCAL_APIC
eventsel_fail:
for (i--; i >= 0; i--)
release_evntsel_nmi(x86_pmu.eventsel + i);
- i = x86_pmu.num_events;
+ i = x86_pmu.num_counters;
perfctr_fail:
for (i--; i >= 0; i--)
@@ -277,128 +356,36 @@ perfctr_fail:
enable_lapic_nmi_watchdog();
return false;
-#endif
}
static void release_pmc_hardware(void)
{
-#ifdef CONFIG_X86_LOCAL_APIC
int i;
- for (i = 0; i < x86_pmu.num_events; i++) {
+ for (i = 0; i < x86_pmu.num_counters; i++) {
release_perfctr_nmi(x86_pmu.perfctr + i);
release_evntsel_nmi(x86_pmu.eventsel + i);
}
if (nmi_watchdog == NMI_LOCAL_APIC)
enable_lapic_nmi_watchdog();
-#endif
-}
-
-static inline bool bts_available(void)
-{
- return x86_pmu.enable_bts != NULL;
}
-static void init_debug_store_on_cpu(int cpu)
-{
- struct debug_store *ds = per_cpu(cpu_hw_events, cpu).ds;
-
- if (!ds)
- return;
-
- wrmsr_on_cpu(cpu, MSR_IA32_DS_AREA,
- (u32)((u64)(unsigned long)ds),
- (u32)((u64)(unsigned long)ds >> 32));
-}
-
-static void fini_debug_store_on_cpu(int cpu)
-{
- if (!per_cpu(cpu_hw_events, cpu).ds)
- return;
-
- wrmsr_on_cpu(cpu, MSR_IA32_DS_AREA, 0, 0);
-}
-
-static void release_bts_hardware(void)
-{
- int cpu;
-
- if (!bts_available())
- return;
-
- get_online_cpus();
-
- for_each_online_cpu(cpu)
- fini_debug_store_on_cpu(cpu);
-
- for_each_possible_cpu(cpu) {
- struct debug_store *ds = per_cpu(cpu_hw_events, cpu).ds;
-
- if (!ds)
- continue;
-
- per_cpu(cpu_hw_events, cpu).ds = NULL;
-
- kfree((void *)(unsigned long)ds->bts_buffer_base);
- kfree(ds);
- }
-
- put_online_cpus();
-}
-
-static int reserve_bts_hardware(void)
-{
- int cpu, err = 0;
-
- if (!bts_available())
- return 0;
-
- get_online_cpus();
-
- for_each_possible_cpu(cpu) {
- struct debug_store *ds;
- void *buffer;
-
- err = -ENOMEM;
- buffer = kzalloc(BTS_BUFFER_SIZE, GFP_KERNEL);
- if (unlikely(!buffer))
- break;
-
- ds = kzalloc(sizeof(*ds), GFP_KERNEL);
- if (unlikely(!ds)) {
- kfree(buffer);
- break;
- }
-
- ds->bts_buffer_base = (u64)(unsigned long)buffer;
- ds->bts_index = ds->bts_buffer_base;
- ds->bts_absolute_maximum =
- ds->bts_buffer_base + BTS_BUFFER_SIZE;
- ds->bts_interrupt_threshold =
- ds->bts_absolute_maximum - BTS_OVFL_TH;
-
- per_cpu(cpu_hw_events, cpu).ds = ds;
- err = 0;
- }
+#else
- if (err)
- release_bts_hardware();
- else {
- for_each_online_cpu(cpu)
- init_debug_store_on_cpu(cpu);
- }
+static bool reserve_pmc_hardware(void) { return true; }
+static void release_pmc_hardware(void) {}
- put_online_cpus();
+#endif
- return err;
-}
+static int reserve_ds_buffers(void);
+static void release_ds_buffers(void);
static void hw_perf_event_destroy(struct perf_event *event)
{
if (atomic_dec_and_mutex_lock(&active_events, &pmc_reserve_mutex)) {
release_pmc_hardware();
- release_bts_hardware();
+ release_ds_buffers();
mutex_unlock(&pmc_reserve_mutex);
}
}
@@ -441,54 +428,11 @@ set_ext_hw_attr(struct hw_perf_event *hwc, struct perf_event_attr *attr)
return 0;
}
-/*
- * Setup the hardware configuration for a given attr_type
- */
-static int __hw_perf_event_init(struct perf_event *event)
+static int x86_setup_perfctr(struct perf_event *event)
{
struct perf_event_attr *attr = &event->attr;
struct hw_perf_event *hwc = &event->hw;
u64 config;
- int err;
-
- if (!x86_pmu_initialized())
- return -ENODEV;
-
- err = 0;
- if (!atomic_inc_not_zero(&active_events)) {
- mutex_lock(&pmc_reserve_mutex);
- if (atomic_read(&active_events) == 0) {
- if (!reserve_pmc_hardware())
- err = -EBUSY;
- else
- err = reserve_bts_hardware();
- }
- if (!err)
- atomic_inc(&active_events);
- mutex_unlock(&pmc_reserve_mutex);
- }
- if (err)
- return err;
-
- event->destroy = hw_perf_event_destroy;
-
- /*
- * Generate PMC IRQs:
- * (keep 'enabled' bit clear for now)
- */
- hwc->config = ARCH_PERFMON_EVENTSEL_INT;
-
- hwc->idx = -1;
- hwc->last_cpu = -1;
- hwc->last_tag = ~0ULL;
-
- /*
- * Count user and OS events unless requested not to.
- */
- if (!attr->exclude_user)
- hwc->config |= ARCH_PERFMON_EVENTSEL_USR;
- if (!attr->exclude_kernel)
- hwc->config |= ARCH_PERFMON_EVENTSEL_OS;
if (!hwc->sample_period) {
hwc->sample_period = x86_pmu.max_period;
@@ -505,16 +449,8 @@ static int __hw_perf_event_init(struct perf_event *event)
return -EOPNOTSUPP;
}
- /*
- * Raw hw_event type provide the config in the hw_event structure
- */
- if (attr->type == PERF_TYPE_RAW) {
- hwc->config |= x86_pmu.raw_event(attr->config);
- if ((hwc->config & ARCH_PERFMON_EVENTSEL_ANY) &&
- perf_paranoid_cpu() && !capable(CAP_SYS_ADMIN))
- return -EACCES;
+ if (attr->type == PERF_TYPE_RAW)
return 0;
- }
if (attr->type == PERF_TYPE_HW_CACHE)
return set_ext_hw_attr(hwc, attr);
@@ -539,11 +475,11 @@ static int __hw_perf_event_init(struct perf_event *event)
if ((attr->config == PERF_COUNT_HW_BRANCH_INSTRUCTIONS) &&
(hwc->sample_period == 1)) {
/* BTS is not supported by this architecture. */
- if (!bts_available())
+ if (!x86_pmu.bts)
return -EOPNOTSUPP;
/* BTS is currently only allowed for user-mode. */
- if (hwc->config & ARCH_PERFMON_EVENTSEL_OS)
+ if (!attr->exclude_kernel)
return -EOPNOTSUPP;
}
@@ -552,12 +488,87 @@ static int __hw_perf_event_init(struct perf_event *event)
return 0;
}
+static int x86_pmu_hw_config(struct perf_event *event)
+{
+ if (event->attr.precise_ip) {
+ int precise = 0;
+
+ /* Support for constant skid */
+ if (x86_pmu.pebs)
+ precise++;
+
+ /* Support for IP fixup */
+ if (x86_pmu.lbr_nr)
+ precise++;
+
+ if (event->attr.precise_ip > precise)
+ return -EOPNOTSUPP;
+ }
+
+ /*
+ * Generate PMC IRQs:
+ * (keep 'enabled' bit clear for now)
+ */
+ event->hw.config = ARCH_PERFMON_EVENTSEL_INT;
+
+ /*
+ * Count user and OS events unless requested not to
+ */
+ if (!event->attr.exclude_user)
+ event->hw.config |= ARCH_PERFMON_EVENTSEL_USR;
+ if (!event->attr.exclude_kernel)
+ event->hw.config |= ARCH_PERFMON_EVENTSEL_OS;
+
+ if (event->attr.type == PERF_TYPE_RAW)
+ event->hw.config |= event->attr.config & X86_RAW_EVENT_MASK;
+
+ return x86_setup_perfctr(event);
+}
+
+/*
+ * Setup the hardware configuration for a given attr_type
+ */
+static int __hw_perf_event_init(struct perf_event *event)
+{
+ int err;
+
+ if (!x86_pmu_initialized())
+ return -ENODEV;
+
+ err = 0;
+ if (!atomic_inc_not_zero(&active_events)) {
+ mutex_lock(&pmc_reserve_mutex);
+ if (atomic_read(&active_events) == 0) {
+ if (!reserve_pmc_hardware())
+ err = -EBUSY;
+ else {
+ err = reserve_ds_buffers();
+ if (err)
+ release_pmc_hardware();
+ }
+ }
+ if (!err)
+ atomic_inc(&active_events);
+ mutex_unlock(&pmc_reserve_mutex);
+ }
+ if (err)
+ return err;
+
+ event->destroy = hw_perf_event_destroy;
+
+ event->hw.idx = -1;
+ event->hw.last_cpu = -1;
+ event->hw.last_tag = ~0ULL;
+
+ return x86_pmu.hw_config(event);
+}
+
static void x86_pmu_disable_all(void)
{
struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
int idx;
- for (idx = 0; idx < x86_pmu.num_events; idx++) {
+ for (idx = 0; idx < x86_pmu.num_counters; idx++) {
u64 val;
if (!test_bit(idx, cpuc->active_mask))
@@ -587,12 +598,12 @@ void hw_perf_disable(void)
x86_pmu.disable_all();
}
-static void x86_pmu_enable_all(void)
+static void x86_pmu_enable_all(int added)
{
struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
int idx;
- for (idx = 0; idx < x86_pmu.num_events; idx++) {
+ for (idx = 0; idx < x86_pmu.num_counters; idx++) {
struct perf_event *event = cpuc->events[idx];
u64 val;
@@ -667,14 +678,14 @@ static int x86_schedule_events(struct cpu_hw_events *cpuc, int n, int *assign)
* assign events to counters starting with most
* constrained events.
*/
- wmax = x86_pmu.num_events;
+ wmax = x86_pmu.num_counters;
/*
* when fixed event counters are present,
* wmax is incremented by 1 to account
* for one more choice
*/
- if (x86_pmu.num_events_fixed)
+ if (x86_pmu.num_counters_fixed)
wmax++;
for (w = 1, num = n; num && w <= wmax; w++) {
@@ -724,7 +735,7 @@ static int collect_events(struct cpu_hw_events *cpuc, struct perf_event *leader,
struct perf_event *event;
int n, max_count;
- max_count = x86_pmu.num_events + x86_pmu.num_events_fixed;
+ max_count = x86_pmu.num_counters + x86_pmu.num_counters_fixed;
/* current number of events already accepted */
n = cpuc->n_events;
@@ -795,7 +806,7 @@ void hw_perf_enable(void)
struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
struct perf_event *event;
struct hw_perf_event *hwc;
- int i;
+ int i, added = cpuc->n_added;
if (!x86_pmu_initialized())
return;
@@ -847,19 +858,20 @@ void hw_perf_enable(void)
cpuc->enabled = 1;
barrier();
- x86_pmu.enable_all();
+ x86_pmu.enable_all(added);
}
-static inline void __x86_pmu_enable_event(struct hw_perf_event *hwc)
+static inline void __x86_pmu_enable_event(struct hw_perf_event *hwc,
+ u64 enable_mask)
{
- (void)checking_wrmsrl(hwc->config_base + hwc->idx,
- hwc->config | ARCH_PERFMON_EVENTSEL_ENABLE);
+ wrmsrl(hwc->config_base + hwc->idx, hwc->config | enable_mask);
}
static inline void x86_pmu_disable_event(struct perf_event *event)
{
struct hw_perf_event *hwc = &event->hw;
- (void)checking_wrmsrl(hwc->config_base + hwc->idx, hwc->config);
+
+ wrmsrl(hwc->config_base + hwc->idx, hwc->config);
}
static DEFINE_PER_CPU(u64 [X86_PMC_IDX_MAX], pmc_prev_left);
@@ -874,7 +886,7 @@ x86_perf_event_set_period(struct perf_event *event)
struct hw_perf_event *hwc = &event->hw;
s64 left = atomic64_read(&hwc->period_left);
s64 period = hwc->sample_period;
- int err, ret = 0, idx = hwc->idx;
+ int ret = 0, idx = hwc->idx;
if (idx == X86_PMC_IDX_FIXED_BTS)
return 0;
@@ -912,8 +924,8 @@ x86_perf_event_set_period(struct perf_event *event)
*/
atomic64_set(&hwc->prev_count, (u64)-left);
- err = checking_wrmsrl(hwc->event_base + idx,
- (u64)(-left) & x86_pmu.event_mask);
+ wrmsrl(hwc->event_base + idx,
+ (u64)(-left) & x86_pmu.cntval_mask);
perf_event_update_userpage(event);
@@ -924,7 +936,8 @@ static void x86_pmu_enable_event(struct perf_event *event)
{
struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
if (cpuc->enabled)
- __x86_pmu_enable_event(&event->hw);
+ __x86_pmu_enable_event(&event->hw,
+ ARCH_PERFMON_EVENTSEL_ENABLE);
}
/*
@@ -950,7 +963,15 @@ static int x86_pmu_enable(struct perf_event *event)
if (n < 0)
return n;
- ret = x86_schedule_events(cpuc, n, assign);
+ /*
+ * If group events scheduling transaction was started,
+ * skip the schedulability test here, it will be peformed
+ * at commit time(->commit_txn) as a whole
+ */
+ if (cpuc->group_flag & PERF_EVENT_TXN_STARTED)
+ goto out;
+
+ ret = x86_pmu.schedule_events(cpuc, n, assign);
if (ret)
return ret;
/*
@@ -959,6 +980,7 @@ static int x86_pmu_enable(struct perf_event *event)
*/
memcpy(cpuc->assign, assign, n*sizeof(int));
+out:
cpuc->n_events = n;
cpuc->n_added += n - n0;
@@ -991,11 +1013,12 @@ static void x86_pmu_unthrottle(struct perf_event *event)
void perf_event_print_debug(void)
{
u64 ctrl, status, overflow, pmc_ctrl, pmc_count, prev_left, fixed;
+ u64 pebs;
struct cpu_hw_events *cpuc;
unsigned long flags;
int cpu, idx;
- if (!x86_pmu.num_events)
+ if (!x86_pmu.num_counters)
return;
local_irq_save(flags);
@@ -1008,16 +1031,18 @@ void perf_event_print_debug(void)
rdmsrl(MSR_CORE_PERF_GLOBAL_STATUS, status);
rdmsrl(MSR_CORE_PERF_GLOBAL_OVF_CTRL, overflow);
rdmsrl(MSR_ARCH_PERFMON_FIXED_CTR_CTRL, fixed);
+ rdmsrl(MSR_IA32_PEBS_ENABLE, pebs);
pr_info("\n");
pr_info("CPU#%d: ctrl: %016llx\n", cpu, ctrl);
pr_info("CPU#%d: status: %016llx\n", cpu, status);
pr_info("CPU#%d: overflow: %016llx\n", cpu, overflow);
pr_info("CPU#%d: fixed: %016llx\n", cpu, fixed);
+ pr_info("CPU#%d: pebs: %016llx\n", cpu, pebs);
}
- pr_info("CPU#%d: active: %016llx\n", cpu, *(u64 *)cpuc->active_mask);
+ pr_info("CPU#%d: active: %016llx\n", cpu, *(u64 *)cpuc->active_mask);
- for (idx = 0; idx < x86_pmu.num_events; idx++) {
+ for (idx = 0; idx < x86_pmu.num_counters; idx++) {
rdmsrl(x86_pmu.eventsel + idx, pmc_ctrl);
rdmsrl(x86_pmu.perfctr + idx, pmc_count);
@@ -1030,7 +1055,7 @@ void perf_event_print_debug(void)
pr_info("CPU#%d: gen-PMC%d left: %016llx\n",
cpu, idx, prev_left);
}
- for (idx = 0; idx < x86_pmu.num_events_fixed; idx++) {
+ for (idx = 0; idx < x86_pmu.num_counters_fixed; idx++) {
rdmsrl(MSR_ARCH_PERFMON_FIXED_CTR0 + idx, pmc_count);
pr_info("CPU#%d: fixed-PMC%d count: %016llx\n",
@@ -1095,7 +1120,7 @@ static int x86_pmu_handle_irq(struct pt_regs *regs)
cpuc = &__get_cpu_var(cpu_hw_events);
- for (idx = 0; idx < x86_pmu.num_events; idx++) {
+ for (idx = 0; idx < x86_pmu.num_counters; idx++) {
if (!test_bit(idx, cpuc->active_mask))
continue;
@@ -1103,7 +1128,7 @@ static int x86_pmu_handle_irq(struct pt_regs *regs)
hwc = &event->hw;
val = x86_perf_event_update(event);
- if (val & (1ULL << (x86_pmu.event_bits - 1)))
+ if (val & (1ULL << (x86_pmu.cntval_bits - 1)))
continue;
/*
@@ -1146,7 +1171,6 @@ void set_perf_event_pending(void)
void perf_events_lapic_init(void)
{
-#ifdef CONFIG_X86_LOCAL_APIC
if (!x86_pmu.apic || !x86_pmu_initialized())
return;
@@ -1154,7 +1178,6 @@ void perf_events_lapic_init(void)
* Always use NMI for PMU
*/
apic_write(APIC_LVTPC, APIC_DM_NMI);
-#endif
}
static int __kprobes
@@ -1178,9 +1201,7 @@ perf_event_nmi_handler(struct notifier_block *self,
regs = args->regs;
-#ifdef CONFIG_X86_LOCAL_APIC
apic_write(APIC_LVTPC, APIC_DM_NMI);
-#endif
/*
* Can't rely on the handled return value to say it was our NMI, two
* events could trigger 'simultaneously' raising two back-to-back NMIs.
@@ -1217,118 +1238,11 @@ x86_get_event_constraints(struct cpu_hw_events *cpuc, struct perf_event *event)
return &unconstrained;
}
-static int x86_event_sched_in(struct perf_event *event,
- struct perf_cpu_context *cpuctx)
-{
- int ret = 0;
-
- event->state = PERF_EVENT_STATE_ACTIVE;
- event->oncpu = smp_processor_id();
- event->tstamp_running += event->ctx->time - event->tstamp_stopped;
-
- if (!is_x86_event(event))
- ret = event->pmu->enable(event);
-
- if (!ret && !is_software_event(event))
- cpuctx->active_oncpu++;
-
- if (!ret && event->attr.exclusive)
- cpuctx->exclusive = 1;
-
- return ret;
-}
-
-static void x86_event_sched_out(struct perf_event *event,
- struct perf_cpu_context *cpuctx)
-{
- event->state = PERF_EVENT_STATE_INACTIVE;
- event->oncpu = -1;
-
- if (!is_x86_event(event))
- event->pmu->disable(event);
-
- event->tstamp_running -= event->ctx->time - event->tstamp_stopped;
-
- if (!is_software_event(event))
- cpuctx->active_oncpu--;
-
- if (event->attr.exclusive || !cpuctx->active_oncpu)
- cpuctx->exclusive = 0;
-}
-
-/*
- * Called to enable a whole group of events.
- * Returns 1 if the group was enabled, or -EAGAIN if it could not be.
- * Assumes the caller has disabled interrupts and has
- * frozen the PMU with hw_perf_save_disable.
- *
- * called with PMU disabled. If successful and return value 1,
- * then guaranteed to call perf_enable() and hw_perf_enable()
- */
-int hw_perf_group_sched_in(struct perf_event *leader,
- struct perf_cpu_context *cpuctx,
- struct perf_event_context *ctx)
-{
- struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
- struct perf_event *sub;
- int assign[X86_PMC_IDX_MAX];
- int n0, n1, ret;
-
- /* n0 = total number of events */
- n0 = collect_events(cpuc, leader, true);
- if (n0 < 0)
- return n0;
-
- ret = x86_schedule_events(cpuc, n0, assign);
- if (ret)
- return ret;
-
- ret = x86_event_sched_in(leader, cpuctx);
- if (ret)
- return ret;
-
- n1 = 1;
- list_for_each_entry(sub, &leader->sibling_list, group_entry) {
- if (sub->state > PERF_EVENT_STATE_OFF) {
- ret = x86_event_sched_in(sub, cpuctx);
- if (ret)
- goto undo;
- ++n1;
- }
- }
- /*
- * copy new assignment, now we know it is possible
- * will be used by hw_perf_enable()
- */
- memcpy(cpuc->assign, assign, n0*sizeof(int));
-
- cpuc->n_events = n0;
- cpuc->n_added += n1;
- ctx->nr_active += n1;
-
- /*
- * 1 means successful and events are active
- * This is not quite true because we defer
- * actual activation until hw_perf_enable() but
- * this way we* ensure caller won't try to enable
- * individual events
- */
- return 1;
-undo:
- x86_event_sched_out(leader, cpuctx);
- n0 = 1;
- list_for_each_entry(sub, &leader->sibling_list, group_entry) {
- if (sub->state == PERF_EVENT_STATE_ACTIVE) {
- x86_event_sched_out(sub, cpuctx);
- if (++n0 == n1)
- break;
- }
- }
- return ret;
-}
-
#include "perf_event_amd.c"
#include "perf_event_p6.c"
+#include "perf_event_p4.c"
+#include "perf_event_intel_lbr.c"
+#include "perf_event_intel_ds.c"
#include "perf_event_intel.c"
static int __cpuinit
@@ -1402,48 +1316,50 @@ void __init init_hw_perf_events(void)
pr_cont("%s PMU driver.\n", x86_pmu.name);
- if (x86_pmu.num_events > X86_PMC_MAX_GENERIC) {
+ if (x86_pmu.quirks)
+ x86_pmu.quirks();
+
+ if (x86_pmu.num_counters > X86_PMC_MAX_GENERIC) {
WARN(1, KERN_ERR "hw perf events %d > max(%d), clipping!",
- x86_pmu.num_events, X86_PMC_MAX_GENERIC);
- x86_pmu.num_events = X86_PMC_MAX_GENERIC;
+ x86_pmu.num_counters, X86_PMC_MAX_GENERIC);
+ x86_pmu.num_counters = X86_PMC_MAX_GENERIC;
}
- perf_event_mask = (1 << x86_pmu.num_events) - 1;
- perf_max_events = x86_pmu.num_events;
+ x86_pmu.intel_ctrl = (1 << x86_pmu.num_counters) - 1;
+ perf_max_events = x86_pmu.num_counters;
- if (x86_pmu.num_events_fixed > X86_PMC_MAX_FIXED) {
+ if (x86_pmu.num_counters_fixed > X86_PMC_MAX_FIXED) {
WARN(1, KERN_ERR "hw perf events fixed %d > max(%d), clipping!",
- x86_pmu.num_events_fixed, X86_PMC_MAX_FIXED);
- x86_pmu.num_events_fixed = X86_PMC_MAX_FIXED;
+ x86_pmu.num_counters_fixed, X86_PMC_MAX_FIXED);
+ x86_pmu.num_counters_fixed = X86_PMC_MAX_FIXED;
}
- perf_event_mask |=
- ((1LL << x86_pmu.num_events_fixed)-1) << X86_PMC_IDX_FIXED;
- x86_pmu.intel_ctrl = perf_event_mask;
+ x86_pmu.intel_ctrl |=
+ ((1LL << x86_pmu.num_counters_fixed)-1) << X86_PMC_IDX_FIXED;
perf_events_lapic_init();
register_die_notifier(&perf_event_nmi_notifier);
unconstrained = (struct event_constraint)
- __EVENT_CONSTRAINT(0, (1ULL << x86_pmu.num_events) - 1,
- 0, x86_pmu.num_events);
+ __EVENT_CONSTRAINT(0, (1ULL << x86_pmu.num_counters) - 1,
+ 0, x86_pmu.num_counters);
if (x86_pmu.event_constraints) {
for_each_event_constraint(c, x86_pmu.event_constraints) {
- if (c->cmask != INTEL_ARCH_FIXED_MASK)
+ if (c->cmask != X86_RAW_EVENT_MASK)
continue;
- c->idxmsk64 |= (1ULL << x86_pmu.num_events) - 1;
- c->weight += x86_pmu.num_events;
+ c->idxmsk64 |= (1ULL << x86_pmu.num_counters) - 1;
+ c->weight += x86_pmu.num_counters;
}
}
pr_info("... version: %d\n", x86_pmu.version);
- pr_info("... bit width: %d\n", x86_pmu.event_bits);
- pr_info("... generic registers: %d\n", x86_pmu.num_events);
- pr_info("... value mask: %016Lx\n", x86_pmu.event_mask);
+ pr_info("... bit width: %d\n", x86_pmu.cntval_bits);
+ pr_info("... generic registers: %d\n", x86_pmu.num_counters);
+ pr_info("... value mask: %016Lx\n", x86_pmu.cntval_mask);
pr_info("... max period: %016Lx\n", x86_pmu.max_period);
- pr_info("... fixed-purpose events: %d\n", x86_pmu.num_events_fixed);
- pr_info("... event mask: %016Lx\n", perf_event_mask);
+ pr_info("... fixed-purpose events: %d\n", x86_pmu.num_counters_fixed);
+ pr_info("... event mask: %016Lx\n", x86_pmu.intel_ctrl);
perf_cpu_notifier(x86_pmu_notifier);
}
@@ -1453,6 +1369,59 @@ static inline void x86_pmu_read(struct perf_event *event)
x86_perf_event_update(event);
}
+/*
+ * Start group events scheduling transaction
+ * Set the flag to make pmu::enable() not perform the
+ * schedulability test, it will be performed at commit time
+ */
+static void x86_pmu_start_txn(const struct pmu *pmu)
+{
+ struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
+
+ cpuc->group_flag |= PERF_EVENT_TXN_STARTED;
+}
+
+/*
+ * Stop group events scheduling transaction
+ * Clear the flag and pmu::enable() will perform the
+ * schedulability test.
+ */
+static void x86_pmu_cancel_txn(const struct pmu *pmu)
+{
+ struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
+
+ cpuc->group_flag &= ~PERF_EVENT_TXN_STARTED;
+}
+
+/*
+ * Commit group events scheduling transaction
+ * Perform the group schedulability test as a whole
+ * Return 0 if success
+ */
+static int x86_pmu_commit_txn(const struct pmu *pmu)
+{
+ struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
+ int assign[X86_PMC_IDX_MAX];
+ int n, ret;
+
+ n = cpuc->n_events;
+
+ if (!x86_pmu_initialized())
+ return -EAGAIN;
+
+ ret = x86_pmu.schedule_events(cpuc, n, assign);
+ if (ret)
+ return ret;
+
+ /*
+ * copy new assignment, now we know it is possible
+ * will be used by hw_perf_enable()
+ */
+ memcpy(cpuc->assign, assign, n*sizeof(int));
+
+ return 0;
+}
+
static const struct pmu pmu = {
.enable = x86_pmu_enable,
.disable = x86_pmu_disable,
@@ -1460,9 +1429,38 @@ static const struct pmu pmu = {
.stop = x86_pmu_stop,
.read = x86_pmu_read,
.unthrottle = x86_pmu_unthrottle,
+ .start_txn = x86_pmu_start_txn,
+ .cancel_txn = x86_pmu_cancel_txn,
+ .commit_txn = x86_pmu_commit_txn,
};
/*
+ * validate that we can schedule this event
+ */
+static int validate_event(struct perf_event *event)
+{
+ struct cpu_hw_events *fake_cpuc;
+ struct event_constraint *c;
+ int ret = 0;
+
+ fake_cpuc = kmalloc(sizeof(*fake_cpuc), GFP_KERNEL | __GFP_ZERO);
+ if (!fake_cpuc)
+ return -ENOMEM;
+
+ c = x86_pmu.get_event_constraints(fake_cpuc, event);
+
+ if (!c || !c->weight)
+ ret = -ENOSPC;
+
+ if (x86_pmu.put_event_constraints)
+ x86_pmu.put_event_constraints(fake_cpuc, event);
+
+ kfree(fake_cpuc);
+
+ return ret;
+}
+
+/*
* validate a single event group
*
* validation include:
@@ -1502,7 +1500,7 @@ static int validate_group(struct perf_event *event)
fake_cpuc->n_events = n;
- ret = x86_schedule_events(fake_cpuc, n, NULL);
+ ret = x86_pmu.schedule_events(fake_cpuc, n, NULL);
out_free:
kfree(fake_cpuc);
@@ -1527,6 +1525,8 @@ const struct pmu *hw_perf_event_init(struct perf_event *event)
if (event->group_leader != event)
err = validate_group(event);
+ else
+ err = validate_event(event);
event->pmu = tmp;
}
@@ -1574,8 +1574,7 @@ static void backtrace_address(void *data, unsigned long addr, int reliable)
{
struct perf_callchain_entry *entry = data;
- if (reliable)
- callchain_store(entry, addr);
+ callchain_store(entry, addr);
}
static const struct stacktrace_ops backtrace_ops = {
@@ -1597,41 +1596,6 @@ perf_callchain_kernel(struct pt_regs *regs, struct perf_callchain_entry *entry)
dump_trace(NULL, regs, NULL, regs->bp, &backtrace_ops, entry);
}
-/*
- * best effort, GUP based copy_from_user() that assumes IRQ or NMI context
- */
-static unsigned long
-copy_from_user_nmi(void *to, const void __user *from, unsigned long n)
-{
- unsigned long offset, addr = (unsigned long)from;
- int type = in_nmi() ? KM_NMI : KM_IRQ0;
- unsigned long size, len = 0;
- struct page *page;
- void *map;
- int ret;
-
- do {
- ret = __get_user_pages_fast(addr, 1, 0, &page);
- if (!ret)
- break;
-
- offset = addr & (PAGE_SIZE - 1);
- size = min(PAGE_SIZE - offset, n - len);
-
- map = kmap_atomic(page, type);
- memcpy(to, map+offset, size);
- kunmap_atomic(map, type);
- put_page(page);
-
- len += size;
- to += size;
- addr += size;
-
- } while (len < n);
-
- return len;
-}
-
#ifdef CONFIG_COMPAT
static inline int
perf_callchain_user32(struct pt_regs *regs, struct perf_callchain_entry *entry)
@@ -1727,6 +1691,11 @@ struct perf_callchain_entry *perf_callchain(struct pt_regs *regs)
{
struct perf_callchain_entry *entry;
+ if (perf_guest_cbs && perf_guest_cbs->is_in_guest()) {
+ /* TODO: We don't support guest os callchain now */
+ return NULL;
+ }
+
if (in_nmi())
entry = &__get_cpu_var(pmc_nmi_entry);
else
@@ -1750,3 +1719,37 @@ void perf_arch_fetch_caller_regs(struct pt_regs *regs, unsigned long ip, int ski
regs->cs = __KERNEL_CS;
local_save_flags(regs->flags);
}
+
+unsigned long perf_instruction_pointer(struct pt_regs *regs)
+{
+ unsigned long ip;
+
+ if (perf_guest_cbs && perf_guest_cbs->is_in_guest())
+ ip = perf_guest_cbs->get_guest_ip();
+ else
+ ip = instruction_pointer(regs);
+
+ return ip;
+}
+
+unsigned long perf_misc_flags(struct pt_regs *regs)
+{
+ int misc = 0;
+
+ if (perf_guest_cbs && perf_guest_cbs->is_in_guest()) {
+ if (perf_guest_cbs->is_user_mode())
+ misc |= PERF_RECORD_MISC_GUEST_USER;
+ else
+ misc |= PERF_RECORD_MISC_GUEST_KERNEL;
+ } else {
+ if (user_mode(regs))
+ misc |= PERF_RECORD_MISC_USER;
+ else
+ misc |= PERF_RECORD_MISC_KERNEL;
+ }
+
+ if (regs->flags & PERF_EFLAGS_EXACT)
+ misc |= PERF_RECORD_MISC_EXACT_IP;
+
+ return misc;
+}
diff --git a/arch/x86/kernel/cpu/perf_event_amd.c b/arch/x86/kernel/cpu/perf_event_amd.c
index db6f7d4056e1..611df11ba15e 100644
--- a/arch/x86/kernel/cpu/perf_event_amd.c
+++ b/arch/x86/kernel/cpu/perf_event_amd.c
@@ -2,7 +2,7 @@
static DEFINE_RAW_SPINLOCK(amd_nb_lock);
-static __initconst u64 amd_hw_cache_event_ids
+static __initconst const u64 amd_hw_cache_event_ids
[PERF_COUNT_HW_CACHE_MAX]
[PERF_COUNT_HW_CACHE_OP_MAX]
[PERF_COUNT_HW_CACHE_RESULT_MAX] =
@@ -111,22 +111,19 @@ static u64 amd_pmu_event_map(int hw_event)
return amd_perfmon_event_map[hw_event];
}
-static u64 amd_pmu_raw_event(u64 hw_event)
+static int amd_pmu_hw_config(struct perf_event *event)
{
-#define K7_EVNTSEL_EVENT_MASK 0xF000000FFULL
-#define K7_EVNTSEL_UNIT_MASK 0x00000FF00ULL
-#define K7_EVNTSEL_EDGE_MASK 0x000040000ULL
-#define K7_EVNTSEL_INV_MASK 0x000800000ULL
-#define K7_EVNTSEL_REG_MASK 0x0FF000000ULL
-
-#define K7_EVNTSEL_MASK \
- (K7_EVNTSEL_EVENT_MASK | \
- K7_EVNTSEL_UNIT_MASK | \
- K7_EVNTSEL_EDGE_MASK | \
- K7_EVNTSEL_INV_MASK | \
- K7_EVNTSEL_REG_MASK)
-
- return hw_event & K7_EVNTSEL_MASK;
+ int ret = x86_pmu_hw_config(event);
+
+ if (ret)
+ return ret;
+
+ if (event->attr.type != PERF_TYPE_RAW)
+ return 0;
+
+ event->hw.config |= event->attr.config & AMD64_RAW_EVENT_MASK;
+
+ return 0;
}
/*
@@ -165,7 +162,7 @@ static void amd_put_event_constraints(struct cpu_hw_events *cpuc,
* be removed on one CPU at a time AND PMU is disabled
* when we come here
*/
- for (i = 0; i < x86_pmu.num_events; i++) {
+ for (i = 0; i < x86_pmu.num_counters; i++) {
if (nb->owners[i] == event) {
cmpxchg(nb->owners+i, event, NULL);
break;
@@ -215,7 +212,7 @@ amd_get_event_constraints(struct cpu_hw_events *cpuc, struct perf_event *event)
struct hw_perf_event *hwc = &event->hw;
struct amd_nb *nb = cpuc->amd_nb;
struct perf_event *old = NULL;
- int max = x86_pmu.num_events;
+ int max = x86_pmu.num_counters;
int i, j, k = -1;
/*
@@ -293,7 +290,7 @@ static struct amd_nb *amd_alloc_nb(int cpu, int nb_id)
/*
* initialize all possible NB constraints
*/
- for (i = 0; i < x86_pmu.num_events; i++) {
+ for (i = 0; i < x86_pmu.num_counters; i++) {
__set_bit(i, nb->event_constraints[i].idxmsk);
nb->event_constraints[i].weight = 1;
}
@@ -371,21 +368,22 @@ static void amd_pmu_cpu_dead(int cpu)
raw_spin_unlock(&amd_nb_lock);
}
-static __initconst struct x86_pmu amd_pmu = {
+static __initconst const struct x86_pmu amd_pmu = {
.name = "AMD",
.handle_irq = x86_pmu_handle_irq,
.disable_all = x86_pmu_disable_all,
.enable_all = x86_pmu_enable_all,
.enable = x86_pmu_enable_event,
.disable = x86_pmu_disable_event,
+ .hw_config = amd_pmu_hw_config,
+ .schedule_events = x86_schedule_events,
.eventsel = MSR_K7_EVNTSEL0,
.perfctr = MSR_K7_PERFCTR0,
.event_map = amd_pmu_event_map,
- .raw_event = amd_pmu_raw_event,
.max_events = ARRAY_SIZE(amd_perfmon_event_map),
- .num_events = 4,
- .event_bits = 48,
- .event_mask = (1ULL << 48) - 1,
+ .num_counters = 4,
+ .cntval_bits = 48,
+ .cntval_mask = (1ULL << 48) - 1,
.apic = 1,
/* use highest bit to detect overflow */
.max_period = (1ULL << 47) - 1,
diff --git a/arch/x86/kernel/cpu/perf_event_intel.c b/arch/x86/kernel/cpu/perf_event_intel.c
index 9c794ac87837..fdbc652d3feb 100644
--- a/arch/x86/kernel/cpu/perf_event_intel.c
+++ b/arch/x86/kernel/cpu/perf_event_intel.c
@@ -88,7 +88,7 @@ static u64 intel_pmu_event_map(int hw_event)
return intel_perfmon_event_map[hw_event];
}
-static __initconst u64 westmere_hw_cache_event_ids
+static __initconst const u64 westmere_hw_cache_event_ids
[PERF_COUNT_HW_CACHE_MAX]
[PERF_COUNT_HW_CACHE_OP_MAX]
[PERF_COUNT_HW_CACHE_RESULT_MAX] =
@@ -179,7 +179,7 @@ static __initconst u64 westmere_hw_cache_event_ids
},
};
-static __initconst u64 nehalem_hw_cache_event_ids
+static __initconst const u64 nehalem_hw_cache_event_ids
[PERF_COUNT_HW_CACHE_MAX]
[PERF_COUNT_HW_CACHE_OP_MAX]
[PERF_COUNT_HW_CACHE_RESULT_MAX] =
@@ -270,7 +270,7 @@ static __initconst u64 nehalem_hw_cache_event_ids
},
};
-static __initconst u64 core2_hw_cache_event_ids
+static __initconst const u64 core2_hw_cache_event_ids
[PERF_COUNT_HW_CACHE_MAX]
[PERF_COUNT_HW_CACHE_OP_MAX]
[PERF_COUNT_HW_CACHE_RESULT_MAX] =
@@ -361,7 +361,7 @@ static __initconst u64 core2_hw_cache_event_ids
},
};
-static __initconst u64 atom_hw_cache_event_ids
+static __initconst const u64 atom_hw_cache_event_ids
[PERF_COUNT_HW_CACHE_MAX]
[PERF_COUNT_HW_CACHE_OP_MAX]
[PERF_COUNT_HW_CACHE_RESULT_MAX] =
@@ -452,60 +452,6 @@ static __initconst u64 atom_hw_cache_event_ids
},
};
-static u64 intel_pmu_raw_event(u64 hw_event)
-{
-#define CORE_EVNTSEL_EVENT_MASK 0x000000FFULL
-#define CORE_EVNTSEL_UNIT_MASK 0x0000FF00ULL
-#define CORE_EVNTSEL_EDGE_MASK 0x00040000ULL
-#define CORE_EVNTSEL_INV_MASK 0x00800000ULL
-#define CORE_EVNTSEL_REG_MASK 0xFF000000ULL
-
-#define CORE_EVNTSEL_MASK \
- (INTEL_ARCH_EVTSEL_MASK | \
- INTEL_ARCH_UNIT_MASK | \
- INTEL_ARCH_EDGE_MASK | \
- INTEL_ARCH_INV_MASK | \
- INTEL_ARCH_CNT_MASK)
-
- return hw_event & CORE_EVNTSEL_MASK;
-}
-
-static void intel_pmu_enable_bts(u64 config)
-{
- unsigned long debugctlmsr;
-
- debugctlmsr = get_debugctlmsr();
-
- debugctlmsr |= X86_DEBUGCTL_TR;
- debugctlmsr |= X86_DEBUGCTL_BTS;
- debugctlmsr |= X86_DEBUGCTL_BTINT;
-
- if (!(config & ARCH_PERFMON_EVENTSEL_OS))
- debugctlmsr |= X86_DEBUGCTL_BTS_OFF_OS;
-
- if (!(config & ARCH_PERFMON_EVENTSEL_USR))
- debugctlmsr |= X86_DEBUGCTL_BTS_OFF_USR;
-
- update_debugctlmsr(debugctlmsr);
-}
-
-static void intel_pmu_disable_bts(void)
-{
- struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
- unsigned long debugctlmsr;
-
- if (!cpuc->ds)
- return;
-
- debugctlmsr = get_debugctlmsr();
-
- debugctlmsr &=
- ~(X86_DEBUGCTL_TR | X86_DEBUGCTL_BTS | X86_DEBUGCTL_BTINT |
- X86_DEBUGCTL_BTS_OFF_OS | X86_DEBUGCTL_BTS_OFF_USR);
-
- update_debugctlmsr(debugctlmsr);
-}
-
static void intel_pmu_disable_all(void)
{
struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
@@ -514,12 +460,17 @@ static void intel_pmu_disable_all(void)
if (test_bit(X86_PMC_IDX_FIXED_BTS, cpuc->active_mask))
intel_pmu_disable_bts();
+
+ intel_pmu_pebs_disable_all();
+ intel_pmu_lbr_disable_all();
}
-static void intel_pmu_enable_all(void)
+static void intel_pmu_enable_all(int added)
{
struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
+ intel_pmu_pebs_enable_all();
+ intel_pmu_lbr_enable_all();
wrmsrl(MSR_CORE_PERF_GLOBAL_CTRL, x86_pmu.intel_ctrl);
if (test_bit(X86_PMC_IDX_FIXED_BTS, cpuc->active_mask)) {
@@ -533,6 +484,42 @@ static void intel_pmu_enable_all(void)
}
}
+/*
+ * Workaround for:
+ * Intel Errata AAK100 (model 26)
+ * Intel Errata AAP53 (model 30)
+ * Intel Errata BD53 (model 44)
+ *
+ * These chips need to be 'reset' when adding counters by programming
+ * the magic three (non counting) events 0x4300D2, 0x4300B1 and 0x4300B5
+ * either in sequence on the same PMC or on different PMCs.
+ */
+static void intel_pmu_nhm_enable_all(int added)
+{
+ if (added) {
+ struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
+ int i;
+
+ wrmsrl(MSR_ARCH_PERFMON_EVENTSEL0 + 0, 0x4300D2);
+ wrmsrl(MSR_ARCH_PERFMON_EVENTSEL0 + 1, 0x4300B1);
+ wrmsrl(MSR_ARCH_PERFMON_EVENTSEL0 + 2, 0x4300B5);
+
+ wrmsrl(MSR_CORE_PERF_GLOBAL_CTRL, 0x3);
+ wrmsrl(MSR_CORE_PERF_GLOBAL_CTRL, 0x0);
+
+ for (i = 0; i < 3; i++) {
+ struct perf_event *event = cpuc->events[i];
+
+ if (!event)
+ continue;
+
+ __x86_pmu_enable_event(&event->hw,
+ ARCH_PERFMON_EVENTSEL_ENABLE);
+ }
+ }
+ intel_pmu_enable_all(added);
+}
+
static inline u64 intel_pmu_get_status(void)
{
u64 status;
@@ -547,8 +534,7 @@ static inline void intel_pmu_ack_status(u64 ack)
wrmsrl(MSR_CORE_PERF_GLOBAL_OVF_CTRL, ack);
}
-static inline void
-intel_pmu_disable_fixed(struct hw_perf_event *hwc)
+static void intel_pmu_disable_fixed(struct hw_perf_event *hwc)
{
int idx = hwc->idx - X86_PMC_IDX_FIXED;
u64 ctrl_val, mask;
@@ -557,71 +543,10 @@ intel_pmu_disable_fixed(struct hw_perf_event *hwc)
rdmsrl(hwc->config_base, ctrl_val);
ctrl_val &= ~mask;
- (void)checking_wrmsrl(hwc->config_base, ctrl_val);
-}
-
-static void intel_pmu_drain_bts_buffer(void)
-{
- struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
- struct debug_store *ds = cpuc->ds;
- struct bts_record {
- u64 from;
- u64 to;
- u64 flags;
- };
- struct perf_event *event = cpuc->events[X86_PMC_IDX_FIXED_BTS];
- struct bts_record *at, *top;
- struct perf_output_handle handle;
- struct perf_event_header header;
- struct perf_sample_data data;
- struct pt_regs regs;
-
- if (!event)
- return;
-
- if (!ds)
- return;
-
- at = (struct bts_record *)(unsigned long)ds->bts_buffer_base;
- top = (struct bts_record *)(unsigned long)ds->bts_index;
-
- if (top <= at)
- return;
-
- ds->bts_index = ds->bts_buffer_base;
-
- perf_sample_data_init(&data, 0);
-
- data.period = event->hw.last_period;
- regs.ip = 0;
-
- /*
- * Prepare a generic sample, i.e. fill in the invariant fields.
- * We will overwrite the from and to address before we output
- * the sample.
- */
- perf_prepare_sample(&header, &data, event, &regs);
-
- if (perf_output_begin(&handle, event,
- header.size * (top - at), 1, 1))
- return;
-
- for (; at < top; at++) {
- data.ip = at->from;
- data.addr = at->to;
-
- perf_output_sample(&handle, &header, &data, event);
- }
-
- perf_output_end(&handle);
-
- /* There's new data available. */
- event->hw.interrupts++;
- event->pending_kill = POLL_IN;
+ wrmsrl(hwc->config_base, ctrl_val);
}
-static inline void
-intel_pmu_disable_event(struct perf_event *event)
+static void intel_pmu_disable_event(struct perf_event *event)
{
struct hw_perf_event *hwc = &event->hw;
@@ -637,14 +562,15 @@ intel_pmu_disable_event(struct perf_event *event)
}
x86_pmu_disable_event(event);
+
+ if (unlikely(event->attr.precise_ip))
+ intel_pmu_pebs_disable(event);
}
-static inline void
-intel_pmu_enable_fixed(struct hw_perf_event *hwc)
+static void intel_pmu_enable_fixed(struct hw_perf_event *hwc)
{
int idx = hwc->idx - X86_PMC_IDX_FIXED;
u64 ctrl_val, bits, mask;
- int err;
/*
* Enable IRQ generation (0x8),
@@ -669,7 +595,7 @@ intel_pmu_enable_fixed(struct hw_perf_event *hwc)
rdmsrl(hwc->config_base, ctrl_val);
ctrl_val &= ~mask;
ctrl_val |= bits;
- err = checking_wrmsrl(hwc->config_base, ctrl_val);
+ wrmsrl(hwc->config_base, ctrl_val);
}
static void intel_pmu_enable_event(struct perf_event *event)
@@ -689,7 +615,10 @@ static void intel_pmu_enable_event(struct perf_event *event)
return;
}
- __x86_pmu_enable_event(hwc);
+ if (unlikely(event->attr.precise_ip))
+ intel_pmu_pebs_enable(event);
+
+ __x86_pmu_enable_event(hwc, ARCH_PERFMON_EVENTSEL_ENABLE);
}
/*
@@ -708,20 +637,20 @@ static void intel_pmu_reset(void)
unsigned long flags;
int idx;
- if (!x86_pmu.num_events)
+ if (!x86_pmu.num_counters)
return;
local_irq_save(flags);
printk("clearing PMU state on CPU#%d\n", smp_processor_id());
- for (idx = 0; idx < x86_pmu.num_events; idx++) {
+ for (idx = 0; idx < x86_pmu.num_counters; idx++) {
checking_wrmsrl(x86_pmu.eventsel + idx, 0ull);
checking_wrmsrl(x86_pmu.perfctr + idx, 0ull);
}
- for (idx = 0; idx < x86_pmu.num_events_fixed; idx++) {
+ for (idx = 0; idx < x86_pmu.num_counters_fixed; idx++)
checking_wrmsrl(MSR_ARCH_PERFMON_FIXED_CTR0 + idx, 0ull);
- }
+
if (ds)
ds->bts_index = ds->bts_buffer_base;
@@ -747,7 +676,7 @@ static int intel_pmu_handle_irq(struct pt_regs *regs)
intel_pmu_drain_bts_buffer();
status = intel_pmu_get_status();
if (!status) {
- intel_pmu_enable_all();
+ intel_pmu_enable_all(0);
return 0;
}
@@ -762,6 +691,15 @@ again:
inc_irq_stat(apic_perf_irqs);
ack = status;
+
+ intel_pmu_lbr_read();
+
+ /*
+ * PEBS overflow sets bit 62 in the global status register
+ */
+ if (__test_and_clear_bit(62, (unsigned long *)&status))
+ x86_pmu.drain_pebs(regs);
+
for_each_set_bit(bit, (unsigned long *)&status, X86_PMC_IDX_MAX) {
struct perf_event *event = cpuc->events[bit];
@@ -787,26 +725,22 @@ again:
goto again;
done:
- intel_pmu_enable_all();
+ intel_pmu_enable_all(0);
return 1;
}
-static struct event_constraint bts_constraint =
- EVENT_CONSTRAINT(0, 1ULL << X86_PMC_IDX_FIXED_BTS, 0);
-
static struct event_constraint *
-intel_special_constraints(struct perf_event *event)
+intel_bts_constraints(struct perf_event *event)
{
- unsigned int hw_event;
-
- hw_event = event->hw.config & INTEL_ARCH_EVENT_MASK;
+ struct hw_perf_event *hwc = &event->hw;
+ unsigned int hw_event, bts_event;
- if (unlikely((hw_event ==
- x86_pmu.event_map(PERF_COUNT_HW_BRANCH_INSTRUCTIONS)) &&
- (event->hw.sample_period == 1))) {
+ hw_event = hwc->config & INTEL_ARCH_EVENT_MASK;
+ bts_event = x86_pmu.event_map(PERF_COUNT_HW_BRANCH_INSTRUCTIONS);
+ if (unlikely(hw_event == bts_event && hwc->sample_period == 1))
return &bts_constraint;
- }
+
return NULL;
}
@@ -815,24 +749,53 @@ intel_get_event_constraints(struct cpu_hw_events *cpuc, struct perf_event *event
{
struct event_constraint *c;
- c = intel_special_constraints(event);
+ c = intel_bts_constraints(event);
+ if (c)
+ return c;
+
+ c = intel_pebs_constraints(event);
if (c)
return c;
return x86_get_event_constraints(cpuc, event);
}
-static __initconst struct x86_pmu core_pmu = {
+static int intel_pmu_hw_config(struct perf_event *event)
+{
+ int ret = x86_pmu_hw_config(event);
+
+ if (ret)
+ return ret;
+
+ if (event->attr.type != PERF_TYPE_RAW)
+ return 0;
+
+ if (!(event->attr.config & ARCH_PERFMON_EVENTSEL_ANY))
+ return 0;
+
+ if (x86_pmu.version < 3)
+ return -EINVAL;
+
+ if (perf_paranoid_cpu() && !capable(CAP_SYS_ADMIN))
+ return -EACCES;
+
+ event->hw.config |= ARCH_PERFMON_EVENTSEL_ANY;
+
+ return 0;
+}
+
+static __initconst const struct x86_pmu core_pmu = {
.name = "core",
.handle_irq = x86_pmu_handle_irq,
.disable_all = x86_pmu_disable_all,
.enable_all = x86_pmu_enable_all,
.enable = x86_pmu_enable_event,
.disable = x86_pmu_disable_event,
+ .hw_config = x86_pmu_hw_config,
+ .schedule_events = x86_schedule_events,
.eventsel = MSR_ARCH_PERFMON_EVENTSEL0,
.perfctr = MSR_ARCH_PERFMON_PERFCTR0,
.event_map = intel_pmu_event_map,
- .raw_event = intel_pmu_raw_event,
.max_events = ARRAY_SIZE(intel_perfmon_event_map),
.apic = 1,
/*
@@ -845,17 +808,32 @@ static __initconst struct x86_pmu core_pmu = {
.event_constraints = intel_core_event_constraints,
};
-static __initconst struct x86_pmu intel_pmu = {
+static void intel_pmu_cpu_starting(int cpu)
+{
+ init_debug_store_on_cpu(cpu);
+ /*
+ * Deal with CPUs that don't clear their LBRs on power-up.
+ */
+ intel_pmu_lbr_reset();
+}
+
+static void intel_pmu_cpu_dying(int cpu)
+{
+ fini_debug_store_on_cpu(cpu);
+}
+
+static __initconst const struct x86_pmu intel_pmu = {
.name = "Intel",
.handle_irq = intel_pmu_handle_irq,
.disable_all = intel_pmu_disable_all,
.enable_all = intel_pmu_enable_all,
.enable = intel_pmu_enable_event,
.disable = intel_pmu_disable_event,
+ .hw_config = intel_pmu_hw_config,
+ .schedule_events = x86_schedule_events,
.eventsel = MSR_ARCH_PERFMON_EVENTSEL0,
.perfctr = MSR_ARCH_PERFMON_PERFCTR0,
.event_map = intel_pmu_event_map,
- .raw_event = intel_pmu_raw_event,
.max_events = ARRAY_SIZE(intel_perfmon_event_map),
.apic = 1,
/*
@@ -864,14 +842,38 @@ static __initconst struct x86_pmu intel_pmu = {
* the generic event period:
*/
.max_period = (1ULL << 31) - 1,
- .enable_bts = intel_pmu_enable_bts,
- .disable_bts = intel_pmu_disable_bts,
.get_event_constraints = intel_get_event_constraints,
- .cpu_starting = init_debug_store_on_cpu,
- .cpu_dying = fini_debug_store_on_cpu,
+ .cpu_starting = intel_pmu_cpu_starting,
+ .cpu_dying = intel_pmu_cpu_dying,
};
+static void intel_clovertown_quirks(void)
+{
+ /*
+ * PEBS is unreliable due to:
+ *
+ * AJ67 - PEBS may experience CPL leaks
+ * AJ68 - PEBS PMI may be delayed by one event
+ * AJ69 - GLOBAL_STATUS[62] will only be set when DEBUGCTL[12]
+ * AJ106 - FREEZE_LBRS_ON_PMI doesn't work in combination with PEBS
+ *
+ * AJ67 could be worked around by restricting the OS/USR flags.
+ * AJ69 could be worked around by setting PMU_FREEZE_ON_PMI.
+ *
+ * AJ106 could possibly be worked around by not allowing LBR
+ * usage from PEBS, including the fixup.
+ * AJ68 could possibly be worked around by always programming
+ * a pebs_event_reset[0] value and coping with the lost events.
+ *
+ * But taken together it might just make sense to not enable PEBS on
+ * these chips.
+ */
+ printk(KERN_WARNING "PEBS disabled due to CPU errata.\n");
+ x86_pmu.pebs = 0;
+ x86_pmu.pebs_constraints = NULL;
+}
+
static __init int intel_pmu_init(void)
{
union cpuid10_edx edx;
@@ -881,12 +883,13 @@ static __init int intel_pmu_init(void)
int version;
if (!cpu_has(&boot_cpu_data, X86_FEATURE_ARCH_PERFMON)) {
- /* check for P6 processor family */
- if (boot_cpu_data.x86 == 6) {
- return p6_pmu_init();
- } else {
+ switch (boot_cpu_data.x86) {
+ case 0x6:
+ return p6_pmu_init();
+ case 0xf:
+ return p4_pmu_init();
+ }
return -ENODEV;
- }
}
/*
@@ -904,16 +907,28 @@ static __init int intel_pmu_init(void)
x86_pmu = intel_pmu;
x86_pmu.version = version;
- x86_pmu.num_events = eax.split.num_events;
- x86_pmu.event_bits = eax.split.bit_width;
- x86_pmu.event_mask = (1ULL << eax.split.bit_width) - 1;
+ x86_pmu.num_counters = eax.split.num_counters;
+ x86_pmu.cntval_bits = eax.split.bit_width;
+ x86_pmu.cntval_mask = (1ULL << eax.split.bit_width) - 1;
/*
* Quirk: v2 perfmon does not report fixed-purpose events, so
* assume at least 3 events:
*/
if (version > 1)
- x86_pmu.num_events_fixed = max((int)edx.split.num_events_fixed, 3);
+ x86_pmu.num_counters_fixed = max((int)edx.split.num_counters_fixed, 3);
+
+ /*
+ * v2 and above have a perf capabilities MSR
+ */
+ if (version > 1) {
+ u64 capabilities;
+
+ rdmsrl(MSR_IA32_PERF_CAPABILITIES, capabilities);
+ x86_pmu.intel_cap.capabilities = capabilities;
+ }
+
+ intel_ds_init();
/*
* Install the hw-cache-events table:
@@ -924,12 +939,15 @@ static __init int intel_pmu_init(void)
break;
case 15: /* original 65 nm celeron/pentium/core2/xeon, "Merom"/"Conroe" */
+ x86_pmu.quirks = intel_clovertown_quirks;
case 22: /* single-core 65 nm celeron/core2solo "Merom-L"/"Conroe-L" */
case 23: /* current 45 nm celeron/core2/xeon "Penryn"/"Wolfdale" */
case 29: /* six-core 45 nm xeon "Dunnington" */
memcpy(hw_cache_event_ids, core2_hw_cache_event_ids,
sizeof(hw_cache_event_ids));
+ intel_pmu_lbr_init_core();
+
x86_pmu.event_constraints = intel_core2_event_constraints;
pr_cont("Core2 events, ");
break;
@@ -940,13 +958,19 @@ static __init int intel_pmu_init(void)
memcpy(hw_cache_event_ids, nehalem_hw_cache_event_ids,
sizeof(hw_cache_event_ids));
+ intel_pmu_lbr_init_nhm();
+
x86_pmu.event_constraints = intel_nehalem_event_constraints;
- pr_cont("Nehalem/Corei7 events, ");
+ x86_pmu.enable_all = intel_pmu_nhm_enable_all;
+ pr_cont("Nehalem events, ");
break;
+
case 28: /* Atom */
memcpy(hw_cache_event_ids, atom_hw_cache_event_ids,
sizeof(hw_cache_event_ids));
+ intel_pmu_lbr_init_atom();
+
x86_pmu.event_constraints = intel_gen_event_constraints;
pr_cont("Atom events, ");
break;
@@ -956,7 +980,10 @@ static __init int intel_pmu_init(void)
memcpy(hw_cache_event_ids, westmere_hw_cache_event_ids,
sizeof(hw_cache_event_ids));
+ intel_pmu_lbr_init_nhm();
+
x86_pmu.event_constraints = intel_westmere_event_constraints;
+ x86_pmu.enable_all = intel_pmu_nhm_enable_all;
pr_cont("Westmere events, ");
break;
diff --git a/arch/x86/kernel/cpu/perf_event_intel_ds.c b/arch/x86/kernel/cpu/perf_event_intel_ds.c
new file mode 100644
index 000000000000..18018d1311cd
--- /dev/null
+++ b/arch/x86/kernel/cpu/perf_event_intel_ds.c
@@ -0,0 +1,641 @@
+#ifdef CONFIG_CPU_SUP_INTEL
+
+/* The maximal number of PEBS events: */
+#define MAX_PEBS_EVENTS 4
+
+/* The size of a BTS record in bytes: */
+#define BTS_RECORD_SIZE 24
+
+#define BTS_BUFFER_SIZE (PAGE_SIZE << 4)
+#define PEBS_BUFFER_SIZE PAGE_SIZE
+
+/*
+ * pebs_record_32 for p4 and core not supported
+
+struct pebs_record_32 {
+ u32 flags, ip;
+ u32 ax, bc, cx, dx;
+ u32 si, di, bp, sp;
+};
+
+ */
+
+struct pebs_record_core {
+ u64 flags, ip;
+ u64 ax, bx, cx, dx;
+ u64 si, di, bp, sp;
+ u64 r8, r9, r10, r11;
+ u64 r12, r13, r14, r15;
+};
+
+struct pebs_record_nhm {
+ u64 flags, ip;
+ u64 ax, bx, cx, dx;
+ u64 si, di, bp, sp;
+ u64 r8, r9, r10, r11;
+ u64 r12, r13, r14, r15;
+ u64 status, dla, dse, lat;
+};
+
+/*
+ * A debug store configuration.
+ *
+ * We only support architectures that use 64bit fields.
+ */
+struct debug_store {
+ u64 bts_buffer_base;
+ u64 bts_index;
+ u64 bts_absolute_maximum;
+ u64 bts_interrupt_threshold;
+ u64 pebs_buffer_base;
+ u64 pebs_index;
+ u64 pebs_absolute_maximum;
+ u64 pebs_interrupt_threshold;
+ u64 pebs_event_reset[MAX_PEBS_EVENTS];
+};
+
+static void init_debug_store_on_cpu(int cpu)
+{
+ struct debug_store *ds = per_cpu(cpu_hw_events, cpu).ds;
+
+ if (!ds)
+ return;
+
+ wrmsr_on_cpu(cpu, MSR_IA32_DS_AREA,
+ (u32)((u64)(unsigned long)ds),
+ (u32)((u64)(unsigned long)ds >> 32));
+}
+
+static void fini_debug_store_on_cpu(int cpu)
+{
+ if (!per_cpu(cpu_hw_events, cpu).ds)
+ return;
+
+ wrmsr_on_cpu(cpu, MSR_IA32_DS_AREA, 0, 0);
+}
+
+static void release_ds_buffers(void)
+{
+ int cpu;
+
+ if (!x86_pmu.bts && !x86_pmu.pebs)
+ return;
+
+ get_online_cpus();
+
+ for_each_online_cpu(cpu)
+ fini_debug_store_on_cpu(cpu);
+
+ for_each_possible_cpu(cpu) {
+ struct debug_store *ds = per_cpu(cpu_hw_events, cpu).ds;
+
+ if (!ds)
+ continue;
+
+ per_cpu(cpu_hw_events, cpu).ds = NULL;
+
+ kfree((void *)(unsigned long)ds->pebs_buffer_base);
+ kfree((void *)(unsigned long)ds->bts_buffer_base);
+ kfree(ds);
+ }
+
+ put_online_cpus();
+}
+
+static int reserve_ds_buffers(void)
+{
+ int cpu, err = 0;
+
+ if (!x86_pmu.bts && !x86_pmu.pebs)
+ return 0;
+
+ get_online_cpus();
+
+ for_each_possible_cpu(cpu) {
+ struct debug_store *ds;
+ void *buffer;
+ int max, thresh;
+
+ err = -ENOMEM;
+ ds = kzalloc(sizeof(*ds), GFP_KERNEL);
+ if (unlikely(!ds))
+ break;
+ per_cpu(cpu_hw_events, cpu).ds = ds;
+
+ if (x86_pmu.bts) {
+ buffer = kzalloc(BTS_BUFFER_SIZE, GFP_KERNEL);
+ if (unlikely(!buffer))
+ break;
+
+ max = BTS_BUFFER_SIZE / BTS_RECORD_SIZE;
+ thresh = max / 16;
+
+ ds->bts_buffer_base = (u64)(unsigned long)buffer;
+ ds->bts_index = ds->bts_buffer_base;
+ ds->bts_absolute_maximum = ds->bts_buffer_base +
+ max * BTS_RECORD_SIZE;
+ ds->bts_interrupt_threshold = ds->bts_absolute_maximum -
+ thresh * BTS_RECORD_SIZE;
+ }
+
+ if (x86_pmu.pebs) {
+ buffer = kzalloc(PEBS_BUFFER_SIZE, GFP_KERNEL);
+ if (unlikely(!buffer))
+ break;
+
+ max = PEBS_BUFFER_SIZE / x86_pmu.pebs_record_size;
+
+ ds->pebs_buffer_base = (u64)(unsigned long)buffer;
+ ds->pebs_index = ds->pebs_buffer_base;
+ ds->pebs_absolute_maximum = ds->pebs_buffer_base +
+ max * x86_pmu.pebs_record_size;
+ /*
+ * Always use single record PEBS
+ */
+ ds->pebs_interrupt_threshold = ds->pebs_buffer_base +
+ x86_pmu.pebs_record_size;
+ }
+
+ err = 0;
+ }
+
+ if (err)
+ release_ds_buffers();
+ else {
+ for_each_online_cpu(cpu)
+ init_debug_store_on_cpu(cpu);
+ }
+
+ put_online_cpus();
+
+ return err;
+}
+
+/*
+ * BTS
+ */
+
+static struct event_constraint bts_constraint =
+ EVENT_CONSTRAINT(0, 1ULL << X86_PMC_IDX_FIXED_BTS, 0);
+
+static void intel_pmu_enable_bts(u64 config)
+{
+ unsigned long debugctlmsr;
+
+ debugctlmsr = get_debugctlmsr();
+
+ debugctlmsr |= DEBUGCTLMSR_TR;
+ debugctlmsr |= DEBUGCTLMSR_BTS;
+ debugctlmsr |= DEBUGCTLMSR_BTINT;
+
+ if (!(config & ARCH_PERFMON_EVENTSEL_OS))
+ debugctlmsr |= DEBUGCTLMSR_BTS_OFF_OS;
+
+ if (!(config & ARCH_PERFMON_EVENTSEL_USR))
+ debugctlmsr |= DEBUGCTLMSR_BTS_OFF_USR;
+
+ update_debugctlmsr(debugctlmsr);
+}
+
+static void intel_pmu_disable_bts(void)
+{
+ struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
+ unsigned long debugctlmsr;
+
+ if (!cpuc->ds)
+ return;
+
+ debugctlmsr = get_debugctlmsr();
+
+ debugctlmsr &=
+ ~(DEBUGCTLMSR_TR | DEBUGCTLMSR_BTS | DEBUGCTLMSR_BTINT |
+ DEBUGCTLMSR_BTS_OFF_OS | DEBUGCTLMSR_BTS_OFF_USR);
+
+ update_debugctlmsr(debugctlmsr);
+}
+
+static void intel_pmu_drain_bts_buffer(void)
+{
+ struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
+ struct debug_store *ds = cpuc->ds;
+ struct bts_record {
+ u64 from;
+ u64 to;
+ u64 flags;
+ };
+ struct perf_event *event = cpuc->events[X86_PMC_IDX_FIXED_BTS];
+ struct bts_record *at, *top;
+ struct perf_output_handle handle;
+ struct perf_event_header header;
+ struct perf_sample_data data;
+ struct pt_regs regs;
+
+ if (!event)
+ return;
+
+ if (!ds)
+ return;
+
+ at = (struct bts_record *)(unsigned long)ds->bts_buffer_base;
+ top = (struct bts_record *)(unsigned long)ds->bts_index;
+
+ if (top <= at)
+ return;
+
+ ds->bts_index = ds->bts_buffer_base;
+
+ perf_sample_data_init(&data, 0);
+ data.period = event->hw.last_period;
+ regs.ip = 0;
+
+ /*
+ * Prepare a generic sample, i.e. fill in the invariant fields.
+ * We will overwrite the from and to address before we output
+ * the sample.
+ */
+ perf_prepare_sample(&header, &data, event, &regs);
+
+ if (perf_output_begin(&handle, event, header.size * (top - at), 1, 1))
+ return;
+
+ for (; at < top; at++) {
+ data.ip = at->from;
+ data.addr = at->to;
+
+ perf_output_sample(&handle, &header, &data, event);
+ }
+
+ perf_output_end(&handle);
+
+ /* There's new data available. */
+ event->hw.interrupts++;
+ event->pending_kill = POLL_IN;
+}
+
+/*
+ * PEBS
+ */
+
+static struct event_constraint intel_core_pebs_events[] = {
+ PEBS_EVENT_CONSTRAINT(0x00c0, 0x1), /* INSTR_RETIRED.ANY */
+ PEBS_EVENT_CONSTRAINT(0xfec1, 0x1), /* X87_OPS_RETIRED.ANY */
+ PEBS_EVENT_CONSTRAINT(0x00c5, 0x1), /* BR_INST_RETIRED.MISPRED */
+ PEBS_EVENT_CONSTRAINT(0x1fc7, 0x1), /* SIMD_INST_RETURED.ANY */
+ PEBS_EVENT_CONSTRAINT(0x01cb, 0x1), /* MEM_LOAD_RETIRED.L1D_MISS */
+ PEBS_EVENT_CONSTRAINT(0x02cb, 0x1), /* MEM_LOAD_RETIRED.L1D_LINE_MISS */
+ PEBS_EVENT_CONSTRAINT(0x04cb, 0x1), /* MEM_LOAD_RETIRED.L2_MISS */
+ PEBS_EVENT_CONSTRAINT(0x08cb, 0x1), /* MEM_LOAD_RETIRED.L2_LINE_MISS */
+ PEBS_EVENT_CONSTRAINT(0x10cb, 0x1), /* MEM_LOAD_RETIRED.DTLB_MISS */
+ EVENT_CONSTRAINT_END
+};
+
+static struct event_constraint intel_nehalem_pebs_events[] = {
+ PEBS_EVENT_CONSTRAINT(0x00c0, 0xf), /* INSTR_RETIRED.ANY */
+ PEBS_EVENT_CONSTRAINT(0xfec1, 0xf), /* X87_OPS_RETIRED.ANY */
+ PEBS_EVENT_CONSTRAINT(0x00c5, 0xf), /* BR_INST_RETIRED.MISPRED */
+ PEBS_EVENT_CONSTRAINT(0x1fc7, 0xf), /* SIMD_INST_RETURED.ANY */
+ PEBS_EVENT_CONSTRAINT(0x01cb, 0xf), /* MEM_LOAD_RETIRED.L1D_MISS */
+ PEBS_EVENT_CONSTRAINT(0x02cb, 0xf), /* MEM_LOAD_RETIRED.L1D_LINE_MISS */
+ PEBS_EVENT_CONSTRAINT(0x04cb, 0xf), /* MEM_LOAD_RETIRED.L2_MISS */
+ PEBS_EVENT_CONSTRAINT(0x08cb, 0xf), /* MEM_LOAD_RETIRED.L2_LINE_MISS */
+ PEBS_EVENT_CONSTRAINT(0x10cb, 0xf), /* MEM_LOAD_RETIRED.DTLB_MISS */
+ EVENT_CONSTRAINT_END
+};
+
+static struct event_constraint *
+intel_pebs_constraints(struct perf_event *event)
+{
+ struct event_constraint *c;
+
+ if (!event->attr.precise_ip)
+ return NULL;
+
+ if (x86_pmu.pebs_constraints) {
+ for_each_event_constraint(c, x86_pmu.pebs_constraints) {
+ if ((event->hw.config & c->cmask) == c->code)
+ return c;
+ }
+ }
+
+ return &emptyconstraint;
+}
+
+static void intel_pmu_pebs_enable(struct perf_event *event)
+{
+ struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
+ struct hw_perf_event *hwc = &event->hw;
+
+ hwc->config &= ~ARCH_PERFMON_EVENTSEL_INT;
+
+ cpuc->pebs_enabled |= 1ULL << hwc->idx;
+ WARN_ON_ONCE(cpuc->enabled);
+
+ if (x86_pmu.intel_cap.pebs_trap && event->attr.precise_ip > 1)
+ intel_pmu_lbr_enable(event);
+}
+
+static void intel_pmu_pebs_disable(struct perf_event *event)
+{
+ struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
+ struct hw_perf_event *hwc = &event->hw;
+
+ cpuc->pebs_enabled &= ~(1ULL << hwc->idx);
+ if (cpuc->enabled)
+ wrmsrl(MSR_IA32_PEBS_ENABLE, cpuc->pebs_enabled);
+
+ hwc->config |= ARCH_PERFMON_EVENTSEL_INT;
+
+ if (x86_pmu.intel_cap.pebs_trap && event->attr.precise_ip > 1)
+ intel_pmu_lbr_disable(event);
+}
+
+static void intel_pmu_pebs_enable_all(void)
+{
+ struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
+
+ if (cpuc->pebs_enabled)
+ wrmsrl(MSR_IA32_PEBS_ENABLE, cpuc->pebs_enabled);
+}
+
+static void intel_pmu_pebs_disable_all(void)
+{
+ struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
+
+ if (cpuc->pebs_enabled)
+ wrmsrl(MSR_IA32_PEBS_ENABLE, 0);
+}
+
+#include <asm/insn.h>
+
+static inline bool kernel_ip(unsigned long ip)
+{
+#ifdef CONFIG_X86_32
+ return ip > PAGE_OFFSET;
+#else
+ return (long)ip < 0;
+#endif
+}
+
+static int intel_pmu_pebs_fixup_ip(struct pt_regs *regs)
+{
+ struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
+ unsigned long from = cpuc->lbr_entries[0].from;
+ unsigned long old_to, to = cpuc->lbr_entries[0].to;
+ unsigned long ip = regs->ip;
+
+ /*
+ * We don't need to fixup if the PEBS assist is fault like
+ */
+ if (!x86_pmu.intel_cap.pebs_trap)
+ return 1;
+
+ /*
+ * No LBR entry, no basic block, no rewinding
+ */
+ if (!cpuc->lbr_stack.nr || !from || !to)
+ return 0;
+
+ /*
+ * Basic blocks should never cross user/kernel boundaries
+ */
+ if (kernel_ip(ip) != kernel_ip(to))
+ return 0;
+
+ /*
+ * unsigned math, either ip is before the start (impossible) or
+ * the basic block is larger than 1 page (sanity)
+ */
+ if ((ip - to) > PAGE_SIZE)
+ return 0;
+
+ /*
+ * We sampled a branch insn, rewind using the LBR stack
+ */
+ if (ip == to) {
+ regs->ip = from;
+ return 1;
+ }
+
+ do {
+ struct insn insn;
+ u8 buf[MAX_INSN_SIZE];
+ void *kaddr;
+
+ old_to = to;
+ if (!kernel_ip(ip)) {
+ int bytes, size = MAX_INSN_SIZE;
+
+ bytes = copy_from_user_nmi(buf, (void __user *)to, size);
+ if (bytes != size)
+ return 0;
+
+ kaddr = buf;
+ } else
+ kaddr = (void *)to;
+
+ kernel_insn_init(&insn, kaddr);
+ insn_get_length(&insn);
+ to += insn.length;
+ } while (to < ip);
+
+ if (to == ip) {
+ regs->ip = old_to;
+ return 1;
+ }
+
+ /*
+ * Even though we decoded the basic block, the instruction stream
+ * never matched the given IP, either the TO or the IP got corrupted.
+ */
+ return 0;
+}
+
+static int intel_pmu_save_and_restart(struct perf_event *event);
+
+static void __intel_pmu_pebs_event(struct perf_event *event,
+ struct pt_regs *iregs, void *__pebs)
+{
+ /*
+ * We cast to pebs_record_core since that is a subset of
+ * both formats and we don't use the other fields in this
+ * routine.
+ */
+ struct pebs_record_core *pebs = __pebs;
+ struct perf_sample_data data;
+ struct pt_regs regs;
+
+ if (!intel_pmu_save_and_restart(event))
+ return;
+
+ perf_sample_data_init(&data, 0);
+ data.period = event->hw.last_period;
+
+ /*
+ * We use the interrupt regs as a base because the PEBS record
+ * does not contain a full regs set, specifically it seems to
+ * lack segment descriptors, which get used by things like
+ * user_mode().
+ *
+ * In the simple case fix up only the IP and BP,SP regs, for
+ * PERF_SAMPLE_IP and PERF_SAMPLE_CALLCHAIN to function properly.
+ * A possible PERF_SAMPLE_REGS will have to transfer all regs.
+ */
+ regs = *iregs;
+ regs.ip = pebs->ip;
+ regs.bp = pebs->bp;
+ regs.sp = pebs->sp;
+
+ if (event->attr.precise_ip > 1 && intel_pmu_pebs_fixup_ip(&regs))
+ regs.flags |= PERF_EFLAGS_EXACT;
+ else
+ regs.flags &= ~PERF_EFLAGS_EXACT;
+
+ if (perf_event_overflow(event, 1, &data, &regs))
+ x86_pmu_stop(event);
+}
+
+static void intel_pmu_drain_pebs_core(struct pt_regs *iregs)
+{
+ struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
+ struct debug_store *ds = cpuc->ds;
+ struct perf_event *event = cpuc->events[0]; /* PMC0 only */
+ struct pebs_record_core *at, *top;
+ int n;
+
+ if (!ds || !x86_pmu.pebs)
+ return;
+
+ at = (struct pebs_record_core *)(unsigned long)ds->pebs_buffer_base;
+ top = (struct pebs_record_core *)(unsigned long)ds->pebs_index;
+
+ /*
+ * Whatever else happens, drain the thing
+ */
+ ds->pebs_index = ds->pebs_buffer_base;
+
+ if (!test_bit(0, cpuc->active_mask))
+ return;
+
+ WARN_ON_ONCE(!event);
+
+ if (!event->attr.precise_ip)
+ return;
+
+ n = top - at;
+ if (n <= 0)
+ return;
+
+ /*
+ * Should not happen, we program the threshold at 1 and do not
+ * set a reset value.
+ */
+ WARN_ON_ONCE(n > 1);
+ at += n - 1;
+
+ __intel_pmu_pebs_event(event, iregs, at);
+}
+
+static void intel_pmu_drain_pebs_nhm(struct pt_regs *iregs)
+{
+ struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
+ struct debug_store *ds = cpuc->ds;
+ struct pebs_record_nhm *at, *top;
+ struct perf_event *event = NULL;
+ u64 status = 0;
+ int bit, n;
+
+ if (!ds || !x86_pmu.pebs)
+ return;
+
+ at = (struct pebs_record_nhm *)(unsigned long)ds->pebs_buffer_base;
+ top = (struct pebs_record_nhm *)(unsigned long)ds->pebs_index;
+
+ ds->pebs_index = ds->pebs_buffer_base;
+
+ n = top - at;
+ if (n <= 0)
+ return;
+
+ /*
+ * Should not happen, we program the threshold at 1 and do not
+ * set a reset value.
+ */
+ WARN_ON_ONCE(n > MAX_PEBS_EVENTS);
+
+ for ( ; at < top; at++) {
+ for_each_set_bit(bit, (unsigned long *)&at->status, MAX_PEBS_EVENTS) {
+ event = cpuc->events[bit];
+ if (!test_bit(bit, cpuc->active_mask))
+ continue;
+
+ WARN_ON_ONCE(!event);
+
+ if (!event->attr.precise_ip)
+ continue;
+
+ if (__test_and_set_bit(bit, (unsigned long *)&status))
+ continue;
+
+ break;
+ }
+
+ if (!event || bit >= MAX_PEBS_EVENTS)
+ continue;
+
+ __intel_pmu_pebs_event(event, iregs, at);
+ }
+}
+
+/*
+ * BTS, PEBS probe and setup
+ */
+
+static void intel_ds_init(void)
+{
+ /*
+ * No support for 32bit formats
+ */
+ if (!boot_cpu_has(X86_FEATURE_DTES64))
+ return;
+
+ x86_pmu.bts = boot_cpu_has(X86_FEATURE_BTS);
+ x86_pmu.pebs = boot_cpu_has(X86_FEATURE_PEBS);
+ if (x86_pmu.pebs) {
+ char pebs_type = x86_pmu.intel_cap.pebs_trap ? '+' : '-';
+ int format = x86_pmu.intel_cap.pebs_format;
+
+ switch (format) {
+ case 0:
+ printk(KERN_CONT "PEBS fmt0%c, ", pebs_type);
+ x86_pmu.pebs_record_size = sizeof(struct pebs_record_core);
+ x86_pmu.drain_pebs = intel_pmu_drain_pebs_core;
+ x86_pmu.pebs_constraints = intel_core_pebs_events;
+ break;
+
+ case 1:
+ printk(KERN_CONT "PEBS fmt1%c, ", pebs_type);
+ x86_pmu.pebs_record_size = sizeof(struct pebs_record_nhm);
+ x86_pmu.drain_pebs = intel_pmu_drain_pebs_nhm;
+ x86_pmu.pebs_constraints = intel_nehalem_pebs_events;
+ break;
+
+ default:
+ printk(KERN_CONT "no PEBS fmt%d%c, ", format, pebs_type);
+ x86_pmu.pebs = 0;
+ break;
+ }
+ }
+}
+
+#else /* CONFIG_CPU_SUP_INTEL */
+
+static int reserve_ds_buffers(void)
+{
+ return 0;
+}
+
+static void release_ds_buffers(void)
+{
+}
+
+#endif /* CONFIG_CPU_SUP_INTEL */
diff --git a/arch/x86/kernel/cpu/perf_event_intel_lbr.c b/arch/x86/kernel/cpu/perf_event_intel_lbr.c
new file mode 100644
index 000000000000..d202c1bece1a
--- /dev/null
+++ b/arch/x86/kernel/cpu/perf_event_intel_lbr.c
@@ -0,0 +1,218 @@
+#ifdef CONFIG_CPU_SUP_INTEL
+
+enum {
+ LBR_FORMAT_32 = 0x00,
+ LBR_FORMAT_LIP = 0x01,
+ LBR_FORMAT_EIP = 0x02,
+ LBR_FORMAT_EIP_FLAGS = 0x03,
+};
+
+/*
+ * We only support LBR implementations that have FREEZE_LBRS_ON_PMI
+ * otherwise it becomes near impossible to get a reliable stack.
+ */
+
+static void __intel_pmu_lbr_enable(void)
+{
+ u64 debugctl;
+
+ rdmsrl(MSR_IA32_DEBUGCTLMSR, debugctl);
+ debugctl |= (DEBUGCTLMSR_LBR | DEBUGCTLMSR_FREEZE_LBRS_ON_PMI);
+ wrmsrl(MSR_IA32_DEBUGCTLMSR, debugctl);
+}
+
+static void __intel_pmu_lbr_disable(void)
+{
+ u64 debugctl;
+
+ rdmsrl(MSR_IA32_DEBUGCTLMSR, debugctl);
+ debugctl &= ~(DEBUGCTLMSR_LBR | DEBUGCTLMSR_FREEZE_LBRS_ON_PMI);
+ wrmsrl(MSR_IA32_DEBUGCTLMSR, debugctl);
+}
+
+static void intel_pmu_lbr_reset_32(void)
+{
+ int i;
+
+ for (i = 0; i < x86_pmu.lbr_nr; i++)
+ wrmsrl(x86_pmu.lbr_from + i, 0);
+}
+
+static void intel_pmu_lbr_reset_64(void)
+{
+ int i;
+
+ for (i = 0; i < x86_pmu.lbr_nr; i++) {
+ wrmsrl(x86_pmu.lbr_from + i, 0);
+ wrmsrl(x86_pmu.lbr_to + i, 0);
+ }
+}
+
+static void intel_pmu_lbr_reset(void)
+{
+ if (!x86_pmu.lbr_nr)
+ return;
+
+ if (x86_pmu.intel_cap.lbr_format == LBR_FORMAT_32)
+ intel_pmu_lbr_reset_32();
+ else
+ intel_pmu_lbr_reset_64();
+}
+
+static void intel_pmu_lbr_enable(struct perf_event *event)
+{
+ struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
+
+ if (!x86_pmu.lbr_nr)
+ return;
+
+ WARN_ON_ONCE(cpuc->enabled);
+
+ /*
+ * Reset the LBR stack if we changed task context to
+ * avoid data leaks.
+ */
+
+ if (event->ctx->task && cpuc->lbr_context != event->ctx) {
+ intel_pmu_lbr_reset();
+ cpuc->lbr_context = event->ctx;
+ }
+
+ cpuc->lbr_users++;
+}
+
+static void intel_pmu_lbr_disable(struct perf_event *event)
+{
+ struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
+
+ if (!x86_pmu.lbr_nr)
+ return;
+
+ cpuc->lbr_users--;
+ WARN_ON_ONCE(cpuc->lbr_users < 0);
+
+ if (cpuc->enabled && !cpuc->lbr_users)
+ __intel_pmu_lbr_disable();
+}
+
+static void intel_pmu_lbr_enable_all(void)
+{
+ struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
+
+ if (cpuc->lbr_users)
+ __intel_pmu_lbr_enable();
+}
+
+static void intel_pmu_lbr_disable_all(void)
+{
+ struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
+
+ if (cpuc->lbr_users)
+ __intel_pmu_lbr_disable();
+}
+
+static inline u64 intel_pmu_lbr_tos(void)
+{
+ u64 tos;
+
+ rdmsrl(x86_pmu.lbr_tos, tos);
+
+ return tos;
+}
+
+static void intel_pmu_lbr_read_32(struct cpu_hw_events *cpuc)
+{
+ unsigned long mask = x86_pmu.lbr_nr - 1;
+ u64 tos = intel_pmu_lbr_tos();
+ int i;
+
+ for (i = 0; i < x86_pmu.lbr_nr; i++) {
+ unsigned long lbr_idx = (tos - i) & mask;
+ union {
+ struct {
+ u32 from;
+ u32 to;
+ };
+ u64 lbr;
+ } msr_lastbranch;
+
+ rdmsrl(x86_pmu.lbr_from + lbr_idx, msr_lastbranch.lbr);
+
+ cpuc->lbr_entries[i].from = msr_lastbranch.from;
+ cpuc->lbr_entries[i].to = msr_lastbranch.to;
+ cpuc->lbr_entries[i].flags = 0;
+ }
+ cpuc->lbr_stack.nr = i;
+}
+
+#define LBR_FROM_FLAG_MISPRED (1ULL << 63)
+
+/*
+ * Due to lack of segmentation in Linux the effective address (offset)
+ * is the same as the linear address, allowing us to merge the LIP and EIP
+ * LBR formats.
+ */
+static void intel_pmu_lbr_read_64(struct cpu_hw_events *cpuc)
+{
+ unsigned long mask = x86_pmu.lbr_nr - 1;
+ int lbr_format = x86_pmu.intel_cap.lbr_format;
+ u64 tos = intel_pmu_lbr_tos();
+ int i;
+
+ for (i = 0; i < x86_pmu.lbr_nr; i++) {
+ unsigned long lbr_idx = (tos - i) & mask;
+ u64 from, to, flags = 0;
+
+ rdmsrl(x86_pmu.lbr_from + lbr_idx, from);
+ rdmsrl(x86_pmu.lbr_to + lbr_idx, to);
+
+ if (lbr_format == LBR_FORMAT_EIP_FLAGS) {
+ flags = !!(from & LBR_FROM_FLAG_MISPRED);
+ from = (u64)((((s64)from) << 1) >> 1);
+ }
+
+ cpuc->lbr_entries[i].from = from;
+ cpuc->lbr_entries[i].to = to;
+ cpuc->lbr_entries[i].flags = flags;
+ }
+ cpuc->lbr_stack.nr = i;
+}
+
+static void intel_pmu_lbr_read(void)
+{
+ struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
+
+ if (!cpuc->lbr_users)
+ return;
+
+ if (x86_pmu.intel_cap.lbr_format == LBR_FORMAT_32)
+ intel_pmu_lbr_read_32(cpuc);
+ else
+ intel_pmu_lbr_read_64(cpuc);
+}
+
+static void intel_pmu_lbr_init_core(void)
+{
+ x86_pmu.lbr_nr = 4;
+ x86_pmu.lbr_tos = 0x01c9;
+ x86_pmu.lbr_from = 0x40;
+ x86_pmu.lbr_to = 0x60;
+}
+
+static void intel_pmu_lbr_init_nhm(void)
+{
+ x86_pmu.lbr_nr = 16;
+ x86_pmu.lbr_tos = 0x01c9;
+ x86_pmu.lbr_from = 0x680;
+ x86_pmu.lbr_to = 0x6c0;
+}
+
+static void intel_pmu_lbr_init_atom(void)
+{
+ x86_pmu.lbr_nr = 8;
+ x86_pmu.lbr_tos = 0x01c9;
+ x86_pmu.lbr_from = 0x40;
+ x86_pmu.lbr_to = 0x60;
+}
+
+#endif /* CONFIG_CPU_SUP_INTEL */
diff --git a/arch/x86/kernel/cpu/perf_event_p4.c b/arch/x86/kernel/cpu/perf_event_p4.c
new file mode 100644
index 000000000000..424fc8de68e4
--- /dev/null
+++ b/arch/x86/kernel/cpu/perf_event_p4.c
@@ -0,0 +1,857 @@
+/*
+ * Netburst Perfomance Events (P4, old Xeon)
+ *
+ * Copyright (C) 2010 Parallels, Inc., Cyrill Gorcunov <gorcunov@openvz.org>
+ * Copyright (C) 2010 Intel Corporation, Lin Ming <ming.m.lin@intel.com>
+ *
+ * For licencing details see kernel-base/COPYING
+ */
+
+#ifdef CONFIG_CPU_SUP_INTEL
+
+#include <asm/perf_event_p4.h>
+
+#define P4_CNTR_LIMIT 3
+/*
+ * array indices: 0,1 - HT threads, used with HT enabled cpu
+ */
+struct p4_event_bind {
+ unsigned int opcode; /* Event code and ESCR selector */
+ unsigned int escr_msr[2]; /* ESCR MSR for this event */
+ char cntr[2][P4_CNTR_LIMIT]; /* counter index (offset), -1 on abscence */
+};
+
+struct p4_cache_event_bind {
+ unsigned int metric_pebs;
+ unsigned int metric_vert;
+};
+
+#define P4_GEN_CACHE_EVENT_BIND(name) \
+ [P4_CACHE__##name] = { \
+ .metric_pebs = P4_PEBS__##name, \
+ .metric_vert = P4_VERT__##name, \
+ }
+
+static struct p4_cache_event_bind p4_cache_event_bind_map[] = {
+ P4_GEN_CACHE_EVENT_BIND(1stl_cache_load_miss_retired),
+ P4_GEN_CACHE_EVENT_BIND(2ndl_cache_load_miss_retired),
+ P4_GEN_CACHE_EVENT_BIND(dtlb_load_miss_retired),
+ P4_GEN_CACHE_EVENT_BIND(dtlb_store_miss_retired),
+};
+
+/*
+ * Note that we don't use CCCR1 here, there is an
+ * exception for P4_BSQ_ALLOCATION but we just have
+ * no workaround
+ *
+ * consider this binding as resources which particular
+ * event may borrow, it doesn't contain EventMask,
+ * Tags and friends -- they are left to a caller
+ */
+static struct p4_event_bind p4_event_bind_map[] = {
+ [P4_EVENT_TC_DELIVER_MODE] = {
+ .opcode = P4_OPCODE(P4_EVENT_TC_DELIVER_MODE),
+ .escr_msr = { MSR_P4_TC_ESCR0, MSR_P4_TC_ESCR1 },
+ .cntr = { {4, 5, -1}, {6, 7, -1} },
+ },
+ [P4_EVENT_BPU_FETCH_REQUEST] = {
+ .opcode = P4_OPCODE(P4_EVENT_BPU_FETCH_REQUEST),
+ .escr_msr = { MSR_P4_BPU_ESCR0, MSR_P4_BPU_ESCR1 },
+ .cntr = { {0, -1, -1}, {2, -1, -1} },
+ },
+ [P4_EVENT_ITLB_REFERENCE] = {
+ .opcode = P4_OPCODE(P4_EVENT_ITLB_REFERENCE),
+ .escr_msr = { MSR_P4_ITLB_ESCR0, MSR_P4_ITLB_ESCR1 },
+ .cntr = { {0, -1, -1}, {2, -1, -1} },
+ },
+ [P4_EVENT_MEMORY_CANCEL] = {
+ .opcode = P4_OPCODE(P4_EVENT_MEMORY_CANCEL),
+ .escr_msr = { MSR_P4_DAC_ESCR0, MSR_P4_DAC_ESCR1 },
+ .cntr = { {8, 9, -1}, {10, 11, -1} },
+ },
+ [P4_EVENT_MEMORY_COMPLETE] = {
+ .opcode = P4_OPCODE(P4_EVENT_MEMORY_COMPLETE),
+ .escr_msr = { MSR_P4_SAAT_ESCR0 , MSR_P4_SAAT_ESCR1 },
+ .cntr = { {8, 9, -1}, {10, 11, -1} },
+ },
+ [P4_EVENT_LOAD_PORT_REPLAY] = {
+ .opcode = P4_OPCODE(P4_EVENT_LOAD_PORT_REPLAY),
+ .escr_msr = { MSR_P4_SAAT_ESCR0, MSR_P4_SAAT_ESCR1 },
+ .cntr = { {8, 9, -1}, {10, 11, -1} },
+ },
+ [P4_EVENT_STORE_PORT_REPLAY] = {
+ .opcode = P4_OPCODE(P4_EVENT_STORE_PORT_REPLAY),
+ .escr_msr = { MSR_P4_SAAT_ESCR0 , MSR_P4_SAAT_ESCR1 },
+ .cntr = { {8, 9, -1}, {10, 11, -1} },
+ },
+ [P4_EVENT_MOB_LOAD_REPLAY] = {
+ .opcode = P4_OPCODE(P4_EVENT_MOB_LOAD_REPLAY),
+ .escr_msr = { MSR_P4_MOB_ESCR0, MSR_P4_MOB_ESCR1 },
+ .cntr = { {0, -1, -1}, {2, -1, -1} },
+ },
+ [P4_EVENT_PAGE_WALK_TYPE] = {
+ .opcode = P4_OPCODE(P4_EVENT_PAGE_WALK_TYPE),
+ .escr_msr = { MSR_P4_PMH_ESCR0, MSR_P4_PMH_ESCR1 },
+ .cntr = { {0, -1, -1}, {2, -1, -1} },
+ },
+ [P4_EVENT_BSQ_CACHE_REFERENCE] = {
+ .opcode = P4_OPCODE(P4_EVENT_BSQ_CACHE_REFERENCE),
+ .escr_msr = { MSR_P4_BSU_ESCR0, MSR_P4_BSU_ESCR1 },
+ .cntr = { {0, -1, -1}, {2, -1, -1} },
+ },
+ [P4_EVENT_IOQ_ALLOCATION] = {
+ .opcode = P4_OPCODE(P4_EVENT_IOQ_ALLOCATION),
+ .escr_msr = { MSR_P4_FSB_ESCR0, MSR_P4_FSB_ESCR1 },
+ .cntr = { {0, -1, -1}, {2, -1, -1} },
+ },
+ [P4_EVENT_IOQ_ACTIVE_ENTRIES] = { /* shared ESCR */
+ .opcode = P4_OPCODE(P4_EVENT_IOQ_ACTIVE_ENTRIES),
+ .escr_msr = { MSR_P4_FSB_ESCR1, MSR_P4_FSB_ESCR1 },
+ .cntr = { {2, -1, -1}, {3, -1, -1} },
+ },
+ [P4_EVENT_FSB_DATA_ACTIVITY] = {
+ .opcode = P4_OPCODE(P4_EVENT_FSB_DATA_ACTIVITY),
+ .escr_msr = { MSR_P4_FSB_ESCR0, MSR_P4_FSB_ESCR1 },
+ .cntr = { {0, -1, -1}, {2, -1, -1} },
+ },
+ [P4_EVENT_BSQ_ALLOCATION] = { /* shared ESCR, broken CCCR1 */
+ .opcode = P4_OPCODE(P4_EVENT_BSQ_ALLOCATION),
+ .escr_msr = { MSR_P4_BSU_ESCR0, MSR_P4_BSU_ESCR0 },
+ .cntr = { {0, -1, -1}, {1, -1, -1} },
+ },
+ [P4_EVENT_BSQ_ACTIVE_ENTRIES] = { /* shared ESCR */
+ .opcode = P4_OPCODE(P4_EVENT_BSQ_ACTIVE_ENTRIES),
+ .escr_msr = { MSR_P4_BSU_ESCR1 , MSR_P4_BSU_ESCR1 },
+ .cntr = { {2, -1, -1}, {3, -1, -1} },
+ },
+ [P4_EVENT_SSE_INPUT_ASSIST] = {
+ .opcode = P4_OPCODE(P4_EVENT_SSE_INPUT_ASSIST),
+ .escr_msr = { MSR_P4_FIRM_ESCR0, MSR_P4_FIRM_ESCR1 },
+ .cntr = { {8, 9, -1}, {10, 11, -1} },
+ },
+ [P4_EVENT_PACKED_SP_UOP] = {
+ .opcode = P4_OPCODE(P4_EVENT_PACKED_SP_UOP),
+ .escr_msr = { MSR_P4_FIRM_ESCR0, MSR_P4_FIRM_ESCR1 },
+ .cntr = { {8, 9, -1}, {10, 11, -1} },
+ },
+ [P4_EVENT_PACKED_DP_UOP] = {
+ .opcode = P4_OPCODE(P4_EVENT_PACKED_DP_UOP),
+ .escr_msr = { MSR_P4_FIRM_ESCR0, MSR_P4_FIRM_ESCR1 },
+ .cntr = { {8, 9, -1}, {10, 11, -1} },
+ },
+ [P4_EVENT_SCALAR_SP_UOP] = {
+ .opcode = P4_OPCODE(P4_EVENT_SCALAR_SP_UOP),
+ .escr_msr = { MSR_P4_FIRM_ESCR0, MSR_P4_FIRM_ESCR1 },
+ .cntr = { {8, 9, -1}, {10, 11, -1} },
+ },
+ [P4_EVENT_SCALAR_DP_UOP] = {
+ .opcode = P4_OPCODE(P4_EVENT_SCALAR_DP_UOP),
+ .escr_msr = { MSR_P4_FIRM_ESCR0, MSR_P4_FIRM_ESCR1 },
+ .cntr = { {8, 9, -1}, {10, 11, -1} },
+ },
+ [P4_EVENT_64BIT_MMX_UOP] = {
+ .opcode = P4_OPCODE(P4_EVENT_64BIT_MMX_UOP),
+ .escr_msr = { MSR_P4_FIRM_ESCR0, MSR_P4_FIRM_ESCR1 },
+ .cntr = { {8, 9, -1}, {10, 11, -1} },
+ },
+ [P4_EVENT_128BIT_MMX_UOP] = {
+ .opcode = P4_OPCODE(P4_EVENT_128BIT_MMX_UOP),
+ .escr_msr = { MSR_P4_FIRM_ESCR0, MSR_P4_FIRM_ESCR1 },
+ .cntr = { {8, 9, -1}, {10, 11, -1} },
+ },
+ [P4_EVENT_X87_FP_UOP] = {
+ .opcode = P4_OPCODE(P4_EVENT_X87_FP_UOP),
+ .escr_msr = { MSR_P4_FIRM_ESCR0, MSR_P4_FIRM_ESCR1 },
+ .cntr = { {8, 9, -1}, {10, 11, -1} },
+ },
+ [P4_EVENT_TC_MISC] = {
+ .opcode = P4_OPCODE(P4_EVENT_TC_MISC),
+ .escr_msr = { MSR_P4_TC_ESCR0, MSR_P4_TC_ESCR1 },
+ .cntr = { {4, 5, -1}, {6, 7, -1} },
+ },
+ [P4_EVENT_GLOBAL_POWER_EVENTS] = {
+ .opcode = P4_OPCODE(P4_EVENT_GLOBAL_POWER_EVENTS),
+ .escr_msr = { MSR_P4_FSB_ESCR0, MSR_P4_FSB_ESCR1 },
+ .cntr = { {0, -1, -1}, {2, -1, -1} },
+ },
+ [P4_EVENT_TC_MS_XFER] = {
+ .opcode = P4_OPCODE(P4_EVENT_TC_MS_XFER),
+ .escr_msr = { MSR_P4_MS_ESCR0, MSR_P4_MS_ESCR1 },
+ .cntr = { {4, 5, -1}, {6, 7, -1} },
+ },
+ [P4_EVENT_UOP_QUEUE_WRITES] = {
+ .opcode = P4_OPCODE(P4_EVENT_UOP_QUEUE_WRITES),
+ .escr_msr = { MSR_P4_MS_ESCR0, MSR_P4_MS_ESCR1 },
+ .cntr = { {4, 5, -1}, {6, 7, -1} },
+ },
+ [P4_EVENT_RETIRED_MISPRED_BRANCH_TYPE] = {
+ .opcode = P4_OPCODE(P4_EVENT_RETIRED_MISPRED_BRANCH_TYPE),
+ .escr_msr = { MSR_P4_TBPU_ESCR0 , MSR_P4_TBPU_ESCR0 },
+ .cntr = { {4, 5, -1}, {6, 7, -1} },
+ },
+ [P4_EVENT_RETIRED_BRANCH_TYPE] = {
+ .opcode = P4_OPCODE(P4_EVENT_RETIRED_BRANCH_TYPE),
+ .escr_msr = { MSR_P4_TBPU_ESCR0 , MSR_P4_TBPU_ESCR1 },
+ .cntr = { {4, 5, -1}, {6, 7, -1} },
+ },
+ [P4_EVENT_RESOURCE_STALL] = {
+ .opcode = P4_OPCODE(P4_EVENT_RESOURCE_STALL),
+ .escr_msr = { MSR_P4_ALF_ESCR0, MSR_P4_ALF_ESCR1 },
+ .cntr = { {12, 13, 16}, {14, 15, 17} },
+ },
+ [P4_EVENT_WC_BUFFER] = {
+ .opcode = P4_OPCODE(P4_EVENT_WC_BUFFER),
+ .escr_msr = { MSR_P4_DAC_ESCR0, MSR_P4_DAC_ESCR1 },
+ .cntr = { {8, 9, -1}, {10, 11, -1} },
+ },
+ [P4_EVENT_B2B_CYCLES] = {
+ .opcode = P4_OPCODE(P4_EVENT_B2B_CYCLES),
+ .escr_msr = { MSR_P4_FSB_ESCR0, MSR_P4_FSB_ESCR1 },
+ .cntr = { {0, -1, -1}, {2, -1, -1} },
+ },
+ [P4_EVENT_BNR] = {
+ .opcode = P4_OPCODE(P4_EVENT_BNR),
+ .escr_msr = { MSR_P4_FSB_ESCR0, MSR_P4_FSB_ESCR1 },
+ .cntr = { {0, -1, -1}, {2, -1, -1} },
+ },
+ [P4_EVENT_SNOOP] = {
+ .opcode = P4_OPCODE(P4_EVENT_SNOOP),
+ .escr_msr = { MSR_P4_FSB_ESCR0, MSR_P4_FSB_ESCR1 },
+ .cntr = { {0, -1, -1}, {2, -1, -1} },
+ },
+ [P4_EVENT_RESPONSE] = {
+ .opcode = P4_OPCODE(P4_EVENT_RESPONSE),
+ .escr_msr = { MSR_P4_FSB_ESCR0, MSR_P4_FSB_ESCR1 },
+ .cntr = { {0, -1, -1}, {2, -1, -1} },
+ },
+ [P4_EVENT_FRONT_END_EVENT] = {
+ .opcode = P4_OPCODE(P4_EVENT_FRONT_END_EVENT),
+ .escr_msr = { MSR_P4_CRU_ESCR2, MSR_P4_CRU_ESCR3 },
+ .cntr = { {12, 13, 16}, {14, 15, 17} },
+ },
+ [P4_EVENT_EXECUTION_EVENT] = {
+ .opcode = P4_OPCODE(P4_EVENT_EXECUTION_EVENT),
+ .escr_msr = { MSR_P4_CRU_ESCR2, MSR_P4_CRU_ESCR3 },
+ .cntr = { {12, 13, 16}, {14, 15, 17} },
+ },
+ [P4_EVENT_REPLAY_EVENT] = {
+ .opcode = P4_OPCODE(P4_EVENT_REPLAY_EVENT),
+ .escr_msr = { MSR_P4_CRU_ESCR2, MSR_P4_CRU_ESCR3 },
+ .cntr = { {12, 13, 16}, {14, 15, 17} },
+ },
+ [P4_EVENT_INSTR_RETIRED] = {
+ .opcode = P4_OPCODE(P4_EVENT_INSTR_RETIRED),
+ .escr_msr = { MSR_P4_CRU_ESCR0, MSR_P4_CRU_ESCR1 },
+ .cntr = { {12, 13, 16}, {14, 15, 17} },
+ },
+ [P4_EVENT_UOPS_RETIRED] = {
+ .opcode = P4_OPCODE(P4_EVENT_UOPS_RETIRED),
+ .escr_msr = { MSR_P4_CRU_ESCR0, MSR_P4_CRU_ESCR1 },
+ .cntr = { {12, 13, 16}, {14, 15, 17} },
+ },
+ [P4_EVENT_UOP_TYPE] = {
+ .opcode = P4_OPCODE(P4_EVENT_UOP_TYPE),
+ .escr_msr = { MSR_P4_RAT_ESCR0, MSR_P4_RAT_ESCR1 },
+ .cntr = { {12, 13, 16}, {14, 15, 17} },
+ },
+ [P4_EVENT_BRANCH_RETIRED] = {
+ .opcode = P4_OPCODE(P4_EVENT_BRANCH_RETIRED),
+ .escr_msr = { MSR_P4_CRU_ESCR2, MSR_P4_CRU_ESCR3 },
+ .cntr = { {12, 13, 16}, {14, 15, 17} },
+ },
+ [P4_EVENT_MISPRED_BRANCH_RETIRED] = {
+ .opcode = P4_OPCODE(P4_EVENT_MISPRED_BRANCH_RETIRED),
+ .escr_msr = { MSR_P4_CRU_ESCR0, MSR_P4_CRU_ESCR1 },
+ .cntr = { {12, 13, 16}, {14, 15, 17} },
+ },
+ [P4_EVENT_X87_ASSIST] = {
+ .opcode = P4_OPCODE(P4_EVENT_X87_ASSIST),
+ .escr_msr = { MSR_P4_CRU_ESCR2, MSR_P4_CRU_ESCR3 },
+ .cntr = { {12, 13, 16}, {14, 15, 17} },
+ },
+ [P4_EVENT_MACHINE_CLEAR] = {
+ .opcode = P4_OPCODE(P4_EVENT_MACHINE_CLEAR),
+ .escr_msr = { MSR_P4_CRU_ESCR2, MSR_P4_CRU_ESCR3 },
+ .cntr = { {12, 13, 16}, {14, 15, 17} },
+ },
+ [P4_EVENT_INSTR_COMPLETED] = {
+ .opcode = P4_OPCODE(P4_EVENT_INSTR_COMPLETED),
+ .escr_msr = { MSR_P4_CRU_ESCR0, MSR_P4_CRU_ESCR1 },
+ .cntr = { {12, 13, 16}, {14, 15, 17} },
+ },
+};
+
+#define P4_GEN_CACHE_EVENT(event, bit, cache_event) \
+ p4_config_pack_escr(P4_ESCR_EVENT(event) | \
+ P4_ESCR_EMASK_BIT(event, bit)) | \
+ p4_config_pack_cccr(cache_event | \
+ P4_CCCR_ESEL(P4_OPCODE_ESEL(P4_OPCODE(event))))
+
+static __initconst const u64 p4_hw_cache_event_ids
+ [PERF_COUNT_HW_CACHE_MAX]
+ [PERF_COUNT_HW_CACHE_OP_MAX]
+ [PERF_COUNT_HW_CACHE_RESULT_MAX] =
+{
+ [ C(L1D ) ] = {
+ [ C(OP_READ) ] = {
+ [ C(RESULT_ACCESS) ] = 0x0,
+ [ C(RESULT_MISS) ] = P4_GEN_CACHE_EVENT(P4_EVENT_REPLAY_EVENT, NBOGUS,
+ P4_CACHE__1stl_cache_load_miss_retired),
+ },
+ },
+ [ C(LL ) ] = {
+ [ C(OP_READ) ] = {
+ [ C(RESULT_ACCESS) ] = 0x0,
+ [ C(RESULT_MISS) ] = P4_GEN_CACHE_EVENT(P4_EVENT_REPLAY_EVENT, NBOGUS,
+ P4_CACHE__2ndl_cache_load_miss_retired),
+ },
+},
+ [ C(DTLB) ] = {
+ [ C(OP_READ) ] = {
+ [ C(RESULT_ACCESS) ] = 0x0,
+ [ C(RESULT_MISS) ] = P4_GEN_CACHE_EVENT(P4_EVENT_REPLAY_EVENT, NBOGUS,
+ P4_CACHE__dtlb_load_miss_retired),
+ },
+ [ C(OP_WRITE) ] = {
+ [ C(RESULT_ACCESS) ] = 0x0,
+ [ C(RESULT_MISS) ] = P4_GEN_CACHE_EVENT(P4_EVENT_REPLAY_EVENT, NBOGUS,
+ P4_CACHE__dtlb_store_miss_retired),
+ },
+ },
+ [ C(ITLB) ] = {
+ [ C(OP_READ) ] = {
+ [ C(RESULT_ACCESS) ] = P4_GEN_CACHE_EVENT(P4_EVENT_ITLB_REFERENCE, HIT,
+ P4_CACHE__itlb_reference_hit),
+ [ C(RESULT_MISS) ] = P4_GEN_CACHE_EVENT(P4_EVENT_ITLB_REFERENCE, MISS,
+ P4_CACHE__itlb_reference_miss),
+ },
+ [ C(OP_WRITE) ] = {
+ [ C(RESULT_ACCESS) ] = -1,
+ [ C(RESULT_MISS) ] = -1,
+ },
+ [ C(OP_PREFETCH) ] = {
+ [ C(RESULT_ACCESS) ] = -1,
+ [ C(RESULT_MISS) ] = -1,
+ },
+ },
+};
+
+static u64 p4_general_events[PERF_COUNT_HW_MAX] = {
+ /* non-halted CPU clocks */
+ [PERF_COUNT_HW_CPU_CYCLES] =
+ p4_config_pack_escr(P4_ESCR_EVENT(P4_EVENT_GLOBAL_POWER_EVENTS) |
+ P4_ESCR_EMASK_BIT(P4_EVENT_GLOBAL_POWER_EVENTS, RUNNING)),
+
+ /*
+ * retired instructions
+ * in a sake of simplicity we don't use the FSB tagging
+ */
+ [PERF_COUNT_HW_INSTRUCTIONS] =
+ p4_config_pack_escr(P4_ESCR_EVENT(P4_EVENT_INSTR_RETIRED) |
+ P4_ESCR_EMASK_BIT(P4_EVENT_INSTR_RETIRED, NBOGUSNTAG) |
+ P4_ESCR_EMASK_BIT(P4_EVENT_INSTR_RETIRED, BOGUSNTAG)),
+
+ /* cache hits */
+ [PERF_COUNT_HW_CACHE_REFERENCES] =
+ p4_config_pack_escr(P4_ESCR_EVENT(P4_EVENT_BSQ_CACHE_REFERENCE) |
+ P4_ESCR_EMASK_BIT(P4_EVENT_BSQ_CACHE_REFERENCE, RD_2ndL_HITS) |
+ P4_ESCR_EMASK_BIT(P4_EVENT_BSQ_CACHE_REFERENCE, RD_2ndL_HITE) |
+ P4_ESCR_EMASK_BIT(P4_EVENT_BSQ_CACHE_REFERENCE, RD_2ndL_HITM) |
+ P4_ESCR_EMASK_BIT(P4_EVENT_BSQ_CACHE_REFERENCE, RD_3rdL_HITS) |
+ P4_ESCR_EMASK_BIT(P4_EVENT_BSQ_CACHE_REFERENCE, RD_3rdL_HITE) |
+ P4_ESCR_EMASK_BIT(P4_EVENT_BSQ_CACHE_REFERENCE, RD_3rdL_HITM)),
+
+ /* cache misses */
+ [PERF_COUNT_HW_CACHE_MISSES] =
+ p4_config_pack_escr(P4_ESCR_EVENT(P4_EVENT_BSQ_CACHE_REFERENCE) |
+ P4_ESCR_EMASK_BIT(P4_EVENT_BSQ_CACHE_REFERENCE, RD_2ndL_MISS) |
+ P4_ESCR_EMASK_BIT(P4_EVENT_BSQ_CACHE_REFERENCE, RD_3rdL_MISS) |
+ P4_ESCR_EMASK_BIT(P4_EVENT_BSQ_CACHE_REFERENCE, WR_2ndL_MISS)),
+
+ /* branch instructions retired */
+ [PERF_COUNT_HW_BRANCH_INSTRUCTIONS] =
+ p4_config_pack_escr(P4_ESCR_EVENT(P4_EVENT_RETIRED_BRANCH_TYPE) |
+ P4_ESCR_EMASK_BIT(P4_EVENT_RETIRED_BRANCH_TYPE, CONDITIONAL) |
+ P4_ESCR_EMASK_BIT(P4_EVENT_RETIRED_BRANCH_TYPE, CALL) |
+ P4_ESCR_EMASK_BIT(P4_EVENT_RETIRED_BRANCH_TYPE, RETURN) |
+ P4_ESCR_EMASK_BIT(P4_EVENT_RETIRED_BRANCH_TYPE, INDIRECT)),
+
+ /* mispredicted branches retired */
+ [PERF_COUNT_HW_BRANCH_MISSES] =
+ p4_config_pack_escr(P4_ESCR_EVENT(P4_EVENT_MISPRED_BRANCH_RETIRED) |
+ P4_ESCR_EMASK_BIT(P4_EVENT_MISPRED_BRANCH_RETIRED, NBOGUS)),
+
+ /* bus ready clocks (cpu is driving #DRDY_DRV\#DRDY_OWN): */
+ [PERF_COUNT_HW_BUS_CYCLES] =
+ p4_config_pack_escr(P4_ESCR_EVENT(P4_EVENT_FSB_DATA_ACTIVITY) |
+ P4_ESCR_EMASK_BIT(P4_EVENT_FSB_DATA_ACTIVITY, DRDY_DRV) |
+ P4_ESCR_EMASK_BIT(P4_EVENT_FSB_DATA_ACTIVITY, DRDY_OWN)) |
+ p4_config_pack_cccr(P4_CCCR_EDGE | P4_CCCR_COMPARE),
+};
+
+static struct p4_event_bind *p4_config_get_bind(u64 config)
+{
+ unsigned int evnt = p4_config_unpack_event(config);
+ struct p4_event_bind *bind = NULL;
+
+ if (evnt < ARRAY_SIZE(p4_event_bind_map))
+ bind = &p4_event_bind_map[evnt];
+
+ return bind;
+}
+
+static u64 p4_pmu_event_map(int hw_event)
+{
+ struct p4_event_bind *bind;
+ unsigned int esel;
+ u64 config;
+
+ config = p4_general_events[hw_event];
+ bind = p4_config_get_bind(config);
+ esel = P4_OPCODE_ESEL(bind->opcode);
+ config |= p4_config_pack_cccr(P4_CCCR_ESEL(esel));
+
+ return config;
+}
+
+static int p4_hw_config(struct perf_event *event)
+{
+ int cpu = get_cpu();
+ int rc = 0;
+ unsigned int evnt;
+ u32 escr, cccr;
+
+ /*
+ * the reason we use cpu that early is that: if we get scheduled
+ * first time on the same cpu -- we will not need swap thread
+ * specific flags in config (and will save some cpu cycles)
+ */
+
+ cccr = p4_default_cccr_conf(cpu);
+ escr = p4_default_escr_conf(cpu, event->attr.exclude_kernel,
+ event->attr.exclude_user);
+ event->hw.config = p4_config_pack_escr(escr) |
+ p4_config_pack_cccr(cccr);
+
+ if (p4_ht_active() && p4_ht_thread(cpu))
+ event->hw.config = p4_set_ht_bit(event->hw.config);
+
+ if (event->attr.type == PERF_TYPE_RAW) {
+
+ /* user data may have out-of-bound event index */
+ evnt = p4_config_unpack_event(event->attr.config);
+ if (evnt >= ARRAY_SIZE(p4_event_bind_map)) {
+ rc = -EINVAL;
+ goto out;
+ }
+
+ /*
+ * We don't control raw events so it's up to the caller
+ * to pass sane values (and we don't count the thread number
+ * on HT machine but allow HT-compatible specifics to be
+ * passed on)
+ *
+ * XXX: HT wide things should check perf_paranoid_cpu() &&
+ * CAP_SYS_ADMIN
+ */
+ event->hw.config |= event->attr.config &
+ (p4_config_pack_escr(P4_ESCR_MASK_HT) |
+ p4_config_pack_cccr(P4_CCCR_MASK_HT));
+ }
+
+ rc = x86_setup_perfctr(event);
+out:
+ put_cpu();
+ return rc;
+}
+
+static inline void p4_pmu_clear_cccr_ovf(struct hw_perf_event *hwc)
+{
+ unsigned long dummy;
+
+ rdmsrl(hwc->config_base + hwc->idx, dummy);
+ if (dummy & P4_CCCR_OVF) {
+ (void)checking_wrmsrl(hwc->config_base + hwc->idx,
+ ((u64)dummy) & ~P4_CCCR_OVF);
+ }
+}
+
+static inline void p4_pmu_disable_event(struct perf_event *event)
+{
+ struct hw_perf_event *hwc = &event->hw;
+
+ /*
+ * If event gets disabled while counter is in overflowed
+ * state we need to clear P4_CCCR_OVF, otherwise interrupt get
+ * asserted again and again
+ */
+ (void)checking_wrmsrl(hwc->config_base + hwc->idx,
+ (u64)(p4_config_unpack_cccr(hwc->config)) &
+ ~P4_CCCR_ENABLE & ~P4_CCCR_OVF & ~P4_CCCR_RESERVED);
+}
+
+static void p4_pmu_disable_all(void)
+{
+ struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
+ int idx;
+
+ for (idx = 0; idx < x86_pmu.num_counters; idx++) {
+ struct perf_event *event = cpuc->events[idx];
+ if (!test_bit(idx, cpuc->active_mask))
+ continue;
+ p4_pmu_disable_event(event);
+ }
+}
+
+static void p4_pmu_enable_event(struct perf_event *event)
+{
+ struct hw_perf_event *hwc = &event->hw;
+ int thread = p4_ht_config_thread(hwc->config);
+ u64 escr_conf = p4_config_unpack_escr(p4_clear_ht_bit(hwc->config));
+ unsigned int idx = p4_config_unpack_event(hwc->config);
+ unsigned int idx_cache = p4_config_unpack_cache_event(hwc->config);
+ struct p4_event_bind *bind;
+ struct p4_cache_event_bind *bind_cache;
+ u64 escr_addr, cccr;
+
+ bind = &p4_event_bind_map[idx];
+ escr_addr = (u64)bind->escr_msr[thread];
+
+ /*
+ * - we dont support cascaded counters yet
+ * - and counter 1 is broken (erratum)
+ */
+ WARN_ON_ONCE(p4_is_event_cascaded(hwc->config));
+ WARN_ON_ONCE(hwc->idx == 1);
+
+ /* we need a real Event value */
+ escr_conf &= ~P4_ESCR_EVENT_MASK;
+ escr_conf |= P4_ESCR_EVENT(P4_OPCODE_EVNT(bind->opcode));
+
+ cccr = p4_config_unpack_cccr(hwc->config);
+
+ /*
+ * it could be Cache event so that we need to
+ * set metrics into additional MSRs
+ */
+ BUILD_BUG_ON(P4_CACHE__MAX > P4_CCCR_CACHE_OPS_MASK);
+ if (idx_cache > P4_CACHE__NONE &&
+ idx_cache < ARRAY_SIZE(p4_cache_event_bind_map)) {
+ bind_cache = &p4_cache_event_bind_map[idx_cache];
+ (void)checking_wrmsrl(MSR_IA32_PEBS_ENABLE, (u64)bind_cache->metric_pebs);
+ (void)checking_wrmsrl(MSR_P4_PEBS_MATRIX_VERT, (u64)bind_cache->metric_vert);
+ }
+
+ (void)checking_wrmsrl(escr_addr, escr_conf);
+ (void)checking_wrmsrl(hwc->config_base + hwc->idx,
+ (cccr & ~P4_CCCR_RESERVED) | P4_CCCR_ENABLE);
+}
+
+static void p4_pmu_enable_all(int added)
+{
+ struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
+ int idx;
+
+ for (idx = 0; idx < x86_pmu.num_counters; idx++) {
+ struct perf_event *event = cpuc->events[idx];
+ if (!test_bit(idx, cpuc->active_mask))
+ continue;
+ p4_pmu_enable_event(event);
+ }
+}
+
+static int p4_pmu_handle_irq(struct pt_regs *regs)
+{
+ struct perf_sample_data data;
+ struct cpu_hw_events *cpuc;
+ struct perf_event *event;
+ struct hw_perf_event *hwc;
+ int idx, handled = 0;
+ u64 val;
+
+ data.addr = 0;
+ data.raw = NULL;
+
+ cpuc = &__get_cpu_var(cpu_hw_events);
+
+ for (idx = 0; idx < x86_pmu.num_counters; idx++) {
+
+ if (!test_bit(idx, cpuc->active_mask))
+ continue;
+
+ event = cpuc->events[idx];
+ hwc = &event->hw;
+
+ WARN_ON_ONCE(hwc->idx != idx);
+
+ /*
+ * FIXME: Redundant call, actually not needed
+ * but just to check if we're screwed
+ */
+ p4_pmu_clear_cccr_ovf(hwc);
+
+ val = x86_perf_event_update(event);
+ if (val & (1ULL << (x86_pmu.cntval_bits - 1)))
+ continue;
+
+ /*
+ * event overflow
+ */
+ handled = 1;
+ data.period = event->hw.last_period;
+
+ if (!x86_perf_event_set_period(event))
+ continue;
+ if (perf_event_overflow(event, 1, &data, regs))
+ p4_pmu_disable_event(event);
+ }
+
+ if (handled) {
+ /* p4 quirk: unmask it again */
+ apic_write(APIC_LVTPC, apic_read(APIC_LVTPC) & ~APIC_LVT_MASKED);
+ inc_irq_stat(apic_perf_irqs);
+ }
+
+ return handled;
+}
+
+/*
+ * swap thread specific fields according to a thread
+ * we are going to run on
+ */
+static void p4_pmu_swap_config_ts(struct hw_perf_event *hwc, int cpu)
+{
+ u32 escr, cccr;
+
+ /*
+ * we either lucky and continue on same cpu or no HT support
+ */
+ if (!p4_should_swap_ts(hwc->config, cpu))
+ return;
+
+ /*
+ * the event is migrated from an another logical
+ * cpu, so we need to swap thread specific flags
+ */
+
+ escr = p4_config_unpack_escr(hwc->config);
+ cccr = p4_config_unpack_cccr(hwc->config);
+
+ if (p4_ht_thread(cpu)) {
+ cccr &= ~P4_CCCR_OVF_PMI_T0;
+ cccr |= P4_CCCR_OVF_PMI_T1;
+ if (escr & P4_ESCR_T0_OS) {
+ escr &= ~P4_ESCR_T0_OS;
+ escr |= P4_ESCR_T1_OS;
+ }
+ if (escr & P4_ESCR_T0_USR) {
+ escr &= ~P4_ESCR_T0_USR;
+ escr |= P4_ESCR_T1_USR;
+ }
+ hwc->config = p4_config_pack_escr(escr);
+ hwc->config |= p4_config_pack_cccr(cccr);
+ hwc->config |= P4_CONFIG_HT;
+ } else {
+ cccr &= ~P4_CCCR_OVF_PMI_T1;
+ cccr |= P4_CCCR_OVF_PMI_T0;
+ if (escr & P4_ESCR_T1_OS) {
+ escr &= ~P4_ESCR_T1_OS;
+ escr |= P4_ESCR_T0_OS;
+ }
+ if (escr & P4_ESCR_T1_USR) {
+ escr &= ~P4_ESCR_T1_USR;
+ escr |= P4_ESCR_T0_USR;
+ }
+ hwc->config = p4_config_pack_escr(escr);
+ hwc->config |= p4_config_pack_cccr(cccr);
+ hwc->config &= ~P4_CONFIG_HT;
+ }
+}
+
+/*
+ * ESCR address hashing is tricky, ESCRs are not sequential
+ * in memory but all starts from MSR_P4_BSU_ESCR0 (0x03e0) and
+ * the metric between any ESCRs is laid in range [0xa0,0xe1]
+ *
+ * so we make ~70% filled hashtable
+ */
+
+#define P4_ESCR_MSR_BASE 0x000003a0
+#define P4_ESCR_MSR_MAX 0x000003e1
+#define P4_ESCR_MSR_TABLE_SIZE (P4_ESCR_MSR_MAX - P4_ESCR_MSR_BASE + 1)
+#define P4_ESCR_MSR_IDX(msr) (msr - P4_ESCR_MSR_BASE)
+#define P4_ESCR_MSR_TABLE_ENTRY(msr) [P4_ESCR_MSR_IDX(msr)] = msr
+
+static const unsigned int p4_escr_table[P4_ESCR_MSR_TABLE_SIZE] = {
+ P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_ALF_ESCR0),
+ P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_ALF_ESCR1),
+ P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_BPU_ESCR0),
+ P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_BPU_ESCR1),
+ P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_BSU_ESCR0),
+ P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_BSU_ESCR1),
+ P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_CRU_ESCR0),
+ P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_CRU_ESCR1),
+ P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_CRU_ESCR2),
+ P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_CRU_ESCR3),
+ P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_CRU_ESCR4),
+ P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_CRU_ESCR5),
+ P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_DAC_ESCR0),
+ P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_DAC_ESCR1),
+ P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_FIRM_ESCR0),
+ P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_FIRM_ESCR1),
+ P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_FLAME_ESCR0),
+ P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_FLAME_ESCR1),
+ P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_FSB_ESCR0),
+ P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_FSB_ESCR1),
+ P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_IQ_ESCR0),
+ P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_IQ_ESCR1),
+ P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_IS_ESCR0),
+ P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_IS_ESCR1),
+ P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_ITLB_ESCR0),
+ P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_ITLB_ESCR1),
+ P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_IX_ESCR0),
+ P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_IX_ESCR1),
+ P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_MOB_ESCR0),
+ P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_MOB_ESCR1),
+ P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_MS_ESCR0),
+ P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_MS_ESCR1),
+ P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_PMH_ESCR0),
+ P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_PMH_ESCR1),
+ P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_RAT_ESCR0),
+ P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_RAT_ESCR1),
+ P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_SAAT_ESCR0),
+ P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_SAAT_ESCR1),
+ P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_SSU_ESCR0),
+ P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_SSU_ESCR1),
+ P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_TBPU_ESCR0),
+ P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_TBPU_ESCR1),
+ P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_TC_ESCR0),
+ P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_TC_ESCR1),
+ P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_U2L_ESCR0),
+ P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_U2L_ESCR1),
+};
+
+static int p4_get_escr_idx(unsigned int addr)
+{
+ unsigned int idx = P4_ESCR_MSR_IDX(addr);
+
+ if (unlikely(idx >= P4_ESCR_MSR_TABLE_SIZE ||
+ !p4_escr_table[idx])) {
+ WARN_ONCE(1, "P4 PMU: Wrong address passed: %x\n", addr);
+ return -1;
+ }
+
+ return idx;
+}
+
+static int p4_next_cntr(int thread, unsigned long *used_mask,
+ struct p4_event_bind *bind)
+{
+ int i, j;
+
+ for (i = 0; i < P4_CNTR_LIMIT; i++) {
+ j = bind->cntr[thread][i];
+ if (j != -1 && !test_bit(j, used_mask))
+ return j;
+ }
+
+ return -1;
+}
+
+static int p4_pmu_schedule_events(struct cpu_hw_events *cpuc, int n, int *assign)
+{
+ unsigned long used_mask[BITS_TO_LONGS(X86_PMC_IDX_MAX)];
+ unsigned long escr_mask[BITS_TO_LONGS(P4_ESCR_MSR_TABLE_SIZE)];
+ int cpu = raw_smp_processor_id();
+ struct hw_perf_event *hwc;
+ struct p4_event_bind *bind;
+ unsigned int i, thread, num;
+ int cntr_idx, escr_idx;
+
+ bitmap_zero(used_mask, X86_PMC_IDX_MAX);
+ bitmap_zero(escr_mask, P4_ESCR_MSR_TABLE_SIZE);
+
+ for (i = 0, num = n; i < n; i++, num--) {
+
+ hwc = &cpuc->event_list[i]->hw;
+ thread = p4_ht_thread(cpu);
+ bind = p4_config_get_bind(hwc->config);
+ escr_idx = p4_get_escr_idx(bind->escr_msr[thread]);
+ if (unlikely(escr_idx == -1))
+ goto done;
+
+ if (hwc->idx != -1 && !p4_should_swap_ts(hwc->config, cpu)) {
+ cntr_idx = hwc->idx;
+ if (assign)
+ assign[i] = hwc->idx;
+ goto reserve;
+ }
+
+ cntr_idx = p4_next_cntr(thread, used_mask, bind);
+ if (cntr_idx == -1 || test_bit(escr_idx, escr_mask))
+ goto done;
+
+ p4_pmu_swap_config_ts(hwc, cpu);
+ if (assign)
+ assign[i] = cntr_idx;
+reserve:
+ set_bit(cntr_idx, used_mask);
+ set_bit(escr_idx, escr_mask);
+ }
+
+done:
+ return num ? -ENOSPC : 0;
+}
+
+static __initconst const struct x86_pmu p4_pmu = {
+ .name = "Netburst P4/Xeon",
+ .handle_irq = p4_pmu_handle_irq,
+ .disable_all = p4_pmu_disable_all,
+ .enable_all = p4_pmu_enable_all,
+ .enable = p4_pmu_enable_event,
+ .disable = p4_pmu_disable_event,
+ .eventsel = MSR_P4_BPU_CCCR0,
+ .perfctr = MSR_P4_BPU_PERFCTR0,
+ .event_map = p4_pmu_event_map,
+ .max_events = ARRAY_SIZE(p4_general_events),
+ .get_event_constraints = x86_get_event_constraints,
+ /*
+ * IF HT disabled we may need to use all
+ * ARCH_P4_MAX_CCCR counters simulaneously
+ * though leave it restricted at moment assuming
+ * HT is on
+ */
+ .num_counters = ARCH_P4_MAX_CCCR,
+ .apic = 1,
+ .cntval_bits = 40,
+ .cntval_mask = (1ULL << 40) - 1,
+ .max_period = (1ULL << 39) - 1,
+ .hw_config = p4_hw_config,
+ .schedule_events = p4_pmu_schedule_events,
+};
+
+static __init int p4_pmu_init(void)
+{
+ unsigned int low, high;
+
+ /* If we get stripped -- indexig fails */
+ BUILD_BUG_ON(ARCH_P4_MAX_CCCR > X86_PMC_MAX_GENERIC);
+
+ rdmsr(MSR_IA32_MISC_ENABLE, low, high);
+ if (!(low & (1 << 7))) {
+ pr_cont("unsupported Netburst CPU model %d ",
+ boot_cpu_data.x86_model);
+ return -ENODEV;
+ }
+
+ memcpy(hw_cache_event_ids, p4_hw_cache_event_ids,
+ sizeof(hw_cache_event_ids));
+
+ pr_cont("Netburst events, ");
+
+ x86_pmu = p4_pmu;
+
+ return 0;
+}
+
+#endif /* CONFIG_CPU_SUP_INTEL */
diff --git a/arch/x86/kernel/cpu/perf_event_p6.c b/arch/x86/kernel/cpu/perf_event_p6.c
index a330485d14da..34ba07be2cda 100644
--- a/arch/x86/kernel/cpu/perf_event_p6.c
+++ b/arch/x86/kernel/cpu/perf_event_p6.c
@@ -27,24 +27,6 @@ static u64 p6_pmu_event_map(int hw_event)
*/
#define P6_NOP_EVENT 0x0000002EULL
-static u64 p6_pmu_raw_event(u64 hw_event)
-{
-#define P6_EVNTSEL_EVENT_MASK 0x000000FFULL
-#define P6_EVNTSEL_UNIT_MASK 0x0000FF00ULL
-#define P6_EVNTSEL_EDGE_MASK 0x00040000ULL
-#define P6_EVNTSEL_INV_MASK 0x00800000ULL
-#define P6_EVNTSEL_REG_MASK 0xFF000000ULL
-
-#define P6_EVNTSEL_MASK \
- (P6_EVNTSEL_EVENT_MASK | \
- P6_EVNTSEL_UNIT_MASK | \
- P6_EVNTSEL_EDGE_MASK | \
- P6_EVNTSEL_INV_MASK | \
- P6_EVNTSEL_REG_MASK)
-
- return hw_event & P6_EVNTSEL_MASK;
-}
-
static struct event_constraint p6_event_constraints[] =
{
INTEL_EVENT_CONSTRAINT(0xc1, 0x1), /* FLOPS */
@@ -66,7 +48,7 @@ static void p6_pmu_disable_all(void)
wrmsrl(MSR_P6_EVNTSEL0, val);
}
-static void p6_pmu_enable_all(void)
+static void p6_pmu_enable_all(int added)
{
unsigned long val;
@@ -102,22 +84,23 @@ static void p6_pmu_enable_event(struct perf_event *event)
(void)checking_wrmsrl(hwc->config_base + hwc->idx, val);
}
-static __initconst struct x86_pmu p6_pmu = {
+static __initconst const struct x86_pmu p6_pmu = {
.name = "p6",
.handle_irq = x86_pmu_handle_irq,
.disable_all = p6_pmu_disable_all,
.enable_all = p6_pmu_enable_all,
.enable = p6_pmu_enable_event,
.disable = p6_pmu_disable_event,
+ .hw_config = x86_pmu_hw_config,
+ .schedule_events = x86_schedule_events,
.eventsel = MSR_P6_EVNTSEL0,
.perfctr = MSR_P6_PERFCTR0,
.event_map = p6_pmu_event_map,
- .raw_event = p6_pmu_raw_event,
.max_events = ARRAY_SIZE(p6_perfmon_event_map),
.apic = 1,
.max_period = (1ULL << 31) - 1,
.version = 0,
- .num_events = 2,
+ .num_counters = 2,
/*
* Events have 40 bits implemented. However they are designed such
* that bits [32-39] are sign extensions of bit 31. As such the
@@ -125,8 +108,8 @@ static __initconst struct x86_pmu p6_pmu = {
*
* See IA-32 Intel Architecture Software developer manual Vol 3B
*/
- .event_bits = 32,
- .event_mask = (1ULL << 32) - 1,
+ .cntval_bits = 32,
+ .cntval_mask = (1ULL << 32) - 1,
.get_event_constraints = x86_get_event_constraints,
.event_constraints = p6_event_constraints,
};
diff --git a/arch/x86/kernel/cpu/vmware.c b/arch/x86/kernel/cpu/vmware.c
index dfdb4dba2320..b9d1ff588445 100644
--- a/arch/x86/kernel/cpu/vmware.c
+++ b/arch/x86/kernel/cpu/vmware.c
@@ -24,8 +24,8 @@
#include <linux/dmi.h>
#include <linux/module.h>
#include <asm/div64.h>
-#include <asm/vmware.h>
#include <asm/x86_init.h>
+#include <asm/hypervisor.h>
#define CPUID_VMWARE_INFO_LEAF 0x40000000
#define VMWARE_HYPERVISOR_MAGIC 0x564D5868
@@ -65,7 +65,7 @@ static unsigned long vmware_get_tsc_khz(void)
return tsc_hz;
}
-void __init vmware_platform_setup(void)
+static void __init vmware_platform_setup(void)
{
uint32_t eax, ebx, ecx, edx;
@@ -83,26 +83,22 @@ void __init vmware_platform_setup(void)
* serial key should be enough, as this will always have a VMware
* specific string when running under VMware hypervisor.
*/
-int vmware_platform(void)
+static bool __init vmware_platform(void)
{
if (cpu_has_hypervisor) {
- unsigned int eax, ebx, ecx, edx;
- char hyper_vendor_id[13];
-
- cpuid(CPUID_VMWARE_INFO_LEAF, &eax, &ebx, &ecx, &edx);
- memcpy(hyper_vendor_id + 0, &ebx, 4);
- memcpy(hyper_vendor_id + 4, &ecx, 4);
- memcpy(hyper_vendor_id + 8, &edx, 4);
- hyper_vendor_id[12] = '\0';
- if (!strcmp(hyper_vendor_id, "VMwareVMware"))
- return 1;
+ unsigned int eax;
+ unsigned int hyper_vendor_id[3];
+
+ cpuid(CPUID_VMWARE_INFO_LEAF, &eax, &hyper_vendor_id[0],
+ &hyper_vendor_id[1], &hyper_vendor_id[2]);
+ if (!memcmp(hyper_vendor_id, "VMwareVMware", 12))
+ return true;
} else if (dmi_available && dmi_name_in_serial("VMware") &&
__vmware_platform())
- return 1;
+ return true;
- return 0;
+ return false;
}
-EXPORT_SYMBOL(vmware_platform);
/*
* VMware hypervisor takes care of exporting a reliable TSC to the guest.
@@ -116,8 +112,16 @@ EXPORT_SYMBOL(vmware_platform);
* so that the kernel could just trust the hypervisor with providing a
* reliable virtual TSC that is suitable for timekeeping.
*/
-void __cpuinit vmware_set_feature_bits(struct cpuinfo_x86 *c)
+static void __cpuinit vmware_set_cpu_features(struct cpuinfo_x86 *c)
{
set_cpu_cap(c, X86_FEATURE_CONSTANT_TSC);
set_cpu_cap(c, X86_FEATURE_TSC_RELIABLE);
}
+
+const __refconst struct hypervisor_x86 x86_hyper_vmware = {
+ .name = "VMware",
+ .detect = vmware_platform,
+ .set_cpu_features = vmware_set_cpu_features,
+ .init_platform = vmware_platform_setup,
+};
+EXPORT_SYMBOL(x86_hyper_vmware);
diff --git a/arch/x86/kernel/ds.c b/arch/x86/kernel/ds.c
deleted file mode 100644
index 1c47390dd0e5..000000000000
--- a/arch/x86/kernel/ds.c
+++ /dev/null
@@ -1,1437 +0,0 @@
-/*
- * Debug Store support
- *
- * This provides a low-level interface to the hardware's Debug Store
- * feature that is used for branch trace store (BTS) and
- * precise-event based sampling (PEBS).
- *
- * It manages:
- * - DS and BTS hardware configuration
- * - buffer overflow handling (to be done)
- * - buffer access
- *
- * It does not do:
- * - security checking (is the caller allowed to trace the task)
- * - buffer allocation (memory accounting)
- *
- *
- * Copyright (C) 2007-2009 Intel Corporation.
- * Markus Metzger <markus.t.metzger@intel.com>, 2007-2009
- */
-
-#include <linux/kernel.h>
-#include <linux/string.h>
-#include <linux/errno.h>
-#include <linux/sched.h>
-#include <linux/slab.h>
-#include <linux/mm.h>
-#include <linux/trace_clock.h>
-
-#include <asm/ds.h>
-
-#include "ds_selftest.h"
-
-/*
- * The configuration for a particular DS hardware implementation:
- */
-struct ds_configuration {
- /* The name of the configuration: */
- const char *name;
-
- /* The size of pointer-typed fields in DS, BTS, and PEBS: */
- unsigned char sizeof_ptr_field;
-
- /* The size of a BTS/PEBS record in bytes: */
- unsigned char sizeof_rec[2];
-
- /* The number of pebs counter reset values in the DS structure. */
- unsigned char nr_counter_reset;
-
- /* Control bit-masks indexed by enum ds_feature: */
- unsigned long ctl[dsf_ctl_max];
-};
-static struct ds_configuration ds_cfg __read_mostly;
-
-
-/* Maximal size of a DS configuration: */
-#define MAX_SIZEOF_DS 0x80
-
-/* Maximal size of a BTS record: */
-#define MAX_SIZEOF_BTS (3 * 8)
-
-/* BTS and PEBS buffer alignment: */
-#define DS_ALIGNMENT (1 << 3)
-
-/* Number of buffer pointers in DS: */
-#define NUM_DS_PTR_FIELDS 8
-
-/* Size of a pebs reset value in DS: */
-#define PEBS_RESET_FIELD_SIZE 8
-
-/* Mask of control bits in the DS MSR register: */
-#define BTS_CONTROL \
- ( ds_cfg.ctl[dsf_bts] | \
- ds_cfg.ctl[dsf_bts_kernel] | \
- ds_cfg.ctl[dsf_bts_user] | \
- ds_cfg.ctl[dsf_bts_overflow] )
-
-/*
- * A BTS or PEBS tracer.
- *
- * This holds the configuration of the tracer and serves as a handle
- * to identify tracers.
- */
-struct ds_tracer {
- /* The DS context (partially) owned by this tracer. */
- struct ds_context *context;
- /* The buffer provided on ds_request() and its size in bytes. */
- void *buffer;
- size_t size;
-};
-
-struct bts_tracer {
- /* The common DS part: */
- struct ds_tracer ds;
-
- /* The trace including the DS configuration: */
- struct bts_trace trace;
-
- /* Buffer overflow notification function: */
- bts_ovfl_callback_t ovfl;
-
- /* Active flags affecting trace collection. */
- unsigned int flags;
-};
-
-struct pebs_tracer {
- /* The common DS part: */
- struct ds_tracer ds;
-
- /* The trace including the DS configuration: */
- struct pebs_trace trace;
-
- /* Buffer overflow notification function: */
- pebs_ovfl_callback_t ovfl;
-};
-
-/*
- * Debug Store (DS) save area configuration (see Intel64 and IA32
- * Architectures Software Developer's Manual, section 18.5)
- *
- * The DS configuration consists of the following fields; different
- * architetures vary in the size of those fields.
- *
- * - double-word aligned base linear address of the BTS buffer
- * - write pointer into the BTS buffer
- * - end linear address of the BTS buffer (one byte beyond the end of
- * the buffer)
- * - interrupt pointer into BTS buffer
- * (interrupt occurs when write pointer passes interrupt pointer)
- * - double-word aligned base linear address of the PEBS buffer
- * - write pointer into the PEBS buffer
- * - end linear address of the PEBS buffer (one byte beyond the end of
- * the buffer)
- * - interrupt pointer into PEBS buffer
- * (interrupt occurs when write pointer passes interrupt pointer)
- * - value to which counter is reset following counter overflow
- *
- * Later architectures use 64bit pointers throughout, whereas earlier
- * architectures use 32bit pointers in 32bit mode.
- *
- *
- * We compute the base address for the first 8 fields based on:
- * - the field size stored in the DS configuration
- * - the relative field position
- * - an offset giving the start of the respective region
- *
- * This offset is further used to index various arrays holding
- * information for BTS and PEBS at the respective index.
- *
- * On later 32bit processors, we only access the lower 32bit of the
- * 64bit pointer fields. The upper halves will be zeroed out.
- */
-
-enum ds_field {
- ds_buffer_base = 0,
- ds_index,
- ds_absolute_maximum,
- ds_interrupt_threshold,
-};
-
-enum ds_qualifier {
- ds_bts = 0,
- ds_pebs
-};
-
-static inline unsigned long
-ds_get(const unsigned char *base, enum ds_qualifier qual, enum ds_field field)
-{
- base += (ds_cfg.sizeof_ptr_field * (field + (4 * qual)));
- return *(unsigned long *)base;
-}
-
-static inline void
-ds_set(unsigned char *base, enum ds_qualifier qual, enum ds_field field,
- unsigned long value)
-{
- base += (ds_cfg.sizeof_ptr_field * (field + (4 * qual)));
- (*(unsigned long *)base) = value;
-}
-
-
-/*
- * Locking is done only for allocating BTS or PEBS resources.
- */
-static DEFINE_SPINLOCK(ds_lock);
-
-/*
- * We either support (system-wide) per-cpu or per-thread allocation.
- * We distinguish the two based on the task_struct pointer, where a
- * NULL pointer indicates per-cpu allocation for the current cpu.
- *
- * Allocations are use-counted. As soon as resources are allocated,
- * further allocations must be of the same type (per-cpu or
- * per-thread). We model this by counting allocations (i.e. the number
- * of tracers of a certain type) for one type negatively:
- * =0 no tracers
- * >0 number of per-thread tracers
- * <0 number of per-cpu tracers
- *
- * Tracers essentially gives the number of ds contexts for a certain
- * type of allocation.
- */
-static atomic_t tracers = ATOMIC_INIT(0);
-
-static inline int get_tracer(struct task_struct *task)
-{
- int error;
-
- spin_lock_irq(&ds_lock);
-
- if (task) {
- error = -EPERM;
- if (atomic_read(&tracers) < 0)
- goto out;
- atomic_inc(&tracers);
- } else {
- error = -EPERM;
- if (atomic_read(&tracers) > 0)
- goto out;
- atomic_dec(&tracers);
- }
-
- error = 0;
-out:
- spin_unlock_irq(&ds_lock);
- return error;
-}
-
-static inline void put_tracer(struct task_struct *task)
-{
- if (task)
- atomic_dec(&tracers);
- else
- atomic_inc(&tracers);
-}
-
-/*
- * The DS context is either attached to a thread or to a cpu:
- * - in the former case, the thread_struct contains a pointer to the
- * attached context.
- * - in the latter case, we use a static array of per-cpu context
- * pointers.
- *
- * Contexts are use-counted. They are allocated on first access and
- * deallocated when the last user puts the context.
- */
-struct ds_context {
- /* The DS configuration; goes into MSR_IA32_DS_AREA: */
- unsigned char ds[MAX_SIZEOF_DS];
-
- /* The owner of the BTS and PEBS configuration, respectively: */
- struct bts_tracer *bts_master;
- struct pebs_tracer *pebs_master;
-
- /* Use count: */
- unsigned long count;
-
- /* Pointer to the context pointer field: */
- struct ds_context **this;
-
- /* The traced task; NULL for cpu tracing: */
- struct task_struct *task;
-
- /* The traced cpu; only valid if task is NULL: */
- int cpu;
-};
-
-static DEFINE_PER_CPU(struct ds_context *, cpu_ds_context);
-
-
-static struct ds_context *ds_get_context(struct task_struct *task, int cpu)
-{
- struct ds_context **p_context =
- (task ? &task->thread.ds_ctx : &per_cpu(cpu_ds_context, cpu));
- struct ds_context *context = NULL;
- struct ds_context *new_context = NULL;
-
- /* Chances are small that we already have a context. */
- new_context = kzalloc(sizeof(*new_context), GFP_KERNEL);
- if (!new_context)
- return NULL;
-
- spin_lock_irq(&ds_lock);
-
- context = *p_context;
- if (likely(!context)) {
- context = new_context;
-
- context->this = p_context;
- context->task = task;
- context->cpu = cpu;
- context->count = 0;
-
- *p_context = context;
- }
-
- context->count++;
-
- spin_unlock_irq(&ds_lock);
-
- if (context != new_context)
- kfree(new_context);
-
- return context;
-}
-
-static void ds_put_context(struct ds_context *context)
-{
- struct task_struct *task;
- unsigned long irq;
-
- if (!context)
- return;
-
- spin_lock_irqsave(&ds_lock, irq);
-
- if (--context->count) {
- spin_unlock_irqrestore(&ds_lock, irq);
- return;
- }
-
- *(context->this) = NULL;
-
- task = context->task;
-
- if (task)
- clear_tsk_thread_flag(task, TIF_DS_AREA_MSR);
-
- /*
- * We leave the (now dangling) pointer to the DS configuration in
- * the DS_AREA msr. This is as good or as bad as replacing it with
- * NULL - the hardware would crash if we enabled tracing.
- *
- * This saves us some problems with having to write an msr on a
- * different cpu while preventing others from doing the same for the
- * next context for that same cpu.
- */
-
- spin_unlock_irqrestore(&ds_lock, irq);
-
- /* The context might still be in use for context switching. */
- if (task && (task != current))
- wait_task_context_switch(task);
-
- kfree(context);
-}
-
-static void ds_install_ds_area(struct ds_context *context)
-{
- unsigned long ds;
-
- ds = (unsigned long)context->ds;
-
- /*
- * There is a race between the bts master and the pebs master.
- *
- * The thread/cpu access is synchronized via get/put_cpu() for
- * task tracing and via wrmsr_on_cpu for cpu tracing.
- *
- * If bts and pebs are collected for the same task or same cpu,
- * the same confiuration is written twice.
- */
- if (context->task) {
- get_cpu();
- if (context->task == current)
- wrmsrl(MSR_IA32_DS_AREA, ds);
- set_tsk_thread_flag(context->task, TIF_DS_AREA_MSR);
- put_cpu();
- } else
- wrmsr_on_cpu(context->cpu, MSR_IA32_DS_AREA,
- (u32)((u64)ds), (u32)((u64)ds >> 32));
-}
-
-/*
- * Call the tracer's callback on a buffer overflow.
- *
- * context: the ds context
- * qual: the buffer type
- */
-static void ds_overflow(struct ds_context *context, enum ds_qualifier qual)
-{
- switch (qual) {
- case ds_bts:
- if (context->bts_master &&
- context->bts_master->ovfl)
- context->bts_master->ovfl(context->bts_master);
- break;
- case ds_pebs:
- if (context->pebs_master &&
- context->pebs_master->ovfl)
- context->pebs_master->ovfl(context->pebs_master);
- break;
- }
-}
-
-
-/*
- * Write raw data into the BTS or PEBS buffer.
- *
- * The remainder of any partially written record is zeroed out.
- *
- * context: the DS context
- * qual: the buffer type
- * record: the data to write
- * size: the size of the data
- */
-static int ds_write(struct ds_context *context, enum ds_qualifier qual,
- const void *record, size_t size)
-{
- int bytes_written = 0;
-
- if (!record)
- return -EINVAL;
-
- while (size) {
- unsigned long base, index, end, write_end, int_th;
- unsigned long write_size, adj_write_size;
-
- /*
- * Write as much as possible without producing an
- * overflow interrupt.
- *
- * Interrupt_threshold must either be
- * - bigger than absolute_maximum or
- * - point to a record between buffer_base and absolute_maximum
- *
- * Index points to a valid record.
- */
- base = ds_get(context->ds, qual, ds_buffer_base);
- index = ds_get(context->ds, qual, ds_index);
- end = ds_get(context->ds, qual, ds_absolute_maximum);
- int_th = ds_get(context->ds, qual, ds_interrupt_threshold);
-
- write_end = min(end, int_th);
-
- /*
- * If we are already beyond the interrupt threshold,
- * we fill the entire buffer.
- */
- if (write_end <= index)
- write_end = end;
-
- if (write_end <= index)
- break;
-
- write_size = min((unsigned long) size, write_end - index);
- memcpy((void *)index, record, write_size);
-
- record = (const char *)record + write_size;
- size -= write_size;
- bytes_written += write_size;
-
- adj_write_size = write_size / ds_cfg.sizeof_rec[qual];
- adj_write_size *= ds_cfg.sizeof_rec[qual];
-
- /* Zero out trailing bytes. */
- memset((char *)index + write_size, 0,
- adj_write_size - write_size);
- index += adj_write_size;
-
- if (index >= end)
- index = base;
- ds_set(context->ds, qual, ds_index, index);
-
- if (index >= int_th)
- ds_overflow(context, qual);
- }
-
- return bytes_written;
-}
-
-
-/*
- * Branch Trace Store (BTS) uses the following format. Different
- * architectures vary in the size of those fields.
- * - source linear address
- * - destination linear address
- * - flags
- *
- * Later architectures use 64bit pointers throughout, whereas earlier
- * architectures use 32bit pointers in 32bit mode.
- *
- * We compute the base address for the fields based on:
- * - the field size stored in the DS configuration
- * - the relative field position
- *
- * In order to store additional information in the BTS buffer, we use
- * a special source address to indicate that the record requires
- * special interpretation.
- *
- * Netburst indicated via a bit in the flags field whether the branch
- * was predicted; this is ignored.
- *
- * We use two levels of abstraction:
- * - the raw data level defined here
- * - an arch-independent level defined in ds.h
- */
-
-enum bts_field {
- bts_from,
- bts_to,
- bts_flags,
-
- bts_qual = bts_from,
- bts_clock = bts_to,
- bts_pid = bts_flags,
-
- bts_qual_mask = (bts_qual_max - 1),
- bts_escape = ((unsigned long)-1 & ~bts_qual_mask)
-};
-
-static inline unsigned long bts_get(const char *base, unsigned long field)
-{
- base += (ds_cfg.sizeof_ptr_field * field);
- return *(unsigned long *)base;
-}
-
-static inline void bts_set(char *base, unsigned long field, unsigned long val)
-{
- base += (ds_cfg.sizeof_ptr_field * field);
- (*(unsigned long *)base) = val;
-}
-
-
-/*
- * The raw BTS data is architecture dependent.
- *
- * For higher-level users, we give an arch-independent view.
- * - ds.h defines struct bts_struct
- * - bts_read translates one raw bts record into a bts_struct
- * - bts_write translates one bts_struct into the raw format and
- * writes it into the top of the parameter tracer's buffer.
- *
- * return: bytes read/written on success; -Eerrno, otherwise
- */
-static int
-bts_read(struct bts_tracer *tracer, const void *at, struct bts_struct *out)
-{
- if (!tracer)
- return -EINVAL;
-
- if (at < tracer->trace.ds.begin)
- return -EINVAL;
-
- if (tracer->trace.ds.end < (at + tracer->trace.ds.size))
- return -EINVAL;
-
- memset(out, 0, sizeof(*out));
- if ((bts_get(at, bts_qual) & ~bts_qual_mask) == bts_escape) {
- out->qualifier = (bts_get(at, bts_qual) & bts_qual_mask);
- out->variant.event.clock = bts_get(at, bts_clock);
- out->variant.event.pid = bts_get(at, bts_pid);
- } else {
- out->qualifier = bts_branch;
- out->variant.lbr.from = bts_get(at, bts_from);
- out->variant.lbr.to = bts_get(at, bts_to);
-
- if (!out->variant.lbr.from && !out->variant.lbr.to)
- out->qualifier = bts_invalid;
- }
-
- return ds_cfg.sizeof_rec[ds_bts];
-}
-
-static int bts_write(struct bts_tracer *tracer, const struct bts_struct *in)
-{
- unsigned char raw[MAX_SIZEOF_BTS];
-
- if (!tracer)
- return -EINVAL;
-
- if (MAX_SIZEOF_BTS < ds_cfg.sizeof_rec[ds_bts])
- return -EOVERFLOW;
-
- switch (in->qualifier) {
- case bts_invalid:
- bts_set(raw, bts_from, 0);
- bts_set(raw, bts_to, 0);
- bts_set(raw, bts_flags, 0);
- break;
- case bts_branch:
- bts_set(raw, bts_from, in->variant.lbr.from);
- bts_set(raw, bts_to, in->variant.lbr.to);
- bts_set(raw, bts_flags, 0);
- break;
- case bts_task_arrives:
- case bts_task_departs:
- bts_set(raw, bts_qual, (bts_escape | in->qualifier));
- bts_set(raw, bts_clock, in->variant.event.clock);
- bts_set(raw, bts_pid, in->variant.event.pid);
- break;
- default:
- return -EINVAL;
- }
-
- return ds_write(tracer->ds.context, ds_bts, raw,
- ds_cfg.sizeof_rec[ds_bts]);
-}
-
-
-static void ds_write_config(struct ds_context *context,
- struct ds_trace *cfg, enum ds_qualifier qual)
-{
- unsigned char *ds = context->ds;
-
- ds_set(ds, qual, ds_buffer_base, (unsigned long)cfg->begin);
- ds_set(ds, qual, ds_index, (unsigned long)cfg->top);
- ds_set(ds, qual, ds_absolute_maximum, (unsigned long)cfg->end);
- ds_set(ds, qual, ds_interrupt_threshold, (unsigned long)cfg->ith);
-}
-
-static void ds_read_config(struct ds_context *context,
- struct ds_trace *cfg, enum ds_qualifier qual)
-{
- unsigned char *ds = context->ds;
-
- cfg->begin = (void *)ds_get(ds, qual, ds_buffer_base);
- cfg->top = (void *)ds_get(ds, qual, ds_index);
- cfg->end = (void *)ds_get(ds, qual, ds_absolute_maximum);
- cfg->ith = (void *)ds_get(ds, qual, ds_interrupt_threshold);
-}
-
-static void ds_init_ds_trace(struct ds_trace *trace, enum ds_qualifier qual,
- void *base, size_t size, size_t ith,
- unsigned int flags) {
- unsigned long buffer, adj;
-
- /*
- * Adjust the buffer address and size to meet alignment
- * constraints:
- * - buffer is double-word aligned
- * - size is multiple of record size
- *
- * We checked the size at the very beginning; we have enough
- * space to do the adjustment.
- */
- buffer = (unsigned long)base;
-
- adj = ALIGN(buffer, DS_ALIGNMENT) - buffer;
- buffer += adj;
- size -= adj;
-
- trace->n = size / ds_cfg.sizeof_rec[qual];
- trace->size = ds_cfg.sizeof_rec[qual];
-
- size = (trace->n * trace->size);
-
- trace->begin = (void *)buffer;
- trace->top = trace->begin;
- trace->end = (void *)(buffer + size);
- /*
- * The value for 'no threshold' is -1, which will set the
- * threshold outside of the buffer, just like we want it.
- */
- ith *= ds_cfg.sizeof_rec[qual];
- trace->ith = (void *)(buffer + size - ith);
-
- trace->flags = flags;
-}
-
-
-static int ds_request(struct ds_tracer *tracer, struct ds_trace *trace,
- enum ds_qualifier qual, struct task_struct *task,
- int cpu, void *base, size_t size, size_t th)
-{
- struct ds_context *context;
- int error;
- size_t req_size;
-
- error = -EOPNOTSUPP;
- if (!ds_cfg.sizeof_rec[qual])
- goto out;
-
- error = -EINVAL;
- if (!base)
- goto out;
-
- req_size = ds_cfg.sizeof_rec[qual];
- /* We might need space for alignment adjustments. */
- if (!IS_ALIGNED((unsigned long)base, DS_ALIGNMENT))
- req_size += DS_ALIGNMENT;
-
- error = -EINVAL;
- if (size < req_size)
- goto out;
-
- if (th != (size_t)-1) {
- th *= ds_cfg.sizeof_rec[qual];
-
- error = -EINVAL;
- if (size <= th)
- goto out;
- }
-
- tracer->buffer = base;
- tracer->size = size;
-
- error = -ENOMEM;
- context = ds_get_context(task, cpu);
- if (!context)
- goto out;
- tracer->context = context;
-
- /*
- * Defer any tracer-specific initialization work for the context until
- * context ownership has been clarified.
- */
-
- error = 0;
- out:
- return error;
-}
-
-static struct bts_tracer *ds_request_bts(struct task_struct *task, int cpu,
- void *base, size_t size,
- bts_ovfl_callback_t ovfl, size_t th,
- unsigned int flags)
-{
- struct bts_tracer *tracer;
- int error;
-
- /* Buffer overflow notification is not yet implemented. */
- error = -EOPNOTSUPP;
- if (ovfl)
- goto out;
-
- error = get_tracer(task);
- if (error < 0)
- goto out;
-
- error = -ENOMEM;
- tracer = kzalloc(sizeof(*tracer), GFP_KERNEL);
- if (!tracer)
- goto out_put_tracer;
- tracer->ovfl = ovfl;
-
- /* Do some more error checking and acquire a tracing context. */
- error = ds_request(&tracer->ds, &tracer->trace.ds,
- ds_bts, task, cpu, base, size, th);
- if (error < 0)
- goto out_tracer;
-
- /* Claim the bts part of the tracing context we acquired above. */
- spin_lock_irq(&ds_lock);
-
- error = -EPERM;
- if (tracer->ds.context->bts_master)
- goto out_unlock;
- tracer->ds.context->bts_master = tracer;
-
- spin_unlock_irq(&ds_lock);
-
- /*
- * Now that we own the bts part of the context, let's complete the
- * initialization for that part.
- */
- ds_init_ds_trace(&tracer->trace.ds, ds_bts, base, size, th, flags);
- ds_write_config(tracer->ds.context, &tracer->trace.ds, ds_bts);
- ds_install_ds_area(tracer->ds.context);
-
- tracer->trace.read = bts_read;
- tracer->trace.write = bts_write;
-
- /* Start tracing. */
- ds_resume_bts(tracer);
-
- return tracer;
-
- out_unlock:
- spin_unlock_irq(&ds_lock);
- ds_put_context(tracer->ds.context);
- out_tracer:
- kfree(tracer);
- out_put_tracer:
- put_tracer(task);
- out:
- return ERR_PTR(error);
-}
-
-struct bts_tracer *ds_request_bts_task(struct task_struct *task,
- void *base, size_t size,
- bts_ovfl_callback_t ovfl,
- size_t th, unsigned int flags)
-{
- return ds_request_bts(task, 0, base, size, ovfl, th, flags);
-}
-
-struct bts_tracer *ds_request_bts_cpu(int cpu, void *base, size_t size,
- bts_ovfl_callback_t ovfl,
- size_t th, unsigned int flags)
-{
- return ds_request_bts(NULL, cpu, base, size, ovfl, th, flags);
-}
-
-static struct pebs_tracer *ds_request_pebs(struct task_struct *task, int cpu,
- void *base, size_t size,
- pebs_ovfl_callback_t ovfl, size_t th,
- unsigned int flags)
-{
- struct pebs_tracer *tracer;
- int error;
-
- /* Buffer overflow notification is not yet implemented. */
- error = -EOPNOTSUPP;
- if (ovfl)
- goto out;
-
- error = get_tracer(task);
- if (error < 0)
- goto out;
-
- error = -ENOMEM;
- tracer = kzalloc(sizeof(*tracer), GFP_KERNEL);
- if (!tracer)
- goto out_put_tracer;
- tracer->ovfl = ovfl;
-
- /* Do some more error checking and acquire a tracing context. */
- error = ds_request(&tracer->ds, &tracer->trace.ds,
- ds_pebs, task, cpu, base, size, th);
- if (error < 0)
- goto out_tracer;
-
- /* Claim the pebs part of the tracing context we acquired above. */
- spin_lock_irq(&ds_lock);
-
- error = -EPERM;
- if (tracer->ds.context->pebs_master)
- goto out_unlock;
- tracer->ds.context->pebs_master = tracer;
-
- spin_unlock_irq(&ds_lock);
-
- /*
- * Now that we own the pebs part of the context, let's complete the
- * initialization for that part.
- */
- ds_init_ds_trace(&tracer->trace.ds, ds_pebs, base, size, th, flags);
- ds_write_config(tracer->ds.context, &tracer->trace.ds, ds_pebs);
- ds_install_ds_area(tracer->ds.context);
-
- /* Start tracing. */
- ds_resume_pebs(tracer);
-
- return tracer;
-
- out_unlock:
- spin_unlock_irq(&ds_lock);
- ds_put_context(tracer->ds.context);
- out_tracer:
- kfree(tracer);
- out_put_tracer:
- put_tracer(task);
- out:
- return ERR_PTR(error);
-}
-
-struct pebs_tracer *ds_request_pebs_task(struct task_struct *task,
- void *base, size_t size,
- pebs_ovfl_callback_t ovfl,
- size_t th, unsigned int flags)
-{
- return ds_request_pebs(task, 0, base, size, ovfl, th, flags);
-}
-
-struct pebs_tracer *ds_request_pebs_cpu(int cpu, void *base, size_t size,
- pebs_ovfl_callback_t ovfl,
- size_t th, unsigned int flags)
-{
- return ds_request_pebs(NULL, cpu, base, size, ovfl, th, flags);
-}
-
-static void ds_free_bts(struct bts_tracer *tracer)
-{
- struct task_struct *task;
-
- task = tracer->ds.context->task;
-
- WARN_ON_ONCE(tracer->ds.context->bts_master != tracer);
- tracer->ds.context->bts_master = NULL;
-
- /* Make sure tracing stopped and the tracer is not in use. */
- if (task && (task != current))
- wait_task_context_switch(task);
-
- ds_put_context(tracer->ds.context);
- put_tracer(task);
-
- kfree(tracer);
-}
-
-void ds_release_bts(struct bts_tracer *tracer)
-{
- might_sleep();
-
- if (!tracer)
- return;
-
- ds_suspend_bts(tracer);
- ds_free_bts(tracer);
-}
-
-int ds_release_bts_noirq(struct bts_tracer *tracer)
-{
- struct task_struct *task;
- unsigned long irq;
- int error;
-
- if (!tracer)
- return 0;
-
- task = tracer->ds.context->task;
-
- local_irq_save(irq);
-
- error = -EPERM;
- if (!task &&
- (tracer->ds.context->cpu != smp_processor_id()))
- goto out;
-
- error = -EPERM;
- if (task && (task != current))
- goto out;
-
- ds_suspend_bts_noirq(tracer);
- ds_free_bts(tracer);
-
- error = 0;
- out:
- local_irq_restore(irq);
- return error;
-}
-
-static void update_task_debugctlmsr(struct task_struct *task,
- unsigned long debugctlmsr)
-{
- task->thread.debugctlmsr = debugctlmsr;
-
- get_cpu();
- if (task == current)
- update_debugctlmsr(debugctlmsr);
- put_cpu();
-}
-
-void ds_suspend_bts(struct bts_tracer *tracer)
-{
- struct task_struct *task;
- unsigned long debugctlmsr;
- int cpu;
-
- if (!tracer)
- return;
-
- tracer->flags = 0;
-
- task = tracer->ds.context->task;
- cpu = tracer->ds.context->cpu;
-
- WARN_ON(!task && irqs_disabled());
-
- debugctlmsr = (task ?
- task->thread.debugctlmsr :
- get_debugctlmsr_on_cpu(cpu));
- debugctlmsr &= ~BTS_CONTROL;
-
- if (task)
- update_task_debugctlmsr(task, debugctlmsr);
- else
- update_debugctlmsr_on_cpu(cpu, debugctlmsr);
-}
-
-int ds_suspend_bts_noirq(struct bts_tracer *tracer)
-{
- struct task_struct *task;
- unsigned long debugctlmsr, irq;
- int cpu, error = 0;
-
- if (!tracer)
- return 0;
-
- tracer->flags = 0;
-
- task = tracer->ds.context->task;
- cpu = tracer->ds.context->cpu;
-
- local_irq_save(irq);
-
- error = -EPERM;
- if (!task && (cpu != smp_processor_id()))
- goto out;
-
- debugctlmsr = (task ?
- task->thread.debugctlmsr :
- get_debugctlmsr());
- debugctlmsr &= ~BTS_CONTROL;
-
- if (task)
- update_task_debugctlmsr(task, debugctlmsr);
- else
- update_debugctlmsr(debugctlmsr);
-
- error = 0;
- out:
- local_irq_restore(irq);
- return error;
-}
-
-static unsigned long ds_bts_control(struct bts_tracer *tracer)
-{
- unsigned long control;
-
- control = ds_cfg.ctl[dsf_bts];
- if (!(tracer->trace.ds.flags & BTS_KERNEL))
- control |= ds_cfg.ctl[dsf_bts_kernel];
- if (!(tracer->trace.ds.flags & BTS_USER))
- control |= ds_cfg.ctl[dsf_bts_user];
-
- return control;
-}
-
-void ds_resume_bts(struct bts_tracer *tracer)
-{
- struct task_struct *task;
- unsigned long debugctlmsr;
- int cpu;
-
- if (!tracer)
- return;
-
- tracer->flags = tracer->trace.ds.flags;
-
- task = tracer->ds.context->task;
- cpu = tracer->ds.context->cpu;
-
- WARN_ON(!task && irqs_disabled());
-
- debugctlmsr = (task ?
- task->thread.debugctlmsr :
- get_debugctlmsr_on_cpu(cpu));
- debugctlmsr |= ds_bts_control(tracer);
-
- if (task)
- update_task_debugctlmsr(task, debugctlmsr);
- else
- update_debugctlmsr_on_cpu(cpu, debugctlmsr);
-}
-
-int ds_resume_bts_noirq(struct bts_tracer *tracer)
-{
- struct task_struct *task;
- unsigned long debugctlmsr, irq;
- int cpu, error = 0;
-
- if (!tracer)
- return 0;
-
- tracer->flags = tracer->trace.ds.flags;
-
- task = tracer->ds.context->task;
- cpu = tracer->ds.context->cpu;
-
- local_irq_save(irq);
-
- error = -EPERM;
- if (!task && (cpu != smp_processor_id()))
- goto out;
-
- debugctlmsr = (task ?
- task->thread.debugctlmsr :
- get_debugctlmsr());
- debugctlmsr |= ds_bts_control(tracer);
-
- if (task)
- update_task_debugctlmsr(task, debugctlmsr);
- else
- update_debugctlmsr(debugctlmsr);
-
- error = 0;
- out:
- local_irq_restore(irq);
- return error;
-}
-
-static void ds_free_pebs(struct pebs_tracer *tracer)
-{
- struct task_struct *task;
-
- task = tracer->ds.context->task;
-
- WARN_ON_ONCE(tracer->ds.context->pebs_master != tracer);
- tracer->ds.context->pebs_master = NULL;
-
- ds_put_context(tracer->ds.context);
- put_tracer(task);
-
- kfree(tracer);
-}
-
-void ds_release_pebs(struct pebs_tracer *tracer)
-{
- might_sleep();
-
- if (!tracer)
- return;
-
- ds_suspend_pebs(tracer);
- ds_free_pebs(tracer);
-}
-
-int ds_release_pebs_noirq(struct pebs_tracer *tracer)
-{
- struct task_struct *task;
- unsigned long irq;
- int error;
-
- if (!tracer)
- return 0;
-
- task = tracer->ds.context->task;
-
- local_irq_save(irq);
-
- error = -EPERM;
- if (!task &&
- (tracer->ds.context->cpu != smp_processor_id()))
- goto out;
-
- error = -EPERM;
- if (task && (task != current))
- goto out;
-
- ds_suspend_pebs_noirq(tracer);
- ds_free_pebs(tracer);
-
- error = 0;
- out:
- local_irq_restore(irq);
- return error;
-}
-
-void ds_suspend_pebs(struct pebs_tracer *tracer)
-{
-
-}
-
-int ds_suspend_pebs_noirq(struct pebs_tracer *tracer)
-{
- return 0;
-}
-
-void ds_resume_pebs(struct pebs_tracer *tracer)
-{
-
-}
-
-int ds_resume_pebs_noirq(struct pebs_tracer *tracer)
-{
- return 0;
-}
-
-const struct bts_trace *ds_read_bts(struct bts_tracer *tracer)
-{
- if (!tracer)
- return NULL;
-
- ds_read_config(tracer->ds.context, &tracer->trace.ds, ds_bts);
- return &tracer->trace;
-}
-
-const struct pebs_trace *ds_read_pebs(struct pebs_tracer *tracer)
-{
- if (!tracer)
- return NULL;
-
- ds_read_config(tracer->ds.context, &tracer->trace.ds, ds_pebs);
-
- tracer->trace.counters = ds_cfg.nr_counter_reset;
- memcpy(tracer->trace.counter_reset,
- tracer->ds.context->ds +
- (NUM_DS_PTR_FIELDS * ds_cfg.sizeof_ptr_field),
- ds_cfg.nr_counter_reset * PEBS_RESET_FIELD_SIZE);
-
- return &tracer->trace;
-}
-
-int ds_reset_bts(struct bts_tracer *tracer)
-{
- if (!tracer)
- return -EINVAL;
-
- tracer->trace.ds.top = tracer->trace.ds.begin;
-
- ds_set(tracer->ds.context->ds, ds_bts, ds_index,
- (unsigned long)tracer->trace.ds.top);
-
- return 0;
-}
-
-int ds_reset_pebs(struct pebs_tracer *tracer)
-{
- if (!tracer)
- return -EINVAL;
-
- tracer->trace.ds.top = tracer->trace.ds.begin;
-
- ds_set(tracer->ds.context->ds, ds_pebs, ds_index,
- (unsigned long)tracer->trace.ds.top);
-
- return 0;
-}
-
-int ds_set_pebs_reset(struct pebs_tracer *tracer,
- unsigned int counter, u64 value)
-{
- if (!tracer)
- return -EINVAL;
-
- if (ds_cfg.nr_counter_reset < counter)
- return -EINVAL;
-
- *(u64 *)(tracer->ds.context->ds +
- (NUM_DS_PTR_FIELDS * ds_cfg.sizeof_ptr_field) +
- (counter * PEBS_RESET_FIELD_SIZE)) = value;
-
- return 0;
-}
-
-static const struct ds_configuration ds_cfg_netburst = {
- .name = "Netburst",
- .ctl[dsf_bts] = (1 << 2) | (1 << 3),
- .ctl[dsf_bts_kernel] = (1 << 5),
- .ctl[dsf_bts_user] = (1 << 6),
- .nr_counter_reset = 1,
-};
-static const struct ds_configuration ds_cfg_pentium_m = {
- .name = "Pentium M",
- .ctl[dsf_bts] = (1 << 6) | (1 << 7),
- .nr_counter_reset = 1,
-};
-static const struct ds_configuration ds_cfg_core2_atom = {
- .name = "Core 2/Atom",
- .ctl[dsf_bts] = (1 << 6) | (1 << 7),
- .ctl[dsf_bts_kernel] = (1 << 9),
- .ctl[dsf_bts_user] = (1 << 10),
- .nr_counter_reset = 1,
-};
-static const struct ds_configuration ds_cfg_core_i7 = {
- .name = "Core i7",
- .ctl[dsf_bts] = (1 << 6) | (1 << 7),
- .ctl[dsf_bts_kernel] = (1 << 9),
- .ctl[dsf_bts_user] = (1 << 10),
- .nr_counter_reset = 4,
-};
-
-static void
-ds_configure(const struct ds_configuration *cfg,
- struct cpuinfo_x86 *cpu)
-{
- unsigned long nr_pebs_fields = 0;
-
- printk(KERN_INFO "[ds] using %s configuration\n", cfg->name);
-
-#ifdef __i386__
- nr_pebs_fields = 10;
-#else
- nr_pebs_fields = 18;
-#endif
-
- /*
- * Starting with version 2, architectural performance
- * monitoring supports a format specifier.
- */
- if ((cpuid_eax(0xa) & 0xff) > 1) {
- unsigned long perf_capabilities, format;
-
- rdmsrl(MSR_IA32_PERF_CAPABILITIES, perf_capabilities);
-
- format = (perf_capabilities >> 8) & 0xf;
-
- switch (format) {
- case 0:
- nr_pebs_fields = 18;
- break;
- case 1:
- nr_pebs_fields = 22;
- break;
- default:
- printk(KERN_INFO
- "[ds] unknown PEBS format: %lu\n", format);
- nr_pebs_fields = 0;
- break;
- }
- }
-
- memset(&ds_cfg, 0, sizeof(ds_cfg));
- ds_cfg = *cfg;
-
- ds_cfg.sizeof_ptr_field =
- (cpu_has(cpu, X86_FEATURE_DTES64) ? 8 : 4);
-
- ds_cfg.sizeof_rec[ds_bts] = ds_cfg.sizeof_ptr_field * 3;
- ds_cfg.sizeof_rec[ds_pebs] = ds_cfg.sizeof_ptr_field * nr_pebs_fields;
-
- if (!cpu_has(cpu, X86_FEATURE_BTS)) {
- ds_cfg.sizeof_rec[ds_bts] = 0;
- printk(KERN_INFO "[ds] bts not available\n");
- }
- if (!cpu_has(cpu, X86_FEATURE_PEBS)) {
- ds_cfg.sizeof_rec[ds_pebs] = 0;
- printk(KERN_INFO "[ds] pebs not available\n");
- }
-
- printk(KERN_INFO "[ds] sizes: address: %u bit, ",
- 8 * ds_cfg.sizeof_ptr_field);
- printk("bts/pebs record: %u/%u bytes\n",
- ds_cfg.sizeof_rec[ds_bts], ds_cfg.sizeof_rec[ds_pebs]);
-
- WARN_ON_ONCE(MAX_PEBS_COUNTERS < ds_cfg.nr_counter_reset);
-}
-
-void __cpuinit ds_init_intel(struct cpuinfo_x86 *c)
-{
- /* Only configure the first cpu. Others are identical. */
- if (ds_cfg.name)
- return;
-
- switch (c->x86) {
- case 0x6:
- switch (c->x86_model) {
- case 0x9:
- case 0xd: /* Pentium M */
- ds_configure(&ds_cfg_pentium_m, c);
- break;
- case 0xf:
- case 0x17: /* Core2 */
- case 0x1c: /* Atom */
- ds_configure(&ds_cfg_core2_atom, c);
- break;
- case 0x1a: /* Core i7 */
- ds_configure(&ds_cfg_core_i7, c);
- break;
- default:
- /* Sorry, don't know about them. */
- break;
- }
- break;
- case 0xf:
- switch (c->x86_model) {
- case 0x0:
- case 0x1:
- case 0x2: /* Netburst */
- ds_configure(&ds_cfg_netburst, c);
- break;
- default:
- /* Sorry, don't know about them. */
- break;
- }
- break;
- default:
- /* Sorry, don't know about them. */
- break;
- }
-}
-
-static inline void ds_take_timestamp(struct ds_context *context,
- enum bts_qualifier qualifier,
- struct task_struct *task)
-{
- struct bts_tracer *tracer = context->bts_master;
- struct bts_struct ts;
-
- /* Prevent compilers from reading the tracer pointer twice. */
- barrier();
-
- if (!tracer || !(tracer->flags & BTS_TIMESTAMPS))
- return;
-
- memset(&ts, 0, sizeof(ts));
- ts.qualifier = qualifier;
- ts.variant.event.clock = trace_clock_global();
- ts.variant.event.pid = task->pid;
-
- bts_write(tracer, &ts);
-}
-
-/*
- * Change the DS configuration from tracing prev to tracing next.
- */
-void ds_switch_to(struct task_struct *prev, struct task_struct *next)
-{
- struct ds_context *prev_ctx = prev->thread.ds_ctx;
- struct ds_context *next_ctx = next->thread.ds_ctx;
- unsigned long debugctlmsr = next->thread.debugctlmsr;
-
- /* Make sure all data is read before we start. */
- barrier();
-
- if (prev_ctx) {
- update_debugctlmsr(0);
-
- ds_take_timestamp(prev_ctx, bts_task_departs, prev);
- }
-
- if (next_ctx) {
- ds_take_timestamp(next_ctx, bts_task_arrives, next);
-
- wrmsrl(MSR_IA32_DS_AREA, (unsigned long)next_ctx->ds);
- }
-
- update_debugctlmsr(debugctlmsr);
-}
-
-static __init int ds_selftest(void)
-{
- if (ds_cfg.sizeof_rec[ds_bts]) {
- int error;
-
- error = ds_selftest_bts();
- if (error) {
- WARN(1, "[ds] selftest failed. disabling bts.\n");
- ds_cfg.sizeof_rec[ds_bts] = 0;
- }
- }
-
- if (ds_cfg.sizeof_rec[ds_pebs]) {
- int error;
-
- error = ds_selftest_pebs();
- if (error) {
- WARN(1, "[ds] selftest failed. disabling pebs.\n");
- ds_cfg.sizeof_rec[ds_pebs] = 0;
- }
- }
-
- return 0;
-}
-device_initcall(ds_selftest);
diff --git a/arch/x86/kernel/ds_selftest.c b/arch/x86/kernel/ds_selftest.c
deleted file mode 100644
index 6bc7c199ab99..000000000000
--- a/arch/x86/kernel/ds_selftest.c
+++ /dev/null
@@ -1,408 +0,0 @@
-/*
- * Debug Store support - selftest
- *
- *
- * Copyright (C) 2009 Intel Corporation.
- * Markus Metzger <markus.t.metzger@intel.com>, 2009
- */
-
-#include "ds_selftest.h"
-
-#include <linux/kernel.h>
-#include <linux/string.h>
-#include <linux/smp.h>
-#include <linux/cpu.h>
-
-#include <asm/ds.h>
-
-
-#define BUFFER_SIZE 521 /* Intentionally chose an odd size. */
-#define SMALL_BUFFER_SIZE 24 /* A single bts entry. */
-
-struct ds_selftest_bts_conf {
- struct bts_tracer *tracer;
- int error;
- int (*suspend)(struct bts_tracer *);
- int (*resume)(struct bts_tracer *);
-};
-
-static int ds_selftest_bts_consistency(const struct bts_trace *trace)
-{
- int error = 0;
-
- if (!trace) {
- printk(KERN_CONT "failed to access trace...");
- /* Bail out. Other tests are pointless. */
- return -1;
- }
-
- if (!trace->read) {
- printk(KERN_CONT "bts read not available...");
- error = -1;
- }
-
- /* Do some sanity checks on the trace configuration. */
- if (!trace->ds.n) {
- printk(KERN_CONT "empty bts buffer...");
- error = -1;
- }
- if (!trace->ds.size) {
- printk(KERN_CONT "bad bts trace setup...");
- error = -1;
- }
- if (trace->ds.end !=
- (char *)trace->ds.begin + (trace->ds.n * trace->ds.size)) {
- printk(KERN_CONT "bad bts buffer setup...");
- error = -1;
- }
- /*
- * We allow top in [begin; end], since its not clear when the
- * overflow adjustment happens: after the increment or before the
- * write.
- */
- if ((trace->ds.top < trace->ds.begin) ||
- (trace->ds.end < trace->ds.top)) {
- printk(KERN_CONT "bts top out of bounds...");
- error = -1;
- }
-
- return error;
-}
-
-static int ds_selftest_bts_read(struct bts_tracer *tracer,
- const struct bts_trace *trace,
- const void *from, const void *to)
-{
- const unsigned char *at;
-
- /*
- * Check a few things which do not belong to this test.
- * They should be covered by other tests.
- */
- if (!trace)
- return -1;
-
- if (!trace->read)
- return -1;
-
- if (to < from)
- return -1;
-
- if (from < trace->ds.begin)
- return -1;
-
- if (trace->ds.end < to)
- return -1;
-
- if (!trace->ds.size)
- return -1;
-
- /* Now to the test itself. */
- for (at = from; (void *)at < to; at += trace->ds.size) {
- struct bts_struct bts;
- unsigned long index;
- int error;
-
- if (((void *)at - trace->ds.begin) % trace->ds.size) {
- printk(KERN_CONT
- "read from non-integer index...");
- return -1;
- }
- index = ((void *)at - trace->ds.begin) / trace->ds.size;
-
- memset(&bts, 0, sizeof(bts));
- error = trace->read(tracer, at, &bts);
- if (error < 0) {
- printk(KERN_CONT
- "error reading bts trace at [%lu] (0x%p)...",
- index, at);
- return error;
- }
-
- switch (bts.qualifier) {
- case BTS_BRANCH:
- break;
- default:
- printk(KERN_CONT
- "unexpected bts entry %llu at [%lu] (0x%p)...",
- bts.qualifier, index, at);
- return -1;
- }
- }
-
- return 0;
-}
-
-static void ds_selftest_bts_cpu(void *arg)
-{
- struct ds_selftest_bts_conf *conf = arg;
- const struct bts_trace *trace;
- void *top;
-
- if (IS_ERR(conf->tracer)) {
- conf->error = PTR_ERR(conf->tracer);
- conf->tracer = NULL;
-
- printk(KERN_CONT
- "initialization failed (err: %d)...", conf->error);
- return;
- }
-
- /* We should meanwhile have enough trace. */
- conf->error = conf->suspend(conf->tracer);
- if (conf->error < 0)
- return;
-
- /* Let's see if we can access the trace. */
- trace = ds_read_bts(conf->tracer);
-
- conf->error = ds_selftest_bts_consistency(trace);
- if (conf->error < 0)
- return;
-
- /* If everything went well, we should have a few trace entries. */
- if (trace->ds.top == trace->ds.begin) {
- /*
- * It is possible but highly unlikely that we got a
- * buffer overflow and end up at exactly the same
- * position we started from.
- * Let's issue a warning, but continue.
- */
- printk(KERN_CONT "no trace/overflow...");
- }
-
- /* Let's try to read the trace we collected. */
- conf->error =
- ds_selftest_bts_read(conf->tracer, trace,
- trace->ds.begin, trace->ds.top);
- if (conf->error < 0)
- return;
-
- /*
- * Let's read the trace again.
- * Since we suspended tracing, we should get the same result.
- */
- top = trace->ds.top;
-
- trace = ds_read_bts(conf->tracer);
- conf->error = ds_selftest_bts_consistency(trace);
- if (conf->error < 0)
- return;
-
- if (top != trace->ds.top) {
- printk(KERN_CONT "suspend not working...");
- conf->error = -1;
- return;
- }
-
- /* Let's collect some more trace - see if resume is working. */
- conf->error = conf->resume(conf->tracer);
- if (conf->error < 0)
- return;
-
- conf->error = conf->suspend(conf->tracer);
- if (conf->error < 0)
- return;
-
- trace = ds_read_bts(conf->tracer);
-
- conf->error = ds_selftest_bts_consistency(trace);
- if (conf->error < 0)
- return;
-
- if (trace->ds.top == top) {
- /*
- * It is possible but highly unlikely that we got a
- * buffer overflow and end up at exactly the same
- * position we started from.
- * Let's issue a warning and check the full trace.
- */
- printk(KERN_CONT
- "no resume progress/overflow...");
-
- conf->error =
- ds_selftest_bts_read(conf->tracer, trace,
- trace->ds.begin, trace->ds.end);
- } else if (trace->ds.top < top) {
- /*
- * We had a buffer overflow - the entire buffer should
- * contain trace records.
- */
- conf->error =
- ds_selftest_bts_read(conf->tracer, trace,
- trace->ds.begin, trace->ds.end);
- } else {
- /*
- * It is quite likely that the buffer did not overflow.
- * Let's just check the delta trace.
- */
- conf->error =
- ds_selftest_bts_read(conf->tracer, trace, top,
- trace->ds.top);
- }
- if (conf->error < 0)
- return;
-
- conf->error = 0;
-}
-
-static int ds_suspend_bts_wrap(struct bts_tracer *tracer)
-{
- ds_suspend_bts(tracer);
- return 0;
-}
-
-static int ds_resume_bts_wrap(struct bts_tracer *tracer)
-{
- ds_resume_bts(tracer);
- return 0;
-}
-
-static void ds_release_bts_noirq_wrap(void *tracer)
-{
- (void)ds_release_bts_noirq(tracer);
-}
-
-static int ds_selftest_bts_bad_release_noirq(int cpu,
- struct bts_tracer *tracer)
-{
- int error = -EPERM;
-
- /* Try to release the tracer on the wrong cpu. */
- get_cpu();
- if (cpu != smp_processor_id()) {
- error = ds_release_bts_noirq(tracer);
- if (error != -EPERM)
- printk(KERN_CONT "release on wrong cpu...");
- }
- put_cpu();
-
- return error ? 0 : -1;
-}
-
-static int ds_selftest_bts_bad_request_cpu(int cpu, void *buffer)
-{
- struct bts_tracer *tracer;
- int error;
-
- /* Try to request cpu tracing while task tracing is active. */
- tracer = ds_request_bts_cpu(cpu, buffer, BUFFER_SIZE, NULL,
- (size_t)-1, BTS_KERNEL);
- error = PTR_ERR(tracer);
- if (!IS_ERR(tracer)) {
- ds_release_bts(tracer);
- error = 0;
- }
-
- if (error != -EPERM)
- printk(KERN_CONT "cpu/task tracing overlap...");
-
- return error ? 0 : -1;
-}
-
-static int ds_selftest_bts_bad_request_task(void *buffer)
-{
- struct bts_tracer *tracer;
- int error;
-
- /* Try to request cpu tracing while task tracing is active. */
- tracer = ds_request_bts_task(current, buffer, BUFFER_SIZE, NULL,
- (size_t)-1, BTS_KERNEL);
- error = PTR_ERR(tracer);
- if (!IS_ERR(tracer)) {
- error = 0;
- ds_release_bts(tracer);
- }
-
- if (error != -EPERM)
- printk(KERN_CONT "task/cpu tracing overlap...");
-
- return error ? 0 : -1;
-}
-
-int ds_selftest_bts(void)
-{
- struct ds_selftest_bts_conf conf;
- unsigned char buffer[BUFFER_SIZE], *small_buffer;
- unsigned long irq;
- int cpu;
-
- printk(KERN_INFO "[ds] bts selftest...");
- conf.error = 0;
-
- small_buffer = (unsigned char *)ALIGN((unsigned long)buffer, 8) + 8;
-
- get_online_cpus();
- for_each_online_cpu(cpu) {
- conf.suspend = ds_suspend_bts_wrap;
- conf.resume = ds_resume_bts_wrap;
- conf.tracer =
- ds_request_bts_cpu(cpu, buffer, BUFFER_SIZE,
- NULL, (size_t)-1, BTS_KERNEL);
- ds_selftest_bts_cpu(&conf);
- if (conf.error >= 0)
- conf.error = ds_selftest_bts_bad_request_task(buffer);
- ds_release_bts(conf.tracer);
- if (conf.error < 0)
- goto out;
-
- conf.suspend = ds_suspend_bts_noirq;
- conf.resume = ds_resume_bts_noirq;
- conf.tracer =
- ds_request_bts_cpu(cpu, buffer, BUFFER_SIZE,
- NULL, (size_t)-1, BTS_KERNEL);
- smp_call_function_single(cpu, ds_selftest_bts_cpu, &conf, 1);
- if (conf.error >= 0) {
- conf.error =
- ds_selftest_bts_bad_release_noirq(cpu,
- conf.tracer);
- /* We must not release the tracer twice. */
- if (conf.error < 0)
- conf.tracer = NULL;
- }
- if (conf.error >= 0)
- conf.error = ds_selftest_bts_bad_request_task(buffer);
- smp_call_function_single(cpu, ds_release_bts_noirq_wrap,
- conf.tracer, 1);
- if (conf.error < 0)
- goto out;
- }
-
- conf.suspend = ds_suspend_bts_wrap;
- conf.resume = ds_resume_bts_wrap;
- conf.tracer =
- ds_request_bts_task(current, buffer, BUFFER_SIZE,
- NULL, (size_t)-1, BTS_KERNEL);
- ds_selftest_bts_cpu(&conf);
- if (conf.error >= 0)
- conf.error = ds_selftest_bts_bad_request_cpu(0, buffer);
- ds_release_bts(conf.tracer);
- if (conf.error < 0)
- goto out;
-
- conf.suspend = ds_suspend_bts_noirq;
- conf.resume = ds_resume_bts_noirq;
- conf.tracer =
- ds_request_bts_task(current, small_buffer, SMALL_BUFFER_SIZE,
- NULL, (size_t)-1, BTS_KERNEL);
- local_irq_save(irq);
- ds_selftest_bts_cpu(&conf);
- if (conf.error >= 0)
- conf.error = ds_selftest_bts_bad_request_cpu(0, buffer);
- ds_release_bts_noirq(conf.tracer);
- local_irq_restore(irq);
- if (conf.error < 0)
- goto out;
-
- conf.error = 0;
- out:
- put_online_cpus();
- printk(KERN_CONT "%s.\n", (conf.error ? "failed" : "passed"));
-
- return conf.error;
-}
-
-int ds_selftest_pebs(void)
-{
- return 0;
-}
diff --git a/arch/x86/kernel/ds_selftest.h b/arch/x86/kernel/ds_selftest.h
deleted file mode 100644
index 2ba8745c6663..000000000000
--- a/arch/x86/kernel/ds_selftest.h
+++ /dev/null
@@ -1,15 +0,0 @@
-/*
- * Debug Store support - selftest
- *
- *
- * Copyright (C) 2009 Intel Corporation.
- * Markus Metzger <markus.t.metzger@intel.com>, 2009
- */
-
-#ifdef CONFIG_X86_DS_SELFTEST
-extern int ds_selftest_bts(void);
-extern int ds_selftest_pebs(void);
-#else
-static inline int ds_selftest_bts(void) { return 0; }
-static inline int ds_selftest_pebs(void) { return 0; }
-#endif
diff --git a/arch/x86/kernel/dumpstack.c b/arch/x86/kernel/dumpstack.c
index 6d817554780a..c89a386930b7 100644
--- a/arch/x86/kernel/dumpstack.c
+++ b/arch/x86/kernel/dumpstack.c
@@ -224,11 +224,6 @@ unsigned __kprobes long oops_begin(void)
int cpu;
unsigned long flags;
- /* notify the hw-branch tracer so it may disable tracing and
- add the last trace to the trace buffer -
- the earlier this happens, the more useful the trace. */
- trace_hw_branch_oops();
-
oops_enter();
/* racy, but better than risking deadlock. */
diff --git a/arch/x86/kernel/hw_breakpoint.c b/arch/x86/kernel/hw_breakpoint.c
index d6cc065f519f..a8f1b803d2fd 100644
--- a/arch/x86/kernel/hw_breakpoint.c
+++ b/arch/x86/kernel/hw_breakpoint.c
@@ -189,25 +189,16 @@ static int get_hbp_len(u8 hbp_len)
}
/*
- * Check for virtual address in user space.
- */
-int arch_check_va_in_userspace(unsigned long va, u8 hbp_len)
-{
- unsigned int len;
-
- len = get_hbp_len(hbp_len);
-
- return (va <= TASK_SIZE - len);
-}
-
-/*
* Check for virtual address in kernel space.
*/
-static int arch_check_va_in_kernelspace(unsigned long va, u8 hbp_len)
+int arch_check_bp_in_kernelspace(struct perf_event *bp)
{
unsigned int len;
+ unsigned long va;
+ struct arch_hw_breakpoint *info = counter_arch_bp(bp);
- len = get_hbp_len(hbp_len);
+ va = info->address;
+ len = get_hbp_len(info->len);
return (va >= TASK_SIZE) && ((va + len - 1) >= TASK_SIZE);
}
@@ -300,8 +291,7 @@ static int arch_build_bp_info(struct perf_event *bp)
/*
* Validate the arch-specific HW Breakpoint register settings
*/
-int arch_validate_hwbkpt_settings(struct perf_event *bp,
- struct task_struct *tsk)
+int arch_validate_hwbkpt_settings(struct perf_event *bp)
{
struct arch_hw_breakpoint *info = counter_arch_bp(bp);
unsigned int align;
@@ -314,16 +304,6 @@ int arch_validate_hwbkpt_settings(struct perf_event *bp,
ret = -EINVAL;
- if (info->type == X86_BREAKPOINT_EXECUTE)
- /*
- * Ptrace-refactoring code
- * For now, we'll allow instruction breakpoint only for user-space
- * addresses
- */
- if ((!arch_check_va_in_userspace(info->address, info->len)) &&
- info->len != X86_BREAKPOINT_EXECUTE)
- return ret;
-
switch (info->len) {
case X86_BREAKPOINT_LEN_1:
align = 0;
@@ -350,15 +330,6 @@ int arch_validate_hwbkpt_settings(struct perf_event *bp,
if (info->address & align)
return -EINVAL;
- /* Check that the virtual address is in the proper range */
- if (tsk) {
- if (!arch_check_va_in_userspace(info->address, info->len))
- return -EFAULT;
- } else {
- if (!arch_check_va_in_kernelspace(info->address, info->len))
- return -EFAULT;
- }
-
return 0;
}
diff --git a/arch/x86/kernel/i8253.c b/arch/x86/kernel/i8253.c
index 23c167925a5c..2dfd31597443 100644
--- a/arch/x86/kernel/i8253.c
+++ b/arch/x86/kernel/i8253.c
@@ -16,7 +16,7 @@
#include <asm/hpet.h>
#include <asm/smp.h>
-DEFINE_SPINLOCK(i8253_lock);
+DEFINE_RAW_SPINLOCK(i8253_lock);
EXPORT_SYMBOL(i8253_lock);
/*
@@ -33,7 +33,7 @@ struct clock_event_device *global_clock_event;
static void init_pit_timer(enum clock_event_mode mode,
struct clock_event_device *evt)
{
- spin_lock(&i8253_lock);
+ raw_spin_lock(&i8253_lock);
switch (mode) {
case CLOCK_EVT_MODE_PERIODIC:
@@ -62,7 +62,7 @@ static void init_pit_timer(enum clock_event_mode mode,
/* Nothing to do here */
break;
}
- spin_unlock(&i8253_lock);
+ raw_spin_unlock(&i8253_lock);
}
/*
@@ -72,10 +72,10 @@ static void init_pit_timer(enum clock_event_mode mode,
*/
static int pit_next_event(unsigned long delta, struct clock_event_device *evt)
{
- spin_lock(&i8253_lock);
+ raw_spin_lock(&i8253_lock);
outb_pit(delta & 0xff , PIT_CH0); /* LSB */
outb_pit(delta >> 8 , PIT_CH0); /* MSB */
- spin_unlock(&i8253_lock);
+ raw_spin_unlock(&i8253_lock);
return 0;
}
@@ -130,7 +130,7 @@ static cycle_t pit_read(struct clocksource *cs)
int count;
u32 jifs;
- spin_lock_irqsave(&i8253_lock, flags);
+ raw_spin_lock_irqsave(&i8253_lock, flags);
/*
* Although our caller may have the read side of xtime_lock,
* this is now a seqlock, and we are cheating in this routine
@@ -176,7 +176,7 @@ static cycle_t pit_read(struct clocksource *cs)
old_count = count;
old_jifs = jifs;
- spin_unlock_irqrestore(&i8253_lock, flags);
+ raw_spin_unlock_irqrestore(&i8253_lock, flags);
count = (LATCH - 1) - count;
diff --git a/arch/x86/kernel/kprobes.c b/arch/x86/kernel/kprobes.c
index b43bbaebe2c0..345a4b1fe144 100644
--- a/arch/x86/kernel/kprobes.c
+++ b/arch/x86/kernel/kprobes.c
@@ -422,14 +422,22 @@ static void __kprobes set_current_kprobe(struct kprobe *p, struct pt_regs *regs,
static void __kprobes clear_btf(void)
{
- if (test_thread_flag(TIF_DEBUGCTLMSR))
- update_debugctlmsr(0);
+ if (test_thread_flag(TIF_BLOCKSTEP)) {
+ unsigned long debugctl = get_debugctlmsr();
+
+ debugctl &= ~DEBUGCTLMSR_BTF;
+ update_debugctlmsr(debugctl);
+ }
}
static void __kprobes restore_btf(void)
{
- if (test_thread_flag(TIF_DEBUGCTLMSR))
- update_debugctlmsr(current->thread.debugctlmsr);
+ if (test_thread_flag(TIF_BLOCKSTEP)) {
+ unsigned long debugctl = get_debugctlmsr();
+
+ debugctl |= DEBUGCTLMSR_BTF;
+ update_debugctlmsr(debugctl);
+ }
}
void __kprobes arch_prepare_kretprobe(struct kretprobe_instance *ri,
@@ -534,20 +542,6 @@ static int __kprobes kprobe_handler(struct pt_regs *regs)
struct kprobe_ctlblk *kcb;
addr = (kprobe_opcode_t *)(regs->ip - sizeof(kprobe_opcode_t));
- if (*addr != BREAKPOINT_INSTRUCTION) {
- /*
- * The breakpoint instruction was removed right
- * after we hit it. Another cpu has removed
- * either a probepoint or a debugger breakpoint
- * at this address. In either case, no further
- * handling of this interrupt is appropriate.
- * Back up over the (now missing) int3 and run
- * the original instruction.
- */
- regs->ip = (unsigned long)addr;
- return 1;
- }
-
/*
* We don't want to be preempted for the entire
* duration of kprobe processing. We conditionally
@@ -579,6 +573,19 @@ static int __kprobes kprobe_handler(struct pt_regs *regs)
setup_singlestep(p, regs, kcb, 0);
return 1;
}
+ } else if (*addr != BREAKPOINT_INSTRUCTION) {
+ /*
+ * The breakpoint instruction was removed right
+ * after we hit it. Another cpu has removed
+ * either a probepoint or a debugger breakpoint
+ * at this address. In either case, no further
+ * handling of this interrupt is appropriate.
+ * Back up over the (now missing) int3 and run
+ * the original instruction.
+ */
+ regs->ip = (unsigned long)addr;
+ preempt_enable_no_resched();
+ return 1;
} else if (kprobe_running()) {
p = __get_cpu_var(current_kprobe);
if (p->break_handler && p->break_handler(p, regs)) {
diff --git a/arch/x86/kernel/process.c b/arch/x86/kernel/process.c
index f18fd9c15247..e7e35219b32f 100644
--- a/arch/x86/kernel/process.c
+++ b/arch/x86/kernel/process.c
@@ -20,7 +20,6 @@
#include <asm/idle.h>
#include <asm/uaccess.h>
#include <asm/i387.h>
-#include <asm/ds.h>
#include <asm/debugreg.h>
unsigned long idle_halt;
@@ -48,7 +47,6 @@ int arch_dup_task_struct(struct task_struct *dst, struct task_struct *src)
void free_thread_xstate(struct task_struct *tsk)
{
fpu_free(&tsk->thread.fpu);
- WARN(tsk->thread.ds_ctx, "leaking DS context\n");
}
void free_thread_info(struct thread_info *ti)
@@ -195,11 +193,16 @@ void __switch_to_xtra(struct task_struct *prev_p, struct task_struct *next_p,
prev = &prev_p->thread;
next = &next_p->thread;
- if (test_tsk_thread_flag(next_p, TIF_DS_AREA_MSR) ||
- test_tsk_thread_flag(prev_p, TIF_DS_AREA_MSR))
- ds_switch_to(prev_p, next_p);
- else if (next->debugctlmsr != prev->debugctlmsr)
- update_debugctlmsr(next->debugctlmsr);
+ if (test_tsk_thread_flag(prev_p, TIF_BLOCKSTEP) ^
+ test_tsk_thread_flag(next_p, TIF_BLOCKSTEP)) {
+ unsigned long debugctl = get_debugctlmsr();
+
+ debugctl &= ~DEBUGCTLMSR_BTF;
+ if (test_tsk_thread_flag(next_p, TIF_BLOCKSTEP))
+ debugctl |= DEBUGCTLMSR_BTF;
+
+ update_debugctlmsr(debugctl);
+ }
if (test_tsk_thread_flag(prev_p, TIF_NOTSC) ^
test_tsk_thread_flag(next_p, TIF_NOTSC)) {
@@ -543,11 +546,13 @@ static int __cpuinit check_c1e_idle(const struct cpuinfo_x86 *c)
* check OSVW bit for CPUs that are not affected
* by erratum #400
*/
- rdmsrl(MSR_AMD64_OSVW_ID_LENGTH, val);
- if (val >= 2) {
- rdmsrl(MSR_AMD64_OSVW_STATUS, val);
- if (!(val & BIT(1)))
- goto no_c1e_idle;
+ if (cpu_has(c, X86_FEATURE_OSVW)) {
+ rdmsrl(MSR_AMD64_OSVW_ID_LENGTH, val);
+ if (val >= 2) {
+ rdmsrl(MSR_AMD64_OSVW_STATUS, val);
+ if (!(val & BIT(1)))
+ goto no_c1e_idle;
+ }
}
return 1;
}
diff --git a/arch/x86/kernel/process_32.c b/arch/x86/kernel/process_32.c
index 0a7a4f5bbaa9..8d128783af47 100644
--- a/arch/x86/kernel/process_32.c
+++ b/arch/x86/kernel/process_32.c
@@ -55,7 +55,6 @@
#include <asm/cpu.h>
#include <asm/idle.h>
#include <asm/syscalls.h>
-#include <asm/ds.h>
#include <asm/debugreg.h>
asmlinkage void ret_from_fork(void) __asm__("ret_from_fork");
@@ -238,13 +237,6 @@ int copy_thread(unsigned long clone_flags, unsigned long sp,
kfree(p->thread.io_bitmap_ptr);
p->thread.io_bitmap_max = 0;
}
-
- clear_tsk_thread_flag(p, TIF_DS_AREA_MSR);
- p->thread.ds_ctx = NULL;
-
- clear_tsk_thread_flag(p, TIF_DEBUGCTLMSR);
- p->thread.debugctlmsr = 0;
-
return err;
}
diff --git a/arch/x86/kernel/process_64.c b/arch/x86/kernel/process_64.c
index 979215f51985..3c2422a99f1f 100644
--- a/arch/x86/kernel/process_64.c
+++ b/arch/x86/kernel/process_64.c
@@ -49,7 +49,6 @@
#include <asm/ia32.h>
#include <asm/idle.h>
#include <asm/syscalls.h>
-#include <asm/ds.h>
#include <asm/debugreg.h>
asmlinkage extern void ret_from_fork(void);
@@ -313,13 +312,6 @@ int copy_thread(unsigned long clone_flags, unsigned long sp,
if (err)
goto out;
}
-
- clear_tsk_thread_flag(p, TIF_DS_AREA_MSR);
- p->thread.ds_ctx = NULL;
-
- clear_tsk_thread_flag(p, TIF_DEBUGCTLMSR);
- p->thread.debugctlmsr = 0;
-
err = 0;
out:
if (err && p->thread.io_bitmap_ptr) {
diff --git a/arch/x86/kernel/ptrace.c b/arch/x86/kernel/ptrace.c
index 2e9b55027b7e..70c4872cd8aa 100644
--- a/arch/x86/kernel/ptrace.c
+++ b/arch/x86/kernel/ptrace.c
@@ -2,9 +2,6 @@
/*
* Pentium III FXSR, SSE support
* Gareth Hughes <gareth@valinux.com>, May 2000
- *
- * BTS tracing
- * Markus Metzger <markus.t.metzger@intel.com>, Dec 2007
*/
#include <linux/kernel.h>
@@ -22,7 +19,6 @@
#include <linux/audit.h>
#include <linux/seccomp.h>
#include <linux/signal.h>
-#include <linux/workqueue.h>
#include <linux/perf_event.h>
#include <linux/hw_breakpoint.h>
@@ -36,7 +32,6 @@
#include <asm/desc.h>
#include <asm/prctl.h>
#include <asm/proto.h>
-#include <asm/ds.h>
#include <asm/hw_breakpoint.h>
#include "tls.h"
@@ -693,7 +688,7 @@ static int ptrace_set_breakpoint_addr(struct task_struct *tsk, int nr,
struct perf_event_attr attr;
if (!t->ptrace_bps[nr]) {
- hw_breakpoint_init(&attr);
+ ptrace_breakpoint_init(&attr);
/*
* Put stub len and type to register (reserve) an inactive but
* correct bp
@@ -789,342 +784,6 @@ static int ioperm_get(struct task_struct *target,
0, IO_BITMAP_BYTES);
}
-#ifdef CONFIG_X86_PTRACE_BTS
-/*
- * A branch trace store context.
- *
- * Contexts may only be installed by ptrace_bts_config() and only for
- * ptraced tasks.
- *
- * Contexts are destroyed when the tracee is detached from the tracer.
- * The actual destruction work requires interrupts enabled, so the
- * work is deferred and will be scheduled during __ptrace_unlink().
- *
- * Contexts hold an additional task_struct reference on the traced
- * task, as well as a reference on the tracer's mm.
- *
- * Ptrace already holds a task_struct for the duration of ptrace operations,
- * but since destruction is deferred, it may be executed after both
- * tracer and tracee exited.
- */
-struct bts_context {
- /* The branch trace handle. */
- struct bts_tracer *tracer;
-
- /* The buffer used to store the branch trace and its size. */
- void *buffer;
- unsigned int size;
-
- /* The mm that paid for the above buffer. */
- struct mm_struct *mm;
-
- /* The task this context belongs to. */
- struct task_struct *task;
-
- /* The signal to send on a bts buffer overflow. */
- unsigned int bts_ovfl_signal;
-
- /* The work struct to destroy a context. */
- struct work_struct work;
-};
-
-static int alloc_bts_buffer(struct bts_context *context, unsigned int size)
-{
- void *buffer = NULL;
- int err = -ENOMEM;
-
- err = account_locked_memory(current->mm, current->signal->rlim, size);
- if (err < 0)
- return err;
-
- buffer = kzalloc(size, GFP_KERNEL);
- if (!buffer)
- goto out_refund;
-
- context->buffer = buffer;
- context->size = size;
- context->mm = get_task_mm(current);
-
- return 0;
-
- out_refund:
- refund_locked_memory(current->mm, size);
- return err;
-}
-
-static inline void free_bts_buffer(struct bts_context *context)
-{
- if (!context->buffer)
- return;
-
- kfree(context->buffer);
- context->buffer = NULL;
-
- refund_locked_memory(context->mm, context->size);
- context->size = 0;
-
- mmput(context->mm);
- context->mm = NULL;
-}
-
-static void free_bts_context_work(struct work_struct *w)
-{
- struct bts_context *context;
-
- context = container_of(w, struct bts_context, work);
-
- ds_release_bts(context->tracer);
- put_task_struct(context->task);
- free_bts_buffer(context);
- kfree(context);
-}
-
-static inline void free_bts_context(struct bts_context *context)
-{
- INIT_WORK(&context->work, free_bts_context_work);
- schedule_work(&context->work);
-}
-
-static inline struct bts_context *alloc_bts_context(struct task_struct *task)
-{
- struct bts_context *context = kzalloc(sizeof(*context), GFP_KERNEL);
- if (context) {
- context->task = task;
- task->bts = context;
-
- get_task_struct(task);
- }
-
- return context;
-}
-
-static int ptrace_bts_read_record(struct task_struct *child, size_t index,
- struct bts_struct __user *out)
-{
- struct bts_context *context;
- const struct bts_trace *trace;
- struct bts_struct bts;
- const unsigned char *at;
- int error;
-
- context = child->bts;
- if (!context)
- return -ESRCH;
-
- trace = ds_read_bts(context->tracer);
- if (!trace)
- return -ESRCH;
-
- at = trace->ds.top - ((index + 1) * trace->ds.size);
- if ((void *)at < trace->ds.begin)
- at += (trace->ds.n * trace->ds.size);
-
- if (!trace->read)
- return -EOPNOTSUPP;
-
- error = trace->read(context->tracer, at, &bts);
- if (error < 0)
- return error;
-
- if (copy_to_user(out, &bts, sizeof(bts)))
- return -EFAULT;
-
- return sizeof(bts);
-}
-
-static int ptrace_bts_drain(struct task_struct *child,
- long size,
- struct bts_struct __user *out)
-{
- struct bts_context *context;
- const struct bts_trace *trace;
- const unsigned char *at;
- int error, drained = 0;
-
- context = child->bts;
- if (!context)
- return -ESRCH;
-
- trace = ds_read_bts(context->tracer);
- if (!trace)
- return -ESRCH;
-
- if (!trace->read)
- return -EOPNOTSUPP;
-
- if (size < (trace->ds.top - trace->ds.begin))
- return -EIO;
-
- for (at = trace->ds.begin; (void *)at < trace->ds.top;
- out++, drained++, at += trace->ds.size) {
- struct bts_struct bts;
-
- error = trace->read(context->tracer, at, &bts);
- if (error < 0)
- return error;
-
- if (copy_to_user(out, &bts, sizeof(bts)))
- return -EFAULT;
- }
-
- memset(trace->ds.begin, 0, trace->ds.n * trace->ds.size);
-
- error = ds_reset_bts(context->tracer);
- if (error < 0)
- return error;
-
- return drained;
-}
-
-static int ptrace_bts_config(struct task_struct *child,
- long cfg_size,
- const struct ptrace_bts_config __user *ucfg)
-{
- struct bts_context *context;
- struct ptrace_bts_config cfg;
- unsigned int flags = 0;
-
- if (cfg_size < sizeof(cfg))
- return -EIO;
-
- if (copy_from_user(&cfg, ucfg, sizeof(cfg)))
- return -EFAULT;
-
- context = child->bts;
- if (!context)
- context = alloc_bts_context(child);
- if (!context)
- return -ENOMEM;
-
- if (cfg.flags & PTRACE_BTS_O_SIGNAL) {
- if (!cfg.signal)
- return -EINVAL;
-
- return -EOPNOTSUPP;
- context->bts_ovfl_signal = cfg.signal;
- }
-
- ds_release_bts(context->tracer);
- context->tracer = NULL;
-
- if ((cfg.flags & PTRACE_BTS_O_ALLOC) && (cfg.size != context->size)) {
- int err;
-
- free_bts_buffer(context);
- if (!cfg.size)
- return 0;
-
- err = alloc_bts_buffer(context, cfg.size);
- if (err < 0)
- return err;
- }
-
- if (cfg.flags & PTRACE_BTS_O_TRACE)
- flags |= BTS_USER;
-
- if (cfg.flags & PTRACE_BTS_O_SCHED)
- flags |= BTS_TIMESTAMPS;
-
- context->tracer =
- ds_request_bts_task(child, context->buffer, context->size,
- NULL, (size_t)-1, flags);
- if (unlikely(IS_ERR(context->tracer))) {
- int error = PTR_ERR(context->tracer);
-
- free_bts_buffer(context);
- context->tracer = NULL;
- return error;
- }
-
- return sizeof(cfg);
-}
-
-static int ptrace_bts_status(struct task_struct *child,
- long cfg_size,
- struct ptrace_bts_config __user *ucfg)
-{
- struct bts_context *context;
- const struct bts_trace *trace;
- struct ptrace_bts_config cfg;
-
- context = child->bts;
- if (!context)
- return -ESRCH;
-
- if (cfg_size < sizeof(cfg))
- return -EIO;
-
- trace = ds_read_bts(context->tracer);
- if (!trace)
- return -ESRCH;
-
- memset(&cfg, 0, sizeof(cfg));
- cfg.size = trace->ds.end - trace->ds.begin;
- cfg.signal = context->bts_ovfl_signal;
- cfg.bts_size = sizeof(struct bts_struct);
-
- if (cfg.signal)
- cfg.flags |= PTRACE_BTS_O_SIGNAL;
-
- if (trace->ds.flags & BTS_USER)
- cfg.flags |= PTRACE_BTS_O_TRACE;
-
- if (trace->ds.flags & BTS_TIMESTAMPS)
- cfg.flags |= PTRACE_BTS_O_SCHED;
-
- if (copy_to_user(ucfg, &cfg, sizeof(cfg)))
- return -EFAULT;
-
- return sizeof(cfg);
-}
-
-static int ptrace_bts_clear(struct task_struct *child)
-{
- struct bts_context *context;
- const struct bts_trace *trace;
-
- context = child->bts;
- if (!context)
- return -ESRCH;
-
- trace = ds_read_bts(context->tracer);
- if (!trace)
- return -ESRCH;
-
- memset(trace->ds.begin, 0, trace->ds.n * trace->ds.size);
-
- return ds_reset_bts(context->tracer);
-}
-
-static int ptrace_bts_size(struct task_struct *child)
-{
- struct bts_context *context;
- const struct bts_trace *trace;
-
- context = child->bts;
- if (!context)
- return -ESRCH;
-
- trace = ds_read_bts(context->tracer);
- if (!trace)
- return -ESRCH;
-
- return (trace->ds.top - trace->ds.begin) / trace->ds.size;
-}
-
-/*
- * Called from __ptrace_unlink() after the child has been moved back
- * to its original parent.
- */
-void ptrace_bts_untrace(struct task_struct *child)
-{
- if (unlikely(child->bts)) {
- free_bts_context(child->bts);
- child->bts = NULL;
- }
-}
-#endif /* CONFIG_X86_PTRACE_BTS */
-
/*
* Called by kernel/ptrace.c when detaching..
*
@@ -1252,39 +911,6 @@ long arch_ptrace(struct task_struct *child, long request, long addr, long data)
break;
#endif
- /*
- * These bits need more cooking - not enabled yet:
- */
-#ifdef CONFIG_X86_PTRACE_BTS
- case PTRACE_BTS_CONFIG:
- ret = ptrace_bts_config
- (child, data, (struct ptrace_bts_config __user *)addr);
- break;
-
- case PTRACE_BTS_STATUS:
- ret = ptrace_bts_status
- (child, data, (struct ptrace_bts_config __user *)addr);
- break;
-
- case PTRACE_BTS_SIZE:
- ret = ptrace_bts_size(child);
- break;
-
- case PTRACE_BTS_GET:
- ret = ptrace_bts_read_record
- (child, data, (struct bts_struct __user *) addr);
- break;
-
- case PTRACE_BTS_CLEAR:
- ret = ptrace_bts_clear(child);
- break;
-
- case PTRACE_BTS_DRAIN:
- ret = ptrace_bts_drain
- (child, data, (struct bts_struct __user *) addr);
- break;
-#endif /* CONFIG_X86_PTRACE_BTS */
-
default:
ret = ptrace_request(child, request, addr, data);
break;
@@ -1544,14 +1170,6 @@ long compat_arch_ptrace(struct task_struct *child, compat_long_t request,
case PTRACE_GET_THREAD_AREA:
case PTRACE_SET_THREAD_AREA:
-#ifdef CONFIG_X86_PTRACE_BTS
- case PTRACE_BTS_CONFIG:
- case PTRACE_BTS_STATUS:
- case PTRACE_BTS_SIZE:
- case PTRACE_BTS_GET:
- case PTRACE_BTS_CLEAR:
- case PTRACE_BTS_DRAIN:
-#endif /* CONFIG_X86_PTRACE_BTS */
return arch_ptrace(child, request, addr, data);
default:
diff --git a/arch/x86/kernel/step.c b/arch/x86/kernel/step.c
index 3149032ff107..58de45ee08b6 100644
--- a/arch/x86/kernel/step.c
+++ b/arch/x86/kernel/step.c
@@ -158,22 +158,6 @@ static int enable_single_step(struct task_struct *child)
}
/*
- * Install this value in MSR_IA32_DEBUGCTLMSR whenever child is running.
- */
-static void write_debugctlmsr(struct task_struct *child, unsigned long val)
-{
- if (child->thread.debugctlmsr == val)
- return;
-
- child->thread.debugctlmsr = val;
-
- if (child != current)
- return;
-
- update_debugctlmsr(val);
-}
-
-/*
* Enable single or block step.
*/
static void enable_step(struct task_struct *child, bool block)
@@ -186,15 +170,17 @@ static void enable_step(struct task_struct *child, bool block)
* that uses user-mode single stepping itself.
*/
if (enable_single_step(child) && block) {
- set_tsk_thread_flag(child, TIF_DEBUGCTLMSR);
- write_debugctlmsr(child,
- child->thread.debugctlmsr | DEBUGCTLMSR_BTF);
- } else {
- write_debugctlmsr(child,
- child->thread.debugctlmsr & ~DEBUGCTLMSR_BTF);
-
- if (!child->thread.debugctlmsr)
- clear_tsk_thread_flag(child, TIF_DEBUGCTLMSR);
+ unsigned long debugctl = get_debugctlmsr();
+
+ debugctl |= DEBUGCTLMSR_BTF;
+ update_debugctlmsr(debugctl);
+ set_tsk_thread_flag(child, TIF_BLOCKSTEP);
+ } else if (test_tsk_thread_flag(child, TIF_BLOCKSTEP)) {
+ unsigned long debugctl = get_debugctlmsr();
+
+ debugctl &= ~DEBUGCTLMSR_BTF;
+ update_debugctlmsr(debugctl);
+ clear_tsk_thread_flag(child, TIF_BLOCKSTEP);
}
}
@@ -213,11 +199,13 @@ void user_disable_single_step(struct task_struct *child)
/*
* Make sure block stepping (BTF) is disabled.
*/
- write_debugctlmsr(child,
- child->thread.debugctlmsr & ~DEBUGCTLMSR_BTF);
+ if (test_tsk_thread_flag(child, TIF_BLOCKSTEP)) {
+ unsigned long debugctl = get_debugctlmsr();
- if (!child->thread.debugctlmsr)
- clear_tsk_thread_flag(child, TIF_DEBUGCTLMSR);
+ debugctl &= ~DEBUGCTLMSR_BTF;
+ update_debugctlmsr(debugctl);
+ clear_tsk_thread_flag(child, TIF_BLOCKSTEP);
+ }
/* Always clear TIF_SINGLESTEP... */
clear_tsk_thread_flag(child, TIF_SINGLESTEP);
diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c
index 00516e1de55d..02cfb9b8f5b1 100644
--- a/arch/x86/kernel/traps.c
+++ b/arch/x86/kernel/traps.c
@@ -534,11 +534,11 @@ dotraplinkage void __kprobes do_debug(struct pt_regs *regs, long error_code)
/* DR6 may or may not be cleared by the CPU */
set_debugreg(0, 6);
+
/*
* The processor cleared BTF, so don't mark that we need it set.
*/
- clear_tsk_thread_flag(tsk, TIF_DEBUGCTLMSR);
- tsk->thread.debugctlmsr = 0;
+ clear_tsk_thread_flag(tsk, TIF_BLOCKSTEP);
/* Store the virtualized DR6 value */
tsk->thread.debugreg6 = dr6;
diff --git a/arch/x86/kernel/x8664_ksyms_64.c b/arch/x86/kernel/x8664_ksyms_64.c
index 693920b22496..1b950d151e58 100644
--- a/arch/x86/kernel/x8664_ksyms_64.c
+++ b/arch/x86/kernel/x8664_ksyms_64.c
@@ -54,7 +54,6 @@ EXPORT_SYMBOL(memcpy);
EXPORT_SYMBOL(__memcpy);
EXPORT_SYMBOL(empty_zero_page);
-EXPORT_SYMBOL(init_level4_pgt);
#ifndef CONFIG_PARAVIRT
EXPORT_SYMBOL(native_load_gs_index);
#endif
diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c
index 2ba58206812a..737361fcd503 100644
--- a/arch/x86/kvm/svm.c
+++ b/arch/x86/kvm/svm.c
@@ -2067,7 +2067,7 @@ static int cpuid_interception(struct vcpu_svm *svm)
static int iret_interception(struct vcpu_svm *svm)
{
++svm->vcpu.stat.nmi_window_exits;
- svm->vmcb->control.intercept &= ~(1UL << INTERCEPT_IRET);
+ svm->vmcb->control.intercept &= ~(1ULL << INTERCEPT_IRET);
svm->vcpu.arch.hflags |= HF_IRET_MASK;
return 1;
}
@@ -2479,7 +2479,7 @@ static void svm_inject_nmi(struct kvm_vcpu *vcpu)
svm->vmcb->control.event_inj = SVM_EVTINJ_VALID | SVM_EVTINJ_TYPE_NMI;
vcpu->arch.hflags |= HF_NMI_MASK;
- svm->vmcb->control.intercept |= (1UL << INTERCEPT_IRET);
+ svm->vmcb->control.intercept |= (1ULL << INTERCEPT_IRET);
++vcpu->stat.nmi_injections;
}
@@ -2539,10 +2539,10 @@ static void svm_set_nmi_mask(struct kvm_vcpu *vcpu, bool masked)
if (masked) {
svm->vcpu.arch.hflags |= HF_NMI_MASK;
- svm->vmcb->control.intercept |= (1UL << INTERCEPT_IRET);
+ svm->vmcb->control.intercept |= (1ULL << INTERCEPT_IRET);
} else {
svm->vcpu.arch.hflags &= ~HF_NMI_MASK;
- svm->vmcb->control.intercept &= ~(1UL << INTERCEPT_IRET);
+ svm->vmcb->control.intercept &= ~(1ULL << INTERCEPT_IRET);
}
}
diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index bc933cfb4e66..edca080407a5 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -2703,8 +2703,7 @@ static int vmx_nmi_allowed(struct kvm_vcpu *vcpu)
return 0;
return !(vmcs_read32(GUEST_INTERRUPTIBILITY_INFO) &
- (GUEST_INTR_STATE_STI | GUEST_INTR_STATE_MOV_SS |
- GUEST_INTR_STATE_NMI));
+ (GUEST_INTR_STATE_MOV_SS | GUEST_INTR_STATE_NMI));
}
static bool vmx_get_nmi_mask(struct kvm_vcpu *vcpu)
@@ -3660,8 +3659,11 @@ static void vmx_complete_interrupts(struct vcpu_vmx *vmx)
/* We need to handle NMIs before interrupts are enabled */
if ((exit_intr_info & INTR_INFO_INTR_TYPE_MASK) == INTR_TYPE_NMI_INTR &&
- (exit_intr_info & INTR_INFO_VALID_MASK))
+ (exit_intr_info & INTR_INFO_VALID_MASK)) {
+ kvm_before_handle_nmi(&vmx->vcpu);
asm("int $2");
+ kvm_after_handle_nmi(&vmx->vcpu);
+ }
idtv_info_valid = idt_vectoring_info & VECTORING_INFO_VALID_MASK;
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 3c4ca98ad27f..dd9bc8fb81ab 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -40,6 +40,7 @@
#include <linux/user-return-notifier.h>
#include <linux/srcu.h>
#include <linux/slab.h>
+#include <linux/perf_event.h>
#include <trace/events/kvm.h>
#undef TRACE_INCLUDE_FILE
#define CREATE_TRACE_POINTS
@@ -1712,6 +1713,7 @@ static int kvm_vcpu_ioctl_set_cpuid(struct kvm_vcpu *vcpu,
if (copy_from_user(cpuid_entries, entries,
cpuid->nent * sizeof(struct kvm_cpuid_entry)))
goto out_free;
+ vcpu_load(vcpu);
for (i = 0; i < cpuid->nent; i++) {
vcpu->arch.cpuid_entries[i].function = cpuid_entries[i].function;
vcpu->arch.cpuid_entries[i].eax = cpuid_entries[i].eax;
@@ -1729,6 +1731,7 @@ static int kvm_vcpu_ioctl_set_cpuid(struct kvm_vcpu *vcpu,
r = 0;
kvm_apic_set_version(vcpu);
kvm_x86_ops->cpuid_update(vcpu);
+ vcpu_put(vcpu);
out_free:
vfree(cpuid_entries);
@@ -1749,9 +1752,11 @@ static int kvm_vcpu_ioctl_set_cpuid2(struct kvm_vcpu *vcpu,
if (copy_from_user(&vcpu->arch.cpuid_entries, entries,
cpuid->nent * sizeof(struct kvm_cpuid_entry2)))
goto out;
+ vcpu_load(vcpu);
vcpu->arch.cpuid_nent = cpuid->nent;
kvm_apic_set_version(vcpu);
kvm_x86_ops->cpuid_update(vcpu);
+ vcpu_put(vcpu);
return 0;
out:
@@ -3743,6 +3748,51 @@ static void kvm_timer_init(void)
}
}
+static DEFINE_PER_CPU(struct kvm_vcpu *, current_vcpu);
+
+static int kvm_is_in_guest(void)
+{
+ return percpu_read(current_vcpu) != NULL;
+}
+
+static int kvm_is_user_mode(void)
+{
+ int user_mode = 3;
+
+ if (percpu_read(current_vcpu))
+ user_mode = kvm_x86_ops->get_cpl(percpu_read(current_vcpu));
+
+ return user_mode != 0;
+}
+
+static unsigned long kvm_get_guest_ip(void)
+{
+ unsigned long ip = 0;
+
+ if (percpu_read(current_vcpu))
+ ip = kvm_rip_read(percpu_read(current_vcpu));
+
+ return ip;
+}
+
+static struct perf_guest_info_callbacks kvm_guest_cbs = {
+ .is_in_guest = kvm_is_in_guest,
+ .is_user_mode = kvm_is_user_mode,
+ .get_guest_ip = kvm_get_guest_ip,
+};
+
+void kvm_before_handle_nmi(struct kvm_vcpu *vcpu)
+{
+ percpu_write(current_vcpu, vcpu);
+}
+EXPORT_SYMBOL_GPL(kvm_before_handle_nmi);
+
+void kvm_after_handle_nmi(struct kvm_vcpu *vcpu)
+{
+ percpu_write(current_vcpu, NULL);
+}
+EXPORT_SYMBOL_GPL(kvm_after_handle_nmi);
+
int kvm_arch_init(void *opaque)
{
int r;
@@ -3779,6 +3829,8 @@ int kvm_arch_init(void *opaque)
kvm_timer_init();
+ perf_register_guest_info_callbacks(&kvm_guest_cbs);
+
return 0;
out:
@@ -3787,6 +3839,8 @@ out:
void kvm_arch_exit(void)
{
+ perf_unregister_guest_info_callbacks(&kvm_guest_cbs);
+
if (!boot_cpu_has(X86_FEATURE_CONSTANT_TSC))
cpufreq_unregister_notifier(&kvmclock_cpufreq_notifier_block,
CPUFREQ_TRANSITION_NOTIFIER);
diff --git a/arch/x86/kvm/x86.h b/arch/x86/kvm/x86.h
index 2d101639bd8d..b7a404722d2b 100644
--- a/arch/x86/kvm/x86.h
+++ b/arch/x86/kvm/x86.h
@@ -65,4 +65,7 @@ static inline int is_paging(struct kvm_vcpu *vcpu)
return kvm_read_cr0_bits(vcpu, X86_CR0_PG);
}
+void kvm_before_handle_nmi(struct kvm_vcpu *vcpu);
+void kvm_after_handle_nmi(struct kvm_vcpu *vcpu);
+
#endif
diff --git a/arch/x86/lib/Makefile b/arch/x86/lib/Makefile
index 419386c24b82..f871e04b6965 100644
--- a/arch/x86/lib/Makefile
+++ b/arch/x86/lib/Makefile
@@ -20,17 +20,18 @@ lib-y := delay.o
lib-y += thunk_$(BITS).o
lib-y += usercopy_$(BITS).o getuser.o putuser.o
lib-y += memcpy_$(BITS).o
-lib-$(CONFIG_KPROBES) += insn.o inat.o
+lib-$(CONFIG_INSTRUCTION_DECODER) += insn.o inat.o
obj-y += msr.o msr-reg.o msr-reg-export.o
ifeq ($(CONFIG_X86_32),y)
obj-y += atomic64_32.o
+ lib-y += atomic64_cx8_32.o
lib-y += checksum_32.o
lib-y += strstr_32.o
lib-y += semaphore_32.o string_32.o
ifneq ($(CONFIG_X86_CMPXCHG64),y)
- lib-y += cmpxchg8b_emu.o
+ lib-y += cmpxchg8b_emu.o atomic64_386_32.o
endif
lib-$(CONFIG_X86_USE_3DNOW) += mmx_32.o
else
diff --git a/arch/x86/lib/atomic64_32.c b/arch/x86/lib/atomic64_32.c
index 824fa0be55a3..540179e8e9fa 100644
--- a/arch/x86/lib/atomic64_32.c
+++ b/arch/x86/lib/atomic64_32.c
@@ -6,225 +6,54 @@
#include <asm/cmpxchg.h>
#include <asm/atomic.h>
-static noinline u64 cmpxchg8b(u64 *ptr, u64 old, u64 new)
-{
- u32 low = new;
- u32 high = new >> 32;
-
- asm volatile(
- LOCK_PREFIX "cmpxchg8b %1\n"
- : "+A" (old), "+m" (*ptr)
- : "b" (low), "c" (high)
- );
- return old;
-}
-
-u64 atomic64_cmpxchg(atomic64_t *ptr, u64 old_val, u64 new_val)
-{
- return cmpxchg8b(&ptr->counter, old_val, new_val);
-}
-EXPORT_SYMBOL(atomic64_cmpxchg);
-
-/**
- * atomic64_xchg - xchg atomic64 variable
- * @ptr: pointer to type atomic64_t
- * @new_val: value to assign
- *
- * Atomically xchgs the value of @ptr to @new_val and returns
- * the old value.
- */
-u64 atomic64_xchg(atomic64_t *ptr, u64 new_val)
-{
- /*
- * Try first with a (possibly incorrect) assumption about
- * what we have there. We'll do two loops most likely,
- * but we'll get an ownership MESI transaction straight away
- * instead of a read transaction followed by a
- * flush-for-ownership transaction:
- */
- u64 old_val, real_val = 0;
-
- do {
- old_val = real_val;
-
- real_val = atomic64_cmpxchg(ptr, old_val, new_val);
-
- } while (real_val != old_val);
-
- return old_val;
-}
-EXPORT_SYMBOL(atomic64_xchg);
-
-/**
- * atomic64_set - set atomic64 variable
- * @ptr: pointer to type atomic64_t
- * @new_val: value to assign
- *
- * Atomically sets the value of @ptr to @new_val.
- */
-void atomic64_set(atomic64_t *ptr, u64 new_val)
-{
- atomic64_xchg(ptr, new_val);
-}
-EXPORT_SYMBOL(atomic64_set);
-
-/**
-EXPORT_SYMBOL(atomic64_read);
- * atomic64_add_return - add and return
- * @delta: integer value to add
- * @ptr: pointer to type atomic64_t
- *
- * Atomically adds @delta to @ptr and returns @delta + *@ptr
- */
-noinline u64 atomic64_add_return(u64 delta, atomic64_t *ptr)
-{
- /*
- * Try first with a (possibly incorrect) assumption about
- * what we have there. We'll do two loops most likely,
- * but we'll get an ownership MESI transaction straight away
- * instead of a read transaction followed by a
- * flush-for-ownership transaction:
- */
- u64 old_val, new_val, real_val = 0;
-
- do {
- old_val = real_val;
- new_val = old_val + delta;
-
- real_val = atomic64_cmpxchg(ptr, old_val, new_val);
-
- } while (real_val != old_val);
-
- return new_val;
-}
-EXPORT_SYMBOL(atomic64_add_return);
-
-u64 atomic64_sub_return(u64 delta, atomic64_t *ptr)
-{
- return atomic64_add_return(-delta, ptr);
-}
-EXPORT_SYMBOL(atomic64_sub_return);
-
-u64 atomic64_inc_return(atomic64_t *ptr)
-{
- return atomic64_add_return(1, ptr);
-}
-EXPORT_SYMBOL(atomic64_inc_return);
-
-u64 atomic64_dec_return(atomic64_t *ptr)
-{
- return atomic64_sub_return(1, ptr);
-}
-EXPORT_SYMBOL(atomic64_dec_return);
-
-/**
- * atomic64_add - add integer to atomic64 variable
- * @delta: integer value to add
- * @ptr: pointer to type atomic64_t
- *
- * Atomically adds @delta to @ptr.
- */
-void atomic64_add(u64 delta, atomic64_t *ptr)
-{
- atomic64_add_return(delta, ptr);
-}
-EXPORT_SYMBOL(atomic64_add);
-
-/**
- * atomic64_sub - subtract the atomic64 variable
- * @delta: integer value to subtract
- * @ptr: pointer to type atomic64_t
- *
- * Atomically subtracts @delta from @ptr.
- */
-void atomic64_sub(u64 delta, atomic64_t *ptr)
-{
- atomic64_add(-delta, ptr);
-}
-EXPORT_SYMBOL(atomic64_sub);
-
-/**
- * atomic64_sub_and_test - subtract value from variable and test result
- * @delta: integer value to subtract
- * @ptr: pointer to type atomic64_t
- *
- * Atomically subtracts @delta from @ptr and returns
- * true if the result is zero, or false for all
- * other cases.
- */
-int atomic64_sub_and_test(u64 delta, atomic64_t *ptr)
-{
- u64 new_val = atomic64_sub_return(delta, ptr);
-
- return new_val == 0;
-}
-EXPORT_SYMBOL(atomic64_sub_and_test);
-
-/**
- * atomic64_inc - increment atomic64 variable
- * @ptr: pointer to type atomic64_t
- *
- * Atomically increments @ptr by 1.
- */
-void atomic64_inc(atomic64_t *ptr)
-{
- atomic64_add(1, ptr);
-}
-EXPORT_SYMBOL(atomic64_inc);
-
-/**
- * atomic64_dec - decrement atomic64 variable
- * @ptr: pointer to type atomic64_t
- *
- * Atomically decrements @ptr by 1.
- */
-void atomic64_dec(atomic64_t *ptr)
-{
- atomic64_sub(1, ptr);
-}
-EXPORT_SYMBOL(atomic64_dec);
-
-/**
- * atomic64_dec_and_test - decrement and test
- * @ptr: pointer to type atomic64_t
- *
- * Atomically decrements @ptr by 1 and
- * returns true if the result is 0, or false for all other
- * cases.
- */
-int atomic64_dec_and_test(atomic64_t *ptr)
-{
- return atomic64_sub_and_test(1, ptr);
-}
-EXPORT_SYMBOL(atomic64_dec_and_test);
-
-/**
- * atomic64_inc_and_test - increment and test
- * @ptr: pointer to type atomic64_t
- *
- * Atomically increments @ptr by 1
- * and returns true if the result is zero, or false for all
- * other cases.
- */
-int atomic64_inc_and_test(atomic64_t *ptr)
-{
- return atomic64_sub_and_test(-1, ptr);
-}
-EXPORT_SYMBOL(atomic64_inc_and_test);
-
-/**
- * atomic64_add_negative - add and test if negative
- * @delta: integer value to add
- * @ptr: pointer to type atomic64_t
- *
- * Atomically adds @delta to @ptr and returns true
- * if the result is negative, or false when
- * result is greater than or equal to zero.
- */
-int atomic64_add_negative(u64 delta, atomic64_t *ptr)
-{
- s64 new_val = atomic64_add_return(delta, ptr);
-
- return new_val < 0;
-}
-EXPORT_SYMBOL(atomic64_add_negative);
+long long atomic64_read_cx8(long long, const atomic64_t *v);
+EXPORT_SYMBOL(atomic64_read_cx8);
+long long atomic64_set_cx8(long long, const atomic64_t *v);
+EXPORT_SYMBOL(atomic64_set_cx8);
+long long atomic64_xchg_cx8(long long, unsigned high);
+EXPORT_SYMBOL(atomic64_xchg_cx8);
+long long atomic64_add_return_cx8(long long a, atomic64_t *v);
+EXPORT_SYMBOL(atomic64_add_return_cx8);
+long long atomic64_sub_return_cx8(long long a, atomic64_t *v);
+EXPORT_SYMBOL(atomic64_sub_return_cx8);
+long long atomic64_inc_return_cx8(long long a, atomic64_t *v);
+EXPORT_SYMBOL(atomic64_inc_return_cx8);
+long long atomic64_dec_return_cx8(long long a, atomic64_t *v);
+EXPORT_SYMBOL(atomic64_dec_return_cx8);
+long long atomic64_dec_if_positive_cx8(atomic64_t *v);
+EXPORT_SYMBOL(atomic64_dec_if_positive_cx8);
+int atomic64_inc_not_zero_cx8(atomic64_t *v);
+EXPORT_SYMBOL(atomic64_inc_not_zero_cx8);
+int atomic64_add_unless_cx8(atomic64_t *v, long long a, long long u);
+EXPORT_SYMBOL(atomic64_add_unless_cx8);
+
+#ifndef CONFIG_X86_CMPXCHG64
+long long atomic64_read_386(long long, const atomic64_t *v);
+EXPORT_SYMBOL(atomic64_read_386);
+long long atomic64_set_386(long long, const atomic64_t *v);
+EXPORT_SYMBOL(atomic64_set_386);
+long long atomic64_xchg_386(long long, unsigned high);
+EXPORT_SYMBOL(atomic64_xchg_386);
+long long atomic64_add_return_386(long long a, atomic64_t *v);
+EXPORT_SYMBOL(atomic64_add_return_386);
+long long atomic64_sub_return_386(long long a, atomic64_t *v);
+EXPORT_SYMBOL(atomic64_sub_return_386);
+long long atomic64_inc_return_386(long long a, atomic64_t *v);
+EXPORT_SYMBOL(atomic64_inc_return_386);
+long long atomic64_dec_return_386(long long a, atomic64_t *v);
+EXPORT_SYMBOL(atomic64_dec_return_386);
+long long atomic64_add_386(long long a, atomic64_t *v);
+EXPORT_SYMBOL(atomic64_add_386);
+long long atomic64_sub_386(long long a, atomic64_t *v);
+EXPORT_SYMBOL(atomic64_sub_386);
+long long atomic64_inc_386(long long a, atomic64_t *v);
+EXPORT_SYMBOL(atomic64_inc_386);
+long long atomic64_dec_386(long long a, atomic64_t *v);
+EXPORT_SYMBOL(atomic64_dec_386);
+long long atomic64_dec_if_positive_386(atomic64_t *v);
+EXPORT_SYMBOL(atomic64_dec_if_positive_386);
+int atomic64_inc_not_zero_386(atomic64_t *v);
+EXPORT_SYMBOL(atomic64_inc_not_zero_386);
+int atomic64_add_unless_386(atomic64_t *v, long long a, long long u);
+EXPORT_SYMBOL(atomic64_add_unless_386);
+#endif
diff --git a/arch/x86/lib/atomic64_386_32.S b/arch/x86/lib/atomic64_386_32.S
new file mode 100644
index 000000000000..4a5979aa6883
--- /dev/null
+++ b/arch/x86/lib/atomic64_386_32.S
@@ -0,0 +1,174 @@
+/*
+ * atomic64_t for 386/486
+ *
+ * Copyright © 2010 Luca Barbieri
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ */
+
+#include <linux/linkage.h>
+#include <asm/alternative-asm.h>
+#include <asm/dwarf2.h>
+
+/* if you want SMP support, implement these with real spinlocks */
+.macro LOCK reg
+ pushfl
+ CFI_ADJUST_CFA_OFFSET 4
+ cli
+.endm
+
+.macro UNLOCK reg
+ popfl
+ CFI_ADJUST_CFA_OFFSET -4
+.endm
+
+.macro BEGIN func reg
+$v = \reg
+
+ENTRY(atomic64_\func\()_386)
+ CFI_STARTPROC
+ LOCK $v
+
+.macro RETURN
+ UNLOCK $v
+ ret
+.endm
+
+.macro END_
+ CFI_ENDPROC
+ENDPROC(atomic64_\func\()_386)
+.purgem RETURN
+.purgem END_
+.purgem END
+.endm
+
+.macro END
+RETURN
+END_
+.endm
+.endm
+
+BEGIN read %ecx
+ movl ($v), %eax
+ movl 4($v), %edx
+END
+
+BEGIN set %esi
+ movl %ebx, ($v)
+ movl %ecx, 4($v)
+END
+
+BEGIN xchg %esi
+ movl ($v), %eax
+ movl 4($v), %edx
+ movl %ebx, ($v)
+ movl %ecx, 4($v)
+END
+
+BEGIN add %ecx
+ addl %eax, ($v)
+ adcl %edx, 4($v)
+END
+
+BEGIN add_return %ecx
+ addl ($v), %eax
+ adcl 4($v), %edx
+ movl %eax, ($v)
+ movl %edx, 4($v)
+END
+
+BEGIN sub %ecx
+ subl %eax, ($v)
+ sbbl %edx, 4($v)
+END
+
+BEGIN sub_return %ecx
+ negl %edx
+ negl %eax
+ sbbl $0, %edx
+ addl ($v), %eax
+ adcl 4($v), %edx
+ movl %eax, ($v)
+ movl %edx, 4($v)
+END
+
+BEGIN inc %esi
+ addl $1, ($v)
+ adcl $0, 4($v)
+END
+
+BEGIN inc_return %esi
+ movl ($v), %eax
+ movl 4($v), %edx
+ addl $1, %eax
+ adcl $0, %edx
+ movl %eax, ($v)
+ movl %edx, 4($v)
+END
+
+BEGIN dec %esi
+ subl $1, ($v)
+ sbbl $0, 4($v)
+END
+
+BEGIN dec_return %esi
+ movl ($v), %eax
+ movl 4($v), %edx
+ subl $1, %eax
+ sbbl $0, %edx
+ movl %eax, ($v)
+ movl %edx, 4($v)
+END
+
+BEGIN add_unless %ecx
+ addl %eax, %esi
+ adcl %edx, %edi
+ addl ($v), %eax
+ adcl 4($v), %edx
+ cmpl %eax, %esi
+ je 3f
+1:
+ movl %eax, ($v)
+ movl %edx, 4($v)
+ movl $1, %eax
+2:
+RETURN
+3:
+ cmpl %edx, %edi
+ jne 1b
+ xorl %eax, %eax
+ jmp 2b
+END_
+
+BEGIN inc_not_zero %esi
+ movl ($v), %eax
+ movl 4($v), %edx
+ testl %eax, %eax
+ je 3f
+1:
+ addl $1, %eax
+ adcl $0, %edx
+ movl %eax, ($v)
+ movl %edx, 4($v)
+ movl $1, %eax
+2:
+RETURN
+3:
+ testl %edx, %edx
+ jne 1b
+ jmp 2b
+END_
+
+BEGIN dec_if_positive %esi
+ movl ($v), %eax
+ movl 4($v), %edx
+ subl $1, %eax
+ sbbl $0, %edx
+ js 1f
+ movl %eax, ($v)
+ movl %edx, 4($v)
+1:
+END
diff --git a/arch/x86/lib/atomic64_cx8_32.S b/arch/x86/lib/atomic64_cx8_32.S
new file mode 100644
index 000000000000..71e080de3352
--- /dev/null
+++ b/arch/x86/lib/atomic64_cx8_32.S
@@ -0,0 +1,224 @@
+/*
+ * atomic64_t for 586+
+ *
+ * Copyright © 2010 Luca Barbieri
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ */
+
+#include <linux/linkage.h>
+#include <asm/alternative-asm.h>
+#include <asm/dwarf2.h>
+
+.macro SAVE reg
+ pushl %\reg
+ CFI_ADJUST_CFA_OFFSET 4
+ CFI_REL_OFFSET \reg, 0
+.endm
+
+.macro RESTORE reg
+ popl %\reg
+ CFI_ADJUST_CFA_OFFSET -4
+ CFI_RESTORE \reg
+.endm
+
+.macro read64 reg
+ movl %ebx, %eax
+ movl %ecx, %edx
+/* we need LOCK_PREFIX since otherwise cmpxchg8b always does the write */
+ LOCK_PREFIX
+ cmpxchg8b (\reg)
+.endm
+
+ENTRY(atomic64_read_cx8)
+ CFI_STARTPROC
+
+ read64 %ecx
+ ret
+ CFI_ENDPROC
+ENDPROC(atomic64_read_cx8)
+
+ENTRY(atomic64_set_cx8)
+ CFI_STARTPROC
+
+1:
+/* we don't need LOCK_PREFIX since aligned 64-bit writes
+ * are atomic on 586 and newer */
+ cmpxchg8b (%esi)
+ jne 1b
+
+ ret
+ CFI_ENDPROC
+ENDPROC(atomic64_set_cx8)
+
+ENTRY(atomic64_xchg_cx8)
+ CFI_STARTPROC
+
+ movl %ebx, %eax
+ movl %ecx, %edx
+1:
+ LOCK_PREFIX
+ cmpxchg8b (%esi)
+ jne 1b
+
+ ret
+ CFI_ENDPROC
+ENDPROC(atomic64_xchg_cx8)
+
+.macro addsub_return func ins insc
+ENTRY(atomic64_\func\()_return_cx8)
+ CFI_STARTPROC
+ SAVE ebp
+ SAVE ebx
+ SAVE esi
+ SAVE edi
+
+ movl %eax, %esi
+ movl %edx, %edi
+ movl %ecx, %ebp
+
+ read64 %ebp
+1:
+ movl %eax, %ebx
+ movl %edx, %ecx
+ \ins\()l %esi, %ebx
+ \insc\()l %edi, %ecx
+ LOCK_PREFIX
+ cmpxchg8b (%ebp)
+ jne 1b
+
+10:
+ movl %ebx, %eax
+ movl %ecx, %edx
+ RESTORE edi
+ RESTORE esi
+ RESTORE ebx
+ RESTORE ebp
+ ret
+ CFI_ENDPROC
+ENDPROC(atomic64_\func\()_return_cx8)
+.endm
+
+addsub_return add add adc
+addsub_return sub sub sbb
+
+.macro incdec_return func ins insc
+ENTRY(atomic64_\func\()_return_cx8)
+ CFI_STARTPROC
+ SAVE ebx
+
+ read64 %esi
+1:
+ movl %eax, %ebx
+ movl %edx, %ecx
+ \ins\()l $1, %ebx
+ \insc\()l $0, %ecx
+ LOCK_PREFIX
+ cmpxchg8b (%esi)
+ jne 1b
+
+10:
+ movl %ebx, %eax
+ movl %ecx, %edx
+ RESTORE ebx
+ ret
+ CFI_ENDPROC
+ENDPROC(atomic64_\func\()_return_cx8)
+.endm
+
+incdec_return inc add adc
+incdec_return dec sub sbb
+
+ENTRY(atomic64_dec_if_positive_cx8)
+ CFI_STARTPROC
+ SAVE ebx
+
+ read64 %esi
+1:
+ movl %eax, %ebx
+ movl %edx, %ecx
+ subl $1, %ebx
+ sbb $0, %ecx
+ js 2f
+ LOCK_PREFIX
+ cmpxchg8b (%esi)
+ jne 1b
+
+2:
+ movl %ebx, %eax
+ movl %ecx, %edx
+ RESTORE ebx
+ ret
+ CFI_ENDPROC
+ENDPROC(atomic64_dec_if_positive_cx8)
+
+ENTRY(atomic64_add_unless_cx8)
+ CFI_STARTPROC
+ SAVE ebp
+ SAVE ebx
+/* these just push these two parameters on the stack */
+ SAVE edi
+ SAVE esi
+
+ movl %ecx, %ebp
+ movl %eax, %esi
+ movl %edx, %edi
+
+ read64 %ebp
+1:
+ cmpl %eax, 0(%esp)
+ je 4f
+2:
+ movl %eax, %ebx
+ movl %edx, %ecx
+ addl %esi, %ebx
+ adcl %edi, %ecx
+ LOCK_PREFIX
+ cmpxchg8b (%ebp)
+ jne 1b
+
+ movl $1, %eax
+3:
+ addl $8, %esp
+ CFI_ADJUST_CFA_OFFSET -8
+ RESTORE ebx
+ RESTORE ebp
+ ret
+4:
+ cmpl %edx, 4(%esp)
+ jne 2b
+ xorl %eax, %eax
+ jmp 3b
+ CFI_ENDPROC
+ENDPROC(atomic64_add_unless_cx8)
+
+ENTRY(atomic64_inc_not_zero_cx8)
+ CFI_STARTPROC
+ SAVE ebx
+
+ read64 %esi
+1:
+ testl %eax, %eax
+ je 4f
+2:
+ movl %eax, %ebx
+ movl %edx, %ecx
+ addl $1, %ebx
+ adcl $0, %ecx
+ LOCK_PREFIX
+ cmpxchg8b (%esi)
+ jne 1b
+
+ movl $1, %eax
+3:
+ RESTORE ebx
+ ret
+4:
+ testl %edx, %edx
+ jne 2b
+ jmp 3b
+ CFI_ENDPROC
+ENDPROC(atomic64_inc_not_zero_cx8)
diff --git a/arch/x86/lib/rwsem_64.S b/arch/x86/lib/rwsem_64.S
index 15acecf0d7aa..41fcf00e49df 100644
--- a/arch/x86/lib/rwsem_64.S
+++ b/arch/x86/lib/rwsem_64.S
@@ -60,7 +60,7 @@ ENTRY(call_rwsem_down_write_failed)
ENDPROC(call_rwsem_down_write_failed)
ENTRY(call_rwsem_wake)
- decw %dx /* do nothing if still outstanding active readers */
+ decl %edx /* do nothing if still outstanding active readers */
jnz 1f
save_common_regs
movq %rax,%rdi
diff --git a/arch/x86/mm/ioremap.c b/arch/x86/mm/ioremap.c
index 5eb1ba74a3a9..12e4d2d3c110 100644
--- a/arch/x86/mm/ioremap.c
+++ b/arch/x86/mm/ioremap.c
@@ -448,6 +448,20 @@ static inline void __init early_clear_fixmap(enum fixed_addresses idx)
static void __iomem *prev_map[FIX_BTMAPS_SLOTS] __initdata;
static unsigned long prev_size[FIX_BTMAPS_SLOTS] __initdata;
+void __init fixup_early_ioremap(void)
+{
+ int i;
+
+ for (i = 0; i < FIX_BTMAPS_SLOTS; i++) {
+ if (prev_map[i]) {
+ WARN_ON(1);
+ break;
+ }
+ }
+
+ early_ioremap_init();
+}
+
static int __init check_early_ioremap_leak(void)
{
int count = 0;
diff --git a/arch/x86/mm/pgtable_32.c b/arch/x86/mm/pgtable_32.c
index 1a8faf09afed..792854003ed3 100644
--- a/arch/x86/mm/pgtable_32.c
+++ b/arch/x86/mm/pgtable_32.c
@@ -18,6 +18,7 @@
#include <asm/e820.h>
#include <asm/tlb.h>
#include <asm/tlbflush.h>
+#include <asm/io.h>
unsigned int __VMALLOC_RESERVE = 128 << 20;
@@ -128,6 +129,7 @@ static int __init parse_reservetop(char *arg)
address = memparse(arg, &arg);
reserve_top_address(address);
+ fixup_early_ioremap();
return 0;
}
early_param("reservetop", parse_reservetop);
diff --git a/arch/x86/mm/srat_64.c b/arch/x86/mm/srat_64.c
index 28c68762648f..38512d0c4742 100644
--- a/arch/x86/mm/srat_64.c
+++ b/arch/x86/mm/srat_64.c
@@ -461,7 +461,8 @@ void __init acpi_fake_nodes(const struct bootnode *fake_nodes, int num_nodes)
* node, it must now point to the fake node ID.
*/
for (j = 0; j < MAX_LOCAL_APIC; j++)
- if (apicid_to_node[j] == nid)
+ if (apicid_to_node[j] == nid &&
+ fake_apicid_to_node[j] == NUMA_NO_NODE)
fake_apicid_to_node[j] = i;
}
for (i = 0; i < num_nodes; i++)
diff --git a/arch/x86/oprofile/nmi_int.c b/arch/x86/oprofile/nmi_int.c
index 2c505ee71014..b28d2f1253bb 100644
--- a/arch/x86/oprofile/nmi_int.c
+++ b/arch/x86/oprofile/nmi_int.c
@@ -31,8 +31,9 @@ static struct op_x86_model_spec *model;
static DEFINE_PER_CPU(struct op_msrs, cpu_msrs);
static DEFINE_PER_CPU(unsigned long, saved_lvtpc);
-/* 0 == registered but off, 1 == registered and on */
-static int nmi_enabled = 0;
+/* must be protected with get_online_cpus()/put_online_cpus(): */
+static int nmi_enabled;
+static int ctr_running;
struct op_counter_config counter_config[OP_MAX_COUNTER];
@@ -61,12 +62,16 @@ static int profile_exceptions_notify(struct notifier_block *self,
{
struct die_args *args = (struct die_args *)data;
int ret = NOTIFY_DONE;
- int cpu = smp_processor_id();
switch (val) {
case DIE_NMI:
case DIE_NMI_IPI:
- model->check_ctrs(args->regs, &per_cpu(cpu_msrs, cpu));
+ if (ctr_running)
+ model->check_ctrs(args->regs, &__get_cpu_var(cpu_msrs));
+ else if (!nmi_enabled)
+ break;
+ else
+ model->stop(&__get_cpu_var(cpu_msrs));
ret = NOTIFY_STOP;
break;
default:
@@ -95,24 +100,36 @@ static void nmi_cpu_save_registers(struct op_msrs *msrs)
static void nmi_cpu_start(void *dummy)
{
struct op_msrs const *msrs = &__get_cpu_var(cpu_msrs);
- model->start(msrs);
+ if (!msrs->controls)
+ WARN_ON_ONCE(1);
+ else
+ model->start(msrs);
}
static int nmi_start(void)
{
+ get_online_cpus();
on_each_cpu(nmi_cpu_start, NULL, 1);
+ ctr_running = 1;
+ put_online_cpus();
return 0;
}
static void nmi_cpu_stop(void *dummy)
{
struct op_msrs const *msrs = &__get_cpu_var(cpu_msrs);
- model->stop(msrs);
+ if (!msrs->controls)
+ WARN_ON_ONCE(1);
+ else
+ model->stop(msrs);
}
static void nmi_stop(void)
{
+ get_online_cpus();
on_each_cpu(nmi_cpu_stop, NULL, 1);
+ ctr_running = 0;
+ put_online_cpus();
}
#ifdef CONFIG_OPROFILE_EVENT_MULTIPLEX
@@ -252,7 +269,10 @@ static int nmi_switch_event(void)
if (nmi_multiplex_on() < 0)
return -EINVAL; /* not necessary */
- on_each_cpu(nmi_cpu_switch, NULL, 1);
+ get_online_cpus();
+ if (ctr_running)
+ on_each_cpu(nmi_cpu_switch, NULL, 1);
+ put_online_cpus();
return 0;
}
@@ -295,6 +315,7 @@ static void free_msrs(void)
kfree(per_cpu(cpu_msrs, i).controls);
per_cpu(cpu_msrs, i).controls = NULL;
}
+ nmi_shutdown_mux();
}
static int allocate_msrs(void)
@@ -307,14 +328,21 @@ static int allocate_msrs(void)
per_cpu(cpu_msrs, i).counters = kzalloc(counters_size,
GFP_KERNEL);
if (!per_cpu(cpu_msrs, i).counters)
- return 0;
+ goto fail;
per_cpu(cpu_msrs, i).controls = kzalloc(controls_size,
GFP_KERNEL);
if (!per_cpu(cpu_msrs, i).controls)
- return 0;
+ goto fail;
}
+ if (!nmi_setup_mux())
+ goto fail;
+
return 1;
+
+fail:
+ free_msrs();
+ return 0;
}
static void nmi_cpu_setup(void *dummy)
@@ -336,49 +364,6 @@ static struct notifier_block profile_exceptions_nb = {
.priority = 2
};
-static int nmi_setup(void)
-{
- int err = 0;
- int cpu;
-
- if (!allocate_msrs())
- err = -ENOMEM;
- else if (!nmi_setup_mux())
- err = -ENOMEM;
- else
- err = register_die_notifier(&profile_exceptions_nb);
-
- if (err) {
- free_msrs();
- nmi_shutdown_mux();
- return err;
- }
-
- /* We need to serialize save and setup for HT because the subset
- * of msrs are distinct for save and setup operations
- */
-
- /* Assume saved/restored counters are the same on all CPUs */
- model->fill_in_addresses(&per_cpu(cpu_msrs, 0));
- for_each_possible_cpu(cpu) {
- if (!cpu)
- continue;
-
- memcpy(per_cpu(cpu_msrs, cpu).counters,
- per_cpu(cpu_msrs, 0).counters,
- sizeof(struct op_msr) * model->num_counters);
-
- memcpy(per_cpu(cpu_msrs, cpu).controls,
- per_cpu(cpu_msrs, 0).controls,
- sizeof(struct op_msr) * model->num_controls);
-
- mux_clone(cpu);
- }
- on_each_cpu(nmi_cpu_setup, NULL, 1);
- nmi_enabled = 1;
- return 0;
-}
-
static void nmi_cpu_restore_registers(struct op_msrs *msrs)
{
struct op_msr *counters = msrs->counters;
@@ -412,20 +397,24 @@ static void nmi_cpu_shutdown(void *dummy)
apic_write(APIC_LVTPC, per_cpu(saved_lvtpc, cpu));
apic_write(APIC_LVTERR, v);
nmi_cpu_restore_registers(msrs);
+ if (model->cpu_down)
+ model->cpu_down();
}
-static void nmi_shutdown(void)
+static void nmi_cpu_up(void *dummy)
{
- struct op_msrs *msrs;
+ if (nmi_enabled)
+ nmi_cpu_setup(dummy);
+ if (ctr_running)
+ nmi_cpu_start(dummy);
+}
- nmi_enabled = 0;
- on_each_cpu(nmi_cpu_shutdown, NULL, 1);
- unregister_die_notifier(&profile_exceptions_nb);
- nmi_shutdown_mux();
- msrs = &get_cpu_var(cpu_msrs);
- model->shutdown(msrs);
- free_msrs();
- put_cpu_var(cpu_msrs);
+static void nmi_cpu_down(void *dummy)
+{
+ if (ctr_running)
+ nmi_cpu_stop(dummy);
+ if (nmi_enabled)
+ nmi_cpu_shutdown(dummy);
}
static int nmi_create_files(struct super_block *sb, struct dentry *root)
@@ -457,7 +446,6 @@ static int nmi_create_files(struct super_block *sb, struct dentry *root)
return 0;
}
-#ifdef CONFIG_SMP
static int oprofile_cpu_notifier(struct notifier_block *b, unsigned long action,
void *data)
{
@@ -465,10 +453,10 @@ static int oprofile_cpu_notifier(struct notifier_block *b, unsigned long action,
switch (action) {
case CPU_DOWN_FAILED:
case CPU_ONLINE:
- smp_call_function_single(cpu, nmi_cpu_start, NULL, 0);
+ smp_call_function_single(cpu, nmi_cpu_up, NULL, 0);
break;
case CPU_DOWN_PREPARE:
- smp_call_function_single(cpu, nmi_cpu_stop, NULL, 1);
+ smp_call_function_single(cpu, nmi_cpu_down, NULL, 1);
break;
}
return NOTIFY_DONE;
@@ -477,7 +465,75 @@ static int oprofile_cpu_notifier(struct notifier_block *b, unsigned long action,
static struct notifier_block oprofile_cpu_nb = {
.notifier_call = oprofile_cpu_notifier
};
-#endif
+
+static int nmi_setup(void)
+{
+ int err = 0;
+ int cpu;
+
+ if (!allocate_msrs())
+ return -ENOMEM;
+
+ /* We need to serialize save and setup for HT because the subset
+ * of msrs are distinct for save and setup operations
+ */
+
+ /* Assume saved/restored counters are the same on all CPUs */
+ err = model->fill_in_addresses(&per_cpu(cpu_msrs, 0));
+ if (err)
+ goto fail;
+
+ for_each_possible_cpu(cpu) {
+ if (!cpu)
+ continue;
+
+ memcpy(per_cpu(cpu_msrs, cpu).counters,
+ per_cpu(cpu_msrs, 0).counters,
+ sizeof(struct op_msr) * model->num_counters);
+
+ memcpy(per_cpu(cpu_msrs, cpu).controls,
+ per_cpu(cpu_msrs, 0).controls,
+ sizeof(struct op_msr) * model->num_controls);
+
+ mux_clone(cpu);
+ }
+
+ nmi_enabled = 0;
+ ctr_running = 0;
+ barrier();
+ err = register_die_notifier(&profile_exceptions_nb);
+ if (err)
+ goto fail;
+
+ get_online_cpus();
+ register_cpu_notifier(&oprofile_cpu_nb);
+ on_each_cpu(nmi_cpu_setup, NULL, 1);
+ nmi_enabled = 1;
+ put_online_cpus();
+
+ return 0;
+fail:
+ free_msrs();
+ return err;
+}
+
+static void nmi_shutdown(void)
+{
+ struct op_msrs *msrs;
+
+ get_online_cpus();
+ unregister_cpu_notifier(&oprofile_cpu_nb);
+ on_each_cpu(nmi_cpu_shutdown, NULL, 1);
+ nmi_enabled = 0;
+ ctr_running = 0;
+ put_online_cpus();
+ barrier();
+ unregister_die_notifier(&profile_exceptions_nb);
+ msrs = &get_cpu_var(cpu_msrs);
+ model->shutdown(msrs);
+ free_msrs();
+ put_cpu_var(cpu_msrs);
+}
#ifdef CONFIG_PM
@@ -687,9 +743,6 @@ int __init op_nmi_init(struct oprofile_operations *ops)
return -ENODEV;
}
-#ifdef CONFIG_SMP
- register_cpu_notifier(&oprofile_cpu_nb);
-#endif
/* default values, can be overwritten by model */
ops->create_files = nmi_create_files;
ops->setup = nmi_setup;
@@ -716,12 +769,6 @@ int __init op_nmi_init(struct oprofile_operations *ops)
void op_nmi_exit(void)
{
- if (using_nmi) {
+ if (using_nmi)
exit_sysfs();
-#ifdef CONFIG_SMP
- unregister_cpu_notifier(&oprofile_cpu_nb);
-#endif
- }
- if (model->exit)
- model->exit();
}
diff --git a/arch/x86/oprofile/op_model_amd.c b/arch/x86/oprofile/op_model_amd.c
index 090cbbec7dbd..b67a6b5aa8d4 100644
--- a/arch/x86/oprofile/op_model_amd.c
+++ b/arch/x86/oprofile/op_model_amd.c
@@ -30,13 +30,10 @@
#include "op_counter.h"
#define NUM_COUNTERS 4
-#define NUM_CONTROLS 4
#ifdef CONFIG_OPROFILE_EVENT_MULTIPLEX
#define NUM_VIRT_COUNTERS 32
-#define NUM_VIRT_CONTROLS 32
#else
#define NUM_VIRT_COUNTERS NUM_COUNTERS
-#define NUM_VIRT_CONTROLS NUM_CONTROLS
#endif
#define OP_EVENT_MASK 0x0FFF
@@ -105,102 +102,6 @@ static u32 get_ibs_caps(void)
return ibs_caps;
}
-#ifdef CONFIG_OPROFILE_EVENT_MULTIPLEX
-
-static void op_mux_switch_ctrl(struct op_x86_model_spec const *model,
- struct op_msrs const * const msrs)
-{
- u64 val;
- int i;
-
- /* enable active counters */
- for (i = 0; i < NUM_COUNTERS; ++i) {
- int virt = op_x86_phys_to_virt(i);
- if (!reset_value[virt])
- continue;
- rdmsrl(msrs->controls[i].addr, val);
- val &= model->reserved;
- val |= op_x86_get_ctrl(model, &counter_config[virt]);
- wrmsrl(msrs->controls[i].addr, val);
- }
-}
-
-#endif
-
-/* functions for op_amd_spec */
-
-static void op_amd_fill_in_addresses(struct op_msrs * const msrs)
-{
- int i;
-
- for (i = 0; i < NUM_COUNTERS; i++) {
- if (reserve_perfctr_nmi(MSR_K7_PERFCTR0 + i))
- msrs->counters[i].addr = MSR_K7_PERFCTR0 + i;
- }
-
- for (i = 0; i < NUM_CONTROLS; i++) {
- if (reserve_evntsel_nmi(MSR_K7_EVNTSEL0 + i))
- msrs->controls[i].addr = MSR_K7_EVNTSEL0 + i;
- }
-}
-
-static void op_amd_setup_ctrs(struct op_x86_model_spec const *model,
- struct op_msrs const * const msrs)
-{
- u64 val;
- int i;
-
- /* setup reset_value */
- for (i = 0; i < NUM_VIRT_COUNTERS; ++i) {
- if (counter_config[i].enabled
- && msrs->counters[op_x86_virt_to_phys(i)].addr)
- reset_value[i] = counter_config[i].count;
- else
- reset_value[i] = 0;
- }
-
- /* clear all counters */
- for (i = 0; i < NUM_CONTROLS; ++i) {
- if (unlikely(!msrs->controls[i].addr)) {
- if (counter_config[i].enabled && !smp_processor_id())
- /*
- * counter is reserved, this is on all
- * cpus, so report only for cpu #0
- */
- op_x86_warn_reserved(i);
- continue;
- }
- rdmsrl(msrs->controls[i].addr, val);
- if (val & ARCH_PERFMON_EVENTSEL_ENABLE)
- op_x86_warn_in_use(i);
- val &= model->reserved;
- wrmsrl(msrs->controls[i].addr, val);
- }
-
- /* avoid a false detection of ctr overflows in NMI handler */
- for (i = 0; i < NUM_COUNTERS; ++i) {
- if (unlikely(!msrs->counters[i].addr))
- continue;
- wrmsrl(msrs->counters[i].addr, -1LL);
- }
-
- /* enable active counters */
- for (i = 0; i < NUM_COUNTERS; ++i) {
- int virt = op_x86_phys_to_virt(i);
- if (!reset_value[virt])
- continue;
-
- /* setup counter registers */
- wrmsrl(msrs->counters[i].addr, -(u64)reset_value[virt]);
-
- /* setup control registers */
- rdmsrl(msrs->controls[i].addr, val);
- val &= model->reserved;
- val |= op_x86_get_ctrl(model, &counter_config[virt]);
- wrmsrl(msrs->controls[i].addr, val);
- }
-}
-
/*
* 16-bit Linear Feedback Shift Register (LFSR)
*
@@ -365,6 +266,125 @@ static void op_amd_stop_ibs(void)
wrmsrl(MSR_AMD64_IBSOPCTL, 0);
}
+#ifdef CONFIG_OPROFILE_EVENT_MULTIPLEX
+
+static void op_mux_switch_ctrl(struct op_x86_model_spec const *model,
+ struct op_msrs const * const msrs)
+{
+ u64 val;
+ int i;
+
+ /* enable active counters */
+ for (i = 0; i < NUM_COUNTERS; ++i) {
+ int virt = op_x86_phys_to_virt(i);
+ if (!reset_value[virt])
+ continue;
+ rdmsrl(msrs->controls[i].addr, val);
+ val &= model->reserved;
+ val |= op_x86_get_ctrl(model, &counter_config[virt]);
+ wrmsrl(msrs->controls[i].addr, val);
+ }
+}
+
+#endif
+
+/* functions for op_amd_spec */
+
+static void op_amd_shutdown(struct op_msrs const * const msrs)
+{
+ int i;
+
+ for (i = 0; i < NUM_COUNTERS; ++i) {
+ if (!msrs->counters[i].addr)
+ continue;
+ release_perfctr_nmi(MSR_K7_PERFCTR0 + i);
+ release_evntsel_nmi(MSR_K7_EVNTSEL0 + i);
+ }
+}
+
+static int op_amd_fill_in_addresses(struct op_msrs * const msrs)
+{
+ int i;
+
+ for (i = 0; i < NUM_COUNTERS; i++) {
+ if (!reserve_perfctr_nmi(MSR_K7_PERFCTR0 + i))
+ goto fail;
+ if (!reserve_evntsel_nmi(MSR_K7_EVNTSEL0 + i)) {
+ release_perfctr_nmi(MSR_K7_PERFCTR0 + i);
+ goto fail;
+ }
+ /* both registers must be reserved */
+ msrs->counters[i].addr = MSR_K7_PERFCTR0 + i;
+ msrs->controls[i].addr = MSR_K7_EVNTSEL0 + i;
+ continue;
+ fail:
+ if (!counter_config[i].enabled)
+ continue;
+ op_x86_warn_reserved(i);
+ op_amd_shutdown(msrs);
+ return -EBUSY;
+ }
+
+ return 0;
+}
+
+static void op_amd_setup_ctrs(struct op_x86_model_spec const *model,
+ struct op_msrs const * const msrs)
+{
+ u64 val;
+ int i;
+
+ /* setup reset_value */
+ for (i = 0; i < NUM_VIRT_COUNTERS; ++i) {
+ if (counter_config[i].enabled
+ && msrs->counters[op_x86_virt_to_phys(i)].addr)
+ reset_value[i] = counter_config[i].count;
+ else
+ reset_value[i] = 0;
+ }
+
+ /* clear all counters */
+ for (i = 0; i < NUM_COUNTERS; ++i) {
+ if (!msrs->controls[i].addr)
+ continue;
+ rdmsrl(msrs->controls[i].addr, val);
+ if (val & ARCH_PERFMON_EVENTSEL_ENABLE)
+ op_x86_warn_in_use(i);
+ val &= model->reserved;
+ wrmsrl(msrs->controls[i].addr, val);
+ /*
+ * avoid a false detection of ctr overflows in NMI
+ * handler
+ */
+ wrmsrl(msrs->counters[i].addr, -1LL);
+ }
+
+ /* enable active counters */
+ for (i = 0; i < NUM_COUNTERS; ++i) {
+ int virt = op_x86_phys_to_virt(i);
+ if (!reset_value[virt])
+ continue;
+
+ /* setup counter registers */
+ wrmsrl(msrs->counters[i].addr, -(u64)reset_value[virt]);
+
+ /* setup control registers */
+ rdmsrl(msrs->controls[i].addr, val);
+ val &= model->reserved;
+ val |= op_x86_get_ctrl(model, &counter_config[virt]);
+ wrmsrl(msrs->controls[i].addr, val);
+ }
+
+ if (ibs_caps)
+ setup_APIC_eilvt_ibs(0, APIC_EILVT_MSG_NMI, 0);
+}
+
+static void op_amd_cpu_shutdown(void)
+{
+ if (ibs_caps)
+ setup_APIC_eilvt_ibs(0, APIC_EILVT_MSG_FIX, 1);
+}
+
static int op_amd_check_ctrs(struct pt_regs * const regs,
struct op_msrs const * const msrs)
{
@@ -425,42 +445,16 @@ static void op_amd_stop(struct op_msrs const * const msrs)
op_amd_stop_ibs();
}
-static void op_amd_shutdown(struct op_msrs const * const msrs)
-{
- int i;
-
- for (i = 0; i < NUM_COUNTERS; ++i) {
- if (msrs->counters[i].addr)
- release_perfctr_nmi(MSR_K7_PERFCTR0 + i);
- }
- for (i = 0; i < NUM_CONTROLS; ++i) {
- if (msrs->controls[i].addr)
- release_evntsel_nmi(MSR_K7_EVNTSEL0 + i);
- }
-}
-
-static u8 ibs_eilvt_off;
-
-static inline void apic_init_ibs_nmi_per_cpu(void *arg)
-{
- ibs_eilvt_off = setup_APIC_eilvt_ibs(0, APIC_EILVT_MSG_NMI, 0);
-}
-
-static inline void apic_clear_ibs_nmi_per_cpu(void *arg)
-{
- setup_APIC_eilvt_ibs(0, APIC_EILVT_MSG_FIX, 1);
-}
-
-static int init_ibs_nmi(void)
+static int __init_ibs_nmi(void)
{
#define IBSCTL_LVTOFFSETVAL (1 << 8)
#define IBSCTL 0x1cc
struct pci_dev *cpu_cfg;
int nodes;
u32 value = 0;
+ u8 ibs_eilvt_off;
- /* per CPU setup */
- on_each_cpu(apic_init_ibs_nmi_per_cpu, NULL, 1);
+ ibs_eilvt_off = setup_APIC_eilvt_ibs(0, APIC_EILVT_MSG_FIX, 1);
nodes = 0;
cpu_cfg = NULL;
@@ -490,22 +484,15 @@ static int init_ibs_nmi(void)
return 0;
}
-/* uninitialize the APIC for the IBS interrupts if needed */
-static void clear_ibs_nmi(void)
-{
- if (ibs_caps)
- on_each_cpu(apic_clear_ibs_nmi_per_cpu, NULL, 1);
-}
-
/* initialize the APIC for the IBS interrupts if available */
-static void ibs_init(void)
+static void init_ibs(void)
{
ibs_caps = get_ibs_caps();
if (!ibs_caps)
return;
- if (init_ibs_nmi()) {
+ if (__init_ibs_nmi()) {
ibs_caps = 0;
return;
}
@@ -514,14 +501,6 @@ static void ibs_init(void)
(unsigned)ibs_caps);
}
-static void ibs_exit(void)
-{
- if (!ibs_caps)
- return;
-
- clear_ibs_nmi();
-}
-
static int (*create_arch_files)(struct super_block *sb, struct dentry *root);
static int setup_ibs_files(struct super_block *sb, struct dentry *root)
@@ -570,27 +549,22 @@ static int setup_ibs_files(struct super_block *sb, struct dentry *root)
static int op_amd_init(struct oprofile_operations *ops)
{
- ibs_init();
+ init_ibs();
create_arch_files = ops->create_files;
ops->create_files = setup_ibs_files;
return 0;
}
-static void op_amd_exit(void)
-{
- ibs_exit();
-}
-
struct op_x86_model_spec op_amd_spec = {
.num_counters = NUM_COUNTERS,
- .num_controls = NUM_CONTROLS,
+ .num_controls = NUM_COUNTERS,
.num_virt_counters = NUM_VIRT_COUNTERS,
.reserved = MSR_AMD_EVENTSEL_RESERVED,
.event_mask = OP_EVENT_MASK,
.init = op_amd_init,
- .exit = op_amd_exit,
.fill_in_addresses = &op_amd_fill_in_addresses,
.setup_ctrs = &op_amd_setup_ctrs,
+ .cpu_down = &op_amd_cpu_shutdown,
.check_ctrs = &op_amd_check_ctrs,
.start = &op_amd_start,
.stop = &op_amd_stop,
diff --git a/arch/x86/oprofile/op_model_p4.c b/arch/x86/oprofile/op_model_p4.c
index e6a160a4684a..182558dd5515 100644
--- a/arch/x86/oprofile/op_model_p4.c
+++ b/arch/x86/oprofile/op_model_p4.c
@@ -385,8 +385,26 @@ static unsigned int get_stagger(void)
static unsigned long reset_value[NUM_COUNTERS_NON_HT];
+static void p4_shutdown(struct op_msrs const * const msrs)
+{
+ int i;
-static void p4_fill_in_addresses(struct op_msrs * const msrs)
+ for (i = 0; i < num_counters; ++i) {
+ if (msrs->counters[i].addr)
+ release_perfctr_nmi(msrs->counters[i].addr);
+ }
+ /*
+ * some of the control registers are specially reserved in
+ * conjunction with the counter registers (hence the starting offset).
+ * This saves a few bits.
+ */
+ for (i = num_counters; i < num_controls; ++i) {
+ if (msrs->controls[i].addr)
+ release_evntsel_nmi(msrs->controls[i].addr);
+ }
+}
+
+static int p4_fill_in_addresses(struct op_msrs * const msrs)
{
unsigned int i;
unsigned int addr, cccraddr, stag;
@@ -468,6 +486,18 @@ static void p4_fill_in_addresses(struct op_msrs * const msrs)
msrs->controls[i++].addr = MSR_P4_CRU_ESCR5;
}
}
+
+ for (i = 0; i < num_counters; ++i) {
+ if (!counter_config[i].enabled)
+ continue;
+ if (msrs->controls[i].addr)
+ continue;
+ op_x86_warn_reserved(i);
+ p4_shutdown(msrs);
+ return -EBUSY;
+ }
+
+ return 0;
}
@@ -668,26 +698,6 @@ static void p4_stop(struct op_msrs const * const msrs)
}
}
-static void p4_shutdown(struct op_msrs const * const msrs)
-{
- int i;
-
- for (i = 0; i < num_counters; ++i) {
- if (msrs->counters[i].addr)
- release_perfctr_nmi(msrs->counters[i].addr);
- }
- /*
- * some of the control registers are specially reserved in
- * conjunction with the counter registers (hence the starting offset).
- * This saves a few bits.
- */
- for (i = num_counters; i < num_controls; ++i) {
- if (msrs->controls[i].addr)
- release_evntsel_nmi(msrs->controls[i].addr);
- }
-}
-
-
#ifdef CONFIG_SMP
struct op_x86_model_spec op_p4_ht2_spec = {
.num_counters = NUM_COUNTERS_HT2,
diff --git a/arch/x86/oprofile/op_model_ppro.c b/arch/x86/oprofile/op_model_ppro.c
index 2bf90fafa7b5..d769cda54082 100644
--- a/arch/x86/oprofile/op_model_ppro.c
+++ b/arch/x86/oprofile/op_model_ppro.c
@@ -30,19 +30,46 @@ static int counter_width = 32;
static u64 *reset_value;
-static void ppro_fill_in_addresses(struct op_msrs * const msrs)
+static void ppro_shutdown(struct op_msrs const * const msrs)
{
int i;
- for (i = 0; i < num_counters; i++) {
- if (reserve_perfctr_nmi(MSR_P6_PERFCTR0 + i))
- msrs->counters[i].addr = MSR_P6_PERFCTR0 + i;
+ for (i = 0; i < num_counters; ++i) {
+ if (!msrs->counters[i].addr)
+ continue;
+ release_perfctr_nmi(MSR_P6_PERFCTR0 + i);
+ release_evntsel_nmi(MSR_P6_EVNTSEL0 + i);
+ }
+ if (reset_value) {
+ kfree(reset_value);
+ reset_value = NULL;
}
+}
+
+static int ppro_fill_in_addresses(struct op_msrs * const msrs)
+{
+ int i;
for (i = 0; i < num_counters; i++) {
- if (reserve_evntsel_nmi(MSR_P6_EVNTSEL0 + i))
- msrs->controls[i].addr = MSR_P6_EVNTSEL0 + i;
+ if (!reserve_perfctr_nmi(MSR_P6_PERFCTR0 + i))
+ goto fail;
+ if (!reserve_evntsel_nmi(MSR_P6_EVNTSEL0 + i)) {
+ release_perfctr_nmi(MSR_P6_PERFCTR0 + i);
+ goto fail;
+ }
+ /* both registers must be reserved */
+ msrs->counters[i].addr = MSR_P6_PERFCTR0 + i;
+ msrs->controls[i].addr = MSR_P6_EVNTSEL0 + i;
+ continue;
+ fail:
+ if (!counter_config[i].enabled)
+ continue;
+ op_x86_warn_reserved(i);
+ ppro_shutdown(msrs);
+ return -EBUSY;
}
+
+ return 0;
}
@@ -78,26 +105,17 @@ static void ppro_setup_ctrs(struct op_x86_model_spec const *model,
/* clear all counters */
for (i = 0; i < num_counters; ++i) {
- if (unlikely(!msrs->controls[i].addr)) {
- if (counter_config[i].enabled && !smp_processor_id())
- /*
- * counter is reserved, this is on all
- * cpus, so report only for cpu #0
- */
- op_x86_warn_reserved(i);
+ if (!msrs->controls[i].addr)
continue;
- }
rdmsrl(msrs->controls[i].addr, val);
if (val & ARCH_PERFMON_EVENTSEL_ENABLE)
op_x86_warn_in_use(i);
val &= model->reserved;
wrmsrl(msrs->controls[i].addr, val);
- }
-
- /* avoid a false detection of ctr overflows in NMI handler */
- for (i = 0; i < num_counters; ++i) {
- if (unlikely(!msrs->counters[i].addr))
- continue;
+ /*
+ * avoid a false detection of ctr overflows in NMI *
+ * handler
+ */
wrmsrl(msrs->counters[i].addr, -1LL);
}
@@ -189,25 +207,6 @@ static void ppro_stop(struct op_msrs const * const msrs)
}
}
-static void ppro_shutdown(struct op_msrs const * const msrs)
-{
- int i;
-
- for (i = 0; i < num_counters; ++i) {
- if (msrs->counters[i].addr)
- release_perfctr_nmi(MSR_P6_PERFCTR0 + i);
- }
- for (i = 0; i < num_counters; ++i) {
- if (msrs->controls[i].addr)
- release_evntsel_nmi(MSR_P6_EVNTSEL0 + i);
- }
- if (reset_value) {
- kfree(reset_value);
- reset_value = NULL;
- }
-}
-
-
struct op_x86_model_spec op_ppro_spec = {
.num_counters = 2,
.num_controls = 2,
@@ -239,11 +238,11 @@ static void arch_perfmon_setup_counters(void)
if (eax.split.version_id == 0 && current_cpu_data.x86 == 6 &&
current_cpu_data.x86_model == 15) {
eax.split.version_id = 2;
- eax.split.num_events = 2;
+ eax.split.num_counters = 2;
eax.split.bit_width = 40;
}
- num_counters = eax.split.num_events;
+ num_counters = eax.split.num_counters;
op_arch_perfmon_spec.num_counters = num_counters;
op_arch_perfmon_spec.num_controls = num_counters;
diff --git a/arch/x86/oprofile/op_x86_model.h b/arch/x86/oprofile/op_x86_model.h
index ff82a755edd4..89017fa1fd63 100644
--- a/arch/x86/oprofile/op_x86_model.h
+++ b/arch/x86/oprofile/op_x86_model.h
@@ -40,10 +40,10 @@ struct op_x86_model_spec {
u64 reserved;
u16 event_mask;
int (*init)(struct oprofile_operations *ops);
- void (*exit)(void);
- void (*fill_in_addresses)(struct op_msrs * const msrs);
+ int (*fill_in_addresses)(struct op_msrs * const msrs);
void (*setup_ctrs)(struct op_x86_model_spec const *model,
struct op_msrs const * const msrs);
+ void (*cpu_down)(void);
int (*check_ctrs)(struct pt_regs * const regs,
struct op_msrs const * const msrs);
void (*start)(struct op_msrs const * const msrs);
diff --git a/arch/x86/pci/mrst.c b/arch/x86/pci/mrst.c
index 8bf2fcb88d04..1cdc02cf8fa4 100644
--- a/arch/x86/pci/mrst.c
+++ b/arch/x86/pci/mrst.c
@@ -247,6 +247,10 @@ static void __devinit pci_fixed_bar_fixup(struct pci_dev *dev)
u32 size;
int i;
+ /* Must have extended configuration space */
+ if (dev->cfg_size < PCIE_CAP_OFFSET + 4)
+ return;
+
/* Fixup the BAR sizes for fixed BAR devices and make them unmoveable */
offset = fixed_bar_cap(dev->bus, dev->devfn);
if (!offset || PCI_DEVFN(2, 0) == dev->devfn ||