summaryrefslogtreecommitdiff
path: root/arch/x86/kernel
diff options
context:
space:
mode:
authorJiri Kosina <jkosina@suse.cz>2010-09-24 14:03:37 +0200
committerJiri Kosina <jkosina@suse.cz>2010-09-24 14:03:37 +0200
commit73e6d6c6467771838c1fc6949e6768a12ed72464 (patch)
tree94dd60891869a0a275cd332c4e675cf0b101fcfa /arch/x86/kernel
parent12e5272585901217e919da810c801e5084ee0cb1 (diff)
parenta850ea30374ebed32a0724742601861853fde869 (diff)
Merge branch 'master' into upstream
Diffstat (limited to 'arch/x86/kernel')
-rw-r--r--arch/x86/kernel/apic/io_apic.c2
-rw-r--r--arch/x86/kernel/apic/x2apic_uv_x.c6
-rw-r--r--arch/x86/kernel/cpu/amd.c2
-rw-r--r--arch/x86/kernel/cpu/mcheck/mce_amd.c4
-rw-r--r--arch/x86/kernel/cpu/mcheck/therm_throt.c9
-rw-r--r--arch/x86/kernel/cpu/perf_event.c59
-rw-r--r--arch/x86/kernel/cpu/perf_event_intel.c96
-rw-r--r--arch/x86/kernel/cpu/perf_event_p4.c4
-rw-r--r--arch/x86/kernel/early-quirks.c18
-rw-r--r--arch/x86/kernel/head_32.S8
-rw-r--r--arch/x86/kernel/hpet.c31
-rw-r--r--arch/x86/kernel/hw_breakpoint.c40
-rw-r--r--arch/x86/kernel/i387.c1
-rw-r--r--arch/x86/kernel/kprobes.c25
-rw-r--r--arch/x86/kernel/setup.c2
-rw-r--r--arch/x86/kernel/smpboot.c51
-rw-r--r--arch/x86/kernel/trampoline.c17
-rw-r--r--arch/x86/kernel/tsc.c38
18 files changed, 289 insertions, 124 deletions
diff --git a/arch/x86/kernel/apic/io_apic.c b/arch/x86/kernel/apic/io_apic.c
index 4dc0084ec1b1..f1efebaf5510 100644
--- a/arch/x86/kernel/apic/io_apic.c
+++ b/arch/x86/kernel/apic/io_apic.c
@@ -1728,6 +1728,8 @@ __apicdebuginit(void) print_IO_APIC(void)
struct irq_pin_list *entry;
cfg = desc->chip_data;
+ if (!cfg)
+ continue;
entry = cfg->irq_2_pin;
if (!entry)
continue;
diff --git a/arch/x86/kernel/apic/x2apic_uv_x.c b/arch/x86/kernel/apic/x2apic_uv_x.c
index 7b598b84c902..f744f54cb248 100644
--- a/arch/x86/kernel/apic/x2apic_uv_x.c
+++ b/arch/x86/kernel/apic/x2apic_uv_x.c
@@ -698,9 +698,11 @@ void __init uv_system_init(void)
for (j = 0; j < 64; j++) {
if (!test_bit(j, &present))
continue;
- uv_blade_info[blade].pnode = (i * 64 + j);
+ pnode = (i * 64 + j);
+ uv_blade_info[blade].pnode = pnode;
uv_blade_info[blade].nr_possible_cpus = 0;
uv_blade_info[blade].nr_online_cpus = 0;
+ max_pnode = max(pnode, max_pnode);
blade++;
}
}
@@ -738,7 +740,6 @@ void __init uv_system_init(void)
uv_cpu_hub_info(cpu)->scir.offset = uv_scir_offset(apicid);
uv_node_to_blade[nid] = blade;
uv_cpu_to_blade[cpu] = blade;
- max_pnode = max(pnode, max_pnode);
}
/* Add blade/pnode info for nodes without cpus */
@@ -750,7 +751,6 @@ void __init uv_system_init(void)
pnode = (paddr >> m_val) & pnode_mask;
blade = boot_pnode_to_blade(pnode);
uv_node_to_blade[nid] = blade;
- max_pnode = max(pnode, max_pnode);
}
map_gru_high(max_pnode);
diff --git a/arch/x86/kernel/cpu/amd.c b/arch/x86/kernel/cpu/amd.c
index 60a57b13082d..ba5f62f45f01 100644
--- a/arch/x86/kernel/cpu/amd.c
+++ b/arch/x86/kernel/cpu/amd.c
@@ -669,7 +669,7 @@ bool cpu_has_amd_erratum(const int *erratum)
}
/* OSVW unavailable or ID unknown, match family-model-stepping range */
- ms = (cpu->x86_model << 8) | cpu->x86_mask;
+ ms = (cpu->x86_model << 4) | cpu->x86_mask;
while ((range = *erratum++))
if ((cpu->x86 == AMD_MODEL_RANGE_FAMILY(range)) &&
(ms >= AMD_MODEL_RANGE_START(range)) &&
diff --git a/arch/x86/kernel/cpu/mcheck/mce_amd.c b/arch/x86/kernel/cpu/mcheck/mce_amd.c
index 224392d8fe8c..5e975298fa81 100644
--- a/arch/x86/kernel/cpu/mcheck/mce_amd.c
+++ b/arch/x86/kernel/cpu/mcheck/mce_amd.c
@@ -530,7 +530,7 @@ static __cpuinit int threshold_create_bank(unsigned int cpu, unsigned int bank)
err = -ENOMEM;
goto out;
}
- if (!alloc_cpumask_var(&b->cpus, GFP_KERNEL)) {
+ if (!zalloc_cpumask_var(&b->cpus, GFP_KERNEL)) {
kfree(b);
err = -ENOMEM;
goto out;
@@ -543,7 +543,7 @@ static __cpuinit int threshold_create_bank(unsigned int cpu, unsigned int bank)
#ifndef CONFIG_SMP
cpumask_setall(b->cpus);
#else
- cpumask_copy(b->cpus, c->llc_shared_map);
+ cpumask_set_cpu(cpu, b->cpus);
#endif
per_cpu(threshold_banks, cpu)[bank] = b;
diff --git a/arch/x86/kernel/cpu/mcheck/therm_throt.c b/arch/x86/kernel/cpu/mcheck/therm_throt.c
index c2a8b26d4fea..d9368eeda309 100644
--- a/arch/x86/kernel/cpu/mcheck/therm_throt.c
+++ b/arch/x86/kernel/cpu/mcheck/therm_throt.c
@@ -202,10 +202,11 @@ static int therm_throt_process(bool new_event, int event, int level)
#ifdef CONFIG_SYSFS
/* Add/Remove thermal_throttle interface for CPU device: */
-static __cpuinit int thermal_throttle_add_dev(struct sys_device *sys_dev)
+static __cpuinit int thermal_throttle_add_dev(struct sys_device *sys_dev,
+ unsigned int cpu)
{
int err;
- struct cpuinfo_x86 *c = &cpu_data(smp_processor_id());
+ struct cpuinfo_x86 *c = &cpu_data(cpu);
err = sysfs_create_group(&sys_dev->kobj, &thermal_attr_group);
if (err)
@@ -251,7 +252,7 @@ thermal_throttle_cpu_callback(struct notifier_block *nfb,
case CPU_UP_PREPARE:
case CPU_UP_PREPARE_FROZEN:
mutex_lock(&therm_cpu_lock);
- err = thermal_throttle_add_dev(sys_dev);
+ err = thermal_throttle_add_dev(sys_dev, cpu);
mutex_unlock(&therm_cpu_lock);
WARN_ON(err);
break;
@@ -287,7 +288,7 @@ static __init int thermal_throttle_init_device(void)
#endif
/* connect live CPUs to sysfs */
for_each_online_cpu(cpu) {
- err = thermal_throttle_add_dev(get_cpu_sysdev(cpu));
+ err = thermal_throttle_add_dev(get_cpu_sysdev(cpu), cpu);
WARN_ON(err);
}
#ifdef CONFIG_HOTPLUG_CPU
diff --git a/arch/x86/kernel/cpu/perf_event.c b/arch/x86/kernel/cpu/perf_event.c
index f2da20fda02d..3efdf2870a35 100644
--- a/arch/x86/kernel/cpu/perf_event.c
+++ b/arch/x86/kernel/cpu/perf_event.c
@@ -1154,7 +1154,7 @@ static int x86_pmu_handle_irq(struct pt_regs *regs)
/*
* event overflow
*/
- handled = 1;
+ handled++;
data.period = event->hw.last_period;
if (!x86_perf_event_set_period(event))
@@ -1200,12 +1200,20 @@ void perf_events_lapic_init(void)
apic_write(APIC_LVTPC, APIC_DM_NMI);
}
+struct pmu_nmi_state {
+ unsigned int marked;
+ int handled;
+};
+
+static DEFINE_PER_CPU(struct pmu_nmi_state, pmu_nmi);
+
static int __kprobes
perf_event_nmi_handler(struct notifier_block *self,
unsigned long cmd, void *__args)
{
struct die_args *args = __args;
- struct pt_regs *regs;
+ unsigned int this_nmi;
+ int handled;
if (!atomic_read(&active_events))
return NOTIFY_DONE;
@@ -1214,22 +1222,47 @@ perf_event_nmi_handler(struct notifier_block *self,
case DIE_NMI:
case DIE_NMI_IPI:
break;
-
+ case DIE_NMIUNKNOWN:
+ this_nmi = percpu_read(irq_stat.__nmi_count);
+ if (this_nmi != __get_cpu_var(pmu_nmi).marked)
+ /* let the kernel handle the unknown nmi */
+ return NOTIFY_DONE;
+ /*
+ * This one is a PMU back-to-back nmi. Two events
+ * trigger 'simultaneously' raising two back-to-back
+ * NMIs. If the first NMI handles both, the latter
+ * will be empty and daze the CPU. So, we drop it to
+ * avoid false-positive 'unknown nmi' messages.
+ */
+ return NOTIFY_STOP;
default:
return NOTIFY_DONE;
}
- regs = args->regs;
-
apic_write(APIC_LVTPC, APIC_DM_NMI);
- /*
- * Can't rely on the handled return value to say it was our NMI, two
- * events could trigger 'simultaneously' raising two back-to-back NMIs.
- *
- * If the first NMI handles both, the latter will be empty and daze
- * the CPU.
- */
- x86_pmu.handle_irq(regs);
+
+ handled = x86_pmu.handle_irq(args->regs);
+ if (!handled)
+ return NOTIFY_DONE;
+
+ this_nmi = percpu_read(irq_stat.__nmi_count);
+ if ((handled > 1) ||
+ /* the next nmi could be a back-to-back nmi */
+ ((__get_cpu_var(pmu_nmi).marked == this_nmi) &&
+ (__get_cpu_var(pmu_nmi).handled > 1))) {
+ /*
+ * We could have two subsequent back-to-back nmis: The
+ * first handles more than one counter, the 2nd
+ * handles only one counter and the 3rd handles no
+ * counter.
+ *
+ * This is the 2nd nmi because the previous was
+ * handling more than one counter. We will mark the
+ * next (3rd) and then drop it if unhandled.
+ */
+ __get_cpu_var(pmu_nmi).marked = this_nmi + 1;
+ __get_cpu_var(pmu_nmi).handled = handled;
+ }
return NOTIFY_STOP;
}
diff --git a/arch/x86/kernel/cpu/perf_event_intel.c b/arch/x86/kernel/cpu/perf_event_intel.c
index 214ac860ebe0..ee05c90012d2 100644
--- a/arch/x86/kernel/cpu/perf_event_intel.c
+++ b/arch/x86/kernel/cpu/perf_event_intel.c
@@ -491,33 +491,78 @@ static void intel_pmu_enable_all(int added)
* Intel Errata AAP53 (model 30)
* Intel Errata BD53 (model 44)
*
- * These chips need to be 'reset' when adding counters by programming
- * the magic three (non counting) events 0x4300D2, 0x4300B1 and 0x4300B5
- * either in sequence on the same PMC or on different PMCs.
+ * The official story:
+ * These chips need to be 'reset' when adding counters by programming the
+ * magic three (non-counting) events 0x4300B5, 0x4300D2, and 0x4300B1 either
+ * in sequence on the same PMC or on different PMCs.
+ *
+ * In practise it appears some of these events do in fact count, and
+ * we need to programm all 4 events.
*/
-static void intel_pmu_nhm_enable_all(int added)
+static void intel_pmu_nhm_workaround(void)
{
- if (added) {
- struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
- int i;
+ struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
+ static const unsigned long nhm_magic[4] = {
+ 0x4300B5,
+ 0x4300D2,
+ 0x4300B1,
+ 0x4300B1
+ };
+ struct perf_event *event;
+ int i;
+
+ /*
+ * The Errata requires below steps:
+ * 1) Clear MSR_IA32_PEBS_ENABLE and MSR_CORE_PERF_GLOBAL_CTRL;
+ * 2) Configure 4 PERFEVTSELx with the magic events and clear
+ * the corresponding PMCx;
+ * 3) set bit0~bit3 of MSR_CORE_PERF_GLOBAL_CTRL;
+ * 4) Clear MSR_CORE_PERF_GLOBAL_CTRL;
+ * 5) Clear 4 pairs of ERFEVTSELx and PMCx;
+ */
- wrmsrl(MSR_ARCH_PERFMON_EVENTSEL0 + 0, 0x4300D2);
- wrmsrl(MSR_ARCH_PERFMON_EVENTSEL0 + 1, 0x4300B1);
- wrmsrl(MSR_ARCH_PERFMON_EVENTSEL0 + 2, 0x4300B5);
+ /*
+ * The real steps we choose are a little different from above.
+ * A) To reduce MSR operations, we don't run step 1) as they
+ * are already cleared before this function is called;
+ * B) Call x86_perf_event_update to save PMCx before configuring
+ * PERFEVTSELx with magic number;
+ * C) With step 5), we do clear only when the PERFEVTSELx is
+ * not used currently.
+ * D) Call x86_perf_event_set_period to restore PMCx;
+ */
+
+ /* We always operate 4 pairs of PERF Counters */
+ for (i = 0; i < 4; i++) {
+ event = cpuc->events[i];
+ if (event)
+ x86_perf_event_update(event);
+ }
- wrmsrl(MSR_CORE_PERF_GLOBAL_CTRL, 0x3);
- wrmsrl(MSR_CORE_PERF_GLOBAL_CTRL, 0x0);
+ for (i = 0; i < 4; i++) {
+ wrmsrl(MSR_ARCH_PERFMON_EVENTSEL0 + i, nhm_magic[i]);
+ wrmsrl(MSR_ARCH_PERFMON_PERFCTR0 + i, 0x0);
+ }
- for (i = 0; i < 3; i++) {
- struct perf_event *event = cpuc->events[i];
+ wrmsrl(MSR_CORE_PERF_GLOBAL_CTRL, 0xf);
+ wrmsrl(MSR_CORE_PERF_GLOBAL_CTRL, 0x0);
- if (!event)
- continue;
+ for (i = 0; i < 4; i++) {
+ event = cpuc->events[i];
+ if (event) {
+ x86_perf_event_set_period(event);
__x86_pmu_enable_event(&event->hw,
- ARCH_PERFMON_EVENTSEL_ENABLE);
- }
+ ARCH_PERFMON_EVENTSEL_ENABLE);
+ } else
+ wrmsrl(MSR_ARCH_PERFMON_EVENTSEL0 + i, 0x0);
}
+}
+
+static void intel_pmu_nhm_enable_all(int added)
+{
+ if (added)
+ intel_pmu_nhm_workaround();
intel_pmu_enable_all(added);
}
@@ -667,7 +712,8 @@ static int intel_pmu_handle_irq(struct pt_regs *regs)
struct perf_sample_data data;
struct cpu_hw_events *cpuc;
int bit, loops;
- u64 ack, status;
+ u64 status;
+ int handled = 0;
perf_sample_data_init(&data, 0);
@@ -683,6 +729,7 @@ static int intel_pmu_handle_irq(struct pt_regs *regs)
loops = 0;
again:
+ intel_pmu_ack_status(status);
if (++loops > 100) {
WARN_ONCE(1, "perfevents: irq loop stuck!\n");
perf_event_print_debug();
@@ -691,19 +738,22 @@ again:
}
inc_irq_stat(apic_perf_irqs);
- ack = status;
intel_pmu_lbr_read();
/*
* PEBS overflow sets bit 62 in the global status register
*/
- if (__test_and_clear_bit(62, (unsigned long *)&status))
+ if (__test_and_clear_bit(62, (unsigned long *)&status)) {
+ handled++;
x86_pmu.drain_pebs(regs);
+ }
for_each_set_bit(bit, (unsigned long *)&status, X86_PMC_IDX_MAX) {
struct perf_event *event = cpuc->events[bit];
+ handled++;
+
if (!test_bit(bit, cpuc->active_mask))
continue;
@@ -716,8 +766,6 @@ again:
x86_pmu_stop(event);
}
- intel_pmu_ack_status(ack);
-
/*
* Repeat if there is more work to be done:
*/
@@ -727,7 +775,7 @@ again:
done:
intel_pmu_enable_all(0);
- return 1;
+ return handled;
}
static struct event_constraint *
diff --git a/arch/x86/kernel/cpu/perf_event_p4.c b/arch/x86/kernel/cpu/perf_event_p4.c
index febb12cea795..b560db3305be 100644
--- a/arch/x86/kernel/cpu/perf_event_p4.c
+++ b/arch/x86/kernel/cpu/perf_event_p4.c
@@ -497,6 +497,8 @@ static int p4_hw_config(struct perf_event *event)
event->hw.config |= event->attr.config &
(p4_config_pack_escr(P4_ESCR_MASK_HT) |
p4_config_pack_cccr(P4_CCCR_MASK_HT | P4_CCCR_RESERVED));
+
+ event->hw.config &= ~P4_CCCR_FORCE_OVF;
}
rc = x86_setup_perfctr(event);
@@ -690,7 +692,7 @@ static int p4_pmu_handle_irq(struct pt_regs *regs)
inc_irq_stat(apic_perf_irqs);
}
- return handled > 0;
+ return handled;
}
/*
diff --git a/arch/x86/kernel/early-quirks.c b/arch/x86/kernel/early-quirks.c
index e5cc7e82e60d..ebdb85cf2686 100644
--- a/arch/x86/kernel/early-quirks.c
+++ b/arch/x86/kernel/early-quirks.c
@@ -18,7 +18,6 @@
#include <asm/apic.h>
#include <asm/iommu.h>
#include <asm/gart.h>
-#include <asm/hpet.h>
static void __init fix_hypertransport_config(int num, int slot, int func)
{
@@ -192,21 +191,6 @@ static void __init ati_bugs_contd(int num, int slot, int func)
}
#endif
-/*
- * Force the read back of the CMP register in hpet_next_event()
- * to work around the problem that the CMP register write seems to be
- * delayed. See hpet_next_event() for details.
- *
- * We do this on all SMBUS incarnations for now until we have more
- * information about the affected chipsets.
- */
-static void __init ati_hpet_bugs(int num, int slot, int func)
-{
-#ifdef CONFIG_HPET_TIMER
- hpet_readback_cmp = 1;
-#endif
-}
-
#define QFLAG_APPLY_ONCE 0x1
#define QFLAG_APPLIED 0x2
#define QFLAG_DONE (QFLAG_APPLY_ONCE|QFLAG_APPLIED)
@@ -236,8 +220,6 @@ static struct chipset early_qrk[] __initdata = {
PCI_CLASS_SERIAL_SMBUS, PCI_ANY_ID, 0, ati_bugs },
{ PCI_VENDOR_ID_ATI, PCI_DEVICE_ID_ATI_SBX00_SMBUS,
PCI_CLASS_SERIAL_SMBUS, PCI_ANY_ID, 0, ati_bugs_contd },
- { PCI_VENDOR_ID_ATI, PCI_ANY_ID,
- PCI_CLASS_SERIAL_SMBUS, PCI_ANY_ID, 0, ati_hpet_bugs },
{}
};
diff --git a/arch/x86/kernel/head_32.S b/arch/x86/kernel/head_32.S
index ff4c453e13f3..fa8c1b8e09fb 100644
--- a/arch/x86/kernel/head_32.S
+++ b/arch/x86/kernel/head_32.S
@@ -334,7 +334,7 @@ ENTRY(startup_32_smp)
/*
* Enable paging
*/
- movl $pa(swapper_pg_dir),%eax
+ movl pa(initial_page_table), %eax
movl %eax,%cr3 /* set the page table pointer.. */
movl %cr0,%eax
orl $X86_CR0_PG,%eax
@@ -614,6 +614,8 @@ ignore_int:
.align 4
ENTRY(initial_code)
.long i386_start_kernel
+ENTRY(initial_page_table)
+ .long pa(swapper_pg_dir)
/*
* BSS section
@@ -629,6 +631,10 @@ ENTRY(swapper_pg_dir)
#endif
swapper_pg_fixmap:
.fill 1024,4,0
+#ifdef CONFIG_X86_TRAMPOLINE
+ENTRY(trampoline_pg_dir)
+ .fill 1024,4,0
+#endif
ENTRY(empty_zero_page)
.fill 4096,1,0
diff --git a/arch/x86/kernel/hpet.c b/arch/x86/kernel/hpet.c
index 351f9c0fea1f..410fdb3f1939 100644
--- a/arch/x86/kernel/hpet.c
+++ b/arch/x86/kernel/hpet.c
@@ -35,7 +35,6 @@
unsigned long hpet_address;
u8 hpet_blockid; /* OS timer block num */
u8 hpet_msi_disable;
-u8 hpet_readback_cmp;
#ifdef CONFIG_PCI_MSI
static unsigned long hpet_num_timers;
@@ -395,23 +394,27 @@ static int hpet_next_event(unsigned long delta,
* at that point and we would wait for the next hpet interrupt
* forever. We found out that reading the CMP register back
* forces the transfer so we can rely on the comparison with
- * the counter register below.
+ * the counter register below. If the read back from the
+ * compare register does not match the value we programmed
+ * then we might have a real hardware problem. We can not do
+ * much about it here, but at least alert the user/admin with
+ * a prominent warning.
*
- * That works fine on those ATI chipsets, but on newer Intel
- * chipsets (ICH9...) this triggers due to an erratum: Reading
- * the comparator immediately following a write is returning
- * the old value.
+ * An erratum on some chipsets (ICH9,..), results in
+ * comparator read immediately following a write returning old
+ * value. Workaround for this is to read this value second
+ * time, when first read returns old value.
*
- * We restrict the read back to the affected ATI chipsets (set
- * by quirks) and also run it with hpet=verbose for debugging
- * purposes.
+ * In fact the write to the comparator register is delayed up
+ * to two HPET cycles so the workaround we tried to restrict
+ * the readback to those known to be borked ATI chipsets
+ * failed miserably. So we give up on optimizations forever
+ * and penalize all HPET incarnations unconditionally.
*/
- if (hpet_readback_cmp || hpet_verbose) {
- u32 cmp = hpet_readl(HPET_Tn_CMP(timer));
-
- if (cmp != cnt)
+ if (unlikely((u32)hpet_readl(HPET_Tn_CMP(timer)) != cnt)) {
+ if (hpet_readl(HPET_Tn_CMP(timer)) != cnt)
printk_once(KERN_WARNING
- "hpet: compare register read back failed.\n");
+ "hpet: compare register read back failed.\n");
}
return (s32)(hpet_readl(HPET_COUNTER) - cnt) >= 0 ? -ETIME : 0;
diff --git a/arch/x86/kernel/hw_breakpoint.c b/arch/x86/kernel/hw_breakpoint.c
index a474ec37c32f..ff15c9dcc25d 100644
--- a/arch/x86/kernel/hw_breakpoint.c
+++ b/arch/x86/kernel/hw_breakpoint.c
@@ -206,11 +206,27 @@ int arch_check_bp_in_kernelspace(struct perf_event *bp)
int arch_bp_generic_fields(int x86_len, int x86_type,
int *gen_len, int *gen_type)
{
- /* Len */
- switch (x86_len) {
- case X86_BREAKPOINT_LEN_X:
+ /* Type */
+ switch (x86_type) {
+ case X86_BREAKPOINT_EXECUTE:
+ if (x86_len != X86_BREAKPOINT_LEN_X)
+ return -EINVAL;
+
+ *gen_type = HW_BREAKPOINT_X;
*gen_len = sizeof(long);
+ return 0;
+ case X86_BREAKPOINT_WRITE:
+ *gen_type = HW_BREAKPOINT_W;
break;
+ case X86_BREAKPOINT_RW:
+ *gen_type = HW_BREAKPOINT_W | HW_BREAKPOINT_R;
+ break;
+ default:
+ return -EINVAL;
+ }
+
+ /* Len */
+ switch (x86_len) {
case X86_BREAKPOINT_LEN_1:
*gen_len = HW_BREAKPOINT_LEN_1;
break;
@@ -229,21 +245,6 @@ int arch_bp_generic_fields(int x86_len, int x86_type,
return -EINVAL;
}
- /* Type */
- switch (x86_type) {
- case X86_BREAKPOINT_EXECUTE:
- *gen_type = HW_BREAKPOINT_X;
- break;
- case X86_BREAKPOINT_WRITE:
- *gen_type = HW_BREAKPOINT_W;
- break;
- case X86_BREAKPOINT_RW:
- *gen_type = HW_BREAKPOINT_W | HW_BREAKPOINT_R;
- break;
- default:
- return -EINVAL;
- }
-
return 0;
}
@@ -316,9 +317,6 @@ int arch_validate_hwbkpt_settings(struct perf_event *bp)
ret = -EINVAL;
switch (info->len) {
- case X86_BREAKPOINT_LEN_X:
- align = sizeof(long) -1;
- break;
case X86_BREAKPOINT_LEN_1:
align = 0;
break;
diff --git a/arch/x86/kernel/i387.c b/arch/x86/kernel/i387.c
index 1f11f5ce668f..a46cb3522c0c 100644
--- a/arch/x86/kernel/i387.c
+++ b/arch/x86/kernel/i387.c
@@ -40,6 +40,7 @@
static unsigned int mxcsr_feature_mask __read_mostly = 0xffffffffu;
unsigned int xstate_size;
+EXPORT_SYMBOL_GPL(xstate_size);
unsigned int sig_xstate_ia32_size = sizeof(struct _fpstate_ia32);
static struct i387_fxsave_struct fx_scratch __cpuinitdata;
diff --git a/arch/x86/kernel/kprobes.c b/arch/x86/kernel/kprobes.c
index 1bfb6cf4dd55..770ebfb349e9 100644
--- a/arch/x86/kernel/kprobes.c
+++ b/arch/x86/kernel/kprobes.c
@@ -709,6 +709,7 @@ static __used __kprobes void *trampoline_handler(struct pt_regs *regs)
struct hlist_node *node, *tmp;
unsigned long flags, orig_ret_address = 0;
unsigned long trampoline_address = (unsigned long)&kretprobe_trampoline;
+ kprobe_opcode_t *correct_ret_addr = NULL;
INIT_HLIST_HEAD(&empty_rp);
kretprobe_hash_lock(current, &head, &flags);
@@ -740,14 +741,34 @@ static __used __kprobes void *trampoline_handler(struct pt_regs *regs)
/* another task is sharing our hash bucket */
continue;
+ orig_ret_address = (unsigned long)ri->ret_addr;
+
+ if (orig_ret_address != trampoline_address)
+ /*
+ * This is the real return address. Any other
+ * instances associated with this task are for
+ * other calls deeper on the call stack
+ */
+ break;
+ }
+
+ kretprobe_assert(ri, orig_ret_address, trampoline_address);
+
+ correct_ret_addr = ri->ret_addr;
+ hlist_for_each_entry_safe(ri, node, tmp, head, hlist) {
+ if (ri->task != current)
+ /* another task is sharing our hash bucket */
+ continue;
+
+ orig_ret_address = (unsigned long)ri->ret_addr;
if (ri->rp && ri->rp->handler) {
__get_cpu_var(current_kprobe) = &ri->rp->kp;
get_kprobe_ctlblk()->kprobe_status = KPROBE_HIT_ACTIVE;
+ ri->ret_addr = correct_ret_addr;
ri->rp->handler(ri, regs);
__get_cpu_var(current_kprobe) = NULL;
}
- orig_ret_address = (unsigned long)ri->ret_addr;
recycle_rp_inst(ri, &empty_rp);
if (orig_ret_address != trampoline_address)
@@ -759,8 +780,6 @@ static __used __kprobes void *trampoline_handler(struct pt_regs *regs)
break;
}
- kretprobe_assert(ri, orig_ret_address, trampoline_address);
-
kretprobe_hash_unlock(current, &flags);
hlist_for_each_entry_safe(ri, node, tmp, &empty_rp, hlist) {
diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c
index b008e7883207..c3a4fbb2b996 100644
--- a/arch/x86/kernel/setup.c
+++ b/arch/x86/kernel/setup.c
@@ -1014,6 +1014,8 @@ void __init setup_arch(char **cmdline_p)
paging_init();
x86_init.paging.pagetable_setup_done(swapper_pg_dir);
+ setup_trampoline_page_table();
+
tboot_probe();
#ifdef CONFIG_X86_64
diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c
index a5e928b0cb5f..8b3bfc4dd708 100644
--- a/arch/x86/kernel/smpboot.c
+++ b/arch/x86/kernel/smpboot.c
@@ -73,7 +73,6 @@
#ifdef CONFIG_X86_32
u8 apicid_2_node[MAX_APICID];
-static int low_mappings;
#endif
/* State of each CPU */
@@ -91,6 +90,25 @@ DEFINE_PER_CPU(int, cpu_state) = { 0 };
static DEFINE_PER_CPU(struct task_struct *, idle_thread_array);
#define get_idle_for_cpu(x) (per_cpu(idle_thread_array, x))
#define set_idle_for_cpu(x, p) (per_cpu(idle_thread_array, x) = (p))
+
+/*
+ * We need this for trampoline_base protection from concurrent accesses when
+ * off- and onlining cores wildly.
+ */
+static DEFINE_MUTEX(x86_cpu_hotplug_driver_mutex);
+
+void cpu_hotplug_driver_lock()
+{
+ mutex_lock(&x86_cpu_hotplug_driver_mutex);
+}
+
+void cpu_hotplug_driver_unlock()
+{
+ mutex_unlock(&x86_cpu_hotplug_driver_mutex);
+}
+
+ssize_t arch_cpu_probe(const char *buf, size_t count) { return -1; }
+ssize_t arch_cpu_release(const char *buf, size_t count) { return -1; }
#else
static struct task_struct *idle_thread_array[NR_CPUS] __cpuinitdata ;
#define get_idle_for_cpu(x) (idle_thread_array[(x)])
@@ -281,6 +299,18 @@ notrace static void __cpuinit start_secondary(void *unused)
* fragile that we want to limit the things done here to the
* most necessary things.
*/
+
+#ifdef CONFIG_X86_32
+ /*
+ * Switch away from the trampoline page-table
+ *
+ * Do this before cpu_init() because it needs to access per-cpu
+ * data which may not be mapped in the trampoline page-table.
+ */
+ load_cr3(swapper_pg_dir);
+ __flush_tlb_all();
+#endif
+
vmi_bringup();
cpu_init();
preempt_disable();
@@ -299,12 +329,6 @@ notrace static void __cpuinit start_secondary(void *unused)
legacy_pic->chip->unmask(0);
}
-#ifdef CONFIG_X86_32
- while (low_mappings)
- cpu_relax();
- __flush_tlb_all();
-#endif
-
/* This must be done before setting cpu_online_mask */
set_cpu_sibling_map(raw_smp_processor_id());
wmb();
@@ -750,6 +774,7 @@ do_rest:
#ifdef CONFIG_X86_32
/* Stack for startup_32 can be just as for start_secondary onwards */
irq_ctx_init(cpu);
+ initial_page_table = __pa(&trampoline_pg_dir);
#else
clear_tsk_thread_flag(c_idle.idle, TIF_FORK);
initial_gs = per_cpu_offset(cpu);
@@ -897,20 +922,8 @@ int __cpuinit native_cpu_up(unsigned int cpu)
per_cpu(cpu_state, cpu) = CPU_UP_PREPARE;
-#ifdef CONFIG_X86_32
- /* init low mem mapping */
- clone_pgd_range(swapper_pg_dir, swapper_pg_dir + KERNEL_PGD_BOUNDARY,
- min_t(unsigned long, KERNEL_PGD_PTRS, KERNEL_PGD_BOUNDARY));
- flush_tlb_all();
- low_mappings = 1;
-
err = do_boot_cpu(apicid, cpu);
- zap_low_mappings(false);
- low_mappings = 0;
-#else
- err = do_boot_cpu(apicid, cpu);
-#endif
if (err) {
pr_debug("do_boot_cpu failed %d\n", err);
return -EIO;
diff --git a/arch/x86/kernel/trampoline.c b/arch/x86/kernel/trampoline.c
index c652ef62742d..e2a595257390 100644
--- a/arch/x86/kernel/trampoline.c
+++ b/arch/x86/kernel/trampoline.c
@@ -1,6 +1,7 @@
#include <linux/io.h>
#include <asm/trampoline.h>
+#include <asm/pgtable.h>
#include <asm/e820.h>
#if defined(CONFIG_X86_64) && defined(CONFIG_ACPI_SLEEP)
@@ -37,3 +38,19 @@ unsigned long __trampinit setup_trampoline(void)
memcpy(trampoline_base, trampoline_data, TRAMPOLINE_SIZE);
return virt_to_phys(trampoline_base);
}
+
+void __init setup_trampoline_page_table(void)
+{
+#ifdef CONFIG_X86_32
+ /* Copy kernel address range */
+ clone_pgd_range(trampoline_pg_dir + KERNEL_PGD_BOUNDARY,
+ swapper_pg_dir + KERNEL_PGD_BOUNDARY,
+ KERNEL_PGD_PTRS);
+
+ /* Initialize low mappings */
+ clone_pgd_range(trampoline_pg_dir,
+ swapper_pg_dir + KERNEL_PGD_BOUNDARY,
+ min_t(unsigned long, KERNEL_PGD_PTRS,
+ KERNEL_PGD_BOUNDARY));
+#endif
+}
diff --git a/arch/x86/kernel/tsc.c b/arch/x86/kernel/tsc.c
index ce8e50239332..26a863a9c2a8 100644
--- a/arch/x86/kernel/tsc.c
+++ b/arch/x86/kernel/tsc.c
@@ -626,6 +626,44 @@ static void set_cyc2ns_scale(unsigned long cpu_khz, int cpu)
local_irq_restore(flags);
}
+static unsigned long long cyc2ns_suspend;
+
+void save_sched_clock_state(void)
+{
+ if (!sched_clock_stable)
+ return;
+
+ cyc2ns_suspend = sched_clock();
+}
+
+/*
+ * Even on processors with invariant TSC, TSC gets reset in some the
+ * ACPI system sleep states. And in some systems BIOS seem to reinit TSC to
+ * arbitrary value (still sync'd across cpu's) during resume from such sleep
+ * states. To cope up with this, recompute the cyc2ns_offset for each cpu so
+ * that sched_clock() continues from the point where it was left off during
+ * suspend.
+ */
+void restore_sched_clock_state(void)
+{
+ unsigned long long offset;
+ unsigned long flags;
+ int cpu;
+
+ if (!sched_clock_stable)
+ return;
+
+ local_irq_save(flags);
+
+ __get_cpu_var(cyc2ns_offset) = 0;
+ offset = cyc2ns_suspend - sched_clock();
+
+ for_each_possible_cpu(cpu)
+ per_cpu(cyc2ns_offset, cpu) = offset;
+
+ local_irq_restore(flags);
+}
+
#ifdef CONFIG_CPU_FREQ
/* Frequency scaling support. Adjust the TSC based timer when the cpu frequency