summaryrefslogtreecommitdiff
path: root/arch/x86_64/kernel
diff options
context:
space:
mode:
Diffstat (limited to 'arch/x86_64/kernel')
-rw-r--r--arch/x86_64/kernel/Makefile1
-rw-r--r--arch/x86_64/kernel/acpi/wakeup.S2
-rw-r--r--arch/x86_64/kernel/aperture.c4
-rw-r--r--arch/x86_64/kernel/apic.c46
-rw-r--r--arch/x86_64/kernel/crash.c35
-rw-r--r--arch/x86_64/kernel/e820.c40
-rw-r--r--arch/x86_64/kernel/early_printk.c13
-rw-r--r--arch/x86_64/kernel/entry.S22
-rw-r--r--arch/x86_64/kernel/genapic.c33
-rw-r--r--arch/x86_64/kernel/genapic_flat.c90
-rw-r--r--arch/x86_64/kernel/head.S34
-rw-r--r--arch/x86_64/kernel/head64.c2
-rw-r--r--arch/x86_64/kernel/i387.c2
-rw-r--r--arch/x86_64/kernel/i8259.c27
-rw-r--r--arch/x86_64/kernel/io_apic.c37
-rw-r--r--arch/x86_64/kernel/irq.c48
-rw-r--r--arch/x86_64/kernel/kprobes.c296
-rw-r--r--arch/x86_64/kernel/machine_kexec.c231
-rw-r--r--arch/x86_64/kernel/mce.c115
-rw-r--r--arch/x86_64/kernel/mce_intel.c4
-rw-r--r--arch/x86_64/kernel/mpparse.c55
-rw-r--r--arch/x86_64/kernel/nmi.c4
-rw-r--r--arch/x86_64/kernel/process.c76
-rw-r--r--arch/x86_64/kernel/reboot.c87
-rw-r--r--arch/x86_64/kernel/relocate_kernel.S143
-rw-r--r--arch/x86_64/kernel/setup.c89
-rw-r--r--arch/x86_64/kernel/setup64.c24
-rw-r--r--arch/x86_64/kernel/signal.c35
-rw-r--r--arch/x86_64/kernel/smp.c76
-rw-r--r--arch/x86_64/kernel/smpboot.c382
-rw-r--r--arch/x86_64/kernel/suspend.c19
-rw-r--r--arch/x86_64/kernel/time.c42
-rw-r--r--arch/x86_64/kernel/traps.c27
-rw-r--r--arch/x86_64/kernel/vmlinux.lds.S132
34 files changed, 1747 insertions, 526 deletions
diff --git a/arch/x86_64/kernel/Makefile b/arch/x86_64/kernel/Makefile
index 5ca4a4598fda..48f9e2c19cd6 100644
--- a/arch/x86_64/kernel/Makefile
+++ b/arch/x86_64/kernel/Makefile
@@ -20,6 +20,7 @@ obj-$(CONFIG_SMP) += smp.o smpboot.o trampoline.o
obj-$(CONFIG_X86_LOCAL_APIC) += apic.o nmi.o
obj-$(CONFIG_X86_IO_APIC) += io_apic.o mpparse.o \
genapic.o genapic_cluster.o genapic_flat.o
+obj-$(CONFIG_KEXEC) += machine_kexec.o relocate_kernel.o crash.o
obj-$(CONFIG_PM) += suspend.o
obj-$(CONFIG_SOFTWARE_SUSPEND) += suspend_asm.o
obj-$(CONFIG_CPU_FREQ) += cpufreq/
diff --git a/arch/x86_64/kernel/acpi/wakeup.S b/arch/x86_64/kernel/acpi/wakeup.S
index a4c630034cd4..185faa911db5 100644
--- a/arch/x86_64/kernel/acpi/wakeup.S
+++ b/arch/x86_64/kernel/acpi/wakeup.S
@@ -67,7 +67,7 @@ wakeup_code:
shll $4, %eax
addl $(gdta - wakeup_code), %eax
movl %eax, gdt_48a +2 - wakeup_code
- lgdt %ds:gdt_48a - wakeup_code # load gdt with whatever is
+ lgdtl %ds:gdt_48a - wakeup_code # load gdt with whatever is
# appropriate
movl $1, %eax # protected mode (PE) bit
diff --git a/arch/x86_64/kernel/aperture.c b/arch/x86_64/kernel/aperture.c
index 504e63474993..c9a6b812e926 100644
--- a/arch/x86_64/kernel/aperture.c
+++ b/arch/x86_64/kernel/aperture.c
@@ -40,11 +40,7 @@ int fix_aperture __initdata = 1;
static u32 __init allocate_aperture(void)
{
-#ifdef CONFIG_DISCONTIGMEM
pg_data_t *nd0 = NODE_DATA(0);
-#else
- pg_data_t *nd0 = &contig_page_data;
-#endif
u32 aper_size;
void *p;
diff --git a/arch/x86_64/kernel/apic.c b/arch/x86_64/kernel/apic.c
index f8e6cc4fecd4..375d369570ca 100644
--- a/arch/x86_64/kernel/apic.c
+++ b/arch/x86_64/kernel/apic.c
@@ -133,7 +133,7 @@ void __init connect_bsp_APIC(void)
}
}
-void disconnect_bsp_APIC(void)
+void disconnect_bsp_APIC(int virt_wire_setup)
{
if (pic_mode) {
/*
@@ -146,6 +146,42 @@ void disconnect_bsp_APIC(void)
outb(0x70, 0x22);
outb(0x00, 0x23);
}
+ else {
+ /* Go back to Virtual Wire compatibility mode */
+ unsigned long value;
+
+ /* For the spurious interrupt use vector F, and enable it */
+ value = apic_read(APIC_SPIV);
+ value &= ~APIC_VECTOR_MASK;
+ value |= APIC_SPIV_APIC_ENABLED;
+ value |= 0xf;
+ apic_write_around(APIC_SPIV, value);
+
+ if (!virt_wire_setup) {
+ /* For LVT0 make it edge triggered, active high, external and enabled */
+ value = apic_read(APIC_LVT0);
+ value &= ~(APIC_MODE_MASK | APIC_SEND_PENDING |
+ APIC_INPUT_POLARITY | APIC_LVT_REMOTE_IRR |
+ APIC_LVT_LEVEL_TRIGGER | APIC_LVT_MASKED );
+ value |= APIC_LVT_REMOTE_IRR | APIC_SEND_PENDING;
+ value = SET_APIC_DELIVERY_MODE(value, APIC_MODE_EXTINT);
+ apic_write_around(APIC_LVT0, value);
+ }
+ else {
+ /* Disable LVT0 */
+ apic_write_around(APIC_LVT0, APIC_LVT_MASKED);
+ }
+
+ /* For LVT1 make it edge triggered, active high, nmi and enabled */
+ value = apic_read(APIC_LVT1);
+ value &= ~(
+ APIC_MODE_MASK | APIC_SEND_PENDING |
+ APIC_INPUT_POLARITY | APIC_LVT_REMOTE_IRR |
+ APIC_LVT_LEVEL_TRIGGER | APIC_LVT_MASKED);
+ value |= APIC_LVT_REMOTE_IRR | APIC_SEND_PENDING;
+ value = SET_APIC_DELIVERY_MODE(value, APIC_MODE_NMI);
+ apic_write_around(APIC_LVT1, value);
+ }
}
void disable_local_APIC(void)
@@ -285,7 +321,7 @@ void __init init_bsp_APIC(void)
apic_write_around(APIC_LVT1, value);
}
-void __init setup_local_APIC (void)
+void __cpuinit setup_local_APIC (void)
{
unsigned int value, ver, maxlvt;
@@ -534,7 +570,7 @@ static struct sys_device device_lapic = {
.cls = &lapic_sysclass,
};
-static void __init apic_pm_activate(void)
+static void __cpuinit apic_pm_activate(void)
{
apic_pm_state.active = 1;
}
@@ -774,14 +810,14 @@ void __init setup_boot_APIC_clock (void)
local_irq_enable();
}
-void __init setup_secondary_APIC_clock(void)
+void __cpuinit setup_secondary_APIC_clock(void)
{
local_irq_disable(); /* FIXME: Do we need this? --RR */
setup_APIC_timer(calibration_result);
local_irq_enable();
}
-void __init disable_APIC_timer(void)
+void __cpuinit disable_APIC_timer(void)
{
if (using_apic_timer) {
unsigned long v;
diff --git a/arch/x86_64/kernel/crash.c b/arch/x86_64/kernel/crash.c
new file mode 100644
index 000000000000..d7fa4248501c
--- /dev/null
+++ b/arch/x86_64/kernel/crash.c
@@ -0,0 +1,35 @@
+/*
+ * Architecture specific (x86_64) functions for kexec based crash dumps.
+ *
+ * Created by: Hariprasad Nellitheertha (hari@in.ibm.com)
+ *
+ * Copyright (C) IBM Corporation, 2004. All rights reserved.
+ *
+ */
+
+#include <linux/init.h>
+#include <linux/types.h>
+#include <linux/kernel.h>
+#include <linux/smp.h>
+#include <linux/irq.h>
+#include <linux/reboot.h>
+#include <linux/kexec.h>
+
+#include <asm/processor.h>
+#include <asm/hardirq.h>
+#include <asm/nmi.h>
+#include <asm/hw_irq.h>
+
+note_buf_t crash_notes[NR_CPUS];
+
+void machine_crash_shutdown(struct pt_regs *regs)
+{
+ /* This function is only called after the system
+ * has paniced or is otherwise in a critical state.
+ * The minimum amount of code to allow a kexec'd kernel
+ * to run successfully needs to happen here.
+ *
+ * In practice this means shooting down the other cpus in
+ * an SMP system.
+ */
+}
diff --git a/arch/x86_64/kernel/e820.c b/arch/x86_64/kernel/e820.c
index 7c154dfff64a..b548dea4e5b9 100644
--- a/arch/x86_64/kernel/e820.c
+++ b/arch/x86_64/kernel/e820.c
@@ -16,6 +16,7 @@
#include <linux/bootmem.h>
#include <linux/ioport.h>
#include <linux/string.h>
+#include <linux/kexec.h>
#include <asm/page.h>
#include <asm/e820.h>
#include <asm/proto.h>
@@ -184,6 +185,40 @@ unsigned long __init e820_end_of_ram(void)
}
/*
+ * Compute how much memory is missing in a range.
+ * Unlike the other functions in this file the arguments are in page numbers.
+ */
+unsigned long __init
+e820_hole_size(unsigned long start_pfn, unsigned long end_pfn)
+{
+ unsigned long ram = 0;
+ unsigned long start = start_pfn << PAGE_SHIFT;
+ unsigned long end = end_pfn << PAGE_SHIFT;
+ int i;
+ for (i = 0; i < e820.nr_map; i++) {
+ struct e820entry *ei = &e820.map[i];
+ unsigned long last, addr;
+
+ if (ei->type != E820_RAM ||
+ ei->addr+ei->size <= start ||
+ ei->addr >= end)
+ continue;
+
+ addr = round_up(ei->addr, PAGE_SIZE);
+ if (addr < start)
+ addr = start;
+
+ last = round_down(ei->addr + ei->size, PAGE_SIZE);
+ if (last >= end)
+ last = end;
+
+ if (last > addr)
+ ram += last - addr;
+ }
+ return ((end - start) - ram) >> PAGE_SHIFT;
+}
+
+/*
* Mark e820 reserved areas as busy for the resource manager.
*/
void __init e820_reserve_resources(void)
@@ -191,8 +226,6 @@ void __init e820_reserve_resources(void)
int i;
for (i = 0; i < e820.nr_map; i++) {
struct resource *res;
- if (e820.map[i].addr + e820.map[i].size > 0x100000000ULL)
- continue;
res = alloc_bootmem_low(sizeof(struct resource));
switch (e820.map[i].type) {
case E820_RAM: res->name = "System RAM"; break;
@@ -212,6 +245,9 @@ void __init e820_reserve_resources(void)
*/
request_resource(res, &code_resource);
request_resource(res, &data_resource);
+#ifdef CONFIG_KEXEC
+ request_resource(res, &crashk_res);
+#endif
}
}
}
diff --git a/arch/x86_64/kernel/early_printk.c b/arch/x86_64/kernel/early_printk.c
index e3a19e8ebbf8..9631c747c5e3 100644
--- a/arch/x86_64/kernel/early_printk.c
+++ b/arch/x86_64/kernel/early_printk.c
@@ -2,20 +2,24 @@
#include <linux/kernel.h>
#include <linux/init.h>
#include <linux/string.h>
+#include <linux/tty.h>
#include <asm/io.h>
#include <asm/processor.h>
/* Simple VGA output */
#ifdef __i386__
+#include <asm/setup.h>
#define VGABASE (__ISA_IO_base + 0xb8000)
#else
+#include <asm/bootsetup.h>
#define VGABASE ((void __iomem *)0xffffffff800b8000UL)
#endif
-#define MAX_YPOS 25
-#define MAX_XPOS 80
+#define MAX_YPOS max_ypos
+#define MAX_XPOS max_xpos
+static int max_ypos = 25, max_xpos = 80;
static int current_ypos = 1, current_xpos = 0;
static void early_vga_write(struct console *con, const char *str, unsigned n)
@@ -196,7 +200,10 @@ int __init setup_early_printk(char *opt)
} else if (!strncmp(buf, "ttyS", 4)) {
early_serial_init(buf);
early_console = &early_serial_console;
- } else if (!strncmp(buf, "vga", 3)) {
+ } else if (!strncmp(buf, "vga", 3)
+ && SCREEN_INFO.orig_video_isVGA == 1) {
+ max_xpos = SCREEN_INFO.orig_video_cols;
+ max_ypos = SCREEN_INFO.orig_video_lines;
early_console = &early_vga_console;
}
early_console_initialized = 1;
diff --git a/arch/x86_64/kernel/entry.S b/arch/x86_64/kernel/entry.S
index 28817490fdc6..096d470e280f 100644
--- a/arch/x86_64/kernel/entry.S
+++ b/arch/x86_64/kernel/entry.S
@@ -76,7 +76,7 @@
.macro FAKE_STACK_FRAME child_rip
/* push in order ss, rsp, eflags, cs, rip */
- xorq %rax, %rax
+ xorl %eax, %eax
pushq %rax /* ss */
CFI_ADJUST_CFA_OFFSET 8
pushq %rax /* rsp */
@@ -423,7 +423,7 @@ ENTRY(stub_rt_sigreturn)
testl $3,CS(%rdi)
je 1f
swapgs
-1: addl $1,%gs:pda_irqcount # RED-PEN should check preempt count
+1: incl %gs:pda_irqcount # RED-PEN should check preempt count
movq %gs:pda_irqstackptr,%rax
cmoveq %rax,%rsp
pushq %rdi # save old stack
@@ -436,7 +436,7 @@ ENTRY(common_interrupt)
ret_from_intr:
popq %rdi
cli
- subl $1,%gs:pda_irqcount
+ decl %gs:pda_irqcount
#ifdef CONFIG_DEBUG_INFO
movq RBP(%rdi),%rbp
#endif
@@ -494,7 +494,7 @@ retint_signal:
sti
SAVE_REST
movq $-1,ORIG_RAX(%rsp)
- xorq %rsi,%rsi # oldset
+ xorl %esi,%esi # oldset
movq %rsp,%rdi # &pt_regs
call do_notify_resume
RESTORE_REST
@@ -752,7 +752,7 @@ child_rip:
movq %rsi, %rdi
call *%rax
# exit
- xorq %rdi, %rdi
+ xorl %edi, %edi
call do_exit
/*
@@ -918,3 +918,15 @@ ENTRY(machine_check)
ENTRY(call_debug)
zeroentry do_call_debug
+ENTRY(call_softirq)
+ movq %gs:pda_irqstackptr,%rax
+ pushq %r15
+ movq %rsp,%r15
+ incl %gs:pda_irqcount
+ cmove %rax,%rsp
+ call __do_softirq
+ movq %r15,%rsp
+ decl %gs:pda_irqcount
+ popq %r15
+ ret
+
diff --git a/arch/x86_64/kernel/genapic.c b/arch/x86_64/kernel/genapic.c
index 69b9c25a8fc4..30c843a5efdd 100644
--- a/arch/x86_64/kernel/genapic.c
+++ b/arch/x86_64/kernel/genapic.c
@@ -31,6 +31,7 @@ u8 x86_cpu_to_log_apicid[NR_CPUS] = { [0 ... NR_CPUS-1] = BAD_APICID };
extern struct genapic apic_cluster;
extern struct genapic apic_flat;
+extern struct genapic apic_physflat;
struct genapic *genapic = &apic_flat;
@@ -44,12 +45,7 @@ void __init clustered_apic_check(void)
u8 clusters, max_cluster;
u8 id;
u8 cluster_cnt[NUM_APIC_CLUSTERS];
-
- if (boot_cpu_data.x86_vendor == X86_VENDOR_AMD) {
- /* AMD always uses flat mode right now */
- genapic = &apic_flat;
- goto print;
- }
+ int num_cpus = 0;
#if defined(CONFIG_ACPI_BUS)
/*
@@ -64,15 +60,34 @@ void __init clustered_apic_check(void)
#endif
memset(cluster_cnt, 0, sizeof(cluster_cnt));
-
for (i = 0; i < NR_CPUS; i++) {
id = bios_cpu_apicid[i];
- if (id != BAD_APICID)
- cluster_cnt[APIC_CLUSTERID(id)]++;
+ if (id == BAD_APICID)
+ continue;
+ num_cpus++;
+ cluster_cnt[APIC_CLUSTERID(id)]++;
}
+ /* Don't use clustered mode on AMD platforms. */
+ if (boot_cpu_data.x86_vendor == X86_VENDOR_AMD) {
+ genapic = &apic_physflat;
+#ifndef CONFIG_CPU_HOTPLUG
+ /* In the CPU hotplug case we cannot use broadcast mode
+ because that opens a race when a CPU is removed.
+ Stay at physflat mode in this case.
+ It is bad to do this unconditionally though. Once
+ we have ACPI platform support for CPU hotplug
+ we should detect hotplug capablity from ACPI tables and
+ only do this when really needed. -AK */
+ if (num_cpus <= 8)
+ genapic = &apic_flat;
+#endif
+ goto print;
+ }
+
clusters = 0;
max_cluster = 0;
+
for (i = 0; i < NUM_APIC_CLUSTERS; i++) {
if (cluster_cnt[i] > 0) {
++clusters;
diff --git a/arch/x86_64/kernel/genapic_flat.c b/arch/x86_64/kernel/genapic_flat.c
index b4cbbad04226..adc96282a9e2 100644
--- a/arch/x86_64/kernel/genapic_flat.c
+++ b/arch/x86_64/kernel/genapic_flat.c
@@ -2,7 +2,7 @@
* Copyright 2004 James Cleverdon, IBM.
* Subject to the GNU Public License, v.2
*
- * Flat APIC subarch code. Maximum 8 CPUs, logical delivery.
+ * Flat APIC subarch code.
*
* Hacked for x86-64 by James Cleverdon from i386 architecture code by
* Martin Bligh, Andi Kleen, James Bottomley, John Stultz, and
@@ -18,7 +18,6 @@
#include <asm/smp.h>
#include <asm/ipi.h>
-
static cpumask_t flat_target_cpus(void)
{
return cpu_online_map;
@@ -45,22 +44,6 @@ static void flat_init_apic_ldr(void)
apic_write_around(APIC_LDR, val);
}
-static void flat_send_IPI_allbutself(int vector)
-{
- /*
- * if there are no other CPUs in the system then
- * we get an APIC send error if we try to broadcast.
- * thus we have to avoid sending IPIs in this case.
- */
- if (num_online_cpus() > 1)
- __send_IPI_shortcut(APIC_DEST_ALLBUT, vector, APIC_DEST_LOGICAL);
-}
-
-static void flat_send_IPI_all(int vector)
-{
- __send_IPI_shortcut(APIC_DEST_ALLINC, vector, APIC_DEST_LOGICAL);
-}
-
static void flat_send_IPI_mask(cpumask_t cpumask, int vector)
{
unsigned long mask = cpus_addr(cpumask)[0];
@@ -93,6 +76,17 @@ static void flat_send_IPI_mask(cpumask_t cpumask, int vector)
local_irq_restore(flags);
}
+static void flat_send_IPI_allbutself(int vector)
+{
+ if (((num_online_cpus()) - 1) >= 1)
+ __send_IPI_shortcut(APIC_DEST_ALLBUT, vector,APIC_DEST_LOGICAL);
+}
+
+static void flat_send_IPI_all(int vector)
+{
+ __send_IPI_shortcut(APIC_DEST_ALLINC, vector, APIC_DEST_LOGICAL);
+}
+
static int flat_apic_id_registered(void)
{
return physid_isset(GET_APIC_ID(apic_read(APIC_ID)), phys_cpu_present_map);
@@ -125,3 +119,63 @@ struct genapic apic_flat = {
.cpu_mask_to_apicid = flat_cpu_mask_to_apicid,
.phys_pkg_id = phys_pkg_id,
};
+
+/*
+ * Physflat mode is used when there are more than 8 CPUs on a AMD system.
+ * We cannot use logical delivery in this case because the mask
+ * overflows, so use physical mode.
+ */
+
+static cpumask_t physflat_target_cpus(void)
+{
+ return cpumask_of_cpu(0);
+}
+
+static void physflat_send_IPI_mask(cpumask_t cpumask, int vector)
+{
+ send_IPI_mask_sequence(cpumask, vector);
+}
+
+static void physflat_send_IPI_allbutself(int vector)
+{
+ cpumask_t allbutme = cpu_online_map;
+ int me = get_cpu();
+ cpu_clear(me, allbutme);
+ physflat_send_IPI_mask(allbutme, vector);
+ put_cpu();
+}
+
+static void physflat_send_IPI_all(int vector)
+{
+ physflat_send_IPI_mask(cpu_online_map, vector);
+}
+
+static unsigned int physflat_cpu_mask_to_apicid(cpumask_t cpumask)
+{
+ int cpu;
+
+ /*
+ * We're using fixed IRQ delivery, can only return one phys APIC ID.
+ * May as well be the first.
+ */
+ cpu = first_cpu(cpumask);
+ if ((unsigned)cpu < NR_CPUS)
+ return x86_cpu_to_apicid[cpu];
+ else
+ return BAD_APICID;
+}
+
+struct genapic apic_physflat = {
+ .name = "physical flat",
+ .int_delivery_mode = dest_LowestPrio,
+ .int_dest_mode = (APIC_DEST_PHYSICAL != 0),
+ .int_delivery_dest = APIC_DEST_PHYSICAL | APIC_DM_LOWEST,
+ .target_cpus = physflat_target_cpus,
+ .apic_id_registered = flat_apic_id_registered,
+ .init_apic_ldr = flat_init_apic_ldr,/*not needed, but shouldn't hurt*/
+ .send_IPI_all = physflat_send_IPI_all,
+ .send_IPI_allbutself = physflat_send_IPI_allbutself,
+ .send_IPI_mask = physflat_send_IPI_mask,
+ .cpu_mask_to_apicid = physflat_cpu_mask_to_apicid,
+ .phys_pkg_id = phys_pkg_id,
+};
diff --git a/arch/x86_64/kernel/head.S b/arch/x86_64/kernel/head.S
index 9bd2e7a4b81e..98ff5eb32b9a 100644
--- a/arch/x86_64/kernel/head.S
+++ b/arch/x86_64/kernel/head.S
@@ -137,14 +137,14 @@ startup_64:
wrmsr
/* Setup cr0 */
- xorq %rax, %rax
- btsq $31, %rax /* Enable paging */
- btsq $0, %rax /* Enable protected mode */
- btsq $1, %rax /* Enable MP */
- btsq $4, %rax /* Enable ET */
- btsq $5, %rax /* Enable NE */
- btsq $16, %rax /* Enable WP */
- btsq $18, %rax /* Enable AM */
+#define CR0_PM 1 /* protected mode */
+#define CR0_MP (1<<1)
+#define CR0_ET (1<<4)
+#define CR0_NE (1<<5)
+#define CR0_WP (1<<16)
+#define CR0_AM (1<<18)
+#define CR0_PAGING (1<<31)
+ movl $CR0_PM|CR0_MP|CR0_ET|CR0_NE|CR0_WP|CR0_AM|CR0_PAGING,%eax
/* Make changes effective */
movq %rax, %cr0
@@ -248,23 +248,23 @@ ENTRY(_stext)
*/
.org 0x1000
ENTRY(init_level4_pgt)
- .quad 0x0000000000102007 /* -> level3_ident_pgt */
+ .quad 0x0000000000002007 + __PHYSICAL_START /* -> level3_ident_pgt */
.fill 255,8,0
- .quad 0x000000000010a007
+ .quad 0x000000000000a007 + __PHYSICAL_START
.fill 254,8,0
/* (2^48-(2*1024*1024*1024))/(2^39) = 511 */
- .quad 0x0000000000103007 /* -> level3_kernel_pgt */
+ .quad 0x0000000000003007 + __PHYSICAL_START /* -> level3_kernel_pgt */
.org 0x2000
ENTRY(level3_ident_pgt)
- .quad 0x0000000000104007
+ .quad 0x0000000000004007 + __PHYSICAL_START
.fill 511,8,0
.org 0x3000
ENTRY(level3_kernel_pgt)
.fill 510,8,0
/* (2^48-(2*1024*1024*1024)-((2^39)*511))/(2^30) = 510 */
- .quad 0x0000000000105007 /* -> level2_kernel_pgt */
+ .quad 0x0000000000005007 + __PHYSICAL_START /* -> level2_kernel_pgt */
.fill 1,8,0
.org 0x4000
@@ -337,17 +337,17 @@ ENTRY(empty_bad_pmd_table)
.org 0xa000
ENTRY(level3_physmem_pgt)
- .quad 0x0000000000105007 /* -> level2_kernel_pgt (so that __va works even before pagetable_init) */
+ .quad 0x0000000000005007 + __PHYSICAL_START /* -> level2_kernel_pgt (so that __va works even before pagetable_init) */
.org 0xb000
#ifdef CONFIG_ACPI_SLEEP
ENTRY(wakeup_level4_pgt)
- .quad 0x0000000000102007 /* -> level3_ident_pgt */
+ .quad 0x0000000000002007 + __PHYSICAL_START /* -> level3_ident_pgt */
.fill 255,8,0
- .quad 0x000000000010a007
+ .quad 0x000000000000a007 + __PHYSICAL_START
.fill 254,8,0
/* (2^48-(2*1024*1024*1024))/(2^39) = 511 */
- .quad 0x0000000000103007 /* -> level3_kernel_pgt */
+ .quad 0x0000000000003007 + __PHYSICAL_START /* -> level3_kernel_pgt */
#endif
.data
diff --git a/arch/x86_64/kernel/head64.c b/arch/x86_64/kernel/head64.c
index 0f8c78dcd38c..cf6ab147a2a5 100644
--- a/arch/x86_64/kernel/head64.c
+++ b/arch/x86_64/kernel/head64.c
@@ -94,7 +94,7 @@ void __init x86_64_start_kernel(char * real_mode_data)
s = strstr(saved_command_line, "earlyprintk=");
if (s != NULL)
setup_early_printk(s);
-#ifdef CONFIG_DISCONTIGMEM
+#ifdef CONFIG_NUMA
s = strstr(saved_command_line, "numa=");
if (s != NULL)
numa_setup(s+5);
diff --git a/arch/x86_64/kernel/i387.c b/arch/x86_64/kernel/i387.c
index ba139cac57ce..d9b22b633e39 100644
--- a/arch/x86_64/kernel/i387.c
+++ b/arch/x86_64/kernel/i387.c
@@ -42,7 +42,7 @@ void mxcsr_feature_mask_init(void)
* Called at bootup to set up the initial FPU state that is later cloned
* into all processes.
*/
-void __init fpu_init(void)
+void __cpuinit fpu_init(void)
{
unsigned long oldcr0 = read_cr0();
extern void __bad_fxsave_alignment(void);
diff --git a/arch/x86_64/kernel/i8259.c b/arch/x86_64/kernel/i8259.c
index 7873d9ba8814..a89169095129 100644
--- a/arch/x86_64/kernel/i8259.c
+++ b/arch/x86_64/kernel/i8259.c
@@ -157,14 +157,13 @@ static unsigned int startup_8259A_irq(unsigned int irq)
}
static struct hw_interrupt_type i8259A_irq_type = {
- "XT-PIC",
- startup_8259A_irq,
- shutdown_8259A_irq,
- enable_8259A_irq,
- disable_8259A_irq,
- mask_and_ack_8259A,
- end_8259A_irq,
- NULL
+ .typename = "XT-PIC",
+ .startup = startup_8259A_irq,
+ .shutdown = shutdown_8259A_irq,
+ .enable = enable_8259A_irq,
+ .disable = disable_8259A_irq,
+ .ack = mask_and_ack_8259A,
+ .end = end_8259A_irq,
};
/*
@@ -415,10 +414,22 @@ static int i8259A_suspend(struct sys_device *dev, pm_message_t state)
return 0;
}
+static int i8259A_shutdown(struct sys_device *dev)
+{
+ /* Put the i8259A into a quiescent state that
+ * the kernel initialization code can get it
+ * out of.
+ */
+ outb(0xff, 0x21); /* mask all of 8259A-1 */
+ outb(0xff, 0xA1); /* mask all of 8259A-1 */
+ return 0;
+}
+
static struct sysdev_class i8259_sysdev_class = {
set_kset_name("i8259"),
.suspend = i8259A_suspend,
.resume = i8259A_resume,
+ .shutdown = i8259A_shutdown,
};
static struct sys_device device_i8259A = {
diff --git a/arch/x86_64/kernel/io_apic.c b/arch/x86_64/kernel/io_apic.c
index afd87e64d0a8..d206d7e49cf5 100644
--- a/arch/x86_64/kernel/io_apic.c
+++ b/arch/x86_64/kernel/io_apic.c
@@ -330,7 +330,7 @@ static int find_irq_entry(int apic, int pin, int type)
/*
* Find the pin to which IRQ[irq] (ISA) is connected
*/
-static int __init find_isa_irq_pin(int irq, int type)
+static int find_isa_irq_pin(int irq, int type)
{
int i;
@@ -1064,7 +1064,6 @@ void print_all_local_APICs (void)
void __apicdebuginit print_PIC(void)
{
- extern spinlock_t i8259A_lock;
unsigned int v;
unsigned long flags;
@@ -1132,12 +1131,44 @@ static void __init enable_IO_APIC(void)
*/
void disable_IO_APIC(void)
{
+ int pin;
/*
* Clear the IO-APIC before rebooting:
*/
clear_IO_APIC();
- disconnect_bsp_APIC();
+ /*
+ * If the i82559 is routed through an IOAPIC
+ * Put that IOAPIC in virtual wire mode
+ * so legacy interrups can be delivered.
+ */
+ pin = find_isa_irq_pin(0, mp_ExtINT);
+ if (pin != -1) {
+ struct IO_APIC_route_entry entry;
+ unsigned long flags;
+
+ memset(&entry, 0, sizeof(entry));
+ entry.mask = 0; /* Enabled */
+ entry.trigger = 0; /* Edge */
+ entry.irr = 0;
+ entry.polarity = 0; /* High */
+ entry.delivery_status = 0;
+ entry.dest_mode = 0; /* Physical */
+ entry.delivery_mode = 7; /* ExtInt */
+ entry.vector = 0;
+ entry.dest.physical.physical_dest = 0;
+
+
+ /*
+ * Add it to the IO-APIC irq-routing table:
+ */
+ spin_lock_irqsave(&ioapic_lock, flags);
+ io_apic_write(0, 0x11+2*pin, *(((int *)&entry)+1));
+ io_apic_write(0, 0x10+2*pin, *(((int *)&entry)+0));
+ spin_unlock_irqrestore(&ioapic_lock, flags);
+ }
+
+ disconnect_bsp_APIC(pin != -1);
}
/*
diff --git a/arch/x86_64/kernel/irq.c b/arch/x86_64/kernel/irq.c
index 62b112e4deb4..849a20aec7ca 100644
--- a/arch/x86_64/kernel/irq.c
+++ b/arch/x86_64/kernel/irq.c
@@ -14,6 +14,7 @@
#include <linux/interrupt.h>
#include <linux/seq_file.h>
#include <linux/module.h>
+#include <linux/delay.h>
#include <asm/uaccess.h>
#include <asm/io_apic.h>
@@ -106,3 +107,50 @@ asmlinkage unsigned int do_IRQ(struct pt_regs *regs)
return 1;
}
+#ifdef CONFIG_HOTPLUG_CPU
+void fixup_irqs(cpumask_t map)
+{
+ unsigned int irq;
+ static int warned;
+
+ for (irq = 0; irq < NR_IRQS; irq++) {
+ cpumask_t mask;
+ if (irq == 2)
+ continue;
+
+ cpus_and(mask, irq_affinity[irq], map);
+ if (any_online_cpu(mask) == NR_CPUS) {
+ printk("Breaking affinity for irq %i\n", irq);
+ mask = map;
+ }
+ if (irq_desc[irq].handler->set_affinity)
+ irq_desc[irq].handler->set_affinity(irq, mask);
+ else if (irq_desc[irq].action && !(warned++))
+ printk("Cannot set affinity for irq %i\n", irq);
+ }
+
+ /* That doesn't seem sufficient. Give it 1ms. */
+ local_irq_enable();
+ mdelay(1);
+ local_irq_disable();
+}
+#endif
+
+extern void call_softirq(void);
+
+asmlinkage void do_softirq(void)
+{
+ __u32 pending;
+ unsigned long flags;
+
+ if (in_interrupt())
+ return;
+
+ local_irq_save(flags);
+ pending = local_softirq_pending();
+ /* Switch to interrupt stack */
+ if (pending)
+ call_softirq();
+ local_irq_restore(flags);
+}
+EXPORT_SYMBOL(do_softirq);
diff --git a/arch/x86_64/kernel/kprobes.c b/arch/x86_64/kernel/kprobes.c
index f77f8a0ff187..5c6dc7051482 100644
--- a/arch/x86_64/kernel/kprobes.c
+++ b/arch/x86_64/kernel/kprobes.c
@@ -27,6 +27,8 @@
* <prasanna@in.ibm.com> adapted for x86_64
* 2005-Mar Roland McGrath <roland@redhat.com>
* Fixed to handle %rip-relative addressing mode correctly.
+ * 2005-May Rusty Lynch <rusty.lynch@intel.com>
+ * Added function return probes functionality
*/
#include <linux/config.h>
@@ -36,23 +38,19 @@
#include <linux/string.h>
#include <linux/slab.h>
#include <linux/preempt.h>
-#include <linux/moduleloader.h>
+#include <asm/cacheflush.h>
#include <asm/pgtable.h>
#include <asm/kdebug.h>
static DECLARE_MUTEX(kprobe_mutex);
-/* kprobe_status settings */
-#define KPROBE_HIT_ACTIVE 0x00000001
-#define KPROBE_HIT_SS 0x00000002
-
static struct kprobe *current_kprobe;
static unsigned long kprobe_status, kprobe_old_rflags, kprobe_saved_rflags;
+static struct kprobe *kprobe_prev;
+static unsigned long kprobe_status_prev, kprobe_old_rflags_prev, kprobe_saved_rflags_prev;
static struct pt_regs jprobe_saved_regs;
static long *jprobe_saved_rsp;
-static kprobe_opcode_t *get_insn_slot(void);
-static void free_insn_slot(kprobe_opcode_t *slot);
void jprobe_return_end(void);
/* copy of the kernel stack at the probe fire time */
@@ -214,6 +212,21 @@ void arch_copy_kprobe(struct kprobe *p)
BUG_ON((s64) (s32) disp != disp); /* Sanity check. */
*ripdisp = disp;
}
+ p->opcode = *p->addr;
+}
+
+void arch_arm_kprobe(struct kprobe *p)
+{
+ *p->addr = BREAKPOINT_INSTRUCTION;
+ flush_icache_range((unsigned long) p->addr,
+ (unsigned long) p->addr + sizeof(kprobe_opcode_t));
+}
+
+void arch_disarm_kprobe(struct kprobe *p)
+{
+ *p->addr = p->opcode;
+ flush_icache_range((unsigned long) p->addr,
+ (unsigned long) p->addr + sizeof(kprobe_opcode_t));
}
void arch_remove_kprobe(struct kprobe *p)
@@ -223,10 +236,29 @@ void arch_remove_kprobe(struct kprobe *p)
down(&kprobe_mutex);
}
-static inline void disarm_kprobe(struct kprobe *p, struct pt_regs *regs)
+static inline void save_previous_kprobe(void)
{
- *p->addr = p->opcode;
- regs->rip = (unsigned long)p->addr;
+ kprobe_prev = current_kprobe;
+ kprobe_status_prev = kprobe_status;
+ kprobe_old_rflags_prev = kprobe_old_rflags;
+ kprobe_saved_rflags_prev = kprobe_saved_rflags;
+}
+
+static inline void restore_previous_kprobe(void)
+{
+ current_kprobe = kprobe_prev;
+ kprobe_status = kprobe_status_prev;
+ kprobe_old_rflags = kprobe_old_rflags_prev;
+ kprobe_saved_rflags = kprobe_saved_rflags_prev;
+}
+
+static inline void set_current_kprobe(struct kprobe *p, struct pt_regs *regs)
+{
+ current_kprobe = p;
+ kprobe_saved_rflags = kprobe_old_rflags
+ = (regs->eflags & (TF_MASK | IF_MASK));
+ if (is_IF_modifier(p->ainsn.insn))
+ kprobe_saved_rflags &= ~IF_MASK;
}
static void prepare_singlestep(struct kprobe *p, struct pt_regs *regs)
@@ -240,6 +272,25 @@ static void prepare_singlestep(struct kprobe *p, struct pt_regs *regs)
regs->rip = (unsigned long)p->ainsn.insn;
}
+void arch_prepare_kretprobe(struct kretprobe *rp, struct pt_regs *regs)
+{
+ unsigned long *sara = (unsigned long *)regs->rsp;
+ struct kretprobe_instance *ri;
+
+ if ((ri = get_free_rp_inst(rp)) != NULL) {
+ ri->rp = rp;
+ ri->task = current;
+ ri->ret_addr = (kprobe_opcode_t *) *sara;
+
+ /* Replace the return addr with trampoline addr */
+ *sara = (unsigned long) &kretprobe_trampoline;
+
+ add_rp_inst(ri);
+ } else {
+ rp->nmissed++;
+ }
+}
+
/*
* Interrupts are disabled on entry as trap3 is an interrupt gate and they
* remain disabled thorough out this function.
@@ -264,9 +315,30 @@ int kprobe_handler(struct pt_regs *regs)
regs->eflags |= kprobe_saved_rflags;
unlock_kprobes();
goto no_kprobe;
+ } else if (kprobe_status == KPROBE_HIT_SSDONE) {
+ /* TODO: Provide re-entrancy from
+ * post_kprobes_handler() and avoid exception
+ * stack corruption while single-stepping on
+ * the instruction of the new probe.
+ */
+ arch_disarm_kprobe(p);
+ regs->rip = (unsigned long)p->addr;
+ ret = 1;
+ } else {
+ /* We have reentered the kprobe_handler(), since
+ * another probe was hit while within the
+ * handler. We here save the original kprobe
+ * variables and just single step on instruction
+ * of the new probe without calling any user
+ * handlers.
+ */
+ save_previous_kprobe();
+ set_current_kprobe(p, regs);
+ p->nmissed++;
+ prepare_singlestep(p, regs);
+ kprobe_status = KPROBE_REENTER;
+ return 1;
}
- disarm_kprobe(p, regs);
- ret = 1;
} else {
p = current_kprobe;
if (p->break_handler && p->break_handler(p, regs)) {
@@ -296,11 +368,7 @@ int kprobe_handler(struct pt_regs *regs)
}
kprobe_status = KPROBE_HIT_ACTIVE;
- current_kprobe = p;
- kprobe_saved_rflags = kprobe_old_rflags
- = (regs->eflags & (TF_MASK | IF_MASK));
- if (is_IF_modifier(p->ainsn.insn))
- kprobe_saved_rflags &= ~IF_MASK;
+ set_current_kprobe(p, regs);
if (p->pre_handler && p->pre_handler(p, regs))
/* handler has already set things up, so skip ss setup */
@@ -317,6 +385,78 @@ no_kprobe:
}
/*
+ * For function-return probes, init_kprobes() establishes a probepoint
+ * here. When a retprobed function returns, this probe is hit and
+ * trampoline_probe_handler() runs, calling the kretprobe's handler.
+ */
+ void kretprobe_trampoline_holder(void)
+ {
+ asm volatile ( ".global kretprobe_trampoline\n"
+ "kretprobe_trampoline: \n"
+ "nop\n");
+ }
+
+/*
+ * Called when we hit the probe point at kretprobe_trampoline
+ */
+int trampoline_probe_handler(struct kprobe *p, struct pt_regs *regs)
+{
+ struct kretprobe_instance *ri = NULL;
+ struct hlist_head *head;
+ struct hlist_node *node, *tmp;
+ unsigned long orig_ret_address = 0;
+ unsigned long trampoline_address =(unsigned long)&kretprobe_trampoline;
+
+ head = kretprobe_inst_table_head(current);
+
+ /*
+ * It is possible to have multiple instances associated with a given
+ * task either because an multiple functions in the call path
+ * have a return probe installed on them, and/or more then one return
+ * return probe was registered for a target function.
+ *
+ * We can handle this because:
+ * - instances are always inserted at the head of the list
+ * - when multiple return probes are registered for the same
+ * function, the first instance's ret_addr will point to the
+ * real return address, and all the rest will point to
+ * kretprobe_trampoline
+ */
+ hlist_for_each_entry_safe(ri, node, tmp, head, hlist) {
+ if (ri->task != current)
+ /* another task is sharing our hash bucket */
+ continue;
+
+ if (ri->rp && ri->rp->handler)
+ ri->rp->handler(ri, regs);
+
+ orig_ret_address = (unsigned long)ri->ret_addr;
+ recycle_rp_inst(ri);
+
+ if (orig_ret_address != trampoline_address)
+ /*
+ * This is the real return address. Any other
+ * instances associated with this task are for
+ * other calls deeper on the call stack
+ */
+ break;
+ }
+
+ BUG_ON(!orig_ret_address || (orig_ret_address == trampoline_address));
+ regs->rip = orig_ret_address;
+
+ unlock_kprobes();
+ preempt_enable_no_resched();
+
+ /*
+ * By returning a non-zero value, we are telling
+ * kprobe_handler() that we have handled unlocking
+ * and re-enabling preemption.
+ */
+ return 1;
+}
+
+/*
* Called after single-stepping. p->addr is the address of the
* instruction whose first byte has been replaced by the "int 3"
* instruction. To avoid the SMP problems that can occur when we
@@ -401,13 +541,22 @@ int post_kprobe_handler(struct pt_regs *regs)
if (!kprobe_running())
return 0;
- if (current_kprobe->post_handler)
+ if ((kprobe_status != KPROBE_REENTER) && current_kprobe->post_handler) {
+ kprobe_status = KPROBE_HIT_SSDONE;
current_kprobe->post_handler(current_kprobe, regs, 0);
+ }
resume_execution(current_kprobe, regs);
regs->eflags |= kprobe_saved_rflags;
- unlock_kprobes();
+ /* Restore the original saved kprobes variables and continue. */
+ if (kprobe_status == KPROBE_REENTER) {
+ restore_previous_kprobe();
+ goto out;
+ } else {
+ unlock_kprobes();
+ }
+out:
preempt_enable_no_resched();
/*
@@ -528,111 +677,12 @@ int longjmp_break_handler(struct kprobe *p, struct pt_regs *regs)
return 0;
}
-/*
- * kprobe->ainsn.insn points to the copy of the instruction to be single-stepped.
- * By default on x86_64, pages we get from kmalloc or vmalloc are not
- * executable. Single-stepping an instruction on such a page yields an
- * oops. So instead of storing the instruction copies in their respective
- * kprobe objects, we allocate a page, map it executable, and store all the
- * instruction copies there. (We can allocate additional pages if somebody
- * inserts a huge number of probes.) Each page can hold up to INSNS_PER_PAGE
- * instruction slots, each of which is MAX_INSN_SIZE*sizeof(kprobe_opcode_t)
- * bytes.
- */
-#define INSNS_PER_PAGE (PAGE_SIZE/(MAX_INSN_SIZE*sizeof(kprobe_opcode_t)))
-struct kprobe_insn_page {
- struct hlist_node hlist;
- kprobe_opcode_t *insns; /* page of instruction slots */
- char slot_used[INSNS_PER_PAGE];
- int nused;
+static struct kprobe trampoline_p = {
+ .addr = (kprobe_opcode_t *) &kretprobe_trampoline,
+ .pre_handler = trampoline_probe_handler
};
-static struct hlist_head kprobe_insn_pages;
-
-/**
- * get_insn_slot() - Find a slot on an executable page for an instruction.
- * We allocate an executable page if there's no room on existing ones.
- */
-static kprobe_opcode_t *get_insn_slot(void)
-{
- struct kprobe_insn_page *kip;
- struct hlist_node *pos;
-
- hlist_for_each(pos, &kprobe_insn_pages) {
- kip = hlist_entry(pos, struct kprobe_insn_page, hlist);
- if (kip->nused < INSNS_PER_PAGE) {
- int i;
- for (i = 0; i < INSNS_PER_PAGE; i++) {
- if (!kip->slot_used[i]) {
- kip->slot_used[i] = 1;
- kip->nused++;
- return kip->insns + (i*MAX_INSN_SIZE);
- }
- }
- /* Surprise! No unused slots. Fix kip->nused. */
- kip->nused = INSNS_PER_PAGE;
- }
- }
-
- /* All out of space. Need to allocate a new page. Use slot 0.*/
- kip = kmalloc(sizeof(struct kprobe_insn_page), GFP_KERNEL);
- if (!kip) {
- return NULL;
- }
-
- /*
- * For the %rip-relative displacement fixups to be doable, we
- * need our instruction copy to be within +/- 2GB of any data it
- * might access via %rip. That is, within 2GB of where the
- * kernel image and loaded module images reside. So we allocate
- * a page in the module loading area.
- */
- kip->insns = module_alloc(PAGE_SIZE);
- if (!kip->insns) {
- kfree(kip);
- return NULL;
- }
- INIT_HLIST_NODE(&kip->hlist);
- hlist_add_head(&kip->hlist, &kprobe_insn_pages);
- memset(kip->slot_used, 0, INSNS_PER_PAGE);
- kip->slot_used[0] = 1;
- kip->nused = 1;
- return kip->insns;
-}
-
-/**
- * free_insn_slot() - Free instruction slot obtained from get_insn_slot().
- */
-static void free_insn_slot(kprobe_opcode_t *slot)
+int __init arch_init_kprobes(void)
{
- struct kprobe_insn_page *kip;
- struct hlist_node *pos;
-
- hlist_for_each(pos, &kprobe_insn_pages) {
- kip = hlist_entry(pos, struct kprobe_insn_page, hlist);
- if (kip->insns <= slot
- && slot < kip->insns+(INSNS_PER_PAGE*MAX_INSN_SIZE)) {
- int i = (slot - kip->insns) / MAX_INSN_SIZE;
- kip->slot_used[i] = 0;
- kip->nused--;
- if (kip->nused == 0) {
- /*
- * Page is no longer in use. Free it unless
- * it's the last one. We keep the last one
- * so as not to have to set it up again the
- * next time somebody inserts a probe.
- */
- hlist_del(&kip->hlist);
- if (hlist_empty(&kprobe_insn_pages)) {
- INIT_HLIST_NODE(&kip->hlist);
- hlist_add_head(&kip->hlist,
- &kprobe_insn_pages);
- } else {
- module_free(NULL, kip->insns);
- kfree(kip);
- }
- }
- return;
- }
- }
+ return register_kprobe(&trampoline_p);
}
diff --git a/arch/x86_64/kernel/machine_kexec.c b/arch/x86_64/kernel/machine_kexec.c
new file mode 100644
index 000000000000..89fab51e20f4
--- /dev/null
+++ b/arch/x86_64/kernel/machine_kexec.c
@@ -0,0 +1,231 @@
+/*
+ * machine_kexec.c - handle transition of Linux booting another kernel
+ * Copyright (C) 2002-2005 Eric Biederman <ebiederm@xmission.com>
+ *
+ * This source code is licensed under the GNU General Public License,
+ * Version 2. See the file COPYING for more details.
+ */
+
+#include <linux/mm.h>
+#include <linux/kexec.h>
+#include <linux/string.h>
+#include <linux/reboot.h>
+#include <asm/pgtable.h>
+#include <asm/tlbflush.h>
+#include <asm/mmu_context.h>
+#include <asm/io.h>
+
+static void init_level2_page(pmd_t *level2p, unsigned long addr)
+{
+ unsigned long end_addr;
+
+ addr &= PAGE_MASK;
+ end_addr = addr + PUD_SIZE;
+ while (addr < end_addr) {
+ set_pmd(level2p++, __pmd(addr | __PAGE_KERNEL_LARGE_EXEC));
+ addr += PMD_SIZE;
+ }
+}
+
+static int init_level3_page(struct kimage *image, pud_t *level3p,
+ unsigned long addr, unsigned long last_addr)
+{
+ unsigned long end_addr;
+ int result;
+
+ result = 0;
+ addr &= PAGE_MASK;
+ end_addr = addr + PGDIR_SIZE;
+ while ((addr < last_addr) && (addr < end_addr)) {
+ struct page *page;
+ pmd_t *level2p;
+
+ page = kimage_alloc_control_pages(image, 0);
+ if (!page) {
+ result = -ENOMEM;
+ goto out;
+ }
+ level2p = (pmd_t *)page_address(page);
+ init_level2_page(level2p, addr);
+ set_pud(level3p++, __pud(__pa(level2p) | _KERNPG_TABLE));
+ addr += PUD_SIZE;
+ }
+ /* clear the unused entries */
+ while (addr < end_addr) {
+ pud_clear(level3p++);
+ addr += PUD_SIZE;
+ }
+out:
+ return result;
+}
+
+
+static int init_level4_page(struct kimage *image, pgd_t *level4p,
+ unsigned long addr, unsigned long last_addr)
+{
+ unsigned long end_addr;
+ int result;
+
+ result = 0;
+ addr &= PAGE_MASK;
+ end_addr = addr + (PTRS_PER_PGD * PGDIR_SIZE);
+ while ((addr < last_addr) && (addr < end_addr)) {
+ struct page *page;
+ pud_t *level3p;
+
+ page = kimage_alloc_control_pages(image, 0);
+ if (!page) {
+ result = -ENOMEM;
+ goto out;
+ }
+ level3p = (pud_t *)page_address(page);
+ result = init_level3_page(image, level3p, addr, last_addr);
+ if (result) {
+ goto out;
+ }
+ set_pgd(level4p++, __pgd(__pa(level3p) | _KERNPG_TABLE));
+ addr += PGDIR_SIZE;
+ }
+ /* clear the unused entries */
+ while (addr < end_addr) {
+ pgd_clear(level4p++);
+ addr += PGDIR_SIZE;
+ }
+out:
+ return result;
+}
+
+
+static int init_pgtable(struct kimage *image, unsigned long start_pgtable)
+{
+ pgd_t *level4p;
+ level4p = (pgd_t *)__va(start_pgtable);
+ return init_level4_page(image, level4p, 0, end_pfn << PAGE_SHIFT);
+}
+
+static void set_idt(void *newidt, u16 limit)
+{
+ struct desc_ptr curidt;
+
+ /* x86-64 supports unaliged loads & stores */
+ curidt.size = limit;
+ curidt.address = (unsigned long)newidt;
+
+ __asm__ __volatile__ (
+ "lidtq %0\n"
+ : : "m" (curidt)
+ );
+};
+
+
+static void set_gdt(void *newgdt, u16 limit)
+{
+ struct desc_ptr curgdt;
+
+ /* x86-64 supports unaligned loads & stores */
+ curgdt.size = limit;
+ curgdt.address = (unsigned long)newgdt;
+
+ __asm__ __volatile__ (
+ "lgdtq %0\n"
+ : : "m" (curgdt)
+ );
+};
+
+static void load_segments(void)
+{
+ __asm__ __volatile__ (
+ "\tmovl %0,%%ds\n"
+ "\tmovl %0,%%es\n"
+ "\tmovl %0,%%ss\n"
+ "\tmovl %0,%%fs\n"
+ "\tmovl %0,%%gs\n"
+ : : "a" (__KERNEL_DS)
+ );
+}
+
+typedef NORET_TYPE void (*relocate_new_kernel_t)(unsigned long indirection_page,
+ unsigned long control_code_buffer,
+ unsigned long start_address,
+ unsigned long pgtable) ATTRIB_NORET;
+
+const extern unsigned char relocate_new_kernel[];
+const extern unsigned long relocate_new_kernel_size;
+
+int machine_kexec_prepare(struct kimage *image)
+{
+ unsigned long start_pgtable, control_code_buffer;
+ int result;
+
+ /* Calculate the offsets */
+ start_pgtable = page_to_pfn(image->control_code_page) << PAGE_SHIFT;
+ control_code_buffer = start_pgtable + PAGE_SIZE;
+
+ /* Setup the identity mapped 64bit page table */
+ result = init_pgtable(image, start_pgtable);
+ if (result)
+ return result;
+
+ /* Place the code in the reboot code buffer */
+ memcpy(__va(control_code_buffer), relocate_new_kernel,
+ relocate_new_kernel_size);
+
+ return 0;
+}
+
+void machine_kexec_cleanup(struct kimage *image)
+{
+ return;
+}
+
+/*
+ * Do not allocate memory (or fail in any way) in machine_kexec().
+ * We are past the point of no return, committed to rebooting now.
+ */
+NORET_TYPE void machine_kexec(struct kimage *image)
+{
+ unsigned long page_list;
+ unsigned long control_code_buffer;
+ unsigned long start_pgtable;
+ relocate_new_kernel_t rnk;
+
+ /* Interrupts aren't acceptable while we reboot */
+ local_irq_disable();
+
+ /* Calculate the offsets */
+ page_list = image->head;
+ start_pgtable = page_to_pfn(image->control_code_page) << PAGE_SHIFT;
+ control_code_buffer = start_pgtable + PAGE_SIZE;
+
+ /* Set the low half of the page table to my identity mapped
+ * page table for kexec. Leave the high half pointing at the
+ * kernel pages. Don't bother to flush the global pages
+ * as that will happen when I fully switch to my identity mapped
+ * page table anyway.
+ */
+ memcpy(__va(read_cr3()), __va(start_pgtable), PAGE_SIZE/2);
+ __flush_tlb();
+
+
+ /* The segment registers are funny things, they are
+ * automatically loaded from a table, in memory wherever you
+ * set them to a specific selector, but this table is never
+ * accessed again unless you set the segment to a different selector.
+ *
+ * The more common model are caches where the behide
+ * the scenes work is done, but is also dropped at arbitrary
+ * times.
+ *
+ * I take advantage of this here by force loading the
+ * segments, before I zap the gdt with an invalid value.
+ */
+ load_segments();
+ /* The gdt & idt are now invalid.
+ * If you want to load them you must set up your own idt & gdt.
+ */
+ set_gdt(phys_to_virt(0),0);
+ set_idt(phys_to_virt(0),0);
+ /* now call it */
+ rnk = (relocate_new_kernel_t) control_code_buffer;
+ (*rnk)(page_list, control_code_buffer, image->start, start_pgtable);
+}
diff --git a/arch/x86_64/kernel/mce.c b/arch/x86_64/kernel/mce.c
index 3a89d735a4f6..8aa56736cde3 100644
--- a/arch/x86_64/kernel/mce.c
+++ b/arch/x86_64/kernel/mce.c
@@ -15,6 +15,8 @@
#include <linux/sysdev.h>
#include <linux/miscdevice.h>
#include <linux/fs.h>
+#include <linux/cpu.h>
+#include <linux/percpu.h>
#include <asm/processor.h>
#include <asm/msr.h>
#include <asm/mce.h>
@@ -34,6 +36,7 @@ static unsigned long bank[NR_BANKS] = { [0 ... NR_BANKS-1] = ~0UL };
static unsigned long console_logged;
static int notify_user;
static int rip_msr;
+static int mce_bootlog;
/*
* Lockless MCE logging infrastructure.
@@ -195,10 +198,11 @@ void do_machine_check(struct pt_regs * regs, long error_code)
rdmsrl(MSR_IA32_MC0_ADDR + i*4, m.addr);
mce_get_rip(&m, regs);
- if (error_code != -1)
+ if (error_code >= 0)
rdtscll(m.tsc);
wrmsrl(MSR_IA32_MC0_STATUS + i*4, 0);
- mce_log(&m);
+ if (error_code != -2)
+ mce_log(&m);
/* Did this bank cause the exception? */
/* Assume that the bank with uncorrectable errors did it,
@@ -313,7 +317,7 @@ static void mce_init(void *dummy)
/* Log the machine checks left over from the previous reset.
This also clears all registers */
- do_machine_check(NULL, -1);
+ do_machine_check(NULL, mce_bootlog ? -1 : -2);
set_in_cr4(X86_CR4_MCE);
@@ -327,7 +331,7 @@ static void mce_init(void *dummy)
}
/* Add per CPU specific workarounds here */
-static void __init mce_cpu_quirks(struct cpuinfo_x86 *c)
+static void __cpuinit mce_cpu_quirks(struct cpuinfo_x86 *c)
{
/* This should be disabled by the BIOS, but isn't always */
if (c->x86_vendor == X86_VENDOR_AMD && c->x86 == 15) {
@@ -337,7 +341,7 @@ static void __init mce_cpu_quirks(struct cpuinfo_x86 *c)
}
}
-static void __init mce_cpu_features(struct cpuinfo_x86 *c)
+static void __cpuinit mce_cpu_features(struct cpuinfo_x86 *c)
{
switch (c->x86_vendor) {
case X86_VENDOR_INTEL:
@@ -352,7 +356,7 @@ static void __init mce_cpu_features(struct cpuinfo_x86 *c)
* Called for each booted CPU to set up machine checks.
* Must be called with preempt off.
*/
-void __init mcheck_init(struct cpuinfo_x86 *c)
+void __cpuinit mcheck_init(struct cpuinfo_x86 *c)
{
static cpumask_t mce_cpus __initdata = CPU_MASK_NONE;
@@ -411,7 +415,7 @@ static ssize_t mce_read(struct file *filp, char __user *ubuf, size_t usize, loff
memset(mcelog.entry, 0, next * sizeof(struct mce));
mcelog.next = 0;
- synchronize_kernel();
+ synchronize_sched();
/* Collect entries that were still getting written before the synchronize. */
@@ -474,11 +478,17 @@ static int __init mcheck_disable(char *str)
}
/* mce=off disables machine check. Note you can reenable it later
- using sysfs */
+ using sysfs.
+ mce=bootlog Log MCEs from before booting. Disabled by default to work
+ around buggy BIOS that leave bogus MCEs. */
static int __init mcheck_enable(char *str)
{
+ if (*str == '=')
+ str++;
if (!strcmp(str, "off"))
mce_dont_init = 1;
+ else if (!strcmp(str, "bootlog"))
+ mce_bootlog = 1;
else
printk("mce= argument %s ignored. Please use /sys", str);
return 0;
@@ -514,10 +524,7 @@ static struct sysdev_class mce_sysclass = {
set_kset_name("machinecheck"),
};
-static struct sys_device device_mce = {
- .id = 0,
- .cls = &mce_sysclass,
-};
+static DEFINE_PER_CPU(struct sys_device, device_mce);
/* Why are there no generic functions for this? */
#define ACCESSOR(name, var, start) \
@@ -542,27 +549,83 @@ ACCESSOR(bank4ctl,bank[4],mce_restart())
ACCESSOR(tolerant,tolerant,)
ACCESSOR(check_interval,check_interval,mce_restart())
+/* Per cpu sysdev init. All of the cpus still share the same ctl bank */
+static __cpuinit int mce_create_device(unsigned int cpu)
+{
+ int err;
+ if (!mce_available(&cpu_data[cpu]))
+ return -EIO;
+
+ per_cpu(device_mce,cpu).id = cpu;
+ per_cpu(device_mce,cpu).cls = &mce_sysclass;
+
+ err = sysdev_register(&per_cpu(device_mce,cpu));
+
+ if (!err) {
+ sysdev_create_file(&per_cpu(device_mce,cpu), &attr_bank0ctl);
+ sysdev_create_file(&per_cpu(device_mce,cpu), &attr_bank1ctl);
+ sysdev_create_file(&per_cpu(device_mce,cpu), &attr_bank2ctl);
+ sysdev_create_file(&per_cpu(device_mce,cpu), &attr_bank3ctl);
+ sysdev_create_file(&per_cpu(device_mce,cpu), &attr_bank4ctl);
+ sysdev_create_file(&per_cpu(device_mce,cpu), &attr_tolerant);
+ sysdev_create_file(&per_cpu(device_mce,cpu), &attr_check_interval);
+ }
+ return err;
+}
+
+#ifdef CONFIG_HOTPLUG_CPU
+static __cpuinit void mce_remove_device(unsigned int cpu)
+{
+ sysdev_remove_file(&per_cpu(device_mce,cpu), &attr_bank0ctl);
+ sysdev_remove_file(&per_cpu(device_mce,cpu), &attr_bank1ctl);
+ sysdev_remove_file(&per_cpu(device_mce,cpu), &attr_bank2ctl);
+ sysdev_remove_file(&per_cpu(device_mce,cpu), &attr_bank3ctl);
+ sysdev_remove_file(&per_cpu(device_mce,cpu), &attr_bank4ctl);
+ sysdev_remove_file(&per_cpu(device_mce,cpu), &attr_tolerant);
+ sysdev_remove_file(&per_cpu(device_mce,cpu), &attr_check_interval);
+ sysdev_unregister(&per_cpu(device_mce,cpu));
+}
+#endif
+
+/* Get notified when a cpu comes on/off. Be hotplug friendly. */
+static __cpuinit int
+mce_cpu_callback(struct notifier_block *nfb, unsigned long action, void *hcpu)
+{
+ unsigned int cpu = (unsigned long)hcpu;
+
+ switch (action) {
+ case CPU_ONLINE:
+ mce_create_device(cpu);
+ break;
+#ifdef CONFIG_HOTPLUG_CPU
+ case CPU_DEAD:
+ mce_remove_device(cpu);
+ break;
+#endif
+ }
+ return NOTIFY_OK;
+}
+
+static struct notifier_block mce_cpu_notifier = {
+ .notifier_call = mce_cpu_callback,
+};
+
static __init int mce_init_device(void)
{
int err;
+ int i = 0;
+
if (!mce_available(&boot_cpu_data))
return -EIO;
err = sysdev_class_register(&mce_sysclass);
- if (!err)
- err = sysdev_register(&device_mce);
- if (!err) {
- /* could create per CPU objects, but it is not worth it. */
- sysdev_create_file(&device_mce, &attr_bank0ctl);
- sysdev_create_file(&device_mce, &attr_bank1ctl);
- sysdev_create_file(&device_mce, &attr_bank2ctl);
- sysdev_create_file(&device_mce, &attr_bank3ctl);
- sysdev_create_file(&device_mce, &attr_bank4ctl);
- sysdev_create_file(&device_mce, &attr_tolerant);
- sysdev_create_file(&device_mce, &attr_check_interval);
- }
-
+
+ for_each_online_cpu(i) {
+ mce_create_device(i);
+ }
+
+ register_cpu_notifier(&mce_cpu_notifier);
misc_register(&mce_log_device);
return err;
-
}
+
device_initcall(mce_init_device);
diff --git a/arch/x86_64/kernel/mce_intel.c b/arch/x86_64/kernel/mce_intel.c
index 4db9a640069f..0be0a7959814 100644
--- a/arch/x86_64/kernel/mce_intel.c
+++ b/arch/x86_64/kernel/mce_intel.c
@@ -42,7 +42,7 @@ done:
irq_exit();
}
-static void __init intel_init_thermal(struct cpuinfo_x86 *c)
+static void __cpuinit intel_init_thermal(struct cpuinfo_x86 *c)
{
u32 l, h;
int tm2 = 0;
@@ -93,7 +93,7 @@ static void __init intel_init_thermal(struct cpuinfo_x86 *c)
return;
}
-void __init mce_intel_feature_init(struct cpuinfo_x86 *c)
+void __cpuinit mce_intel_feature_init(struct cpuinfo_x86 *c)
{
intel_init_thermal(c);
}
diff --git a/arch/x86_64/kernel/mpparse.c b/arch/x86_64/kernel/mpparse.c
index 61a63be6b294..79c362d03e2e 100644
--- a/arch/x86_64/kernel/mpparse.c
+++ b/arch/x86_64/kernel/mpparse.c
@@ -23,6 +23,7 @@
#include <linux/kernel_stat.h>
#include <linux/mc146818rtc.h>
#include <linux/acpi.h>
+#include <linux/module.h>
#include <asm/smp.h>
#include <asm/mtrr.h>
@@ -45,7 +46,8 @@ int acpi_found_madt;
int apic_version [MAX_APICS];
unsigned char mp_bus_id_to_type [MAX_MP_BUSSES] = { [0 ... MAX_MP_BUSSES-1] = -1 };
int mp_bus_id_to_pci_bus [MAX_MP_BUSSES] = { [0 ... MAX_MP_BUSSES-1] = -1 };
-cpumask_t pci_bus_to_cpumask [256] = { [0 ... 255] = CPU_MASK_ALL };
+unsigned char pci_bus_to_node [256];
+EXPORT_SYMBOL(pci_bus_to_node);
static int mp_current_pci_id = 0;
/* I/O APIC entries */
@@ -107,7 +109,7 @@ static int __init mpf_checksum(unsigned char *mp, int len)
static void __init MP_processor_info (struct mpc_config_processor *m)
{
- int ver;
+ int ver, cpu;
static int found_bsp=0;
if (!(m->mpc_cpuflag & CPU_ENABLED))
@@ -129,7 +131,7 @@ static void __init MP_processor_info (struct mpc_config_processor *m)
return;
}
- num_processors++;
+ cpu = num_processors++;
if (m->mpc_apicid > MAX_APICS) {
printk(KERN_ERR "Processor #%d INVALID. (Max ID: %d).\n",
@@ -153,13 +155,18 @@ static void __init MP_processor_info (struct mpc_config_processor *m)
* in same order as logical cpu numbers. Hence the first
* entry is BSP, and so on.
*/
+ cpu = 0;
+
bios_cpu_apicid[0] = m->mpc_apicid;
x86_cpu_to_apicid[0] = m->mpc_apicid;
found_bsp = 1;
- } else {
- bios_cpu_apicid[num_processors - found_bsp] = m->mpc_apicid;
- x86_cpu_to_apicid[num_processors - found_bsp] = m->mpc_apicid;
- }
+ } else
+ cpu = num_processors - found_bsp;
+ bios_cpu_apicid[cpu] = m->mpc_apicid;
+ x86_cpu_to_apicid[cpu] = m->mpc_apicid;
+
+ cpu_set(cpu, cpu_possible_map);
+ cpu_set(cpu, cpu_present_map);
}
static void __init MP_bus_info (struct mpc_config_bus *m)
@@ -904,11 +911,20 @@ void __init mp_config_acpi_legacy_irqs (void)
return;
}
+#define MAX_GSI_NUM 4096
+
int mp_register_gsi(u32 gsi, int edge_level, int active_high_low)
{
int ioapic = -1;
int ioapic_pin = 0;
int idx, bit = 0;
+ static int pci_irq = 16;
+ /*
+ * Mapping between Global System Interrupts, which
+ * represent all possible interrupts, to the IRQs
+ * assigned to actual devices.
+ */
+ static int gsi_to_irq[MAX_GSI_NUM];
if (acpi_irq_model != ACPI_IRQ_MODEL_IOAPIC)
return gsi;
@@ -943,11 +959,34 @@ int mp_register_gsi(u32 gsi, int edge_level, int active_high_low)
if ((1<<bit) & mp_ioapic_routing[ioapic].pin_programmed[idx]) {
Dprintk(KERN_DEBUG "Pin %d-%d already programmed\n",
mp_ioapic_routing[ioapic].apic_id, ioapic_pin);
- return gsi;
+ return gsi_to_irq[gsi];
}
mp_ioapic_routing[ioapic].pin_programmed[idx] |= (1<<bit);
+ if (edge_level) {
+ /*
+ * For PCI devices assign IRQs in order, avoiding gaps
+ * due to unused I/O APIC pins.
+ */
+ int irq = gsi;
+ if (gsi < MAX_GSI_NUM) {
+ if (gsi > 15)
+ gsi = pci_irq++;
+#ifdef CONFIG_ACPI_BUS
+ /*
+ * Don't assign IRQ used by ACPI SCI
+ */
+ if (gsi == acpi_fadt.sci_int)
+ gsi = pci_irq++;
+#endif
+ gsi_to_irq[irq] = gsi;
+ } else {
+ printk(KERN_ERR "GSI %u is too high\n", gsi);
+ return gsi;
+ }
+ }
+
io_apic_set_pci_routing(ioapic, ioapic_pin, gsi,
edge_level == ACPI_EDGE_SENSITIVE ? 0 : 1,
active_high_low == ACPI_ACTIVE_HIGH ? 0 : 1);
diff --git a/arch/x86_64/kernel/nmi.c b/arch/x86_64/kernel/nmi.c
index 31c0f2e6ac91..4e44d6e6b7e5 100644
--- a/arch/x86_64/kernel/nmi.c
+++ b/arch/x86_64/kernel/nmi.c
@@ -98,7 +98,7 @@ static unsigned int nmi_p4_cccr_val;
(P4_CCCR_OVF_PMI0|P4_CCCR_THRESHOLD(15)|P4_CCCR_COMPLEMENT| \
P4_CCCR_COMPARE|P4_CCCR_REQUIRED|P4_CCCR_ESCR_SELECT(4)|P4_CCCR_ENABLE)
-static __init inline int nmi_known_cpu(void)
+static __cpuinit inline int nmi_known_cpu(void)
{
switch (boot_cpu_data.x86_vendor) {
case X86_VENDOR_AMD:
@@ -110,7 +110,7 @@ static __init inline int nmi_known_cpu(void)
}
/* Run after command line and cpu_init init, but before all other checks */
-void __init nmi_watchdog_default(void)
+void __cpuinit nmi_watchdog_default(void)
{
if (nmi_watchdog != NMI_DEFAULT)
return;
diff --git a/arch/x86_64/kernel/process.c b/arch/x86_64/kernel/process.c
index dce8bab4306c..7577f9d7a75d 100644
--- a/arch/x86_64/kernel/process.c
+++ b/arch/x86_64/kernel/process.c
@@ -8,7 +8,8 @@
*
* X86-64 port
* Andi Kleen.
- *
+ *
+ * CPU hotplug support - ashok.raj@intel.com
* $Id: process.c,v 1.38 2002/01/15 10:08:03 ak Exp $
*/
@@ -18,6 +19,7 @@
#include <stdarg.h>
+#include <linux/cpu.h>
#include <linux/errno.h>
#include <linux/sched.h>
#include <linux/kernel.h>
@@ -34,6 +36,7 @@
#include <linux/ptrace.h>
#include <linux/utsname.h>
#include <linux/random.h>
+#include <linux/kprobes.h>
#include <asm/uaccess.h>
#include <asm/pgtable.h>
@@ -153,6 +156,29 @@ void cpu_idle_wait(void)
}
EXPORT_SYMBOL_GPL(cpu_idle_wait);
+#ifdef CONFIG_HOTPLUG_CPU
+DECLARE_PER_CPU(int, cpu_state);
+
+#include <asm/nmi.h>
+/* We don't actually take CPU down, just spin without interrupts. */
+static inline void play_dead(void)
+{
+ idle_task_exit();
+ wbinvd();
+ mb();
+ /* Ack it */
+ __get_cpu_var(cpu_state) = CPU_DEAD;
+
+ while (1)
+ safe_halt();
+}
+#else
+static inline void play_dead(void)
+{
+ BUG();
+}
+#endif /* CONFIG_HOTPLUG_CPU */
+
/*
* The idle thread. There's no useful work to be
* done, so just try to conserve power and have a
@@ -173,6 +199,8 @@ void cpu_idle (void)
idle = pm_idle;
if (!idle)
idle = default_idle;
+ if (cpu_is_offline(smp_processor_id()))
+ play_dead();
idle();
}
@@ -203,7 +231,7 @@ static void mwait_idle(void)
}
}
-void __init select_idle_routine(const struct cpuinfo_x86 *c)
+void __cpuinit select_idle_routine(const struct cpuinfo_x86 *c)
{
static int printed;
if (cpu_has(c, X86_FEATURE_MWAIT)) {
@@ -293,6 +321,14 @@ void exit_thread(void)
{
struct task_struct *me = current;
struct thread_struct *t = &me->thread;
+
+ /*
+ * Remove function-return probe instances associated with this task
+ * and put them back on the free list. Do not insert an exit probe for
+ * this function, it will be disabled by kprobe_flush_task if you do.
+ */
+ kprobe_flush_task(me);
+
if (me->thread.io_bitmap_ptr) {
struct tss_struct *tss = &per_cpu(init_tss, get_cpu());
@@ -312,6 +348,13 @@ void flush_thread(void)
struct task_struct *tsk = current;
struct thread_info *t = current_thread_info();
+ /*
+ * Remove function-return probe instances associated with this task
+ * and put them back on the free list. Do not insert an exit probe for
+ * this function, it will be disabled by kprobe_flush_task if you do.
+ */
+ kprobe_flush_task(tsk);
+
if (t->flags & _TIF_ABI_PENDING)
t->flags ^= (_TIF_ABI_PENDING | _TIF_IA32);
@@ -439,6 +482,33 @@ out:
}
/*
+ * This function selects if the context switch from prev to next
+ * has to tweak the TSC disable bit in the cr4.
+ */
+static inline void disable_tsc(struct task_struct *prev_p,
+ struct task_struct *next_p)
+{
+ struct thread_info *prev, *next;
+
+ /*
+ * gcc should eliminate the ->thread_info dereference if
+ * has_secure_computing returns 0 at compile time (SECCOMP=n).
+ */
+ prev = prev_p->thread_info;
+ next = next_p->thread_info;
+
+ if (has_secure_computing(prev) || has_secure_computing(next)) {
+ /* slow path here */
+ if (has_secure_computing(prev) &&
+ !has_secure_computing(next)) {
+ write_cr4(read_cr4() & ~X86_CR4_TSD);
+ } else if (!has_secure_computing(prev) &&
+ has_secure_computing(next))
+ write_cr4(read_cr4() | X86_CR4_TSD);
+ }
+}
+
+/*
* This special macro can be used to load a debugging register
*/
#define loaddebug(thread,r) set_debug(thread->debugreg ## r, r)
@@ -556,6 +626,8 @@ struct task_struct *__switch_to(struct task_struct *prev_p, struct task_struct *
}
}
+ disable_tsc(prev_p, next_p);
+
return prev_p;
}
diff --git a/arch/x86_64/kernel/reboot.c b/arch/x86_64/kernel/reboot.c
index be4b36f762cf..47f95687905f 100644
--- a/arch/x86_64/kernel/reboot.c
+++ b/arch/x86_64/kernel/reboot.c
@@ -66,34 +66,6 @@ static int __init reboot_setup(char *str)
__setup("reboot=", reboot_setup);
-#ifdef CONFIG_SMP
-static void smp_halt(void)
-{
- int cpuid = safe_smp_processor_id();
- static int first_entry = 1;
-
- if (reboot_force)
- return;
-
- if (first_entry) {
- first_entry = 0;
- smp_call_function((void *)machine_restart, NULL, 1, 0);
- }
-
- smp_stop_cpu();
-
- /* AP calling this. Just halt */
- if (cpuid != boot_cpu_id) {
- for (;;)
- asm("hlt");
- }
-
- /* Wait for all other CPUs to have run smp_stop_cpu */
- while (!cpus_empty(cpu_online_map))
- rep_nop();
-}
-#endif
-
static inline void kb_wait(void)
{
int i;
@@ -103,25 +75,44 @@ static inline void kb_wait(void)
break;
}
-void machine_restart(char * __unused)
+void machine_shutdown(void)
{
- int i;
+ /* Stop the cpus and apics */
+#ifdef CONFIG_SMP
+ int reboot_cpu_id;
- printk("machine restart\n");
+ /* The boot cpu is always logical cpu 0 */
+ reboot_cpu_id = 0;
-#ifdef CONFIG_SMP
- smp_halt();
+ /* Make certain the cpu I'm about to reboot on is online */
+ if (!cpu_isset(reboot_cpu_id, cpu_online_map)) {
+ reboot_cpu_id = smp_processor_id();
+ }
+
+ /* Make certain I only run on the appropriate processor */
+ set_cpus_allowed(current, cpumask_of_cpu(reboot_cpu_id));
+
+ /* O.K Now that I'm on the appropriate processor,
+ * stop all of the others.
+ */
+ smp_send_stop();
#endif
- if (!reboot_force) {
- local_irq_disable();
+ local_irq_disable();
+
#ifndef CONFIG_SMP
- disable_local_APIC();
+ disable_local_APIC();
#endif
- disable_IO_APIC();
- local_irq_enable();
- }
-
+
+ disable_IO_APIC();
+
+ local_irq_enable();
+}
+
+void machine_emergency_restart(void)
+{
+ int i;
+
/* Tell the BIOS if we want cold or warm reboot */
*((unsigned short *)__va(0x472)) = reboot_mode;
@@ -146,18 +137,26 @@ void machine_restart(char * __unused)
}
}
-EXPORT_SYMBOL(machine_restart);
+void machine_restart(char * __unused)
+{
+ printk("machine restart\n");
+
+ if (!reboot_force) {
+ machine_shutdown();
+ }
+ machine_emergency_restart();
+}
void machine_halt(void)
{
}
-EXPORT_SYMBOL(machine_halt);
-
void machine_power_off(void)
{
+ if (!reboot_force) {
+ machine_shutdown();
+ }
if (pm_power_off)
pm_power_off();
}
-EXPORT_SYMBOL(machine_power_off);
diff --git a/arch/x86_64/kernel/relocate_kernel.S b/arch/x86_64/kernel/relocate_kernel.S
new file mode 100644
index 000000000000..d24fa9b72a2b
--- /dev/null
+++ b/arch/x86_64/kernel/relocate_kernel.S
@@ -0,0 +1,143 @@
+/*
+ * relocate_kernel.S - put the kernel image in place to boot
+ * Copyright (C) 2002-2005 Eric Biederman <ebiederm@xmission.com>
+ *
+ * This source code is licensed under the GNU General Public License,
+ * Version 2. See the file COPYING for more details.
+ */
+
+#include <linux/linkage.h>
+
+ /*
+ * Must be relocatable PIC code callable as a C function, that once
+ * it starts can not use the previous processes stack.
+ */
+ .globl relocate_new_kernel
+ .code64
+relocate_new_kernel:
+ /* %rdi page_list
+ * %rsi reboot_code_buffer
+ * %rdx start address
+ * %rcx page_table
+ * %r8 arg5
+ * %r9 arg6
+ */
+
+ /* zero out flags, and disable interrupts */
+ pushq $0
+ popfq
+
+ /* set a new stack at the bottom of our page... */
+ lea 4096(%rsi), %rsp
+
+ /* store the parameters back on the stack */
+ pushq %rdx /* store the start address */
+
+ /* Set cr0 to a known state:
+ * 31 1 == Paging enabled
+ * 18 0 == Alignment check disabled
+ * 16 0 == Write protect disabled
+ * 3 0 == No task switch
+ * 2 0 == Don't do FP software emulation.
+ * 0 1 == Proctected mode enabled
+ */
+ movq %cr0, %rax
+ andq $~((1<<18)|(1<<16)|(1<<3)|(1<<2)), %rax
+ orl $((1<<31)|(1<<0)), %eax
+ movq %rax, %cr0
+
+ /* Set cr4 to a known state:
+ * 10 0 == xmm exceptions disabled
+ * 9 0 == xmm registers instructions disabled
+ * 8 0 == performance monitoring counter disabled
+ * 7 0 == page global disabled
+ * 6 0 == machine check exceptions disabled
+ * 5 1 == physical address extension enabled
+ * 4 0 == page size extensions disabled
+ * 3 0 == Debug extensions disabled
+ * 2 0 == Time stamp disable (disabled)
+ * 1 0 == Protected mode virtual interrupts disabled
+ * 0 0 == VME disabled
+ */
+
+ movq $((1<<5)), %rax
+ movq %rax, %cr4
+
+ jmp 1f
+1:
+
+ /* Switch to the identity mapped page tables,
+ * and flush the TLB.
+ */
+ movq %rcx, %cr3
+
+ /* Do the copies */
+ movq %rdi, %rcx /* Put the page_list in %rcx */
+ xorq %rdi, %rdi
+ xorq %rsi, %rsi
+ jmp 1f
+
+0: /* top, read another word for the indirection page */
+
+ movq (%rbx), %rcx
+ addq $8, %rbx
+1:
+ testq $0x1, %rcx /* is it a destination page? */
+ jz 2f
+ movq %rcx, %rdi
+ andq $0xfffffffffffff000, %rdi
+ jmp 0b
+2:
+ testq $0x2, %rcx /* is it an indirection page? */
+ jz 2f
+ movq %rcx, %rbx
+ andq $0xfffffffffffff000, %rbx
+ jmp 0b
+2:
+ testq $0x4, %rcx /* is it the done indicator? */
+ jz 2f
+ jmp 3f
+2:
+ testq $0x8, %rcx /* is it the source indicator? */
+ jz 0b /* Ignore it otherwise */
+ movq %rcx, %rsi /* For ever source page do a copy */
+ andq $0xfffffffffffff000, %rsi
+
+ movq $512, %rcx
+ rep ; movsq
+ jmp 0b
+3:
+
+ /* To be certain of avoiding problems with self-modifying code
+ * I need to execute a serializing instruction here.
+ * So I flush the TLB by reloading %cr3 here, it's handy,
+ * and not processor dependent.
+ */
+ movq %cr3, %rax
+ movq %rax, %cr3
+
+ /* set all of the registers to known values */
+ /* leave %rsp alone */
+
+ xorq %rax, %rax
+ xorq %rbx, %rbx
+ xorq %rcx, %rcx
+ xorq %rdx, %rdx
+ xorq %rsi, %rsi
+ xorq %rdi, %rdi
+ xorq %rbp, %rbp
+ xorq %r8, %r8
+ xorq %r9, %r9
+ xorq %r10, %r9
+ xorq %r11, %r11
+ xorq %r12, %r12
+ xorq %r13, %r13
+ xorq %r14, %r14
+ xorq %r15, %r15
+
+ ret
+relocate_new_kernel_end:
+
+ .globl relocate_new_kernel_size
+relocate_new_kernel_size:
+ .quad relocate_new_kernel_end - relocate_new_kernel
diff --git a/arch/x86_64/kernel/setup.c b/arch/x86_64/kernel/setup.c
index 99f038ede23c..116a491e2961 100644
--- a/arch/x86_64/kernel/setup.c
+++ b/arch/x86_64/kernel/setup.c
@@ -40,6 +40,9 @@
#include <linux/acpi.h>
#include <linux/kallsyms.h>
#include <linux/edd.h>
+#include <linux/mmzone.h>
+#include <linux/kexec.h>
+
#include <asm/mtrr.h>
#include <asm/uaccess.h>
#include <asm/system.h>
@@ -348,7 +351,7 @@ static __init void parse_cmdline_early (char ** cmdline_p)
if (!memcmp(from, "mem=", 4))
parse_memopt(from+4, &from);
-#ifdef CONFIG_DISCONTIGMEM
+#ifdef CONFIG_NUMA
if (!memcmp(from, "numa=", 5))
numa_setup(from+5);
#endif
@@ -365,6 +368,27 @@ static __init void parse_cmdline_early (char ** cmdline_p)
if (!memcmp(from, "noexec=", 7))
nonx_setup(from + 7);
+#ifdef CONFIG_KEXEC
+ /* crashkernel=size@addr specifies the location to reserve for
+ * a crash kernel. By reserving this memory we guarantee
+ * that linux never set's it up as a DMA target.
+ * Useful for holding code to do something appropriate
+ * after a kernel panic.
+ */
+ else if (!memcmp(from, "crashkernel=", 12)) {
+ unsigned long size, base;
+ size = memparse(from+12, &from);
+ if (*from == '@') {
+ base = memparse(from+1, &from);
+ /* FIXME: Do I want a sanity check
+ * to validate the memory range?
+ */
+ crashk_res.start = base;
+ crashk_res.end = base + size - 1;
+ }
+ }
+#endif
+
next_char:
c = *(from++);
if (!c)
@@ -377,17 +401,20 @@ static __init void parse_cmdline_early (char ** cmdline_p)
*cmdline_p = command_line;
}
-#ifndef CONFIG_DISCONTIGMEM
-static void __init contig_initmem_init(void)
+#ifndef CONFIG_NUMA
+static void __init
+contig_initmem_init(unsigned long start_pfn, unsigned long end_pfn)
{
- unsigned long bootmap_size, bootmap;
- bootmap_size = bootmem_bootmap_pages(end_pfn)<<PAGE_SHIFT;
- bootmap = find_e820_area(0, end_pfn<<PAGE_SHIFT, bootmap_size);
- if (bootmap == -1L)
- panic("Cannot find bootmem map of size %ld\n",bootmap_size);
- bootmap_size = init_bootmem(bootmap >> PAGE_SHIFT, end_pfn);
- e820_bootmem_free(&contig_page_data, 0, end_pfn << PAGE_SHIFT);
- reserve_bootmem(bootmap, bootmap_size);
+ unsigned long bootmap_size, bootmap;
+
+ memory_present(0, start_pfn, end_pfn);
+ bootmap_size = bootmem_bootmap_pages(end_pfn)<<PAGE_SHIFT;
+ bootmap = find_e820_area(0, end_pfn<<PAGE_SHIFT, bootmap_size);
+ if (bootmap == -1L)
+ panic("Cannot find bootmem map of size %ld\n",bootmap_size);
+ bootmap_size = init_bootmem(bootmap >> PAGE_SHIFT, end_pfn);
+ e820_bootmem_free(NODE_DATA(0), 0, end_pfn << PAGE_SHIFT);
+ reserve_bootmem(bootmap, bootmap_size);
}
#endif
@@ -554,10 +581,10 @@ void __init setup_arch(char **cmdline_p)
acpi_numa_init();
#endif
-#ifdef CONFIG_DISCONTIGMEM
+#ifdef CONFIG_NUMA
numa_initmem_init(0, end_pfn);
#else
- contig_initmem_init();
+ contig_initmem_init(0, end_pfn);
#endif
/* Reserve direct mapping */
@@ -618,6 +645,15 @@ void __init setup_arch(char **cmdline_p)
}
}
#endif
+#ifdef CONFIG_KEXEC
+ if (crashk_res.start != crashk_res.end) {
+ reserve_bootmem(crashk_res.start,
+ crashk_res.end - crashk_res.start + 1);
+ }
+#endif
+
+ sparse_init();
+
paging_init();
check_ioapic();
@@ -669,7 +705,7 @@ void __init setup_arch(char **cmdline_p)
#endif
}
-static int __init get_model_name(struct cpuinfo_x86 *c)
+static int __cpuinit get_model_name(struct cpuinfo_x86 *c)
{
unsigned int *v;
@@ -685,7 +721,7 @@ static int __init get_model_name(struct cpuinfo_x86 *c)
}
-static void __init display_cacheinfo(struct cpuinfo_x86 *c)
+static void __cpuinit display_cacheinfo(struct cpuinfo_x86 *c)
{
unsigned int n, dummy, eax, ebx, ecx, edx;
@@ -729,8 +765,6 @@ static void __init amd_detect_cmp(struct cpuinfo_x86 *c)
int cpu = smp_processor_id();
int node = 0;
unsigned bits;
- if (c->x86_num_cores == 1)
- return;
bits = 0;
while ((1 << bits) < c->x86_num_cores)
@@ -796,7 +830,7 @@ static int __init init_amd(struct cpuinfo_x86 *c)
return r;
}
-static void __init detect_ht(struct cpuinfo_x86 *c)
+static void __cpuinit detect_ht(struct cpuinfo_x86 *c)
{
#ifdef CONFIG_SMP
u32 eax, ebx, ecx, edx;
@@ -857,7 +891,7 @@ static void __init detect_ht(struct cpuinfo_x86 *c)
/*
* find out the number of processor cores on the die
*/
-static int __init intel_num_cpu_cores(struct cpuinfo_x86 *c)
+static int __cpuinit intel_num_cpu_cores(struct cpuinfo_x86 *c)
{
unsigned int eax;
@@ -875,7 +909,7 @@ static int __init intel_num_cpu_cores(struct cpuinfo_x86 *c)
return 1;
}
-static void __init init_intel(struct cpuinfo_x86 *c)
+static void __cpuinit init_intel(struct cpuinfo_x86 *c)
{
/* Cache sizes */
unsigned n;
@@ -895,7 +929,7 @@ static void __init init_intel(struct cpuinfo_x86 *c)
c->x86_num_cores = intel_num_cpu_cores(c);
}
-void __init get_cpu_vendor(struct cpuinfo_x86 *c)
+void __cpuinit get_cpu_vendor(struct cpuinfo_x86 *c)
{
char *v = c->x86_vendor_id;
@@ -916,7 +950,7 @@ struct cpu_model_info {
/* Do some early cpuid on the boot CPU to get some parameter that are
needed before check_bugs. Everything advanced is in identify_cpu
below. */
-void __init early_identify_cpu(struct cpuinfo_x86 *c)
+void __cpuinit early_identify_cpu(struct cpuinfo_x86 *c)
{
u32 tfms;
@@ -970,7 +1004,7 @@ void __init early_identify_cpu(struct cpuinfo_x86 *c)
/*
* This does the hard work of actually picking apart the CPU stuff...
*/
-void __init identify_cpu(struct cpuinfo_x86 *c)
+void __cpuinit identify_cpu(struct cpuinfo_x86 *c)
{
int i;
u32 xlvl;
@@ -1040,14 +1074,17 @@ void __init identify_cpu(struct cpuinfo_x86 *c)
#ifdef CONFIG_X86_MCE
mcheck_init(c);
#endif
+ if (c == &boot_cpu_data)
+ mtrr_bp_init();
+ else
+ mtrr_ap_init();
#ifdef CONFIG_NUMA
- if (c != &boot_cpu_data)
- numa_add_cpu(c - cpu_data);
+ numa_add_cpu(smp_processor_id());
#endif
}
-void __init print_cpu_info(struct cpuinfo_x86 *c)
+void __cpuinit print_cpu_info(struct cpuinfo_x86 *c)
{
if (c->x86_model_id[0])
printk("%s", c->x86_model_id);
diff --git a/arch/x86_64/kernel/setup64.c b/arch/x86_64/kernel/setup64.c
index 678b7ac33b8b..34082c1cc41e 100644
--- a/arch/x86_64/kernel/setup64.c
+++ b/arch/x86_64/kernel/setup64.c
@@ -12,6 +12,7 @@
#include <linux/string.h>
#include <linux/bootmem.h>
#include <linux/bitops.h>
+#include <linux/module.h>
#include <asm/bootsetup.h>
#include <asm/pda.h>
#include <asm/pgtable.h>
@@ -22,22 +23,15 @@
#include <asm/smp.h>
#include <asm/i387.h>
#include <asm/percpu.h>
-#include <asm/mtrr.h>
#include <asm/proto.h>
-#include <asm/mman.h>
-#include <asm/numa.h>
+#include <asm/sections.h>
char x86_boot_params[BOOT_PARAM_SIZE] __initdata = {0,};
-cpumask_t cpu_initialized __initdata = CPU_MASK_NONE;
+cpumask_t cpu_initialized __cpuinitdata = CPU_MASK_NONE;
struct x8664_pda cpu_pda[NR_CPUS] __cacheline_aligned;
-extern struct task_struct init_task;
-
-extern unsigned char __per_cpu_start[], __per_cpu_end[];
-
-extern struct desc_ptr cpu_gdt_descr[];
struct desc_ptr idt_descr = { 256 * 16, (unsigned long) idt_table };
char boot_cpu_stack[IRQSTACKSIZE] __attribute__((section(".bss.page_aligned")));
@@ -101,7 +95,7 @@ void __init setup_per_cpu_areas(void)
#endif
for (i = 0; i < NR_CPUS; i++) {
- unsigned char *ptr;
+ char *ptr;
if (!NODE_DATA(cpu_to_node(i))) {
printk("cpu with no node %d, num_online_nodes %d\n",
@@ -171,7 +165,7 @@ void syscall_init(void)
wrmsrl(MSR_SYSCALL_MASK, EF_TF|EF_DF|EF_IE|0x3000);
}
-void __init check_efer(void)
+void __cpuinit check_efer(void)
{
unsigned long efer;
@@ -188,13 +182,9 @@ void __init check_efer(void)
* 'CPU state barrier', nothing should get across.
* A lot of state is already set up in PDA init.
*/
-void __init cpu_init (void)
+void __cpuinit cpu_init (void)
{
-#ifdef CONFIG_SMP
int cpu = stack_smp_processor_id();
-#else
- int cpu = smp_processor_id();
-#endif
struct tss_struct *t = &per_cpu(init_tss, cpu);
unsigned long v;
char *estacks = NULL;
@@ -214,7 +204,7 @@ void __init cpu_init (void)
printk("Initializing CPU#%d\n", cpu);
- clear_in_cr4(X86_CR4_VME|X86_CR4_PVI|X86_CR4_TSD|X86_CR4_DE);
+ clear_in_cr4(X86_CR4_VME|X86_CR4_PVI|X86_CR4_TSD|X86_CR4_DE);
/*
* Initialize the per-CPU GDT with the boot GDT,
diff --git a/arch/x86_64/kernel/signal.c b/arch/x86_64/kernel/signal.c
index 429c0269dc4e..d642fbf3da29 100644
--- a/arch/x86_64/kernel/signal.c
+++ b/arch/x86_64/kernel/signal.c
@@ -34,9 +34,9 @@
#define _BLOCKABLE (~(sigmask(SIGKILL) | sigmask(SIGSTOP)))
-void ia32_setup_rt_frame(int sig, struct k_sigaction *ka, siginfo_t *info,
+int ia32_setup_rt_frame(int sig, struct k_sigaction *ka, siginfo_t *info,
sigset_t *set, struct pt_regs * regs);
-void ia32_setup_frame(int sig, struct k_sigaction *ka,
+int ia32_setup_frame(int sig, struct k_sigaction *ka,
sigset_t *set, struct pt_regs * regs);
asmlinkage long
@@ -238,7 +238,7 @@ get_stack(struct k_sigaction *ka, struct pt_regs *regs, unsigned long size)
return (void __user *)round_down(rsp - size, 16);
}
-static void setup_rt_frame(int sig, struct k_sigaction *ka, siginfo_t *info,
+static int setup_rt_frame(int sig, struct k_sigaction *ka, siginfo_t *info,
sigset_t *set, struct pt_regs * regs)
{
struct rt_sigframe __user *frame;
@@ -327,20 +327,23 @@ static void setup_rt_frame(int sig, struct k_sigaction *ka, siginfo_t *info,
current->comm, current->pid, frame, regs->rip, frame->pretcode);
#endif
- return;
+ return 1;
give_sigsegv:
force_sigsegv(sig, current);
+ return 0;
}
/*
* OK, we're invoking a handler
*/
-static void
+static int
handle_signal(unsigned long sig, siginfo_t *info, struct k_sigaction *ka,
sigset_t *oldset, struct pt_regs *regs)
{
+ int ret;
+
#ifdef DEBUG_SIG
printk("handle_signal pid:%d sig:%lu rip:%lx rsp:%lx regs=%p\n",
current->pid, sig,
@@ -384,20 +387,23 @@ handle_signal(unsigned long sig, siginfo_t *info, struct k_sigaction *ka,
#ifdef CONFIG_IA32_EMULATION
if (test_thread_flag(TIF_IA32)) {
if (ka->sa.sa_flags & SA_SIGINFO)
- ia32_setup_rt_frame(sig, ka, info, oldset, regs);
+ ret = ia32_setup_rt_frame(sig, ka, info, oldset, regs);
else
- ia32_setup_frame(sig, ka, oldset, regs);
+ ret = ia32_setup_frame(sig, ka, oldset, regs);
} else
#endif
- setup_rt_frame(sig, ka, info, oldset, regs);
+ ret = setup_rt_frame(sig, ka, info, oldset, regs);
- if (!(ka->sa.sa_flags & SA_NODEFER)) {
+ if (ret) {
spin_lock_irq(&current->sighand->siglock);
sigorsets(&current->blocked,&current->blocked,&ka->sa.sa_mask);
- sigaddset(&current->blocked,sig);
+ if (!(ka->sa.sa_flags & SA_NODEFER))
+ sigaddset(&current->blocked,sig);
recalc_sigpending();
spin_unlock_irq(&current->sighand->siglock);
}
+
+ return ret;
}
/*
@@ -417,10 +423,10 @@ int do_signal(struct pt_regs *regs, sigset_t *oldset)
* kernel mode. Just return without doing anything
* if so.
*/
- if ((regs->cs & 3) != 3)
+ if (!user_mode(regs))
return 1;
- if (try_to_freeze(0))
+ if (try_to_freeze())
goto no_signal;
if (!oldset)
@@ -434,11 +440,10 @@ int do_signal(struct pt_regs *regs, sigset_t *oldset)
* inside the kernel.
*/
if (current->thread.debugreg7)
- asm volatile("movq %0,%%db7" : : "r" (current->thread.debugreg7));
+ set_debugreg(current->thread.debugreg7, 7);
/* Whee! Actually deliver the signal. */
- handle_signal(signr, &info, &ka, oldset, regs);
- return 1;
+ return handle_signal(signr, &info, &ka, oldset, regs);
}
no_signal:
diff --git a/arch/x86_64/kernel/smp.c b/arch/x86_64/kernel/smp.c
index 1e379ed17b1d..e5958220d6b8 100644
--- a/arch/x86_64/kernel/smp.c
+++ b/arch/x86_64/kernel/smp.c
@@ -129,10 +129,9 @@ asmlinkage void smp_invalidate_interrupt (void)
} else
leave_mm(cpu);
}
+out:
ack_APIC_irq();
cpu_clear(cpu, flush_cpumask);
-
-out:
put_cpu_no_resched();
}
@@ -283,6 +282,79 @@ struct call_data_struct {
static struct call_data_struct * call_data;
+void lock_ipi_call_lock(void)
+{
+ spin_lock_irq(&call_lock);
+}
+
+void unlock_ipi_call_lock(void)
+{
+ spin_unlock_irq(&call_lock);
+}
+
+/*
+ * this function sends a 'generic call function' IPI to one other CPU
+ * in the system.
+ */
+static void __smp_call_function_single (int cpu, void (*func) (void *info), void *info,
+ int nonatomic, int wait)
+{
+ struct call_data_struct data;
+ int cpus = 1;
+
+ data.func = func;
+ data.info = info;
+ atomic_set(&data.started, 0);
+ data.wait = wait;
+ if (wait)
+ atomic_set(&data.finished, 0);
+
+ call_data = &data;
+ wmb();
+ /* Send a message to all other CPUs and wait for them to respond */
+ send_IPI_mask(cpumask_of_cpu(cpu), CALL_FUNCTION_VECTOR);
+
+ /* Wait for response */
+ while (atomic_read(&data.started) != cpus)
+ cpu_relax();
+
+ if (!wait)
+ return;
+
+ while (atomic_read(&data.finished) != cpus)
+ cpu_relax();
+}
+
+/*
+ * smp_call_function_single - Run a function on another CPU
+ * @func: The function to run. This must be fast and non-blocking.
+ * @info: An arbitrary pointer to pass to the function.
+ * @nonatomic: Currently unused.
+ * @wait: If true, wait until function has completed on other CPUs.
+ *
+ * Retrurns 0 on success, else a negative status code.
+ *
+ * Does not return until the remote CPU is nearly ready to execute <func>
+ * or is or has executed.
+ */
+
+int smp_call_function_single (int cpu, void (*func) (void *info), void *info,
+ int nonatomic, int wait)
+{
+ /* prevent preemption and reschedule on another processor */
+ int me = get_cpu();
+ if (cpu == me) {
+ WARN_ON(1);
+ put_cpu();
+ return -EBUSY;
+ }
+ spin_lock_bh(&call_lock);
+ __smp_call_function_single(cpu, func, info, nonatomic, wait);
+ spin_unlock_bh(&call_lock);
+ put_cpu();
+ return 0;
+}
+
/*
* this function sends a 'generic call function' IPI to all other CPUs
* in the system.
diff --git a/arch/x86_64/kernel/smpboot.c b/arch/x86_64/kernel/smpboot.c
index f1ec0f345941..fa25e39fe54d 100644
--- a/arch/x86_64/kernel/smpboot.c
+++ b/arch/x86_64/kernel/smpboot.c
@@ -34,6 +34,7 @@
* Andi Kleen : Converted to new state machine.
* Various cleanups.
* Probably mostly hotplug CPU ready now.
+ * Ashok Raj : CPU hotplug support
*/
@@ -58,11 +59,6 @@
#include <asm/proto.h>
#include <asm/nmi.h>
-/* Change for real CPU hotplug. Note other files need to be fixed
- first too. */
-#define __cpuinit __init
-#define __cpuinitdata __initdata
-
/* Number of siblings per CPU package */
int smp_num_siblings = 1;
/* Package ID of each logical CPU */
@@ -103,6 +99,19 @@ EXPORT_SYMBOL(cpu_core_map);
extern unsigned char trampoline_data[];
extern unsigned char trampoline_end[];
+/* State of each CPU */
+DEFINE_PER_CPU(int, cpu_state) = { 0 };
+
+/*
+ * Store all idle threads, this can be reused instead of creating
+ * a new thread. Also avoids complicated thread destroy functionality
+ * for idle threads.
+ */
+struct task_struct *idle_thread_array[NR_CPUS] __cpuinitdata ;
+
+#define get_idle_for_cpu(x) (idle_thread_array[(x)])
+#define set_idle_for_cpu(x,p) (idle_thread_array[(x)] = (p))
+
/*
* Currently trivial. Write the real->protected mode
* bootstrap into the page concerned. The caller
@@ -202,9 +211,6 @@ static __cpuinit void sync_master(void *arg)
{
unsigned long flags, i;
- if (smp_processor_id() != boot_cpu_id)
- return;
-
go[MASTER] = 0;
local_irq_save(flags);
@@ -253,12 +259,12 @@ get_delta(long *rt, long *master)
return tcenter - best_tm;
}
-static __cpuinit void sync_tsc(void)
+static __cpuinit void sync_tsc(unsigned int master)
{
int i, done = 0;
long delta, adj, adjust_latency = 0;
unsigned long flags, rt, master_time_stamp, bound;
-#if DEBUG_TSC_SYNC
+#ifdef DEBUG_TSC_SYNC
static struct syncdebug {
long rt; /* roundtrip time */
long master; /* master's timestamp */
@@ -267,9 +273,17 @@ static __cpuinit void sync_tsc(void)
} t[NUM_ROUNDS] __cpuinitdata;
#endif
+ printk(KERN_INFO "CPU %d: Syncing TSC to CPU %u.\n",
+ smp_processor_id(), master);
+
go[MASTER] = 1;
- smp_call_function(sync_master, NULL, 1, 0);
+ /* It is dangerous to broadcast IPI as cpus are coming up,
+ * as they may not be ready to accept them. So since
+ * we only need to send the ipi to the boot cpu direct
+ * the message, and avoid the race.
+ */
+ smp_call_function_single(master, sync_master, NULL, 1, 0);
while (go[MASTER]) /* wait for master to be ready */
no_cpu_relax();
@@ -294,7 +308,7 @@ static __cpuinit void sync_tsc(void)
rdtscll(t);
wrmsrl(MSR_IA32_TSC, t + adj);
}
-#if DEBUG_TSC_SYNC
+#ifdef DEBUG_TSC_SYNC
t[i].rt = rt;
t[i].master = master_time_stamp;
t[i].diff = delta;
@@ -304,7 +318,7 @@ static __cpuinit void sync_tsc(void)
}
spin_unlock_irqrestore(&tsc_sync_lock, flags);
-#if DEBUG_TSC_SYNC
+#ifdef DEBUG_TSC_SYNC
for (i = 0; i < NUM_ROUNDS; ++i)
printk("rt=%5ld master=%5ld diff=%5ld adjlat=%5ld\n",
t[i].rt, t[i].master, t[i].diff, t[i].lat);
@@ -313,16 +327,14 @@ static __cpuinit void sync_tsc(void)
printk(KERN_INFO
"CPU %d: synchronized TSC with CPU %u (last diff %ld cycles, "
"maxerr %lu cycles)\n",
- smp_processor_id(), boot_cpu_id, delta, rt);
+ smp_processor_id(), master, delta, rt);
}
static void __cpuinit tsc_sync_wait(void)
{
if (notscsync || !cpu_has_tsc)
return;
- printk(KERN_INFO "CPU %d: Syncing TSC to CPU %u.\n", smp_processor_id(),
- boot_cpu_id);
- sync_tsc();
+ sync_tsc(0);
}
static __init int notscsync_setup(char *s)
@@ -418,6 +430,33 @@ void __cpuinit smp_callin(void)
cpu_set(cpuid, cpu_callin_map);
}
+static inline void set_cpu_sibling_map(int cpu)
+{
+ int i;
+
+ if (smp_num_siblings > 1) {
+ for_each_cpu(i) {
+ if (cpu_core_id[cpu] == cpu_core_id[i]) {
+ cpu_set(i, cpu_sibling_map[cpu]);
+ cpu_set(cpu, cpu_sibling_map[i]);
+ }
+ }
+ } else {
+ cpu_set(cpu, cpu_sibling_map[cpu]);
+ }
+
+ if (current_cpu_data.x86_num_cores > 1) {
+ for_each_cpu(i) {
+ if (phys_proc_id[cpu] == phys_proc_id[i]) {
+ cpu_set(i, cpu_core_map[cpu]);
+ cpu_set(cpu, cpu_core_map[i]);
+ }
+ }
+ } else {
+ cpu_core_map[cpu] = cpu_sibling_map[cpu];
+ }
+}
+
/*
* Setup code on secondary processor (after comming out of the trampoline)
*/
@@ -448,15 +487,35 @@ void __cpuinit start_secondary(void)
enable_APIC_timer();
/*
+ * The sibling maps must be set before turing the online map on for
+ * this cpu
+ */
+ set_cpu_sibling_map(smp_processor_id());
+
+ /*
+ * Wait for TSC sync to not schedule things before.
+ * We still process interrupts, which could see an inconsistent
+ * time in that window unfortunately.
+ * Do this here because TSC sync has global unprotected state.
+ */
+ tsc_sync_wait();
+
+ /*
+ * We need to hold call_lock, so there is no inconsistency
+ * between the time smp_call_function() determines number of
+ * IPI receipients, and the time when the determination is made
+ * for which cpus receive the IPI in genapic_flat.c. Holding this
+ * lock helps us to not include this cpu in a currently in progress
+ * smp_call_function().
+ */
+ lock_ipi_call_lock();
+
+ /*
* Allow the master to continue.
*/
cpu_set(smp_processor_id(), cpu_online_map);
- mb();
-
- /* Wait for TSC sync to not schedule things before.
- We still process interrupts, which could see an inconsistent
- time in that window unfortunately. */
- tsc_sync_wait();
+ per_cpu(cpu_state, smp_processor_id()) = CPU_ONLINE;
+ unlock_ipi_call_lock();
cpu_idle();
}
@@ -464,7 +523,7 @@ void __cpuinit start_secondary(void)
extern volatile unsigned long init_rsp;
extern void (*initial_code)(void);
-#if APIC_DEBUG
+#ifdef APIC_DEBUG
static void inquire_remote_apic(int apicid)
{
unsigned i, regs[] = { APIC_ID >> 4, APIC_LVR >> 4, APIC_SPIV >> 4 };
@@ -628,36 +687,81 @@ static int __cpuinit wakeup_secondary_via_INIT(int phys_apicid, unsigned int sta
return (send_status | accept_status);
}
+struct create_idle {
+ struct task_struct *idle;
+ struct completion done;
+ int cpu;
+};
+
+void do_fork_idle(void *_c_idle)
+{
+ struct create_idle *c_idle = _c_idle;
+
+ c_idle->idle = fork_idle(c_idle->cpu);
+ complete(&c_idle->done);
+}
+
/*
* Boot one CPU.
*/
static int __cpuinit do_boot_cpu(int cpu, int apicid)
{
- struct task_struct *idle;
unsigned long boot_error;
int timeout;
unsigned long start_rip;
+ struct create_idle c_idle = {
+ .cpu = cpu,
+ .done = COMPLETION_INITIALIZER(c_idle.done),
+ };
+ DECLARE_WORK(work, do_fork_idle, &c_idle);
+
+ c_idle.idle = get_idle_for_cpu(cpu);
+
+ if (c_idle.idle) {
+ c_idle.idle->thread.rsp = (unsigned long) (((struct pt_regs *)
+ (THREAD_SIZE + (unsigned long) c_idle.idle->thread_info)) - 1);
+ init_idle(c_idle.idle, cpu);
+ goto do_rest;
+ }
+
/*
- * We can't use kernel_thread since we must avoid to
- * reschedule the child.
+ * During cold boot process, keventd thread is not spun up yet.
+ * When we do cpu hot-add, we create idle threads on the fly, we should
+ * not acquire any attributes from the calling context. Hence the clean
+ * way to create kernel_threads() is to do that from keventd().
+ * We do the current_is_keventd() due to the fact that ACPI notifier
+ * was also queuing to keventd() and when the caller is already running
+ * in context of keventd(), we would end up with locking up the keventd
+ * thread.
*/
- idle = fork_idle(cpu);
- if (IS_ERR(idle)) {
+ if (!keventd_up() || current_is_keventd())
+ work.func(work.data);
+ else {
+ schedule_work(&work);
+ wait_for_completion(&c_idle.done);
+ }
+
+ if (IS_ERR(c_idle.idle)) {
printk("failed fork for CPU %d\n", cpu);
- return PTR_ERR(idle);
+ return PTR_ERR(c_idle.idle);
}
- cpu_pda[cpu].pcurrent = idle;
+ set_idle_for_cpu(cpu, c_idle.idle);
+
+do_rest:
+
+ cpu_pda[cpu].pcurrent = c_idle.idle;
start_rip = setup_trampoline();
- init_rsp = idle->thread.rsp;
+ init_rsp = c_idle.idle->thread.rsp;
per_cpu(init_tss,cpu).rsp0 = init_rsp;
initial_code = start_secondary;
- clear_ti_thread_flag(idle->thread_info, TIF_FORK);
+ clear_ti_thread_flag(c_idle.idle->thread_info, TIF_FORK);
- printk(KERN_INFO "Booting processor %d/%d rip %lx rsp %lx\n", cpu, apicid,
- start_rip, init_rsp);
+ printk(KERN_INFO "Booting processor %d/%d APIC 0x%x\n", cpu,
+ cpus_weight(cpu_present_map),
+ apicid);
/*
* This grunge runs the startup process for
@@ -724,7 +828,7 @@ static int __cpuinit do_boot_cpu(int cpu, int apicid)
else
/* trampoline code not run */
printk("Not responding.\n");
-#if APIC_DEBUG
+#ifdef APIC_DEBUG
inquire_remote_apic(apicid);
#endif
}
@@ -746,51 +850,6 @@ cycles_t cacheflush_time;
unsigned long cache_decay_ticks;
/*
- * Construct cpu_sibling_map[], so that we can tell the sibling CPU
- * on SMT systems efficiently.
- */
-static __cpuinit void detect_siblings(void)
-{
- int cpu;
-
- for (cpu = 0; cpu < NR_CPUS; cpu++) {
- cpus_clear(cpu_sibling_map[cpu]);
- cpus_clear(cpu_core_map[cpu]);
- }
-
- for_each_online_cpu (cpu) {
- struct cpuinfo_x86 *c = cpu_data + cpu;
- int siblings = 0;
- int i;
- if (smp_num_siblings > 1) {
- for_each_online_cpu (i) {
- if (cpu_core_id[cpu] == cpu_core_id[i]) {
- siblings++;
- cpu_set(i, cpu_sibling_map[cpu]);
- }
- }
- } else {
- siblings++;
- cpu_set(cpu, cpu_sibling_map[cpu]);
- }
-
- if (siblings != smp_num_siblings) {
- printk(KERN_WARNING
- "WARNING: %d siblings found for CPU%d, should be %d\n",
- siblings, cpu, smp_num_siblings);
- smp_num_siblings = siblings;
- }
- if (c->x86_num_cores > 1) {
- for_each_online_cpu(i) {
- if (phys_proc_id[cpu] == phys_proc_id[i])
- cpu_set(i, cpu_core_map[cpu]);
- }
- } else
- cpu_core_map[cpu] = cpu_sibling_map[cpu];
- }
-}
-
-/*
* Cleanup possible dangling ends...
*/
static __cpuinit void smp_cleanup_boot(void)
@@ -823,7 +882,7 @@ static __cpuinit void smp_cleanup_boot(void)
*
* RED-PEN audit/test this more. I bet there is more state messed up here.
*/
-static __cpuinit void disable_smp(void)
+static __init void disable_smp(void)
{
cpu_present_map = cpumask_of_cpu(0);
cpu_possible_map = cpumask_of_cpu(0);
@@ -838,7 +897,7 @@ static __cpuinit void disable_smp(void)
/*
* Handle user cpus=... parameter.
*/
-static __cpuinit void enforce_max_cpus(unsigned max_cpus)
+static __init void enforce_max_cpus(unsigned max_cpus)
{
int i, k;
k = 0;
@@ -852,10 +911,31 @@ static __cpuinit void enforce_max_cpus(unsigned max_cpus)
}
}
+#ifdef CONFIG_HOTPLUG_CPU
+/*
+ * cpu_possible_map should be static, it cannot change as cpu's
+ * are onlined, or offlined. The reason is per-cpu data-structures
+ * are allocated by some modules at init time, and dont expect to
+ * do this dynamically on cpu arrival/departure.
+ * cpu_present_map on the other hand can change dynamically.
+ * In case when cpu_hotplug is not compiled, then we resort to current
+ * behaviour, which is cpu_possible == cpu_present.
+ * If cpu-hotplug is supported, then we need to preallocate for all
+ * those NR_CPUS, hence cpu_possible_map represents entire NR_CPUS range.
+ * - Ashok Raj
+ */
+static void prefill_possible_map(void)
+{
+ int i;
+ for (i = 0; i < NR_CPUS; i++)
+ cpu_set(i, cpu_possible_map);
+}
+#endif
+
/*
* Various sanity checks.
*/
-static int __cpuinit smp_sanity_check(unsigned max_cpus)
+static int __init smp_sanity_check(unsigned max_cpus)
{
if (!physid_isset(hard_smp_processor_id(), phys_cpu_present_map)) {
printk("weird, boot CPU (#%d) not listed by the BIOS.\n",
@@ -913,28 +993,17 @@ static int __cpuinit smp_sanity_check(unsigned max_cpus)
* Prepare for SMP bootup. The MP table or ACPI has been read
* earlier. Just do some sanity checking here and enable APIC mode.
*/
-void __cpuinit smp_prepare_cpus(unsigned int max_cpus)
+void __init smp_prepare_cpus(unsigned int max_cpus)
{
- int i;
-
nmi_watchdog_default();
current_cpu_data = boot_cpu_data;
current_thread_info()->cpu = 0; /* needed? */
enforce_max_cpus(max_cpus);
- /*
- * Fill in cpu_present_mask
- */
- for (i = 0; i < NR_CPUS; i++) {
- int apicid = cpu_present_to_apicid(i);
- if (physid_isset(apicid, phys_cpu_present_map)) {
- cpu_set(i, cpu_present_map);
- /* possible map would be different if we supported real
- CPU hotplug. */
- cpu_set(i, cpu_possible_map);
- }
- }
+#ifdef CONFIG_HOTPLUG_CPU
+ prefill_possible_map();
+#endif
if (smp_sanity_check(max_cpus) < 0) {
printk(KERN_INFO "SMP disabled\n");
@@ -978,13 +1047,13 @@ void __init smp_prepare_boot_cpu(void)
int me = smp_processor_id();
cpu_set(me, cpu_online_map);
cpu_set(me, cpu_callout_map);
+ cpu_set(0, cpu_sibling_map[0]);
+ cpu_set(0, cpu_core_map[0]);
+ per_cpu(cpu_state, me) = CPU_ONLINE;
}
/*
* Entry point to boot a CPU.
- *
- * This is all __cpuinit, not __devinit for now because we don't support
- * CPU hotplug (yet).
*/
int __cpuinit __cpu_up(unsigned int cpu)
{
@@ -1001,6 +1070,15 @@ int __cpuinit __cpu_up(unsigned int cpu)
return -EINVAL;
}
+ /*
+ * Already booted CPU?
+ */
+ if (cpu_isset(cpu, cpu_callin_map)) {
+ Dprintk("do_boot_cpu %d Already started\n", cpu);
+ return -ENOSYS;
+ }
+
+ per_cpu(cpu_state, cpu) = CPU_UP_PREPARE;
/* Boot it! */
err = do_boot_cpu(cpu, apicid);
if (err < 0) {
@@ -1013,23 +1091,117 @@ int __cpuinit __cpu_up(unsigned int cpu)
while (!cpu_isset(cpu, cpu_online_map))
cpu_relax();
- return 0;
+ err = 0;
+
+ return err;
}
/*
* Finish the SMP boot.
*/
-void __cpuinit smp_cpus_done(unsigned int max_cpus)
+void __init smp_cpus_done(unsigned int max_cpus)
{
+#ifndef CONFIG_HOTPLUG_CPU
zap_low_mappings();
+#endif
smp_cleanup_boot();
#ifdef CONFIG_X86_IO_APIC
setup_ioapic_dest();
#endif
- detect_siblings();
time_init_gtod();
check_nmi_watchdog();
}
+
+#ifdef CONFIG_HOTPLUG_CPU
+
+static void remove_siblinginfo(int cpu)
+{
+ int sibling;
+
+ for_each_cpu_mask(sibling, cpu_sibling_map[cpu])
+ cpu_clear(cpu, cpu_sibling_map[sibling]);
+ for_each_cpu_mask(sibling, cpu_core_map[cpu])
+ cpu_clear(cpu, cpu_core_map[sibling]);
+ cpus_clear(cpu_sibling_map[cpu]);
+ cpus_clear(cpu_core_map[cpu]);
+ phys_proc_id[cpu] = BAD_APICID;
+ cpu_core_id[cpu] = BAD_APICID;
+}
+
+void remove_cpu_from_maps(void)
+{
+ int cpu = smp_processor_id();
+
+ cpu_clear(cpu, cpu_callout_map);
+ cpu_clear(cpu, cpu_callin_map);
+ clear_bit(cpu, &cpu_initialized); /* was set by cpu_init() */
+}
+
+int __cpu_disable(void)
+{
+ int cpu = smp_processor_id();
+
+ /*
+ * Perhaps use cpufreq to drop frequency, but that could go
+ * into generic code.
+ *
+ * We won't take down the boot processor on i386 due to some
+ * interrupts only being able to be serviced by the BSP.
+ * Especially so if we're not using an IOAPIC -zwane
+ */
+ if (cpu == 0)
+ return -EBUSY;
+
+ disable_APIC_timer();
+
+ /*
+ * HACK:
+ * Allow any queued timer interrupts to get serviced
+ * This is only a temporary solution until we cleanup
+ * fixup_irqs as we do for IA64.
+ */
+ local_irq_enable();
+ mdelay(1);
+
+ local_irq_disable();
+ remove_siblinginfo(cpu);
+
+ /* It's now safe to remove this processor from the online map */
+ cpu_clear(cpu, cpu_online_map);
+ remove_cpu_from_maps();
+ fixup_irqs(cpu_online_map);
+ return 0;
+}
+
+void __cpu_die(unsigned int cpu)
+{
+ /* We don't do anything here: idle task is faking death itself. */
+ unsigned int i;
+
+ for (i = 0; i < 10; i++) {
+ /* They ack this in play_dead by setting CPU_DEAD */
+ if (per_cpu(cpu_state, cpu) == CPU_DEAD) {
+ printk ("CPU %d is now offline\n", cpu);
+ return;
+ }
+ msleep(100);
+ }
+ printk(KERN_ERR "CPU %u didn't die...\n", cpu);
+}
+
+#else /* ... !CONFIG_HOTPLUG_CPU */
+
+int __cpu_disable(void)
+{
+ return -ENOSYS;
+}
+
+void __cpu_die(unsigned int cpu)
+{
+ /* We said "no" in __cpu_disable */
+ BUG();
+}
+#endif /* CONFIG_HOTPLUG_CPU */
diff --git a/arch/x86_64/kernel/suspend.c b/arch/x86_64/kernel/suspend.c
index ebaa1e37d657..0612640d91b1 100644
--- a/arch/x86_64/kernel/suspend.c
+++ b/arch/x86_64/kernel/suspend.c
@@ -44,7 +44,6 @@ void __save_processor_state(struct saved_context *ctxt)
*/
asm volatile ("sgdt %0" : "=m" (ctxt->gdt_limit));
asm volatile ("sidt %0" : "=m" (ctxt->idt_limit));
- asm volatile ("sldt %0" : "=m" (ctxt->ldt));
asm volatile ("str %0" : "=m" (ctxt->tr));
/* XMM0..XMM15 should be handled by kernel_fpu_begin(). */
@@ -69,6 +68,7 @@ void __save_processor_state(struct saved_context *ctxt)
asm volatile ("movq %%cr2, %0" : "=r" (ctxt->cr2));
asm volatile ("movq %%cr3, %0" : "=r" (ctxt->cr3));
asm volatile ("movq %%cr4, %0" : "=r" (ctxt->cr4));
+ asm volatile ("movq %%cr8, %0" : "=r" (ctxt->cr8));
}
void save_processor_state(void)
@@ -90,12 +90,20 @@ void __restore_processor_state(struct saved_context *ctxt)
/*
* control registers
*/
+ asm volatile ("movq %0, %%cr8" :: "r" (ctxt->cr8));
asm volatile ("movq %0, %%cr4" :: "r" (ctxt->cr4));
asm volatile ("movq %0, %%cr3" :: "r" (ctxt->cr3));
asm volatile ("movq %0, %%cr2" :: "r" (ctxt->cr2));
asm volatile ("movq %0, %%cr0" :: "r" (ctxt->cr0));
/*
+ * now restore the descriptor tables to their proper values
+ * ltr is done i fix_processor_context().
+ */
+ asm volatile ("lgdt %0" :: "m" (ctxt->gdt_limit));
+ asm volatile ("lidt %0" :: "m" (ctxt->idt_limit));
+
+ /*
* segment registers
*/
asm volatile ("movw %0, %%ds" :: "r" (ctxt->ds));
@@ -108,17 +116,10 @@ void __restore_processor_state(struct saved_context *ctxt)
wrmsrl(MSR_GS_BASE, ctxt->gs_base);
wrmsrl(MSR_KERNEL_GS_BASE, ctxt->gs_kernel_base);
- /*
- * now restore the descriptor tables to their proper values
- * ltr is done i fix_processor_context().
- */
- asm volatile ("lgdt %0" :: "m" (ctxt->gdt_limit));
- asm volatile ("lidt %0" :: "m" (ctxt->idt_limit));
- asm volatile ("lldt %0" :: "m" (ctxt->ldt));
-
fix_processor_context();
do_fpu_end();
+ mtrr_ap_init();
}
void restore_processor_state(void)
diff --git a/arch/x86_64/kernel/time.c b/arch/x86_64/kernel/time.c
index fb8c809b4cd9..66bf6ddeb0c3 100644
--- a/arch/x86_64/kernel/time.c
+++ b/arch/x86_64/kernel/time.c
@@ -64,6 +64,7 @@ static int notsc __initdata = 0;
unsigned int cpu_khz; /* TSC clocks / usec, not used here */
static unsigned long hpet_period; /* fsecs / HPET clock */
unsigned long hpet_tick; /* HPET clocks / interrupt */
+static int hpet_use_timer;
unsigned long vxtime_hz = PIT_TICK_RATE;
int report_lost_ticks; /* command line option */
unsigned long long monotonic_base;
@@ -105,7 +106,9 @@ static inline unsigned int do_gettimeoffset_tsc(void)
static inline unsigned int do_gettimeoffset_hpet(void)
{
- return ((hpet_readl(HPET_COUNTER) - vxtime.last) * vxtime.quot) >> 32;
+ /* cap counter read to one tick to avoid inconsistencies */
+ unsigned long counter = hpet_readl(HPET_COUNTER) - vxtime.last;
+ return (min(counter,hpet_tick) * vxtime.quot) >> 32;
}
unsigned int (*do_gettimeoffset)(void) = do_gettimeoffset_tsc;
@@ -301,7 +304,7 @@ unsigned long long monotonic_clock(void)
last_offset = vxtime.last;
base = monotonic_base;
- this_offset = hpet_readl(HPET_T0_CMP) - hpet_tick;
+ this_offset = hpet_readl(HPET_COUNTER);
} while (read_seqretry(&xtime_lock, seq));
offset = (this_offset - last_offset);
@@ -377,7 +380,14 @@ static irqreturn_t timer_interrupt(int irq, void *dev_id, struct pt_regs *regs)
write_seqlock(&xtime_lock);
- if (vxtime.hpet_address) {
+ if (vxtime.hpet_address)
+ offset = hpet_readl(HPET_COUNTER);
+
+ if (hpet_use_timer) {
+ /* if we're using the hpet timer functionality,
+ * we can more accurately know the counter value
+ * when the timer interrupt occured.
+ */
offset = hpet_readl(HPET_T0_CMP) - hpet_tick;
delay = hpet_readl(HPET_COUNTER) - offset;
} else {
@@ -803,17 +813,18 @@ static int hpet_timer_stop_set_go(unsigned long tick)
* Set up timer 0, as periodic with first interrupt to happen at hpet_tick,
* and period also hpet_tick.
*/
-
- hpet_writel(HPET_TN_ENABLE | HPET_TN_PERIODIC | HPET_TN_SETVAL |
+ if (hpet_use_timer) {
+ hpet_writel(HPET_TN_ENABLE | HPET_TN_PERIODIC | HPET_TN_SETVAL |
HPET_TN_32BIT, HPET_T0_CFG);
- hpet_writel(hpet_tick, HPET_T0_CMP);
- hpet_writel(hpet_tick, HPET_T0_CMP); /* AK: why twice? */
-
+ hpet_writel(hpet_tick, HPET_T0_CMP);
+ hpet_writel(hpet_tick, HPET_T0_CMP); /* AK: why twice? */
+ cfg |= HPET_CFG_LEGACY;
+ }
/*
* Go!
*/
- cfg |= HPET_CFG_ENABLE | HPET_CFG_LEGACY;
+ cfg |= HPET_CFG_ENABLE;
hpet_writel(cfg, HPET_CFG);
return 0;
@@ -834,8 +845,7 @@ static int hpet_init(void)
id = hpet_readl(HPET_ID);
- if (!(id & HPET_ID_VENDOR) || !(id & HPET_ID_NUMBER) ||
- !(id & HPET_ID_LEGSUP))
+ if (!(id & HPET_ID_VENDOR) || !(id & HPET_ID_NUMBER))
return -1;
hpet_period = hpet_readl(HPET_PERIOD);
@@ -845,6 +855,8 @@ static int hpet_init(void)
hpet_tick = (1000000000L * (USEC_PER_SEC / HZ) + hpet_period / 2) /
hpet_period;
+ hpet_use_timer = (id & HPET_ID_LEGSUP);
+
return hpet_timer_stop_set_go(hpet_tick);
}
@@ -901,9 +913,11 @@ void __init time_init(void)
set_normalized_timespec(&wall_to_monotonic,
-xtime.tv_sec, -xtime.tv_nsec);
- if (!hpet_init()) {
+ if (!hpet_init())
vxtime_hz = (1000000000000000L + hpet_period / 2) /
hpet_period;
+
+ if (hpet_use_timer) {
cpu_khz = hpet_calibrate_tsc();
timename = "HPET";
#ifdef CONFIG_X86_PM_TIMER
@@ -968,7 +982,7 @@ void __init time_init_gtod(void)
if (unsynchronized_tsc())
notsc = 1;
if (vxtime.hpet_address && notsc) {
- timetype = "HPET";
+ timetype = hpet_use_timer ? "HPET" : "PIT/HPET";
vxtime.last = hpet_readl(HPET_T0_CMP) - hpet_tick;
vxtime.mode = VXTIME_HPET;
do_gettimeoffset = do_gettimeoffset_hpet;
@@ -983,7 +997,7 @@ void __init time_init_gtod(void)
printk(KERN_INFO "Disabling vsyscall due to use of PM timer\n");
#endif
} else {
- timetype = vxtime.hpet_address ? "HPET/TSC" : "PIT/TSC";
+ timetype = hpet_use_timer ? "HPET/TSC" : "PIT/TSC";
vxtime.mode = VXTIME_TSC;
}
diff --git a/arch/x86_64/kernel/traps.c b/arch/x86_64/kernel/traps.c
index 3dfec8fdabcd..6ead433a3885 100644
--- a/arch/x86_64/kernel/traps.c
+++ b/arch/x86_64/kernel/traps.c
@@ -274,7 +274,7 @@ EXPORT_SYMBOL(dump_stack);
void show_registers(struct pt_regs *regs)
{
int i;
- int in_kernel = (regs->cs & 3) == 0;
+ int in_kernel = !user_mode(regs);
unsigned long rsp;
const int cpu = safe_smp_processor_id();
struct task_struct *cur = cpu_pda[cpu].pcurrent;
@@ -318,7 +318,7 @@ void handle_BUG(struct pt_regs *regs)
struct bug_frame f;
char tmp;
- if (regs->cs & 3)
+ if (user_mode(regs))
return;
if (__copy_from_user(&f, (struct bug_frame *) regs->rip,
sizeof(struct bug_frame)))
@@ -437,7 +437,7 @@ static void do_trap(int trapnr, int signr, char *str,
}
#endif
- if ((regs->cs & 3) != 0) {
+ if (user_mode(regs)) {
struct task_struct *tsk = current;
if (exception_trace && unhandled_signal(tsk, signr))
@@ -522,7 +522,7 @@ asmlinkage void do_general_protection(struct pt_regs * regs, long error_code)
}
#endif
- if ((regs->cs & 3)!=0) {
+ if (user_mode(regs)) {
struct task_struct *tsk = current;
if (exception_trace && unhandled_signal(tsk, SIGSEGV))
@@ -586,9 +586,12 @@ static void unknown_nmi_error(unsigned char reason, struct pt_regs * regs)
asmlinkage void default_do_nmi(struct pt_regs *regs)
{
unsigned char reason = 0;
+ int cpu;
+
+ cpu = smp_processor_id();
/* Only the BSP gets external NMIs from the system. */
- if (!smp_processor_id())
+ if (!cpu)
reason = get_nmi_reason();
if (!(reason & 0xc0)) {
@@ -638,7 +641,7 @@ asmlinkage struct pt_regs *sync_regs(struct pt_regs *eregs)
if (eregs == (struct pt_regs *)eregs->rsp)
;
/* Exception from user space */
- else if (eregs->cs & 3)
+ else if (user_mode(eregs))
regs = ((struct pt_regs *)current->thread.rsp0) - 1;
/* Exception from kernel and interrupts are enabled. Move to
kernel process stack. */
@@ -669,7 +672,7 @@ asmlinkage void do_debug(struct pt_regs * regs, unsigned long error_code)
}
#endif
- asm("movq %%db6,%0" : "=r" (condition));
+ get_debugreg(condition, 6);
if (notify_die(DIE_DEBUG, "debug", regs, condition, error_code,
SIGTRAP) == NOTIFY_STOP)
@@ -697,7 +700,7 @@ asmlinkage void do_debug(struct pt_regs * regs, unsigned long error_code)
* allowing programs to debug themselves without the ptrace()
* interface.
*/
- if ((regs->cs & 3) == 0)
+ if (!user_mode(regs))
goto clear_TF_reenable;
/*
* Was the TF flag set by a debugger? If so, clear it now,
@@ -715,13 +718,13 @@ asmlinkage void do_debug(struct pt_regs * regs, unsigned long error_code)
info.si_signo = SIGTRAP;
info.si_errno = 0;
info.si_code = TRAP_BRKPT;
- if ((regs->cs & 3) == 0)
+ if (!user_mode(regs))
goto clear_dr7;
info.si_addr = (void __user *)regs->rip;
force_sig_info(SIGTRAP, &info, tsk);
clear_dr7:
- asm volatile("movq %0,%%db7"::"r"(0UL));
+ set_debugreg(0UL, 7);
return;
clear_TF_reenable:
@@ -756,7 +759,7 @@ asmlinkage void do_coprocessor_error(struct pt_regs *regs)
unsigned short cwd, swd;
conditional_sti(regs);
- if ((regs->cs & 3) == 0 &&
+ if (!user_mode(regs) &&
kernel_math_error(regs, "kernel x87 math error"))
return;
@@ -822,7 +825,7 @@ asmlinkage void do_simd_coprocessor_error(struct pt_regs *regs)
unsigned short mxcsr;
conditional_sti(regs);
- if ((regs->cs & 3) == 0 &&
+ if (!user_mode(regs) &&
kernel_math_error(regs, "kernel simd math error"))
return;
diff --git a/arch/x86_64/kernel/vmlinux.lds.S b/arch/x86_64/kernel/vmlinux.lds.S
index 59ebd5beda87..2a94f9b60b2d 100644
--- a/arch/x86_64/kernel/vmlinux.lds.S
+++ b/arch/x86_64/kernel/vmlinux.lds.S
@@ -2,7 +2,10 @@
* Written by Martin Mares <mj@atrey.karlin.mff.cuni.cz>;
*/
+#define LOAD_OFFSET __START_KERNEL_map
+
#include <asm-generic/vmlinux.lds.h>
+#include <asm/page.h>
#include <linux/config.h>
OUTPUT_FORMAT("elf64-x86-64", "elf64-x86-64", "elf64-x86-64")
@@ -11,28 +14,30 @@ ENTRY(phys_startup_64)
jiffies_64 = jiffies;
SECTIONS
{
- . = 0xffffffff80100000;
+ . = __START_KERNEL;
phys_startup_64 = startup_64 - LOAD_OFFSET;
_text = .; /* Text and read-only data */
- .text : {
+ .text : AT(ADDR(.text) - LOAD_OFFSET) {
*(.text)
SCHED_TEXT
LOCK_TEXT
*(.fixup)
*(.gnu.warning)
} = 0x9090
- .text.lock : { *(.text.lock) } /* out-of-line lock text */
+ /* out-of-line lock text */
+ .text.lock : AT(ADDR(.text.lock) - LOAD_OFFSET) { *(.text.lock) }
_etext = .; /* End of text section */
. = ALIGN(16); /* Exception table */
__start___ex_table = .;
- __ex_table : { *(__ex_table) }
+ __ex_table : AT(ADDR(__ex_table) - LOAD_OFFSET) { *(__ex_table) }
__stop___ex_table = .;
RODATA
- .data : { /* Data */
+ /* Data */
+ .data : AT(ADDR(.data) - LOAD_OFFSET) {
*(.data)
CONSTRUCTORS
}
@@ -40,62 +45,99 @@ SECTIONS
_edata = .; /* End of data section */
__bss_start = .; /* BSS */
- .bss : {
+ .bss : AT(ADDR(.bss) - LOAD_OFFSET) {
*(.bss.page_aligned)
*(.bss)
}
__bss_end = .;
+ . = ALIGN(PAGE_SIZE);
+ . = ALIGN(CONFIG_X86_L1_CACHE_BYTES);
+ .data.cacheline_aligned : AT(ADDR(.data.cacheline_aligned) - LOAD_OFFSET) {
+ *(.data.cacheline_aligned)
+ }
. = ALIGN(CONFIG_X86_L1_CACHE_BYTES);
- .data.cacheline_aligned : { *(.data.cacheline_aligned) }
+ .data.read_mostly : AT(ADDR(.data.read_mostly) - LOAD_OFFSET) {
+ *(.data.read_mostly)
+ }
+
+#define VSYSCALL_ADDR (-10*1024*1024)
+#define VSYSCALL_PHYS_ADDR ((LOADADDR(.data.read_mostly) + SIZEOF(.data.read_mostly) + 4095) & ~(4095))
+#define VSYSCALL_VIRT_ADDR ((ADDR(.data.read_mostly) + SIZEOF(.data.read_mostly) + 4095) & ~(4095))
+
+#define VLOAD_OFFSET (VSYSCALL_ADDR - VSYSCALL_PHYS_ADDR)
+#define VLOAD(x) (ADDR(x) - VLOAD_OFFSET)
-#define AFTER(x) BINALIGN(LOADADDR(x) + SIZEOF(x), 16)
-#define BINALIGN(x,y) (((x) + (y) - 1) & ~((y) - 1))
-#define CACHE_ALIGN(x) BINALIGN(x, CONFIG_X86_L1_CACHE_BYTES)
+#define VVIRT_OFFSET (VSYSCALL_ADDR - VSYSCALL_VIRT_ADDR)
+#define VVIRT(x) (ADDR(x) - VVIRT_OFFSET)
+
+ . = VSYSCALL_ADDR;
+ .vsyscall_0 : AT(VSYSCALL_PHYS_ADDR) { *(.vsyscall_0) }
+ __vsyscall_0 = VSYSCALL_VIRT_ADDR;
- .vsyscall_0 -10*1024*1024: AT ((LOADADDR(.data.cacheline_aligned) + SIZEOF(.data.cacheline_aligned) + 4095) & ~(4095)) { *(.vsyscall_0) }
- __vsyscall_0 = LOADADDR(.vsyscall_0);
. = ALIGN(CONFIG_X86_L1_CACHE_BYTES);
- .xtime_lock : AT CACHE_ALIGN(AFTER(.vsyscall_0)) { *(.xtime_lock) }
- xtime_lock = LOADADDR(.xtime_lock);
- .vxtime : AT AFTER(.xtime_lock) { *(.vxtime) }
- vxtime = LOADADDR(.vxtime);
- .wall_jiffies : AT AFTER(.vxtime) { *(.wall_jiffies) }
- wall_jiffies = LOADADDR(.wall_jiffies);
- .sys_tz : AT AFTER(.wall_jiffies) { *(.sys_tz) }
- sys_tz = LOADADDR(.sys_tz);
- .sysctl_vsyscall : AT AFTER(.sys_tz) { *(.sysctl_vsyscall) }
- sysctl_vsyscall = LOADADDR(.sysctl_vsyscall);
- .xtime : AT AFTER(.sysctl_vsyscall) { *(.xtime) }
- xtime = LOADADDR(.xtime);
+ .xtime_lock : AT(VLOAD(.xtime_lock)) { *(.xtime_lock) }
+ xtime_lock = VVIRT(.xtime_lock);
+
+ .vxtime : AT(VLOAD(.vxtime)) { *(.vxtime) }
+ vxtime = VVIRT(.vxtime);
+
+ .wall_jiffies : AT(VLOAD(.wall_jiffies)) { *(.wall_jiffies) }
+ wall_jiffies = VVIRT(.wall_jiffies);
+
+ .sys_tz : AT(VLOAD(.sys_tz)) { *(.sys_tz) }
+ sys_tz = VVIRT(.sys_tz);
+
+ .sysctl_vsyscall : AT(VLOAD(.sysctl_vsyscall)) { *(.sysctl_vsyscall) }
+ sysctl_vsyscall = VVIRT(.sysctl_vsyscall);
+
+ .xtime : AT(VLOAD(.xtime)) { *(.xtime) }
+ xtime = VVIRT(.xtime);
+
. = ALIGN(CONFIG_X86_L1_CACHE_BYTES);
- .jiffies : AT CACHE_ALIGN(AFTER(.xtime)) { *(.jiffies) }
- jiffies = LOADADDR(.jiffies);
- .vsyscall_1 ADDR(.vsyscall_0) + 1024: AT (LOADADDR(.vsyscall_0) + 1024) { *(.vsyscall_1) }
- . = LOADADDR(.vsyscall_0) + 4096;
+ .jiffies : AT(VLOAD(.jiffies)) { *(.jiffies) }
+ jiffies = VVIRT(.jiffies);
+
+ .vsyscall_1 ADDR(.vsyscall_0) + 1024: AT(VLOAD(.vsyscall_1)) { *(.vsyscall_1) }
+ .vsyscall_2 ADDR(.vsyscall_0) + 2048: AT(VLOAD(.vsyscall_2)) { *(.vsyscall_2) }
+ .vsyscall_3 ADDR(.vsyscall_0) + 3072: AT(VLOAD(.vsyscall_3)) { *(.vsyscall_3) }
+
+ . = VSYSCALL_VIRT_ADDR + 4096;
+
+#undef VSYSCALL_ADDR
+#undef VSYSCALL_PHYS_ADDR
+#undef VSYSCALL_VIRT_ADDR
+#undef VLOAD_OFFSET
+#undef VLOAD
+#undef VVIRT_OFFSET
+#undef VVIRT
. = ALIGN(8192); /* init_task */
- .data.init_task : { *(.data.init_task) }
+ .data.init_task : AT(ADDR(.data.init_task) - LOAD_OFFSET) {
+ *(.data.init_task)
+ }
. = ALIGN(4096);
- .data.page_aligned : { *(.data.page_aligned) }
+ .data.page_aligned : AT(ADDR(.data.page_aligned) - LOAD_OFFSET) {
+ *(.data.page_aligned)
+ }
. = ALIGN(4096); /* Init code and data */
__init_begin = .;
- .init.text : {
+ .init.text : AT(ADDR(.init.text) - LOAD_OFFSET) {
_sinittext = .;
*(.init.text)
_einittext = .;
}
__initdata_begin = .;
- .init.data : { *(.init.data) }
+ .init.data : AT(ADDR(.init.data) - LOAD_OFFSET) { *(.init.data) }
__initdata_end = .;
. = ALIGN(16);
__setup_start = .;
- .init.setup : { *(.init.setup) }
+ .init.setup : AT(ADDR(.init.setup) - LOAD_OFFSET) { *(.init.setup) }
__setup_end = .;
__initcall_start = .;
- .initcall.init : {
+ .initcall.init : AT(ADDR(.initcall.init) - LOAD_OFFSET) {
*(.initcall1.init)
*(.initcall2.init)
*(.initcall3.init)
@@ -106,32 +148,38 @@ SECTIONS
}
__initcall_end = .;
__con_initcall_start = .;
- .con_initcall.init : { *(.con_initcall.init) }
+ .con_initcall.init : AT(ADDR(.con_initcall.init) - LOAD_OFFSET) {
+ *(.con_initcall.init)
+ }
__con_initcall_end = .;
SECURITY_INIT
. = ALIGN(8);
__alt_instructions = .;
- .altinstructions : { *(.altinstructions) }
+ .altinstructions : AT(ADDR(.altinstructions) - LOAD_OFFSET) {
+ *(.altinstructions)
+ }
__alt_instructions_end = .;
- .altinstr_replacement : { *(.altinstr_replacement) }
+ .altinstr_replacement : AT(ADDR(.altinstr_replacement) - LOAD_OFFSET) {
+ *(.altinstr_replacement)
+ }
/* .exit.text is discard at runtime, not link time, to deal with references
from .altinstructions and .eh_frame */
- .exit.text : { *(.exit.text) }
- .exit.data : { *(.exit.data) }
+ .exit.text : AT(ADDR(.exit.text) - LOAD_OFFSET) { *(.exit.text) }
+ .exit.data : AT(ADDR(.exit.data) - LOAD_OFFSET) { *(.exit.data) }
. = ALIGN(4096);
__initramfs_start = .;
- .init.ramfs : { *(.init.ramfs) }
+ .init.ramfs : AT(ADDR(.init.ramfs) - LOAD_OFFSET) { *(.init.ramfs) }
__initramfs_end = .;
. = ALIGN(32);
__per_cpu_start = .;
- .data.percpu : { *(.data.percpu) }
+ .data.percpu : AT(ADDR(.data.percpu) - LOAD_OFFSET) { *(.data.percpu) }
__per_cpu_end = .;
. = ALIGN(4096);
__init_end = .;
. = ALIGN(4096);
__nosave_begin = .;
- .data_nosave : { *(.data.nosave) }
+ .data_nosave : AT(ADDR(.data_nosave) - LOAD_OFFSET) { *(.data.nosave) }
. = ALIGN(4096);
__nosave_end = .;