Diffstat (limited to 'arch/x86/kernel')
-rw-r--r--  arch/x86/kernel/Makefile                        |   2
-rw-r--r--  arch/x86/kernel/acpi/boot.c                     |   4
-rw-r--r--  arch/x86/kernel/acpi/realmode/.gitignore        |   3
-rw-r--r--  arch/x86/kernel/alternative.c                   |  39
-rw-r--r--  arch/x86/kernel/apic_32.c                       |   3
-rw-r--r--  arch/x86/kernel/apic_64.c                       |   3
-rw-r--r--  arch/x86/kernel/apm_32.c                        |  19
-rw-r--r--  arch/x86/kernel/cpu/Makefile                    |   1
-rw-r--r--  arch/x86/kernel/cpu/amd.c                       |   6
-rw-r--r--  arch/x86/kernel/cpu/cpufreq/acpi-cpufreq.c      |  10
-rw-r--r--  arch/x86/kernel/cpu/mcheck/mce_64.c             |  18
-rw-r--r--  arch/x86/kernel/cpu/nexgen.c                    |  59
-rw-r--r--  arch/x86/kernel/cpu/perfctr-watchdog.c          |  14
-rw-r--r--  arch/x86/kernel/crash.c                         |   3
-rw-r--r--  arch/x86/kernel/e820_64.c                       |  35
-rw-r--r--  arch/x86/kernel/entry_32.S                      |  12
-rw-r--r--  arch/x86/kernel/genapic_64.c                    |   2
-rw-r--r--  arch/x86/kernel/head64.c                        |  25
-rw-r--r--  arch/x86/kernel/hpet.c                          |   2
-rw-r--r--  arch/x86/kernel/i8253.c                         |   6
-rw-r--r--  arch/x86/kernel/io_apic_32.c                    |   3
-rw-r--r--  arch/x86/kernel/io_apic_64.c                    |   2
-rw-r--r--  arch/x86/kernel/irq_32.c                        |   2
-rw-r--r--  arch/x86/kernel/kdebugfs.c                      | 163
-rw-r--r--  arch/x86/kernel/kvm.c                           | 248
-rw-r--r--  arch/x86/kernel/kvmclock.c                      | 187
-rw-r--r--  arch/x86/kernel/mfgpt_32.c                      |  11
-rw-r--r--  arch/x86/kernel/mpparse.c                       |  39
-rw-r--r--  arch/x86/kernel/paravirt.c                      |  12
-rw-r--r--  arch/x86/kernel/pci-calgary_64.c                |   1
-rw-r--r--  arch/x86/kernel/process.c                       | 117
-rw-r--r--  arch/x86/kernel/process_32.c                    | 118
-rw-r--r--  arch/x86/kernel/process_64.c                    | 123
-rw-r--r--  arch/x86/kernel/ptrace.c                        |  95
-rw-r--r--  arch/x86/kernel/reboot.c                        |  17
-rw-r--r--  arch/x86/kernel/setup_32.c                      |  10
-rw-r--r--  arch/x86/kernel/setup_64.c                      |  36
-rw-r--r--  arch/x86/kernel/signal_32.c                     |  35
-rw-r--r--  arch/x86/kernel/signal_64.c                     |  30
-rw-r--r--  arch/x86/kernel/smpboot.c                       |   8
-rw-r--r--  arch/x86/kernel/summit_32.c                     |   5
-rw-r--r--  arch/x86/kernel/tlb_64.c                        |   4
-rw-r--r--  arch/x86/kernel/trampoline_32.S                 |   2
-rw-r--r--  arch/x86/kernel/traps_32.c                      |   2
-rw-r--r--  arch/x86/kernel/vmi_32.c                        |  22
-rw-r--r--  arch/x86/kernel/vmlinux_64.lds.S                |   6
46 files changed, 988 insertions(+), 576 deletions(-)
diff --git a/arch/x86/kernel/Makefile b/arch/x86/kernel/Makefile
index 90e092d0af0c..fa19c3819540 100644
--- a/arch/x86/kernel/Makefile
+++ b/arch/x86/kernel/Makefile
@@ -80,6 +80,8 @@ obj-$(CONFIG_DEBUG_RODATA_TEST) += test_rodata.o
obj-$(CONFIG_DEBUG_NX_TEST) += test_nx.o
obj-$(CONFIG_VMI) += vmi_32.o vmiclock_32.o
+obj-$(CONFIG_KVM_GUEST) += kvm.o
+obj-$(CONFIG_KVM_CLOCK) += kvmclock.o
obj-$(CONFIG_PARAVIRT) += paravirt.o paravirt_patch_$(BITS).o
ifdef CONFIG_INPUT_PCSPKR
diff --git a/arch/x86/kernel/acpi/boot.c b/arch/x86/kernel/acpi/boot.c
index 057ccf1d5ad4..977ed5cdeaa3 100644
--- a/arch/x86/kernel/acpi/boot.c
+++ b/arch/x86/kernel/acpi/boot.c
@@ -697,10 +697,6 @@ static int __init acpi_parse_hpet(struct acpi_table_header *table)
#define HPET_RESOURCE_NAME_SIZE 9
hpet_res = alloc_bootmem(sizeof(*hpet_res) + HPET_RESOURCE_NAME_SIZE);
- if (!hpet_res)
- return 0;
-
- memset(hpet_res, 0, sizeof(*hpet_res));
hpet_res->name = (void *)&hpet_res[1];
hpet_res->flags = IORESOURCE_MEM;
snprintf((char *)hpet_res->name, HPET_RESOURCE_NAME_SIZE, "HPET %u",
diff --git a/arch/x86/kernel/acpi/realmode/.gitignore b/arch/x86/kernel/acpi/realmode/.gitignore
new file mode 100644
index 000000000000..58f1f48a58f8
--- /dev/null
+++ b/arch/x86/kernel/acpi/realmode/.gitignore
@@ -0,0 +1,3 @@
+wakeup.bin
+wakeup.elf
+wakeup.lds
diff --git a/arch/x86/kernel/alternative.c b/arch/x86/kernel/alternative.c
index df4099dc1c68..65c7857a90dd 100644
--- a/arch/x86/kernel/alternative.c
+++ b/arch/x86/kernel/alternative.c
@@ -511,31 +511,30 @@ void *__kprobes text_poke(void *addr, const void *opcode, size_t len)
unsigned long flags;
char *vaddr;
int nr_pages = 2;
+ struct page *pages[2];
+ int i;
- BUG_ON(len > sizeof(long));
- BUG_ON((((long)addr + len - 1) & ~(sizeof(long) - 1))
- - ((long)addr & ~(sizeof(long) - 1)));
- if (kernel_text_address((unsigned long)addr)) {
- struct page *pages[2] = { virt_to_page(addr),
- virt_to_page(addr + PAGE_SIZE) };
- if (!pages[1])
- nr_pages = 1;
- vaddr = vmap(pages, nr_pages, VM_MAP, PAGE_KERNEL);
- BUG_ON(!vaddr);
- local_irq_save(flags);
- memcpy(&vaddr[(unsigned long)addr & ~PAGE_MASK], opcode, len);
- local_irq_restore(flags);
- vunmap(vaddr);
+ if (!core_kernel_text((unsigned long)addr)) {
+ pages[0] = vmalloc_to_page(addr);
+ pages[1] = vmalloc_to_page(addr + PAGE_SIZE);
} else {
- /*
- * modules are in vmalloc'ed memory, always writable.
- */
- local_irq_save(flags);
- memcpy(addr, opcode, len);
- local_irq_restore(flags);
+ pages[0] = virt_to_page(addr);
+ WARN_ON(!PageReserved(pages[0]));
+ pages[1] = virt_to_page(addr + PAGE_SIZE);
}
+ BUG_ON(!pages[0]);
+ if (!pages[1])
+ nr_pages = 1;
+ vaddr = vmap(pages, nr_pages, VM_MAP, PAGE_KERNEL);
+ BUG_ON(!vaddr);
+ local_irq_save(flags);
+ memcpy(&vaddr[(unsigned long)addr & ~PAGE_MASK], opcode, len);
+ local_irq_restore(flags);
+ vunmap(vaddr);
sync_core();
/* Could also do a CLFLUSH here to speed up CPU recovery; but
that causes hangs on some VIA CPUs. */
+ for (i = 0; i < len; i++)
+ BUG_ON(((char *)addr)[i] != ((char *)opcode)[i]);
return addr;
}
diff --git a/arch/x86/kernel/apic_32.c b/arch/x86/kernel/apic_32.c
index 8317401170b8..4b99b1bdeb6c 100644
--- a/arch/x86/kernel/apic_32.c
+++ b/arch/x86/kernel/apic_32.c
@@ -451,7 +451,8 @@ void __init setup_boot_APIC_clock(void)
}
/* Calculate the scaled math multiplication factor */
- lapic_clockevent.mult = div_sc(delta, TICK_NSEC * LAPIC_CAL_LOOPS, 32);
+ lapic_clockevent.mult = div_sc(delta, TICK_NSEC * LAPIC_CAL_LOOPS,
+ lapic_clockevent.shift);
lapic_clockevent.max_delta_ns =
clockevent_delta2ns(0x7FFFFF, &lapic_clockevent);
lapic_clockevent.min_delta_ns =
diff --git a/arch/x86/kernel/apic_64.c b/arch/x86/kernel/apic_64.c
index bf83157337e4..5910020c3f24 100644
--- a/arch/x86/kernel/apic_64.c
+++ b/arch/x86/kernel/apic_64.c
@@ -360,7 +360,8 @@ static void __init calibrate_APIC_clock(void)
result / 1000 / 1000, result / 1000 % 1000);
/* Calculate the scaled math multiplication factor */
- lapic_clockevent.mult = div_sc(result, NSEC_PER_SEC, 32);
+ lapic_clockevent.mult = div_sc(result, NSEC_PER_SEC,
+ lapic_clockevent.shift);
lapic_clockevent.max_delta_ns =
clockevent_delta2ns(0x7FFFFF, &lapic_clockevent);
lapic_clockevent.min_delta_ns =
diff --git a/arch/x86/kernel/apm_32.c b/arch/x86/kernel/apm_32.c
index f0030a0999c7..bf9290e29013 100644
--- a/arch/x86/kernel/apm_32.c
+++ b/arch/x86/kernel/apm_32.c
@@ -904,6 +904,7 @@ recalc:
original_pm_idle();
else
default_idle();
+ local_irq_disable();
jiffies_since_last_check = jiffies - last_jiffies;
if (jiffies_since_last_check > idle_period)
goto recalc;
@@ -911,6 +912,8 @@ recalc:
if (apm_idle_done)
apm_do_busy();
+
+ local_irq_enable();
}
/**
@@ -1189,19 +1192,6 @@ static int suspend(int vetoable)
int err;
struct apm_user *as;
- if (pm_send_all(PM_SUSPEND, (void *)3)) {
- /* Vetoed */
- if (vetoable) {
- if (apm_info.connection_version > 0x100)
- set_system_power_state(APM_STATE_REJECT);
- err = -EBUSY;
- ignore_sys_suspend = 0;
- printk(KERN_WARNING "apm: suspend was vetoed.\n");
- goto out;
- }
- printk(KERN_CRIT "apm: suspend was vetoed, but suspending anyway.\n");
- }
-
device_suspend(PMSG_SUSPEND);
local_irq_disable();
device_power_down(PMSG_SUSPEND);
@@ -1224,9 +1214,7 @@ static int suspend(int vetoable)
device_power_up();
local_irq_enable();
device_resume();
- pm_send_all(PM_RESUME, (void *)0);
queue_event(APM_NORMAL_RESUME, NULL);
- out:
spin_lock(&user_list_lock);
for (as = user_list; as != NULL; as = as->next) {
as->suspend_wait = 0;
@@ -1337,7 +1325,6 @@ static void check_events(void)
if ((event != APM_NORMAL_RESUME)
|| (ignore_normal_resume == 0)) {
device_resume();
- pm_send_all(PM_RESUME, (void *)0);
queue_event(event, NULL);
}
ignore_normal_resume = 0;
diff --git a/arch/x86/kernel/cpu/Makefile b/arch/x86/kernel/cpu/Makefile
index ee7c45235e54..a0c6f8190887 100644
--- a/arch/x86/kernel/cpu/Makefile
+++ b/arch/x86/kernel/cpu/Makefile
@@ -11,7 +11,6 @@ obj-$(CONFIG_X86_32) += cyrix.o
obj-$(CONFIG_X86_32) += centaur.o
obj-$(CONFIG_X86_32) += transmeta.o
obj-$(CONFIG_X86_32) += intel.o
-obj-$(CONFIG_X86_32) += nexgen.o
obj-$(CONFIG_X86_32) += umc.o
obj-$(CONFIG_X86_MCE) += mcheck/
diff --git a/arch/x86/kernel/cpu/amd.c b/arch/x86/kernel/cpu/amd.c
index 0173065dc3b7..245866828294 100644
--- a/arch/x86/kernel/cpu/amd.c
+++ b/arch/x86/kernel/cpu/amd.c
@@ -343,10 +343,4 @@ static struct cpu_dev amd_cpu_dev __cpuinitdata = {
.c_size_cache = amd_size_cache,
};
-int __init amd_init_cpu(void)
-{
- cpu_devs[X86_VENDOR_AMD] = &amd_cpu_dev;
- return 0;
-}
-
cpu_vendor_dev_register(X86_VENDOR_AMD, &amd_cpu_dev);
diff --git a/arch/x86/kernel/cpu/cpufreq/acpi-cpufreq.c b/arch/x86/kernel/cpu/cpufreq/acpi-cpufreq.c
index e2d870de837c..8db8f73503b3 100644
--- a/arch/x86/kernel/cpu/cpufreq/acpi-cpufreq.c
+++ b/arch/x86/kernel/cpu/cpufreq/acpi-cpufreq.c
@@ -339,6 +339,7 @@ static unsigned int get_cur_freq_on_cpu(unsigned int cpu)
{
struct acpi_cpufreq_data *data = per_cpu(drv_data, cpu);
unsigned int freq;
+ unsigned int cached_freq;
dprintk("get_cur_freq_on_cpu (%d)\n", cpu);
@@ -347,7 +348,16 @@ static unsigned int get_cur_freq_on_cpu(unsigned int cpu)
return 0;
}
+ cached_freq = data->freq_table[data->acpi_data->state].frequency;
freq = extract_freq(get_cur_val(&cpumask_of_cpu(cpu)), data);
+ if (freq != cached_freq) {
+ /*
+ * The dreaded BIOS frequency change behind our back.
+ * Force set the frequency on next target call.
+ */
+ data->resume = 1;
+ }
+
dprintk("cur freq = %u\n", freq);
return freq;
diff --git a/arch/x86/kernel/cpu/mcheck/mce_64.c b/arch/x86/kernel/cpu/mcheck/mce_64.c
index 9a699ed03598..e07e8c068ae0 100644
--- a/arch/x86/kernel/cpu/mcheck/mce_64.c
+++ b/arch/x86/kernel/cpu/mcheck/mce_64.c
@@ -49,7 +49,7 @@ static int banks;
static unsigned long bank[NR_BANKS] = { [0 ... NR_BANKS-1] = ~0UL };
static unsigned long notify_user;
static int rip_msr;
-static int mce_bootlog = 1;
+static int mce_bootlog = -1;
static atomic_t mce_events;
static char trigger[128];
@@ -471,13 +471,15 @@ static void mce_init(void *dummy)
static void __cpuinit mce_cpu_quirks(struct cpuinfo_x86 *c)
{
/* This should be disabled by the BIOS, but isn't always */
- if (c->x86_vendor == X86_VENDOR_AMD && c->x86 == 15) {
- /* disable GART TBL walk error reporting, which trips off
- incorrectly with the IOMMU & 3ware & Cerberus. */
- clear_bit(10, &bank[4]);
- /* Lots of broken BIOS around that don't clear them
- by default and leave crap in there. Don't log. */
- mce_bootlog = 0;
+ if (c->x86_vendor == X86_VENDOR_AMD) {
+ if(c->x86 == 15)
+ /* disable GART TBL walk error reporting, which trips off
+ incorrectly with the IOMMU & 3ware & Cerberus. */
+ clear_bit(10, &bank[4]);
+ if(c->x86 <= 17 && mce_bootlog < 0)
+ /* Lots of broken BIOS around that don't clear them
+ by default and leave crap in there. Don't log. */
+ mce_bootlog = 0;
}
}
diff --git a/arch/x86/kernel/cpu/nexgen.c b/arch/x86/kernel/cpu/nexgen.c
deleted file mode 100644
index 5d5e1c134123..000000000000
--- a/arch/x86/kernel/cpu/nexgen.c
+++ /dev/null
@@ -1,59 +0,0 @@
-#include <linux/kernel.h>
-#include <linux/init.h>
-#include <linux/string.h>
-#include <asm/processor.h>
-
-#include "cpu.h"
-
-/*
- * Detect a NexGen CPU running without BIOS hypercode new enough
- * to have CPUID. (Thanks to Herbert Oppmann)
- */
-
-static int __cpuinit deep_magic_nexgen_probe(void)
-{
- int ret;
-
- __asm__ __volatile__ (
- " movw $0x5555, %%ax\n"
- " xorw %%dx,%%dx\n"
- " movw $2, %%cx\n"
- " divw %%cx\n"
- " movl $0, %%eax\n"
- " jnz 1f\n"
- " movl $1, %%eax\n"
- "1:\n"
- : "=a" (ret) : : "cx", "dx");
- return ret;
-}
-
-static void __cpuinit init_nexgen(struct cpuinfo_x86 *c)
-{
- c->x86_cache_size = 256; /* A few had 1 MB... */
-}
-
-static void __cpuinit nexgen_identify(struct cpuinfo_x86 *c)
-{
- /* Detect NexGen with old hypercode */
- if (deep_magic_nexgen_probe())
- strcpy(c->x86_vendor_id, "NexGenDriven");
-}
-
-static struct cpu_dev nexgen_cpu_dev __cpuinitdata = {
- .c_vendor = "Nexgen",
- .c_ident = { "NexGenDriven" },
- .c_models = {
- { .vendor = X86_VENDOR_NEXGEN,
- .family = 5,
- .model_names = { [1] = "Nx586" }
- },
- },
- .c_init = init_nexgen,
- .c_identify = nexgen_identify,
-};
-
-int __init nexgen_init_cpu(void)
-{
- cpu_devs[X86_VENDOR_NEXGEN] = &nexgen_cpu_dev;
- return 0;
-}
diff --git a/arch/x86/kernel/cpu/perfctr-watchdog.c b/arch/x86/kernel/cpu/perfctr-watchdog.c
index b943e10ad814..f9ae93adffe5 100644
--- a/arch/x86/kernel/cpu/perfctr-watchdog.c
+++ b/arch/x86/kernel/cpu/perfctr-watchdog.c
@@ -614,16 +614,6 @@ static struct wd_ops intel_arch_wd_ops __read_mostly = {
.evntsel = MSR_ARCH_PERFMON_EVENTSEL1,
};
-static struct wd_ops coreduo_wd_ops = {
- .reserve = single_msr_reserve,
- .unreserve = single_msr_unreserve,
- .setup = setup_intel_arch_watchdog,
- .rearm = p6_rearm,
- .stop = single_msr_stop_watchdog,
- .perfctr = MSR_ARCH_PERFMON_PERFCTR0,
- .evntsel = MSR_ARCH_PERFMON_EVENTSEL0,
-};
-
static void probe_nmi_watchdog(void)
{
switch (boot_cpu_data.x86_vendor) {
@@ -637,8 +627,8 @@ static void probe_nmi_watchdog(void)
/* Work around Core Duo (Yonah) errata AE49 where perfctr1
doesn't have a working enable bit. */
if (boot_cpu_data.x86 == 6 && boot_cpu_data.x86_model == 14) {
- wd_ops = &coreduo_wd_ops;
- break;
+ intel_arch_wd_ops.perfctr = MSR_ARCH_PERFMON_PERFCTR0;
+ intel_arch_wd_ops.evntsel = MSR_ARCH_PERFMON_EVENTSEL0;
}
if (cpu_has(&boot_cpu_data, X86_FEATURE_ARCH_PERFMON)) {
wd_ops = &intel_arch_wd_ops;
diff --git a/arch/x86/kernel/crash.c b/arch/x86/kernel/crash.c
index 2251d0ae9570..268553817909 100644
--- a/arch/x86/kernel/crash.c
+++ b/arch/x86/kernel/crash.c
@@ -25,6 +25,7 @@
#include <asm/hpet.h>
#include <linux/kdebug.h>
#include <asm/smp.h>
+#include <asm/reboot.h>
#include <mach_ipi.h>
@@ -117,7 +118,7 @@ static void nmi_shootdown_cpus(void)
}
#endif
-void machine_crash_shutdown(struct pt_regs *regs)
+void native_machine_crash_shutdown(struct pt_regs *regs)
{
/* This function is only called after the system
* has panicked or is otherwise in a critical state.
diff --git a/arch/x86/kernel/e820_64.c b/arch/x86/kernel/e820_64.c
index cbd42e51cb08..645ee5e32a27 100644
--- a/arch/x86/kernel/e820_64.c
+++ b/arch/x86/kernel/e820_64.c
@@ -84,14 +84,41 @@ void __init reserve_early(unsigned long start, unsigned long end, char *name)
strncpy(r->name, name, sizeof(r->name) - 1);
}
-void __init early_res_to_bootmem(void)
+void __init free_early(unsigned long start, unsigned long end)
+{
+ struct early_res *r;
+ int i, j;
+
+ for (i = 0; i < MAX_EARLY_RES && early_res[i].end; i++) {
+ r = &early_res[i];
+ if (start == r->start && end == r->end)
+ break;
+ }
+ if (i >= MAX_EARLY_RES || !early_res[i].end)
+ panic("free_early on not reserved area: %lx-%lx!", start, end);
+
+ for (j = i + 1; j < MAX_EARLY_RES && early_res[j].end; j++)
+ ;
+
+ memcpy(&early_res[i], &early_res[i + 1],
+ (j - 1 - i) * sizeof(struct early_res));
+
+ early_res[j - 1].end = 0;
+}
+
+void __init early_res_to_bootmem(unsigned long start, unsigned long end)
{
int i;
+ unsigned long final_start, final_end;
for (i = 0; i < MAX_EARLY_RES && early_res[i].end; i++) {
struct early_res *r = &early_res[i];
- printk(KERN_INFO "early res: %d [%lx-%lx] %s\n", i,
- r->start, r->end - 1, r->name);
- reserve_bootmem_generic(r->start, r->end - r->start);
+ final_start = max(start, r->start);
+ final_end = min(end, r->end);
+ if (final_start >= final_end)
+ continue;
+ printk(KERN_INFO " early res: %d [%lx-%lx] %s\n", i,
+ final_start, final_end - 1, r->name);
+ reserve_bootmem_generic(final_start, final_end - final_start);
}
}
diff --git a/arch/x86/kernel/entry_32.S b/arch/x86/kernel/entry_32.S
index f0f8934fc303..2a609dc3271c 100644
--- a/arch/x86/kernel/entry_32.S
+++ b/arch/x86/kernel/entry_32.S
@@ -409,7 +409,7 @@ restore_nocheck_notrace:
irq_return:
INTERRUPT_RETURN
.section .fixup,"ax"
-iret_exc:
+ENTRY(iret_exc)
pushl $0 # no error code
pushl $do_iret_error
jmp error_code
@@ -1017,6 +1017,13 @@ ENTRY(kernel_thread_helper)
ENDPROC(kernel_thread_helper)
#ifdef CONFIG_XEN
+/* Xen doesn't set %esp to be precisely what the normal sysenter
+ entrypoint expects, so fix it up before using the normal path. */
+ENTRY(xen_sysenter_target)
+ RING0_INT_FRAME
+ addl $5*4, %esp /* remove xen-provided frame */
+ jmp sysenter_past_esp
+
ENTRY(xen_hypervisor_callback)
CFI_STARTPROC
pushl $0
@@ -1035,8 +1042,9 @@ ENTRY(xen_hypervisor_callback)
cmpl $xen_iret_end_crit,%eax
jae 1f
- call xen_iret_crit_fixup
+ jmp xen_iret_crit_fixup
+ENTRY(xen_do_upcall)
1: mov %esp, %eax
call xen_evtchn_do_upcall
jmp ret_from_intr
diff --git a/arch/x86/kernel/genapic_64.c b/arch/x86/kernel/genapic_64.c
index 9546ef408b92..021624c83583 100644
--- a/arch/x86/kernel/genapic_64.c
+++ b/arch/x86/kernel/genapic_64.c
@@ -51,7 +51,7 @@ void __init setup_apic_routing(void)
else
#endif
- if (cpus_weight(cpu_possible_map) <= 8)
+ if (num_possible_cpus() <= 8)
genapic = &apic_flat;
else
genapic = &apic_physflat;
diff --git a/arch/x86/kernel/head64.c b/arch/x86/kernel/head64.c
index 993c76773256..e25c57b8aa84 100644
--- a/arch/x86/kernel/head64.c
+++ b/arch/x86/kernel/head64.c
@@ -11,6 +11,7 @@
#include <linux/string.h>
#include <linux/percpu.h>
#include <linux/start_kernel.h>
+#include <linux/io.h>
#include <asm/processor.h>
#include <asm/proto.h>
@@ -22,6 +23,7 @@
#include <asm/sections.h>
#include <asm/kdebug.h>
#include <asm/e820.h>
+#include <asm/bios_ebda.h>
static void __init zap_identity_mappings(void)
{
@@ -49,7 +51,6 @@ static void __init copy_bootdata(char *real_mode_data)
}
}
-#define BIOS_EBDA_SEGMENT 0x40E
#define BIOS_LOWMEM_KILOBYTES 0x413
/*
@@ -80,8 +81,7 @@ static void __init reserve_ebda_region(void)
lowmem <<= 10;
/* start of EBDA area */
- ebda_addr = *(unsigned short *)__va(BIOS_EBDA_SEGMENT);
- ebda_addr <<= 4;
+ ebda_addr = get_bios_ebda();
/* Fixup: bios puts an EBDA in the top 64K segment */
/* of conventional memory, but does not adjust lowmem. */
@@ -101,6 +101,24 @@ static void __init reserve_ebda_region(void)
reserve_early(lowmem, 0x100000, "BIOS reserved");
}
+static void __init reserve_setup_data(void)
+{
+ struct setup_data *data;
+ unsigned long pa_data;
+ char buf[32];
+
+ if (boot_params.hdr.version < 0x0209)
+ return;
+ pa_data = boot_params.hdr.setup_data;
+ while (pa_data) {
+ data = early_ioremap(pa_data, sizeof(*data));
+ sprintf(buf, "setup data %x", data->type);
+ reserve_early(pa_data, pa_data+sizeof(*data)+data->len, buf);
+ pa_data = data->next;
+ early_iounmap(data, sizeof(*data));
+ }
+}
+
void __init x86_64_start_kernel(char * real_mode_data)
{
int i;
@@ -157,6 +175,7 @@ void __init x86_64_start_kernel(char * real_mode_data)
#endif
reserve_ebda_region();
+ reserve_setup_data();
/*
* At this point everything still needed from the boot loader
diff --git a/arch/x86/kernel/hpet.c b/arch/x86/kernel/hpet.c
index 36652ea1a265..9007f9ea64ee 100644
--- a/arch/x86/kernel/hpet.c
+++ b/arch/x86/kernel/hpet.c
@@ -218,7 +218,7 @@ static void hpet_legacy_clockevent_register(void)
hpet_freq = 1000000000000000ULL;
do_div(hpet_freq, hpet_period);
hpet_clockevent.mult = div_sc((unsigned long) hpet_freq,
- NSEC_PER_SEC, 32);
+ NSEC_PER_SEC, hpet_clockevent.shift);
/* Calculate the min / max delta */
hpet_clockevent.max_delta_ns = clockevent_delta2ns(0x7FFFFFFF,
&hpet_clockevent);
diff --git a/arch/x86/kernel/i8253.c b/arch/x86/kernel/i8253.c
index 8540abe86ade..c1b5e3ece1f2 100644
--- a/arch/x86/kernel/i8253.c
+++ b/arch/x86/kernel/i8253.c
@@ -115,7 +115,8 @@ void __init setup_pit_timer(void)
* IO_APIC has been initialized.
*/
pit_clockevent.cpumask = cpumask_of_cpu(smp_processor_id());
- pit_clockevent.mult = div_sc(CLOCK_TICK_RATE, NSEC_PER_SEC, 32);
+ pit_clockevent.mult = div_sc(CLOCK_TICK_RATE, NSEC_PER_SEC,
+ pit_clockevent.shift);
pit_clockevent.max_delta_ns =
clockevent_delta2ns(0x7FFF, &pit_clockevent);
pit_clockevent.min_delta_ns =
@@ -224,7 +225,8 @@ static int __init init_pit_clocksource(void)
pit_clockevent.mode != CLOCK_EVT_MODE_PERIODIC)
return 0;
- clocksource_pit.mult = clocksource_hz2mult(CLOCK_TICK_RATE, 20);
+ clocksource_pit.mult = clocksource_hz2mult(CLOCK_TICK_RATE,
+ clocksource_pit.shift);
return clocksource_register(&clocksource_pit);
}
arch_initcall(init_pit_clocksource);
diff --git a/arch/x86/kernel/io_apic_32.c b/arch/x86/kernel/io_apic_32.c
index 2e2f42074e18..a40d54fc1fdd 100644
--- a/arch/x86/kernel/io_apic_32.c
+++ b/arch/x86/kernel/io_apic_32.c
@@ -2068,7 +2068,7 @@ static void __init setup_nmi(void)
* cycles as some i82489DX-based boards have glue logic that keeps the
* 8259A interrupt line asserted until INTA. --macro
*/
-static inline void unlock_ExtINT_logic(void)
+static inline void __init unlock_ExtINT_logic(void)
{
int apic, pin, i;
struct IO_APIC_route_entry entry0, entry1;
@@ -2444,6 +2444,7 @@ void destroy_irq(unsigned int irq)
dynamic_irq_cleanup(irq);
spin_lock_irqsave(&vector_lock, flags);
+ clear_bit(irq_vector[irq], used_vectors);
irq_vector[irq] = 0;
spin_unlock_irqrestore(&vector_lock, flags);
}
diff --git a/arch/x86/kernel/io_apic_64.c b/arch/x86/kernel/io_apic_64.c
index 9ba11d07920f..ef1a8dfcc529 100644
--- a/arch/x86/kernel/io_apic_64.c
+++ b/arch/x86/kernel/io_apic_64.c
@@ -1599,7 +1599,7 @@ static void __init setup_nmi(void)
* cycles as some i82489DX-based boards have glue logic that keeps the
* 8259A interrupt line asserted until INTA. --macro
*/
-static inline void unlock_ExtINT_logic(void)
+static inline void __init unlock_ExtINT_logic(void)
{
int apic, pin, i;
struct IO_APIC_route_entry entry0, entry1;
diff --git a/arch/x86/kernel/irq_32.c b/arch/x86/kernel/irq_32.c
index 6ea67b76a214..00bda7bcda63 100644
--- a/arch/x86/kernel/irq_32.c
+++ b/arch/x86/kernel/irq_32.c
@@ -134,7 +134,7 @@ unsigned int do_IRQ(struct pt_regs *regs)
: "=a" (arg1), "=d" (arg2), "=b" (bx)
: "0" (irq), "1" (desc), "2" (isp),
"D" (desc->handle_irq)
- : "memory", "cc"
+ : "memory", "cc", "ecx"
);
} else
#endif
diff --git a/arch/x86/kernel/kdebugfs.c b/arch/x86/kernel/kdebugfs.c
index 73354302fda7..c03205991718 100644
--- a/arch/x86/kernel/kdebugfs.c
+++ b/arch/x86/kernel/kdebugfs.c
@@ -6,23 +6,171 @@
*
* This file is released under the GPLv2.
*/
-
#include <linux/debugfs.h>
+#include <linux/uaccess.h>
#include <linux/stat.h>
#include <linux/init.h>
+#include <linux/io.h>
+#include <linux/mm.h>
#include <asm/setup.h>
#ifdef CONFIG_DEBUG_BOOT_PARAMS
+struct setup_data_node {
+ u64 paddr;
+ u32 type;
+ u32 len;
+};
+
+static ssize_t
+setup_data_read(struct file *file, char __user *user_buf, size_t count,
+ loff_t *ppos)
+{
+ struct setup_data_node *node = file->private_data;
+ unsigned long remain;
+ loff_t pos = *ppos;
+ struct page *pg;
+ void *p;
+ u64 pa;
+
+ if (pos < 0)
+ return -EINVAL;
+ if (pos >= node->len)
+ return 0;
+
+ if (count > node->len - pos)
+ count = node->len - pos;
+ pa = node->paddr + sizeof(struct setup_data) + pos;
+ pg = pfn_to_page((pa + count - 1) >> PAGE_SHIFT);
+ if (PageHighMem(pg)) {
+ p = ioremap_cache(pa, count);
+ if (!p)
+ return -ENXIO;
+ } else {
+ p = __va(pa);
+ }
+
+ remain = copy_to_user(user_buf, p, count);
+
+ if (PageHighMem(pg))
+ iounmap(p);
+
+ if (remain)
+ return -EFAULT;
+
+ *ppos = pos + count;
+
+ return count;
+}
+
+static int setup_data_open(struct inode *inode, struct file *file)
+{
+ file->private_data = inode->i_private;
+ return 0;
+}
+
+static const struct file_operations fops_setup_data = {
+ .read = setup_data_read,
+ .open = setup_data_open,
+};
+
+static int __init
+create_setup_data_node(struct dentry *parent, int no,
+ struct setup_data_node *node)
+{
+ struct dentry *d, *type, *data;
+ char buf[16];
+ int error;
+
+ sprintf(buf, "%d", no);
+ d = debugfs_create_dir(buf, parent);
+ if (!d) {
+ error = -ENOMEM;
+ goto err_return;
+ }
+ type = debugfs_create_x32("type", S_IRUGO, d, &node->type);
+ if (!type) {
+ error = -ENOMEM;
+ goto err_dir;
+ }
+ data = debugfs_create_file("data", S_IRUGO, d, node, &fops_setup_data);
+ if (!data) {
+ error = -ENOMEM;
+ goto err_type;
+ }
+ return 0;
+
+err_type:
+ debugfs_remove(type);
+err_dir:
+ debugfs_remove(d);
+err_return:
+ return error;
+}
+
+static int __init create_setup_data_nodes(struct dentry *parent)
+{
+ struct setup_data_node *node;
+ struct setup_data *data;
+ int error, no = 0;
+ struct dentry *d;
+ struct page *pg;
+ u64 pa_data;
+
+ d = debugfs_create_dir("setup_data", parent);
+ if (!d) {
+ error = -ENOMEM;
+ goto err_return;
+ }
+
+ pa_data = boot_params.hdr.setup_data;
+
+ while (pa_data) {
+ node = kmalloc(sizeof(*node), GFP_KERNEL);
+ if (!node) {
+ error = -ENOMEM;
+ goto err_dir;
+ }
+ pg = pfn_to_page((pa_data+sizeof(*data)-1) >> PAGE_SHIFT);
+ if (PageHighMem(pg)) {
+ data = ioremap_cache(pa_data, sizeof(*data));
+ if (!data) {
+ error = -ENXIO;
+ goto err_dir;
+ }
+ } else {
+ data = __va(pa_data);
+ }
+
+ node->paddr = pa_data;
+ node->type = data->type;
+ node->len = data->len;
+ error = create_setup_data_node(d, no, node);
+ pa_data = data->next;
+
+ if (PageHighMem(pg))
+ iounmap(data);
+ if (error)
+ goto err_dir;
+ no++;
+ }
+ return 0;
+
+err_dir:
+ debugfs_remove(d);
+err_return:
+ return error;
+}
+
static struct debugfs_blob_wrapper boot_params_blob = {
- .data = &boot_params,
- .size = sizeof(boot_params),
+ .data = &boot_params,
+ .size = sizeof(boot_params),
};
static int __init boot_params_kdebugfs_init(void)
{
- int error;
struct dentry *dbp, *version, *data;
+ int error;
dbp = debugfs_create_dir("boot_params", NULL);
if (!dbp) {
@@ -41,7 +189,13 @@ static int __init boot_params_kdebugfs_init(void)
error = -ENOMEM;
goto err_version;
}
+ error = create_setup_data_nodes(dbp);
+ if (error)
+ goto err_data;
return 0;
+
+err_data:
+ debugfs_remove(data);
err_version:
debugfs_remove(version);
err_dir:
@@ -61,5 +215,4 @@ static int __init arch_kdebugfs_init(void)
return error;
}
-
arch_initcall(arch_kdebugfs_init);
diff --git a/arch/x86/kernel/kvm.c b/arch/x86/kernel/kvm.c
new file mode 100644
index 000000000000..8b7a3cf37d2b
--- /dev/null
+++ b/arch/x86/kernel/kvm.c
@@ -0,0 +1,248 @@
+/*
+ * KVM paravirt_ops implementation
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
+ *
+ * Copyright (C) 2007, Red Hat, Inc., Ingo Molnar <mingo@redhat.com>
+ * Copyright IBM Corporation, 2007
+ * Authors: Anthony Liguori <aliguori@us.ibm.com>
+ */
+
+#include <linux/module.h>
+#include <linux/kernel.h>
+#include <linux/kvm_para.h>
+#include <linux/cpu.h>
+#include <linux/mm.h>
+#include <linux/highmem.h>
+#include <linux/hardirq.h>
+
+#define MMU_QUEUE_SIZE 1024
+
+struct kvm_para_state {
+ u8 mmu_queue[MMU_QUEUE_SIZE];
+ int mmu_queue_len;
+ enum paravirt_lazy_mode mode;
+};
+
+static DEFINE_PER_CPU(struct kvm_para_state, para_state);
+
+static struct kvm_para_state *kvm_para_state(void)
+{
+ return &per_cpu(para_state, raw_smp_processor_id());
+}
+
+/*
+ * No need for any "IO delay" on KVM
+ */
+static void kvm_io_delay(void)
+{
+}
+
+static void kvm_mmu_op(void *buffer, unsigned len)
+{
+ int r;
+ unsigned long a1, a2;
+
+ do {
+ a1 = __pa(buffer);
+ a2 = 0; /* on i386 __pa() always returns <4G */
+ r = kvm_hypercall3(KVM_HC_MMU_OP, len, a1, a2);
+ buffer += r;
+ len -= r;
+ } while (len);
+}
+
+static void mmu_queue_flush(struct kvm_para_state *state)
+{
+ if (state->mmu_queue_len) {
+ kvm_mmu_op(state->mmu_queue, state->mmu_queue_len);
+ state->mmu_queue_len = 0;
+ }
+}
+
+static void kvm_deferred_mmu_op(void *buffer, int len)
+{
+ struct kvm_para_state *state = kvm_para_state();
+
+ if (state->mode != PARAVIRT_LAZY_MMU) {
+ kvm_mmu_op(buffer, len);
+ return;
+ }
+ if (state->mmu_queue_len + len > sizeof state->mmu_queue)
+ mmu_queue_flush(state);
+ memcpy(state->mmu_queue + state->mmu_queue_len, buffer, len);
+ state->mmu_queue_len += len;
+}
+
+static void kvm_mmu_write(void *dest, u64 val)
+{
+ __u64 pte_phys;
+ struct kvm_mmu_op_write_pte wpte;
+
+#ifdef CONFIG_HIGHPTE
+ struct page *page;
+ unsigned long dst = (unsigned long) dest;
+
+ page = kmap_atomic_to_page(dest);
+ pte_phys = page_to_pfn(page);
+ pte_phys <<= PAGE_SHIFT;
+ pte_phys += (dst & ~(PAGE_MASK));
+#else
+ pte_phys = (unsigned long)__pa(dest);
+#endif
+ wpte.header.op = KVM_MMU_OP_WRITE_PTE;
+ wpte.pte_val = val;
+ wpte.pte_phys = pte_phys;
+
+ kvm_deferred_mmu_op(&wpte, sizeof wpte);
+}
+
+/*
+ * We only need to hook operations that are MMU writes. We hook these so that
+ * we can use lazy MMU mode to batch these operations. We could probably
+ * improve the performance of the host code if we used some of the information
+ * here to simplify processing of batched writes.
+ */
+static void kvm_set_pte(pte_t *ptep, pte_t pte)
+{
+ kvm_mmu_write(ptep, pte_val(pte));
+}
+
+static void kvm_set_pte_at(struct mm_struct *mm, unsigned long addr,
+ pte_t *ptep, pte_t pte)
+{
+ kvm_mmu_write(ptep, pte_val(pte));
+}
+
+static void kvm_set_pmd(pmd_t *pmdp, pmd_t pmd)
+{
+ kvm_mmu_write(pmdp, pmd_val(pmd));
+}
+
+#if PAGETABLE_LEVELS >= 3
+#ifdef CONFIG_X86_PAE
+static void kvm_set_pte_atomic(pte_t *ptep, pte_t pte)
+{
+ kvm_mmu_write(ptep, pte_val(pte));
+}
+
+static void kvm_set_pte_present(struct mm_struct *mm, unsigned long addr,
+ pte_t *ptep, pte_t pte)
+{
+ kvm_mmu_write(ptep, pte_val(pte));
+}
+
+static void kvm_pte_clear(struct mm_struct *mm,
+ unsigned long addr, pte_t *ptep)
+{
+ kvm_mmu_write(ptep, 0);
+}
+
+static void kvm_pmd_clear(pmd_t *pmdp)
+{
+ kvm_mmu_write(pmdp, 0);
+}
+#endif
+
+static void kvm_set_pud(pud_t *pudp, pud_t pud)
+{
+ kvm_mmu_write(pudp, pud_val(pud));
+}
+
+#if PAGETABLE_LEVELS == 4
+static void kvm_set_pgd(pgd_t *pgdp, pgd_t pgd)
+{
+ kvm_mmu_write(pgdp, pgd_val(pgd));
+}
+#endif
+#endif /* PAGETABLE_LEVELS >= 3 */
+
+static void kvm_flush_tlb(void)
+{
+ struct kvm_mmu_op_flush_tlb ftlb = {
+ .header.op = KVM_MMU_OP_FLUSH_TLB,
+ };
+
+ kvm_deferred_mmu_op(&ftlb, sizeof ftlb);
+}
+
+static void kvm_release_pt(u32 pfn)
+{
+ struct kvm_mmu_op_release_pt rpt = {
+ .header.op = KVM_MMU_OP_RELEASE_PT,
+ .pt_phys = (u64)pfn << PAGE_SHIFT,
+ };
+
+ kvm_mmu_op(&rpt, sizeof rpt);
+}
+
+static void kvm_enter_lazy_mmu(void)
+{
+ struct kvm_para_state *state = kvm_para_state();
+
+ paravirt_enter_lazy_mmu();
+ state->mode = paravirt_get_lazy_mode();
+}
+
+static void kvm_leave_lazy_mmu(void)
+{
+ struct kvm_para_state *state = kvm_para_state();
+
+ mmu_queue_flush(state);
+ paravirt_leave_lazy(paravirt_get_lazy_mode());
+ state->mode = paravirt_get_lazy_mode();
+}
+
+static void paravirt_ops_setup(void)
+{
+ pv_info.name = "KVM";
+ pv_info.paravirt_enabled = 1;
+
+ if (kvm_para_has_feature(KVM_FEATURE_NOP_IO_DELAY))
+ pv_cpu_ops.io_delay = kvm_io_delay;
+
+ if (kvm_para_has_feature(KVM_FEATURE_MMU_OP)) {
+ pv_mmu_ops.set_pte = kvm_set_pte;
+ pv_mmu_ops.set_pte_at = kvm_set_pte_at;
+ pv_mmu_ops.set_pmd = kvm_set_pmd;
+#if PAGETABLE_LEVELS >= 3
+#ifdef CONFIG_X86_PAE
+ pv_mmu_ops.set_pte_atomic = kvm_set_pte_atomic;
+ pv_mmu_ops.set_pte_present = kvm_set_pte_present;
+ pv_mmu_ops.pte_clear = kvm_pte_clear;
+ pv_mmu_ops.pmd_clear = kvm_pmd_clear;
+#endif
+ pv_mmu_ops.set_pud = kvm_set_pud;
+#if PAGETABLE_LEVELS == 4
+ pv_mmu_ops.set_pgd = kvm_set_pgd;
+#endif
+#endif
+ pv_mmu_ops.flush_tlb_user = kvm_flush_tlb;
+ pv_mmu_ops.release_pte = kvm_release_pt;
+ pv_mmu_ops.release_pmd = kvm_release_pt;
+ pv_mmu_ops.release_pud = kvm_release_pt;
+
+ pv_mmu_ops.lazy_mode.enter = kvm_enter_lazy_mmu;
+ pv_mmu_ops.lazy_mode.leave = kvm_leave_lazy_mmu;
+ }
+}
+
+void __init kvm_guest_init(void)
+{
+ if (!kvm_para_available())
+ return;
+
+ paravirt_ops_setup();
+}
diff --git a/arch/x86/kernel/kvmclock.c b/arch/x86/kernel/kvmclock.c
new file mode 100644
index 000000000000..ddee04043aeb
--- /dev/null
+++ b/arch/x86/kernel/kvmclock.c
@@ -0,0 +1,187 @@
+/* KVM paravirtual clock driver. A clocksource implementation
+ Copyright (C) 2008 Glauber de Oliveira Costa, Red Hat Inc.
+
+ This program is free software; you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published by
+ the Free Software Foundation; either version 2 of the License, or
+ (at your option) any later version.
+
+ This program is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with this program; if not, write to the Free Software
+ Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
+*/
+
+#include <linux/clocksource.h>
+#include <linux/kvm_para.h>
+#include <asm/arch_hooks.h>
+#include <asm/msr.h>
+#include <asm/apic.h>
+#include <linux/percpu.h>
+#include <asm/reboot.h>
+
+#define KVM_SCALE 22
+
+static int kvmclock = 1;
+
+static int parse_no_kvmclock(char *arg)
+{
+ kvmclock = 0;
+ return 0;
+}
+early_param("no-kvmclock", parse_no_kvmclock);
+
+/* The hypervisor will put information about time periodically here */
+static DEFINE_PER_CPU_SHARED_ALIGNED(struct kvm_vcpu_time_info, hv_clock);
+#define get_clock(cpu, field) per_cpu(hv_clock, cpu).field
+
+static inline u64 kvm_get_delta(u64 last_tsc)
+{
+ int cpu = smp_processor_id();
+ u64 delta = native_read_tsc() - last_tsc;
+ return (delta * get_clock(cpu, tsc_to_system_mul)) >> KVM_SCALE;
+}
+
+static struct kvm_wall_clock wall_clock;
+static cycle_t kvm_clock_read(void);
+/*
+ * The wallclock is the time of day when we booted. Since then, some time may
+ * have elapsed since the hypervisor wrote the data. So we try to account for
+ * that with system time
+ */
+unsigned long kvm_get_wallclock(void)
+{
+ u32 wc_sec, wc_nsec;
+ u64 delta;
+ struct timespec ts;
+ int version, nsec;
+ int low, high;
+
+ low = (int)__pa(&wall_clock);
+ high = ((u64)__pa(&wall_clock) >> 32);
+
+ delta = kvm_clock_read();
+
+ native_write_msr(MSR_KVM_WALL_CLOCK, low, high);
+ do {
+ version = wall_clock.wc_version;
+ rmb();
+ wc_sec = wall_clock.wc_sec;
+ wc_nsec = wall_clock.wc_nsec;
+ rmb();
+ } while ((wall_clock.wc_version != version) || (version & 1));
+
+ delta = kvm_clock_read() - delta;
+ delta += wc_nsec;
+ nsec = do_div(delta, NSEC_PER_SEC);
+ set_normalized_timespec(&ts, wc_sec + delta, nsec);
+ /*
+ * Of all mechanisms of time adjustment I've tested, this one
+ * was the champion!
+ */
+ return ts.tv_sec + 1;
+}
+
+int kvm_set_wallclock(unsigned long now)
+{
+ return 0;
+}
+
+/*
+ * This is our read_clock function. The host puts an tsc timestamp each time
+ * it updates a new time. Without the tsc adjustment, we can have a situation
+ * in which a vcpu starts to run earlier (smaller system_time), but probes
+ * time later (compared to another vcpu), leading to backwards time
+ */
+static cycle_t kvm_clock_read(void)
+{
+ u64 last_tsc, now;
+ int cpu;
+
+ preempt_disable();
+ cpu = smp_processor_id();
+
+ last_tsc = get_clock(cpu, tsc_timestamp);
+ now = get_clock(cpu, system_time);
+
+ now += kvm_get_delta(last_tsc);
+ preempt_enable();
+
+ return now;
+}
+static struct clocksource kvm_clock = {
+ .name = "kvm-clock",
+ .read = kvm_clock_read,
+ .rating = 400,
+ .mask = CLOCKSOURCE_MASK(64),
+ .mult = 1 << KVM_SCALE,
+ .shift = KVM_SCALE,
+ .flags = CLOCK_SOURCE_IS_CONTINUOUS,
+};
+
+static int kvm_register_clock(void)
+{
+ int cpu = smp_processor_id();
+ int low, high;
+ low = (int)__pa(&per_cpu(hv_clock, cpu)) | 1;
+ high = ((u64)__pa(&per_cpu(hv_clock, cpu)) >> 32);
+
+ return native_write_msr_safe(MSR_KVM_SYSTEM_TIME, low, high);
+}
+
+static void kvm_setup_secondary_clock(void)
+{
+ /*
+ * Now that the first cpu already had this clocksource initialized,
+ * we shouldn't fail.
+ */
+ WARN_ON(kvm_register_clock());
+ /* ok, done with our trickery, call native */
+ setup_secondary_APIC_clock();
+}
+
+/*
+ * After the clock is registered, the host will keep writing to the
+ * registered memory location. If the guest happens to shutdown, this memory
+ * won't be valid. In cases like kexec, in which you install a new kernel, this
+ * means a random memory location will be kept being written. So before any
+ * kind of shutdown from our side, we unregister the clock by writting anything
+ * that does not have the 'enable' bit set in the msr
+ */
+#ifdef CONFIG_KEXEC
+static void kvm_crash_shutdown(struct pt_regs *regs)
+{
+ native_write_msr_safe(MSR_KVM_SYSTEM_TIME, 0, 0);
+ native_machine_crash_shutdown(regs);
+}
+#endif
+
+static void kvm_shutdown(void)
+{
+ native_write_msr_safe(MSR_KVM_SYSTEM_TIME, 0, 0);
+ native_machine_shutdown();
+}
+
+void __init kvmclock_init(void)
+{
+ if (!kvm_para_available())
+ return;
+
+ if (kvmclock && kvm_para_has_feature(KVM_FEATURE_CLOCKSOURCE)) {
+ if (kvm_register_clock())
+ return;
+ pv_time_ops.get_wallclock = kvm_get_wallclock;
+ pv_time_ops.set_wallclock = kvm_set_wallclock;
+ pv_time_ops.sched_clock = kvm_clock_read;
+ pv_apic_ops.setup_secondary_clock = kvm_setup_secondary_clock;
+ machine_ops.shutdown = kvm_shutdown;
+#ifdef CONFIG_KEXEC
+ machine_ops.crash_shutdown = kvm_crash_shutdown;
+#endif
+ clocksource_register(&kvm_clock);
+ }
+}
diff --git a/arch/x86/kernel/mfgpt_32.c b/arch/x86/kernel/mfgpt_32.c
index b402c0f3f192..3cad17fe026b 100644
--- a/arch/x86/kernel/mfgpt_32.c
+++ b/arch/x86/kernel/mfgpt_32.c
@@ -63,7 +63,7 @@ static int __init mfgpt_fix(char *s)
/* The following udocumented bit resets the MFGPT timers */
val = 0xFF; dummy = 0;
- wrmsr(0x5140002B, val, dummy);
+ wrmsr(MSR_MFGPT_SETUP, val, dummy);
return 1;
}
__setup("mfgptfix", mfgpt_fix);
@@ -127,17 +127,17 @@ int geode_mfgpt_toggle_event(int timer, int cmp, int event, int enable)
* 6; that is, resets for 7 and 8 will be ignored. Is this
* a problem? -dilinger
*/
- msr = MFGPT_NR_MSR;
+ msr = MSR_MFGPT_NR;
mask = 1 << (timer + 24);
break;
case MFGPT_EVENT_NMI:
- msr = MFGPT_NR_MSR;
+ msr = MSR_MFGPT_NR;
mask = 1 << (timer + shift);
break;
case MFGPT_EVENT_IRQ:
- msr = MFGPT_IRQ_MSR;
+ msr = MSR_MFGPT_IRQ;
mask = 1 << (timer + shift);
break;
@@ -364,7 +364,8 @@ int __init mfgpt_timer_setup(void)
geode_mfgpt_write(mfgpt_event_clock, MFGPT_REG_SETUP, val);
/* Set up the clock event */
- mfgpt_clockevent.mult = div_sc(MFGPT_HZ, NSEC_PER_SEC, 32);
+ mfgpt_clockevent.mult = div_sc(MFGPT_HZ, NSEC_PER_SEC,
+ mfgpt_clockevent.shift);
mfgpt_clockevent.min_delta_ns = clockevent_delta2ns(0xF,
&mfgpt_clockevent);
mfgpt_clockevent.max_delta_ns = clockevent_delta2ns(0xFFFE,
diff --git a/arch/x86/kernel/mpparse.c b/arch/x86/kernel/mpparse.c
index 70744e344fa1..3e2c54dc8b29 100644
--- a/arch/x86/kernel/mpparse.c
+++ b/arch/x86/kernel/mpparse.c
@@ -686,13 +686,11 @@ void __init get_smp_config(void)
static int __init smp_scan_config(unsigned long base, unsigned long length,
unsigned reserve)
{
- extern void __bad_mpf_size(void);
unsigned int *bp = phys_to_virt(base);
struct intel_mp_floating *mpf;
Dprintk("Scan SMP from %p for %ld bytes.\n", bp, length);
- if (sizeof(*mpf) != 16)
- __bad_mpf_size();
+ BUILD_BUG_ON(sizeof(*mpf) != 16);
while (length > 0) {
mpf = (struct intel_mp_floating *)bp;
@@ -801,7 +799,6 @@ void __init find_smp_config(void)
#ifdef CONFIG_X86_IO_APIC
#define MP_ISA_BUS 0
-#define MP_MAX_IOAPIC_PIN 127
extern struct mp_ioapic_routing mp_ioapic_routing[MAX_IO_APICS];
@@ -820,7 +817,7 @@ static int mp_find_ioapic(int gsi)
return -1;
}
-static u8 uniq_ioapic_id(u8 id)
+static u8 __init uniq_ioapic_id(u8 id)
{
#ifdef CONFIG_X86_32
if ((boot_cpu_data.x86_vendor == X86_VENDOR_INTEL) &&
@@ -909,14 +906,7 @@ void __init mp_override_legacy_irq(u8 bus_irq, u8 polarity, u8 trigger, u32 gsi)
intsrc.mpc_dstapic = mp_ioapics[ioapic].mpc_apicid; /* APIC ID */
intsrc.mpc_dstirq = pin; /* INTIN# */
- Dprintk("Int: type %d, pol %d, trig %d, bus %d, irq %d, %d-%d\n",
- intsrc.mpc_irqtype, intsrc.mpc_irqflag & 3,
- (intsrc.mpc_irqflag >> 2) & 3, intsrc.mpc_srcbus,
- intsrc.mpc_srcbusirq, intsrc.mpc_dstapic, intsrc.mpc_dstirq);
-
- mp_irqs[mp_irq_entries] = intsrc;
- if (++mp_irq_entries == MAX_IRQ_SOURCES)
- panic("Max # of irq sources exceeded!\n");
+ MP_intsrc_info(&intsrc);
}
int es7000_plat;
@@ -985,23 +975,14 @@ void __init mp_config_acpi_legacy_irqs(void)
intsrc.mpc_srcbusirq = i; /* Identity mapped */
intsrc.mpc_dstirq = i;
- Dprintk("Int: type %d, pol %d, trig %d, bus %d, irq %d, "
- "%d-%d\n", intsrc.mpc_irqtype, intsrc.mpc_irqflag & 3,
- (intsrc.mpc_irqflag >> 2) & 3, intsrc.mpc_srcbus,
- intsrc.mpc_srcbusirq, intsrc.mpc_dstapic,
- intsrc.mpc_dstirq);
-
- mp_irqs[mp_irq_entries] = intsrc;
- if (++mp_irq_entries == MAX_IRQ_SOURCES)
- panic("Max # of irq sources exceeded!\n");
+ MP_intsrc_info(&intsrc);
}
}
int mp_register_gsi(u32 gsi, int triggering, int polarity)
{
- int ioapic = -1;
- int ioapic_pin = 0;
- int idx, bit = 0;
+ int ioapic;
+ int ioapic_pin;
#ifdef CONFIG_X86_32
#define MAX_GSI_NUM 4096
#define IRQ_COMPRESSION_START 64
@@ -1041,15 +1022,13 @@ int mp_register_gsi(u32 gsi, int triggering, int polarity)
* with redundant pin->gsi mappings (but unique PCI devices);
* we only program the IOAPIC on the first.
*/
- bit = ioapic_pin % 32;
- idx = (ioapic_pin < 32) ? 0 : (ioapic_pin / 32);
- if (idx > 3) {
+ if (ioapic_pin > MP_MAX_IOAPIC_PIN) {
printk(KERN_ERR "Invalid reference to IOAPIC pin "
"%d-%d\n", mp_ioapic_routing[ioapic].apic_id,
ioapic_pin);
return gsi;
}
- if ((1 << bit) & mp_ioapic_routing[ioapic].pin_programmed[idx]) {
+ if (test_bit(ioapic_pin, mp_ioapic_routing[ioapic].pin_programmed)) {
Dprintk(KERN_DEBUG "Pin %d-%d already programmed\n",
mp_ioapic_routing[ioapic].apic_id, ioapic_pin);
#ifdef CONFIG_X86_32
@@ -1059,7 +1038,7 @@ int mp_register_gsi(u32 gsi, int triggering, int polarity)
#endif
}
- mp_ioapic_routing[ioapic].pin_programmed[idx] |= (1 << bit);
+ set_bit(ioapic_pin, mp_ioapic_routing[ioapic].pin_programmed);
#ifdef CONFIG_X86_32
/*
* For GSI >= 64, use IRQ compression
diff --git a/arch/x86/kernel/paravirt.c b/arch/x86/kernel/paravirt.c
index 3733412d1357..74f0c5ea2a03 100644
--- a/arch/x86/kernel/paravirt.c
+++ b/arch/x86/kernel/paravirt.c
@@ -366,11 +366,13 @@ struct pv_mmu_ops pv_mmu_ops = {
.flush_tlb_single = native_flush_tlb_single,
.flush_tlb_others = native_flush_tlb_others,
- .alloc_pt = paravirt_nop,
- .alloc_pd = paravirt_nop,
- .alloc_pd_clone = paravirt_nop,
- .release_pt = paravirt_nop,
- .release_pd = paravirt_nop,
+ .alloc_pte = paravirt_nop,
+ .alloc_pmd = paravirt_nop,
+ .alloc_pmd_clone = paravirt_nop,
+ .alloc_pud = paravirt_nop,
+ .release_pte = paravirt_nop,
+ .release_pmd = paravirt_nop,
+ .release_pud = paravirt_nop,
.set_pte = native_set_pte,
.set_pte_at = native_set_pte_at,
diff --git a/arch/x86/kernel/pci-calgary_64.c b/arch/x86/kernel/pci-calgary_64.c
index 2edee22e9c30..e28ec497e142 100644
--- a/arch/x86/kernel/pci-calgary_64.c
+++ b/arch/x86/kernel/pci-calgary_64.c
@@ -43,6 +43,7 @@
#include <asm/system.h>
#include <asm/dma.h>
#include <asm/rio.h>
+#include <asm/bios_ebda.h>
#ifdef CONFIG_CALGARY_IOMMU_ENABLED_BY_DEFAULT
int use_calgary __read_mostly = 1;
diff --git a/arch/x86/kernel/process.c b/arch/x86/kernel/process.c
index 3004d716539d..67e9b4a1e89d 100644
--- a/arch/x86/kernel/process.c
+++ b/arch/x86/kernel/process.c
@@ -4,6 +4,8 @@
#include <linux/smp.h>
#include <linux/slab.h>
#include <linux/sched.h>
+#include <linux/module.h>
+#include <linux/pm.h>
struct kmem_cache *task_xstate_cachep;
@@ -42,3 +44,118 @@ void arch_task_cache_init(void)
__alignof__(union thread_xstate),
SLAB_PANIC, NULL);
}
+
+static void do_nothing(void *unused)
+{
+}
+
+/*
+ * cpu_idle_wait - Used to ensure that all the CPUs discard old value of
+ * pm_idle and update to new pm_idle value. Required while changing pm_idle
+ * handler on SMP systems.
+ *
+ * Caller must have changed pm_idle to the new value before the call. Old
+ * pm_idle value will not be used by any CPU after the return of this function.
+ */
+void cpu_idle_wait(void)
+{
+ smp_mb();
+ /* kick all the CPUs so that they exit out of pm_idle */
+ smp_call_function(do_nothing, NULL, 0, 1);
+}
+EXPORT_SYMBOL_GPL(cpu_idle_wait);
+
+/*
+ * This uses new MONITOR/MWAIT instructions on P4 processors with PNI,
+ * which can obviate IPI to trigger checking of need_resched.
+ * We execute MONITOR against need_resched and enter optimized wait state
+ * through MWAIT. Whenever someone changes need_resched, we would be woken
+ * up from MWAIT (without an IPI).
+ *
+ * New with Core Duo processors, MWAIT can take some hints based on CPU
+ * capability.
+ */
+void mwait_idle_with_hints(unsigned long ax, unsigned long cx)
+{
+ if (!need_resched()) {
+ __monitor((void *)&current_thread_info()->flags, 0, 0);
+ smp_mb();
+ if (!need_resched())
+ __mwait(ax, cx);
+ }
+}
+
+/* Default MONITOR/MWAIT with no hints, used for default C1 state */
+static void mwait_idle(void)
+{
+ if (!need_resched()) {
+ __monitor((void *)&current_thread_info()->flags, 0, 0);
+ smp_mb();
+ if (!need_resched())
+ __sti_mwait(0, 0);
+ else
+ local_irq_enable();
+ } else
+ local_irq_enable();
+}
+
+
+static int __cpuinit mwait_usable(const struct cpuinfo_x86 *c)
+{
+ if (force_mwait)
+ return 1;
+ /* Any C1 states supported? */
+ return c->cpuid_level >= 5 && ((cpuid_edx(5) >> 4) & 0xf) > 0;
+}
+
+/*
+ * On SMP it's slightly faster (but much more power-consuming!)
+ * to poll the ->work.need_resched flag instead of waiting for the
+ * cross-CPU IPI to arrive. Use this option with caution.
+ */
+static void poll_idle(void)
+{
+ local_irq_enable();
+ cpu_relax();
+}
+
+void __cpuinit select_idle_routine(const struct cpuinfo_x86 *c)
+{
+ static int selected;
+
+ if (selected)
+ return;
+#ifdef CONFIG_X86_SMP
+ if (pm_idle == poll_idle && smp_num_siblings > 1) {
+ printk(KERN_WARNING "WARNING: polling idle and HT enabled,"
+ " performance may degrade.\n");
+ }
+#endif
+ if (cpu_has(c, X86_FEATURE_MWAIT) && mwait_usable(c)) {
+ /*
+ * Skip, if setup has overridden idle.
+ * One CPU supports mwait => All CPUs supports mwait
+ */
+ if (!pm_idle) {
+ printk(KERN_INFO "using mwait in idle threads.\n");
+ pm_idle = mwait_idle;
+ }
+ }
+ selected = 1;
+}
+
+static int __init idle_setup(char *str)
+{
+ if (!strcmp(str, "poll")) {
+ printk("using polling idle threads.\n");
+ pm_idle = poll_idle;
+ } else if (!strcmp(str, "mwait"))
+ force_mwait = 1;
+ else
+ return -1;
+
+ boot_option_idle_override = 1;
+ return 0;
+}
+early_param("idle", idle_setup);
+
diff --git a/arch/x86/kernel/process_32.c b/arch/x86/kernel/process_32.c
index 77de848bd1fb..f8476dfbb60d 100644
--- a/arch/x86/kernel/process_32.c
+++ b/arch/x86/kernel/process_32.c
@@ -111,12 +111,10 @@ void default_idle(void)
*/
smp_mb();
- local_irq_disable();
- if (!need_resched()) {
+ if (!need_resched())
safe_halt(); /* enables interrupts racelessly */
- local_irq_disable();
- }
- local_irq_enable();
+ else
+ local_irq_enable();
current_thread_info()->status |= TS_POLLING;
} else {
local_irq_enable();
@@ -128,17 +126,6 @@ void default_idle(void)
EXPORT_SYMBOL(default_idle);
#endif
-/*
- * On SMP it's slightly faster (but much more power-consuming!)
- * to poll the ->work.need_resched flag instead of waiting for the
- * cross-CPU IPI to arrive. Use this option with caution.
- */
-static void poll_idle(void)
-{
- local_irq_enable();
- cpu_relax();
-}
-
#ifdef CONFIG_HOTPLUG_CPU
#include <asm/nmi.h>
/* We don't actually take CPU down, just spin without interrupts. */
@@ -196,6 +183,7 @@ void cpu_idle(void)
if (cpu_is_offline(cpu))
play_dead();
+ local_irq_disable();
__get_cpu_var(irq_stat).idle_timestamp = jiffies;
idle();
}
@@ -206,104 +194,6 @@ void cpu_idle(void)
}
}
-static void do_nothing(void *unused)
-{
-}
-
-/*
- * cpu_idle_wait - Used to ensure that all the CPUs discard old value of
- * pm_idle and update to new pm_idle value. Required while changing pm_idle
- * handler on SMP systems.
- *
- * Caller must have changed pm_idle to the new value before the call. Old
- * pm_idle value will not be used by any CPU after the return of this function.
- */
-void cpu_idle_wait(void)
-{
- smp_mb();
- /* kick all the CPUs so that they exit out of pm_idle */
- smp_call_function(do_nothing, NULL, 0, 1);
-}
-EXPORT_SYMBOL_GPL(cpu_idle_wait);
-
-/*
- * This uses new MONITOR/MWAIT instructions on P4 processors with PNI,
- * which can obviate IPI to trigger checking of need_resched.
- * We execute MONITOR against need_resched and enter optimized wait state
- * through MWAIT. Whenever someone changes need_resched, we would be woken
- * up from MWAIT (without an IPI).
- *
- * New with Core Duo processors, MWAIT can take some hints based on CPU
- * capability.
- */
-void mwait_idle_with_hints(unsigned long ax, unsigned long cx)
-{
- if (!need_resched()) {
- __monitor((void *)&current_thread_info()->flags, 0, 0);
- smp_mb();
- if (!need_resched())
- __sti_mwait(ax, cx);
- else
- local_irq_enable();
- } else
- local_irq_enable();
-}
-
-/* Default MONITOR/MWAIT with no hints, used for default C1 state */
-static void mwait_idle(void)
-{
- local_irq_enable();
- mwait_idle_with_hints(0, 0);
-}
-
-static int __cpuinit mwait_usable(const struct cpuinfo_x86 *c)
-{
- if (force_mwait)
- return 1;
- /* Any C1 states supported? */
- return c->cpuid_level >= 5 && ((cpuid_edx(5) >> 4) & 0xf) > 0;
-}
-
-void __cpuinit select_idle_routine(const struct cpuinfo_x86 *c)
-{
- static int selected;
-
- if (selected)
- return;
-#ifdef CONFIG_X86_SMP
- if (pm_idle == poll_idle && smp_num_siblings > 1) {
- printk(KERN_WARNING "WARNING: polling idle and HT enabled,"
- " performance may degrade.\n");
- }
-#endif
- if (cpu_has(c, X86_FEATURE_MWAIT) && mwait_usable(c)) {
- /*
- * Skip, if setup has overridden idle.
- * One CPU supports mwait => All CPUs supports mwait
- */
- if (!pm_idle) {
- printk(KERN_INFO "using mwait in idle threads.\n");
- pm_idle = mwait_idle;
- }
- }
- selected = 1;
-}
-
-static int __init idle_setup(char *str)
-{
- if (!strcmp(str, "poll")) {
- printk("using polling idle threads.\n");
- pm_idle = poll_idle;
- } else if (!strcmp(str, "mwait"))
- force_mwait = 1;
- else
- return -1;
-
- boot_option_idle_override = 1;
- return 0;
-}
-early_param("idle", idle_setup);
-
void __show_registers(struct pt_regs *regs, int all)
{
unsigned long cr0 = 0L, cr2 = 0L, cr3 = 0L, cr4 = 0L;
diff --git a/arch/x86/kernel/process_64.c b/arch/x86/kernel/process_64.c
index 131c2ee7ac56..e2319f39988b 100644
--- a/arch/x86/kernel/process_64.c
+++ b/arch/x86/kernel/process_64.c
@@ -106,26 +106,13 @@ void default_idle(void)
* test NEED_RESCHED:
*/
smp_mb();
- local_irq_disable();
- if (!need_resched()) {
+ if (!need_resched())
safe_halt(); /* enables interrupts racelessly */
- local_irq_disable();
- }
- local_irq_enable();
+ else
+ local_irq_enable();
current_thread_info()->status |= TS_POLLING;
}
-/*
- * On SMP it's slightly faster (but much more power-consuming!)
- * to poll the ->need_resched flag instead of waiting for the
- * cross-CPU IPI to arrive. Use this option with caution.
- */
-static void poll_idle(void)
-{
- local_irq_enable();
- cpu_relax();
-}
-
#ifdef CONFIG_HOTPLUG_CPU
DECLARE_PER_CPU(int, cpu_state);
@@ -192,110 +179,6 @@ void cpu_idle(void)
}
}
-static void do_nothing(void *unused)
-{
-}
-
-/*
- * cpu_idle_wait - Used to ensure that all the CPUs discard old value of
- * pm_idle and update to new pm_idle value. Required while changing pm_idle
- * handler on SMP systems.
- *
- * Caller must have changed pm_idle to the new value before the call. Old
- * pm_idle value will not be used by any CPU after the return of this function.
- */
-void cpu_idle_wait(void)
-{
- smp_mb();
- /* kick all the CPUs so that they exit out of pm_idle */
- smp_call_function(do_nothing, NULL, 0, 1);
-}
-EXPORT_SYMBOL_GPL(cpu_idle_wait);
-
-/*
- * This uses new MONITOR/MWAIT instructions on P4 processors with PNI,
- * which can obviate IPI to trigger checking of need_resched.
- * We execute MONITOR against need_resched and enter optimized wait state
- * through MWAIT. Whenever someone changes need_resched, we would be woken
- * up from MWAIT (without an IPI).
- *
- * New with Core Duo processors, MWAIT can take some hints based on CPU
- * capability.
- */
-void mwait_idle_with_hints(unsigned long ax, unsigned long cx)
-{
- if (!need_resched()) {
- __monitor((void *)&current_thread_info()->flags, 0, 0);
- smp_mb();
- if (!need_resched())
- __mwait(ax, cx);
- }
-}
-
-/* Default MONITOR/MWAIT with no hints, used for default C1 state */
-static void mwait_idle(void)
-{
- if (!need_resched()) {
- __monitor((void *)&current_thread_info()->flags, 0, 0);
- smp_mb();
- if (!need_resched())
- __sti_mwait(0, 0);
- else
- local_irq_enable();
- } else {
- local_irq_enable();
- }
-}
-
-
-static int __cpuinit mwait_usable(const struct cpuinfo_x86 *c)
-{
- if (force_mwait)
- return 1;
- /* Any C1 states supported? */
- return c->cpuid_level >= 5 && ((cpuid_edx(5) >> 4) & 0xf) > 0;
-}
-
-void __cpuinit select_idle_routine(const struct cpuinfo_x86 *c)
-{
- static int selected;
-
- if (selected)
- return;
-#ifdef CONFIG_X86_SMP
- if (pm_idle == poll_idle && smp_num_siblings > 1) {
- printk(KERN_WARNING "WARNING: polling idle and HT enabled,"
- " performance may degrade.\n");
- }
-#endif
- if (cpu_has(c, X86_FEATURE_MWAIT) && mwait_usable(c)) {
- /*
- * Skip, if setup has overridden idle.
- * One CPU supports mwait => All CPUs supports mwait
- */
- if (!pm_idle) {
- printk(KERN_INFO "using mwait in idle threads.\n");
- pm_idle = mwait_idle;
- }
- }
- selected = 1;
-}
-
-static int __init idle_setup(char *str)
-{
- if (!strcmp(str, "poll")) {
- printk("using polling idle threads.\n");
- pm_idle = poll_idle;
- } else if (!strcmp(str, "mwait"))
- force_mwait = 1;
- else
- return -1;
-
- boot_option_idle_override = 1;
- return 0;
-}
-early_param("idle", idle_setup);
-
/* Prints also some state that isn't saved in the pt_regs */
void __show_regs(struct pt_regs * regs)
{
diff --git a/arch/x86/kernel/ptrace.c b/arch/x86/kernel/ptrace.c
index 559c1b027417..fb03ef380f0e 100644
--- a/arch/x86/kernel/ptrace.c
+++ b/arch/x86/kernel/ptrace.c
@@ -1207,97 +1207,16 @@ static int genregs32_set(struct task_struct *target,
return ret;
}
-static long ptrace32_siginfo(unsigned request, u32 pid, u32 addr, u32 data)
+long compat_arch_ptrace(struct task_struct *child, compat_long_t request,
+ compat_ulong_t caddr, compat_ulong_t cdata)
{
- siginfo_t __user *si = compat_alloc_user_space(sizeof(siginfo_t));
- compat_siginfo_t __user *si32 = compat_ptr(data);
- siginfo_t ssi;
- int ret;
-
- if (request == PTRACE_SETSIGINFO) {
- memset(&ssi, 0, sizeof(siginfo_t));
- ret = copy_siginfo_from_user32(&ssi, si32);
- if (ret)
- return ret;
- if (copy_to_user(si, &ssi, sizeof(siginfo_t)))
- return -EFAULT;
- }
- ret = sys_ptrace(request, pid, addr, (unsigned long)si);
- if (ret)
- return ret;
- if (request == PTRACE_GETSIGINFO) {
- if (copy_from_user(&ssi, si, sizeof(siginfo_t)))
- return -EFAULT;
- ret = copy_siginfo_to_user32(si32, &ssi);
- }
- return ret;
-}
-
-asmlinkage long sys32_ptrace(long request, u32 pid, u32 addr, u32 data)
-{
- struct task_struct *child;
- struct pt_regs *childregs;
+ unsigned long addr = caddr;
+ unsigned long data = cdata;
void __user *datap = compat_ptr(data);
int ret;
__u32 val;
switch (request) {
- case PTRACE_TRACEME:
- case PTRACE_ATTACH:
- case PTRACE_KILL:
- case PTRACE_CONT:
- case PTRACE_SINGLESTEP:
- case PTRACE_SINGLEBLOCK:
- case PTRACE_DETACH:
- case PTRACE_SYSCALL:
- case PTRACE_OLDSETOPTIONS:
- case PTRACE_SETOPTIONS:
- case PTRACE_SET_THREAD_AREA:
- case PTRACE_GET_THREAD_AREA:
-#ifdef X86_BTS
- case PTRACE_BTS_CONFIG:
- case PTRACE_BTS_STATUS:
- case PTRACE_BTS_SIZE:
- case PTRACE_BTS_GET:
- case PTRACE_BTS_CLEAR:
- case PTRACE_BTS_DRAIN:
-#endif
- return sys_ptrace(request, pid, addr, data);
-
- default:
- return -EINVAL;
-
- case PTRACE_PEEKTEXT:
- case PTRACE_PEEKDATA:
- case PTRACE_POKEDATA:
- case PTRACE_POKETEXT:
- case PTRACE_POKEUSR:
- case PTRACE_PEEKUSR:
- case PTRACE_GETREGS:
- case PTRACE_SETREGS:
- case PTRACE_SETFPREGS:
- case PTRACE_GETFPREGS:
- case PTRACE_SETFPXREGS:
- case PTRACE_GETFPXREGS:
- case PTRACE_GETEVENTMSG:
- break;
-
- case PTRACE_SETSIGINFO:
- case PTRACE_GETSIGINFO:
- return ptrace32_siginfo(request, pid, addr, data);
- }
-
- child = ptrace_get_task_struct(pid);
- if (IS_ERR(child))
- return PTR_ERR(child);
-
- ret = ptrace_check_attach(child, request == PTRACE_KILL);
- if (ret < 0)
- goto out;
-
- childregs = task_pt_regs(child);
-
- switch (request) {
case PTRACE_PEEKUSR:
ret = getreg32(child, addr, &val);
if (ret == 0)
@@ -1343,12 +1262,14 @@ asmlinkage long sys32_ptrace(long request, u32 pid, u32 addr, u32 data)
sizeof(struct user32_fxsr_struct),
datap);
+ case PTRACE_GET_THREAD_AREA:
+ case PTRACE_SET_THREAD_AREA:
+ return arch_ptrace(child, request, addr, data);
+
default:
return compat_ptrace_request(child, request, addr, data);
}
- out:
- put_task_struct(child);
return ret;
}
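
Editor's note: the rewritten entry point above follows the usual compat dispatch shape — handle the 32-bit-specific register requests locally, hand the thread-area requests back to the native arch_ptrace(), and let compat_ptrace_request() cover everything else. A minimal user-space sketch of that shape (all names and request codes below are illustrative stand-ins, not the kernel's):

#include <stdio.h>

/* Illustrative request codes and fallbacks; not the kernel's values. */
enum req { REQ_PEEKUSR, REQ_GET_THREAD_AREA, REQ_OTHER };

static long native_request(enum req r)  { return 100 + r; } /* arch_ptrace() stand-in */
static long generic_request(enum req r) { return 200 + r; } /* compat_ptrace_request() stand-in */

static long compat_dispatch(enum req r)
{
	switch (r) {
	case REQ_PEEKUSR:
		return 1;                  /* 32-bit specific handling stays here */
	case REQ_GET_THREAD_AREA:
		return native_request(r);  /* shared with the native path */
	default:
		return generic_request(r); /* common requests fall through */
	}
}

int main(void)
{
	printf("%ld %ld %ld\n", compat_dispatch(REQ_PEEKUSR),
	       compat_dispatch(REQ_GET_THREAD_AREA),
	       compat_dispatch(REQ_OTHER));
	return 0;
}
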
diff --git a/arch/x86/kernel/reboot.c b/arch/x86/kernel/reboot.c
index 19c9386ac118..a4a838306b2c 100644
--- a/arch/x86/kernel/reboot.c
+++ b/arch/x86/kernel/reboot.c
@@ -8,6 +8,7 @@
#include <asm/apic.h>
#include <asm/desc.h>
#include <asm/hpet.h>
+#include <asm/pgtable.h>
#include <asm/reboot_fixups.h>
#include <asm/reboot.h>
@@ -15,7 +16,6 @@
# include <linux/dmi.h>
# include <linux/ctype.h>
# include <linux/mc146818rtc.h>
-# include <asm/pgtable.h>
#else
# include <asm/iommu.h>
#endif
@@ -275,7 +275,7 @@ void machine_real_restart(unsigned char *code, int length)
/* Remap the kernel at virtual address zero, as well as offset zero
from the kernel segment. This assumes the kernel segment starts at
virtual address PAGE_OFFSET. */
- memcpy(swapper_pg_dir, swapper_pg_dir + USER_PGD_PTRS,
+ memcpy(swapper_pg_dir, swapper_pg_dir + KERNEL_PGD_BOUNDARY,
sizeof(swapper_pg_dir [0]) * KERNEL_PGD_PTRS);
/*
@@ -399,7 +399,7 @@ static void native_machine_emergency_restart(void)
}
}
-static void native_machine_shutdown(void)
+void native_machine_shutdown(void)
{
/* Stop the cpus and apics */
#ifdef CONFIG_SMP
@@ -470,7 +470,10 @@ struct machine_ops machine_ops = {
.shutdown = native_machine_shutdown,
.emergency_restart = native_machine_emergency_restart,
.restart = native_machine_restart,
- .halt = native_machine_halt
+ .halt = native_machine_halt,
+#ifdef CONFIG_KEXEC
+ .crash_shutdown = native_machine_crash_shutdown,
+#endif
};
void machine_power_off(void)
@@ -498,3 +501,9 @@ void machine_halt(void)
machine_ops.halt();
}
+#ifdef CONFIG_KEXEC
+void machine_crash_shutdown(struct pt_regs *regs)
+{
+ machine_ops.crash_shutdown(regs);
+}
+#endif
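
Editor's note: the reboot.c change routes machine_crash_shutdown() through the machine_ops table rather than a fixed call, so alternative implementations can override the crash path. The indirection itself is a plain function-pointer table; a compressed sketch with simplified stand-in types and names:

#include <stdio.h>

struct pt_regs_sketch { int dummy; };

/* Simplified ops table: only the hook this hunk adds. */
struct machine_ops_sketch {
	void (*crash_shutdown)(struct pt_regs_sketch *regs);
};

static void native_crash_shutdown_sketch(struct pt_regs_sketch *regs)
{
	(void)regs;
	printf("native crash shutdown\n");
}

static struct machine_ops_sketch ops = {
	.crash_shutdown = native_crash_shutdown_sketch,
};

/* Generic entry point: forward to whatever the table provides. */
static void machine_crash_shutdown_sketch(struct pt_regs_sketch *regs)
{
	ops.crash_shutdown(regs);
}

int main(void)
{
	struct pt_regs_sketch regs = { 0 };
	machine_crash_shutdown_sketch(&regs);
	return 0;
}
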
diff --git a/arch/x86/kernel/setup_32.c b/arch/x86/kernel/setup_32.c
index 455d3c80960b..2283422af794 100644
--- a/arch/x86/kernel/setup_32.c
+++ b/arch/x86/kernel/setup_32.c
@@ -47,6 +47,7 @@
#include <linux/pfn.h>
#include <linux/pci.h>
#include <linux/init_ohci1394_dma.h>
+#include <linux/kvm_para.h>
#include <video/edid.h>
@@ -389,7 +390,6 @@ unsigned long __init find_max_low_pfn(void)
return max_low_pfn;
}
-#define BIOS_EBDA_SEGMENT 0x40E
#define BIOS_LOWMEM_KILOBYTES 0x413
/*
@@ -420,8 +420,7 @@ static void __init reserve_ebda_region(void)
lowmem <<= 10;
/* start of EBDA area */
- ebda_addr = *(unsigned short *)__va(BIOS_EBDA_SEGMENT);
- ebda_addr <<= 4;
+ ebda_addr = get_bios_ebda();
/* Fixup: bios puts an EBDA in the top 64K segment */
/* of conventional memory, but does not adjust lowmem. */
@@ -822,6 +821,10 @@ void __init setup_arch(char **cmdline_p)
max_low_pfn = setup_memory();
+#ifdef CONFIG_KVM_CLOCK
+ kvmclock_init();
+#endif
+
#ifdef CONFIG_VMI
/*
* Must be after max_low_pfn is determined, and before kernel
@@ -829,6 +832,7 @@ void __init setup_arch(char **cmdline_p)
*/
vmi_init();
#endif
+ kvm_guest_init();
/*
* NOTE: before this point _nobody_ is allowed to allocate
diff --git a/arch/x86/kernel/setup_64.c b/arch/x86/kernel/setup_64.c
index c2ec3dcb6b99..a94fb959a87a 100644
--- a/arch/x86/kernel/setup_64.c
+++ b/arch/x86/kernel/setup_64.c
@@ -42,6 +42,7 @@
#include <linux/ctype.h>
#include <linux/uaccess.h>
#include <linux/init_ohci1394_dma.h>
+#include <linux/kvm_para.h>
#include <asm/mtrr.h>
#include <asm/uaccess.h>
@@ -116,7 +117,7 @@ extern int root_mountflags;
char __initdata command_line[COMMAND_LINE_SIZE];
-struct resource standard_io_resources[] = {
+static struct resource standard_io_resources[] = {
{ .name = "dma1", .start = 0x00, .end = 0x1f,
.flags = IORESOURCE_BUSY | IORESOURCE_IO },
{ .name = "pic1", .start = 0x20, .end = 0x21,
@@ -190,6 +191,7 @@ contig_initmem_init(unsigned long start_pfn, unsigned long end_pfn)
bootmap_size = init_bootmem(bootmap >> PAGE_SHIFT, end_pfn);
e820_register_active_regions(0, start_pfn, end_pfn);
free_bootmem_with_active_regions(0, end_pfn);
+ early_res_to_bootmem(0, end_pfn<<PAGE_SHIFT);
reserve_bootmem(bootmap, bootmap_size, BOOTMEM_DEFAULT);
}
#endif
@@ -264,6 +266,28 @@ void __attribute__((weak)) __init memory_setup(void)
machine_specific_memory_setup();
}
+static void __init parse_setup_data(void)
+{
+ struct setup_data *data;
+ unsigned long pa_data;
+
+ if (boot_params.hdr.version < 0x0209)
+ return;
+ pa_data = boot_params.hdr.setup_data;
+ while (pa_data) {
+ data = early_ioremap(pa_data, PAGE_SIZE);
+ switch (data->type) {
+ default:
+ break;
+ }
+#ifndef CONFIG_DEBUG_BOOT_PARAMS
+ free_early(pa_data, pa_data+sizeof(*data)+data->len);
+#endif
+ pa_data = data->next;
+ early_iounmap(data, PAGE_SIZE);
+ }
+}
+
/*
* setup_arch - architecture-specific boot-time initializations
*
@@ -316,6 +340,8 @@ void __init setup_arch(char **cmdline_p)
strlcpy(command_line, boot_command_line, COMMAND_LINE_SIZE);
*cmdline_p = command_line;
+ parse_setup_data();
+
parse_early_param();
#ifdef CONFIG_PROVIDE_OHCI1394_DMA_INIT
@@ -359,6 +385,10 @@ void __init setup_arch(char **cmdline_p)
io_delay_init();
+#ifdef CONFIG_KVM_CLOCK
+ kvmclock_init();
+#endif
+
#ifdef CONFIG_SMP
/* setup to use the early static init tables during kernel startup */
x86_cpu_to_apicid_early_ptr = (void *)x86_cpu_to_apicid_init;
@@ -397,8 +427,6 @@ void __init setup_arch(char **cmdline_p)
contig_initmem_init(0, end_pfn);
#endif
- early_res_to_bootmem();
-
dma32_reserve_bootmem();
#ifdef CONFIG_ACPI_SLEEP
@@ -465,6 +493,8 @@ void __init setup_arch(char **cmdline_p)
init_apic_mappings();
ioapic_init_mappings();
+ kvm_guest_init();
+
/*
* We trust e820 completely. No explicit ROM probing in memory.
*/
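
Editor's note: parse_setup_data() above walks a singly linked chain of variable-length records, each giving the physical address of the next record, a type, and a payload length; every record is temporarily mapped with early_ioremap() and then unmapped again. A self-contained sketch of the same walk, using ordinary pointers in place of the early mapping helpers (the field order mirrors the boot protocol's struct setup_data):

#include <stdint.h>
#include <stdio.h>

struct setup_data_sketch {
	uint64_t next;   /* physical address of the next record, 0 ends the chain */
	uint32_t type;
	uint32_t len;
	/* payload of 'len' bytes follows in the real layout */
};

static void walk_setup_data(const struct setup_data_sketch *first)
{
	const struct setup_data_sketch *sd = first;

	while (sd) {
		printf("setup_data type=%u len=%u\n", sd->type, sd->len);
		/* the kernel maps and unmaps each record here instead */
		sd = (const struct setup_data_sketch *)(uintptr_t)sd->next;
	}
}

int main(void)
{
	struct setup_data_sketch second = { .next = 0, .type = 2, .len = 16 };
	struct setup_data_sketch first  = { .next = (uint64_t)(uintptr_t)&second,
					    .type = 1, .len = 8 };

	walk_setup_data(&first);
	return 0;
}
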
diff --git a/arch/x86/kernel/signal_32.c b/arch/x86/kernel/signal_32.c
index f1b117930837..8e05e7f7bd40 100644
--- a/arch/x86/kernel/signal_32.c
+++ b/arch/x86/kernel/signal_32.c
@@ -413,16 +413,6 @@ setup_frame(int sig, struct k_sigaction *ka, sigset_t *set,
regs->ss = __USER_DS;
regs->cs = __USER_CS;
- /*
- * Clear TF when entering the signal handler, but
- * notify any tracer that was single-stepping it.
- * The tracer may want to single-step inside the
- * handler too.
- */
- regs->flags &= ~(X86_EFLAGS_TF | X86_EFLAGS_DF);
- if (test_thread_flag(TIF_SINGLESTEP))
- ptrace_notify(SIGTRAP);
-
return 0;
give_sigsegv:
@@ -501,16 +491,6 @@ static int setup_rt_frame(int sig, struct k_sigaction *ka, siginfo_t *info,
regs->ss = __USER_DS;
regs->cs = __USER_CS;
- /*
- * Clear TF when entering the signal handler, but
- * notify any tracer that was single-stepping it.
- * The tracer may want to single-step inside the
- * handler too.
- */
- regs->flags &= ~(X86_EFLAGS_TF | X86_EFLAGS_DF);
- if (test_thread_flag(TIF_SINGLESTEP))
- ptrace_notify(SIGTRAP);
-
return 0;
give_sigsegv:
@@ -566,6 +546,21 @@ handle_signal(unsigned long sig, siginfo_t *info, struct k_sigaction *ka,
if (ret)
return ret;
+ /*
+ * Clear the direction flag as per the ABI for function entry.
+ */
+ regs->flags &= ~X86_EFLAGS_DF;
+
+ /*
+ * Clear TF when entering the signal handler, but
+ * notify any tracer that was single-stepping it.
+ * The tracer may want to single-step inside the
+ * handler too.
+ */
+ regs->flags &= ~X86_EFLAGS_TF;
+ if (test_thread_flag(TIF_SINGLESTEP))
+ ptrace_notify(SIGTRAP);
+
spin_lock_irq(&current->sighand->siglock);
sigorsets(&current->blocked, &current->blocked, &ka->sa.sa_mask);
if (!(ka->sa.sa_flags & SA_NODEFER))
diff --git a/arch/x86/kernel/signal_64.c b/arch/x86/kernel/signal_64.c
index 827179c5b32a..ccb2a4560c2d 100644
--- a/arch/x86/kernel/signal_64.c
+++ b/arch/x86/kernel/signal_64.c
@@ -285,14 +285,6 @@ static int setup_rt_frame(int sig, struct k_sigaction *ka, siginfo_t *info,
even if the handler happens to be interrupting 32-bit code. */
regs->cs = __USER_CS;
- /* This, by contrast, has nothing to do with segment registers -
- see include/asm-x86_64/uaccess.h for details. */
- set_fs(USER_DS);
-
- regs->flags &= ~(X86_EFLAGS_TF | X86_EFLAGS_DF);
- if (test_thread_flag(TIF_SINGLESTEP))
- ptrace_notify(SIGTRAP);
-
return 0;
give_sigsegv:
@@ -380,6 +372,28 @@ handle_signal(unsigned long sig, siginfo_t *info, struct k_sigaction *ka,
ret = setup_rt_frame(sig, ka, info, oldset, regs);
if (ret == 0) {
+ /*
+ * This has nothing to do with segment registers,
+ * despite the name. This magic affects uaccess.h
+ * macros' behavior. Reset it to the normal setting.
+ */
+ set_fs(USER_DS);
+
+ /*
+ * Clear the direction flag as per the ABI for function entry.
+ */
+ regs->flags &= ~X86_EFLAGS_DF;
+
+ /*
+ * Clear TF when entering the signal handler, but
+ * notify any tracer that was single-stepping it.
+ * The tracer may want to single-step inside the
+ * handler too.
+ */
+ regs->flags &= ~X86_EFLAGS_TF;
+ if (test_thread_flag(TIF_SINGLESTEP))
+ ptrace_notify(SIGTRAP);
+
spin_lock_irq(&current->sighand->siglock);
sigorsets(&current->blocked,&current->blocked,&ka->sa.sa_mask);
if (!(ka->sa.sa_flags & SA_NODEFER))
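
Editor's note: both signal paths now sanitize the flags register in handle_signal() instead of in the frame-setup helpers: DF is cleared because the ABI requires the direction flag to be clear on function entry, and TF is cleared so the handler itself is not single-stepped, with any tracer notified instead. A minimal sketch of that masking (the flag values are the architectural bit positions; the helper name is made up):

#include <stdio.h>
#include <stdint.h>

#define X86_EFLAGS_TF 0x00000100UL  /* trap flag: single-step after each insn */
#define X86_EFLAGS_DF 0x00000400UL  /* direction flag: string ops decrement */

/* Illustrative stand-in for the flag handling handle_signal() now does. */
static unsigned long flags_for_signal_entry(unsigned long flags)
{
	flags &= ~X86_EFLAGS_DF;  /* ABI: DF must be clear at function entry */
	flags &= ~X86_EFLAGS_TF;  /* don't trap into the handler; tell the tracer */
	return flags;
}

int main(void)
{
	unsigned long before = 0x00010502UL;  /* arbitrary example value */
	printf("0x%lx -> 0x%lx\n", before, flags_for_signal_entry(before));
	return 0;
}
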
diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c
index ade371f9663a..04c662ba18f1 100644
--- a/arch/x86/kernel/smpboot.c
+++ b/arch/x86/kernel/smpboot.c
@@ -1039,8 +1039,8 @@ int __cpuinit native_cpu_up(unsigned int cpu)
#ifdef CONFIG_X86_32
/* init low mem mapping */
- clone_pgd_range(swapper_pg_dir, swapper_pg_dir + USER_PGD_PTRS,
- min_t(unsigned long, KERNEL_PGD_PTRS, USER_PGD_PTRS));
+ clone_pgd_range(swapper_pg_dir, swapper_pg_dir + KERNEL_PGD_BOUNDARY,
+ min_t(unsigned long, KERNEL_PGD_PTRS, KERNEL_PGD_BOUNDARY));
flush_tlb_all();
#endif
@@ -1058,7 +1058,7 @@ int __cpuinit native_cpu_up(unsigned int cpu)
check_tsc_sync_source(cpu);
local_irq_restore(flags);
- while (!cpu_isset(cpu, cpu_online_map)) {
+ while (!cpu_online(cpu)) {
cpu_relax();
touch_nmi_watchdog();
}
@@ -1168,7 +1168,7 @@ static void __init smp_cpu_index_default(void)
int i;
struct cpuinfo_x86 *c;
- for_each_cpu_mask(i, cpu_possible_map) {
+ for_each_possible_cpu(i) {
c = &cpu_data(i);
/* mark all to hotplug */
c->cpu_index = NR_CPUS;
diff --git a/arch/x86/kernel/summit_32.c b/arch/x86/kernel/summit_32.c
index 6878a9c2df5d..ae751094eba9 100644
--- a/arch/x86/kernel/summit_32.c
+++ b/arch/x86/kernel/summit_32.c
@@ -29,6 +29,7 @@
#include <linux/mm.h>
#include <linux/init.h>
#include <asm/io.h>
+#include <asm/bios_ebda.h>
#include <asm/mach-summit/mach_mpparse.h>
static struct rio_table_hdr *rio_table_hdr __initdata;
@@ -140,8 +141,8 @@ void __init setup_summit(void)
int i, next_wpeg, next_bus = 0;
/* The pointer to the EBDA is stored in the word @ phys 0x40E(40:0E) */
- ptr = *(unsigned short *)phys_to_virt(0x40Eul);
- ptr = (unsigned long)phys_to_virt(ptr << 4);
+ ptr = get_bios_ebda();
+ ptr = (unsigned long)phys_to_virt(ptr);
rio_table_hdr = NULL;
offset = 0x180;
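
Editor's note: the setup_32.c and summit_32.c hunks both replace an open-coded EBDA lookup with get_bios_ebda(); the removed lines show what the helper encapsulates: the BIOS data area word at physical 0x40E holds the EBDA base as a real-mode segment, which becomes a physical address when shifted left by four. A tiny sketch of that conversion (the function name here is illustrative):

#include <stdio.h>
#include <stdint.h>

/* The word at physical 0x40E is a real-mode segment; segment << 4 is the
 * physical base of the Extended BIOS Data Area. */
static uint32_t ebda_base_from_segment(uint16_t segment)
{
	return (uint32_t)segment << 4;
}

int main(void)
{
	uint16_t segment = 0x9fc0;  /* a value a BIOS might plausibly store */
	printf("EBDA base: 0x%05x\n", ebda_base_from_segment(segment));
	return 0;
}
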
diff --git a/arch/x86/kernel/tlb_64.c b/arch/x86/kernel/tlb_64.c
index df224a8774cb..a1f07d793202 100644
--- a/arch/x86/kernel/tlb_64.c
+++ b/arch/x86/kernel/tlb_64.c
@@ -195,9 +195,9 @@ static int __cpuinit init_smp_flush(void)
{
int i;
- for_each_cpu_mask(i, cpu_possible_map) {
+ for_each_possible_cpu(i)
spin_lock_init(&per_cpu(flush_state, i).tlbstate_lock);
- }
+
return 0;
}
core_initcall(init_smp_flush);
diff --git a/arch/x86/kernel/trampoline_32.S b/arch/x86/kernel/trampoline_32.S
index 64580679861e..d8ccc3c6552f 100644
--- a/arch/x86/kernel/trampoline_32.S
+++ b/arch/x86/kernel/trampoline_32.S
@@ -33,7 +33,7 @@
/* We can free up trampoline after bootup if cpu hotplug is not supported. */
#ifndef CONFIG_HOTPLUG_CPU
-.section ".init.data","aw",@progbits
+.section ".cpuinit.data","aw",@progbits
#else
.section .rodata,"a",@progbits
#endif
diff --git a/arch/x86/kernel/traps_32.c b/arch/x86/kernel/traps_32.c
index 471e694d6713..bde6f63e15d5 100644
--- a/arch/x86/kernel/traps_32.c
+++ b/arch/x86/kernel/traps_32.c
@@ -602,7 +602,7 @@ DO_ERROR(10, SIGSEGV, "invalid TSS", invalid_TSS)
DO_ERROR(11, SIGBUS, "segment not present", segment_not_present)
DO_ERROR(12, SIGBUS, "stack segment", stack_segment)
DO_ERROR_INFO(17, SIGBUS, "alignment check", alignment_check, BUS_ADRALN, 0, 0)
-DO_ERROR_INFO(32, SIGSEGV, "iret exception", iret_error, ILL_BADSTK, 0, 1)
+DO_ERROR_INFO(32, SIGILL, "iret exception", iret_error, ILL_BADSTK, 0, 1)
void __kprobes do_general_protection(struct pt_regs *regs, long error_code)
{
diff --git a/arch/x86/kernel/vmi_32.c b/arch/x86/kernel/vmi_32.c
index 12affe1f9bce..956f38927aa7 100644
--- a/arch/x86/kernel/vmi_32.c
+++ b/arch/x86/kernel/vmi_32.c
@@ -320,7 +320,7 @@ static void check_zeroed_page(u32 pfn, int type, struct page *page)
* pdes need to be zeroed.
*/
if (type & VMI_PAGE_CLONE)
- limit = USER_PTRS_PER_PGD;
+ limit = KERNEL_PGD_BOUNDARY;
for (i = 0; i < limit; i++)
BUG_ON(ptr[i]);
}
@@ -392,13 +392,13 @@ static void *vmi_kmap_atomic_pte(struct page *page, enum km_type type)
}
#endif
-static void vmi_allocate_pt(struct mm_struct *mm, u32 pfn)
+static void vmi_allocate_pte(struct mm_struct *mm, u32 pfn)
{
vmi_set_page_type(pfn, VMI_PAGE_L1);
vmi_ops.allocate_page(pfn, VMI_PAGE_L1, 0, 0, 0);
}
-static void vmi_allocate_pd(struct mm_struct *mm, u32 pfn)
+static void vmi_allocate_pmd(struct mm_struct *mm, u32 pfn)
{
/*
* This call comes in very early, before mem_map is setup.
@@ -409,20 +409,20 @@ static void vmi_allocate_pd(struct mm_struct *mm, u32 pfn)
vmi_ops.allocate_page(pfn, VMI_PAGE_L2, 0, 0, 0);
}
-static void vmi_allocate_pd_clone(u32 pfn, u32 clonepfn, u32 start, u32 count)
+static void vmi_allocate_pmd_clone(u32 pfn, u32 clonepfn, u32 start, u32 count)
{
vmi_set_page_type(pfn, VMI_PAGE_L2 | VMI_PAGE_CLONE);
vmi_check_page_type(clonepfn, VMI_PAGE_L2);
vmi_ops.allocate_page(pfn, VMI_PAGE_L2 | VMI_PAGE_CLONE, clonepfn, start, count);
}
-static void vmi_release_pt(u32 pfn)
+static void vmi_release_pte(u32 pfn)
{
vmi_ops.release_page(pfn, VMI_PAGE_L1);
vmi_set_page_type(pfn, VMI_PAGE_NORMAL);
}
-static void vmi_release_pd(u32 pfn)
+static void vmi_release_pmd(u32 pfn)
{
vmi_ops.release_page(pfn, VMI_PAGE_L2);
vmi_set_page_type(pfn, VMI_PAGE_NORMAL);
@@ -871,15 +871,15 @@ static inline int __init activate_vmi(void)
vmi_ops.allocate_page = vmi_get_function(VMI_CALL_AllocatePage);
if (vmi_ops.allocate_page) {
- pv_mmu_ops.alloc_pt = vmi_allocate_pt;
- pv_mmu_ops.alloc_pd = vmi_allocate_pd;
- pv_mmu_ops.alloc_pd_clone = vmi_allocate_pd_clone;
+ pv_mmu_ops.alloc_pte = vmi_allocate_pte;
+ pv_mmu_ops.alloc_pmd = vmi_allocate_pmd;
+ pv_mmu_ops.alloc_pmd_clone = vmi_allocate_pmd_clone;
}
vmi_ops.release_page = vmi_get_function(VMI_CALL_ReleasePage);
if (vmi_ops.release_page) {
- pv_mmu_ops.release_pt = vmi_release_pt;
- pv_mmu_ops.release_pd = vmi_release_pd;
+ pv_mmu_ops.release_pte = vmi_release_pte;
+ pv_mmu_ops.release_pmd = vmi_release_pmd;
}
/* Set linear is needed in all cases */
diff --git a/arch/x86/kernel/vmlinux_64.lds.S b/arch/x86/kernel/vmlinux_64.lds.S
index b7ab3c335fae..fad3674b06a5 100644
--- a/arch/x86/kernel/vmlinux_64.lds.S
+++ b/arch/x86/kernel/vmlinux_64.lds.S
@@ -209,12 +209,6 @@ SECTIONS
EXIT_DATA
}
-/* vdso blob that is mapped into user space */
- vdso_start = . ;
- .vdso : AT(ADDR(.vdso) - LOAD_OFFSET) { *(.vdso) }
- . = ALIGN(PAGE_SIZE);
- vdso_end = .;
-
#ifdef CONFIG_BLK_DEV_INITRD
. = ALIGN(PAGE_SIZE);
__initramfs_start = .;