summaryrefslogtreecommitdiff
path: root/arch/x86/kernel
diff options
context:
space:
mode:
Diffstat (limited to 'arch/x86/kernel')
-rw-r--r--arch/x86/kernel/Makefile5
-rw-r--r--arch/x86/kernel/acpi/boot.c73
-rw-r--r--arch/x86/kernel/acpi/wakeup_64.S6
-rw-r--r--arch/x86/kernel/alternative.c9
-rw-r--r--arch/x86/kernel/apb_timer.c4
-rw-r--r--arch/x86/kernel/apic/htirq.c173
-rw-r--r--arch/x86/kernel/apic/io_apic.c1303
-rw-r--r--arch/x86/kernel/apic/msi.c417
-rw-r--r--arch/x86/kernel/apic/vector.c448
-rw-r--r--arch/x86/kernel/apic/x2apic_phys.c2
-rw-r--r--arch/x86/kernel/asm-offsets.c2
-rw-r--r--arch/x86/kernel/asm-offsets_64.c2
-rw-r--r--arch/x86/kernel/cpu/amd.c6
-rw-r--r--arch/x86/kernel/cpu/common.c16
-rw-r--r--arch/x86/kernel/cpu/intel_cacheinfo.c8
-rw-r--r--arch/x86/kernel/cpu/mcheck/mce.c50
-rw-r--r--arch/x86/kernel/cpu/mcheck/mce_amd.c141
-rw-r--r--arch/x86/kernel/cpu/mcheck/mce_intel.c44
-rw-r--r--arch/x86/kernel/cpu/mshyperv.c6
-rw-r--r--arch/x86/kernel/cpu/mtrr/cleanup.c3
-rw-r--r--arch/x86/kernel/cpu/mtrr/generic.c209
-rw-r--r--arch/x86/kernel/cpu/mtrr/main.c48
-rw-r--r--arch/x86/kernel/cpu/mtrr/mtrr.h2
-rw-r--r--arch/x86/kernel/crash.c1
-rw-r--r--arch/x86/kernel/devicetree.c41
-rw-r--r--arch/x86/kernel/early-quirks.c8
-rw-r--r--arch/x86/kernel/entry_32.S1401
-rw-r--r--arch/x86/kernel/entry_64.S1653
-rw-r--r--arch/x86/kernel/head_32.S4
-rw-r--r--arch/x86/kernel/head_64.S4
-rw-r--r--arch/x86/kernel/hpet.c50
-rw-r--r--arch/x86/kernel/i8259.c8
-rw-r--r--arch/x86/kernel/irq.c62
-rw-r--r--arch/x86/kernel/irq_32.c6
-rw-r--r--arch/x86/kernel/irq_64.c6
-rw-r--r--arch/x86/kernel/irq_work.c10
-rw-r--r--arch/x86/kernel/irqinit.c10
-rw-r--r--arch/x86/kernel/machine_kexec_64.c1
-rw-r--r--arch/x86/kernel/mpparse.c7
-rw-r--r--arch/x86/kernel/paravirt.c4
-rw-r--r--arch/x86/kernel/paravirt_patch_64.c1
-rw-r--r--arch/x86/kernel/process_32.c5
-rw-r--r--arch/x86/kernel/process_64.c3
-rw-r--r--arch/x86/kernel/setup.c3
-rw-r--r--arch/x86/kernel/smp.c19
-rw-r--r--arch/x86/kernel/smpboot.c43
-rw-r--r--arch/x86/kernel/syscall_32.c33
-rw-r--r--arch/x86/kernel/syscall_64.c32
-rw-r--r--arch/x86/kernel/traps.c21
-rw-r--r--arch/x86/kernel/vsyscall_64.c335
-rw-r--r--arch/x86/kernel/vsyscall_emu_64.S37
-rw-r--r--arch/x86/kernel/vsyscall_gtod.c70
-rw-r--r--arch/x86/kernel/vsyscall_trace.h29
-rw-r--r--arch/x86/kernel/x86_init.c9
54 files changed, 1843 insertions, 5050 deletions
diff --git a/arch/x86/kernel/Makefile b/arch/x86/kernel/Makefile
index febaf180621b..0f15af41bd80 100644
--- a/arch/x86/kernel/Makefile
+++ b/arch/x86/kernel/Makefile
@@ -22,7 +22,7 @@ KASAN_SANITIZE_dumpstack_$(BITS).o := n
CFLAGS_irq.o := -I$(src)/../include/asm/trace
-obj-y := process_$(BITS).o signal.o entry_$(BITS).o
+obj-y := process_$(BITS).o signal.o
obj-y += traps.o irq.o irq_$(BITS).o dumpstack_$(BITS).o
obj-y += time.o ioport.o ldt.o dumpstack.o nmi.o
obj-y += setup.o x86_init.o i8259.o irqinit.o jump_label.o
@@ -31,9 +31,6 @@ obj-y += probe_roms.o
obj-$(CONFIG_X86_32) += i386_ksyms_32.o
obj-$(CONFIG_X86_64) += sys_x86_64.o x8664_ksyms_64.o
obj-$(CONFIG_X86_64) += mcount_64.o
-obj-y += syscall_$(BITS).o vsyscall_gtod.o
-obj-$(CONFIG_IA32_EMULATION) += syscall_32.o
-obj-$(CONFIG_X86_VSYSCALL_EMULATION) += vsyscall_64.o vsyscall_emu_64.o
obj-$(CONFIG_X86_ESPFIX64) += espfix_64.o
obj-$(CONFIG_SYSFS) += ksysfs.o
obj-y += bootflag.o e820.o
diff --git a/arch/x86/kernel/acpi/boot.c b/arch/x86/kernel/acpi/boot.c
index dbe76a14c3c9..e49ee24da85e 100644
--- a/arch/x86/kernel/acpi/boot.c
+++ b/arch/x86/kernel/acpi/boot.c
@@ -31,12 +31,12 @@
#include <linux/module.h>
#include <linux/dmi.h>
#include <linux/irq.h>
-#include <linux/irqdomain.h>
#include <linux/slab.h>
#include <linux/bootmem.h>
#include <linux/ioport.h>
#include <linux/pci.h>
+#include <asm/irqdomain.h>
#include <asm/pci_x86.h>
#include <asm/pgtable.h>
#include <asm/io_apic.h>
@@ -400,57 +400,13 @@ static int mp_config_acpi_gsi(struct device *dev, u32 gsi, int trigger,
return 0;
}
-static int mp_register_gsi(struct device *dev, u32 gsi, int trigger,
- int polarity)
-{
- int irq, node;
-
- if (acpi_irq_model != ACPI_IRQ_MODEL_IOAPIC)
- return gsi;
-
- trigger = trigger == ACPI_EDGE_SENSITIVE ? 0 : 1;
- polarity = polarity == ACPI_ACTIVE_HIGH ? 0 : 1;
- node = dev ? dev_to_node(dev) : NUMA_NO_NODE;
- if (mp_set_gsi_attr(gsi, trigger, polarity, node)) {
- pr_warn("Failed to set pin attr for GSI%d\n", gsi);
- return -1;
- }
-
- irq = mp_map_gsi_to_irq(gsi, IOAPIC_MAP_ALLOC);
- if (irq < 0)
- return irq;
-
- /* Don't set up the ACPI SCI because it's already set up */
- if (enable_update_mptable && acpi_gbl_FADT.sci_interrupt != gsi)
- mp_config_acpi_gsi(dev, gsi, trigger, polarity);
-
- return irq;
-}
-
-static void mp_unregister_gsi(u32 gsi)
-{
- int irq;
-
- if (acpi_irq_model != ACPI_IRQ_MODEL_IOAPIC)
- return;
-
- irq = mp_map_gsi_to_irq(gsi, 0);
- if (irq > 0)
- mp_unmap_irq(irq);
-}
-
-static struct irq_domain_ops acpi_irqdomain_ops = {
- .map = mp_irqdomain_map,
- .unmap = mp_irqdomain_unmap,
-};
-
static int __init
acpi_parse_ioapic(struct acpi_subtable_header * header, const unsigned long end)
{
struct acpi_madt_io_apic *ioapic = NULL;
struct ioapic_domain_cfg cfg = {
.type = IOAPIC_DOMAIN_DYNAMIC,
- .ops = &acpi_irqdomain_ops,
+ .ops = &mp_ioapic_irqdomain_ops,
};
ioapic = (struct acpi_madt_io_apic *)header;
@@ -652,7 +608,7 @@ static int acpi_register_gsi_pic(struct device *dev, u32 gsi,
* Make sure all (legacy) PCI IRQs are set as level-triggered.
*/
if (trigger == ACPI_LEVEL_SENSITIVE)
- eisa_set_level_irq(gsi);
+ elcr_set_level_irq(gsi);
#endif
return gsi;
@@ -663,10 +619,21 @@ static int acpi_register_gsi_ioapic(struct device *dev, u32 gsi,
int trigger, int polarity)
{
int irq = gsi;
-
#ifdef CONFIG_X86_IO_APIC
+ int node;
+ struct irq_alloc_info info;
+
+ node = dev ? dev_to_node(dev) : NUMA_NO_NODE;
+ trigger = trigger == ACPI_EDGE_SENSITIVE ? 0 : 1;
+ polarity = polarity == ACPI_ACTIVE_HIGH ? 0 : 1;
+ ioapic_set_alloc_attr(&info, node, trigger, polarity);
+
mutex_lock(&acpi_ioapic_lock);
- irq = mp_register_gsi(dev, gsi, trigger, polarity);
+ irq = mp_map_gsi_to_irq(gsi, IOAPIC_MAP_ALLOC, &info);
+ /* Don't set up the ACPI SCI because it's already set up */
+ if (irq >= 0 && enable_update_mptable &&
+ acpi_gbl_FADT.sci_interrupt != gsi)
+ mp_config_acpi_gsi(dev, gsi, trigger, polarity);
mutex_unlock(&acpi_ioapic_lock);
#endif
@@ -676,8 +643,12 @@ static int acpi_register_gsi_ioapic(struct device *dev, u32 gsi,
static void acpi_unregister_gsi_ioapic(u32 gsi)
{
#ifdef CONFIG_X86_IO_APIC
+ int irq;
+
mutex_lock(&acpi_ioapic_lock);
- mp_unregister_gsi(gsi);
+ irq = mp_map_gsi_to_irq(gsi, 0, NULL);
+ if (irq > 0)
+ mp_unmap_irq(irq);
mutex_unlock(&acpi_ioapic_lock);
#endif
}
@@ -786,7 +757,7 @@ int acpi_register_ioapic(acpi_handle handle, u64 phys_addr, u32 gsi_base)
u64 addr;
struct ioapic_domain_cfg cfg = {
.type = IOAPIC_DOMAIN_DYNAMIC,
- .ops = &acpi_irqdomain_ops,
+ .ops = &mp_ioapic_irqdomain_ops,
};
ioapic_id = acpi_get_ioapic_id(handle, gsi_base, &addr);
diff --git a/arch/x86/kernel/acpi/wakeup_64.S b/arch/x86/kernel/acpi/wakeup_64.S
index ae693b51ed8e..8c35df468104 100644
--- a/arch/x86/kernel/acpi/wakeup_64.S
+++ b/arch/x86/kernel/acpi/wakeup_64.S
@@ -62,7 +62,7 @@ ENTRY(do_suspend_lowlevel)
pushfq
popq pt_regs_flags(%rax)
- movq $resume_point, saved_rip(%rip)
+ movq $.Lresume_point, saved_rip(%rip)
movq %rsp, saved_rsp
movq %rbp, saved_rbp
@@ -75,10 +75,10 @@ ENTRY(do_suspend_lowlevel)
xorl %eax, %eax
call x86_acpi_enter_sleep_state
/* in case something went wrong, restore the machine status and go on */
- jmp resume_point
+ jmp .Lresume_point
.align 4
-resume_point:
+.Lresume_point:
/* We don't restore %rax, it must be 0 anyway */
movq $saved_context, %rax
movq saved_context_cr4(%rax), %rbx
diff --git a/arch/x86/kernel/alternative.c b/arch/x86/kernel/alternative.c
index 7fe097235376..c42827eb86cf 100644
--- a/arch/x86/kernel/alternative.c
+++ b/arch/x86/kernel/alternative.c
@@ -231,6 +231,15 @@ void __init arch_init_ideal_nops(void)
#endif
}
break;
+
+ case X86_VENDOR_AMD:
+ if (boot_cpu_data.x86 > 0xf) {
+ ideal_nops = p6_nops;
+ return;
+ }
+
+ /* fall through */
+
default:
#ifdef CONFIG_X86_64
ideal_nops = k8_nops;
diff --git a/arch/x86/kernel/apb_timer.c b/arch/x86/kernel/apb_timer.c
index 6a7c23ff21d3..ede92c3364d3 100644
--- a/arch/x86/kernel/apb_timer.c
+++ b/arch/x86/kernel/apb_timer.c
@@ -171,10 +171,6 @@ static int __init apbt_clockevent_register(void)
static void apbt_setup_irq(struct apbt_dev *adev)
{
- /* timer0 irq has been setup early */
- if (adev->irq == 0)
- return;
-
irq_modify_status(adev->irq, 0, IRQ_MOVE_PCNTXT);
irq_set_affinity(adev->irq, cpumask_of(adev->cpu));
}
diff --git a/arch/x86/kernel/apic/htirq.c b/arch/x86/kernel/apic/htirq.c
index 816f36e979ad..ae50d3454d78 100644
--- a/arch/x86/kernel/apic/htirq.c
+++ b/arch/x86/kernel/apic/htirq.c
@@ -3,6 +3,8 @@
*
* Copyright (C) 1997, 1998, 1999, 2000, 2009 Ingo Molnar, Hajnalka Szabo
* Moved from arch/x86/kernel/apic/io_apic.c.
+ * Jiang Liu <jiang.liu@linux.intel.com>
+ * Add support of hierarchical irqdomain
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License version 2 as
@@ -14,78 +16,112 @@
#include <linux/device.h>
#include <linux/pci.h>
#include <linux/htirq.h>
+#include <asm/irqdomain.h>
#include <asm/hw_irq.h>
#include <asm/apic.h>
#include <asm/hypertransport.h>
+static struct irq_domain *htirq_domain;
+
/*
* Hypertransport interrupt support
*/
-static void target_ht_irq(unsigned int irq, unsigned int dest, u8 vector)
-{
- struct ht_irq_msg msg;
-
- fetch_ht_irq_msg(irq, &msg);
-
- msg.address_lo &= ~(HT_IRQ_LOW_VECTOR_MASK | HT_IRQ_LOW_DEST_ID_MASK);
- msg.address_hi &= ~(HT_IRQ_HIGH_DEST_ID_MASK);
-
- msg.address_lo |= HT_IRQ_LOW_VECTOR(vector) | HT_IRQ_LOW_DEST_ID(dest);
- msg.address_hi |= HT_IRQ_HIGH_DEST_ID(dest);
-
- write_ht_irq_msg(irq, &msg);
-}
-
static int
ht_set_affinity(struct irq_data *data, const struct cpumask *mask, bool force)
{
- struct irq_cfg *cfg = irqd_cfg(data);
- unsigned int dest;
+ struct irq_data *parent = data->parent_data;
int ret;
- ret = apic_set_affinity(data, mask, &dest);
- if (ret)
- return ret;
-
- target_ht_irq(data->irq, dest, cfg->vector);
- return IRQ_SET_MASK_OK_NOCOPY;
+ ret = parent->chip->irq_set_affinity(parent, mask, force);
+ if (ret >= 0) {
+ struct ht_irq_msg msg;
+ struct irq_cfg *cfg = irqd_cfg(data);
+
+ fetch_ht_irq_msg(data->irq, &msg);
+ msg.address_lo &= ~(HT_IRQ_LOW_VECTOR_MASK |
+ HT_IRQ_LOW_DEST_ID_MASK);
+ msg.address_lo |= HT_IRQ_LOW_VECTOR(cfg->vector) |
+ HT_IRQ_LOW_DEST_ID(cfg->dest_apicid);
+ msg.address_hi &= ~(HT_IRQ_HIGH_DEST_ID_MASK);
+ msg.address_hi |= HT_IRQ_HIGH_DEST_ID(cfg->dest_apicid);
+ write_ht_irq_msg(data->irq, &msg);
+ }
+
+ return ret;
}
static struct irq_chip ht_irq_chip = {
.name = "PCI-HT",
.irq_mask = mask_ht_irq,
.irq_unmask = unmask_ht_irq,
- .irq_ack = apic_ack_edge,
+ .irq_ack = irq_chip_ack_parent,
.irq_set_affinity = ht_set_affinity,
- .irq_retrigger = apic_retrigger_irq,
+ .irq_retrigger = irq_chip_retrigger_hierarchy,
.flags = IRQCHIP_SKIP_SET_WAKE,
};
-int arch_setup_ht_irq(unsigned int irq, struct pci_dev *dev)
+static int htirq_domain_alloc(struct irq_domain *domain, unsigned int virq,
+ unsigned int nr_irqs, void *arg)
{
- struct irq_cfg *cfg;
- struct ht_irq_msg msg;
- unsigned dest;
- int err;
+ struct ht_irq_cfg *ht_cfg;
+ struct irq_alloc_info *info = arg;
+ struct pci_dev *dev;
+ irq_hw_number_t hwirq;
+ int ret;
- if (disable_apic)
- return -ENXIO;
+ if (nr_irqs > 1 || !info)
+ return -EINVAL;
- cfg = irq_cfg(irq);
- err = assign_irq_vector(irq, cfg, apic->target_cpus());
- if (err)
- return err;
+ dev = info->ht_dev;
+ hwirq = (info->ht_idx & 0xFF) |
+ PCI_DEVID(dev->bus->number, dev->devfn) << 8 |
+ (pci_domain_nr(dev->bus) & 0xFFFFFFFF) << 24;
+ if (irq_find_mapping(domain, hwirq) > 0)
+ return -EEXIST;
- err = apic->cpu_mask_to_apicid_and(cfg->domain,
- apic->target_cpus(), &dest);
- if (err)
- return err;
+ ht_cfg = kmalloc(sizeof(*ht_cfg), GFP_KERNEL);
+ if (!ht_cfg)
+ return -ENOMEM;
- msg.address_hi = HT_IRQ_HIGH_DEST_ID(dest);
+ ret = irq_domain_alloc_irqs_parent(domain, virq, nr_irqs, info);
+ if (ret < 0) {
+ kfree(ht_cfg);
+ return ret;
+ }
+
+ /* Initialize msg to a value that will never match the first write. */
+ ht_cfg->msg.address_lo = 0xffffffff;
+ ht_cfg->msg.address_hi = 0xffffffff;
+ ht_cfg->dev = info->ht_dev;
+ ht_cfg->update = info->ht_update;
+ ht_cfg->pos = info->ht_pos;
+ ht_cfg->idx = 0x10 + (info->ht_idx * 2);
+ irq_domain_set_info(domain, virq, hwirq, &ht_irq_chip, ht_cfg,
+ handle_edge_irq, ht_cfg, "edge");
+
+ return 0;
+}
+
+static void htirq_domain_free(struct irq_domain *domain, unsigned int virq,
+ unsigned int nr_irqs)
+{
+ struct irq_data *irq_data = irq_domain_get_irq_data(domain, virq);
+
+ BUG_ON(nr_irqs != 1);
+ kfree(irq_data->chip_data);
+ irq_domain_free_irqs_top(domain, virq, nr_irqs);
+}
+static void htirq_domain_activate(struct irq_domain *domain,
+ struct irq_data *irq_data)
+{
+ struct ht_irq_msg msg;
+ struct irq_cfg *cfg = irqd_cfg(irq_data);
+
+ msg.address_hi = HT_IRQ_HIGH_DEST_ID(cfg->dest_apicid);
msg.address_lo =
HT_IRQ_LOW_BASE |
- HT_IRQ_LOW_DEST_ID(dest) |
+ HT_IRQ_LOW_DEST_ID(cfg->dest_apicid) |
HT_IRQ_LOW_VECTOR(cfg->vector) |
((apic->irq_dest_mode == 0) ?
HT_IRQ_LOW_DM_PHYSICAL :
@@ -95,13 +131,56 @@ int arch_setup_ht_irq(unsigned int irq, struct pci_dev *dev)
HT_IRQ_LOW_MT_FIXED :
HT_IRQ_LOW_MT_ARBITRATED) |
HT_IRQ_LOW_IRQ_MASKED;
+ write_ht_irq_msg(irq_data->irq, &msg);
+}
- write_ht_irq_msg(irq, &msg);
+static void htirq_domain_deactivate(struct irq_domain *domain,
+ struct irq_data *irq_data)
+{
+ struct ht_irq_msg msg;
- irq_set_chip_and_handler_name(irq, &ht_irq_chip,
- handle_edge_irq, "edge");
+ memset(&msg, 0, sizeof(msg));
+ write_ht_irq_msg(irq_data->irq, &msg);
+}
- dev_dbg(&dev->dev, "irq %d for HT\n", irq);
+static const struct irq_domain_ops htirq_domain_ops = {
+ .alloc = htirq_domain_alloc,
+ .free = htirq_domain_free,
+ .activate = htirq_domain_activate,
+ .deactivate = htirq_domain_deactivate,
+};
- return 0;
+void arch_init_htirq_domain(struct irq_domain *parent)
+{
+ if (disable_apic)
+ return;
+
+ htirq_domain = irq_domain_add_tree(NULL, &htirq_domain_ops, NULL);
+ if (!htirq_domain)
+ pr_warn("failed to initialize irqdomain for HTIRQ.\n");
+ else
+ htirq_domain->parent = parent;
+}
+
+int arch_setup_ht_irq(int idx, int pos, struct pci_dev *dev,
+ ht_irq_update_t *update)
+{
+ struct irq_alloc_info info;
+
+ if (!htirq_domain)
+ return -ENOSYS;
+
+ init_irq_alloc_info(&info, NULL);
+ info.ht_idx = idx;
+ info.ht_pos = pos;
+ info.ht_dev = dev;
+ info.ht_update = update;
+
+ return irq_domain_alloc_irqs(htirq_domain, 1, dev_to_node(&dev->dev),
+ &info);
+}
+
+void arch_teardown_ht_irq(unsigned int irq)
+{
+ irq_domain_free_irqs(irq, 1);
}
diff --git a/arch/x86/kernel/apic/io_apic.c b/arch/x86/kernel/apic/io_apic.c
index f4dc2462a1ac..845dc0df2002 100644
--- a/arch/x86/kernel/apic/io_apic.c
+++ b/arch/x86/kernel/apic/io_apic.c
@@ -18,6 +18,16 @@
* and Rolf G. Tews
* for testing these extensively
* Paul Diefenbaugh : Added full ACPI support
+ *
+ * Historical information which is worth to be preserved:
+ *
+ * - SiS APIC rmw bug:
+ *
+ * We used to have a workaround for a bug in SiS chips which
+ * required to rewrite the index register for a read-modify-write
+ * operation as the chip lost the index information which was
+ * setup for the read already. We cache the data now, so that
+ * workaround has been removed.
*/
#include <linux/mm.h>
@@ -31,13 +41,13 @@
#include <linux/acpi.h>
#include <linux/module.h>
#include <linux/syscore_ops.h>
-#include <linux/irqdomain.h>
#include <linux/freezer.h>
#include <linux/kthread.h>
#include <linux/jiffies.h> /* time_after() */
#include <linux/slab.h>
#include <linux/bootmem.h>
+#include <asm/irqdomain.h>
#include <asm/idle.h>
#include <asm/io.h>
#include <asm/smp.h>
@@ -63,27 +73,31 @@
#define for_each_ioapic_pin(idx, pin) \
for_each_ioapic((idx)) \
for_each_pin((idx), (pin))
-
#define for_each_irq_pin(entry, head) \
list_for_each_entry(entry, &head, list)
-/*
- * Is the SiS APIC rmw bug present ?
- * -1 = don't know, 0 = no, 1 = yes
- */
-int sis_apic_bug = -1;
-
static DEFINE_RAW_SPINLOCK(ioapic_lock);
static DEFINE_MUTEX(ioapic_mutex);
static unsigned int ioapic_dynirq_base;
static int ioapic_initialized;
-struct mp_pin_info {
+struct irq_pin_list {
+ struct list_head list;
+ int apic, pin;
+};
+
+struct mp_chip_data {
+ struct list_head irq_2_pin;
+ struct IO_APIC_route_entry entry;
int trigger;
int polarity;
- int node;
- int set;
u32 count;
+ bool isa_irq;
+};
+
+struct mp_ioapic_gsi {
+ u32 gsi_base;
+ u32 gsi_end;
};
static struct ioapic {
@@ -101,7 +115,6 @@ static struct ioapic {
struct mp_ioapic_gsi gsi_config;
struct ioapic_domain_cfg irqdomain_cfg;
struct irq_domain *irqdomain;
- struct mp_pin_info *pin_info;
struct resource *iomem_res;
} ioapics[MAX_IO_APICS];
@@ -117,7 +130,7 @@ unsigned int mpc_ioapic_addr(int ioapic_idx)
return ioapics[ioapic_idx].mp_config.apicaddr;
}
-struct mp_ioapic_gsi *mp_ioapic_gsi_routing(int ioapic_idx)
+static inline struct mp_ioapic_gsi *mp_ioapic_gsi_routing(int ioapic_idx)
{
return &ioapics[ioapic_idx].gsi_config;
}
@@ -129,11 +142,16 @@ static inline int mp_ioapic_pin_count(int ioapic)
return gsi_cfg->gsi_end - gsi_cfg->gsi_base + 1;
}
-u32 mp_pin_to_gsi(int ioapic, int pin)
+static inline u32 mp_pin_to_gsi(int ioapic, int pin)
{
return mp_ioapic_gsi_routing(ioapic)->gsi_base + pin;
}
+static inline bool mp_is_legacy_irq(int irq)
+{
+ return irq >= 0 && irq < nr_legacy_irqs();
+}
+
/*
* Initialize all legacy IRQs and all pins on the first IOAPIC
* if we have legacy interrupt controller. Kernel boot option "pirq="
@@ -144,12 +162,7 @@ static inline int mp_init_irq_at_boot(int ioapic, int irq)
if (!nr_legacy_irqs())
return 0;
- return ioapic == 0 || (irq >= 0 && irq < nr_legacy_irqs());
-}
-
-static inline struct mp_pin_info *mp_pin_info(int ioapic_idx, int pin)
-{
- return ioapics[ioapic_idx].pin_info + pin;
+ return ioapic == 0 || mp_is_legacy_irq(irq);
}
static inline struct irq_domain *mp_ioapic_irqdomain(int ioapic)
@@ -216,16 +229,6 @@ void mp_save_irq(struct mpc_intsrc *m)
panic("Max # of irq sources exceeded!!\n");
}
-struct irq_pin_list {
- struct list_head list;
- int apic, pin;
-};
-
-static struct irq_pin_list *alloc_irq_pin_list(int node)
-{
- return kzalloc_node(sizeof(struct irq_pin_list), GFP_KERNEL, node);
-}
-
static void alloc_ioapic_saved_registers(int idx)
{
size_t size;
@@ -247,8 +250,7 @@ static void free_ioapic_saved_registers(int idx)
int __init arch_early_ioapic_init(void)
{
- struct irq_cfg *cfg;
- int i, node = cpu_to_node(0);
+ int i;
if (!nr_legacy_irqs())
io_apic_irqs = ~0UL;
@@ -256,16 +258,6 @@ int __init arch_early_ioapic_init(void)
for_each_ioapic(i)
alloc_ioapic_saved_registers(i);
- /*
- * For legacy IRQ's, start with assigning irq0 to irq15 to
- * IRQ0_VECTOR to IRQ15_VECTOR for all cpu's.
- */
- for (i = 0; i < nr_legacy_irqs(); i++) {
- cfg = alloc_irq_and_cfg_at(i, node);
- cfg->vector = IRQ0_VECTOR + i;
- cpumask_setall(cfg->domain);
- }
-
return 0;
}
@@ -283,7 +275,7 @@ static __attribute_const__ struct io_apic __iomem *io_apic_base(int idx)
+ (mpc_ioapic_addr(idx) & ~PAGE_MASK);
}
-void io_apic_eoi(unsigned int apic, unsigned int vector)
+static inline void io_apic_eoi(unsigned int apic, unsigned int vector)
{
struct io_apic __iomem *io_apic = io_apic_base(apic);
writel(vector, &io_apic->eoi);
@@ -296,7 +288,8 @@ unsigned int native_io_apic_read(unsigned int apic, unsigned int reg)
return readl(&io_apic->data);
}
-void native_io_apic_write(unsigned int apic, unsigned int reg, unsigned int value)
+static void io_apic_write(unsigned int apic, unsigned int reg,
+ unsigned int value)
{
struct io_apic __iomem *io_apic = io_apic_base(apic);
@@ -304,21 +297,6 @@ void native_io_apic_write(unsigned int apic, unsigned int reg, unsigned int valu
writel(value, &io_apic->data);
}
-/*
- * Re-write a value: to be used for read-modify-write
- * cycles where the read already set up the index register.
- *
- * Older SiS APIC requires we rewrite the index register
- */
-void native_io_apic_modify(unsigned int apic, unsigned int reg, unsigned int value)
-{
- struct io_apic __iomem *io_apic = io_apic_base(apic);
-
- if (sis_apic_bug)
- writel(reg, &io_apic->index);
- writel(value, &io_apic->data);
-}
-
union entry_union {
struct { u32 w1, w2; };
struct IO_APIC_route_entry entry;
@@ -378,7 +356,7 @@ static void ioapic_write_entry(int apic, int pin, struct IO_APIC_route_entry e)
static void ioapic_mask_entry(int apic, int pin)
{
unsigned long flags;
- union entry_union eu = { .entry.mask = 1 };
+ union entry_union eu = { .entry.mask = IOAPIC_MASKED };
raw_spin_lock_irqsave(&ioapic_lock, flags);
io_apic_write(apic, 0x10 + 2*pin, eu.w1);
@@ -391,16 +369,17 @@ static void ioapic_mask_entry(int apic, int pin)
* shared ISA-space IRQs, so we have to support them. We are super
* fast in the common case, and fast for shared ISA-space IRQs.
*/
-static int __add_pin_to_irq_node(struct irq_cfg *cfg, int node, int apic, int pin)
+static int __add_pin_to_irq_node(struct mp_chip_data *data,
+ int node, int apic, int pin)
{
struct irq_pin_list *entry;
/* don't allow duplicates */
- for_each_irq_pin(entry, cfg->irq_2_pin)
+ for_each_irq_pin(entry, data->irq_2_pin)
if (entry->apic == apic && entry->pin == pin)
return 0;
- entry = alloc_irq_pin_list(node);
+ entry = kzalloc_node(sizeof(struct irq_pin_list), GFP_ATOMIC, node);
if (!entry) {
pr_err("can not alloc irq_pin_list (%d,%d,%d)\n",
node, apic, pin);
@@ -408,16 +387,16 @@ static int __add_pin_to_irq_node(struct irq_cfg *cfg, int node, int apic, int pi
}
entry->apic = apic;
entry->pin = pin;
+ list_add_tail(&entry->list, &data->irq_2_pin);
- list_add_tail(&entry->list, &cfg->irq_2_pin);
return 0;
}
-static void __remove_pin_from_irq(struct irq_cfg *cfg, int apic, int pin)
+static void __remove_pin_from_irq(struct mp_chip_data *data, int apic, int pin)
{
struct irq_pin_list *tmp, *entry;
- list_for_each_entry_safe(entry, tmp, &cfg->irq_2_pin, list)
+ list_for_each_entry_safe(entry, tmp, &data->irq_2_pin, list)
if (entry->apic == apic && entry->pin == pin) {
list_del(&entry->list);
kfree(entry);
@@ -425,22 +404,23 @@ static void __remove_pin_from_irq(struct irq_cfg *cfg, int apic, int pin)
}
}
-static void add_pin_to_irq_node(struct irq_cfg *cfg, int node, int apic, int pin)
+static void add_pin_to_irq_node(struct mp_chip_data *data,
+ int node, int apic, int pin)
{
- if (__add_pin_to_irq_node(cfg, node, apic, pin))
+ if (__add_pin_to_irq_node(data, node, apic, pin))
panic("IO-APIC: failed to add irq-pin. Can not proceed\n");
}
/*
* Reroute an IRQ to a different pin.
*/
-static void __init replace_pin_at_irq_node(struct irq_cfg *cfg, int node,
+static void __init replace_pin_at_irq_node(struct mp_chip_data *data, int node,
int oldapic, int oldpin,
int newapic, int newpin)
{
struct irq_pin_list *entry;
- for_each_irq_pin(entry, cfg->irq_2_pin) {
+ for_each_irq_pin(entry, data->irq_2_pin) {
if (entry->apic == oldapic && entry->pin == oldpin) {
entry->apic = newapic;
entry->pin = newpin;
@@ -450,32 +430,26 @@ static void __init replace_pin_at_irq_node(struct irq_cfg *cfg, int node,
}
/* old apic/pin didn't exist, so just add new ones */
- add_pin_to_irq_node(cfg, node, newapic, newpin);
-}
-
-static void __io_apic_modify_irq(struct irq_pin_list *entry,
- int mask_and, int mask_or,
- void (*final)(struct irq_pin_list *entry))
-{
- unsigned int reg, pin;
-
- pin = entry->pin;
- reg = io_apic_read(entry->apic, 0x10 + pin * 2);
- reg &= mask_and;
- reg |= mask_or;
- io_apic_modify(entry->apic, 0x10 + pin * 2, reg);
- if (final)
- final(entry);
+ add_pin_to_irq_node(data, node, newapic, newpin);
}
-static void io_apic_modify_irq(struct irq_cfg *cfg,
+static void io_apic_modify_irq(struct mp_chip_data *data,
int mask_and, int mask_or,
void (*final)(struct irq_pin_list *entry))
{
+ union entry_union eu;
struct irq_pin_list *entry;
- for_each_irq_pin(entry, cfg->irq_2_pin)
- __io_apic_modify_irq(entry, mask_and, mask_or, final);
+ eu.entry = data->entry;
+ eu.w1 &= mask_and;
+ eu.w1 |= mask_or;
+ data->entry = eu.entry;
+
+ for_each_irq_pin(entry, data->irq_2_pin) {
+ io_apic_write(entry->apic, 0x10 + 2 * entry->pin, eu.w1);
+ if (final)
+ final(entry);
+ }
}
static void io_apic_sync(struct irq_pin_list *entry)
@@ -490,39 +464,31 @@ static void io_apic_sync(struct irq_pin_list *entry)
readl(&io_apic->data);
}
-static void mask_ioapic(struct irq_cfg *cfg)
+static void mask_ioapic_irq(struct irq_data *irq_data)
{
+ struct mp_chip_data *data = irq_data->chip_data;
unsigned long flags;
raw_spin_lock_irqsave(&ioapic_lock, flags);
- io_apic_modify_irq(cfg, ~0, IO_APIC_REDIR_MASKED, &io_apic_sync);
+ io_apic_modify_irq(data, ~0, IO_APIC_REDIR_MASKED, &io_apic_sync);
raw_spin_unlock_irqrestore(&ioapic_lock, flags);
}
-static void mask_ioapic_irq(struct irq_data *data)
+static void __unmask_ioapic(struct mp_chip_data *data)
{
- mask_ioapic(irqd_cfg(data));
+ io_apic_modify_irq(data, ~IO_APIC_REDIR_MASKED, 0, NULL);
}
-static void __unmask_ioapic(struct irq_cfg *cfg)
-{
- io_apic_modify_irq(cfg, ~IO_APIC_REDIR_MASKED, 0, NULL);
-}
-
-static void unmask_ioapic(struct irq_cfg *cfg)
+static void unmask_ioapic_irq(struct irq_data *irq_data)
{
+ struct mp_chip_data *data = irq_data->chip_data;
unsigned long flags;
raw_spin_lock_irqsave(&ioapic_lock, flags);
- __unmask_ioapic(cfg);
+ __unmask_ioapic(data);
raw_spin_unlock_irqrestore(&ioapic_lock, flags);
}
-static void unmask_ioapic_irq(struct irq_data *data)
-{
- unmask_ioapic(irqd_cfg(data));
-}
-
/*
* IO-APIC versions below 0x20 don't support EOI register.
* For the record, here is the information about various versions:
@@ -539,7 +505,7 @@ static void unmask_ioapic_irq(struct irq_data *data)
* Otherwise, we simulate the EOI message manually by changing the trigger
* mode to edge and then back to level, with RTE being masked during this.
*/
-void native_eoi_ioapic_pin(int apic, int pin, int vector)
+static void __eoi_ioapic_pin(int apic, int pin, int vector)
{
if (mpc_ioapic_ver(apic) >= 0x20) {
io_apic_eoi(apic, vector);
@@ -551,7 +517,7 @@ void native_eoi_ioapic_pin(int apic, int pin, int vector)
/*
* Mask the entry and change the trigger mode to edge.
*/
- entry1.mask = 1;
+ entry1.mask = IOAPIC_MASKED;
entry1.trigger = IOAPIC_EDGE;
__ioapic_write_entry(apic, pin, entry1);
@@ -563,15 +529,14 @@ void native_eoi_ioapic_pin(int apic, int pin, int vector)
}
}
-void eoi_ioapic_irq(unsigned int irq, struct irq_cfg *cfg)
+void eoi_ioapic_pin(int vector, struct mp_chip_data *data)
{
- struct irq_pin_list *entry;
unsigned long flags;
+ struct irq_pin_list *entry;
raw_spin_lock_irqsave(&ioapic_lock, flags);
- for_each_irq_pin(entry, cfg->irq_2_pin)
- x86_io_apic_ops.eoi_ioapic_pin(entry->apic, entry->pin,
- cfg->vector);
+ for_each_irq_pin(entry, data->irq_2_pin)
+ __eoi_ioapic_pin(entry->apic, entry->pin, vector);
raw_spin_unlock_irqrestore(&ioapic_lock, flags);
}
@@ -588,8 +553,8 @@ static void clear_IO_APIC_pin(unsigned int apic, unsigned int pin)
* Make sure the entry is masked and re-read the contents to check
* if it is a level triggered pin and if the remote-IRR is set.
*/
- if (!entry.mask) {
- entry.mask = 1;
+ if (entry.mask == IOAPIC_UNMASKED) {
+ entry.mask = IOAPIC_MASKED;
ioapic_write_entry(apic, pin, entry);
entry = ioapic_read_entry(apic, pin);
}
@@ -602,13 +567,12 @@ static void clear_IO_APIC_pin(unsigned int apic, unsigned int pin)
* doesn't clear the remote-IRR if the trigger mode is not
* set to level.
*/
- if (!entry.trigger) {
+ if (entry.trigger == IOAPIC_EDGE) {
entry.trigger = IOAPIC_LEVEL;
ioapic_write_entry(apic, pin, entry);
}
-
raw_spin_lock_irqsave(&ioapic_lock, flags);
- x86_io_apic_ops.eoi_ioapic_pin(apic, pin, entry.vector);
+ __eoi_ioapic_pin(apic, pin, entry.vector);
raw_spin_unlock_irqrestore(&ioapic_lock, flags);
}
@@ -706,8 +670,8 @@ void mask_ioapic_entries(void)
struct IO_APIC_route_entry entry;
entry = ioapics[apic].saved_registers[pin];
- if (!entry.mask) {
- entry.mask = 1;
+ if (entry.mask == IOAPIC_UNMASKED) {
+ entry.mask = IOAPIC_MASKED;
ioapic_write_entry(apic, pin, entry);
}
}
@@ -809,11 +773,11 @@ static int EISA_ELCR(unsigned int irq)
#endif
-/* ISA interrupts are always polarity zero edge triggered,
+/* ISA interrupts are always active high edge triggered,
* when listed as conforming in the MP table. */
-#define default_ISA_trigger(idx) (0)
-#define default_ISA_polarity(idx) (0)
+#define default_ISA_trigger(idx) (IOAPIC_EDGE)
+#define default_ISA_polarity(idx) (IOAPIC_POL_HIGH)
/* EISA interrupts are always polarity zero and can be edge or level
* trigger depending on the ELCR value. If an interrupt is listed as
@@ -823,53 +787,55 @@ static int EISA_ELCR(unsigned int irq)
#define default_EISA_trigger(idx) (EISA_ELCR(mp_irqs[idx].srcbusirq))
#define default_EISA_polarity(idx) default_ISA_polarity(idx)
-/* PCI interrupts are always polarity one level triggered,
+/* PCI interrupts are always active low level triggered,
* when listed as conforming in the MP table. */
-#define default_PCI_trigger(idx) (1)
-#define default_PCI_polarity(idx) (1)
+#define default_PCI_trigger(idx) (IOAPIC_LEVEL)
+#define default_PCI_polarity(idx) (IOAPIC_POL_LOW)
static int irq_polarity(int idx)
{
int bus = mp_irqs[idx].srcbus;
- int polarity;
/*
* Determine IRQ line polarity (high active or low active):
*/
- switch (mp_irqs[idx].irqflag & 3)
- {
- case 0: /* conforms, ie. bus-type dependent polarity */
- if (test_bit(bus, mp_bus_not_pci))
- polarity = default_ISA_polarity(idx);
- else
- polarity = default_PCI_polarity(idx);
- break;
- case 1: /* high active */
- {
- polarity = 0;
- break;
- }
- case 2: /* reserved */
- {
- pr_warn("broken BIOS!!\n");
- polarity = 1;
- break;
- }
- case 3: /* low active */
- {
- polarity = 1;
- break;
- }
- default: /* invalid */
- {
- pr_warn("broken BIOS!!\n");
- polarity = 1;
- break;
- }
+ switch (mp_irqs[idx].irqflag & 0x03) {
+ case 0:
+ /* conforms to spec, ie. bus-type dependent polarity */
+ if (test_bit(bus, mp_bus_not_pci))
+ return default_ISA_polarity(idx);
+ else
+ return default_PCI_polarity(idx);
+ case 1:
+ return IOAPIC_POL_HIGH;
+ case 2:
+ pr_warn("IOAPIC: Invalid polarity: 2, defaulting to low\n");
+ case 3:
+ default: /* Pointless default required due to do gcc stupidity */
+ return IOAPIC_POL_LOW;
+ }
+}
+
+#ifdef CONFIG_EISA
+static int eisa_irq_trigger(int idx, int bus, int trigger)
+{
+ switch (mp_bus_id_to_type[bus]) {
+ case MP_BUS_PCI:
+ case MP_BUS_ISA:
+ return trigger;
+ case MP_BUS_EISA:
+ return default_EISA_trigger(idx);
}
- return polarity;
+ pr_warn("IOAPIC: Invalid srcbus: %d defaulting to level\n", bus);
+ return IOAPIC_LEVEL;
}
+#else
+static inline int eisa_irq_trigger(int idx, int bus, int trigger)
+{
+ return trigger;
+}
+#endif
static int irq_trigger(int idx)
{
@@ -879,153 +845,227 @@ static int irq_trigger(int idx)
/*
* Determine IRQ trigger mode (edge or level sensitive):
*/
- switch ((mp_irqs[idx].irqflag>>2) & 3)
- {
- case 0: /* conforms, ie. bus-type dependent */
- if (test_bit(bus, mp_bus_not_pci))
- trigger = default_ISA_trigger(idx);
- else
- trigger = default_PCI_trigger(idx);
-#ifdef CONFIG_EISA
- switch (mp_bus_id_to_type[bus]) {
- case MP_BUS_ISA: /* ISA pin */
- {
- /* set before the switch */
- break;
- }
- case MP_BUS_EISA: /* EISA pin */
- {
- trigger = default_EISA_trigger(idx);
- break;
- }
- case MP_BUS_PCI: /* PCI pin */
- {
- /* set before the switch */
- break;
- }
- default:
- {
- pr_warn("broken BIOS!!\n");
- trigger = 1;
- break;
- }
- }
+ switch ((mp_irqs[idx].irqflag >> 2) & 0x03) {
+ case 0:
+ /* conforms to spec, ie. bus-type dependent trigger mode */
+ if (test_bit(bus, mp_bus_not_pci))
+ trigger = default_ISA_trigger(idx);
+ else
+ trigger = default_PCI_trigger(idx);
+ /* Take EISA into account */
+ return eisa_irq_trigger(idx, bus, trigger);
+ case 1:
+ return IOAPIC_EDGE;
+ case 2:
+ pr_warn("IOAPIC: Invalid trigger mode 2 defaulting to level\n");
+ case 3:
+ default: /* Pointless default required due to do gcc stupidity */
+ return IOAPIC_LEVEL;
+ }
+}
+
+void ioapic_set_alloc_attr(struct irq_alloc_info *info, int node,
+ int trigger, int polarity)
+{
+ init_irq_alloc_info(info, NULL);
+ info->type = X86_IRQ_ALLOC_TYPE_IOAPIC;
+ info->ioapic_node = node;
+ info->ioapic_trigger = trigger;
+ info->ioapic_polarity = polarity;
+ info->ioapic_valid = 1;
+}
+
+#ifndef CONFIG_ACPI
+int acpi_get_override_irq(u32 gsi, int *trigger, int *polarity);
#endif
- break;
- case 1: /* edge */
- {
- trigger = 0;
- break;
- }
- case 2: /* reserved */
- {
- pr_warn("broken BIOS!!\n");
- trigger = 1;
- break;
- }
- case 3: /* level */
- {
- trigger = 1;
- break;
- }
- default: /* invalid */
- {
- pr_warn("broken BIOS!!\n");
- trigger = 0;
- break;
+
+static void ioapic_copy_alloc_attr(struct irq_alloc_info *dst,
+ struct irq_alloc_info *src,
+ u32 gsi, int ioapic_idx, int pin)
+{
+ int trigger, polarity;
+
+ copy_irq_alloc_info(dst, src);
+ dst->type = X86_IRQ_ALLOC_TYPE_IOAPIC;
+ dst->ioapic_id = mpc_ioapic_id(ioapic_idx);
+ dst->ioapic_pin = pin;
+ dst->ioapic_valid = 1;
+ if (src && src->ioapic_valid) {
+ dst->ioapic_node = src->ioapic_node;
+ dst->ioapic_trigger = src->ioapic_trigger;
+ dst->ioapic_polarity = src->ioapic_polarity;
+ } else {
+ dst->ioapic_node = NUMA_NO_NODE;
+ if (acpi_get_override_irq(gsi, &trigger, &polarity) >= 0) {
+ dst->ioapic_trigger = trigger;
+ dst->ioapic_polarity = polarity;
+ } else {
+ /*
+ * PCI interrupts are always active low level
+ * triggered.
+ */
+ dst->ioapic_trigger = IOAPIC_LEVEL;
+ dst->ioapic_polarity = IOAPIC_POL_LOW;
}
}
- return trigger;
}
-static int alloc_irq_from_domain(struct irq_domain *domain, u32 gsi, int pin)
+static int ioapic_alloc_attr_node(struct irq_alloc_info *info)
+{
+ return (info && info->ioapic_valid) ? info->ioapic_node : NUMA_NO_NODE;
+}
+
+static void mp_register_handler(unsigned int irq, unsigned long trigger)
+{
+ irq_flow_handler_t hdl;
+ bool fasteoi;
+
+ if (trigger) {
+ irq_set_status_flags(irq, IRQ_LEVEL);
+ fasteoi = true;
+ } else {
+ irq_clear_status_flags(irq, IRQ_LEVEL);
+ fasteoi = false;
+ }
+
+ hdl = fasteoi ? handle_fasteoi_irq : handle_edge_irq;
+ __irq_set_handler(irq, hdl, 0, fasteoi ? "fasteoi" : "edge");
+}
+
+static bool mp_check_pin_attr(int irq, struct irq_alloc_info *info)
{
+ struct mp_chip_data *data = irq_get_chip_data(irq);
+
+ /*
+ * setup_IO_APIC_irqs() programs all legacy IRQs with default trigger
+ * and polarity attirbutes. So allow the first user to reprogram the
+ * pin with real trigger and polarity attributes.
+ */
+ if (irq < nr_legacy_irqs() && data->count == 1) {
+ if (info->ioapic_trigger != data->trigger)
+ mp_register_handler(irq, data->trigger);
+ data->entry.trigger = data->trigger = info->ioapic_trigger;
+ data->entry.polarity = data->polarity = info->ioapic_polarity;
+ }
+
+ return data->trigger == info->ioapic_trigger &&
+ data->polarity == info->ioapic_polarity;
+}
+
+static int alloc_irq_from_domain(struct irq_domain *domain, int ioapic, u32 gsi,
+ struct irq_alloc_info *info)
+{
+ bool legacy = false;
int irq = -1;
- int ioapic = (int)(long)domain->host_data;
int type = ioapics[ioapic].irqdomain_cfg.type;
switch (type) {
case IOAPIC_DOMAIN_LEGACY:
/*
- * Dynamically allocate IRQ number for non-ISA IRQs in the first 16
- * GSIs on some weird platforms.
+ * Dynamically allocate IRQ number for non-ISA IRQs in the first
+ * 16 GSIs on some weird platforms.
*/
- if (gsi < nr_legacy_irqs())
- irq = irq_create_mapping(domain, pin);
- else if (irq_create_strict_mappings(domain, gsi, pin, 1) == 0)
+ if (!ioapic_initialized || gsi >= nr_legacy_irqs())
irq = gsi;
+ legacy = mp_is_legacy_irq(irq);
break;
case IOAPIC_DOMAIN_STRICT:
- if (irq_create_strict_mappings(domain, gsi, pin, 1) == 0)
- irq = gsi;
+ irq = gsi;
break;
case IOAPIC_DOMAIN_DYNAMIC:
- irq = irq_create_mapping(domain, pin);
break;
default:
WARN(1, "ioapic: unknown irqdomain type %d\n", type);
- break;
+ return -1;
+ }
+
+ return __irq_domain_alloc_irqs(domain, irq, 1,
+ ioapic_alloc_attr_node(info),
+ info, legacy);
+}
+
+/*
+ * Need special handling for ISA IRQs because there may be multiple IOAPIC pins
+ * sharing the same ISA IRQ number and irqdomain only supports 1:1 mapping
+ * between IOAPIC pin and IRQ number. A typical IOAPIC has 24 pins, pin 0-15 are
+ * used for legacy IRQs and pin 16-23 are used for PCI IRQs (PIRQ A-H).
+ * When ACPI is disabled, only legacy IRQ numbers (IRQ0-15) are available, and
+ * some BIOSes may use MP Interrupt Source records to override IRQ numbers for
+ * PIRQs instead of reprogramming the interrupt routing logic. Thus there may be
+ * multiple pins sharing the same legacy IRQ number when ACPI is disabled.
+ */
+static int alloc_isa_irq_from_domain(struct irq_domain *domain,
+ int irq, int ioapic, int pin,
+ struct irq_alloc_info *info)
+{
+ struct mp_chip_data *data;
+ struct irq_data *irq_data = irq_get_irq_data(irq);
+ int node = ioapic_alloc_attr_node(info);
+
+ /*
+ * Legacy ISA IRQ has already been allocated, just add pin to
+ * the pin list assoicated with this IRQ and program the IOAPIC
+ * entry. The IOAPIC entry
+ */
+ if (irq_data && irq_data->parent_data) {
+ if (!mp_check_pin_attr(irq, info))
+ return -EBUSY;
+ if (__add_pin_to_irq_node(irq_data->chip_data, node, ioapic,
+ info->ioapic_pin))
+ return -ENOMEM;
+ } else {
+ irq = __irq_domain_alloc_irqs(domain, irq, 1, node, info, true);
+ if (irq >= 0) {
+ irq_data = irq_domain_get_irq_data(domain, irq);
+ data = irq_data->chip_data;
+ data->isa_irq = true;
+ }
}
- return irq > 0 ? irq : -1;
+ return irq;
}
static int mp_map_pin_to_irq(u32 gsi, int idx, int ioapic, int pin,
- unsigned int flags)
+ unsigned int flags, struct irq_alloc_info *info)
{
int irq;
+ bool legacy = false;
+ struct irq_alloc_info tmp;
+ struct mp_chip_data *data;
struct irq_domain *domain = mp_ioapic_irqdomain(ioapic);
- struct mp_pin_info *info = mp_pin_info(ioapic, pin);
if (!domain)
- return -1;
+ return -ENOSYS;
- mutex_lock(&ioapic_mutex);
-
- /*
- * Don't use irqdomain to manage ISA IRQs because there may be
- * multiple IOAPIC pins sharing the same ISA IRQ number and
- * irqdomain only supports 1:1 mapping between IOAPIC pin and
- * IRQ number. A typical IOAPIC has 24 pins, pin 0-15 are used
- * for legacy IRQs and pin 16-23 are used for PCI IRQs (PIRQ A-H).
- * When ACPI is disabled, only legacy IRQ numbers (IRQ0-15) are
- * available, and some BIOSes may use MP Interrupt Source records
- * to override IRQ numbers for PIRQs instead of reprogramming
- * the interrupt routing logic. Thus there may be multiple pins
- * sharing the same legacy IRQ number when ACPI is disabled.
- */
if (idx >= 0 && test_bit(mp_irqs[idx].srcbus, mp_bus_not_pci)) {
irq = mp_irqs[idx].srcbusirq;
- if (flags & IOAPIC_MAP_ALLOC) {
- if (info->count == 0 &&
- mp_irqdomain_map(domain, irq, pin) != 0)
- irq = -1;
+ legacy = mp_is_legacy_irq(irq);
+ }
- /* special handling for timer IRQ0 */
+ mutex_lock(&ioapic_mutex);
+ if (!(flags & IOAPIC_MAP_ALLOC)) {
+ if (!legacy) {
+ irq = irq_find_mapping(domain, pin);
if (irq == 0)
- info->count++;
+ irq = -ENOENT;
}
} else {
- irq = irq_find_mapping(domain, pin);
- if (irq <= 0 && (flags & IOAPIC_MAP_ALLOC))
- irq = alloc_irq_from_domain(domain, gsi, pin);
- }
-
- if (flags & IOAPIC_MAP_ALLOC) {
- /* special handling for legacy IRQs */
- if (irq < nr_legacy_irqs() && info->count == 1 &&
- mp_irqdomain_map(domain, irq, pin) != 0)
- irq = -1;
-
- if (irq > 0)
- info->count++;
- else if (info->count == 0)
- info->set = 0;
+ ioapic_copy_alloc_attr(&tmp, info, gsi, ioapic, pin);
+ if (legacy)
+ irq = alloc_isa_irq_from_domain(domain, irq,
+ ioapic, pin, &tmp);
+ else if ((irq = irq_find_mapping(domain, pin)) == 0)
+ irq = alloc_irq_from_domain(domain, ioapic, gsi, &tmp);
+ else if (!mp_check_pin_attr(irq, &tmp))
+ irq = -EBUSY;
+ if (irq >= 0) {
+ data = irq_get_chip_data(irq);
+ data->count++;
+ }
}
-
mutex_unlock(&ioapic_mutex);
- return irq > 0 ? irq : -1;
+ return irq;
}
static int pin_2_irq(int idx, int ioapic, int pin, unsigned int flags)
@@ -1058,10 +1098,10 @@ static int pin_2_irq(int idx, int ioapic, int pin, unsigned int flags)
}
#endif
- return mp_map_pin_to_irq(gsi, idx, ioapic, pin, flags);
+ return mp_map_pin_to_irq(gsi, idx, ioapic, pin, flags, NULL);
}
-int mp_map_gsi_to_irq(u32 gsi, unsigned int flags)
+int mp_map_gsi_to_irq(u32 gsi, unsigned int flags, struct irq_alloc_info *info)
{
int ioapic, pin, idx;
@@ -1074,31 +1114,24 @@ int mp_map_gsi_to_irq(u32 gsi, unsigned int flags)
if ((flags & IOAPIC_MAP_CHECK) && idx < 0)
return -1;
- return mp_map_pin_to_irq(gsi, idx, ioapic, pin, flags);
+ return mp_map_pin_to_irq(gsi, idx, ioapic, pin, flags, info);
}
void mp_unmap_irq(int irq)
{
- struct irq_data *data = irq_get_irq_data(irq);
- struct mp_pin_info *info;
- int ioapic, pin;
+ struct irq_data *irq_data = irq_get_irq_data(irq);
+ struct mp_chip_data *data;
- if (!data || !data->domain)
+ if (!irq_data || !irq_data->domain)
return;
- ioapic = (int)(long)data->domain->host_data;
- pin = (int)data->hwirq;
- info = mp_pin_info(ioapic, pin);
+ data = irq_data->chip_data;
+ if (!data || data->isa_irq)
+ return;
mutex_lock(&ioapic_mutex);
- if (--info->count == 0) {
- info->set = 0;
- if (irq < nr_legacy_irqs() &&
- ioapics[ioapic].irqdomain_cfg.type == IOAPIC_DOMAIN_LEGACY)
- mp_irqdomain_unmap(data->domain, irq);
- else
- irq_dispose_mapping(irq);
- }
+ if (--data->count == 0)
+ irq_domain_free_irqs(irq, 1);
mutex_unlock(&ioapic_mutex);
}
@@ -1165,7 +1198,7 @@ out:
}
EXPORT_SYMBOL(IO_APIC_get_PCI_irq_vector);
-static struct irq_chip ioapic_chip;
+static struct irq_chip ioapic_chip, ioapic_ir_chip;
#ifdef CONFIG_X86_32
static inline int IO_APIC_irq_trigger(int irq)
@@ -1189,96 +1222,6 @@ static inline int IO_APIC_irq_trigger(int irq)
}
#endif
-static void ioapic_register_intr(unsigned int irq, struct irq_cfg *cfg,
- unsigned long trigger)
-{
- struct irq_chip *chip = &ioapic_chip;
- irq_flow_handler_t hdl;
- bool fasteoi;
-
- if ((trigger == IOAPIC_AUTO && IO_APIC_irq_trigger(irq)) ||
- trigger == IOAPIC_LEVEL) {
- irq_set_status_flags(irq, IRQ_LEVEL);
- fasteoi = true;
- } else {
- irq_clear_status_flags(irq, IRQ_LEVEL);
- fasteoi = false;
- }
-
- if (setup_remapped_irq(irq, cfg, chip))
- fasteoi = trigger != 0;
-
- hdl = fasteoi ? handle_fasteoi_irq : handle_edge_irq;
- irq_set_chip_and_handler_name(irq, chip, hdl,
- fasteoi ? "fasteoi" : "edge");
-}
-
-int native_setup_ioapic_entry(int irq, struct IO_APIC_route_entry *entry,
- unsigned int destination, int vector,
- struct io_apic_irq_attr *attr)
-{
- memset(entry, 0, sizeof(*entry));
-
- entry->delivery_mode = apic->irq_delivery_mode;
- entry->dest_mode = apic->irq_dest_mode;
- entry->dest = destination;
- entry->vector = vector;
- entry->mask = 0; /* enable IRQ */
- entry->trigger = attr->trigger;
- entry->polarity = attr->polarity;
-
- /*
- * Mask level triggered irqs.
- * Use IRQ_DELAYED_DISABLE for edge triggered irqs.
- */
- if (attr->trigger)
- entry->mask = 1;
-
- return 0;
-}
-
-static void setup_ioapic_irq(unsigned int irq, struct irq_cfg *cfg,
- struct io_apic_irq_attr *attr)
-{
- struct IO_APIC_route_entry entry;
- unsigned int dest;
-
- if (!IO_APIC_IRQ(irq))
- return;
-
- if (assign_irq_vector(irq, cfg, apic->target_cpus()))
- return;
-
- if (apic->cpu_mask_to_apicid_and(cfg->domain, apic->target_cpus(),
- &dest)) {
- pr_warn("Failed to obtain apicid for ioapic %d, pin %d\n",
- mpc_ioapic_id(attr->ioapic), attr->ioapic_pin);
- clear_irq_vector(irq, cfg);
-
- return;
- }
-
- apic_printk(APIC_VERBOSE,KERN_DEBUG
- "IOAPIC[%d]: Set routing entry (%d-%d -> 0x%x -> "
- "IRQ %d Mode:%i Active:%i Dest:%d)\n",
- attr->ioapic, mpc_ioapic_id(attr->ioapic), attr->ioapic_pin,
- cfg->vector, irq, attr->trigger, attr->polarity, dest);
-
- if (x86_io_apic_ops.setup_entry(irq, &entry, dest, cfg->vector, attr)) {
- pr_warn("Failed to setup ioapic entry for ioapic %d, pin %d\n",
- mpc_ioapic_id(attr->ioapic), attr->ioapic_pin);
- clear_irq_vector(irq, cfg);
-
- return;
- }
-
- ioapic_register_intr(irq, cfg, attr->trigger);
- if (irq < nr_legacy_irqs())
- legacy_pic->mask(irq);
-
- ioapic_write_entry(attr->ioapic, attr->ioapic_pin, entry);
-}
-
static void __init setup_IO_APIC_irqs(void)
{
unsigned int ioapic, pin;
@@ -1298,106 +1241,41 @@ static void __init setup_IO_APIC_irqs(void)
}
}
-/*
- * Set up the timer pin, possibly with the 8259A-master behind.
- */
-static void __init setup_timer_IRQ0_pin(unsigned int ioapic_idx,
- unsigned int pin, int vector)
-{
- struct IO_APIC_route_entry entry;
- unsigned int dest;
-
- memset(&entry, 0, sizeof(entry));
-
- /*
- * We use logical delivery to get the timer IRQ
- * to the first CPU.
- */
- if (unlikely(apic->cpu_mask_to_apicid_and(apic->target_cpus(),
- apic->target_cpus(), &dest)))
- dest = BAD_APICID;
-
- entry.dest_mode = apic->irq_dest_mode;
- entry.mask = 0; /* don't mask IRQ for edge */
- entry.dest = dest;
- entry.delivery_mode = apic->irq_delivery_mode;
- entry.polarity = 0;
- entry.trigger = 0;
- entry.vector = vector;
-
- /*
- * The timer IRQ doesn't have to know that behind the
- * scene we may have a 8259A-master in AEOI mode ...
- */
- irq_set_chip_and_handler_name(0, &ioapic_chip, handle_edge_irq,
- "edge");
-
- /*
- * Add it to the IO-APIC irq-routing table:
- */
- ioapic_write_entry(ioapic_idx, pin, entry);
-}
-
-void native_io_apic_print_entries(unsigned int apic, unsigned int nr_entries)
+void ioapic_zap_locks(void)
{
- int i;
-
- pr_debug(" NR Dst Mask Trig IRR Pol Stat Dmod Deli Vect:\n");
-
- for (i = 0; i <= nr_entries; i++) {
- struct IO_APIC_route_entry entry;
-
- entry = ioapic_read_entry(apic, i);
-
- pr_debug(" %02x %02X ", i, entry.dest);
- pr_cont("%1d %1d %1d %1d %1d "
- "%1d %1d %02X\n",
- entry.mask,
- entry.trigger,
- entry.irr,
- entry.polarity,
- entry.delivery_status,
- entry.dest_mode,
- entry.delivery_mode,
- entry.vector);
- }
+ raw_spin_lock_init(&ioapic_lock);
}
-void intel_ir_io_apic_print_entries(unsigned int apic,
- unsigned int nr_entries)
+static void io_apic_print_entries(unsigned int apic, unsigned int nr_entries)
{
int i;
+ char buf[256];
+ struct IO_APIC_route_entry entry;
+ struct IR_IO_APIC_route_entry *ir_entry = (void *)&entry;
- pr_debug(" NR Indx Fmt Mask Trig IRR Pol Stat Indx2 Zero Vect:\n");
-
+ printk(KERN_DEBUG "IOAPIC %d:\n", apic);
for (i = 0; i <= nr_entries; i++) {
- struct IR_IO_APIC_route_entry *ir_entry;
- struct IO_APIC_route_entry entry;
-
entry = ioapic_read_entry(apic, i);
-
- ir_entry = (struct IR_IO_APIC_route_entry *)&entry;
-
- pr_debug(" %02x %04X ", i, ir_entry->index);
- pr_cont("%1d %1d %1d %1d %1d "
- "%1d %1d %X %02X\n",
- ir_entry->format,
- ir_entry->mask,
- ir_entry->trigger,
- ir_entry->irr,
- ir_entry->polarity,
- ir_entry->delivery_status,
- ir_entry->index2,
- ir_entry->zero,
- ir_entry->vector);
+ snprintf(buf, sizeof(buf),
+ " pin%02x, %s, %s, %s, V(%02X), IRR(%1d), S(%1d)",
+ i,
+ entry.mask == IOAPIC_MASKED ? "disabled" : "enabled ",
+ entry.trigger == IOAPIC_LEVEL ? "level" : "edge ",
+ entry.polarity == IOAPIC_POL_LOW ? "low " : "high",
+ entry.vector, entry.irr, entry.delivery_status);
+ if (ir_entry->format)
+ printk(KERN_DEBUG "%s, remapped, I(%04X), Z(%X)\n",
+ buf, (ir_entry->index << 15) | ir_entry->index,
+ ir_entry->zero);
+ else
+ printk(KERN_DEBUG "%s, %s, D(%02X), M(%1d)\n",
+ buf,
+ entry.dest_mode == IOAPIC_DEST_MODE_LOGICAL ?
+ "logical " : "physical",
+ entry.dest, entry.delivery_mode);
}
}
-void ioapic_zap_locks(void)
-{
- raw_spin_lock_init(&ioapic_lock);
-}
-
static void __init print_IO_APIC(int ioapic_idx)
{
union IO_APIC_reg_00 reg_00;
@@ -1451,16 +1329,13 @@ static void __init print_IO_APIC(int ioapic_idx)
}
printk(KERN_DEBUG ".... IRQ redirection table:\n");
-
- x86_io_apic_ops.print_entries(ioapic_idx, reg_01.bits.entries);
+ io_apic_print_entries(ioapic_idx, reg_01.bits.entries);
}
void __init print_IO_APICs(void)
{
int ioapic_idx;
- struct irq_cfg *cfg;
unsigned int irq;
- struct irq_chip *chip;
printk(KERN_DEBUG "number of MP IRQ sources: %d.\n", mp_irq_entries);
for_each_ioapic(ioapic_idx)
@@ -1480,18 +1355,20 @@ void __init print_IO_APICs(void)
printk(KERN_DEBUG "IRQ to pin mappings:\n");
for_each_active_irq(irq) {
struct irq_pin_list *entry;
+ struct irq_chip *chip;
+ struct mp_chip_data *data;
chip = irq_get_chip(irq);
- if (chip != &ioapic_chip)
+ if (chip != &ioapic_chip && chip != &ioapic_ir_chip)
continue;
-
- cfg = irq_cfg(irq);
- if (!cfg)
+ data = irq_get_chip_data(irq);
+ if (!data)
continue;
- if (list_empty(&cfg->irq_2_pin))
+ if (list_empty(&data->irq_2_pin))
continue;
+
printk(KERN_DEBUG "IRQ%d ", irq);
- for_each_irq_pin(entry, cfg->irq_2_pin)
+ for_each_irq_pin(entry, data->irq_2_pin)
pr_cont("-> %d:%d", entry->apic, entry->pin);
pr_cont("\n");
}
@@ -1564,15 +1441,12 @@ void native_disable_io_apic(void)
struct IO_APIC_route_entry entry;
memset(&entry, 0, sizeof(entry));
- entry.mask = 0; /* Enabled */
- entry.trigger = 0; /* Edge */
- entry.irr = 0;
- entry.polarity = 0; /* High */
- entry.delivery_status = 0;
- entry.dest_mode = 0; /* Physical */
- entry.delivery_mode = dest_ExtINT; /* ExtInt */
- entry.vector = 0;
- entry.dest = read_apic_id();
+ entry.mask = IOAPIC_UNMASKED;
+ entry.trigger = IOAPIC_EDGE;
+ entry.polarity = IOAPIC_POL_HIGH;
+ entry.dest_mode = IOAPIC_DEST_MODE_PHYSICAL;
+ entry.delivery_mode = dest_ExtINT;
+ entry.dest = read_apic_id();
/*
* Add it to the IO-APIC irq-routing table:
@@ -1582,7 +1456,6 @@ void native_disable_io_apic(void)
if (cpu_has_apic || apic_from_smp_config())
disconnect_bsp_APIC(ioapic_i8259.pin != -1);
-
}
/*
@@ -1792,7 +1665,6 @@ static int __init timer_irq_works(void)
* This is not complete - we should be able to fake
* an edge even if it isn't on the 8259A...
*/
-
static unsigned int startup_ioapic_irq(struct irq_data *data)
{
int was_pending = 0, irq = data->irq;
@@ -1804,74 +1676,22 @@ static unsigned int startup_ioapic_irq(struct irq_data *data)
if (legacy_pic->irq_pending(irq))
was_pending = 1;
}
- __unmask_ioapic(irqd_cfg(data));
+ __unmask_ioapic(data->chip_data);
raw_spin_unlock_irqrestore(&ioapic_lock, flags);
return was_pending;
}
-/*
- * Level and edge triggered IO-APIC interrupts need different handling,
- * so we use two separate IRQ descriptors. Edge triggered IRQs can be
- * handled with the level-triggered descriptor, but that one has slightly
- * more overhead. Level-triggered interrupts cannot be handled with the
- * edge-triggered handler, without risking IRQ storms and other ugly
- * races.
- */
-
-static void __target_IO_APIC_irq(unsigned int irq, unsigned int dest, struct irq_cfg *cfg)
-{
- int apic, pin;
- struct irq_pin_list *entry;
- u8 vector = cfg->vector;
-
- for_each_irq_pin(entry, cfg->irq_2_pin) {
- unsigned int reg;
-
- apic = entry->apic;
- pin = entry->pin;
-
- io_apic_write(apic, 0x11 + pin*2, dest);
- reg = io_apic_read(apic, 0x10 + pin*2);
- reg &= ~IO_APIC_REDIR_VECTOR_MASK;
- reg |= vector;
- io_apic_modify(apic, 0x10 + pin*2, reg);
- }
-}
-
-int native_ioapic_set_affinity(struct irq_data *data,
- const struct cpumask *mask,
- bool force)
-{
- unsigned int dest, irq = data->irq;
- unsigned long flags;
- int ret;
-
- if (!config_enabled(CONFIG_SMP))
- return -EPERM;
-
- raw_spin_lock_irqsave(&ioapic_lock, flags);
- ret = apic_set_affinity(data, mask, &dest);
- if (!ret) {
- /* Only the high 8 bits are valid. */
- dest = SET_APIC_LOGICAL_ID(dest);
- __target_IO_APIC_irq(irq, dest, irqd_cfg(data));
- ret = IRQ_SET_MASK_OK_NOCOPY;
- }
- raw_spin_unlock_irqrestore(&ioapic_lock, flags);
- return ret;
-}
-
atomic_t irq_mis_count;
#ifdef CONFIG_GENERIC_PENDING_IRQ
-static bool io_apic_level_ack_pending(struct irq_cfg *cfg)
+static bool io_apic_level_ack_pending(struct mp_chip_data *data)
{
struct irq_pin_list *entry;
unsigned long flags;
raw_spin_lock_irqsave(&ioapic_lock, flags);
- for_each_irq_pin(entry, cfg->irq_2_pin) {
+ for_each_irq_pin(entry, data->irq_2_pin) {
unsigned int reg;
int pin;
@@ -1888,18 +1708,17 @@ static bool io_apic_level_ack_pending(struct irq_cfg *cfg)
return false;
}
-static inline bool ioapic_irqd_mask(struct irq_data *data, struct irq_cfg *cfg)
+static inline bool ioapic_irqd_mask(struct irq_data *data)
{
/* If we are moving the irq we need to mask it */
if (unlikely(irqd_is_setaffinity_pending(data))) {
- mask_ioapic(cfg);
+ mask_ioapic_irq(data);
return true;
}
return false;
}
-static inline void ioapic_irqd_unmask(struct irq_data *data,
- struct irq_cfg *cfg, bool masked)
+static inline void ioapic_irqd_unmask(struct irq_data *data, bool masked)
{
if (unlikely(masked)) {
/* Only migrate the irq if the ack has been received.
@@ -1928,31 +1747,30 @@ static inline void ioapic_irqd_unmask(struct irq_data *data,
* accurate and is causing problems then it is a hardware bug
* and you can go talk to the chipset vendor about it.
*/
- if (!io_apic_level_ack_pending(cfg))
+ if (!io_apic_level_ack_pending(data->chip_data))
irq_move_masked_irq(data);
- unmask_ioapic(cfg);
+ unmask_ioapic_irq(data);
}
}
#else
-static inline bool ioapic_irqd_mask(struct irq_data *data, struct irq_cfg *cfg)
+static inline bool ioapic_irqd_mask(struct irq_data *data)
{
return false;
}
-static inline void ioapic_irqd_unmask(struct irq_data *data,
- struct irq_cfg *cfg, bool masked)
+static inline void ioapic_irqd_unmask(struct irq_data *data, bool masked)
{
}
#endif
-static void ack_ioapic_level(struct irq_data *data)
+static void ioapic_ack_level(struct irq_data *irq_data)
{
- struct irq_cfg *cfg = irqd_cfg(data);
- int i, irq = data->irq;
+ struct irq_cfg *cfg = irqd_cfg(irq_data);
unsigned long v;
bool masked;
+ int i;
irq_complete_move(cfg);
- masked = ioapic_irqd_mask(data, cfg);
+ masked = ioapic_irqd_mask(irq_data);
/*
* It appears there is an erratum which affects at least version 0x11
@@ -2004,11 +1822,49 @@ static void ack_ioapic_level(struct irq_data *data)
*/
if (!(v & (1 << (i & 0x1f)))) {
atomic_inc(&irq_mis_count);
+ eoi_ioapic_pin(cfg->vector, irq_data->chip_data);
+ }
+
+ ioapic_irqd_unmask(irq_data, masked);
+}
+
+static void ioapic_ir_ack_level(struct irq_data *irq_data)
+{
+ struct mp_chip_data *data = irq_data->chip_data;
+
+ /*
+ * Intr-remapping uses pin number as the virtual vector
+ * in the RTE. Actual vector is programmed in
+ * intr-remapping table entry. Hence for the io-apic
+ * EOI we use the pin number.
+ */
+ ack_APIC_irq();
+ eoi_ioapic_pin(data->entry.vector, data);
+}
- eoi_ioapic_irq(irq, cfg);
+static int ioapic_set_affinity(struct irq_data *irq_data,
+ const struct cpumask *mask, bool force)
+{
+ struct irq_data *parent = irq_data->parent_data;
+ struct mp_chip_data *data = irq_data->chip_data;
+ struct irq_pin_list *entry;
+ struct irq_cfg *cfg;
+ unsigned long flags;
+ int ret;
+
+ ret = parent->chip->irq_set_affinity(parent, mask, force);
+ raw_spin_lock_irqsave(&ioapic_lock, flags);
+ if (ret >= 0 && ret != IRQ_SET_MASK_OK_DONE) {
+ cfg = irqd_cfg(irq_data);
+ data->entry.dest = cfg->dest_apicid;
+ data->entry.vector = cfg->vector;
+ for_each_irq_pin(entry, data->irq_2_pin)
+ __ioapic_write_entry(entry->apic, entry->pin,
+ data->entry);
}
+ raw_spin_unlock_irqrestore(&ioapic_lock, flags);
- ioapic_irqd_unmask(data, cfg, masked);
+ return ret;
}
static struct irq_chip ioapic_chip __read_mostly = {
@@ -2016,10 +1872,20 @@ static struct irq_chip ioapic_chip __read_mostly = {
.irq_startup = startup_ioapic_irq,
.irq_mask = mask_ioapic_irq,
.irq_unmask = unmask_ioapic_irq,
- .irq_ack = apic_ack_edge,
- .irq_eoi = ack_ioapic_level,
- .irq_set_affinity = native_ioapic_set_affinity,
- .irq_retrigger = apic_retrigger_irq,
+ .irq_ack = irq_chip_ack_parent,
+ .irq_eoi = ioapic_ack_level,
+ .irq_set_affinity = ioapic_set_affinity,
+ .flags = IRQCHIP_SKIP_SET_WAKE,
+};
+
+static struct irq_chip ioapic_ir_chip __read_mostly = {
+ .name = "IR-IO-APIC",
+ .irq_startup = startup_ioapic_irq,
+ .irq_mask = mask_ioapic_irq,
+ .irq_unmask = unmask_ioapic_irq,
+ .irq_ack = irq_chip_ack_parent,
+ .irq_eoi = ioapic_ir_ack_level,
+ .irq_set_affinity = ioapic_set_affinity,
.flags = IRQCHIP_SKIP_SET_WAKE,
};
@@ -2113,12 +1979,12 @@ static inline void __init unlock_ExtINT_logic(void)
memset(&entry1, 0, sizeof(entry1));
- entry1.dest_mode = 0; /* physical delivery */
- entry1.mask = 0; /* unmask IRQ now */
+ entry1.dest_mode = IOAPIC_DEST_MODE_PHYSICAL;
+ entry1.mask = IOAPIC_UNMASKED;
entry1.dest = hard_smp_processor_id();
entry1.delivery_mode = dest_ExtINT;
entry1.polarity = entry0.polarity;
- entry1.trigger = 0;
+ entry1.trigger = IOAPIC_EDGE;
entry1.vector = 0;
ioapic_write_entry(apic, pin, entry1);
@@ -2152,6 +2018,25 @@ static int __init disable_timer_pin_setup(char *arg)
}
early_param("disable_timer_pin_1", disable_timer_pin_setup);
+static int mp_alloc_timer_irq(int ioapic, int pin)
+{
+ int irq = -1;
+ struct irq_domain *domain = mp_ioapic_irqdomain(ioapic);
+
+ if (domain) {
+ struct irq_alloc_info info;
+
+ ioapic_set_alloc_attr(&info, NUMA_NO_NODE, 0, 0);
+ info.ioapic_id = mpc_ioapic_id(ioapic);
+ info.ioapic_pin = pin;
+ mutex_lock(&ioapic_mutex);
+ irq = alloc_isa_irq_from_domain(domain, 0, ioapic, pin, &info);
+ mutex_unlock(&ioapic_mutex);
+ }
+
+ return irq;
+}
+
/*
* This code may look a bit paranoid, but it's supposed to cooperate with
* a wide range of boards and BIOS bugs. Fortunately only the timer IRQ
@@ -2162,7 +2047,9 @@ early_param("disable_timer_pin_1", disable_timer_pin_setup);
*/
static inline void __init check_timer(void)
{
- struct irq_cfg *cfg = irq_cfg(0);
+ struct irq_data *irq_data = irq_get_irq_data(0);
+ struct mp_chip_data *data = irq_data->chip_data;
+ struct irq_cfg *cfg = irqd_cfg(irq_data);
int node = cpu_to_node(0);
int apic1, pin1, apic2, pin2;
unsigned long flags;
@@ -2174,7 +2061,6 @@ static inline void __init check_timer(void)
* get/set the timer IRQ vector:
*/
legacy_pic->mask(0);
- assign_irq_vector(0, cfg, apic->target_cpus());
/*
* As IRQ0 is to be enabled in the 8259A, the virtual
@@ -2215,23 +2101,21 @@ static inline void __init check_timer(void)
}
if (pin1 != -1) {
- /*
- * Ok, does IRQ0 through the IOAPIC work?
- */
+ /* Ok, does IRQ0 through the IOAPIC work? */
if (no_pin1) {
- add_pin_to_irq_node(cfg, node, apic1, pin1);
- setup_timer_IRQ0_pin(apic1, pin1, cfg->vector);
+ mp_alloc_timer_irq(apic1, pin1);
} else {
- /* for edge trigger, setup_ioapic_irq already
- * leave it unmasked.
+ /*
+ * for edge trigger, it's already unmasked,
* so only need to unmask if it is level-trigger
* do we really have level trigger timer?
*/
int idx;
idx = find_irq_entry(apic1, pin1, mp_INT);
if (idx != -1 && irq_trigger(idx))
- unmask_ioapic(cfg);
+ unmask_ioapic_irq(irq_get_chip_data(0));
}
+ irq_domain_activate_irq(irq_data);
if (timer_irq_works()) {
if (disable_timer_pin_1 > 0)
clear_IO_APIC_pin(0, pin1);
@@ -2251,8 +2135,8 @@ static inline void __init check_timer(void)
/*
* legacy devices should be connected to IO APIC #0
*/
- replace_pin_at_irq_node(cfg, node, apic1, pin1, apic2, pin2);
- setup_timer_IRQ0_pin(apic2, pin2, cfg->vector);
+ replace_pin_at_irq_node(data, node, apic1, pin1, apic2, pin2);
+ irq_domain_activate_irq(irq_data);
legacy_pic->unmask(0);
if (timer_irq_works()) {
apic_printk(APIC_QUIET, KERN_INFO "....... works.\n");
@@ -2329,36 +2213,35 @@ out:
static int mp_irqdomain_create(int ioapic)
{
- size_t size;
+ struct irq_alloc_info info;
+ struct irq_domain *parent;
int hwirqs = mp_ioapic_pin_count(ioapic);
struct ioapic *ip = &ioapics[ioapic];
struct ioapic_domain_cfg *cfg = &ip->irqdomain_cfg;
struct mp_ioapic_gsi *gsi_cfg = mp_ioapic_gsi_routing(ioapic);
- size = sizeof(struct mp_pin_info) * mp_ioapic_pin_count(ioapic);
- ip->pin_info = kzalloc(size, GFP_KERNEL);
- if (!ip->pin_info)
- return -ENOMEM;
-
if (cfg->type == IOAPIC_DOMAIN_INVALID)
return 0;
+ init_irq_alloc_info(&info, NULL);
+ info.type = X86_IRQ_ALLOC_TYPE_IOAPIC;
+ info.ioapic_id = mpc_ioapic_id(ioapic);
+ parent = irq_remapping_get_ir_irq_domain(&info);
+ if (!parent)
+ parent = x86_vector_domain;
+
ip->irqdomain = irq_domain_add_linear(cfg->dev, hwirqs, cfg->ops,
(void *)(long)ioapic);
- if(!ip->irqdomain) {
- kfree(ip->pin_info);
- ip->pin_info = NULL;
+ if (!ip->irqdomain)
return -ENOMEM;
- }
+
+ ip->irqdomain->parent = parent;
if (cfg->type == IOAPIC_DOMAIN_LEGACY ||
cfg->type == IOAPIC_DOMAIN_STRICT)
ioapic_dynirq_base = max(ioapic_dynirq_base,
gsi_cfg->gsi_end + 1);
- if (gsi_cfg->gsi_base == 0)
- irq_set_default_host(ip->irqdomain);
-
return 0;
}
@@ -2368,8 +2251,6 @@ static void ioapic_destroy_irqdomain(int idx)
irq_domain_remove(ioapics[idx].irqdomain);
ioapics[idx].irqdomain = NULL;
}
- kfree(ioapics[idx].pin_info);
- ioapics[idx].pin_info = NULL;
}
void __init setup_IO_APIC(void)
@@ -2399,20 +2280,6 @@ void __init setup_IO_APIC(void)
ioapic_initialized = 1;
}
-/*
- * Called after all the initialization is done. If we didn't find any
- * APIC bugs then we can allow the modify fast path
- */
-
-static int __init io_apic_bug_finalize(void)
-{
- if (sis_apic_bug == -1)
- sis_apic_bug = 0;
- return 0;
-}
-
-late_initcall(io_apic_bug_finalize);
-
static void resume_ioapic_id(int ioapic_idx)
{
unsigned long flags;
@@ -2451,20 +2318,6 @@ static int __init ioapic_init_ops(void)
device_initcall(ioapic_init_ops);
-static int
-io_apic_setup_irq_pin(unsigned int irq, int node, struct io_apic_irq_attr *attr)
-{
- struct irq_cfg *cfg = alloc_irq_and_cfg_at(irq, node);
- int ret;
-
- if (!cfg)
- return -EINVAL;
- ret = __add_pin_to_irq_node(cfg, node, attr->ioapic, attr->ioapic_pin);
- if (!ret)
- setup_ioapic_irq(irq, cfg, attr);
- return ret;
-}
-
static int io_apic_get_redir_entries(int ioapic)
{
union IO_APIC_reg_01 reg_01;
@@ -2692,7 +2545,7 @@ void __init setup_ioapic_dest(void)
else
mask = apic->target_cpus();
- x86_io_apic_ops.set_affinity(idata, mask, false);
+ irq_set_affinity(irq, mask);
}
}
@@ -2737,7 +2590,7 @@ static struct resource * __init ioapic_setup_resources(void)
return res;
}
-void __init native_io_apic_init_mappings(void)
+void __init io_apic_init_mappings(void)
{
unsigned long ioapic_phys, idx = FIX_IO_APIC_BASE_0;
struct resource *ioapic_res;
@@ -2962,7 +2815,6 @@ int mp_unregister_ioapic(u32 gsi_base)
{
int ioapic, pin;
int found = 0;
- struct mp_pin_info *pin_info;
for_each_ioapic(ioapic)
if (ioapics[ioapic].gsi_config.gsi_base == gsi_base) {
@@ -2975,11 +2827,17 @@ int mp_unregister_ioapic(u32 gsi_base)
}
for_each_pin(ioapic, pin) {
- pin_info = mp_pin_info(ioapic, pin);
- if (pin_info->count) {
- pr_warn("pin%d on IOAPIC%d is still in use.\n",
- pin, ioapic);
- return -EBUSY;
+ u32 gsi = mp_pin_to_gsi(ioapic, pin);
+ int irq = mp_map_gsi_to_irq(gsi, 0, NULL);
+ struct mp_chip_data *data;
+
+ if (irq >= 0) {
+ data = irq_get_chip_data(irq);
+ if (data && data->count) {
+ pr_warn("pin%d on IOAPIC%d is still in use.\n",
+ pin, ioapic);
+ return -EBUSY;
+ }
}
}
@@ -3006,108 +2864,141 @@ int mp_ioapic_registered(u32 gsi_base)
return 0;
}
-static inline void set_io_apic_irq_attr(struct io_apic_irq_attr *irq_attr,
- int ioapic, int ioapic_pin,
- int trigger, int polarity)
+static void mp_irqdomain_get_attr(u32 gsi, struct mp_chip_data *data,
+ struct irq_alloc_info *info)
{
- irq_attr->ioapic = ioapic;
- irq_attr->ioapic_pin = ioapic_pin;
- irq_attr->trigger = trigger;
- irq_attr->polarity = polarity;
+ if (info && info->ioapic_valid) {
+ data->trigger = info->ioapic_trigger;
+ data->polarity = info->ioapic_polarity;
+ } else if (acpi_get_override_irq(gsi, &data->trigger,
+ &data->polarity) < 0) {
+ /* PCI interrupts are always active low level triggered. */
+ data->trigger = IOAPIC_LEVEL;
+ data->polarity = IOAPIC_POL_LOW;
+ }
}
-int mp_irqdomain_map(struct irq_domain *domain, unsigned int virq,
- irq_hw_number_t hwirq)
+static void mp_setup_entry(struct irq_cfg *cfg, struct mp_chip_data *data,
+ struct IO_APIC_route_entry *entry)
{
- int ioapic = (int)(long)domain->host_data;
- struct mp_pin_info *info = mp_pin_info(ioapic, hwirq);
- struct io_apic_irq_attr attr;
+ memset(entry, 0, sizeof(*entry));
+ entry->delivery_mode = apic->irq_delivery_mode;
+ entry->dest_mode = apic->irq_dest_mode;
+ entry->dest = cfg->dest_apicid;
+ entry->vector = cfg->vector;
+ entry->trigger = data->trigger;
+ entry->polarity = data->polarity;
+ /*
+ * Mask level triggered irqs. Edge triggered irqs are masked
+ * by the irq core code in case they fire.
+ */
+ if (data->trigger == IOAPIC_LEVEL)
+ entry->mask = IOAPIC_MASKED;
+ else
+ entry->mask = IOAPIC_UNMASKED;
+}
- /* Get default attribute if not set by caller yet */
- if (!info->set) {
- u32 gsi = mp_pin_to_gsi(ioapic, hwirq);
+int mp_irqdomain_alloc(struct irq_domain *domain, unsigned int virq,
+ unsigned int nr_irqs, void *arg)
+{
+ int ret, ioapic, pin;
+ struct irq_cfg *cfg;
+ struct irq_data *irq_data;
+ struct mp_chip_data *data;
+ struct irq_alloc_info *info = arg;
- if (acpi_get_override_irq(gsi, &info->trigger,
- &info->polarity) < 0) {
- /*
- * PCI interrupts are always polarity one level
- * triggered.
- */
- info->trigger = 1;
- info->polarity = 1;
- }
- info->node = NUMA_NO_NODE;
+ if (!info || nr_irqs > 1)
+ return -EINVAL;
+ irq_data = irq_domain_get_irq_data(domain, virq);
+ if (!irq_data)
+ return -EINVAL;
- /*
- * setup_IO_APIC_irqs() programs all legacy IRQs with default
- * trigger and polarity attributes. Don't set the flag for that
- * case so the first legacy IRQ user could reprogram the pin
- * with real trigger and polarity attributes.
- */
- if (virq >= nr_legacy_irqs() || info->count)
- info->set = 1;
- }
- set_io_apic_irq_attr(&attr, ioapic, hwirq, info->trigger,
- info->polarity);
+ ioapic = mp_irqdomain_ioapic_idx(domain);
+ pin = info->ioapic_pin;
+ if (irq_find_mapping(domain, (irq_hw_number_t)pin) > 0)
+ return -EEXIST;
- return io_apic_setup_irq_pin(virq, info->node, &attr);
-}
+ data = kzalloc(sizeof(*data), GFP_KERNEL);
+ if (!data)
+ return -ENOMEM;
-void mp_irqdomain_unmap(struct irq_domain *domain, unsigned int virq)
-{
- struct irq_data *data = irq_get_irq_data(virq);
- struct irq_cfg *cfg = irq_cfg(virq);
- int ioapic = (int)(long)domain->host_data;
- int pin = (int)data->hwirq;
+ info->ioapic_entry = &data->entry;
+ ret = irq_domain_alloc_irqs_parent(domain, virq, nr_irqs, info);
+ if (ret < 0) {
+ kfree(data);
+ return ret;
+ }
+
+ INIT_LIST_HEAD(&data->irq_2_pin);
+ irq_data->hwirq = info->ioapic_pin;
+ irq_data->chip = (domain->parent == x86_vector_domain) ?
+ &ioapic_chip : &ioapic_ir_chip;
+ irq_data->chip_data = data;
+ mp_irqdomain_get_attr(mp_pin_to_gsi(ioapic, pin), data, info);
+
+ cfg = irqd_cfg(irq_data);
+ add_pin_to_irq_node(data, ioapic_alloc_attr_node(info), ioapic, pin);
+ if (info->ioapic_entry)
+ mp_setup_entry(cfg, data, info->ioapic_entry);
+ mp_register_handler(virq, data->trigger);
+ if (virq < nr_legacy_irqs())
+ legacy_pic->mask(virq);
+
+ apic_printk(APIC_VERBOSE, KERN_DEBUG
+ "IOAPIC[%d]: Set routing entry (%d-%d -> 0x%x -> IRQ %d Mode:%i Active:%i Dest:%d)\n",
+ ioapic, mpc_ioapic_id(ioapic), pin, cfg->vector,
+ virq, data->trigger, data->polarity, cfg->dest_apicid);
- ioapic_mask_entry(ioapic, pin);
- __remove_pin_from_irq(cfg, ioapic, pin);
- WARN_ON(!list_empty(&cfg->irq_2_pin));
- arch_teardown_hwirq(virq);
+ return 0;
}
-int mp_set_gsi_attr(u32 gsi, int trigger, int polarity, int node)
+void mp_irqdomain_free(struct irq_domain *domain, unsigned int virq,
+ unsigned int nr_irqs)
{
- int ret = 0;
- int ioapic, pin;
- struct mp_pin_info *info;
+ struct irq_data *irq_data;
+ struct mp_chip_data *data;
- ioapic = mp_find_ioapic(gsi);
- if (ioapic < 0)
- return -ENODEV;
-
- pin = mp_find_ioapic_pin(ioapic, gsi);
- info = mp_pin_info(ioapic, pin);
- trigger = trigger ? 1 : 0;
- polarity = polarity ? 1 : 0;
-
- mutex_lock(&ioapic_mutex);
- if (!info->set) {
- info->trigger = trigger;
- info->polarity = polarity;
- info->node = node;
- info->set = 1;
- } else if (info->trigger != trigger || info->polarity != polarity) {
- ret = -EBUSY;
+ BUG_ON(nr_irqs != 1);
+ irq_data = irq_domain_get_irq_data(domain, virq);
+ if (irq_data && irq_data->chip_data) {
+ data = irq_data->chip_data;
+ __remove_pin_from_irq(data, mp_irqdomain_ioapic_idx(domain),
+ (int)irq_data->hwirq);
+ WARN_ON(!list_empty(&data->irq_2_pin));
+ kfree(irq_data->chip_data);
}
- mutex_unlock(&ioapic_mutex);
-
- return ret;
+ irq_domain_free_irqs_top(domain, virq, nr_irqs);
}
-/* Enable IOAPIC early just for system timer */
-void __init pre_init_apic_IRQ0(void)
+void mp_irqdomain_activate(struct irq_domain *domain,
+ struct irq_data *irq_data)
{
- struct io_apic_irq_attr attr = { 0, 0, 0, 0 };
+ unsigned long flags;
+ struct irq_pin_list *entry;
+ struct mp_chip_data *data = irq_data->chip_data;
- printk(KERN_INFO "Early APIC setup for system timer0\n");
-#ifndef CONFIG_SMP
- physid_set_mask_of_physid(boot_cpu_physical_apicid,
- &phys_cpu_present_map);
-#endif
- setup_local_APIC();
+ raw_spin_lock_irqsave(&ioapic_lock, flags);
+ for_each_irq_pin(entry, data->irq_2_pin)
+ __ioapic_write_entry(entry->apic, entry->pin, data->entry);
+ raw_spin_unlock_irqrestore(&ioapic_lock, flags);
+}
- io_apic_setup_irq_pin(0, 0, &attr);
- irq_set_chip_and_handler_name(0, &ioapic_chip, handle_edge_irq,
- "edge");
+void mp_irqdomain_deactivate(struct irq_domain *domain,
+ struct irq_data *irq_data)
+{
+ /* It won't be called for IRQ with multiple IOAPIC pins associated */
+ ioapic_mask_entry(mp_irqdomain_ioapic_idx(domain),
+ (int)irq_data->hwirq);
+}
+
+int mp_irqdomain_ioapic_idx(struct irq_domain *domain)
+{
+ return (int)(long)domain->host_data;
}
+
+const struct irq_domain_ops mp_ioapic_irqdomain_ops = {
+ .alloc = mp_irqdomain_alloc,
+ .free = mp_irqdomain_free,
+ .activate = mp_irqdomain_activate,
+ .deactivate = mp_irqdomain_deactivate,
+};
diff --git a/arch/x86/kernel/apic/msi.c b/arch/x86/kernel/apic/msi.c
index d6ba2d660dc5..1a9d735e09c6 100644
--- a/arch/x86/kernel/apic/msi.c
+++ b/arch/x86/kernel/apic/msi.c
@@ -3,6 +3,8 @@
*
* Copyright (C) 1997, 1998, 1999, 2000, 2009 Ingo Molnar, Hajnalka Szabo
* Moved from arch/x86/kernel/apic/io_apic.c.
+ * Jiang Liu <jiang.liu@linux.intel.com>
+ * Convert to hierarchical irqdomain
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License version 2 as
@@ -14,22 +16,23 @@
#include <linux/dmar.h>
#include <linux/hpet.h>
#include <linux/msi.h>
+#include <asm/irqdomain.h>
#include <asm/msidef.h>
#include <asm/hpet.h>
#include <asm/hw_irq.h>
#include <asm/apic.h>
#include <asm/irq_remapping.h>
-void native_compose_msi_msg(struct pci_dev *pdev,
- unsigned int irq, unsigned int dest,
- struct msi_msg *msg, u8 hpet_id)
+static struct irq_domain *msi_default_domain;
+
+static void irq_msi_compose_msg(struct irq_data *data, struct msi_msg *msg)
{
- struct irq_cfg *cfg = irq_cfg(irq);
+ struct irq_cfg *cfg = irqd_cfg(data);
msg->address_hi = MSI_ADDR_BASE_HI;
if (x2apic_enabled())
- msg->address_hi |= MSI_ADDR_EXT_DEST_ID(dest);
+ msg->address_hi |= MSI_ADDR_EXT_DEST_ID(cfg->dest_apicid);
msg->address_lo =
MSI_ADDR_BASE_LO |
@@ -39,7 +42,7 @@ void native_compose_msi_msg(struct pci_dev *pdev,
((apic->irq_delivery_mode != dest_LowestPrio) ?
MSI_ADDR_REDIRECTION_CPU :
MSI_ADDR_REDIRECTION_LOWPRI) |
- MSI_ADDR_DEST_ID(dest);
+ MSI_ADDR_DEST_ID(cfg->dest_apicid);
msg->data =
MSI_DATA_TRIGGER_EDGE |
@@ -50,180 +53,201 @@ void native_compose_msi_msg(struct pci_dev *pdev,
MSI_DATA_VECTOR(cfg->vector);
}
-static int msi_compose_msg(struct pci_dev *pdev, unsigned int irq,
- struct msi_msg *msg, u8 hpet_id)
+/*
+ * IRQ Chip for MSI PCI/PCI-X/PCI-Express Devices,
+ * which implement the MSI or MSI-X Capability Structure.
+ */
+static struct irq_chip pci_msi_controller = {
+ .name = "PCI-MSI",
+ .irq_unmask = pci_msi_unmask_irq,
+ .irq_mask = pci_msi_mask_irq,
+ .irq_ack = irq_chip_ack_parent,
+ .irq_retrigger = irq_chip_retrigger_hierarchy,
+ .irq_compose_msi_msg = irq_msi_compose_msg,
+ .flags = IRQCHIP_SKIP_SET_WAKE,
+};
+
+int native_setup_msi_irqs(struct pci_dev *dev, int nvec, int type)
{
- struct irq_cfg *cfg;
- int err;
- unsigned dest;
+ struct irq_domain *domain;
+ struct irq_alloc_info info;
- if (disable_apic)
- return -ENXIO;
+ init_irq_alloc_info(&info, NULL);
+ info.type = X86_IRQ_ALLOC_TYPE_MSI;
+ info.msi_dev = dev;
- cfg = irq_cfg(irq);
- err = assign_irq_vector(irq, cfg, apic->target_cpus());
- if (err)
- return err;
+ domain = irq_remapping_get_irq_domain(&info);
+ if (domain == NULL)
+ domain = msi_default_domain;
+ if (domain == NULL)
+ return -ENOSYS;
- err = apic->cpu_mask_to_apicid_and(cfg->domain,
- apic->target_cpus(), &dest);
- if (err)
- return err;
+ return pci_msi_domain_alloc_irqs(domain, dev, nvec, type);
+}
- x86_msi.compose_msi_msg(pdev, irq, dest, msg, hpet_id);
+void native_teardown_msi_irq(unsigned int irq)
+{
+ irq_domain_free_irqs(irq, 1);
+}
- return 0;
+static irq_hw_number_t pci_msi_get_hwirq(struct msi_domain_info *info,
+ msi_alloc_info_t *arg)
+{
+ return arg->msi_hwirq;
}
-static int
-msi_set_affinity(struct irq_data *data, const struct cpumask *mask, bool force)
+static int pci_msi_prepare(struct irq_domain *domain, struct device *dev,
+ int nvec, msi_alloc_info_t *arg)
{
- struct irq_cfg *cfg = irqd_cfg(data);
- struct msi_msg msg;
- unsigned int dest;
- int ret;
+ struct pci_dev *pdev = to_pci_dev(dev);
+ struct msi_desc *desc = first_pci_msi_entry(pdev);
+
+ init_irq_alloc_info(arg, NULL);
+ arg->msi_dev = pdev;
+ if (desc->msi_attrib.is_msix) {
+ arg->type = X86_IRQ_ALLOC_TYPE_MSIX;
+ } else {
+ arg->type = X86_IRQ_ALLOC_TYPE_MSI;
+ arg->flags |= X86_IRQ_ALLOC_CONTIGUOUS_VECTORS;
+ }
- ret = apic_set_affinity(data, mask, &dest);
- if (ret)
- return ret;
+ return 0;
+}
- __get_cached_msi_msg(data->msi_desc, &msg);
+static void pci_msi_set_desc(msi_alloc_info_t *arg, struct msi_desc *desc)
+{
+ arg->msi_hwirq = pci_msi_domain_calc_hwirq(arg->msi_dev, desc);
+}
+
+static struct msi_domain_ops pci_msi_domain_ops = {
+ .get_hwirq = pci_msi_get_hwirq,
+ .msi_prepare = pci_msi_prepare,
+ .set_desc = pci_msi_set_desc,
+};
- msg.data &= ~MSI_DATA_VECTOR_MASK;
- msg.data |= MSI_DATA_VECTOR(cfg->vector);
- msg.address_lo &= ~MSI_ADDR_DEST_ID_MASK;
- msg.address_lo |= MSI_ADDR_DEST_ID(dest);
+static struct msi_domain_info pci_msi_domain_info = {
+ .flags = MSI_FLAG_USE_DEF_DOM_OPS | MSI_FLAG_USE_DEF_CHIP_OPS |
+ MSI_FLAG_PCI_MSIX,
+ .ops = &pci_msi_domain_ops,
+ .chip = &pci_msi_controller,
+ .handler = handle_edge_irq,
+ .handler_name = "edge",
+};
- __pci_write_msi_msg(data->msi_desc, &msg);
+void arch_init_msi_domain(struct irq_domain *parent)
+{
+ if (disable_apic)
+ return;
- return IRQ_SET_MASK_OK_NOCOPY;
+ msi_default_domain = pci_msi_create_irq_domain(NULL,
+ &pci_msi_domain_info, parent);
+ if (!msi_default_domain)
+ pr_warn("failed to initialize irqdomain for MSI/MSI-x.\n");
}
-/*
- * IRQ Chip for MSI PCI/PCI-X/PCI-Express Devices,
- * which implement the MSI or MSI-X Capability Structure.
- */
-static struct irq_chip msi_chip = {
- .name = "PCI-MSI",
+#ifdef CONFIG_IRQ_REMAP
+static struct irq_chip pci_msi_ir_controller = {
+ .name = "IR-PCI-MSI",
.irq_unmask = pci_msi_unmask_irq,
.irq_mask = pci_msi_mask_irq,
- .irq_ack = apic_ack_edge,
- .irq_set_affinity = msi_set_affinity,
- .irq_retrigger = apic_retrigger_irq,
+ .irq_ack = irq_chip_ack_parent,
+ .irq_retrigger = irq_chip_retrigger_hierarchy,
+ .irq_set_vcpu_affinity = irq_chip_set_vcpu_affinity_parent,
.flags = IRQCHIP_SKIP_SET_WAKE,
};
-int setup_msi_irq(struct pci_dev *dev, struct msi_desc *msidesc,
- unsigned int irq_base, unsigned int irq_offset)
-{
- struct irq_chip *chip = &msi_chip;
- struct msi_msg msg;
- unsigned int irq = irq_base + irq_offset;
- int ret;
-
- ret = msi_compose_msg(dev, irq, &msg, -1);
- if (ret < 0)
- return ret;
-
- irq_set_msi_desc_off(irq_base, irq_offset, msidesc);
-
- /*
- * MSI-X message is written per-IRQ, the offset is always 0.
- * MSI message denotes a contiguous group of IRQs, written for 0th IRQ.
- */
- if (!irq_offset)
- pci_write_msi_msg(irq, &msg);
+static struct msi_domain_info pci_msi_ir_domain_info = {
+ .flags = MSI_FLAG_USE_DEF_DOM_OPS | MSI_FLAG_USE_DEF_CHIP_OPS |
+ MSI_FLAG_MULTI_PCI_MSI | MSI_FLAG_PCI_MSIX,
+ .ops = &pci_msi_domain_ops,
+ .chip = &pci_msi_ir_controller,
+ .handler = handle_edge_irq,
+ .handler_name = "edge",
+};
- setup_remapped_irq(irq, irq_cfg(irq), chip);
+struct irq_domain *arch_create_msi_irq_domain(struct irq_domain *parent)
+{
+ return pci_msi_create_irq_domain(NULL, &pci_msi_ir_domain_info, parent);
+}
+#endif
- irq_set_chip_and_handler_name(irq, chip, handle_edge_irq, "edge");
+#ifdef CONFIG_DMAR_TABLE
+static void dmar_msi_write_msg(struct irq_data *data, struct msi_msg *msg)
+{
+ dmar_msi_write(data->irq, msg);
+}
- dev_dbg(&dev->dev, "irq %d for MSI/MSI-X\n", irq);
+static struct irq_chip dmar_msi_controller = {
+ .name = "DMAR-MSI",
+ .irq_unmask = dmar_msi_unmask,
+ .irq_mask = dmar_msi_mask,
+ .irq_ack = irq_chip_ack_parent,
+ .irq_set_affinity = msi_domain_set_affinity,
+ .irq_retrigger = irq_chip_retrigger_hierarchy,
+ .irq_compose_msi_msg = irq_msi_compose_msg,
+ .irq_write_msi_msg = dmar_msi_write_msg,
+ .flags = IRQCHIP_SKIP_SET_WAKE,
+};
- return 0;
+static irq_hw_number_t dmar_msi_get_hwirq(struct msi_domain_info *info,
+ msi_alloc_info_t *arg)
+{
+ return arg->dmar_id;
}
-int native_setup_msi_irqs(struct pci_dev *dev, int nvec, int type)
+static int dmar_msi_init(struct irq_domain *domain,
+ struct msi_domain_info *info, unsigned int virq,
+ irq_hw_number_t hwirq, msi_alloc_info_t *arg)
{
- struct msi_desc *msidesc;
- unsigned int irq;
- int node, ret;
+ irq_domain_set_info(domain, virq, arg->dmar_id, info->chip, NULL,
+ handle_edge_irq, arg->dmar_data, "edge");
- /* Multiple MSI vectors only supported with interrupt remapping */
- if (type == PCI_CAP_ID_MSI && nvec > 1)
- return 1;
+ return 0;
+}
- node = dev_to_node(&dev->dev);
+static struct msi_domain_ops dmar_msi_domain_ops = {
+ .get_hwirq = dmar_msi_get_hwirq,
+ .msi_init = dmar_msi_init,
+};
- list_for_each_entry(msidesc, &dev->msi_list, list) {
- irq = irq_alloc_hwirq(node);
- if (!irq)
- return -ENOSPC;
+static struct msi_domain_info dmar_msi_domain_info = {
+ .ops = &dmar_msi_domain_ops,
+ .chip = &dmar_msi_controller,
+};
- ret = setup_msi_irq(dev, msidesc, irq, 0);
- if (ret < 0) {
- irq_free_hwirq(irq);
- return ret;
- }
+static struct irq_domain *dmar_get_irq_domain(void)
+{
+ static struct irq_domain *dmar_domain;
+ static DEFINE_MUTEX(dmar_lock);
- }
- return 0;
-}
+ mutex_lock(&dmar_lock);
+ if (dmar_domain == NULL)
+ dmar_domain = msi_create_irq_domain(NULL, &dmar_msi_domain_info,
+ x86_vector_domain);
+ mutex_unlock(&dmar_lock);
-void native_teardown_msi_irq(unsigned int irq)
-{
- irq_free_hwirq(irq);
+ return dmar_domain;
}
-#ifdef CONFIG_DMAR_TABLE
-static int
-dmar_msi_set_affinity(struct irq_data *data, const struct cpumask *mask,
- bool force)
+int dmar_alloc_hwirq(int id, int node, void *arg)
{
- struct irq_cfg *cfg = irqd_cfg(data);
- unsigned int dest, irq = data->irq;
- struct msi_msg msg;
- int ret;
-
- ret = apic_set_affinity(data, mask, &dest);
- if (ret)
- return ret;
+ struct irq_domain *domain = dmar_get_irq_domain();
+ struct irq_alloc_info info;
- dmar_msi_read(irq, &msg);
+ if (!domain)
+ return -1;
- msg.data &= ~MSI_DATA_VECTOR_MASK;
- msg.data |= MSI_DATA_VECTOR(cfg->vector);
- msg.address_lo &= ~MSI_ADDR_DEST_ID_MASK;
- msg.address_lo |= MSI_ADDR_DEST_ID(dest);
- msg.address_hi = MSI_ADDR_BASE_HI | MSI_ADDR_EXT_DEST_ID(dest);
+ init_irq_alloc_info(&info, NULL);
+ info.type = X86_IRQ_ALLOC_TYPE_DMAR;
+ info.dmar_id = id;
+ info.dmar_data = arg;
- dmar_msi_write(irq, &msg);
-
- return IRQ_SET_MASK_OK_NOCOPY;
+ return irq_domain_alloc_irqs(domain, 1, node, &info);
}
-static struct irq_chip dmar_msi_type = {
- .name = "DMAR_MSI",
- .irq_unmask = dmar_msi_unmask,
- .irq_mask = dmar_msi_mask,
- .irq_ack = apic_ack_edge,
- .irq_set_affinity = dmar_msi_set_affinity,
- .irq_retrigger = apic_retrigger_irq,
- .flags = IRQCHIP_SKIP_SET_WAKE,
-};
-
-int arch_setup_dmar_msi(unsigned int irq)
+void dmar_free_hwirq(int irq)
{
- int ret;
- struct msi_msg msg;
-
- ret = msi_compose_msg(NULL, irq, &msg, -1);
- if (ret < 0)
- return ret;
- dmar_msi_write(irq, &msg);
- irq_set_chip_and_handler_name(irq, &dmar_msi_type, handle_edge_irq,
- "edge");
- return 0;
+ irq_domain_free_irqs(irq, 1);
}
#endif
@@ -231,56 +255,103 @@ int arch_setup_dmar_msi(unsigned int irq)
* MSI message composition
*/
#ifdef CONFIG_HPET_TIMER
+static inline int hpet_dev_id(struct irq_domain *domain)
+{
+ struct msi_domain_info *info = msi_get_domain_info(domain);
+
+ return (int)(long)info->data;
+}
-static int hpet_msi_set_affinity(struct irq_data *data,
- const struct cpumask *mask, bool force)
+static void hpet_msi_write_msg(struct irq_data *data, struct msi_msg *msg)
{
- struct irq_cfg *cfg = irqd_cfg(data);
- struct msi_msg msg;
- unsigned int dest;
- int ret;
+ hpet_msi_write(data->handler_data, msg);
+}
- ret = apic_set_affinity(data, mask, &dest);
- if (ret)
- return ret;
+static struct irq_chip hpet_msi_controller = {
+ .name = "HPET-MSI",
+ .irq_unmask = hpet_msi_unmask,
+ .irq_mask = hpet_msi_mask,
+ .irq_ack = irq_chip_ack_parent,
+ .irq_set_affinity = msi_domain_set_affinity,
+ .irq_retrigger = irq_chip_retrigger_hierarchy,
+ .irq_compose_msi_msg = irq_msi_compose_msg,
+ .irq_write_msi_msg = hpet_msi_write_msg,
+ .flags = IRQCHIP_SKIP_SET_WAKE,
+};
- hpet_msi_read(data->handler_data, &msg);
+static irq_hw_number_t hpet_msi_get_hwirq(struct msi_domain_info *info,
+ msi_alloc_info_t *arg)
+{
+ return arg->hpet_index;
+}
- msg.data &= ~MSI_DATA_VECTOR_MASK;
- msg.data |= MSI_DATA_VECTOR(cfg->vector);
- msg.address_lo &= ~MSI_ADDR_DEST_ID_MASK;
- msg.address_lo |= MSI_ADDR_DEST_ID(dest);
+static int hpet_msi_init(struct irq_domain *domain,
+ struct msi_domain_info *info, unsigned int virq,
+ irq_hw_number_t hwirq, msi_alloc_info_t *arg)
+{
+ irq_set_status_flags(virq, IRQ_MOVE_PCNTXT);
+ irq_domain_set_info(domain, virq, arg->hpet_index, info->chip, NULL,
+ handle_edge_irq, arg->hpet_data, "edge");
- hpet_msi_write(data->handler_data, &msg);
+ return 0;
+}
- return IRQ_SET_MASK_OK_NOCOPY;
+static void hpet_msi_free(struct irq_domain *domain,
+ struct msi_domain_info *info, unsigned int virq)
+{
+ irq_clear_status_flags(virq, IRQ_MOVE_PCNTXT);
}
-static struct irq_chip hpet_msi_type = {
- .name = "HPET_MSI",
- .irq_unmask = hpet_msi_unmask,
- .irq_mask = hpet_msi_mask,
- .irq_ack = apic_ack_edge,
- .irq_set_affinity = hpet_msi_set_affinity,
- .irq_retrigger = apic_retrigger_irq,
- .flags = IRQCHIP_SKIP_SET_WAKE,
+static struct msi_domain_ops hpet_msi_domain_ops = {
+ .get_hwirq = hpet_msi_get_hwirq,
+ .msi_init = hpet_msi_init,
+ .msi_free = hpet_msi_free,
+};
+
+static struct msi_domain_info hpet_msi_domain_info = {
+ .ops = &hpet_msi_domain_ops,
+ .chip = &hpet_msi_controller,
};
-int default_setup_hpet_msi(unsigned int irq, unsigned int id)
+struct irq_domain *hpet_create_irq_domain(int hpet_id)
{
- struct irq_chip *chip = &hpet_msi_type;
- struct msi_msg msg;
- int ret;
+ struct irq_domain *parent;
+ struct irq_alloc_info info;
+ struct msi_domain_info *domain_info;
+
+ if (x86_vector_domain == NULL)
+ return NULL;
+
+ domain_info = kzalloc(sizeof(*domain_info), GFP_KERNEL);
+ if (!domain_info)
+ return NULL;
+
+ *domain_info = hpet_msi_domain_info;
+ domain_info->data = (void *)(long)hpet_id;
+
+ init_irq_alloc_info(&info, NULL);
+ info.type = X86_IRQ_ALLOC_TYPE_HPET;
+ info.hpet_id = hpet_id;
+ parent = irq_remapping_get_ir_irq_domain(&info);
+ if (parent == NULL)
+ parent = x86_vector_domain;
+ else
+ hpet_msi_controller.name = "IR-HPET-MSI";
+
+ return msi_create_irq_domain(NULL, domain_info, parent);
+}
- ret = msi_compose_msg(NULL, irq, &msg, id);
- if (ret < 0)
- return ret;
+int hpet_assign_irq(struct irq_domain *domain, struct hpet_dev *dev,
+ int dev_num)
+{
+ struct irq_alloc_info info;
- hpet_msi_write(irq_get_handler_data(irq), &msg);
- irq_set_status_flags(irq, IRQ_MOVE_PCNTXT);
- setup_remapped_irq(irq, irq_cfg(irq), chip);
+ init_irq_alloc_info(&info, NULL);
+ info.type = X86_IRQ_ALLOC_TYPE_HPET;
+ info.hpet_data = dev;
+ info.hpet_id = hpet_dev_id(domain);
+ info.hpet_index = dev_num;
- irq_set_chip_and_handler_name(irq, chip, handle_edge_irq, "edge");
- return 0;
+ return irq_domain_alloc_irqs(domain, 1, NUMA_NO_NODE, &info);
}
#endif
diff --git a/arch/x86/kernel/apic/vector.c b/arch/x86/kernel/apic/vector.c
index 6cedd7914581..28eba2d38b15 100644
--- a/arch/x86/kernel/apic/vector.c
+++ b/arch/x86/kernel/apic/vector.c
@@ -3,6 +3,8 @@
*
* Copyright (C) 1997, 1998, 1999, 2000, 2009 Ingo Molnar, Hajnalka Szabo
* Moved from arch/x86/kernel/apic/io_apic.c.
+ * Jiang Liu <jiang.liu@linux.intel.com>
+ * Enable support of hierarchical irqdomains
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License version 2 as
@@ -11,15 +13,28 @@
#include <linux/interrupt.h>
#include <linux/init.h>
#include <linux/compiler.h>
-#include <linux/irqdomain.h>
#include <linux/slab.h>
+#include <asm/irqdomain.h>
#include <asm/hw_irq.h>
#include <asm/apic.h>
#include <asm/i8259.h>
#include <asm/desc.h>
#include <asm/irq_remapping.h>
+struct apic_chip_data {
+ struct irq_cfg cfg;
+ cpumask_var_t domain;
+ cpumask_var_t old_domain;
+ u8 move_in_progress : 1;
+};
+
+struct irq_domain *x86_vector_domain;
static DEFINE_RAW_SPINLOCK(vector_lock);
+static cpumask_var_t vector_cpumask;
+static struct irq_chip lapic_controller;
+#ifdef CONFIG_X86_IO_APIC
+static struct apic_chip_data *legacy_irq_data[NR_IRQS_LEGACY];
+#endif
void lock_vector_lock(void)
{
@@ -34,71 +49,59 @@ void unlock_vector_lock(void)
raw_spin_unlock(&vector_lock);
}
-struct irq_cfg *irq_cfg(unsigned int irq)
+static struct apic_chip_data *apic_chip_data(struct irq_data *irq_data)
{
- return irq_get_chip_data(irq);
+ if (!irq_data)
+ return NULL;
+
+ while (irq_data->parent_data)
+ irq_data = irq_data->parent_data;
+
+ return irq_data->chip_data;
}
struct irq_cfg *irqd_cfg(struct irq_data *irq_data)
{
- return irq_data->chip_data;
+ struct apic_chip_data *data = apic_chip_data(irq_data);
+
+ return data ? &data->cfg : NULL;
}
-static struct irq_cfg *alloc_irq_cfg(unsigned int irq, int node)
+struct irq_cfg *irq_cfg(unsigned int irq)
{
- struct irq_cfg *cfg;
+ return irqd_cfg(irq_get_irq_data(irq));
+}
- cfg = kzalloc_node(sizeof(*cfg), GFP_KERNEL, node);
- if (!cfg)
+static struct apic_chip_data *alloc_apic_chip_data(int node)
+{
+ struct apic_chip_data *data;
+
+ data = kzalloc_node(sizeof(*data), GFP_KERNEL, node);
+ if (!data)
return NULL;
- if (!zalloc_cpumask_var_node(&cfg->domain, GFP_KERNEL, node))
- goto out_cfg;
- if (!zalloc_cpumask_var_node(&cfg->old_domain, GFP_KERNEL, node))
+ if (!zalloc_cpumask_var_node(&data->domain, GFP_KERNEL, node))
+ goto out_data;
+ if (!zalloc_cpumask_var_node(&data->old_domain, GFP_KERNEL, node))
goto out_domain;
-#ifdef CONFIG_X86_IO_APIC
- INIT_LIST_HEAD(&cfg->irq_2_pin);
-#endif
- return cfg;
+ return data;
out_domain:
- free_cpumask_var(cfg->domain);
-out_cfg:
- kfree(cfg);
+ free_cpumask_var(data->domain);
+out_data:
+ kfree(data);
return NULL;
}
-struct irq_cfg *alloc_irq_and_cfg_at(unsigned int at, int node)
+static void free_apic_chip_data(struct apic_chip_data *data)
{
- int res = irq_alloc_desc_at(at, node);
- struct irq_cfg *cfg;
-
- if (res < 0) {
- if (res != -EEXIST)
- return NULL;
- cfg = irq_cfg(at);
- if (cfg)
- return cfg;
+ if (data) {
+ free_cpumask_var(data->domain);
+ free_cpumask_var(data->old_domain);
+ kfree(data);
}
-
- cfg = alloc_irq_cfg(at, node);
- if (cfg)
- irq_set_chip_data(at, cfg);
- else
- irq_free_desc(at);
- return cfg;
-}
-
-static void free_irq_cfg(unsigned int at, struct irq_cfg *cfg)
-{
- if (!cfg)
- return;
- irq_set_chip_data(at, NULL);
- free_cpumask_var(cfg->domain);
- free_cpumask_var(cfg->old_domain);
- kfree(cfg);
}
-static int
-__assign_irq_vector(int irq, struct irq_cfg *cfg, const struct cpumask *mask)
+static int __assign_irq_vector(int irq, struct apic_chip_data *d,
+ const struct cpumask *mask)
{
/*
* NOTE! The local APIC isn't very good at handling
@@ -114,36 +117,33 @@ __assign_irq_vector(int irq, struct irq_cfg *cfg, const struct cpumask *mask)
static int current_vector = FIRST_EXTERNAL_VECTOR + VECTOR_OFFSET_START;
static int current_offset = VECTOR_OFFSET_START % 16;
int cpu, err;
- cpumask_var_t tmp_mask;
- if (cfg->move_in_progress)
+ if (d->move_in_progress)
return -EBUSY;
- if (!alloc_cpumask_var(&tmp_mask, GFP_ATOMIC))
- return -ENOMEM;
-
/* Only try and allocate irqs on cpus that are present */
err = -ENOSPC;
- cpumask_clear(cfg->old_domain);
+ cpumask_clear(d->old_domain);
cpu = cpumask_first_and(mask, cpu_online_mask);
while (cpu < nr_cpu_ids) {
int new_cpu, vector, offset;
- apic->vector_allocation_domain(cpu, tmp_mask, mask);
+ apic->vector_allocation_domain(cpu, vector_cpumask, mask);
- if (cpumask_subset(tmp_mask, cfg->domain)) {
+ if (cpumask_subset(vector_cpumask, d->domain)) {
err = 0;
- if (cpumask_equal(tmp_mask, cfg->domain))
+ if (cpumask_equal(vector_cpumask, d->domain))
break;
/*
* New cpumask using the vector is a proper subset of
* the current in use mask. So cleanup the vector
* allocation for the members that are not used anymore.
*/
- cpumask_andnot(cfg->old_domain, cfg->domain, tmp_mask);
- cfg->move_in_progress =
- cpumask_intersects(cfg->old_domain, cpu_online_mask);
- cpumask_and(cfg->domain, cfg->domain, tmp_mask);
+ cpumask_andnot(d->old_domain, d->domain,
+ vector_cpumask);
+ d->move_in_progress =
+ cpumask_intersects(d->old_domain, cpu_online_mask);
+ cpumask_and(d->domain, d->domain, vector_cpumask);
break;
}
@@ -157,16 +157,18 @@ next:
}
if (unlikely(current_vector == vector)) {
- cpumask_or(cfg->old_domain, cfg->old_domain, tmp_mask);
- cpumask_andnot(tmp_mask, mask, cfg->old_domain);
- cpu = cpumask_first_and(tmp_mask, cpu_online_mask);
+ cpumask_or(d->old_domain, d->old_domain,
+ vector_cpumask);
+ cpumask_andnot(vector_cpumask, mask, d->old_domain);
+ cpu = cpumask_first_and(vector_cpumask,
+ cpu_online_mask);
continue;
}
if (test_bit(vector, used_vectors))
goto next;
- for_each_cpu_and(new_cpu, tmp_mask, cpu_online_mask) {
+ for_each_cpu_and(new_cpu, vector_cpumask, cpu_online_mask) {
if (per_cpu(vector_irq, new_cpu)[vector] >
VECTOR_UNDEFINED)
goto next;
@@ -174,55 +176,73 @@ next:
/* Found one! */
current_vector = vector;
current_offset = offset;
- if (cfg->vector) {
- cpumask_copy(cfg->old_domain, cfg->domain);
- cfg->move_in_progress =
- cpumask_intersects(cfg->old_domain, cpu_online_mask);
+ if (d->cfg.vector) {
+ cpumask_copy(d->old_domain, d->domain);
+ d->move_in_progress =
+ cpumask_intersects(d->old_domain, cpu_online_mask);
}
- for_each_cpu_and(new_cpu, tmp_mask, cpu_online_mask)
+ for_each_cpu_and(new_cpu, vector_cpumask, cpu_online_mask)
per_cpu(vector_irq, new_cpu)[vector] = irq;
- cfg->vector = vector;
- cpumask_copy(cfg->domain, tmp_mask);
+ d->cfg.vector = vector;
+ cpumask_copy(d->domain, vector_cpumask);
err = 0;
break;
}
- free_cpumask_var(tmp_mask);
+
+ if (!err) {
+ /* cache destination APIC IDs into cfg->dest_apicid */
+ err = apic->cpu_mask_to_apicid_and(mask, d->domain,
+ &d->cfg.dest_apicid);
+ }
return err;
}
-int assign_irq_vector(int irq, struct irq_cfg *cfg, const struct cpumask *mask)
+static int assign_irq_vector(int irq, struct apic_chip_data *data,
+ const struct cpumask *mask)
{
int err;
unsigned long flags;
raw_spin_lock_irqsave(&vector_lock, flags);
- err = __assign_irq_vector(irq, cfg, mask);
+ err = __assign_irq_vector(irq, data, mask);
raw_spin_unlock_irqrestore(&vector_lock, flags);
return err;
}
-void clear_irq_vector(int irq, struct irq_cfg *cfg)
+static int assign_irq_vector_policy(int irq, int node,
+ struct apic_chip_data *data,
+ struct irq_alloc_info *info)
+{
+ if (info && info->mask)
+ return assign_irq_vector(irq, data, info->mask);
+ if (node != NUMA_NO_NODE &&
+ assign_irq_vector(irq, data, cpumask_of_node(node)) == 0)
+ return 0;
+ return assign_irq_vector(irq, data, apic->target_cpus());
+}
+
+static void clear_irq_vector(int irq, struct apic_chip_data *data)
{
int cpu, vector;
unsigned long flags;
raw_spin_lock_irqsave(&vector_lock, flags);
- BUG_ON(!cfg->vector);
+ BUG_ON(!data->cfg.vector);
- vector = cfg->vector;
- for_each_cpu_and(cpu, cfg->domain, cpu_online_mask)
+ vector = data->cfg.vector;
+ for_each_cpu_and(cpu, data->domain, cpu_online_mask)
per_cpu(vector_irq, cpu)[vector] = VECTOR_UNDEFINED;
- cfg->vector = 0;
- cpumask_clear(cfg->domain);
+ data->cfg.vector = 0;
+ cpumask_clear(data->domain);
- if (likely(!cfg->move_in_progress)) {
+ if (likely(!data->move_in_progress)) {
raw_spin_unlock_irqrestore(&vector_lock, flags);
return;
}
- for_each_cpu_and(cpu, cfg->old_domain, cpu_online_mask) {
+ for_each_cpu_and(cpu, data->old_domain, cpu_online_mask) {
for (vector = FIRST_EXTERNAL_VECTOR; vector < NR_VECTORS;
vector++) {
if (per_cpu(vector_irq, cpu)[vector] != irq)
@@ -231,10 +251,95 @@ void clear_irq_vector(int irq, struct irq_cfg *cfg)
break;
}
}
- cfg->move_in_progress = 0;
+ data->move_in_progress = 0;
raw_spin_unlock_irqrestore(&vector_lock, flags);
}
+void init_irq_alloc_info(struct irq_alloc_info *info,
+ const struct cpumask *mask)
+{
+ memset(info, 0, sizeof(*info));
+ info->mask = mask;
+}
+
+void copy_irq_alloc_info(struct irq_alloc_info *dst, struct irq_alloc_info *src)
+{
+ if (src)
+ *dst = *src;
+ else
+ memset(dst, 0, sizeof(*dst));
+}
+
+static void x86_vector_free_irqs(struct irq_domain *domain,
+ unsigned int virq, unsigned int nr_irqs)
+{
+ struct irq_data *irq_data;
+ int i;
+
+ for (i = 0; i < nr_irqs; i++) {
+ irq_data = irq_domain_get_irq_data(x86_vector_domain, virq + i);
+ if (irq_data && irq_data->chip_data) {
+ clear_irq_vector(virq + i, irq_data->chip_data);
+ free_apic_chip_data(irq_data->chip_data);
+#ifdef CONFIG_X86_IO_APIC
+ if (virq + i < nr_legacy_irqs())
+ legacy_irq_data[virq + i] = NULL;
+#endif
+ irq_domain_reset_irq_data(irq_data);
+ }
+ }
+}
+
+static int x86_vector_alloc_irqs(struct irq_domain *domain, unsigned int virq,
+ unsigned int nr_irqs, void *arg)
+{
+ struct irq_alloc_info *info = arg;
+ struct apic_chip_data *data;
+ struct irq_data *irq_data;
+ int i, err;
+
+ if (disable_apic)
+ return -ENXIO;
+
+ /* Currently vector allocator can't guarantee contiguous allocations */
+ if ((info->flags & X86_IRQ_ALLOC_CONTIGUOUS_VECTORS) && nr_irqs > 1)
+ return -ENOSYS;
+
+ for (i = 0; i < nr_irqs; i++) {
+ irq_data = irq_domain_get_irq_data(domain, virq + i);
+ BUG_ON(!irq_data);
+#ifdef CONFIG_X86_IO_APIC
+ if (virq + i < nr_legacy_irqs() && legacy_irq_data[virq + i])
+ data = legacy_irq_data[virq + i];
+ else
+#endif
+ data = alloc_apic_chip_data(irq_data->node);
+ if (!data) {
+ err = -ENOMEM;
+ goto error;
+ }
+
+ irq_data->chip = &lapic_controller;
+ irq_data->chip_data = data;
+ irq_data->hwirq = virq + i;
+ err = assign_irq_vector_policy(virq, irq_data->node, data,
+ info);
+ if (err)
+ goto error;
+ }
+
+ return 0;
+
+error:
+ x86_vector_free_irqs(domain, virq, i + 1);
+ return err;
+}
+
+static const struct irq_domain_ops x86_vector_domain_ops = {
+ .alloc = x86_vector_alloc_irqs,
+ .free = x86_vector_free_irqs,
+};
+
int __init arch_probe_nr_irqs(void)
{
int nr;
@@ -258,8 +363,43 @@ int __init arch_probe_nr_irqs(void)
return nr_legacy_irqs();
}
+#ifdef CONFIG_X86_IO_APIC
+static void init_legacy_irqs(void)
+{
+ int i, node = cpu_to_node(0);
+ struct apic_chip_data *data;
+
+ /*
+ * For legacy IRQ's, start with assigning irq0 to irq15 to
+ * ISA_IRQ_VECTOR(i) for all cpu's.
+ */
+ for (i = 0; i < nr_legacy_irqs(); i++) {
+ data = legacy_irq_data[i] = alloc_apic_chip_data(node);
+ BUG_ON(!data);
+
+ data->cfg.vector = ISA_IRQ_VECTOR(i);
+ cpumask_setall(data->domain);
+ irq_set_chip_data(i, data);
+ }
+}
+#else
+static void init_legacy_irqs(void) { }
+#endif
+
int __init arch_early_irq_init(void)
{
+ init_legacy_irqs();
+
+ x86_vector_domain = irq_domain_add_tree(NULL, &x86_vector_domain_ops,
+ NULL);
+ BUG_ON(x86_vector_domain == NULL);
+ irq_set_default_host(x86_vector_domain);
+
+ arch_init_msi_domain(x86_vector_domain);
+ arch_init_htirq_domain(x86_vector_domain);
+
+ BUG_ON(!alloc_cpumask_var(&vector_cpumask, GFP_KERNEL));
+
return arch_early_ioapic_init();
}
@@ -267,7 +407,7 @@ static void __setup_vector_irq(int cpu)
{
/* Initialize vector_irq on a new cpu */
int irq, vector;
- struct irq_cfg *cfg;
+ struct apic_chip_data *data;
/*
* vector_lock will make sure that we don't run into irq vector
@@ -277,13 +417,13 @@ static void __setup_vector_irq(int cpu)
raw_spin_lock(&vector_lock);
/* Mark the inuse vectors */
for_each_active_irq(irq) {
- cfg = irq_cfg(irq);
- if (!cfg)
+ data = apic_chip_data(irq_get_irq_data(irq));
+ if (!data)
continue;
- if (!cpumask_test_cpu(cpu, cfg->domain))
+ if (!cpumask_test_cpu(cpu, data->domain))
continue;
- vector = cfg->vector;
+ vector = data->cfg.vector;
per_cpu(vector_irq, cpu)[vector] = irq;
}
/* Mark the free vectors */
@@ -292,8 +432,8 @@ static void __setup_vector_irq(int cpu)
if (irq <= VECTOR_UNDEFINED)
continue;
- cfg = irq_cfg(irq);
- if (!cpumask_test_cpu(cpu, cfg->domain))
+ data = apic_chip_data(irq_get_irq_data(irq));
+ if (!cpumask_test_cpu(cpu, data->domain))
per_cpu(vector_irq, cpu)[vector] = VECTOR_UNDEFINED;
}
raw_spin_unlock(&vector_lock);
@@ -314,20 +454,20 @@ void setup_vector_irq(int cpu)
* legacy vector to irq mapping:
*/
for (irq = 0; irq < nr_legacy_irqs(); irq++)
- per_cpu(vector_irq, cpu)[IRQ0_VECTOR + irq] = irq;
+ per_cpu(vector_irq, cpu)[ISA_IRQ_VECTOR(irq)] = irq;
__setup_vector_irq(cpu);
}
-int apic_retrigger_irq(struct irq_data *data)
+static int apic_retrigger_irq(struct irq_data *irq_data)
{
- struct irq_cfg *cfg = irqd_cfg(data);
+ struct apic_chip_data *data = apic_chip_data(irq_data);
unsigned long flags;
int cpu;
raw_spin_lock_irqsave(&vector_lock, flags);
- cpu = cpumask_first_and(cfg->domain, cpu_online_mask);
- apic->send_IPI_mask(cpumask_of(cpu), cfg->vector);
+ cpu = cpumask_first_and(data->domain, cpu_online_mask);
+ apic->send_IPI_mask(cpumask_of(cpu), data->cfg.vector);
raw_spin_unlock_irqrestore(&vector_lock, flags);
return 1;
@@ -340,73 +480,76 @@ void apic_ack_edge(struct irq_data *data)
ack_APIC_irq();
}
-/*
- * Either sets data->affinity to a valid value, and returns
- * ->cpu_mask_to_apicid of that in dest_id, or returns -1 and
- * leaves data->affinity untouched.
- */
-int apic_set_affinity(struct irq_data *data, const struct cpumask *mask,
- unsigned int *dest_id)
+static int apic_set_affinity(struct irq_data *irq_data,
+ const struct cpumask *dest, bool force)
{
- struct irq_cfg *cfg = irqd_cfg(data);
- unsigned int irq = data->irq;
- int err;
+ struct apic_chip_data *data = irq_data->chip_data;
+ int err, irq = irq_data->irq;
if (!config_enabled(CONFIG_SMP))
return -EPERM;
- if (!cpumask_intersects(mask, cpu_online_mask))
+ if (!cpumask_intersects(dest, cpu_online_mask))
return -EINVAL;
- err = assign_irq_vector(irq, cfg, mask);
- if (err)
- return err;
-
- err = apic->cpu_mask_to_apicid_and(mask, cfg->domain, dest_id);
+ err = assign_irq_vector(irq, data, dest);
if (err) {
- if (assign_irq_vector(irq, cfg, data->affinity))
+ struct irq_data *top = irq_get_irq_data(irq);
+
+ if (assign_irq_vector(irq, data, top->affinity))
pr_err("Failed to recover vector for irq %d\n", irq);
return err;
}
- cpumask_copy(data->affinity, mask);
-
- return 0;
+ return IRQ_SET_MASK_OK;
}
+static struct irq_chip lapic_controller = {
+ .irq_ack = apic_ack_edge,
+ .irq_set_affinity = apic_set_affinity,
+ .irq_retrigger = apic_retrigger_irq,
+};
+
#ifdef CONFIG_SMP
-void send_cleanup_vector(struct irq_cfg *cfg)
+static void __send_cleanup_vector(struct apic_chip_data *data)
{
cpumask_var_t cleanup_mask;
if (unlikely(!alloc_cpumask_var(&cleanup_mask, GFP_ATOMIC))) {
unsigned int i;
- for_each_cpu_and(i, cfg->old_domain, cpu_online_mask)
+ for_each_cpu_and(i, data->old_domain, cpu_online_mask)
apic->send_IPI_mask(cpumask_of(i),
IRQ_MOVE_CLEANUP_VECTOR);
} else {
- cpumask_and(cleanup_mask, cfg->old_domain, cpu_online_mask);
+ cpumask_and(cleanup_mask, data->old_domain, cpu_online_mask);
apic->send_IPI_mask(cleanup_mask, IRQ_MOVE_CLEANUP_VECTOR);
free_cpumask_var(cleanup_mask);
}
- cfg->move_in_progress = 0;
+ data->move_in_progress = 0;
+}
+
+void send_cleanup_vector(struct irq_cfg *cfg)
+{
+ struct apic_chip_data *data;
+
+ data = container_of(cfg, struct apic_chip_data, cfg);
+ if (data->move_in_progress)
+ __send_cleanup_vector(data);
}
asmlinkage __visible void smp_irq_move_cleanup_interrupt(void)
{
unsigned vector, me;
- ack_APIC_irq();
- irq_enter();
- exit_idle();
+ entering_ack_irq();
me = smp_processor_id();
for (vector = FIRST_EXTERNAL_VECTOR; vector < NR_VECTORS; vector++) {
int irq;
unsigned int irr;
struct irq_desc *desc;
- struct irq_cfg *cfg;
+ struct apic_chip_data *data;
irq = __this_cpu_read(vector_irq[vector]);
@@ -417,8 +560,8 @@ asmlinkage __visible void smp_irq_move_cleanup_interrupt(void)
if (!desc)
continue;
- cfg = irq_cfg(irq);
- if (!cfg)
+ data = apic_chip_data(&desc->irq_data);
+ if (!data)
continue;
raw_spin_lock(&desc->lock);
@@ -427,10 +570,11 @@ asmlinkage __visible void smp_irq_move_cleanup_interrupt(void)
* Check if the irq migration is in progress. If so, we
* haven't received the cleanup request yet for this irq.
*/
- if (cfg->move_in_progress)
+ if (data->move_in_progress)
goto unlock;
- if (vector == cfg->vector && cpumask_test_cpu(me, cfg->domain))
+ if (vector == data->cfg.vector &&
+ cpumask_test_cpu(me, data->domain))
goto unlock;
irr = apic_read(APIC_IRR + (vector / 32 * 0x10));
@@ -450,20 +594,21 @@ unlock:
raw_spin_unlock(&desc->lock);
}
- irq_exit();
+ exiting_irq();
}
static void __irq_complete_move(struct irq_cfg *cfg, unsigned vector)
{
unsigned me;
+ struct apic_chip_data *data;
- if (likely(!cfg->move_in_progress))
+ data = container_of(cfg, struct apic_chip_data, cfg);
+ if (likely(!data->move_in_progress))
return;
me = smp_processor_id();
-
- if (vector == cfg->vector && cpumask_test_cpu(me, cfg->domain))
- send_cleanup_vector(cfg);
+ if (vector == data->cfg.vector && cpumask_test_cpu(me, data->domain))
+ __send_cleanup_vector(data);
}
void irq_complete_move(struct irq_cfg *cfg)
@@ -475,46 +620,11 @@ void irq_force_complete_move(int irq)
{
struct irq_cfg *cfg = irq_cfg(irq);
- if (!cfg)
- return;
-
- __irq_complete_move(cfg, cfg->vector);
+ if (cfg)
+ __irq_complete_move(cfg, cfg->vector);
}
#endif
-/*
- * Dynamic irq allocate and deallocation. Should be replaced by irq domains!
- */
-int arch_setup_hwirq(unsigned int irq, int node)
-{
- struct irq_cfg *cfg;
- unsigned long flags;
- int ret;
-
- cfg = alloc_irq_cfg(irq, node);
- if (!cfg)
- return -ENOMEM;
-
- raw_spin_lock_irqsave(&vector_lock, flags);
- ret = __assign_irq_vector(irq, cfg, apic->target_cpus());
- raw_spin_unlock_irqrestore(&vector_lock, flags);
-
- if (!ret)
- irq_set_chip_data(irq, cfg);
- else
- free_irq_cfg(irq, cfg);
- return ret;
-}
-
-void arch_teardown_hwirq(unsigned int irq)
-{
- struct irq_cfg *cfg = irq_cfg(irq);
-
- free_remapped_irq(irq);
- clear_irq_vector(irq, cfg);
- free_irq_cfg(irq, cfg);
-}
-
static void __init print_APIC_field(int base)
{
int i;
diff --git a/arch/x86/kernel/apic/x2apic_phys.c b/arch/x86/kernel/apic/x2apic_phys.c
index 6fae733e9194..3ffd925655e0 100644
--- a/arch/x86/kernel/apic/x2apic_phys.c
+++ b/arch/x86/kernel/apic/x2apic_phys.c
@@ -21,11 +21,13 @@ early_param("x2apic_phys", set_x2apic_phys_mode);
static bool x2apic_fadt_phys(void)
{
+#ifdef CONFIG_ACPI
if ((acpi_gbl_FADT.header.revision >= FADT2_REVISION_ID) &&
(acpi_gbl_FADT.flags & ACPI_FADT_APIC_PHYSICAL)) {
printk(KERN_DEBUG "System requires x2apic physical mode\n");
return true;
}
+#endif
return false;
}
diff --git a/arch/x86/kernel/asm-offsets.c b/arch/x86/kernel/asm-offsets.c
index b27f6ec90caa..8e3d22a1af94 100644
--- a/arch/x86/kernel/asm-offsets.c
+++ b/arch/x86/kernel/asm-offsets.c
@@ -68,7 +68,9 @@ void common(void) {
OFFSET(PV_IRQ_irq_disable, pv_irq_ops, irq_disable);
OFFSET(PV_IRQ_irq_enable, pv_irq_ops, irq_enable);
OFFSET(PV_CPU_iret, pv_cpu_ops, iret);
+#ifdef CONFIG_X86_32
OFFSET(PV_CPU_irq_enable_sysexit, pv_cpu_ops, irq_enable_sysexit);
+#endif
OFFSET(PV_CPU_read_cr0, pv_cpu_ops, read_cr0);
OFFSET(PV_MMU_read_cr2, pv_mmu_ops, read_cr2);
#endif
diff --git a/arch/x86/kernel/asm-offsets_64.c b/arch/x86/kernel/asm-offsets_64.c
index dcaab87da629..d8f42f902a0f 100644
--- a/arch/x86/kernel/asm-offsets_64.c
+++ b/arch/x86/kernel/asm-offsets_64.c
@@ -66,7 +66,7 @@ int main(void)
DEFINE(__NR_syscall_max, sizeof(syscalls_64) - 1);
DEFINE(NR_syscalls, sizeof(syscalls_64));
- DEFINE(__NR_ia32_syscall_max, sizeof(syscalls_ia32) - 1);
+ DEFINE(__NR_syscall_compat_max, sizeof(syscalls_ia32) - 1);
DEFINE(IA32_NR_syscalls, sizeof(syscalls_ia32));
return 0;
diff --git a/arch/x86/kernel/cpu/amd.c b/arch/x86/kernel/cpu/amd.c
index 56cae1964a81..dd3a4baffe50 100644
--- a/arch/x86/kernel/cpu/amd.c
+++ b/arch/x86/kernel/cpu/amd.c
@@ -295,7 +295,7 @@ static int nearby_node(int apicid)
* Assumption: Number of cores in each internal node is the same.
* (2) AMD processors supporting compute units
*/
-#ifdef CONFIG_X86_HT
+#ifdef CONFIG_SMP
static void amd_get_topology(struct cpuinfo_x86 *c)
{
u32 cores_per_cu = 1;
@@ -348,7 +348,7 @@ static void amd_get_topology(struct cpuinfo_x86 *c)
*/
static void amd_detect_cmp(struct cpuinfo_x86 *c)
{
-#ifdef CONFIG_X86_HT
+#ifdef CONFIG_SMP
unsigned bits;
int cpu = smp_processor_id();
@@ -433,7 +433,7 @@ static void srat_detect_node(struct cpuinfo_x86 *c)
static void early_init_amd_mc(struct cpuinfo_x86 *c)
{
-#ifdef CONFIG_X86_HT
+#ifdef CONFIG_SMP
unsigned bits, ecx;
/* Multi core CPU? */
diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c
index b28e5262a0a5..9fc5e3d9d9c8 100644
--- a/arch/x86/kernel/cpu/common.c
+++ b/arch/x86/kernel/cpu/common.c
@@ -491,7 +491,7 @@ static void cpu_detect_tlb(struct cpuinfo_x86 *c)
void detect_ht(struct cpuinfo_x86 *c)
{
-#ifdef CONFIG_X86_HT
+#ifdef CONFIG_SMP
u32 eax, ebx, ecx, edx;
int index_msb, core_bits;
static bool printed;
@@ -827,7 +827,7 @@ static void generic_identify(struct cpuinfo_x86 *c)
if (c->cpuid_level >= 0x00000001) {
c->initial_apicid = (cpuid_ebx(1) >> 24) & 0xFF;
#ifdef CONFIG_X86_32
-# ifdef CONFIG_X86_HT
+# ifdef CONFIG_SMP
c->apicid = apic->phys_pkg_id(c->initial_apicid, 0);
# else
c->apicid = c->initial_apicid;
@@ -1009,7 +1009,7 @@ void enable_sep_cpu(void)
(unsigned long)tss + offsetofend(struct tss_struct, SYSENTER_stack),
0);
- wrmsr(MSR_IA32_SYSENTER_EIP, (unsigned long)ia32_sysenter_target, 0);
+ wrmsr(MSR_IA32_SYSENTER_EIP, (unsigned long)entry_SYSENTER_32, 0);
out:
put_cpu();
@@ -1138,10 +1138,6 @@ static __init int setup_disablecpuid(char *arg)
}
__setup("clearcpuid=", setup_disablecpuid);
-DEFINE_PER_CPU(unsigned long, kernel_stack) =
- (unsigned long)&init_thread_union + THREAD_SIZE;
-EXPORT_PER_CPU_SYMBOL(kernel_stack);
-
#ifdef CONFIG_X86_64
struct desc_ptr idt_descr = { NR_VECTORS * 16 - 1, (unsigned long) idt_table };
struct desc_ptr debug_idt_descr = { NR_VECTORS * 16 - 1,
@@ -1189,10 +1185,10 @@ void syscall_init(void)
* set CS/DS but only a 32bit target. LSTAR sets the 64bit rip.
*/
wrmsrl(MSR_STAR, ((u64)__USER32_CS)<<48 | ((u64)__KERNEL_CS)<<32);
- wrmsrl(MSR_LSTAR, system_call);
+ wrmsrl(MSR_LSTAR, entry_SYSCALL_64);
#ifdef CONFIG_IA32_EMULATION
- wrmsrl(MSR_CSTAR, ia32_cstar_target);
+ wrmsrl(MSR_CSTAR, entry_SYSCALL_compat);
/*
* This only works on Intel CPUs.
* On AMD CPUs these MSRs are 32-bit, CPU truncates MSR_IA32_SYSENTER_EIP.
@@ -1201,7 +1197,7 @@ void syscall_init(void)
*/
wrmsrl_safe(MSR_IA32_SYSENTER_CS, (u64)__KERNEL_CS);
wrmsrl_safe(MSR_IA32_SYSENTER_ESP, 0ULL);
- wrmsrl_safe(MSR_IA32_SYSENTER_EIP, (u64)ia32_sysenter_target);
+ wrmsrl_safe(MSR_IA32_SYSENTER_EIP, (u64)entry_SYSENTER_compat);
#else
wrmsrl(MSR_CSTAR, ignore_sysret);
wrmsrl_safe(MSR_IA32_SYSENTER_CS, (u64)GDT_ENTRY_INVALID_SEG);
diff --git a/arch/x86/kernel/cpu/intel_cacheinfo.c b/arch/x86/kernel/cpu/intel_cacheinfo.c
index edcb0e28c336..be4febc58b94 100644
--- a/arch/x86/kernel/cpu/intel_cacheinfo.c
+++ b/arch/x86/kernel/cpu/intel_cacheinfo.c
@@ -654,7 +654,7 @@ unsigned int init_intel_cacheinfo(struct cpuinfo_x86 *c)
unsigned int new_l1d = 0, new_l1i = 0; /* Cache sizes from cpuid(4) */
unsigned int new_l2 = 0, new_l3 = 0, i; /* Cache sizes from cpuid(4) */
unsigned int l2_id = 0, l3_id = 0, num_threads_sharing, index_msb;
-#ifdef CONFIG_X86_HT
+#ifdef CONFIG_SMP
unsigned int cpu = c->cpu_index;
#endif
@@ -773,19 +773,19 @@ unsigned int init_intel_cacheinfo(struct cpuinfo_x86 *c)
if (new_l2) {
l2 = new_l2;
-#ifdef CONFIG_X86_HT
+#ifdef CONFIG_SMP
per_cpu(cpu_llc_id, cpu) = l2_id;
#endif
}
if (new_l3) {
l3 = new_l3;
-#ifdef CONFIG_X86_HT
+#ifdef CONFIG_SMP
per_cpu(cpu_llc_id, cpu) = l3_id;
#endif
}
-#ifdef CONFIG_X86_HT
+#ifdef CONFIG_SMP
/*
* If cpu_llc_id is not yet set, this means cpuid_level < 4 which in
* turns means that the only possibility is SMT (as indicated in
diff --git a/arch/x86/kernel/cpu/mcheck/mce.c b/arch/x86/kernel/cpu/mcheck/mce.c
index 95cf78d44ab4..df919ff103c3 100644
--- a/arch/x86/kernel/cpu/mcheck/mce.c
+++ b/arch/x86/kernel/cpu/mcheck/mce.c
@@ -1053,6 +1053,7 @@ void do_machine_check(struct pt_regs *regs, long error_code)
char *msg = "Unknown";
u64 recover_paddr = ~0ull;
int flags = MF_ACTION_REQUIRED;
+ int lmce = 0;
prev_state = ist_enter(regs);
@@ -1080,11 +1081,20 @@ void do_machine_check(struct pt_regs *regs, long error_code)
kill_it = 1;
/*
- * Go through all the banks in exclusion of the other CPUs.
- * This way we don't report duplicated events on shared banks
- * because the first one to see it will clear it.
+ * Check if this MCE is signaled to only this logical processor
*/
- order = mce_start(&no_way_out);
+ if (m.mcgstatus & MCG_STATUS_LMCES)
+ lmce = 1;
+ else {
+ /*
+ * Go through all the banks in exclusion of the other CPUs.
+ * This way we don't report duplicated events on shared banks
+ * because the first one to see it will clear it.
+ * If this is a Local MCE, then no need to perform rendezvous.
+ */
+ order = mce_start(&no_way_out);
+ }
+
for (i = 0; i < cfg->banks; i++) {
__clear_bit(i, toclear);
if (!test_bit(i, valid_banks))
@@ -1161,8 +1171,18 @@ void do_machine_check(struct pt_regs *regs, long error_code)
* Do most of the synchronization with other CPUs.
* When there's any problem use only local no_way_out state.
*/
- if (mce_end(order) < 0)
- no_way_out = worst >= MCE_PANIC_SEVERITY;
+ if (!lmce) {
+ if (mce_end(order) < 0)
+ no_way_out = worst >= MCE_PANIC_SEVERITY;
+ } else {
+ /*
+ * Local MCE skipped calling mce_reign()
+ * If we found a fatal error, we need to panic here.
+ */
+ if (worst >= MCE_PANIC_SEVERITY && mca_cfg.tolerant < 3)
+ mce_panic("Machine check from unknown source",
+ NULL, NULL);
+ }
/*
* At insane "tolerant" levels we take no action. Otherwise
@@ -1643,10 +1663,16 @@ static void __mcheck_cpu_init_vendor(struct cpuinfo_x86 *c)
mce_intel_feature_init(c);
mce_adjust_timer = cmci_intel_adjust_timer;
break;
- case X86_VENDOR_AMD:
+
+ case X86_VENDOR_AMD: {
+ u32 ebx = cpuid_ebx(0x80000007);
+
mce_amd_feature_init(c);
- mce_flags.overflow_recov = cpuid_ebx(0x80000007) & 0x1;
+ mce_flags.overflow_recov = !!(ebx & BIT(0));
+ mce_flags.succor = !!(ebx & BIT(1));
break;
+ }
+
default:
break;
}
@@ -1982,6 +2008,7 @@ void mce_disable_bank(int bank)
/*
* mce=off Disables machine check
* mce=no_cmci Disables CMCI
+ * mce=no_lmce Disables LMCE
* mce=dont_log_ce Clears corrected events silently, no log created for CEs.
* mce=ignore_ce Disables polling and CMCI, corrected events are not cleared.
* mce=TOLERANCELEVEL[,monarchtimeout] (number, see above)
@@ -2005,6 +2032,8 @@ static int __init mcheck_enable(char *str)
cfg->disabled = true;
else if (!strcmp(str, "no_cmci"))
cfg->cmci_disabled = true;
+ else if (!strcmp(str, "no_lmce"))
+ cfg->lmce_disabled = true;
else if (!strcmp(str, "dont_log_ce"))
cfg->dont_log_ce = true;
else if (!strcmp(str, "ignore_ce"))
@@ -2014,11 +2043,8 @@ static int __init mcheck_enable(char *str)
else if (!strcmp(str, "bios_cmci_threshold"))
cfg->bios_cmci_threshold = true;
else if (isdigit(str[0])) {
- get_option(&str, &(cfg->tolerant));
- if (*str == ',') {
- ++str;
+ if (get_option(&str, &cfg->tolerant) == 2)
get_option(&str, &(cfg->monarch_timeout));
- }
} else {
pr_info("mce argument %s ignored. Please use /sys\n", str);
return 0;
diff --git a/arch/x86/kernel/cpu/mcheck/mce_amd.c b/arch/x86/kernel/cpu/mcheck/mce_amd.c
index 55ad9b37cae8..e99b15077e94 100644
--- a/arch/x86/kernel/cpu/mcheck/mce_amd.c
+++ b/arch/x86/kernel/cpu/mcheck/mce_amd.c
@@ -1,19 +1,13 @@
/*
- * (c) 2005-2012 Advanced Micro Devices, Inc.
+ * (c) 2005-2015 Advanced Micro Devices, Inc.
* Your use of this code is subject to the terms and conditions of the
* GNU general public license version 2. See "COPYING" or
* http://www.gnu.org/licenses/gpl.html
*
* Written by Jacob Shin - AMD, Inc.
- *
* Maintained by: Borislav Petkov <bp@alien8.de>
*
- * April 2006
- * - added support for AMD Family 0x10 processors
- * May 2012
- * - major scrubbing
- *
- * All MC4_MISCi registers are shared between multi-cores
+ * All MC4_MISCi registers are shared between cores on a node.
*/
#include <linux/interrupt.h>
#include <linux/notifier.h>
@@ -32,6 +26,7 @@
#include <asm/idle.h>
#include <asm/mce.h>
#include <asm/msr.h>
+#include <asm/trace/irq_vectors.h>
#define NR_BLOCKS 9
#define THRESHOLD_MAX 0xFFF
@@ -47,6 +42,13 @@
#define MASK_BLKPTR_LO 0xFF000000
#define MCG_XBLK_ADDR 0xC0000400
+/* Deferred error settings */
+#define MSR_CU_DEF_ERR 0xC0000410
+#define MASK_DEF_LVTOFF 0x000000F0
+#define MASK_DEF_INT_TYPE 0x00000006
+#define DEF_LVT_OFF 0x2
+#define DEF_INT_TYPE_APIC 0x2
+
static const char * const th_names[] = {
"load_store",
"insn_fetch",
@@ -60,6 +62,13 @@ static DEFINE_PER_CPU(struct threshold_bank **, threshold_banks);
static DEFINE_PER_CPU(unsigned char, bank_map); /* see which banks are on */
static void amd_threshold_interrupt(void);
+static void amd_deferred_error_interrupt(void);
+
+static void default_deferred_error_interrupt(void)
+{
+ pr_err("Unexpected deferred interrupt at vector %x\n", DEFERRED_ERROR_VECTOR);
+}
+void (*deferred_error_int_vector)(void) = default_deferred_error_interrupt;
/*
* CPU Initialization
@@ -196,7 +205,7 @@ static void mce_threshold_block_init(struct threshold_block *b, int offset)
threshold_restart_bank(&tr);
};
-static int setup_APIC_mce(int reserved, int new)
+static int setup_APIC_mce_threshold(int reserved, int new)
{
if (reserved < 0 && !setup_APIC_eilvt(new, THRESHOLD_APIC_VECTOR,
APIC_EILVT_MSG_FIX, 0))
@@ -205,6 +214,39 @@ static int setup_APIC_mce(int reserved, int new)
return reserved;
}
+static int setup_APIC_deferred_error(int reserved, int new)
+{
+ if (reserved < 0 && !setup_APIC_eilvt(new, DEFERRED_ERROR_VECTOR,
+ APIC_EILVT_MSG_FIX, 0))
+ return new;
+
+ return reserved;
+}
+
+static void deferred_error_interrupt_enable(struct cpuinfo_x86 *c)
+{
+ u32 low = 0, high = 0;
+ int def_offset = -1, def_new;
+
+ if (rdmsr_safe(MSR_CU_DEF_ERR, &low, &high))
+ return;
+
+ def_new = (low & MASK_DEF_LVTOFF) >> 4;
+ if (!(low & MASK_DEF_LVTOFF)) {
+ pr_err(FW_BUG "Your BIOS is not setting up LVT offset 0x2 for deferred error IRQs correctly.\n");
+ def_new = DEF_LVT_OFF;
+ low = (low & ~MASK_DEF_LVTOFF) | (DEF_LVT_OFF << 4);
+ }
+
+ def_offset = setup_APIC_deferred_error(def_offset, def_new);
+ if ((def_offset == def_new) &&
+ (deferred_error_int_vector != amd_deferred_error_interrupt))
+ deferred_error_int_vector = amd_deferred_error_interrupt;
+
+ low = (low & ~MASK_DEF_INT_TYPE) | DEF_INT_TYPE_APIC;
+ wrmsr(MSR_CU_DEF_ERR, low, high);
+}
+
/* cpu init entry point, called from mce.c with preempt off */
void mce_amd_feature_init(struct cpuinfo_x86 *c)
{
@@ -252,7 +294,7 @@ void mce_amd_feature_init(struct cpuinfo_x86 *c)
b.interrupt_enable = 1;
new = (high & MASK_LVTOFF_HI) >> 20;
- offset = setup_APIC_mce(offset, new);
+ offset = setup_APIC_mce_threshold(offset, new);
if ((offset == new) &&
(mce_threshold_vector != amd_threshold_interrupt))
@@ -262,6 +304,73 @@ init:
mce_threshold_block_init(&b, offset);
}
}
+
+ if (mce_flags.succor)
+ deferred_error_interrupt_enable(c);
+}
+
+static void __log_error(unsigned int bank, bool threshold_err, u64 misc)
+{
+ struct mce m;
+ u64 status;
+
+ rdmsrl(MSR_IA32_MCx_STATUS(bank), status);
+ if (!(status & MCI_STATUS_VAL))
+ return;
+
+ mce_setup(&m);
+
+ m.status = status;
+ m.bank = bank;
+
+ if (threshold_err)
+ m.misc = misc;
+
+ if (m.status & MCI_STATUS_ADDRV)
+ rdmsrl(MSR_IA32_MCx_ADDR(bank), m.addr);
+
+ mce_log(&m);
+ wrmsrl(MSR_IA32_MCx_STATUS(bank), 0);
+}
+
+static inline void __smp_deferred_error_interrupt(void)
+{
+ inc_irq_stat(irq_deferred_error_count);
+ deferred_error_int_vector();
+}
+
+asmlinkage __visible void smp_deferred_error_interrupt(void)
+{
+ entering_irq();
+ __smp_deferred_error_interrupt();
+ exiting_ack_irq();
+}
+
+asmlinkage __visible void smp_trace_deferred_error_interrupt(void)
+{
+ entering_irq();
+ trace_deferred_error_apic_entry(DEFERRED_ERROR_VECTOR);
+ __smp_deferred_error_interrupt();
+ trace_deferred_error_apic_exit(DEFERRED_ERROR_VECTOR);
+ exiting_ack_irq();
+}
+
+/* APIC interrupt handler for deferred errors */
+static void amd_deferred_error_interrupt(void)
+{
+ u64 status;
+ unsigned int bank;
+
+ for (bank = 0; bank < mca_cfg.banks; ++bank) {
+ rdmsrl(MSR_IA32_MCx_STATUS(bank), status);
+
+ if (!(status & MCI_STATUS_VAL) ||
+ !(status & MCI_STATUS_DEFERRED))
+ continue;
+
+ __log_error(bank, false, 0);
+ break;
+ }
}
/*
@@ -273,12 +382,12 @@ init:
* the interrupt goes off when error_count reaches threshold_limit.
* the handler will simply log mcelog w/ software defined bank number.
*/
+
static void amd_threshold_interrupt(void)
{
u32 low = 0, high = 0, address = 0;
int cpu = smp_processor_id();
unsigned int bank, block;
- struct mce m;
/* assume first bank caused it */
for (bank = 0; bank < mca_cfg.banks; ++bank) {
@@ -321,15 +430,7 @@ static void amd_threshold_interrupt(void)
return;
log:
- mce_setup(&m);
- rdmsrl(MSR_IA32_MCx_STATUS(bank), m.status);
- if (!(m.status & MCI_STATUS_VAL))
- return;
- m.misc = ((u64)high << 32) | low;
- m.bank = bank;
- mce_log(&m);
-
- wrmsrl(MSR_IA32_MCx_STATUS(bank), 0);
+ __log_error(bank, true, ((u64)high << 32) | low);
}
/*
diff --git a/arch/x86/kernel/cpu/mcheck/mce_intel.c b/arch/x86/kernel/cpu/mcheck/mce_intel.c
index b4a41cf030ed..844f56c5616d 100644
--- a/arch/x86/kernel/cpu/mcheck/mce_intel.c
+++ b/arch/x86/kernel/cpu/mcheck/mce_intel.c
@@ -91,6 +91,36 @@ static int cmci_supported(int *banks)
return !!(cap & MCG_CMCI_P);
}
+static bool lmce_supported(void)
+{
+ u64 tmp;
+
+ if (mca_cfg.lmce_disabled)
+ return false;
+
+ rdmsrl(MSR_IA32_MCG_CAP, tmp);
+
+ /*
+ * LMCE depends on recovery support in the processor. Hence both
+ * MCG_SER_P and MCG_LMCE_P should be present in MCG_CAP.
+ */
+ if ((tmp & (MCG_SER_P | MCG_LMCE_P)) !=
+ (MCG_SER_P | MCG_LMCE_P))
+ return false;
+
+ /*
+ * BIOS should indicate support for LMCE by setting bit 20 in
+ * IA32_FEATURE_CONTROL without which touching MCG_EXT_CTL will
+ * generate a #GP fault.
+ */
+ rdmsrl(MSR_IA32_FEATURE_CONTROL, tmp);
+ if ((tmp & (FEATURE_CONTROL_LOCKED | FEATURE_CONTROL_LMCE)) ==
+ (FEATURE_CONTROL_LOCKED | FEATURE_CONTROL_LMCE))
+ return true;
+
+ return false;
+}
+
bool mce_intel_cmci_poll(void)
{
if (__this_cpu_read(cmci_storm_state) == CMCI_STORM_NONE)
@@ -405,8 +435,22 @@ static void intel_init_cmci(void)
cmci_recheck();
}
+void intel_init_lmce(void)
+{
+ u64 val;
+
+ if (!lmce_supported())
+ return;
+
+ rdmsrl(MSR_IA32_MCG_EXT_CTL, val);
+
+ if (!(val & MCG_EXT_CTL_LMCE_EN))
+ wrmsrl(MSR_IA32_MCG_EXT_CTL, val | MCG_EXT_CTL_LMCE_EN);
+}
+
void mce_intel_feature_init(struct cpuinfo_x86 *c)
{
intel_init_thermal(c);
intel_init_cmci();
+ intel_init_lmce();
}
diff --git a/arch/x86/kernel/cpu/mshyperv.c b/arch/x86/kernel/cpu/mshyperv.c
index 939155ffdece..aad4bd84b475 100644
--- a/arch/x86/kernel/cpu/mshyperv.c
+++ b/arch/x86/kernel/cpu/mshyperv.c
@@ -39,14 +39,12 @@ void hyperv_vector_handler(struct pt_regs *regs)
{
struct pt_regs *old_regs = set_irq_regs(regs);
- irq_enter();
- exit_idle();
-
+ entering_irq();
inc_irq_stat(irq_hv_callback_count);
if (vmbus_handler)
vmbus_handler();
- irq_exit();
+ exiting_irq();
set_irq_regs(old_regs);
}
diff --git a/arch/x86/kernel/cpu/mtrr/cleanup.c b/arch/x86/kernel/cpu/mtrr/cleanup.c
index 5f90b85ff22e..70d7c93f4550 100644
--- a/arch/x86/kernel/cpu/mtrr/cleanup.c
+++ b/arch/x86/kernel/cpu/mtrr/cleanup.c
@@ -98,7 +98,8 @@ x86_get_mtrr_mem_range(struct range *range, int nr_range,
continue;
base = range_state[i].base_pfn;
if (base < (1<<(20-PAGE_SHIFT)) && mtrr_state.have_fixed &&
- (mtrr_state.enabled & 1)) {
+ (mtrr_state.enabled & MTRR_STATE_MTRR_ENABLED) &&
+ (mtrr_state.enabled & MTRR_STATE_MTRR_FIXED_ENABLED)) {
/* Var MTRR contains UC entry below 1M? Skip it: */
printk(BIOS_BUG_MSG, i);
if (base + size <= (1<<(20-PAGE_SHIFT)))
diff --git a/arch/x86/kernel/cpu/mtrr/generic.c b/arch/x86/kernel/cpu/mtrr/generic.c
index 7d74f7b3c6ba..3b533cf37c74 100644
--- a/arch/x86/kernel/cpu/mtrr/generic.c
+++ b/arch/x86/kernel/cpu/mtrr/generic.c
@@ -102,59 +102,76 @@ static int check_type_overlap(u8 *prev, u8 *curr)
return 0;
}
-/*
- * Error/Semi-error returns:
- * 0xFF - when MTRR is not enabled
- * *repeat == 1 implies [start:end] spanned across MTRR range and type returned
- * corresponds only to [start:*partial_end].
- * Caller has to lookup again for [*partial_end:end].
+/**
+ * mtrr_type_lookup_fixed - look up memory type in MTRR fixed entries
+ *
+ * Return the MTRR fixed memory type of 'start'.
+ *
+ * MTRR fixed entries are divided into the following ways:
+ * 0x00000 - 0x7FFFF : This range is divided into eight 64KB sub-ranges
+ * 0x80000 - 0xBFFFF : This range is divided into sixteen 16KB sub-ranges
+ * 0xC0000 - 0xFFFFF : This range is divided into sixty-four 4KB sub-ranges
+ *
+ * Return Values:
+ * MTRR_TYPE_(type) - Matched memory type
+ * MTRR_TYPE_INVALID - Unmatched
+ */
+static u8 mtrr_type_lookup_fixed(u64 start, u64 end)
+{
+ int idx;
+
+ if (start >= 0x100000)
+ return MTRR_TYPE_INVALID;
+
+ /* 0x0 - 0x7FFFF */
+ if (start < 0x80000) {
+ idx = 0;
+ idx += (start >> 16);
+ return mtrr_state.fixed_ranges[idx];
+ /* 0x80000 - 0xBFFFF */
+ } else if (start < 0xC0000) {
+ idx = 1 * 8;
+ idx += ((start - 0x80000) >> 14);
+ return mtrr_state.fixed_ranges[idx];
+ }
+
+ /* 0xC0000 - 0xFFFFF */
+ idx = 3 * 8;
+ idx += ((start - 0xC0000) >> 12);
+ return mtrr_state.fixed_ranges[idx];
+}
+
+/**
+ * mtrr_type_lookup_variable - look up memory type in MTRR variable entries
+ *
+ * Return Value:
+ * MTRR_TYPE_(type) - Matched memory type or default memory type (unmatched)
+ *
+ * Output Arguments:
+ * repeat - Set to 1 when [start:end] spanned across MTRR range and type
+ * returned corresponds only to [start:*partial_end]. Caller has
+ * to lookup again for [*partial_end:end].
+ *
+ * uniform - Set to 1 when an MTRR covers the region uniformly, i.e. the
+ * region is fully covered by a single MTRR entry or the default
+ * type.
*/
-static u8 __mtrr_type_lookup(u64 start, u64 end, u64 *partial_end, int *repeat)
+static u8 mtrr_type_lookup_variable(u64 start, u64 end, u64 *partial_end,
+ int *repeat, u8 *uniform)
{
int i;
u64 base, mask;
u8 prev_match, curr_match;
*repeat = 0;
- if (!mtrr_state_set)
- return 0xFF;
-
- if (!mtrr_state.enabled)
- return 0xFF;
+ *uniform = 1;
- /* Make end inclusive end, instead of exclusive */
+ /* Make end inclusive instead of exclusive */
end--;
- /* Look in fixed ranges. Just return the type as per start */
- if (mtrr_state.have_fixed && (start < 0x100000)) {
- int idx;
-
- if (start < 0x80000) {
- idx = 0;
- idx += (start >> 16);
- return mtrr_state.fixed_ranges[idx];
- } else if (start < 0xC0000) {
- idx = 1 * 8;
- idx += ((start - 0x80000) >> 14);
- return mtrr_state.fixed_ranges[idx];
- } else if (start < 0x1000000) {
- idx = 3 * 8;
- idx += ((start - 0xC0000) >> 12);
- return mtrr_state.fixed_ranges[idx];
- }
- }
-
- /*
- * Look in variable ranges
- * Look of multiple ranges matching this address and pick type
- * as per MTRR precedence
- */
- if (!(mtrr_state.enabled & 2))
- return mtrr_state.def_type;
-
- prev_match = 0xFF;
+ prev_match = MTRR_TYPE_INVALID;
for (i = 0; i < num_var_ranges; ++i) {
- unsigned short start_state, end_state;
+ unsigned short start_state, end_state, inclusive;
if (!(mtrr_state.var_ranges[i].mask_lo & (1 << 11)))
continue;
@@ -166,20 +183,29 @@ static u8 __mtrr_type_lookup(u64 start, u64 end, u64 *partial_end, int *repeat)
start_state = ((start & mask) == (base & mask));
end_state = ((end & mask) == (base & mask));
+ inclusive = ((start < base) && (end > base));
- if (start_state != end_state) {
+ if ((start_state != end_state) || inclusive) {
/*
* We have start:end spanning across an MTRR.
- * We split the region into
- * either
- * (start:mtrr_end) (mtrr_end:end)
- * or
- * (start:mtrr_start) (mtrr_start:end)
+ * We split the region into either
+ *
+ * - start_state:1
+ * (start:mtrr_end)(mtrr_end:end)
+ * - end_state:1
+ * (start:mtrr_start)(mtrr_start:end)
+ * - inclusive:1
+ * (start:mtrr_start)(mtrr_start:mtrr_end)(mtrr_end:end)
+ *
* depending on kind of overlap.
- * Return the type for first region and a pointer to
- * the start of second region so that caller will
- * lookup again on the second region.
- * Note: This way we handle multiple overlaps as well.
+ *
+ * Return the type of the first region and a pointer
+ * to the start of next region so that caller will be
+ * advised to lookup again after having adjusted start
+ * and end.
+ *
+ * Note: This way we handle overlaps with multiple
+ * entries and the default type properly.
*/
if (start_state)
*partial_end = base + get_mtrr_size(mask);
@@ -193,59 +219,94 @@ static u8 __mtrr_type_lookup(u64 start, u64 end, u64 *partial_end, int *repeat)
end = *partial_end - 1; /* end is inclusive */
*repeat = 1;
+ *uniform = 0;
}
if ((start & mask) != (base & mask))
continue;
curr_match = mtrr_state.var_ranges[i].base_lo & 0xff;
- if (prev_match == 0xFF) {
+ if (prev_match == MTRR_TYPE_INVALID) {
prev_match = curr_match;
continue;
}
+ *uniform = 0;
if (check_type_overlap(&prev_match, &curr_match))
return curr_match;
}
- if (mtrr_tom2) {
- if (start >= (1ULL<<32) && (end < mtrr_tom2))
- return MTRR_TYPE_WRBACK;
- }
-
- if (prev_match != 0xFF)
+ if (prev_match != MTRR_TYPE_INVALID)
return prev_match;
return mtrr_state.def_type;
}
-/*
- * Returns the effective MTRR type for the region
- * Error return:
- * 0xFF - when MTRR is not enabled
+/**
+ * mtrr_type_lookup - look up memory type in MTRR
+ *
+ * Return Values:
+ * MTRR_TYPE_(type) - The effective MTRR type for the region
+ * MTRR_TYPE_INVALID - MTRR is disabled
+ *
+ * Output Argument:
+ * uniform - Set to 1 when an MTRR covers the region uniformly, i.e. the
+ * region is fully covered by a single MTRR entry or the default
+ * type.
*/
-u8 mtrr_type_lookup(u64 start, u64 end)
+u8 mtrr_type_lookup(u64 start, u64 end, u8 *uniform)
{
- u8 type, prev_type;
+ u8 type, prev_type, is_uniform = 1, dummy;
int repeat;
u64 partial_end;
- type = __mtrr_type_lookup(start, end, &partial_end, &repeat);
+ if (!mtrr_state_set)
+ return MTRR_TYPE_INVALID;
+
+ if (!(mtrr_state.enabled & MTRR_STATE_MTRR_ENABLED))
+ return MTRR_TYPE_INVALID;
+
+ /*
+ * Look up the fixed ranges first, which take priority over
+ * the variable ranges.
+ */
+ if ((start < 0x100000) &&
+ (mtrr_state.have_fixed) &&
+ (mtrr_state.enabled & MTRR_STATE_MTRR_FIXED_ENABLED)) {
+ is_uniform = 0;
+ type = mtrr_type_lookup_fixed(start, end);
+ goto out;
+ }
+
+ /*
+ * Look up the variable ranges. Look of multiple ranges matching
+ * this address and pick type as per MTRR precedence.
+ */
+ type = mtrr_type_lookup_variable(start, end, &partial_end,
+ &repeat, &is_uniform);
/*
* Common path is with repeat = 0.
* However, we can have cases where [start:end] spans across some
- * MTRR range. Do repeated lookups for that case here.
+ * MTRR ranges and/or the default type. Do repeated lookups for
+ * that case here.
*/
while (repeat) {
prev_type = type;
start = partial_end;
- type = __mtrr_type_lookup(start, end, &partial_end, &repeat);
+ is_uniform = 0;
+ type = mtrr_type_lookup_variable(start, end, &partial_end,
+ &repeat, &dummy);
if (check_type_overlap(&prev_type, &type))
- return type;
+ goto out;
}
+ if (mtrr_tom2 && (start >= (1ULL<<32)) && (end < mtrr_tom2))
+ type = MTRR_TYPE_WRBACK;
+
+out:
+ *uniform = is_uniform;
return type;
}
@@ -347,7 +408,9 @@ static void __init print_mtrr_state(void)
mtrr_attrib_to_str(mtrr_state.def_type));
if (mtrr_state.have_fixed) {
pr_debug("MTRR fixed ranges %sabled:\n",
- mtrr_state.enabled & 1 ? "en" : "dis");
+ ((mtrr_state.enabled & MTRR_STATE_MTRR_ENABLED) &&
+ (mtrr_state.enabled & MTRR_STATE_MTRR_FIXED_ENABLED)) ?
+ "en" : "dis");
print_fixed(0x00000, 0x10000, mtrr_state.fixed_ranges + 0);
for (i = 0; i < 2; ++i)
print_fixed(0x80000 + i * 0x20000, 0x04000,
@@ -360,7 +423,7 @@ static void __init print_mtrr_state(void)
print_fixed_last();
}
pr_debug("MTRR variable ranges %sabled:\n",
- mtrr_state.enabled & 2 ? "en" : "dis");
+ mtrr_state.enabled & MTRR_STATE_MTRR_ENABLED ? "en" : "dis");
high_width = (__ffs64(size_or_mask) - (32 - PAGE_SHIFT) + 3) / 4;
for (i = 0; i < num_var_ranges; ++i) {
@@ -382,7 +445,7 @@ static void __init print_mtrr_state(void)
}
/* Grab all of the MTRR state for this CPU into *state */
-void __init get_mtrr_state(void)
+bool __init get_mtrr_state(void)
{
struct mtrr_var_range *vrs;
unsigned long flags;
@@ -426,6 +489,8 @@ void __init get_mtrr_state(void)
post_set();
local_irq_restore(flags);
+
+ return !!(mtrr_state.enabled & MTRR_STATE_MTRR_ENABLED);
}
/* Some BIOS's are messed up and don't set all MTRRs the same! */
diff --git a/arch/x86/kernel/cpu/mtrr/main.c b/arch/x86/kernel/cpu/mtrr/main.c
index ea5f363a1948..e7ed0d8ebacb 100644
--- a/arch/x86/kernel/cpu/mtrr/main.c
+++ b/arch/x86/kernel/cpu/mtrr/main.c
@@ -59,6 +59,12 @@
#define MTRR_TO_PHYS_WC_OFFSET 1000
u32 num_var_ranges;
+static bool __mtrr_enabled;
+
+static bool mtrr_enabled(void)
+{
+ return __mtrr_enabled;
+}
unsigned int mtrr_usage_table[MTRR_MAX_VAR_RANGES];
static DEFINE_MUTEX(mtrr_mutex);
@@ -286,7 +292,7 @@ int mtrr_add_page(unsigned long base, unsigned long size,
int i, replace, error;
mtrr_type ltype;
- if (!mtrr_if)
+ if (!mtrr_enabled())
return -ENXIO;
error = mtrr_if->validate_add_page(base, size, type);
@@ -435,6 +441,8 @@ static int mtrr_check(unsigned long base, unsigned long size)
int mtrr_add(unsigned long base, unsigned long size, unsigned int type,
bool increment)
{
+ if (!mtrr_enabled())
+ return -ENODEV;
if (mtrr_check(base, size))
return -EINVAL;
return mtrr_add_page(base >> PAGE_SHIFT, size >> PAGE_SHIFT, type,
@@ -463,8 +471,8 @@ int mtrr_del_page(int reg, unsigned long base, unsigned long size)
unsigned long lbase, lsize;
int error = -EINVAL;
- if (!mtrr_if)
- return -ENXIO;
+ if (!mtrr_enabled())
+ return -ENODEV;
max = num_var_ranges;
/* No CPU hotplug when we change MTRR entries */
@@ -523,6 +531,8 @@ int mtrr_del_page(int reg, unsigned long base, unsigned long size)
*/
int mtrr_del(int reg, unsigned long base, unsigned long size)
{
+ if (!mtrr_enabled())
+ return -ENODEV;
if (mtrr_check(base, size))
return -EINVAL;
return mtrr_del_page(reg, base >> PAGE_SHIFT, size >> PAGE_SHIFT);
@@ -538,6 +548,9 @@ EXPORT_SYMBOL(mtrr_del);
* attempts to add a WC MTRR covering size bytes starting at base and
* logs an error if this fails.
*
+ * The called should provide a power of two size on an equivalent
+ * power of two boundary.
+ *
* Drivers must store the return value to pass to mtrr_del_wc_if_needed,
* but drivers should not try to interpret that return value.
*/
@@ -545,7 +558,7 @@ int arch_phys_wc_add(unsigned long base, unsigned long size)
{
int ret;
- if (pat_enabled)
+ if (pat_enabled() || !mtrr_enabled())
return 0; /* Success! (We don't need to do anything.) */
ret = mtrr_add(base, size, MTRR_TYPE_WRCOMB, true);
@@ -577,7 +590,7 @@ void arch_phys_wc_del(int handle)
EXPORT_SYMBOL(arch_phys_wc_del);
/*
- * phys_wc_to_mtrr_index - translates arch_phys_wc_add's return value
+ * arch_phys_wc_index - translates arch_phys_wc_add's return value
* @handle: Return value from arch_phys_wc_add
*
* This will turn the return value from arch_phys_wc_add into an mtrr
@@ -587,14 +600,14 @@ EXPORT_SYMBOL(arch_phys_wc_del);
* in printk line. Alas there is an illegitimate use in some ancient
* drm ioctls.
*/
-int phys_wc_to_mtrr_index(int handle)
+int arch_phys_wc_index(int handle)
{
if (handle < MTRR_TO_PHYS_WC_OFFSET)
return -1;
else
return handle - MTRR_TO_PHYS_WC_OFFSET;
}
-EXPORT_SYMBOL_GPL(phys_wc_to_mtrr_index);
+EXPORT_SYMBOL_GPL(arch_phys_wc_index);
/*
* HACK ALERT!
@@ -734,10 +747,12 @@ void __init mtrr_bp_init(void)
}
if (mtrr_if) {
+ __mtrr_enabled = true;
set_num_var_ranges();
init_table();
if (use_intel()) {
- get_mtrr_state();
+ /* BIOS may override */
+ __mtrr_enabled = get_mtrr_state();
if (mtrr_cleanup(phys_addr)) {
changed_by_mtrr_cleanup = 1;
@@ -745,10 +760,16 @@ void __init mtrr_bp_init(void)
}
}
}
+
+ if (!mtrr_enabled())
+ pr_info("MTRR: Disabled\n");
}
void mtrr_ap_init(void)
{
+ if (!mtrr_enabled())
+ return;
+
if (!use_intel() || mtrr_aps_delayed_init)
return;
/*
@@ -774,6 +795,9 @@ void mtrr_save_state(void)
{
int first_cpu;
+ if (!mtrr_enabled())
+ return;
+
get_online_cpus();
first_cpu = cpumask_first(cpu_online_mask);
smp_call_function_single(first_cpu, mtrr_save_fixed_ranges, NULL, 1);
@@ -782,6 +806,8 @@ void mtrr_save_state(void)
void set_mtrr_aps_delayed_init(void)
{
+ if (!mtrr_enabled())
+ return;
if (!use_intel())
return;
@@ -793,7 +819,7 @@ void set_mtrr_aps_delayed_init(void)
*/
void mtrr_aps_init(void)
{
- if (!use_intel())
+ if (!use_intel() || !mtrr_enabled())
return;
/*
@@ -810,7 +836,7 @@ void mtrr_aps_init(void)
void mtrr_bp_restore(void)
{
- if (!use_intel())
+ if (!use_intel() || !mtrr_enabled())
return;
mtrr_if->set_all();
@@ -818,7 +844,7 @@ void mtrr_bp_restore(void)
static int __init mtrr_init_finialize(void)
{
- if (!mtrr_if)
+ if (!mtrr_enabled())
return 0;
if (use_intel()) {
diff --git a/arch/x86/kernel/cpu/mtrr/mtrr.h b/arch/x86/kernel/cpu/mtrr/mtrr.h
index df5e41f31a27..951884dcc433 100644
--- a/arch/x86/kernel/cpu/mtrr/mtrr.h
+++ b/arch/x86/kernel/cpu/mtrr/mtrr.h
@@ -51,7 +51,7 @@ void set_mtrr_prepare_save(struct set_mtrr_context *ctxt);
void fill_mtrr_var_range(unsigned int index,
u32 base_lo, u32 base_hi, u32 mask_lo, u32 mask_hi);
-void get_mtrr_state(void);
+bool get_mtrr_state(void);
extern void set_mtrr_ops(const struct mtrr_ops *ops);
diff --git a/arch/x86/kernel/crash.c b/arch/x86/kernel/crash.c
index c76d3e37c6e1..e068d6683dba 100644
--- a/arch/x86/kernel/crash.c
+++ b/arch/x86/kernel/crash.c
@@ -22,6 +22,7 @@
#include <linux/elfcore.h>
#include <linux/module.h>
#include <linux/slab.h>
+#include <linux/vmalloc.h>
#include <asm/processor.h>
#include <asm/hardirq.h>
diff --git a/arch/x86/kernel/devicetree.c b/arch/x86/kernel/devicetree.c
index 6367a780cc8c..5ee771859b6f 100644
--- a/arch/x86/kernel/devicetree.c
+++ b/arch/x86/kernel/devicetree.c
@@ -4,7 +4,6 @@
#include <linux/bootmem.h>
#include <linux/export.h>
#include <linux/io.h>
-#include <linux/irqdomain.h>
#include <linux/interrupt.h>
#include <linux/list.h>
#include <linux/of.h>
@@ -17,6 +16,7 @@
#include <linux/of_pci.h>
#include <linux/initrd.h>
+#include <asm/irqdomain.h>
#include <asm/hpet.h>
#include <asm/apic.h>
#include <asm/pci_x86.h>
@@ -196,38 +196,31 @@ static struct of_ioapic_type of_ioapic_type[] =
},
};
-static int ioapic_xlate(struct irq_domain *domain,
- struct device_node *controller,
- const u32 *intspec, u32 intsize,
- irq_hw_number_t *out_hwirq, u32 *out_type)
+static int dt_irqdomain_alloc(struct irq_domain *domain, unsigned int virq,
+ unsigned int nr_irqs, void *arg)
{
+ struct of_phandle_args *irq_data = (void *)arg;
struct of_ioapic_type *it;
- u32 line, idx, gsi;
+ struct irq_alloc_info tmp;
- if (WARN_ON(intsize < 2))
+ if (WARN_ON(irq_data->args_count < 2))
return -EINVAL;
-
- line = intspec[0];
-
- if (intspec[1] >= ARRAY_SIZE(of_ioapic_type))
+ if (irq_data->args[1] >= ARRAY_SIZE(of_ioapic_type))
return -EINVAL;
- it = &of_ioapic_type[intspec[1]];
+ it = &of_ioapic_type[irq_data->args[1]];
+ ioapic_set_alloc_attr(&tmp, NUMA_NO_NODE, it->trigger, it->polarity);
+ tmp.ioapic_id = mpc_ioapic_id(mp_irqdomain_ioapic_idx(domain));
+ tmp.ioapic_pin = irq_data->args[0];
- idx = (u32)(long)domain->host_data;
- gsi = mp_pin_to_gsi(idx, line);
- if (mp_set_gsi_attr(gsi, it->trigger, it->polarity, cpu_to_node(0)))
- return -EBUSY;
-
- *out_hwirq = line;
- *out_type = it->out_type;
- return 0;
+ return mp_irqdomain_alloc(domain, virq, nr_irqs, &tmp);
}
-const struct irq_domain_ops ioapic_irq_domain_ops = {
- .map = mp_irqdomain_map,
- .unmap = mp_irqdomain_unmap,
- .xlate = ioapic_xlate,
+static const struct irq_domain_ops ioapic_irq_domain_ops = {
+ .alloc = dt_irqdomain_alloc,
+ .free = mp_irqdomain_free,
+ .activate = mp_irqdomain_activate,
+ .deactivate = mp_irqdomain_deactivate,
};
static void __init dtb_add_ioapic(struct device_node *dn)
diff --git a/arch/x86/kernel/early-quirks.c b/arch/x86/kernel/early-quirks.c
index fe9f0b79a18b..5cb9a4d6f623 100644
--- a/arch/x86/kernel/early-quirks.c
+++ b/arch/x86/kernel/early-quirks.c
@@ -627,8 +627,12 @@ static struct chipset early_qrk[] __initdata = {
{ PCI_VENDOR_ID_INTEL, PCI_ANY_ID, PCI_CLASS_DISPLAY_VGA, PCI_ANY_ID,
QFLAG_APPLY_ONCE, intel_graphics_stolen },
/*
- * HPET on current version of Baytrail platform has accuracy
- * problems, disable it for now:
+ * HPET on the current version of the Baytrail platform has accuracy
+ * problems: it will halt in deep idle state - so we disable it.
+ *
+ * More details can be found in section 18.10.1.3 of the datasheet:
+ *
+ * http://www.intel.com/content/dam/www/public/us/en/documents/datasheets/atom-z8000-datasheet-vol-1.pdf
*/
{ PCI_VENDOR_ID_INTEL, 0x0f00,
PCI_CLASS_BRIDGE_HOST, PCI_ANY_ID, 0, force_disable_hpet},
diff --git a/arch/x86/kernel/entry_32.S b/arch/x86/kernel/entry_32.S
deleted file mode 100644
index 1c309763e321..000000000000
--- a/arch/x86/kernel/entry_32.S
+++ /dev/null
@@ -1,1401 +0,0 @@
-/*
- *
- * Copyright (C) 1991, 1992 Linus Torvalds
- */
-
-/*
- * entry.S contains the system-call and fault low-level handling routines.
- * This also contains the timer-interrupt handler, as well as all interrupts
- * and faults that can result in a task-switch.
- *
- * NOTE: This code handles signal-recognition, which happens every time
- * after a timer-interrupt and after each system call.
- *
- * I changed all the .align's to 4 (16 byte alignment), as that's faster
- * on a 486.
- *
- * Stack layout in 'syscall_exit':
- * ptrace needs to have all regs on the stack.
- * if the order here is changed, it needs to be
- * updated in fork.c:copy_process, signal.c:do_signal,
- * ptrace.c and ptrace.h
- *
- * 0(%esp) - %ebx
- * 4(%esp) - %ecx
- * 8(%esp) - %edx
- * C(%esp) - %esi
- * 10(%esp) - %edi
- * 14(%esp) - %ebp
- * 18(%esp) - %eax
- * 1C(%esp) - %ds
- * 20(%esp) - %es
- * 24(%esp) - %fs
- * 28(%esp) - %gs saved iff !CONFIG_X86_32_LAZY_GS
- * 2C(%esp) - orig_eax
- * 30(%esp) - %eip
- * 34(%esp) - %cs
- * 38(%esp) - %eflags
- * 3C(%esp) - %oldesp
- * 40(%esp) - %oldss
- *
- * "current" is in register %ebx during any slow entries.
- */
-
-#include <linux/linkage.h>
-#include <linux/err.h>
-#include <asm/thread_info.h>
-#include <asm/irqflags.h>
-#include <asm/errno.h>
-#include <asm/segment.h>
-#include <asm/smp.h>
-#include <asm/page_types.h>
-#include <asm/percpu.h>
-#include <asm/dwarf2.h>
-#include <asm/processor-flags.h>
-#include <asm/ftrace.h>
-#include <asm/irq_vectors.h>
-#include <asm/cpufeature.h>
-#include <asm/alternative-asm.h>
-#include <asm/asm.h>
-#include <asm/smap.h>
-
-/* Avoid __ASSEMBLER__'ifying <linux/audit.h> just for this. */
-#include <linux/elf-em.h>
-#define AUDIT_ARCH_I386 (EM_386|__AUDIT_ARCH_LE)
-#define __AUDIT_ARCH_LE 0x40000000
-
-#ifndef CONFIG_AUDITSYSCALL
-#define sysenter_audit syscall_trace_entry
-#define sysexit_audit syscall_exit_work
-#endif
-
- .section .entry.text, "ax"
-
-/*
- * We use macros for low-level operations which need to be overridden
- * for paravirtualization. The following will never clobber any registers:
- * INTERRUPT_RETURN (aka. "iret")
- * GET_CR0_INTO_EAX (aka. "movl %cr0, %eax")
- * ENABLE_INTERRUPTS_SYSEXIT (aka "sti; sysexit").
- *
- * For DISABLE_INTERRUPTS/ENABLE_INTERRUPTS (aka "cli"/"sti"), you must
- * specify what registers can be overwritten (CLBR_NONE, CLBR_EAX/EDX/ECX/ANY).
- * Allowing a register to be clobbered can shrink the paravirt replacement
- * enough to patch inline, increasing performance.
- */
-
-#ifdef CONFIG_PREEMPT
-#define preempt_stop(clobbers) DISABLE_INTERRUPTS(clobbers); TRACE_IRQS_OFF
-#else
-#define preempt_stop(clobbers)
-#define resume_kernel restore_all
-#endif
-
-.macro TRACE_IRQS_IRET
-#ifdef CONFIG_TRACE_IRQFLAGS
- testl $X86_EFLAGS_IF,PT_EFLAGS(%esp) # interrupts off?
- jz 1f
- TRACE_IRQS_ON
-1:
-#endif
-.endm
-
-/*
- * User gs save/restore
- *
- * %gs is used for userland TLS and kernel only uses it for stack
- * canary which is required to be at %gs:20 by gcc. Read the comment
- * at the top of stackprotector.h for more info.
- *
- * Local labels 98 and 99 are used.
- */
-#ifdef CONFIG_X86_32_LAZY_GS
-
- /* unfortunately push/pop can't be no-op */
-.macro PUSH_GS
- pushl_cfi $0
-.endm
-.macro POP_GS pop=0
- addl $(4 + \pop), %esp
- CFI_ADJUST_CFA_OFFSET -(4 + \pop)
-.endm
-.macro POP_GS_EX
-.endm
-
- /* all the rest are no-op */
-.macro PTGS_TO_GS
-.endm
-.macro PTGS_TO_GS_EX
-.endm
-.macro GS_TO_REG reg
-.endm
-.macro REG_TO_PTGS reg
-.endm
-.macro SET_KERNEL_GS reg
-.endm
-
-#else /* CONFIG_X86_32_LAZY_GS */
-
-.macro PUSH_GS
- pushl_cfi %gs
- /*CFI_REL_OFFSET gs, 0*/
-.endm
-
-.macro POP_GS pop=0
-98: popl_cfi %gs
- /*CFI_RESTORE gs*/
- .if \pop <> 0
- add $\pop, %esp
- CFI_ADJUST_CFA_OFFSET -\pop
- .endif
-.endm
-.macro POP_GS_EX
-.pushsection .fixup, "ax"
-99: movl $0, (%esp)
- jmp 98b
-.popsection
- _ASM_EXTABLE(98b,99b)
-.endm
-
-.macro PTGS_TO_GS
-98: mov PT_GS(%esp), %gs
-.endm
-.macro PTGS_TO_GS_EX
-.pushsection .fixup, "ax"
-99: movl $0, PT_GS(%esp)
- jmp 98b
-.popsection
- _ASM_EXTABLE(98b,99b)
-.endm
-
-.macro GS_TO_REG reg
- movl %gs, \reg
- /*CFI_REGISTER gs, \reg*/
-.endm
-.macro REG_TO_PTGS reg
- movl \reg, PT_GS(%esp)
- /*CFI_REL_OFFSET gs, PT_GS*/
-.endm
-.macro SET_KERNEL_GS reg
- movl $(__KERNEL_STACK_CANARY), \reg
- movl \reg, %gs
-.endm
-
-#endif /* CONFIG_X86_32_LAZY_GS */
-
-.macro SAVE_ALL
- cld
- PUSH_GS
- pushl_cfi %fs
- /*CFI_REL_OFFSET fs, 0;*/
- pushl_cfi %es
- /*CFI_REL_OFFSET es, 0;*/
- pushl_cfi %ds
- /*CFI_REL_OFFSET ds, 0;*/
- pushl_cfi %eax
- CFI_REL_OFFSET eax, 0
- pushl_cfi %ebp
- CFI_REL_OFFSET ebp, 0
- pushl_cfi %edi
- CFI_REL_OFFSET edi, 0
- pushl_cfi %esi
- CFI_REL_OFFSET esi, 0
- pushl_cfi %edx
- CFI_REL_OFFSET edx, 0
- pushl_cfi %ecx
- CFI_REL_OFFSET ecx, 0
- pushl_cfi %ebx
- CFI_REL_OFFSET ebx, 0
- movl $(__USER_DS), %edx
- movl %edx, %ds
- movl %edx, %es
- movl $(__KERNEL_PERCPU), %edx
- movl %edx, %fs
- SET_KERNEL_GS %edx
-.endm
-
-.macro RESTORE_INT_REGS
- popl_cfi %ebx
- CFI_RESTORE ebx
- popl_cfi %ecx
- CFI_RESTORE ecx
- popl_cfi %edx
- CFI_RESTORE edx
- popl_cfi %esi
- CFI_RESTORE esi
- popl_cfi %edi
- CFI_RESTORE edi
- popl_cfi %ebp
- CFI_RESTORE ebp
- popl_cfi %eax
- CFI_RESTORE eax
-.endm
-
-.macro RESTORE_REGS pop=0
- RESTORE_INT_REGS
-1: popl_cfi %ds
- /*CFI_RESTORE ds;*/
-2: popl_cfi %es
- /*CFI_RESTORE es;*/
-3: popl_cfi %fs
- /*CFI_RESTORE fs;*/
- POP_GS \pop
-.pushsection .fixup, "ax"
-4: movl $0, (%esp)
- jmp 1b
-5: movl $0, (%esp)
- jmp 2b
-6: movl $0, (%esp)
- jmp 3b
-.popsection
- _ASM_EXTABLE(1b,4b)
- _ASM_EXTABLE(2b,5b)
- _ASM_EXTABLE(3b,6b)
- POP_GS_EX
-.endm
-
-.macro RING0_INT_FRAME
- CFI_STARTPROC simple
- CFI_SIGNAL_FRAME
- CFI_DEF_CFA esp, 3*4
- /*CFI_OFFSET cs, -2*4;*/
- CFI_OFFSET eip, -3*4
-.endm
-
-.macro RING0_EC_FRAME
- CFI_STARTPROC simple
- CFI_SIGNAL_FRAME
- CFI_DEF_CFA esp, 4*4
- /*CFI_OFFSET cs, -2*4;*/
- CFI_OFFSET eip, -3*4
-.endm
-
-.macro RING0_PTREGS_FRAME
- CFI_STARTPROC simple
- CFI_SIGNAL_FRAME
- CFI_DEF_CFA esp, PT_OLDESP-PT_EBX
- /*CFI_OFFSET cs, PT_CS-PT_OLDESP;*/
- CFI_OFFSET eip, PT_EIP-PT_OLDESP
- /*CFI_OFFSET es, PT_ES-PT_OLDESP;*/
- /*CFI_OFFSET ds, PT_DS-PT_OLDESP;*/
- CFI_OFFSET eax, PT_EAX-PT_OLDESP
- CFI_OFFSET ebp, PT_EBP-PT_OLDESP
- CFI_OFFSET edi, PT_EDI-PT_OLDESP
- CFI_OFFSET esi, PT_ESI-PT_OLDESP
- CFI_OFFSET edx, PT_EDX-PT_OLDESP
- CFI_OFFSET ecx, PT_ECX-PT_OLDESP
- CFI_OFFSET ebx, PT_EBX-PT_OLDESP
-.endm
-
-ENTRY(ret_from_fork)
- CFI_STARTPROC
- pushl_cfi %eax
- call schedule_tail
- GET_THREAD_INFO(%ebp)
- popl_cfi %eax
- pushl_cfi $0x0202 # Reset kernel eflags
- popfl_cfi
- jmp syscall_exit
- CFI_ENDPROC
-END(ret_from_fork)
-
-ENTRY(ret_from_kernel_thread)
- CFI_STARTPROC
- pushl_cfi %eax
- call schedule_tail
- GET_THREAD_INFO(%ebp)
- popl_cfi %eax
- pushl_cfi $0x0202 # Reset kernel eflags
- popfl_cfi
- movl PT_EBP(%esp),%eax
- call *PT_EBX(%esp)
- movl $0,PT_EAX(%esp)
- jmp syscall_exit
- CFI_ENDPROC
-ENDPROC(ret_from_kernel_thread)
-
-/*
- * Return to user mode is not as complex as all this looks,
- * but we want the default path for a system call return to
- * go as quickly as possible which is why some of this is
- * less clear than it otherwise should be.
- */
-
- # userspace resumption stub bypassing syscall exit tracing
- ALIGN
- RING0_PTREGS_FRAME
-ret_from_exception:
- preempt_stop(CLBR_ANY)
-ret_from_intr:
- GET_THREAD_INFO(%ebp)
-#ifdef CONFIG_VM86
- movl PT_EFLAGS(%esp), %eax # mix EFLAGS and CS
- movb PT_CS(%esp), %al
- andl $(X86_EFLAGS_VM | SEGMENT_RPL_MASK), %eax
-#else
- /*
- * We can be coming here from child spawned by kernel_thread().
- */
- movl PT_CS(%esp), %eax
- andl $SEGMENT_RPL_MASK, %eax
-#endif
- cmpl $USER_RPL, %eax
- jb resume_kernel # not returning to v8086 or userspace
-
-ENTRY(resume_userspace)
- LOCKDEP_SYS_EXIT
- DISABLE_INTERRUPTS(CLBR_ANY) # make sure we don't miss an interrupt
- # setting need_resched or sigpending
- # between sampling and the iret
- TRACE_IRQS_OFF
- movl TI_flags(%ebp), %ecx
- andl $_TIF_WORK_MASK, %ecx # is there any work to be done on
- # int/exception return?
- jne work_pending
- jmp restore_all
-END(ret_from_exception)
-
-#ifdef CONFIG_PREEMPT
-ENTRY(resume_kernel)
- DISABLE_INTERRUPTS(CLBR_ANY)
-need_resched:
- cmpl $0,PER_CPU_VAR(__preempt_count)
- jnz restore_all
- testl $X86_EFLAGS_IF,PT_EFLAGS(%esp) # interrupts off (exception path) ?
- jz restore_all
- call preempt_schedule_irq
- jmp need_resched
-END(resume_kernel)
-#endif
- CFI_ENDPROC
-
-/* SYSENTER_RETURN points to after the "sysenter" instruction in
- the vsyscall page. See vsyscall-sysentry.S, which defines the symbol. */
-
- # sysenter call handler stub
-ENTRY(ia32_sysenter_target)
- CFI_STARTPROC simple
- CFI_SIGNAL_FRAME
- CFI_DEF_CFA esp, 0
- CFI_REGISTER esp, ebp
- movl TSS_sysenter_sp0(%esp),%esp
-sysenter_past_esp:
- /*
- * Interrupts are disabled here, but we can't trace it until
- * enough kernel state to call TRACE_IRQS_OFF can be called - but
- * we immediately enable interrupts at that point anyway.
- */
- pushl_cfi $__USER_DS
- /*CFI_REL_OFFSET ss, 0*/
- pushl_cfi %ebp
- CFI_REL_OFFSET esp, 0
- pushfl_cfi
- orl $X86_EFLAGS_IF, (%esp)
- pushl_cfi $__USER_CS
- /*CFI_REL_OFFSET cs, 0*/
- /*
- * Push current_thread_info()->sysenter_return to the stack.
- * A tiny bit of offset fixup is necessary: TI_sysenter_return
- * is relative to thread_info, which is at the bottom of the
- * kernel stack page. 4*4 means the 4 words pushed above;
- * TOP_OF_KERNEL_STACK_PADDING takes us to the top of the stack;
- * and THREAD_SIZE takes us to the bottom.
- */
- pushl_cfi ((TI_sysenter_return) - THREAD_SIZE + TOP_OF_KERNEL_STACK_PADDING + 4*4)(%esp)
- CFI_REL_OFFSET eip, 0
-
- pushl_cfi %eax
- SAVE_ALL
- ENABLE_INTERRUPTS(CLBR_NONE)
-
-/*
- * Load the potential sixth argument from user stack.
- * Careful about security.
- */
- cmpl $__PAGE_OFFSET-3,%ebp
- jae syscall_fault
- ASM_STAC
-1: movl (%ebp),%ebp
- ASM_CLAC
- movl %ebp,PT_EBP(%esp)
- _ASM_EXTABLE(1b,syscall_fault)
-
- GET_THREAD_INFO(%ebp)
-
- testl $_TIF_WORK_SYSCALL_ENTRY,TI_flags(%ebp)
- jnz sysenter_audit
-sysenter_do_call:
- cmpl $(NR_syscalls), %eax
- jae sysenter_badsys
- call *sys_call_table(,%eax,4)
-sysenter_after_call:
- movl %eax,PT_EAX(%esp)
- LOCKDEP_SYS_EXIT
- DISABLE_INTERRUPTS(CLBR_ANY)
- TRACE_IRQS_OFF
- movl TI_flags(%ebp), %ecx
- testl $_TIF_ALLWORK_MASK, %ecx
- jnz sysexit_audit
-sysenter_exit:
-/* if something modifies registers it must also disable sysexit */
- movl PT_EIP(%esp), %edx
- movl PT_OLDESP(%esp), %ecx
- xorl %ebp,%ebp
- TRACE_IRQS_ON
-1: mov PT_FS(%esp), %fs
- PTGS_TO_GS
- ENABLE_INTERRUPTS_SYSEXIT
-
-#ifdef CONFIG_AUDITSYSCALL
-sysenter_audit:
- testl $(_TIF_WORK_SYSCALL_ENTRY & ~_TIF_SYSCALL_AUDIT),TI_flags(%ebp)
- jnz syscall_trace_entry
- /* movl PT_EAX(%esp), %eax already set, syscall number: 1st arg to audit */
- movl PT_EBX(%esp), %edx /* ebx/a0: 2nd arg to audit */
- /* movl PT_ECX(%esp), %ecx already set, a1: 3nd arg to audit */
- pushl_cfi PT_ESI(%esp) /* a3: 5th arg */
- pushl_cfi PT_EDX+4(%esp) /* a2: 4th arg */
- call __audit_syscall_entry
- popl_cfi %ecx /* get that remapped edx off the stack */
- popl_cfi %ecx /* get that remapped esi off the stack */
- movl PT_EAX(%esp),%eax /* reload syscall number */
- jmp sysenter_do_call
-
-sysexit_audit:
- testl $(_TIF_ALLWORK_MASK & ~_TIF_SYSCALL_AUDIT), %ecx
- jnz syscall_exit_work
- TRACE_IRQS_ON
- ENABLE_INTERRUPTS(CLBR_ANY)
- movl %eax,%edx /* second arg, syscall return value */
- cmpl $-MAX_ERRNO,%eax /* is it an error ? */
- setbe %al /* 1 if so, 0 if not */
- movzbl %al,%eax /* zero-extend that */
- call __audit_syscall_exit
- DISABLE_INTERRUPTS(CLBR_ANY)
- TRACE_IRQS_OFF
- movl TI_flags(%ebp), %ecx
- testl $(_TIF_ALLWORK_MASK & ~_TIF_SYSCALL_AUDIT), %ecx
- jnz syscall_exit_work
- movl PT_EAX(%esp),%eax /* reload syscall return value */
- jmp sysenter_exit
-#endif
-
- CFI_ENDPROC
-.pushsection .fixup,"ax"
-2: movl $0,PT_FS(%esp)
- jmp 1b
-.popsection
- _ASM_EXTABLE(1b,2b)
- PTGS_TO_GS_EX
-ENDPROC(ia32_sysenter_target)
-
- # system call handler stub
-ENTRY(system_call)
- RING0_INT_FRAME # can't unwind into user space anyway
- ASM_CLAC
- pushl_cfi %eax # save orig_eax
- SAVE_ALL
- GET_THREAD_INFO(%ebp)
- # system call tracing in operation / emulation
- testl $_TIF_WORK_SYSCALL_ENTRY,TI_flags(%ebp)
- jnz syscall_trace_entry
- cmpl $(NR_syscalls), %eax
- jae syscall_badsys
-syscall_call:
- call *sys_call_table(,%eax,4)
-syscall_after_call:
- movl %eax,PT_EAX(%esp) # store the return value
-syscall_exit:
- LOCKDEP_SYS_EXIT
- DISABLE_INTERRUPTS(CLBR_ANY) # make sure we don't miss an interrupt
- # setting need_resched or sigpending
- # between sampling and the iret
- TRACE_IRQS_OFF
- movl TI_flags(%ebp), %ecx
- testl $_TIF_ALLWORK_MASK, %ecx # current->work
- jnz syscall_exit_work
-
-restore_all:
- TRACE_IRQS_IRET
-restore_all_notrace:
-#ifdef CONFIG_X86_ESPFIX32
- movl PT_EFLAGS(%esp), %eax # mix EFLAGS, SS and CS
- # Warning: PT_OLDSS(%esp) contains the wrong/random values if we
- # are returning to the kernel.
- # See comments in process.c:copy_thread() for details.
- movb PT_OLDSS(%esp), %ah
- movb PT_CS(%esp), %al
- andl $(X86_EFLAGS_VM | (SEGMENT_TI_MASK << 8) | SEGMENT_RPL_MASK), %eax
- cmpl $((SEGMENT_LDT << 8) | USER_RPL), %eax
- CFI_REMEMBER_STATE
- je ldt_ss # returning to user-space with LDT SS
-#endif
-restore_nocheck:
- RESTORE_REGS 4 # skip orig_eax/error_code
-irq_return:
- INTERRUPT_RETURN
-.section .fixup,"ax"
-ENTRY(iret_exc)
- pushl $0 # no error code
- pushl $do_iret_error
- jmp error_code
-.previous
- _ASM_EXTABLE(irq_return,iret_exc)
-
-#ifdef CONFIG_X86_ESPFIX32
- CFI_RESTORE_STATE
-ldt_ss:
-#ifdef CONFIG_PARAVIRT
- /*
- * The kernel can't run on a non-flat stack if paravirt mode
- * is active. Rather than try to fixup the high bits of
- * ESP, bypass this code entirely. This may break DOSemu
- * and/or Wine support in a paravirt VM, although the option
- * is still available to implement the setting of the high
- * 16-bits in the INTERRUPT_RETURN paravirt-op.
- */
- cmpl $0, pv_info+PARAVIRT_enabled
- jne restore_nocheck
-#endif
-
-/*
- * Setup and switch to ESPFIX stack
- *
- * We're returning to userspace with a 16 bit stack. The CPU will not
- * restore the high word of ESP for us on executing iret... This is an
- * "official" bug of all the x86-compatible CPUs, which we can work
- * around to make dosemu and wine happy. We do this by preloading the
- * high word of ESP with the high word of the userspace ESP while
- * compensating for the offset by changing to the ESPFIX segment with
- * a base address that matches for the difference.
- */
-#define GDT_ESPFIX_SS PER_CPU_VAR(gdt_page) + (GDT_ENTRY_ESPFIX_SS * 8)
- mov %esp, %edx /* load kernel esp */
- mov PT_OLDESP(%esp), %eax /* load userspace esp */
- mov %dx, %ax /* eax: new kernel esp */
- sub %eax, %edx /* offset (low word is 0) */
- shr $16, %edx
- mov %dl, GDT_ESPFIX_SS + 4 /* bits 16..23 */
- mov %dh, GDT_ESPFIX_SS + 7 /* bits 24..31 */
- pushl_cfi $__ESPFIX_SS
- pushl_cfi %eax /* new kernel esp */
- /* Disable interrupts, but do not irqtrace this section: we
- * will soon execute iret and the tracer was already set to
- * the irqstate after the iret */
- DISABLE_INTERRUPTS(CLBR_EAX)
- lss (%esp), %esp /* switch to espfix segment */
- CFI_ADJUST_CFA_OFFSET -8
- jmp restore_nocheck
-#endif
- CFI_ENDPROC
-ENDPROC(system_call)
-
- # perform work that needs to be done immediately before resumption
- ALIGN
- RING0_PTREGS_FRAME # can't unwind into user space anyway
-work_pending:
- testb $_TIF_NEED_RESCHED, %cl
- jz work_notifysig
-work_resched:
- call schedule
- LOCKDEP_SYS_EXIT
- DISABLE_INTERRUPTS(CLBR_ANY) # make sure we don't miss an interrupt
- # setting need_resched or sigpending
- # between sampling and the iret
- TRACE_IRQS_OFF
- movl TI_flags(%ebp), %ecx
- andl $_TIF_WORK_MASK, %ecx # is there any work to be done other
- # than syscall tracing?
- jz restore_all
- testb $_TIF_NEED_RESCHED, %cl
- jnz work_resched
-
-work_notifysig: # deal with pending signals and
- # notify-resume requests
-#ifdef CONFIG_VM86
- testl $X86_EFLAGS_VM, PT_EFLAGS(%esp)
- movl %esp, %eax
- jnz work_notifysig_v86 # returning to kernel-space or
- # vm86-space
-1:
-#else
- movl %esp, %eax
-#endif
- TRACE_IRQS_ON
- ENABLE_INTERRUPTS(CLBR_NONE)
- movb PT_CS(%esp), %bl
- andb $SEGMENT_RPL_MASK, %bl
- cmpb $USER_RPL, %bl
- jb resume_kernel
- xorl %edx, %edx
- call do_notify_resume
- jmp resume_userspace
-
-#ifdef CONFIG_VM86
- ALIGN
-work_notifysig_v86:
- pushl_cfi %ecx # save ti_flags for do_notify_resume
- call save_v86_state # %eax contains pt_regs pointer
- popl_cfi %ecx
- movl %eax, %esp
- jmp 1b
-#endif
-END(work_pending)
-
- # perform syscall exit tracing
- ALIGN
-syscall_trace_entry:
- movl $-ENOSYS,PT_EAX(%esp)
- movl %esp, %eax
- call syscall_trace_enter
- /* What it returned is what we'll actually use. */
- cmpl $(NR_syscalls), %eax
- jnae syscall_call
- jmp syscall_exit
-END(syscall_trace_entry)
-
- # perform syscall exit tracing
- ALIGN
-syscall_exit_work:
- testl $_TIF_WORK_SYSCALL_EXIT, %ecx
- jz work_pending
- TRACE_IRQS_ON
- ENABLE_INTERRUPTS(CLBR_ANY) # could let syscall_trace_leave() call
- # schedule() instead
- movl %esp, %eax
- call syscall_trace_leave
- jmp resume_userspace
-END(syscall_exit_work)
- CFI_ENDPROC
-
- RING0_INT_FRAME # can't unwind into user space anyway
-syscall_fault:
- ASM_CLAC
- GET_THREAD_INFO(%ebp)
- movl $-EFAULT,PT_EAX(%esp)
- jmp resume_userspace
-END(syscall_fault)
-
-syscall_badsys:
- movl $-ENOSYS,%eax
- jmp syscall_after_call
-END(syscall_badsys)
-
-sysenter_badsys:
- movl $-ENOSYS,%eax
- jmp sysenter_after_call
-END(sysenter_badsys)
- CFI_ENDPROC
-
-.macro FIXUP_ESPFIX_STACK
-/*
- * Switch back for ESPFIX stack to the normal zerobased stack
- *
- * We can't call C functions using the ESPFIX stack. This code reads
- * the high word of the segment base from the GDT and swiches to the
- * normal stack and adjusts ESP with the matching offset.
- */
-#ifdef CONFIG_X86_ESPFIX32
- /* fixup the stack */
- mov GDT_ESPFIX_SS + 4, %al /* bits 16..23 */
- mov GDT_ESPFIX_SS + 7, %ah /* bits 24..31 */
- shl $16, %eax
- addl %esp, %eax /* the adjusted stack pointer */
- pushl_cfi $__KERNEL_DS
- pushl_cfi %eax
- lss (%esp), %esp /* switch to the normal stack segment */
- CFI_ADJUST_CFA_OFFSET -8
-#endif
-.endm
-.macro UNWIND_ESPFIX_STACK
-#ifdef CONFIG_X86_ESPFIX32
- movl %ss, %eax
- /* see if on espfix stack */
- cmpw $__ESPFIX_SS, %ax
- jne 27f
- movl $__KERNEL_DS, %eax
- movl %eax, %ds
- movl %eax, %es
- /* switch to normal stack */
- FIXUP_ESPFIX_STACK
-27:
-#endif
-.endm
-
-/*
- * Build the entry stubs with some assembler magic.
- * We pack 1 stub into every 8-byte block.
- */
- .align 8
-ENTRY(irq_entries_start)
- RING0_INT_FRAME
- vector=FIRST_EXTERNAL_VECTOR
- .rept (FIRST_SYSTEM_VECTOR - FIRST_EXTERNAL_VECTOR)
- pushl_cfi $(~vector+0x80) /* Note: always in signed byte range */
- vector=vector+1
- jmp common_interrupt
- CFI_ADJUST_CFA_OFFSET -4
- .align 8
- .endr
-END(irq_entries_start)
-
-/*
- * the CPU automatically disables interrupts when executing an IRQ vector,
- * so IRQ-flags tracing has to follow that:
- */
- .p2align CONFIG_X86_L1_CACHE_SHIFT
-common_interrupt:
- ASM_CLAC
- addl $-0x80,(%esp) /* Adjust vector into the [-256,-1] range */
- SAVE_ALL
- TRACE_IRQS_OFF
- movl %esp,%eax
- call do_IRQ
- jmp ret_from_intr
-ENDPROC(common_interrupt)
- CFI_ENDPROC
-
-#define BUILD_INTERRUPT3(name, nr, fn) \
-ENTRY(name) \
- RING0_INT_FRAME; \
- ASM_CLAC; \
- pushl_cfi $~(nr); \
- SAVE_ALL; \
- TRACE_IRQS_OFF \
- movl %esp,%eax; \
- call fn; \
- jmp ret_from_intr; \
- CFI_ENDPROC; \
-ENDPROC(name)
-
-
-#ifdef CONFIG_TRACING
-#define TRACE_BUILD_INTERRUPT(name, nr) \
- BUILD_INTERRUPT3(trace_##name, nr, smp_trace_##name)
-#else
-#define TRACE_BUILD_INTERRUPT(name, nr)
-#endif
-
-#define BUILD_INTERRUPT(name, nr) \
- BUILD_INTERRUPT3(name, nr, smp_##name); \
- TRACE_BUILD_INTERRUPT(name, nr)
-
-/* The include is where all of the SMP etc. interrupts come from */
-#include <asm/entry_arch.h>
-
-ENTRY(coprocessor_error)
- RING0_INT_FRAME
- ASM_CLAC
- pushl_cfi $0
- pushl_cfi $do_coprocessor_error
- jmp error_code
- CFI_ENDPROC
-END(coprocessor_error)
-
-ENTRY(simd_coprocessor_error)
- RING0_INT_FRAME
- ASM_CLAC
- pushl_cfi $0
-#ifdef CONFIG_X86_INVD_BUG
- /* AMD 486 bug: invd from userspace calls exception 19 instead of #GP */
- ALTERNATIVE "pushl_cfi $do_general_protection", \
- "pushl $do_simd_coprocessor_error", \
- X86_FEATURE_XMM
-#else
- pushl_cfi $do_simd_coprocessor_error
-#endif
- jmp error_code
- CFI_ENDPROC
-END(simd_coprocessor_error)
-
-ENTRY(device_not_available)
- RING0_INT_FRAME
- ASM_CLAC
- pushl_cfi $-1 # mark this as an int
- pushl_cfi $do_device_not_available
- jmp error_code
- CFI_ENDPROC
-END(device_not_available)
-
-#ifdef CONFIG_PARAVIRT
-ENTRY(native_iret)
- iret
- _ASM_EXTABLE(native_iret, iret_exc)
-END(native_iret)
-
-ENTRY(native_irq_enable_sysexit)
- sti
- sysexit
-END(native_irq_enable_sysexit)
-#endif
-
-ENTRY(overflow)
- RING0_INT_FRAME
- ASM_CLAC
- pushl_cfi $0
- pushl_cfi $do_overflow
- jmp error_code
- CFI_ENDPROC
-END(overflow)
-
-ENTRY(bounds)
- RING0_INT_FRAME
- ASM_CLAC
- pushl_cfi $0
- pushl_cfi $do_bounds
- jmp error_code
- CFI_ENDPROC
-END(bounds)
-
-ENTRY(invalid_op)
- RING0_INT_FRAME
- ASM_CLAC
- pushl_cfi $0
- pushl_cfi $do_invalid_op
- jmp error_code
- CFI_ENDPROC
-END(invalid_op)
-
-ENTRY(coprocessor_segment_overrun)
- RING0_INT_FRAME
- ASM_CLAC
- pushl_cfi $0
- pushl_cfi $do_coprocessor_segment_overrun
- jmp error_code
- CFI_ENDPROC
-END(coprocessor_segment_overrun)
-
-ENTRY(invalid_TSS)
- RING0_EC_FRAME
- ASM_CLAC
- pushl_cfi $do_invalid_TSS
- jmp error_code
- CFI_ENDPROC
-END(invalid_TSS)
-
-ENTRY(segment_not_present)
- RING0_EC_FRAME
- ASM_CLAC
- pushl_cfi $do_segment_not_present
- jmp error_code
- CFI_ENDPROC
-END(segment_not_present)
-
-ENTRY(stack_segment)
- RING0_EC_FRAME
- ASM_CLAC
- pushl_cfi $do_stack_segment
- jmp error_code
- CFI_ENDPROC
-END(stack_segment)
-
-ENTRY(alignment_check)
- RING0_EC_FRAME
- ASM_CLAC
- pushl_cfi $do_alignment_check
- jmp error_code
- CFI_ENDPROC
-END(alignment_check)
-
-ENTRY(divide_error)
- RING0_INT_FRAME
- ASM_CLAC
- pushl_cfi $0 # no error code
- pushl_cfi $do_divide_error
- jmp error_code
- CFI_ENDPROC
-END(divide_error)
-
-#ifdef CONFIG_X86_MCE
-ENTRY(machine_check)
- RING0_INT_FRAME
- ASM_CLAC
- pushl_cfi $0
- pushl_cfi machine_check_vector
- jmp error_code
- CFI_ENDPROC
-END(machine_check)
-#endif
-
-ENTRY(spurious_interrupt_bug)
- RING0_INT_FRAME
- ASM_CLAC
- pushl_cfi $0
- pushl_cfi $do_spurious_interrupt_bug
- jmp error_code
- CFI_ENDPROC
-END(spurious_interrupt_bug)
-
-#ifdef CONFIG_XEN
-/* Xen doesn't set %esp to be precisely what the normal sysenter
- entrypoint expects, so fix it up before using the normal path. */
-ENTRY(xen_sysenter_target)
- RING0_INT_FRAME
- addl $5*4, %esp /* remove xen-provided frame */
- CFI_ADJUST_CFA_OFFSET -5*4
- jmp sysenter_past_esp
- CFI_ENDPROC
-
-ENTRY(xen_hypervisor_callback)
- CFI_STARTPROC
- pushl_cfi $-1 /* orig_ax = -1 => not a system call */
- SAVE_ALL
- TRACE_IRQS_OFF
-
- /* Check to see if we got the event in the critical
- region in xen_iret_direct, after we've reenabled
- events and checked for pending events. This simulates
- iret instruction's behaviour where it delivers a
- pending interrupt when enabling interrupts. */
- movl PT_EIP(%esp),%eax
- cmpl $xen_iret_start_crit,%eax
- jb 1f
- cmpl $xen_iret_end_crit,%eax
- jae 1f
-
- jmp xen_iret_crit_fixup
-
-ENTRY(xen_do_upcall)
-1: mov %esp, %eax
- call xen_evtchn_do_upcall
-#ifndef CONFIG_PREEMPT
- call xen_maybe_preempt_hcall
-#endif
- jmp ret_from_intr
- CFI_ENDPROC
-ENDPROC(xen_hypervisor_callback)
-
-# Hypervisor uses this for application faults while it executes.
-# We get here for two reasons:
-# 1. Fault while reloading DS, ES, FS or GS
-# 2. Fault while executing IRET
-# Category 1 we fix up by reattempting the load, and zeroing the segment
-# register if the load fails.
-# Category 2 we fix up by jumping to do_iret_error. We cannot use the
-# normal Linux return path in this case because if we use the IRET hypercall
-# to pop the stack frame we end up in an infinite loop of failsafe callbacks.
-# We distinguish between categories by maintaining a status value in EAX.
-ENTRY(xen_failsafe_callback)
- CFI_STARTPROC
- pushl_cfi %eax
- movl $1,%eax
-1: mov 4(%esp),%ds
-2: mov 8(%esp),%es
-3: mov 12(%esp),%fs
-4: mov 16(%esp),%gs
- /* EAX == 0 => Category 1 (Bad segment)
- EAX != 0 => Category 2 (Bad IRET) */
- testl %eax,%eax
- popl_cfi %eax
- lea 16(%esp),%esp
- CFI_ADJUST_CFA_OFFSET -16
- jz 5f
- jmp iret_exc
-5: pushl_cfi $-1 /* orig_ax = -1 => not a system call */
- SAVE_ALL
- jmp ret_from_exception
- CFI_ENDPROC
-
-.section .fixup,"ax"
-6: xorl %eax,%eax
- movl %eax,4(%esp)
- jmp 1b
-7: xorl %eax,%eax
- movl %eax,8(%esp)
- jmp 2b
-8: xorl %eax,%eax
- movl %eax,12(%esp)
- jmp 3b
-9: xorl %eax,%eax
- movl %eax,16(%esp)
- jmp 4b
-.previous
- _ASM_EXTABLE(1b,6b)
- _ASM_EXTABLE(2b,7b)
- _ASM_EXTABLE(3b,8b)
- _ASM_EXTABLE(4b,9b)
-ENDPROC(xen_failsafe_callback)
-
-BUILD_INTERRUPT3(xen_hvm_callback_vector, HYPERVISOR_CALLBACK_VECTOR,
- xen_evtchn_do_upcall)
-
-#endif /* CONFIG_XEN */
-
-#if IS_ENABLED(CONFIG_HYPERV)
-
-BUILD_INTERRUPT3(hyperv_callback_vector, HYPERVISOR_CALLBACK_VECTOR,
- hyperv_vector_handler)
-
-#endif /* CONFIG_HYPERV */
-
-#ifdef CONFIG_FUNCTION_TRACER
-#ifdef CONFIG_DYNAMIC_FTRACE
-
-ENTRY(mcount)
- ret
-END(mcount)
-
-ENTRY(ftrace_caller)
- pushl %eax
- pushl %ecx
- pushl %edx
- pushl $0 /* Pass NULL as regs pointer */
- movl 4*4(%esp), %eax
- movl 0x4(%ebp), %edx
- movl function_trace_op, %ecx
- subl $MCOUNT_INSN_SIZE, %eax
-
-.globl ftrace_call
-ftrace_call:
- call ftrace_stub
-
- addl $4,%esp /* skip NULL pointer */
- popl %edx
- popl %ecx
- popl %eax
-ftrace_ret:
-#ifdef CONFIG_FUNCTION_GRAPH_TRACER
-.globl ftrace_graph_call
-ftrace_graph_call:
- jmp ftrace_stub
-#endif
-
-.globl ftrace_stub
-ftrace_stub:
- ret
-END(ftrace_caller)
-
-ENTRY(ftrace_regs_caller)
- pushf /* push flags before compare (in cs location) */
-
- /*
- * i386 does not save SS and ESP when coming from kernel.
- * Instead, to get sp, &regs->sp is used (see ptrace.h).
- * Unfortunately, that means eflags must be at the same location
- * as the current return ip is. We move the return ip into the
- * ip location, and move flags into the return ip location.
- */
- pushl 4(%esp) /* save return ip into ip slot */
-
- pushl $0 /* Load 0 into orig_ax */
- pushl %gs
- pushl %fs
- pushl %es
- pushl %ds
- pushl %eax
- pushl %ebp
- pushl %edi
- pushl %esi
- pushl %edx
- pushl %ecx
- pushl %ebx
-
- movl 13*4(%esp), %eax /* Get the saved flags */
- movl %eax, 14*4(%esp) /* Move saved flags into regs->flags location */
- /* clobbering return ip */
- movl $__KERNEL_CS,13*4(%esp)
-
- movl 12*4(%esp), %eax /* Load ip (1st parameter) */
- subl $MCOUNT_INSN_SIZE, %eax /* Adjust ip */
- movl 0x4(%ebp), %edx /* Load parent ip (2nd parameter) */
- movl function_trace_op, %ecx /* Save ftrace_pos in 3rd parameter */
- pushl %esp /* Save pt_regs as 4th parameter */
-
-GLOBAL(ftrace_regs_call)
- call ftrace_stub
-
- addl $4, %esp /* Skip pt_regs */
- movl 14*4(%esp), %eax /* Move flags back into cs */
- movl %eax, 13*4(%esp) /* Needed to keep addl from modifying flags */
- movl 12*4(%esp), %eax /* Get return ip from regs->ip */
- movl %eax, 14*4(%esp) /* Put return ip back for ret */
-
- popl %ebx
- popl %ecx
- popl %edx
- popl %esi
- popl %edi
- popl %ebp
- popl %eax
- popl %ds
- popl %es
- popl %fs
- popl %gs
- addl $8, %esp /* Skip orig_ax and ip */
- popf /* Pop flags at end (no addl to corrupt flags) */
- jmp ftrace_ret
-
- popf
- jmp ftrace_stub
-#else /* ! CONFIG_DYNAMIC_FTRACE */
-
-ENTRY(mcount)
- cmpl $__PAGE_OFFSET, %esp
- jb ftrace_stub /* Paging not enabled yet? */
-
- cmpl $ftrace_stub, ftrace_trace_function
- jnz trace
-#ifdef CONFIG_FUNCTION_GRAPH_TRACER
- cmpl $ftrace_stub, ftrace_graph_return
- jnz ftrace_graph_caller
-
- cmpl $ftrace_graph_entry_stub, ftrace_graph_entry
- jnz ftrace_graph_caller
-#endif
-.globl ftrace_stub
-ftrace_stub:
- ret
-
- /* taken from glibc */
-trace:
- pushl %eax
- pushl %ecx
- pushl %edx
- movl 0xc(%esp), %eax
- movl 0x4(%ebp), %edx
- subl $MCOUNT_INSN_SIZE, %eax
-
- call *ftrace_trace_function
-
- popl %edx
- popl %ecx
- popl %eax
- jmp ftrace_stub
-END(mcount)
-#endif /* CONFIG_DYNAMIC_FTRACE */
-#endif /* CONFIG_FUNCTION_TRACER */
-
-#ifdef CONFIG_FUNCTION_GRAPH_TRACER
-ENTRY(ftrace_graph_caller)
- pushl %eax
- pushl %ecx
- pushl %edx
- movl 0xc(%esp), %eax
- lea 0x4(%ebp), %edx
- movl (%ebp), %ecx
- subl $MCOUNT_INSN_SIZE, %eax
- call prepare_ftrace_return
- popl %edx
- popl %ecx
- popl %eax
- ret
-END(ftrace_graph_caller)
-
-.globl return_to_handler
-return_to_handler:
- pushl %eax
- pushl %edx
- movl %ebp, %eax
- call ftrace_return_to_handler
- movl %eax, %ecx
- popl %edx
- popl %eax
- jmp *%ecx
-#endif
-
-#ifdef CONFIG_TRACING
-ENTRY(trace_page_fault)
- RING0_EC_FRAME
- ASM_CLAC
- pushl_cfi $trace_do_page_fault
- jmp error_code
- CFI_ENDPROC
-END(trace_page_fault)
-#endif
-
-ENTRY(page_fault)
- RING0_EC_FRAME
- ASM_CLAC
- pushl_cfi $do_page_fault
- ALIGN
-error_code:
- /* the function address is in %gs's slot on the stack */
- pushl_cfi %fs
- /*CFI_REL_OFFSET fs, 0*/
- pushl_cfi %es
- /*CFI_REL_OFFSET es, 0*/
- pushl_cfi %ds
- /*CFI_REL_OFFSET ds, 0*/
- pushl_cfi_reg eax
- pushl_cfi_reg ebp
- pushl_cfi_reg edi
- pushl_cfi_reg esi
- pushl_cfi_reg edx
- pushl_cfi_reg ecx
- pushl_cfi_reg ebx
- cld
- movl $(__KERNEL_PERCPU), %ecx
- movl %ecx, %fs
- UNWIND_ESPFIX_STACK
- GS_TO_REG %ecx
- movl PT_GS(%esp), %edi # get the function address
- movl PT_ORIG_EAX(%esp), %edx # get the error code
- movl $-1, PT_ORIG_EAX(%esp) # no syscall to restart
- REG_TO_PTGS %ecx
- SET_KERNEL_GS %ecx
- movl $(__USER_DS), %ecx
- movl %ecx, %ds
- movl %ecx, %es
- TRACE_IRQS_OFF
- movl %esp,%eax # pt_regs pointer
- call *%edi
- jmp ret_from_exception
- CFI_ENDPROC
-END(page_fault)
-
-/*
- * Debug traps and NMI can happen at the one SYSENTER instruction
- * that sets up the real kernel stack. Check here, since we can't
- * allow the wrong stack to be used.
- *
- * "TSS_sysenter_sp0+12" is because the NMI/debug handler will have
- * already pushed 3 words if it hits on the sysenter instruction:
- * eflags, cs and eip.
- *
- * We just load the right stack, and push the three (known) values
- * by hand onto the new stack - while updating the return eip past
- * the instruction that would have done it for sysenter.
- */
-.macro FIX_STACK offset ok label
- cmpw $__KERNEL_CS, 4(%esp)
- jne \ok
-\label:
- movl TSS_sysenter_sp0 + \offset(%esp), %esp
- CFI_DEF_CFA esp, 0
- CFI_UNDEFINED eip
- pushfl_cfi
- pushl_cfi $__KERNEL_CS
- pushl_cfi $sysenter_past_esp
- CFI_REL_OFFSET eip, 0
-.endm
-
-ENTRY(debug)
- RING0_INT_FRAME
- ASM_CLAC
- cmpl $ia32_sysenter_target,(%esp)
- jne debug_stack_correct
- FIX_STACK 12, debug_stack_correct, debug_esp_fix_insn
-debug_stack_correct:
- pushl_cfi $-1 # mark this as an int
- SAVE_ALL
- TRACE_IRQS_OFF
- xorl %edx,%edx # error code 0
- movl %esp,%eax # pt_regs pointer
- call do_debug
- jmp ret_from_exception
- CFI_ENDPROC
-END(debug)
-
-/*
- * NMI is doubly nasty. It can happen _while_ we're handling
- * a debug fault, and the debug fault hasn't yet been able to
- * clear up the stack. So we first check whether we got an
- * NMI on the sysenter entry path, but after that we need to
- * check whether we got an NMI on the debug path where the debug
- * fault happened on the sysenter path.
- */
-ENTRY(nmi)
- RING0_INT_FRAME
- ASM_CLAC
-#ifdef CONFIG_X86_ESPFIX32
- pushl_cfi %eax
- movl %ss, %eax
- cmpw $__ESPFIX_SS, %ax
- popl_cfi %eax
- je nmi_espfix_stack
-#endif
- cmpl $ia32_sysenter_target,(%esp)
- je nmi_stack_fixup
- pushl_cfi %eax
- movl %esp,%eax
- /* Do not access memory above the end of our stack page,
- * it might not exist.
- */
- andl $(THREAD_SIZE-1),%eax
- cmpl $(THREAD_SIZE-20),%eax
- popl_cfi %eax
- jae nmi_stack_correct
- cmpl $ia32_sysenter_target,12(%esp)
- je nmi_debug_stack_check
-nmi_stack_correct:
- /* We have a RING0_INT_FRAME here */
- pushl_cfi %eax
- SAVE_ALL
- xorl %edx,%edx # zero error code
- movl %esp,%eax # pt_regs pointer
- call do_nmi
- jmp restore_all_notrace
- CFI_ENDPROC
-
-nmi_stack_fixup:
- RING0_INT_FRAME
- FIX_STACK 12, nmi_stack_correct, 1
- jmp nmi_stack_correct
-
-nmi_debug_stack_check:
- /* We have a RING0_INT_FRAME here */
- cmpw $__KERNEL_CS,16(%esp)
- jne nmi_stack_correct
- cmpl $debug,(%esp)
- jb nmi_stack_correct
- cmpl $debug_esp_fix_insn,(%esp)
- ja nmi_stack_correct
- FIX_STACK 24, nmi_stack_correct, 1
- jmp nmi_stack_correct
-
-#ifdef CONFIG_X86_ESPFIX32
-nmi_espfix_stack:
- /* We have a RING0_INT_FRAME here.
- *
- * create the pointer to lss back
- */
- pushl_cfi %ss
- pushl_cfi %esp
- addl $4, (%esp)
- /* copy the iret frame of 12 bytes */
- .rept 3
- pushl_cfi 16(%esp)
- .endr
- pushl_cfi %eax
- SAVE_ALL
- FIXUP_ESPFIX_STACK # %eax == %esp
- xorl %edx,%edx # zero error code
- call do_nmi
- RESTORE_REGS
- lss 12+4(%esp), %esp # back to espfix stack
- CFI_ADJUST_CFA_OFFSET -24
- jmp irq_return
-#endif
- CFI_ENDPROC
-END(nmi)
-
-ENTRY(int3)
- RING0_INT_FRAME
- ASM_CLAC
- pushl_cfi $-1 # mark this as an int
- SAVE_ALL
- TRACE_IRQS_OFF
- xorl %edx,%edx # zero error code
- movl %esp,%eax # pt_regs pointer
- call do_int3
- jmp ret_from_exception
- CFI_ENDPROC
-END(int3)
-
-ENTRY(general_protection)
- RING0_EC_FRAME
- pushl_cfi $do_general_protection
- jmp error_code
- CFI_ENDPROC
-END(general_protection)
-
-#ifdef CONFIG_KVM_GUEST
-ENTRY(async_page_fault)
- RING0_EC_FRAME
- ASM_CLAC
- pushl_cfi $do_async_page_fault
- jmp error_code
- CFI_ENDPROC
-END(async_page_fault)
-#endif
-
diff --git a/arch/x86/kernel/entry_64.S b/arch/x86/kernel/entry_64.S
deleted file mode 100644
index 02c2eff7478d..000000000000
--- a/arch/x86/kernel/entry_64.S
+++ /dev/null
@@ -1,1653 +0,0 @@
-/*
- * linux/arch/x86_64/entry.S
- *
- * Copyright (C) 1991, 1992 Linus Torvalds
- * Copyright (C) 2000, 2001, 2002 Andi Kleen SuSE Labs
- * Copyright (C) 2000 Pavel Machek <pavel@suse.cz>
- */
-
-/*
- * entry.S contains the system-call and fault low-level handling routines.
- *
- * Some of this is documented in Documentation/x86/entry_64.txt
- *
- * NOTE: This code handles signal-recognition, which happens every time
- * after an interrupt and after each system call.
- *
- * A note on terminology:
- * - iret frame: Architecture defined interrupt frame from SS to RIP
- * at the top of the kernel process stack.
- *
- * Some macro usage:
- * - CFI macros are used to generate dwarf2 unwind information for better
- * backtraces. They don't change any code.
- * - ENTRY/END Define functions in the symbol table.
- * - TRACE_IRQ_* - Trace hard interrupt state for lock debugging.
- * - idtentry - Define exception entry points.
- */
-
-#include <linux/linkage.h>
-#include <asm/segment.h>
-#include <asm/cache.h>
-#include <asm/errno.h>
-#include <asm/dwarf2.h>
-#include <asm/calling.h>
-#include <asm/asm-offsets.h>
-#include <asm/msr.h>
-#include <asm/unistd.h>
-#include <asm/thread_info.h>
-#include <asm/hw_irq.h>
-#include <asm/page_types.h>
-#include <asm/irqflags.h>
-#include <asm/paravirt.h>
-#include <asm/percpu.h>
-#include <asm/asm.h>
-#include <asm/context_tracking.h>
-#include <asm/smap.h>
-#include <asm/pgtable_types.h>
-#include <linux/err.h>
-
-/* Avoid __ASSEMBLER__'ifying <linux/audit.h> just for this. */
-#include <linux/elf-em.h>
-#define AUDIT_ARCH_X86_64 (EM_X86_64|__AUDIT_ARCH_64BIT|__AUDIT_ARCH_LE)
-#define __AUDIT_ARCH_64BIT 0x80000000
-#define __AUDIT_ARCH_LE 0x40000000
-
- .code64
- .section .entry.text, "ax"
-
-
-#ifdef CONFIG_PARAVIRT
-ENTRY(native_usergs_sysret64)
- swapgs
- sysretq
-ENDPROC(native_usergs_sysret64)
-#endif /* CONFIG_PARAVIRT */
-
-
-.macro TRACE_IRQS_IRETQ
-#ifdef CONFIG_TRACE_IRQFLAGS
- bt $9,EFLAGS(%rsp) /* interrupts off? */
- jnc 1f
- TRACE_IRQS_ON
-1:
-#endif
-.endm
-
-/*
- * When dynamic function tracer is enabled it will add a breakpoint
- * to all locations that it is about to modify, sync CPUs, update
- * all the code, sync CPUs, then remove the breakpoints. In this time
- * if lockdep is enabled, it might jump back into the debug handler
- * outside the updating of the IST protection. (TRACE_IRQS_ON/OFF).
- *
- * We need to change the IDT table before calling TRACE_IRQS_ON/OFF to
- * make sure the stack pointer does not get reset back to the top
- * of the debug stack, and instead just reuses the current stack.
- */
-#if defined(CONFIG_DYNAMIC_FTRACE) && defined(CONFIG_TRACE_IRQFLAGS)
-
-.macro TRACE_IRQS_OFF_DEBUG
- call debug_stack_set_zero
- TRACE_IRQS_OFF
- call debug_stack_reset
-.endm
-
-.macro TRACE_IRQS_ON_DEBUG
- call debug_stack_set_zero
- TRACE_IRQS_ON
- call debug_stack_reset
-.endm
-
-.macro TRACE_IRQS_IRETQ_DEBUG
- bt $9,EFLAGS(%rsp) /* interrupts off? */
- jnc 1f
- TRACE_IRQS_ON_DEBUG
-1:
-.endm
-
-#else
-# define TRACE_IRQS_OFF_DEBUG TRACE_IRQS_OFF
-# define TRACE_IRQS_ON_DEBUG TRACE_IRQS_ON
-# define TRACE_IRQS_IRETQ_DEBUG TRACE_IRQS_IRETQ
-#endif
-
-/*
- * empty frame
- */
- .macro EMPTY_FRAME start=1 offset=0
- .if \start
- CFI_STARTPROC simple
- CFI_SIGNAL_FRAME
- CFI_DEF_CFA rsp,8+\offset
- .else
- CFI_DEF_CFA_OFFSET 8+\offset
- .endif
- .endm
-
-/*
- * initial frame state for interrupts (and exceptions without error code)
- */
- .macro INTR_FRAME start=1 offset=0
- EMPTY_FRAME \start, 5*8+\offset
- /*CFI_REL_OFFSET ss, 4*8+\offset*/
- CFI_REL_OFFSET rsp, 3*8+\offset
- /*CFI_REL_OFFSET rflags, 2*8+\offset*/
- /*CFI_REL_OFFSET cs, 1*8+\offset*/
- CFI_REL_OFFSET rip, 0*8+\offset
- .endm
-
-/*
- * initial frame state for exceptions with error code (and interrupts
- * with vector already pushed)
- */
- .macro XCPT_FRAME start=1 offset=0
- INTR_FRAME \start, 1*8+\offset
- .endm
-
-/*
- * frame that enables passing a complete pt_regs to a C function.
- */
- .macro DEFAULT_FRAME start=1 offset=0
- XCPT_FRAME \start, ORIG_RAX+\offset
- CFI_REL_OFFSET rdi, RDI+\offset
- CFI_REL_OFFSET rsi, RSI+\offset
- CFI_REL_OFFSET rdx, RDX+\offset
- CFI_REL_OFFSET rcx, RCX+\offset
- CFI_REL_OFFSET rax, RAX+\offset
- CFI_REL_OFFSET r8, R8+\offset
- CFI_REL_OFFSET r9, R9+\offset
- CFI_REL_OFFSET r10, R10+\offset
- CFI_REL_OFFSET r11, R11+\offset
- CFI_REL_OFFSET rbx, RBX+\offset
- CFI_REL_OFFSET rbp, RBP+\offset
- CFI_REL_OFFSET r12, R12+\offset
- CFI_REL_OFFSET r13, R13+\offset
- CFI_REL_OFFSET r14, R14+\offset
- CFI_REL_OFFSET r15, R15+\offset
- .endm
-
-/*
- * 64bit SYSCALL instruction entry. Up to 6 arguments in registers.
- *
- * 64bit SYSCALL saves rip to rcx, clears rflags.RF, then saves rflags to r11,
- * then loads new ss, cs, and rip from previously programmed MSRs.
- * rflags gets masked by a value from another MSR (so CLD and CLAC
- * are not needed). SYSCALL does not save anything on the stack
- * and does not change rsp.
- *
- * Registers on entry:
- * rax system call number
- * rcx return address
- * r11 saved rflags (note: r11 is callee-clobbered register in C ABI)
- * rdi arg0
- * rsi arg1
- * rdx arg2
- * r10 arg3 (needs to be moved to rcx to conform to C ABI)
- * r8 arg4
- * r9 arg5
- * (note: r12-r15,rbp,rbx are callee-preserved in C ABI)
- *
- * Only called from user space.
- *
- * When user can change pt_regs->foo always force IRET. That is because
- * it deals with uncanonical addresses better. SYSRET has trouble
- * with them due to bugs in both AMD and Intel CPUs.
- */
-
-ENTRY(system_call)
- CFI_STARTPROC simple
- CFI_SIGNAL_FRAME
- CFI_DEF_CFA rsp,0
- CFI_REGISTER rip,rcx
- /*CFI_REGISTER rflags,r11*/
-
- /*
- * Interrupts are off on entry.
- * We do not frame this tiny irq-off block with TRACE_IRQS_OFF/ON,
- * it is too small to ever cause noticeable irq latency.
- */
- SWAPGS_UNSAFE_STACK
- /*
- * A hypervisor implementation might want to use a label
- * after the swapgs, so that it can do the swapgs
- * for the guest and jump here on syscall.
- */
-GLOBAL(system_call_after_swapgs)
-
- movq %rsp,PER_CPU_VAR(rsp_scratch)
- movq PER_CPU_VAR(kernel_stack),%rsp
-
- /* Construct struct pt_regs on stack */
- pushq_cfi $__USER_DS /* pt_regs->ss */
- pushq_cfi PER_CPU_VAR(rsp_scratch) /* pt_regs->sp */
- /*
- * Re-enable interrupts.
- * We use 'rsp_scratch' as a scratch space, hence irq-off block above
- * must execute atomically in the face of possible interrupt-driven
- * task preemption. We must enable interrupts only after we're done
- * with using rsp_scratch:
- */
- ENABLE_INTERRUPTS(CLBR_NONE)
- pushq_cfi %r11 /* pt_regs->flags */
- pushq_cfi $__USER_CS /* pt_regs->cs */
- pushq_cfi %rcx /* pt_regs->ip */
- CFI_REL_OFFSET rip,0
- pushq_cfi_reg rax /* pt_regs->orig_ax */
- pushq_cfi_reg rdi /* pt_regs->di */
- pushq_cfi_reg rsi /* pt_regs->si */
- pushq_cfi_reg rdx /* pt_regs->dx */
- pushq_cfi_reg rcx /* pt_regs->cx */
- pushq_cfi $-ENOSYS /* pt_regs->ax */
- pushq_cfi_reg r8 /* pt_regs->r8 */
- pushq_cfi_reg r9 /* pt_regs->r9 */
- pushq_cfi_reg r10 /* pt_regs->r10 */
- pushq_cfi_reg r11 /* pt_regs->r11 */
- sub $(6*8),%rsp /* pt_regs->bp,bx,r12-15 not saved */
- CFI_ADJUST_CFA_OFFSET 6*8
-
- testl $_TIF_WORK_SYSCALL_ENTRY, ASM_THREAD_INFO(TI_flags, %rsp, SIZEOF_PTREGS)
- jnz tracesys
-system_call_fastpath:
-#if __SYSCALL_MASK == ~0
- cmpq $__NR_syscall_max,%rax
-#else
- andl $__SYSCALL_MASK,%eax
- cmpl $__NR_syscall_max,%eax
-#endif
- ja 1f /* return -ENOSYS (already in pt_regs->ax) */
- movq %r10,%rcx
- call *sys_call_table(,%rax,8)
- movq %rax,RAX(%rsp)
-1:
-/*
- * Syscall return path ending with SYSRET (fast path).
- * Has incompletely filled pt_regs.
- */
- LOCKDEP_SYS_EXIT
- /*
- * We do not frame this tiny irq-off block with TRACE_IRQS_OFF/ON,
- * it is too small to ever cause noticeable irq latency.
- */
- DISABLE_INTERRUPTS(CLBR_NONE)
-
- /*
- * We must check ti flags with interrupts (or at least preemption)
- * off because we must *never* return to userspace without
- * processing exit work that is enqueued if we're preempted here.
- * In particular, returning to userspace with any of the one-shot
- * flags (TIF_NOTIFY_RESUME, TIF_USER_RETURN_NOTIFY, etc) set is
- * very bad.
- */
- testl $_TIF_ALLWORK_MASK, ASM_THREAD_INFO(TI_flags, %rsp, SIZEOF_PTREGS)
- jnz int_ret_from_sys_call_irqs_off /* Go to the slow path */
-
- CFI_REMEMBER_STATE
-
- RESTORE_C_REGS_EXCEPT_RCX_R11
- movq RIP(%rsp),%rcx
- CFI_REGISTER rip,rcx
- movq EFLAGS(%rsp),%r11
- /*CFI_REGISTER rflags,r11*/
- movq RSP(%rsp),%rsp
- /*
- * 64bit SYSRET restores rip from rcx,
- * rflags from r11 (but RF and VM bits are forced to 0),
- * cs and ss are loaded from MSRs.
- * Restoration of rflags re-enables interrupts.
- *
- * NB: On AMD CPUs with the X86_BUG_SYSRET_SS_ATTRS bug, the ss
- * descriptor is not reinitialized. This means that we should
- * avoid SYSRET with SS == NULL, which could happen if we schedule,
- * exit the kernel, and re-enter using an interrupt vector. (All
- * interrupt entries on x86_64 set SS to NULL.) We prevent that
- * from happening by reloading SS in __switch_to. (Actually
- * detecting the failure in 64-bit userspace is tricky but can be
- * done.)
- */
- USERGS_SYSRET64
-
- CFI_RESTORE_STATE
-
- /* Do syscall entry tracing */
-tracesys:
- movq %rsp, %rdi
- movl $AUDIT_ARCH_X86_64, %esi
- call syscall_trace_enter_phase1
- test %rax, %rax
- jnz tracesys_phase2 /* if needed, run the slow path */
- RESTORE_C_REGS_EXCEPT_RAX /* else restore clobbered regs */
- movq ORIG_RAX(%rsp), %rax
- jmp system_call_fastpath /* and return to the fast path */
-
-tracesys_phase2:
- SAVE_EXTRA_REGS
- movq %rsp, %rdi
- movl $AUDIT_ARCH_X86_64, %esi
- movq %rax,%rdx
- call syscall_trace_enter_phase2
-
- /*
- * Reload registers from stack in case ptrace changed them.
- * We don't reload %rax because syscall_trace_entry_phase2() returned
- * the value it wants us to use in the table lookup.
- */
- RESTORE_C_REGS_EXCEPT_RAX
- RESTORE_EXTRA_REGS
-#if __SYSCALL_MASK == ~0
- cmpq $__NR_syscall_max,%rax
-#else
- andl $__SYSCALL_MASK,%eax
- cmpl $__NR_syscall_max,%eax
-#endif
- ja 1f /* return -ENOSYS (already in pt_regs->ax) */
- movq %r10,%rcx /* fixup for C */
- call *sys_call_table(,%rax,8)
- movq %rax,RAX(%rsp)
-1:
- /* Use IRET because user could have changed pt_regs->foo */
-
-/*
- * Syscall return path ending with IRET.
- * Has correct iret frame.
- */
-GLOBAL(int_ret_from_sys_call)
- DISABLE_INTERRUPTS(CLBR_NONE)
-int_ret_from_sys_call_irqs_off: /* jumps come here from the irqs-off SYSRET path */
- TRACE_IRQS_OFF
- movl $_TIF_ALLWORK_MASK,%edi
- /* edi: mask to check */
-GLOBAL(int_with_check)
- LOCKDEP_SYS_EXIT_IRQ
- GET_THREAD_INFO(%rcx)
- movl TI_flags(%rcx),%edx
- andl %edi,%edx
- jnz int_careful
- andl $~TS_COMPAT,TI_status(%rcx)
- jmp syscall_return
-
- /* Either reschedule or signal or syscall exit tracking needed. */
- /* First do a reschedule test. */
- /* edx: work, edi: workmask */
-int_careful:
- bt $TIF_NEED_RESCHED,%edx
- jnc int_very_careful
- TRACE_IRQS_ON
- ENABLE_INTERRUPTS(CLBR_NONE)
- pushq_cfi %rdi
- SCHEDULE_USER
- popq_cfi %rdi
- DISABLE_INTERRUPTS(CLBR_NONE)
- TRACE_IRQS_OFF
- jmp int_with_check
-
- /* handle signals and tracing -- both require a full pt_regs */
-int_very_careful:
- TRACE_IRQS_ON
- ENABLE_INTERRUPTS(CLBR_NONE)
- SAVE_EXTRA_REGS
- /* Check for syscall exit trace */
- testl $_TIF_WORK_SYSCALL_EXIT,%edx
- jz int_signal
- pushq_cfi %rdi
- leaq 8(%rsp),%rdi # &ptregs -> arg1
- call syscall_trace_leave
- popq_cfi %rdi
- andl $~(_TIF_WORK_SYSCALL_EXIT|_TIF_SYSCALL_EMU),%edi
- jmp int_restore_rest
-
-int_signal:
- testl $_TIF_DO_NOTIFY_MASK,%edx
- jz 1f
- movq %rsp,%rdi # &ptregs -> arg1
- xorl %esi,%esi # oldset -> arg2
- call do_notify_resume
-1: movl $_TIF_WORK_MASK,%edi
-int_restore_rest:
- RESTORE_EXTRA_REGS
- DISABLE_INTERRUPTS(CLBR_NONE)
- TRACE_IRQS_OFF
- jmp int_with_check
-
-syscall_return:
- /* The IRETQ could re-enable interrupts: */
- DISABLE_INTERRUPTS(CLBR_ANY)
- TRACE_IRQS_IRETQ
-
- /*
- * Try to use SYSRET instead of IRET if we're returning to
- * a completely clean 64-bit userspace context.
- */
- movq RCX(%rsp),%rcx
- cmpq %rcx,RIP(%rsp) /* RCX == RIP */
- jne opportunistic_sysret_failed
-
- /*
- * On Intel CPUs, SYSRET with non-canonical RCX/RIP will #GP
- * in kernel space. This essentially lets the user take over
- * the kernel, since userspace controls RSP. It's not worth
- * testing for canonicalness exactly -- this check detects any
- * of the 17 high bits set, which is true for non-canonical
- * or kernel addresses. (This will pessimize vsyscall=native.
- * Big deal.)
- *
- * If virtual addresses ever become wider, this will need
- * to be updated to remain correct on both old and new CPUs.
- */
- .ifne __VIRTUAL_MASK_SHIFT - 47
- .error "virtual address width changed -- SYSRET checks need update"
- .endif
- shr $__VIRTUAL_MASK_SHIFT, %rcx
- jnz opportunistic_sysret_failed
-
- cmpq $__USER_CS,CS(%rsp) /* CS must match SYSRET */
- jne opportunistic_sysret_failed
-
- movq R11(%rsp),%r11
- cmpq %r11,EFLAGS(%rsp) /* R11 == RFLAGS */
- jne opportunistic_sysret_failed
-
- /*
- * SYSRET can't restore RF. SYSRET can restore TF, but unlike IRET,
- * restoring TF results in a trap from userspace immediately after
- * SYSRET. This would cause an infinite loop whenever #DB happens
- * with register state that satisfies the opportunistic SYSRET
- * conditions. For example, single-stepping this user code:
- *
- * movq $stuck_here,%rcx
- * pushfq
- * popq %r11
- * stuck_here:
- *
- * would never get past 'stuck_here'.
- */
- testq $(X86_EFLAGS_RF|X86_EFLAGS_TF), %r11
- jnz opportunistic_sysret_failed
-
- /* nothing to check for RSP */
-
- cmpq $__USER_DS,SS(%rsp) /* SS must match SYSRET */
- jne opportunistic_sysret_failed
-
- /*
- * We win! This label is here just for ease of understanding
- * perf profiles. Nothing jumps here.
- */
-syscall_return_via_sysret:
- CFI_REMEMBER_STATE
- /* r11 is already restored (see code above) */
- RESTORE_C_REGS_EXCEPT_R11
- movq RSP(%rsp),%rsp
- USERGS_SYSRET64
- CFI_RESTORE_STATE
-
-opportunistic_sysret_failed:
- SWAPGS
- jmp restore_c_regs_and_iret
- CFI_ENDPROC
-END(system_call)
-
-
- .macro FORK_LIKE func
-ENTRY(stub_\func)
- CFI_STARTPROC
- DEFAULT_FRAME 0, 8 /* offset 8: return address */
- SAVE_EXTRA_REGS 8
- jmp sys_\func
- CFI_ENDPROC
-END(stub_\func)
- .endm
-
- FORK_LIKE clone
- FORK_LIKE fork
- FORK_LIKE vfork
-
-ENTRY(stub_execve)
- CFI_STARTPROC
- DEFAULT_FRAME 0, 8
- call sys_execve
-return_from_execve:
- testl %eax, %eax
- jz 1f
- /* exec failed, can use fast SYSRET code path in this case */
- ret
-1:
- /* must use IRET code path (pt_regs->cs may have changed) */
- addq $8, %rsp
- CFI_ADJUST_CFA_OFFSET -8
- ZERO_EXTRA_REGS
- movq %rax,RAX(%rsp)
- jmp int_ret_from_sys_call
- CFI_ENDPROC
-END(stub_execve)
-/*
- * Remaining execve stubs are only 7 bytes long.
- * ENTRY() often aligns to 16 bytes, which in this case has no benefits.
- */
- .align 8
-GLOBAL(stub_execveat)
- CFI_STARTPROC
- DEFAULT_FRAME 0, 8
- call sys_execveat
- jmp return_from_execve
- CFI_ENDPROC
-END(stub_execveat)
-
-#ifdef CONFIG_X86_X32_ABI
- .align 8
-GLOBAL(stub_x32_execve)
- CFI_STARTPROC
- DEFAULT_FRAME 0, 8
- call compat_sys_execve
- jmp return_from_execve
- CFI_ENDPROC
-END(stub_x32_execve)
- .align 8
-GLOBAL(stub_x32_execveat)
- CFI_STARTPROC
- DEFAULT_FRAME 0, 8
- call compat_sys_execveat
- jmp return_from_execve
- CFI_ENDPROC
-END(stub_x32_execveat)
-#endif
-
-#ifdef CONFIG_IA32_EMULATION
- .align 8
-GLOBAL(stub32_execve)
- CFI_STARTPROC
- call compat_sys_execve
- jmp return_from_execve
- CFI_ENDPROC
-END(stub32_execve)
- .align 8
-GLOBAL(stub32_execveat)
- CFI_STARTPROC
- call compat_sys_execveat
- jmp return_from_execve
- CFI_ENDPROC
-END(stub32_execveat)
-#endif
-
-/*
- * sigreturn is special because it needs to restore all registers on return.
- * This cannot be done with SYSRET, so use the IRET return path instead.
- */
-ENTRY(stub_rt_sigreturn)
- CFI_STARTPROC
- DEFAULT_FRAME 0, 8
- /*
- * SAVE_EXTRA_REGS result is not normally needed:
- * sigreturn overwrites all pt_regs->GPREGS.
- * But sigreturn can fail (!), and there is no easy way to detect that.
- * To make sure RESTORE_EXTRA_REGS doesn't restore garbage on error,
- * we SAVE_EXTRA_REGS here.
- */
- SAVE_EXTRA_REGS 8
- call sys_rt_sigreturn
-return_from_stub:
- addq $8, %rsp
- CFI_ADJUST_CFA_OFFSET -8
- RESTORE_EXTRA_REGS
- movq %rax,RAX(%rsp)
- jmp int_ret_from_sys_call
- CFI_ENDPROC
-END(stub_rt_sigreturn)
-
-#ifdef CONFIG_X86_X32_ABI
-ENTRY(stub_x32_rt_sigreturn)
- CFI_STARTPROC
- DEFAULT_FRAME 0, 8
- SAVE_EXTRA_REGS 8
- call sys32_x32_rt_sigreturn
- jmp return_from_stub
- CFI_ENDPROC
-END(stub_x32_rt_sigreturn)
-#endif
-
-/*
- * A newly forked process directly context switches into this address.
- *
- * rdi: prev task we switched from
- */
-ENTRY(ret_from_fork)
- DEFAULT_FRAME
-
- LOCK ; btr $TIF_FORK,TI_flags(%r8)
-
- pushq_cfi $0x0002
- popfq_cfi # reset kernel eflags
-
- call schedule_tail # rdi: 'prev' task parameter
-
- RESTORE_EXTRA_REGS
-
- testl $3,CS(%rsp) # from kernel_thread?
-
- /*
- * By the time we get here, we have no idea whether our pt_regs,
- * ti flags, and ti status came from the 64-bit SYSCALL fast path,
- * the slow path, or one of the ia32entry paths.
- * Use IRET code path to return, since it can safely handle
- * all of the above.
- */
- jnz int_ret_from_sys_call
-
- /* We came from kernel_thread */
- /* nb: we depend on RESTORE_EXTRA_REGS above */
- movq %rbp, %rdi
- call *%rbx
- movl $0, RAX(%rsp)
- RESTORE_EXTRA_REGS
- jmp int_ret_from_sys_call
- CFI_ENDPROC
-END(ret_from_fork)
-
-/*
- * Build the entry stubs with some assembler magic.
- * We pack 1 stub into every 8-byte block.
- */
- .align 8
-ENTRY(irq_entries_start)
- INTR_FRAME
- vector=FIRST_EXTERNAL_VECTOR
- .rept (FIRST_SYSTEM_VECTOR - FIRST_EXTERNAL_VECTOR)
- pushq_cfi $(~vector+0x80) /* Note: always in signed byte range */
- vector=vector+1
- jmp common_interrupt
- CFI_ADJUST_CFA_OFFSET -8
- .align 8
- .endr
- CFI_ENDPROC
-END(irq_entries_start)
-
-/*
- * Interrupt entry/exit.
- *
- * Interrupt entry points save only callee clobbered registers in fast path.
- *
- * Entry runs with interrupts off.
- */
-
-/* 0(%rsp): ~(interrupt number) */
- .macro interrupt func
- cld
- /*
- * Since nothing in interrupt handling code touches r12...r15 members
- * of "struct pt_regs", and since interrupts can nest, we can save
- * four stack slots and simultaneously provide
- * an unwind-friendly stack layout by saving "truncated" pt_regs
- * exactly up to rbp slot, without these members.
- */
- ALLOC_PT_GPREGS_ON_STACK -RBP
- SAVE_C_REGS -RBP
- /* this goes to 0(%rsp) for unwinder, not for saving the value: */
- SAVE_EXTRA_REGS_RBP -RBP
-
- leaq -RBP(%rsp),%rdi /* arg1 for \func (pointer to pt_regs) */
-
- testl $3, CS-RBP(%rsp)
- je 1f
- SWAPGS
-1:
- /*
- * Save previous stack pointer, optionally switch to interrupt stack.
- * irq_count is used to check if a CPU is already on an interrupt stack
- * or not. While this is essentially redundant with preempt_count it is
- * a little cheaper to use a separate counter in the PDA (short of
- * moving irq_enter into assembly, which would be too much work)
- */
- movq %rsp, %rsi
- incl PER_CPU_VAR(irq_count)
- cmovzq PER_CPU_VAR(irq_stack_ptr),%rsp
- CFI_DEF_CFA_REGISTER rsi
- pushq %rsi
- /*
- * For debugger:
- * "CFA (Current Frame Address) is the value on stack + offset"
- */
- CFI_ESCAPE 0x0f /* DW_CFA_def_cfa_expression */, 6, \
- 0x77 /* DW_OP_breg7 (rsp) */, 0, \
- 0x06 /* DW_OP_deref */, \
- 0x08 /* DW_OP_const1u */, SIZEOF_PTREGS-RBP, \
- 0x22 /* DW_OP_plus */
- /* We entered an interrupt context - irqs are off: */
- TRACE_IRQS_OFF
-
- call \func
- .endm
-
- /*
- * The interrupt stubs push (~vector+0x80) onto the stack and
- * then jump to common_interrupt.
- */
- .p2align CONFIG_X86_L1_CACHE_SHIFT
-common_interrupt:
- XCPT_FRAME
- ASM_CLAC
- addq $-0x80,(%rsp) /* Adjust vector to [-256,-1] range */
- interrupt do_IRQ
- /* 0(%rsp): old RSP */
-ret_from_intr:
- DISABLE_INTERRUPTS(CLBR_NONE)
- TRACE_IRQS_OFF
- decl PER_CPU_VAR(irq_count)
-
- /* Restore saved previous stack */
- popq %rsi
- CFI_DEF_CFA rsi,SIZEOF_PTREGS-RBP /* reg/off reset after def_cfa_expr */
- /* return code expects complete pt_regs - adjust rsp accordingly: */
- leaq -RBP(%rsi),%rsp
- CFI_DEF_CFA_REGISTER rsp
- CFI_ADJUST_CFA_OFFSET RBP
-
- testl $3,CS(%rsp)
- je retint_kernel
- /* Interrupt came from user space */
-
- GET_THREAD_INFO(%rcx)
- /*
- * %rcx: thread info. Interrupts off.
- */
-retint_with_reschedule:
- movl $_TIF_WORK_MASK,%edi
-retint_check:
- LOCKDEP_SYS_EXIT_IRQ
- movl TI_flags(%rcx),%edx
- andl %edi,%edx
- CFI_REMEMBER_STATE
- jnz retint_careful
-
-retint_swapgs: /* return to user-space */
- /*
- * The iretq could re-enable interrupts:
- */
- DISABLE_INTERRUPTS(CLBR_ANY)
- TRACE_IRQS_IRETQ
-
- SWAPGS
- jmp restore_c_regs_and_iret
-
-/* Returning to kernel space */
-retint_kernel:
-#ifdef CONFIG_PREEMPT
- /* Interrupts are off */
- /* Check if we need preemption */
- bt $9,EFLAGS(%rsp) /* interrupts were off? */
- jnc 1f
-0: cmpl $0,PER_CPU_VAR(__preempt_count)
- jnz 1f
- call preempt_schedule_irq
- jmp 0b
-1:
-#endif
- /*
- * The iretq could re-enable interrupts:
- */
- TRACE_IRQS_IRETQ
-
-/*
- * At this label, code paths which return to kernel and to user,
- * which come from interrupts/exception and from syscalls, merge.
- */
-restore_c_regs_and_iret:
- RESTORE_C_REGS
- REMOVE_PT_GPREGS_FROM_STACK 8
-
-irq_return:
- INTERRUPT_RETURN
-
-ENTRY(native_iret)
- /*
- * Are we returning to a stack segment from the LDT? Note: in
- * 64-bit mode SS:RSP on the exception stack is always valid.
- */
-#ifdef CONFIG_X86_ESPFIX64
- testb $4,(SS-RIP)(%rsp)
- jnz native_irq_return_ldt
-#endif
-
-.global native_irq_return_iret
-native_irq_return_iret:
- /*
- * This may fault. Non-paranoid faults on return to userspace are
- * handled by fixup_bad_iret. These include #SS, #GP, and #NP.
- * Double-faults due to espfix64 are handled in do_double_fault.
- * Other faults here are fatal.
- */
- iretq
-
-#ifdef CONFIG_X86_ESPFIX64
-native_irq_return_ldt:
- pushq_cfi %rax
- pushq_cfi %rdi
- SWAPGS
- movq PER_CPU_VAR(espfix_waddr),%rdi
- movq %rax,(0*8)(%rdi) /* RAX */
- movq (2*8)(%rsp),%rax /* RIP */
- movq %rax,(1*8)(%rdi)
- movq (3*8)(%rsp),%rax /* CS */
- movq %rax,(2*8)(%rdi)
- movq (4*8)(%rsp),%rax /* RFLAGS */
- movq %rax,(3*8)(%rdi)
- movq (6*8)(%rsp),%rax /* SS */
- movq %rax,(5*8)(%rdi)
- movq (5*8)(%rsp),%rax /* RSP */
- movq %rax,(4*8)(%rdi)
- andl $0xffff0000,%eax
- popq_cfi %rdi
- orq PER_CPU_VAR(espfix_stack),%rax
- SWAPGS
- movq %rax,%rsp
- popq_cfi %rax
- jmp native_irq_return_iret
-#endif
-
- /* edi: workmask, edx: work */
-retint_careful:
- CFI_RESTORE_STATE
- bt $TIF_NEED_RESCHED,%edx
- jnc retint_signal
- TRACE_IRQS_ON
- ENABLE_INTERRUPTS(CLBR_NONE)
- pushq_cfi %rdi
- SCHEDULE_USER
- popq_cfi %rdi
- GET_THREAD_INFO(%rcx)
- DISABLE_INTERRUPTS(CLBR_NONE)
- TRACE_IRQS_OFF
- jmp retint_check
-
-retint_signal:
- testl $_TIF_DO_NOTIFY_MASK,%edx
- jz retint_swapgs
- TRACE_IRQS_ON
- ENABLE_INTERRUPTS(CLBR_NONE)
- SAVE_EXTRA_REGS
- movq $-1,ORIG_RAX(%rsp)
- xorl %esi,%esi # oldset
- movq %rsp,%rdi # &pt_regs
- call do_notify_resume
- RESTORE_EXTRA_REGS
- DISABLE_INTERRUPTS(CLBR_NONE)
- TRACE_IRQS_OFF
- GET_THREAD_INFO(%rcx)
- jmp retint_with_reschedule
-
- CFI_ENDPROC
-END(common_interrupt)
-
-/*
- * APIC interrupts.
- */
-.macro apicinterrupt3 num sym do_sym
-ENTRY(\sym)
- INTR_FRAME
- ASM_CLAC
- pushq_cfi $~(\num)
-.Lcommon_\sym:
- interrupt \do_sym
- jmp ret_from_intr
- CFI_ENDPROC
-END(\sym)
-.endm
-
-#ifdef CONFIG_TRACING
-#define trace(sym) trace_##sym
-#define smp_trace(sym) smp_trace_##sym
-
-.macro trace_apicinterrupt num sym
-apicinterrupt3 \num trace(\sym) smp_trace(\sym)
-.endm
-#else
-.macro trace_apicinterrupt num sym do_sym
-.endm
-#endif
-
-.macro apicinterrupt num sym do_sym
-apicinterrupt3 \num \sym \do_sym
-trace_apicinterrupt \num \sym
-.endm
-
-#ifdef CONFIG_SMP
-apicinterrupt3 IRQ_MOVE_CLEANUP_VECTOR \
- irq_move_cleanup_interrupt smp_irq_move_cleanup_interrupt
-apicinterrupt3 REBOOT_VECTOR \
- reboot_interrupt smp_reboot_interrupt
-#endif
-
-#ifdef CONFIG_X86_UV
-apicinterrupt3 UV_BAU_MESSAGE \
- uv_bau_message_intr1 uv_bau_message_interrupt
-#endif
-apicinterrupt LOCAL_TIMER_VECTOR \
- apic_timer_interrupt smp_apic_timer_interrupt
-apicinterrupt X86_PLATFORM_IPI_VECTOR \
- x86_platform_ipi smp_x86_platform_ipi
-
-#ifdef CONFIG_HAVE_KVM
-apicinterrupt3 POSTED_INTR_VECTOR \
- kvm_posted_intr_ipi smp_kvm_posted_intr_ipi
-#endif
-
-#ifdef CONFIG_X86_MCE_THRESHOLD
-apicinterrupt THRESHOLD_APIC_VECTOR \
- threshold_interrupt smp_threshold_interrupt
-#endif
-
-#ifdef CONFIG_X86_THERMAL_VECTOR
-apicinterrupt THERMAL_APIC_VECTOR \
- thermal_interrupt smp_thermal_interrupt
-#endif
-
-#ifdef CONFIG_SMP
-apicinterrupt CALL_FUNCTION_SINGLE_VECTOR \
- call_function_single_interrupt smp_call_function_single_interrupt
-apicinterrupt CALL_FUNCTION_VECTOR \
- call_function_interrupt smp_call_function_interrupt
-apicinterrupt RESCHEDULE_VECTOR \
- reschedule_interrupt smp_reschedule_interrupt
-#endif
-
-apicinterrupt ERROR_APIC_VECTOR \
- error_interrupt smp_error_interrupt
-apicinterrupt SPURIOUS_APIC_VECTOR \
- spurious_interrupt smp_spurious_interrupt
-
-#ifdef CONFIG_IRQ_WORK
-apicinterrupt IRQ_WORK_VECTOR \
- irq_work_interrupt smp_irq_work_interrupt
-#endif
-
-/*
- * Exception entry points.
- */
-#define CPU_TSS_IST(x) PER_CPU_VAR(cpu_tss) + (TSS_ist + ((x) - 1) * 8)
-
-.macro idtentry sym do_sym has_error_code:req paranoid=0 shift_ist=-1
-ENTRY(\sym)
- /* Sanity check */
- .if \shift_ist != -1 && \paranoid == 0
- .error "using shift_ist requires paranoid=1"
- .endif
-
- .if \has_error_code
- XCPT_FRAME
- .else
- INTR_FRAME
- .endif
-
- ASM_CLAC
- PARAVIRT_ADJUST_EXCEPTION_FRAME
-
- .ifeq \has_error_code
- pushq_cfi $-1 /* ORIG_RAX: no syscall to restart */
- .endif
-
- ALLOC_PT_GPREGS_ON_STACK
-
- .if \paranoid
- .if \paranoid == 1
- CFI_REMEMBER_STATE
- testl $3, CS(%rsp) /* If coming from userspace, switch */
- jnz 1f /* stacks. */
- .endif
- call paranoid_entry
- .else
- call error_entry
- .endif
- /* returned flag: ebx=0: need swapgs on exit, ebx=1: don't need it */
-
- DEFAULT_FRAME 0
-
- .if \paranoid
- .if \shift_ist != -1
- TRACE_IRQS_OFF_DEBUG /* reload IDT in case of recursion */
- .else
- TRACE_IRQS_OFF
- .endif
- .endif
-
- movq %rsp,%rdi /* pt_regs pointer */
-
- .if \has_error_code
- movq ORIG_RAX(%rsp),%rsi /* get error code */
- movq $-1,ORIG_RAX(%rsp) /* no syscall to restart */
- .else
- xorl %esi,%esi /* no error code */
- .endif
-
- .if \shift_ist != -1
- subq $EXCEPTION_STKSZ, CPU_TSS_IST(\shift_ist)
- .endif
-
- call \do_sym
-
- .if \shift_ist != -1
- addq $EXCEPTION_STKSZ, CPU_TSS_IST(\shift_ist)
- .endif
-
- /* these procedures expect "no swapgs" flag in ebx */
- .if \paranoid
- jmp paranoid_exit
- .else
- jmp error_exit
- .endif
-
- .if \paranoid == 1
- CFI_RESTORE_STATE
- /*
- * Paranoid entry from userspace. Switch stacks and treat it
- * as a normal entry. This means that paranoid handlers
- * run in real process context if user_mode(regs).
- */
-1:
- call error_entry
-
- DEFAULT_FRAME 0
-
- movq %rsp,%rdi /* pt_regs pointer */
- call sync_regs
- movq %rax,%rsp /* switch stack */
-
- movq %rsp,%rdi /* pt_regs pointer */
-
- .if \has_error_code
- movq ORIG_RAX(%rsp),%rsi /* get error code */
- movq $-1,ORIG_RAX(%rsp) /* no syscall to restart */
- .else
- xorl %esi,%esi /* no error code */
- .endif
-
- call \do_sym
-
- jmp error_exit /* %ebx: no swapgs flag */
- .endif
-
- CFI_ENDPROC
-END(\sym)
-.endm
-
-#ifdef CONFIG_TRACING
-.macro trace_idtentry sym do_sym has_error_code:req
-idtentry trace(\sym) trace(\do_sym) has_error_code=\has_error_code
-idtentry \sym \do_sym has_error_code=\has_error_code
-.endm
-#else
-.macro trace_idtentry sym do_sym has_error_code:req
-idtentry \sym \do_sym has_error_code=\has_error_code
-.endm
-#endif
-
-idtentry divide_error do_divide_error has_error_code=0
-idtentry overflow do_overflow has_error_code=0
-idtentry bounds do_bounds has_error_code=0
-idtentry invalid_op do_invalid_op has_error_code=0
-idtentry device_not_available do_device_not_available has_error_code=0
-idtentry double_fault do_double_fault has_error_code=1 paranoid=2
-idtentry coprocessor_segment_overrun do_coprocessor_segment_overrun has_error_code=0
-idtentry invalid_TSS do_invalid_TSS has_error_code=1
-idtentry segment_not_present do_segment_not_present has_error_code=1
-idtentry spurious_interrupt_bug do_spurious_interrupt_bug has_error_code=0
-idtentry coprocessor_error do_coprocessor_error has_error_code=0
-idtentry alignment_check do_alignment_check has_error_code=1
-idtentry simd_coprocessor_error do_simd_coprocessor_error has_error_code=0
-
-
- /* Reload gs selector with exception handling */
- /* edi: new selector */
-ENTRY(native_load_gs_index)
- CFI_STARTPROC
- pushfq_cfi
- DISABLE_INTERRUPTS(CLBR_ANY & ~CLBR_RDI)
- SWAPGS
-gs_change:
- movl %edi,%gs
-2: mfence /* workaround */
- SWAPGS
- popfq_cfi
- ret
- CFI_ENDPROC
-END(native_load_gs_index)
-
- _ASM_EXTABLE(gs_change,bad_gs)
- .section .fixup,"ax"
- /* running with kernelgs */
-bad_gs:
- SWAPGS /* switch back to user gs */
- xorl %eax,%eax
- movl %eax,%gs
- jmp 2b
- .previous
-
-/* Call softirq on interrupt stack. Interrupts are off. */
-ENTRY(do_softirq_own_stack)
- CFI_STARTPROC
- pushq_cfi %rbp
- CFI_REL_OFFSET rbp,0
- mov %rsp,%rbp
- CFI_DEF_CFA_REGISTER rbp
- incl PER_CPU_VAR(irq_count)
- cmove PER_CPU_VAR(irq_stack_ptr),%rsp
- push %rbp # backlink for old unwinder
- call __do_softirq
- leaveq
- CFI_RESTORE rbp
- CFI_DEF_CFA_REGISTER rsp
- CFI_ADJUST_CFA_OFFSET -8
- decl PER_CPU_VAR(irq_count)
- ret
- CFI_ENDPROC
-END(do_softirq_own_stack)
-
-#ifdef CONFIG_XEN
-idtentry xen_hypervisor_callback xen_do_hypervisor_callback has_error_code=0
-
-/*
- * A note on the "critical region" in our callback handler.
- * We want to avoid stacking callback handlers due to events occurring
- * during handling of the last event. To do this, we keep events disabled
- * until we've done all processing. HOWEVER, we must enable events before
- * popping the stack frame (can't be done atomically) and so it would still
- * be possible to get enough handler activations to overflow the stack.
- * Although unlikely, bugs of that kind are hard to track down, so we'd
- * like to avoid the possibility.
- * So, on entry to the handler we detect whether we interrupted an
- * existing activation in its critical region -- if so, we pop the current
- * activation and restart the handler using the previous one.
- */
-ENTRY(xen_do_hypervisor_callback) # do_hypervisor_callback(struct *pt_regs)
- CFI_STARTPROC
-/*
- * Since we don't modify %rdi, evtchn_do_upall(struct *pt_regs) will
- * see the correct pointer to the pt_regs
- */
- movq %rdi, %rsp # we don't return, adjust the stack frame
- CFI_ENDPROC
- DEFAULT_FRAME
-11: incl PER_CPU_VAR(irq_count)
- movq %rsp,%rbp
- CFI_DEF_CFA_REGISTER rbp
- cmovzq PER_CPU_VAR(irq_stack_ptr),%rsp
- pushq %rbp # backlink for old unwinder
- call xen_evtchn_do_upcall
- popq %rsp
- CFI_DEF_CFA_REGISTER rsp
- decl PER_CPU_VAR(irq_count)
-#ifndef CONFIG_PREEMPT
- call xen_maybe_preempt_hcall
-#endif
- jmp error_exit
- CFI_ENDPROC
-END(xen_do_hypervisor_callback)
-
-/*
- * Hypervisor uses this for application faults while it executes.
- * We get here for two reasons:
- * 1. Fault while reloading DS, ES, FS or GS
- * 2. Fault while executing IRET
- * Category 1 we do not need to fix up as Xen has already reloaded all segment
- * registers that could be reloaded and zeroed the others.
- * Category 2 we fix up by killing the current process. We cannot use the
- * normal Linux return path in this case because if we use the IRET hypercall
- * to pop the stack frame we end up in an infinite loop of failsafe callbacks.
- * We distinguish between categories by comparing each saved segment register
- * with its current contents: any discrepancy means we in category 1.
- */
-ENTRY(xen_failsafe_callback)
- INTR_FRAME 1 (6*8)
- /*CFI_REL_OFFSET gs,GS*/
- /*CFI_REL_OFFSET fs,FS*/
- /*CFI_REL_OFFSET es,ES*/
- /*CFI_REL_OFFSET ds,DS*/
- CFI_REL_OFFSET r11,8
- CFI_REL_OFFSET rcx,0
- movw %ds,%cx
- cmpw %cx,0x10(%rsp)
- CFI_REMEMBER_STATE
- jne 1f
- movw %es,%cx
- cmpw %cx,0x18(%rsp)
- jne 1f
- movw %fs,%cx
- cmpw %cx,0x20(%rsp)
- jne 1f
- movw %gs,%cx
- cmpw %cx,0x28(%rsp)
- jne 1f
- /* All segments match their saved values => Category 2 (Bad IRET). */
- movq (%rsp),%rcx
- CFI_RESTORE rcx
- movq 8(%rsp),%r11
- CFI_RESTORE r11
- addq $0x30,%rsp
- CFI_ADJUST_CFA_OFFSET -0x30
- pushq_cfi $0 /* RIP */
- pushq_cfi %r11
- pushq_cfi %rcx
- jmp general_protection
- CFI_RESTORE_STATE
-1: /* Segment mismatch => Category 1 (Bad segment). Retry the IRET. */
- movq (%rsp),%rcx
- CFI_RESTORE rcx
- movq 8(%rsp),%r11
- CFI_RESTORE r11
- addq $0x30,%rsp
- CFI_ADJUST_CFA_OFFSET -0x30
- pushq_cfi $-1 /* orig_ax = -1 => not a system call */
- ALLOC_PT_GPREGS_ON_STACK
- SAVE_C_REGS
- SAVE_EXTRA_REGS
- jmp error_exit
- CFI_ENDPROC
-END(xen_failsafe_callback)
-
-apicinterrupt3 HYPERVISOR_CALLBACK_VECTOR \
- xen_hvm_callback_vector xen_evtchn_do_upcall
-
-#endif /* CONFIG_XEN */
-
-#if IS_ENABLED(CONFIG_HYPERV)
-apicinterrupt3 HYPERVISOR_CALLBACK_VECTOR \
- hyperv_callback_vector hyperv_vector_handler
-#endif /* CONFIG_HYPERV */
-
-idtentry debug do_debug has_error_code=0 paranoid=1 shift_ist=DEBUG_STACK
-idtentry int3 do_int3 has_error_code=0 paranoid=1 shift_ist=DEBUG_STACK
-idtentry stack_segment do_stack_segment has_error_code=1
-#ifdef CONFIG_XEN
-idtentry xen_debug do_debug has_error_code=0
-idtentry xen_int3 do_int3 has_error_code=0
-idtentry xen_stack_segment do_stack_segment has_error_code=1
-#endif
-idtentry general_protection do_general_protection has_error_code=1
-trace_idtentry page_fault do_page_fault has_error_code=1
-#ifdef CONFIG_KVM_GUEST
-idtentry async_page_fault do_async_page_fault has_error_code=1
-#endif
-#ifdef CONFIG_X86_MCE
-idtentry machine_check has_error_code=0 paranoid=1 do_sym=*machine_check_vector(%rip)
-#endif
-
-/*
- * Save all registers in pt_regs, and switch gs if needed.
- * Use slow, but surefire "are we in kernel?" check.
- * Return: ebx=0: need swapgs on exit, ebx=1: otherwise
- */
-ENTRY(paranoid_entry)
- XCPT_FRAME 1 15*8
- cld
- SAVE_C_REGS 8
- SAVE_EXTRA_REGS 8
- movl $1,%ebx
- movl $MSR_GS_BASE,%ecx
- rdmsr
- testl %edx,%edx
- js 1f /* negative -> in kernel */
- SWAPGS
- xorl %ebx,%ebx
-1: ret
- CFI_ENDPROC
-END(paranoid_entry)
-
-/*
- * "Paranoid" exit path from exception stack. This is invoked
- * only on return from non-NMI IST interrupts that came
- * from kernel space.
- *
- * We may be returning to very strange contexts (e.g. very early
- * in syscall entry), so checking for preemption here would
- * be complicated. Fortunately, we there's no good reason
- * to try to handle preemption here.
- */
-/* On entry, ebx is "no swapgs" flag (1: don't need swapgs, 0: need it) */
-ENTRY(paranoid_exit)
- DEFAULT_FRAME
- DISABLE_INTERRUPTS(CLBR_NONE)
- TRACE_IRQS_OFF_DEBUG
- testl %ebx,%ebx /* swapgs needed? */
- jnz paranoid_exit_no_swapgs
- TRACE_IRQS_IRETQ
- SWAPGS_UNSAFE_STACK
- jmp paranoid_exit_restore
-paranoid_exit_no_swapgs:
- TRACE_IRQS_IRETQ_DEBUG
-paranoid_exit_restore:
- RESTORE_EXTRA_REGS
- RESTORE_C_REGS
- REMOVE_PT_GPREGS_FROM_STACK 8
- INTERRUPT_RETURN
- CFI_ENDPROC
-END(paranoid_exit)
-
-/*
- * Save all registers in pt_regs, and switch gs if needed.
- * Return: ebx=0: need swapgs on exit, ebx=1: otherwise
- */
-ENTRY(error_entry)
- XCPT_FRAME 1 15*8
- cld
- SAVE_C_REGS 8
- SAVE_EXTRA_REGS 8
- xorl %ebx,%ebx
- testl $3,CS+8(%rsp)
- je error_kernelspace
-error_swapgs:
- SWAPGS
-error_sti:
- TRACE_IRQS_OFF
- ret
-
- /*
- * There are two places in the kernel that can potentially fault with
- * usergs. Handle them here. B stepping K8s sometimes report a
- * truncated RIP for IRET exceptions returning to compat mode. Check
- * for these here too.
- */
-error_kernelspace:
- CFI_REL_OFFSET rcx, RCX+8
- incl %ebx
- leaq native_irq_return_iret(%rip),%rcx
- cmpq %rcx,RIP+8(%rsp)
- je error_bad_iret
- movl %ecx,%eax /* zero extend */
- cmpq %rax,RIP+8(%rsp)
- je bstep_iret
- cmpq $gs_change,RIP+8(%rsp)
- je error_swapgs
- jmp error_sti
-
-bstep_iret:
- /* Fix truncated RIP */
- movq %rcx,RIP+8(%rsp)
- /* fall through */
-
-error_bad_iret:
- SWAPGS
- mov %rsp,%rdi
- call fixup_bad_iret
- mov %rax,%rsp
- decl %ebx /* Return to usergs */
- jmp error_sti
- CFI_ENDPROC
-END(error_entry)
-
-
-/* On entry, ebx is "no swapgs" flag (1: don't need swapgs, 0: need it) */
-ENTRY(error_exit)
- DEFAULT_FRAME
- movl %ebx,%eax
- RESTORE_EXTRA_REGS
- DISABLE_INTERRUPTS(CLBR_NONE)
- TRACE_IRQS_OFF
- GET_THREAD_INFO(%rcx)
- testl %eax,%eax
- jne retint_kernel
- LOCKDEP_SYS_EXIT_IRQ
- movl TI_flags(%rcx),%edx
- movl $_TIF_WORK_MASK,%edi
- andl %edi,%edx
- jnz retint_careful
- jmp retint_swapgs
- CFI_ENDPROC
-END(error_exit)
-
-/* Runs on exception stack */
-ENTRY(nmi)
- INTR_FRAME
- PARAVIRT_ADJUST_EXCEPTION_FRAME
- /*
- * We allow breakpoints in NMIs. If a breakpoint occurs, then
- * the iretq it performs will take us out of NMI context.
- * This means that we can have nested NMIs where the next
- * NMI is using the top of the stack of the previous NMI. We
- * can't let it execute because the nested NMI will corrupt the
- * stack of the previous NMI. NMI handlers are not re-entrant
- * anyway.
- *
- * To handle this case we do the following:
- * Check the a special location on the stack that contains
- * a variable that is set when NMIs are executing.
- * The interrupted task's stack is also checked to see if it
- * is an NMI stack.
- * If the variable is not set and the stack is not the NMI
- * stack then:
- * o Set the special variable on the stack
- * o Copy the interrupt frame into a "saved" location on the stack
- * o Copy the interrupt frame into a "copy" location on the stack
- * o Continue processing the NMI
- * If the variable is set or the previous stack is the NMI stack:
- * o Modify the "copy" location to jump to the repeate_nmi
- * o return back to the first NMI
- *
- * Now on exit of the first NMI, we first clear the stack variable
- * The NMI stack will tell any nested NMIs at that point that it is
- * nested. Then we pop the stack normally with iret, and if there was
- * a nested NMI that updated the copy interrupt stack frame, a
- * jump will be made to the repeat_nmi code that will handle the second
- * NMI.
- */
-
- /* Use %rdx as our temp variable throughout */
- pushq_cfi %rdx
- CFI_REL_OFFSET rdx, 0
-
- /*
- * If %cs was not the kernel segment, then the NMI triggered in user
- * space, which means it is definitely not nested.
- */
- cmpl $__KERNEL_CS, 16(%rsp)
- jne first_nmi
-
- /*
- * Check the special variable on the stack to see if NMIs are
- * executing.
- */
- cmpl $1, -8(%rsp)
- je nested_nmi
-
- /*
- * Now test if the previous stack was an NMI stack.
- * We need the double check. We check the NMI stack to satisfy the
- * race when the first NMI clears the variable before returning.
- * We check the variable because the first NMI could be in a
- * breakpoint routine using a breakpoint stack.
- */
- lea 6*8(%rsp), %rdx
- /* Compare the NMI stack (rdx) with the stack we came from (4*8(%rsp)) */
- cmpq %rdx, 4*8(%rsp)
- /* If the stack pointer is above the NMI stack, this is a normal NMI */
- ja first_nmi
- subq $EXCEPTION_STKSZ, %rdx
- cmpq %rdx, 4*8(%rsp)
- /* If it is below the NMI stack, it is a normal NMI */
- jb first_nmi
- /* Ah, it is within the NMI stack, treat it as nested */
-
- CFI_REMEMBER_STATE
-
-nested_nmi:
- /*
- * Do nothing if we interrupted the fixup in repeat_nmi.
- * It's about to repeat the NMI handler, so we are fine
- * with ignoring this one.
- */
- movq $repeat_nmi, %rdx
- cmpq 8(%rsp), %rdx
- ja 1f
- movq $end_repeat_nmi, %rdx
- cmpq 8(%rsp), %rdx
- ja nested_nmi_out
-
-1:
- /* Set up the interrupted NMIs stack to jump to repeat_nmi */
- leaq -1*8(%rsp), %rdx
- movq %rdx, %rsp
- CFI_ADJUST_CFA_OFFSET 1*8
- leaq -10*8(%rsp), %rdx
- pushq_cfi $__KERNEL_DS
- pushq_cfi %rdx
- pushfq_cfi
- pushq_cfi $__KERNEL_CS
- pushq_cfi $repeat_nmi
-
- /* Put stack back */
- addq $(6*8), %rsp
- CFI_ADJUST_CFA_OFFSET -6*8
-
-nested_nmi_out:
- popq_cfi %rdx
- CFI_RESTORE rdx
-
- /* No need to check faults here */
- INTERRUPT_RETURN
-
- CFI_RESTORE_STATE
-first_nmi:
- /*
- * Because nested NMIs will use the pushed location that we
- * stored in rdx, we must keep that space available.
- * Here's what our stack frame will look like:
- * +-------------------------+
- * | original SS |
- * | original Return RSP |
- * | original RFLAGS |
- * | original CS |
- * | original RIP |
- * +-------------------------+
- * | temp storage for rdx |
- * +-------------------------+
- * | NMI executing variable |
- * +-------------------------+
- * | copied SS |
- * | copied Return RSP |
- * | copied RFLAGS |
- * | copied CS |
- * | copied RIP |
- * +-------------------------+
- * | Saved SS |
- * | Saved Return RSP |
- * | Saved RFLAGS |
- * | Saved CS |
- * | Saved RIP |
- * +-------------------------+
- * | pt_regs |
- * +-------------------------+
- *
- * The saved stack frame is used to fix up the copied stack frame
- * that a nested NMI may change to make the interrupted NMI iret jump
- * to the repeat_nmi. The original stack frame and the temp storage
- * is also used by nested NMIs and can not be trusted on exit.
- */
- /* Do not pop rdx, nested NMIs will corrupt that part of the stack */
- movq (%rsp), %rdx
- CFI_RESTORE rdx
-
- /* Set the NMI executing variable on the stack. */
- pushq_cfi $1
-
- /*
- * Leave room for the "copied" frame
- */
- subq $(5*8), %rsp
- CFI_ADJUST_CFA_OFFSET 5*8
-
- /* Copy the stack frame to the Saved frame */
- .rept 5
- pushq_cfi 11*8(%rsp)
- .endr
- CFI_DEF_CFA_OFFSET 5*8
-
- /* Everything up to here is safe from nested NMIs */
-
- /*
- * If there was a nested NMI, the first NMI's iret will return
- * here. But NMIs are still enabled and we can take another
- * nested NMI. The nested NMI checks the interrupted RIP to see
- * if it is between repeat_nmi and end_repeat_nmi, and if so
- * it will just return, as we are about to repeat an NMI anyway.
- * This makes it safe to copy to the stack frame that a nested
- * NMI will update.
- */
-repeat_nmi:
- /*
- * Update the stack variable to say we are still in NMI (the update
- * is benign for the non-repeat case, where 1 was pushed just above
- * to this very stack slot).
- */
- movq $1, 10*8(%rsp)
-
- /* Make another copy, this one may be modified by nested NMIs */
- addq $(10*8), %rsp
- CFI_ADJUST_CFA_OFFSET -10*8
- .rept 5
- pushq_cfi -6*8(%rsp)
- .endr
- subq $(5*8), %rsp
- CFI_DEF_CFA_OFFSET 5*8
-end_repeat_nmi:
-
- /*
- * Everything below this point can be preempted by a nested
- * NMI if the first NMI took an exception and reset our iret stack
- * so that we repeat another NMI.
- */
- pushq_cfi $-1 /* ORIG_RAX: no syscall to restart */
- ALLOC_PT_GPREGS_ON_STACK
-
- /*
- * Use paranoid_entry to handle SWAPGS, but no need to use paranoid_exit
- * as we should not be calling schedule in NMI context.
- * Even with normal interrupts enabled. An NMI should not be
- * setting NEED_RESCHED or anything that normal interrupts and
- * exceptions might do.
- */
- call paranoid_entry
- DEFAULT_FRAME 0
-
- /*
- * Save off the CR2 register. If we take a page fault in the NMI then
- * it could corrupt the CR2 value. If the NMI preempts a page fault
- * handler before it was able to read the CR2 register, and then the
- * NMI itself takes a page fault, the page fault that was preempted
- * will read the information from the NMI page fault and not the
- * origin fault. Save it off and restore it if it changes.
- * Use the r12 callee-saved register.
- */
- movq %cr2, %r12
-
- /* paranoidentry do_nmi, 0; without TRACE_IRQS_OFF */
- movq %rsp,%rdi
- movq $-1,%rsi
- call do_nmi
-
- /* Did the NMI take a page fault? Restore cr2 if it did */
- movq %cr2, %rcx
- cmpq %rcx, %r12
- je 1f
- movq %r12, %cr2
-1:
-
- testl %ebx,%ebx /* swapgs needed? */
- jnz nmi_restore
-nmi_swapgs:
- SWAPGS_UNSAFE_STACK
-nmi_restore:
- RESTORE_EXTRA_REGS
- RESTORE_C_REGS
- /* Pop the extra iret frame at once */
- REMOVE_PT_GPREGS_FROM_STACK 6*8
-
- /* Clear the NMI executing stack variable */
- movq $0, 5*8(%rsp)
- jmp irq_return
- CFI_ENDPROC
-END(nmi)
-
-ENTRY(ignore_sysret)
- CFI_STARTPROC
- mov $-ENOSYS,%eax
- sysret
- CFI_ENDPROC
-END(ignore_sysret)
-
diff --git a/arch/x86/kernel/head_32.S b/arch/x86/kernel/head_32.S
index 7e429c99c728..0e2d96ffd158 100644
--- a/arch/x86/kernel/head_32.S
+++ b/arch/x86/kernel/head_32.S
@@ -557,7 +557,7 @@ early_idt_handler_common:
cld
cmpl $2,(%esp) # X86_TRAP_NMI
- je is_nmi # Ignore NMI
+ je .Lis_nmi # Ignore NMI
cmpl $2,%ss:early_recursion_flag
je hlt_loop
@@ -610,7 +610,7 @@ ex_entry:
pop %ecx
pop %eax
decl %ss:early_recursion_flag
-is_nmi:
+.Lis_nmi:
addl $8,%esp /* drop vector number and error code */
iret
ENDPROC(early_idt_handler_common)
diff --git a/arch/x86/kernel/head_64.S b/arch/x86/kernel/head_64.S
index df7e78057ae0..e5c27f729a38 100644
--- a/arch/x86/kernel/head_64.S
+++ b/arch/x86/kernel/head_64.S
@@ -346,7 +346,7 @@ early_idt_handler_common:
cld
cmpl $2,(%rsp) # X86_TRAP_NMI
- je is_nmi # Ignore NMI
+ je .Lis_nmi # Ignore NMI
cmpl $2,early_recursion_flag(%rip)
jz 1f
@@ -411,7 +411,7 @@ early_idt_handler_common:
popq %rcx
popq %rax
decl early_recursion_flag(%rip)
-is_nmi:
+.Lis_nmi:
addq $16,%rsp # drop vector number and error code
INTERRUPT_RETURN
ENDPROC(early_idt_handler_common)
diff --git a/arch/x86/kernel/hpet.c b/arch/x86/kernel/hpet.c
index 3acbff4716b0..10757d0a3fcf 100644
--- a/arch/x86/kernel/hpet.c
+++ b/arch/x86/kernel/hpet.c
@@ -12,6 +12,7 @@
#include <linux/pm.h>
#include <linux/io.h>
+#include <asm/irqdomain.h>
#include <asm/fixmap.h>
#include <asm/hpet.h>
#include <asm/time.h>
@@ -305,8 +306,6 @@ static void hpet_legacy_clockevent_register(void)
printk(KERN_DEBUG "hpet clockevent registered\n");
}
-static int hpet_setup_msi_irq(unsigned int irq);
-
static void hpet_set_mode(enum clock_event_mode mode,
struct clock_event_device *evt, int timer)
{
@@ -357,7 +356,7 @@ static void hpet_set_mode(enum clock_event_mode mode,
hpet_enable_legacy_int();
} else {
struct hpet_dev *hdev = EVT_TO_HPET_DEV(evt);
- hpet_setup_msi_irq(hdev->irq);
+ irq_domain_activate_irq(irq_get_irq_data(hdev->irq));
disable_irq(hdev->irq);
irq_set_affinity(hdev->irq, cpumask_of(hdev->cpu));
enable_irq(hdev->irq);
@@ -423,6 +422,7 @@ static int hpet_legacy_next_event(unsigned long delta,
static DEFINE_PER_CPU(struct hpet_dev *, cpu_hpet_dev);
static struct hpet_dev *hpet_devs;
+static struct irq_domain *hpet_domain;
void hpet_msi_unmask(struct irq_data *data)
{
@@ -473,31 +473,6 @@ static int hpet_msi_next_event(unsigned long delta,
return hpet_next_event(delta, evt, hdev->num);
}
-static int hpet_setup_msi_irq(unsigned int irq)
-{
- if (x86_msi.setup_hpet_msi(irq, hpet_blockid)) {
- irq_free_hwirq(irq);
- return -EINVAL;
- }
- return 0;
-}
-
-static int hpet_assign_irq(struct hpet_dev *dev)
-{
- unsigned int irq = irq_alloc_hwirq(-1);
-
- if (!irq)
- return -EINVAL;
-
- irq_set_handler_data(irq, dev);
-
- if (hpet_setup_msi_irq(irq))
- return -EINVAL;
-
- dev->irq = irq;
- return 0;
-}
-
static irqreturn_t hpet_interrupt_handler(int irq, void *data)
{
struct hpet_dev *dev = (struct hpet_dev *)data;
@@ -540,9 +515,6 @@ static void init_one_hpet_msi_clockevent(struct hpet_dev *hdev, int cpu)
if (!(hdev->flags & HPET_DEV_VALID))
return;
- if (hpet_setup_msi_irq(hdev->irq))
- return;
-
hdev->cpu = cpu;
per_cpu(cpu_hpet_dev, cpu) = hdev;
evt->name = hdev->name;
@@ -574,7 +546,7 @@ static void hpet_msi_capability_lookup(unsigned int start_timer)
unsigned int id;
unsigned int num_timers;
unsigned int num_timers_used = 0;
- int i;
+ int i, irq;
if (hpet_msi_disable)
return;
@@ -587,6 +559,10 @@ static void hpet_msi_capability_lookup(unsigned int start_timer)
num_timers++; /* Value read out starts from 0 */
hpet_print_config();
+ hpet_domain = hpet_create_irq_domain(hpet_blockid);
+ if (!hpet_domain)
+ return;
+
hpet_devs = kzalloc(sizeof(struct hpet_dev) * num_timers, GFP_KERNEL);
if (!hpet_devs)
return;
@@ -604,12 +580,14 @@ static void hpet_msi_capability_lookup(unsigned int start_timer)
hdev->flags = 0;
if (cfg & HPET_TN_PERIODIC_CAP)
hdev->flags |= HPET_DEV_PERI_CAP;
+ sprintf(hdev->name, "hpet%d", i);
hdev->num = i;
- sprintf(hdev->name, "hpet%d", i);
- if (hpet_assign_irq(hdev))
+ irq = hpet_assign_irq(hpet_domain, hdev, hdev->num);
+ if (irq <= 0)
continue;
+ hdev->irq = irq;
hdev->flags |= HPET_DEV_FSB_CAP;
hdev->flags |= HPET_DEV_VALID;
num_timers_used++;
@@ -709,10 +687,6 @@ static int hpet_cpuhp_notify(struct notifier_block *n,
}
#else
-static int hpet_setup_msi_irq(unsigned int irq)
-{
- return 0;
-}
static void hpet_msi_capability_lookup(unsigned int start_timer)
{
return;
diff --git a/arch/x86/kernel/i8259.c b/arch/x86/kernel/i8259.c
index e7cc5370cd2f..16cb827a5b27 100644
--- a/arch/x86/kernel/i8259.c
+++ b/arch/x86/kernel/i8259.c
@@ -329,8 +329,8 @@ static void init_8259A(int auto_eoi)
*/
outb_pic(0x11, PIC_MASTER_CMD); /* ICW1: select 8259A-1 init */
- /* ICW2: 8259A-1 IR0-7 mapped to 0x30-0x37 */
- outb_pic(IRQ0_VECTOR, PIC_MASTER_IMR);
+ /* ICW2: 8259A-1 IR0-7 mapped to ISA_IRQ_VECTOR(0) */
+ outb_pic(ISA_IRQ_VECTOR(0), PIC_MASTER_IMR);
/* 8259A-1 (the master) has a slave on IR2 */
outb_pic(1U << PIC_CASCADE_IR, PIC_MASTER_IMR);
@@ -342,8 +342,8 @@ static void init_8259A(int auto_eoi)
outb_pic(0x11, PIC_SLAVE_CMD); /* ICW1: select 8259A-2 init */
- /* ICW2: 8259A-2 IR0-7 mapped to IRQ8_VECTOR */
- outb_pic(IRQ8_VECTOR, PIC_SLAVE_IMR);
+ /* ICW2: 8259A-2 IR0-7 mapped to ISA_IRQ_VECTOR(8) */
+ outb_pic(ISA_IRQ_VECTOR(8), PIC_SLAVE_IMR);
/* 8259A-2 is a slave on master's IR2 */
outb_pic(PIC_CASCADE_IR, PIC_SLAVE_IMR);
/* (slave's support for AEOI in flat mode is to be investigated) */
diff --git a/arch/x86/kernel/irq.c b/arch/x86/kernel/irq.c
index e5952c225532..88b366487b0e 100644
--- a/arch/x86/kernel/irq.c
+++ b/arch/x86/kernel/irq.c
@@ -22,6 +22,12 @@
#define CREATE_TRACE_POINTS
#include <asm/trace/irq_vectors.h>
+DEFINE_PER_CPU_SHARED_ALIGNED(irq_cpustat_t, irq_stat);
+EXPORT_PER_CPU_SYMBOL(irq_stat);
+
+DEFINE_PER_CPU(struct pt_regs *, irq_regs);
+EXPORT_PER_CPU_SYMBOL(irq_regs);
+
atomic_t irq_err_count;
/* Function pointer for generic interrupt vector handling */
@@ -116,6 +122,12 @@ int arch_show_interrupts(struct seq_file *p, int prec)
seq_printf(p, "%10u ", irq_stats(j)->irq_threshold_count);
seq_puts(p, " Threshold APIC interrupts\n");
#endif
+#ifdef CONFIG_X86_MCE_AMD
+ seq_printf(p, "%*s: ", prec, "DFR");
+ for_each_online_cpu(j)
+ seq_printf(p, "%10u ", irq_stats(j)->irq_deferred_error_count);
+ seq_puts(p, " Deferred Error APIC interrupts\n");
+#endif
#ifdef CONFIG_X86_MCE
seq_printf(p, "%*s: ", prec, "MCE");
for_each_online_cpu(j)
@@ -136,6 +148,18 @@ int arch_show_interrupts(struct seq_file *p, int prec)
#if defined(CONFIG_X86_IO_APIC)
seq_printf(p, "%*s: %10u\n", prec, "MIS", atomic_read(&irq_mis_count));
#endif
+#ifdef CONFIG_HAVE_KVM
+ seq_printf(p, "%*s: ", prec, "PIN");
+ for_each_online_cpu(j)
+ seq_printf(p, "%10u ", irq_stats(j)->kvm_posted_intr_ipis);
+ seq_puts(p, " Posted-interrupt notification event\n");
+
+ seq_printf(p, "%*s: ", prec, "PIW");
+ for_each_online_cpu(j)
+ seq_printf(p, "%10u ",
+ irq_stats(j)->kvm_posted_intr_wakeup_ipis);
+ seq_puts(p, " Posted-interrupt wakeup event\n");
+#endif
return 0;
}
@@ -192,8 +216,7 @@ __visible unsigned int __irq_entry do_IRQ(struct pt_regs *regs)
unsigned vector = ~regs->orig_ax;
unsigned irq;
- irq_enter();
- exit_idle();
+ entering_irq();
irq = __this_cpu_read(vector_irq[vector]);
@@ -209,7 +232,7 @@ __visible unsigned int __irq_entry do_IRQ(struct pt_regs *regs)
}
}
- irq_exit();
+ exiting_irq();
set_irq_regs(old_regs);
return 1;
@@ -237,6 +260,18 @@ __visible void smp_x86_platform_ipi(struct pt_regs *regs)
}
#ifdef CONFIG_HAVE_KVM
+static void dummy_handler(void) {}
+static void (*kvm_posted_intr_wakeup_handler)(void) = dummy_handler;
+
+void kvm_set_posted_intr_wakeup_handler(void (*handler)(void))
+{
+ if (handler)
+ kvm_posted_intr_wakeup_handler = handler;
+ else
+ kvm_posted_intr_wakeup_handler = dummy_handler;
+}
+EXPORT_SYMBOL_GPL(kvm_set_posted_intr_wakeup_handler);
+
/*
* Handler for POSTED_INTERRUPT_VECTOR.
*/
@@ -244,16 +279,23 @@ __visible void smp_kvm_posted_intr_ipi(struct pt_regs *regs)
{
struct pt_regs *old_regs = set_irq_regs(regs);
- ack_APIC_irq();
-
- irq_enter();
-
- exit_idle();
-
+ entering_ack_irq();
inc_irq_stat(kvm_posted_intr_ipis);
+ exiting_irq();
+ set_irq_regs(old_regs);
+}
- irq_exit();
+/*
+ * Handler for POSTED_INTERRUPT_WAKEUP_VECTOR.
+ */
+__visible void smp_kvm_posted_intr_wakeup_ipi(struct pt_regs *regs)
+{
+ struct pt_regs *old_regs = set_irq_regs(regs);
+ entering_ack_irq();
+ inc_irq_stat(kvm_posted_intr_wakeup_ipis);
+ kvm_posted_intr_wakeup_handler();
+ exiting_irq();
set_irq_regs(old_regs);
}
#endif
diff --git a/arch/x86/kernel/irq_32.c b/arch/x86/kernel/irq_32.c
index f9fd86a7fcc7..cd74f5978ab9 100644
--- a/arch/x86/kernel/irq_32.c
+++ b/arch/x86/kernel/irq_32.c
@@ -21,12 +21,6 @@
#include <asm/apic.h>
-DEFINE_PER_CPU_SHARED_ALIGNED(irq_cpustat_t, irq_stat);
-EXPORT_PER_CPU_SYMBOL(irq_stat);
-
-DEFINE_PER_CPU(struct pt_regs *, irq_regs);
-EXPORT_PER_CPU_SYMBOL(irq_regs);
-
#ifdef CONFIG_DEBUG_STACKOVERFLOW
int sysctl_panic_on_stackoverflow __read_mostly;
diff --git a/arch/x86/kernel/irq_64.c b/arch/x86/kernel/irq_64.c
index 394e643d7830..bc4604e500a3 100644
--- a/arch/x86/kernel/irq_64.c
+++ b/arch/x86/kernel/irq_64.c
@@ -20,12 +20,6 @@
#include <asm/idle.h>
#include <asm/apic.h>
-DEFINE_PER_CPU_SHARED_ALIGNED(irq_cpustat_t, irq_stat);
-EXPORT_PER_CPU_SYMBOL(irq_stat);
-
-DEFINE_PER_CPU(struct pt_regs *, irq_regs);
-EXPORT_PER_CPU_SYMBOL(irq_regs);
-
int sysctl_panic_on_stackoverflow;
/*
diff --git a/arch/x86/kernel/irq_work.c b/arch/x86/kernel/irq_work.c
index 15d741ddfeeb..dc5fa6a1e8d6 100644
--- a/arch/x86/kernel/irq_work.c
+++ b/arch/x86/kernel/irq_work.c
@@ -10,12 +10,6 @@
#include <asm/apic.h>
#include <asm/trace/irq_vectors.h>
-static inline void irq_work_entering_irq(void)
-{
- irq_enter();
- ack_APIC_irq();
-}
-
static inline void __smp_irq_work_interrupt(void)
{
inc_irq_stat(apic_irq_work_irqs);
@@ -24,14 +18,14 @@ static inline void __smp_irq_work_interrupt(void)
__visible void smp_irq_work_interrupt(struct pt_regs *regs)
{
- irq_work_entering_irq();
+ ipi_entering_ack_irq();
__smp_irq_work_interrupt();
exiting_irq();
}
__visible void smp_trace_irq_work_interrupt(struct pt_regs *regs)
{
- irq_work_entering_irq();
+ ipi_entering_ack_irq();
trace_irq_work_entry(IRQ_WORK_VECTOR);
__smp_irq_work_interrupt();
trace_irq_work_exit(IRQ_WORK_VECTOR);
diff --git a/arch/x86/kernel/irqinit.c b/arch/x86/kernel/irqinit.c
index cd10a6437264..a3a5e158ed69 100644
--- a/arch/x86/kernel/irqinit.c
+++ b/arch/x86/kernel/irqinit.c
@@ -86,7 +86,7 @@ void __init init_IRQ(void)
int i;
/*
- * On cpu 0, Assign IRQ0_VECTOR..IRQ15_VECTOR's to IRQ 0..15.
+ * On cpu 0, Assign ISA_IRQ_VECTOR(irq) to IRQ 0..15.
* If these IRQ's are handled by legacy interrupt-controllers like PIC,
* then this configuration will likely be static after the boot. If
* these IRQ's are handled by more mordern controllers like IO-APIC,
@@ -94,7 +94,7 @@ void __init init_IRQ(void)
* irq's migrate etc.
*/
for (i = 0; i < nr_legacy_irqs(); i++)
- per_cpu(vector_irq, 0)[IRQ0_VECTOR + i] = i;
+ per_cpu(vector_irq, 0)[ISA_IRQ_VECTOR(i)] = i;
x86_init.irqs.intr_init();
}
@@ -135,6 +135,10 @@ static void __init apic_intr_init(void)
alloc_intr_gate(THRESHOLD_APIC_VECTOR, threshold_interrupt);
#endif
+#ifdef CONFIG_X86_MCE_AMD
+ alloc_intr_gate(DEFERRED_ERROR_VECTOR, deferred_error_interrupt);
+#endif
+
#ifdef CONFIG_X86_LOCAL_APIC
/* self generated IPI for local APIC timer */
alloc_intr_gate(LOCAL_TIMER_VECTOR, apic_timer_interrupt);
@@ -144,6 +148,8 @@ static void __init apic_intr_init(void)
#ifdef CONFIG_HAVE_KVM
/* IPI for KVM to deliver posted interrupt */
alloc_intr_gate(POSTED_INTR_VECTOR, kvm_posted_intr_ipi);
+ /* IPI for KVM to deliver interrupt to wake up tasks */
+ alloc_intr_gate(POSTED_INTR_WAKEUP_VECTOR, kvm_posted_intr_wakeup_ipi);
#endif
/* IPI vectors for APIC spurious and error interrupts */
diff --git a/arch/x86/kernel/machine_kexec_64.c b/arch/x86/kernel/machine_kexec_64.c
index 415480d3ea84..11546b462fa6 100644
--- a/arch/x86/kernel/machine_kexec_64.c
+++ b/arch/x86/kernel/machine_kexec_64.c
@@ -17,6 +17,7 @@
#include <linux/ftrace.h>
#include <linux/io.h>
#include <linux/suspend.h>
+#include <linux/vmalloc.h>
#include <asm/init.h>
#include <asm/pgtable.h>
diff --git a/arch/x86/kernel/mpparse.c b/arch/x86/kernel/mpparse.c
index 2d2a237f2c73..30ca7607cbbb 100644
--- a/arch/x86/kernel/mpparse.c
+++ b/arch/x86/kernel/mpparse.c
@@ -19,8 +19,8 @@
#include <linux/module.h>
#include <linux/smp.h>
#include <linux/pci.h>
-#include <linux/irqdomain.h>
+#include <asm/irqdomain.h>
#include <asm/mtrr.h>
#include <asm/mpspec.h>
#include <asm/pgalloc.h>
@@ -113,11 +113,6 @@ static void __init MP_bus_info(struct mpc_bus *m)
pr_warn("Unknown bustype %s - ignoring\n", str);
}
-static struct irq_domain_ops mp_ioapic_irqdomain_ops = {
- .map = mp_irqdomain_map,
- .unmap = mp_irqdomain_unmap,
-};
-
static void __init MP_ioapic_info(struct mpc_ioapic *m)
{
struct ioapic_domain_cfg cfg = {
diff --git a/arch/x86/kernel/paravirt.c b/arch/x86/kernel/paravirt.c
index c614dd492f5f..58bcfb67c01f 100644
--- a/arch/x86/kernel/paravirt.c
+++ b/arch/x86/kernel/paravirt.c
@@ -154,7 +154,9 @@ unsigned paravirt_patch_default(u8 type, u16 clobbers, void *insnbuf,
ret = paravirt_patch_ident_64(insnbuf, len);
else if (type == PARAVIRT_PATCH(pv_cpu_ops.iret) ||
+#ifdef CONFIG_X86_32
type == PARAVIRT_PATCH(pv_cpu_ops.irq_enable_sysexit) ||
+#endif
type == PARAVIRT_PATCH(pv_cpu_ops.usergs_sysret32) ||
type == PARAVIRT_PATCH(pv_cpu_ops.usergs_sysret64))
/* If operation requires a jmp, then jmp */
@@ -371,7 +373,7 @@ __visible struct pv_cpu_ops pv_cpu_ops = {
.load_sp0 = native_load_sp0,
-#if defined(CONFIG_X86_32) || defined(CONFIG_IA32_EMULATION)
+#if defined(CONFIG_X86_32)
.irq_enable_sysexit = native_irq_enable_sysexit,
#endif
#ifdef CONFIG_X86_64
diff --git a/arch/x86/kernel/paravirt_patch_64.c b/arch/x86/kernel/paravirt_patch_64.c
index a1fa86782186..8aa05583bc42 100644
--- a/arch/x86/kernel/paravirt_patch_64.c
+++ b/arch/x86/kernel/paravirt_patch_64.c
@@ -55,7 +55,6 @@ unsigned native_patch(u8 type, u16 clobbers, void *ibuf,
PATCH_SITE(pv_irq_ops, save_fl);
PATCH_SITE(pv_irq_ops, irq_enable);
PATCH_SITE(pv_irq_ops, irq_disable);
- PATCH_SITE(pv_cpu_ops, irq_enable_sysexit);
PATCH_SITE(pv_cpu_ops, usergs_sysret32);
PATCH_SITE(pv_cpu_ops, usergs_sysret64);
PATCH_SITE(pv_cpu_ops, swapgs);
diff --git a/arch/x86/kernel/process_32.c b/arch/x86/kernel/process_32.c
index deff651835b4..c09c99ccf3e3 100644
--- a/arch/x86/kernel/process_32.c
+++ b/arch/x86/kernel/process_32.c
@@ -303,13 +303,10 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p)
arch_end_context_switch(next_p);
/*
- * Reload esp0, kernel_stack, and current_top_of_stack. This changes
+ * Reload esp0 and cpu_current_top_of_stack. This changes
* current_thread_info().
*/
load_sp0(tss, next);
- this_cpu_write(kernel_stack,
- (unsigned long)task_stack_page(next_p) +
- THREAD_SIZE);
this_cpu_write(cpu_current_top_of_stack,
(unsigned long)task_stack_page(next_p) +
THREAD_SIZE);
diff --git a/arch/x86/kernel/process_64.c b/arch/x86/kernel/process_64.c
index c50e013b57d2..843f92e4c711 100644
--- a/arch/x86/kernel/process_64.c
+++ b/arch/x86/kernel/process_64.c
@@ -410,9 +410,6 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p)
/* Reload esp0 and ss1. This changes current_thread_info(). */
load_sp0(tss, next);
- this_cpu_write(kernel_stack,
- (unsigned long)task_stack_page(next_p) + THREAD_SIZE);
-
/*
* Now maybe reload the debug registers and handle I/O bitmaps
*/
diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c
index cba828892790..265a6fdea8b7 100644
--- a/arch/x86/kernel/setup.c
+++ b/arch/x86/kernel/setup.c
@@ -1224,8 +1224,7 @@ void __init setup_arch(char **cmdline_p)
init_cpu_to_node();
init_apic_mappings();
- if (x86_io_apic_ops.init)
- x86_io_apic_ops.init();
+ io_apic_init_mappings();
kvm_guest_init();
diff --git a/arch/x86/kernel/smp.c b/arch/x86/kernel/smp.c
index be8e1bde07aa..15aaa69bbb5e 100644
--- a/arch/x86/kernel/smp.c
+++ b/arch/x86/kernel/smp.c
@@ -170,8 +170,7 @@ static int smp_stop_nmi_callback(unsigned int val, struct pt_regs *regs)
asmlinkage __visible void smp_reboot_interrupt(void)
{
- ack_APIC_irq();
- irq_enter();
+ ipi_entering_ack_irq();
stop_this_cpu(NULL);
irq_exit();
}
@@ -265,12 +264,6 @@ __visible void smp_reschedule_interrupt(struct pt_regs *regs)
*/
}
-static inline void smp_entering_irq(void)
-{
- ack_APIC_irq();
- irq_enter();
-}
-
__visible void smp_trace_reschedule_interrupt(struct pt_regs *regs)
{
/*
@@ -279,7 +272,7 @@ __visible void smp_trace_reschedule_interrupt(struct pt_regs *regs)
* scheduler_ipi(). This is OK, since those functions are allowed
* to nest.
*/
- smp_entering_irq();
+ ipi_entering_ack_irq();
trace_reschedule_entry(RESCHEDULE_VECTOR);
__smp_reschedule_interrupt();
trace_reschedule_exit(RESCHEDULE_VECTOR);
@@ -297,14 +290,14 @@ static inline void __smp_call_function_interrupt(void)
__visible void smp_call_function_interrupt(struct pt_regs *regs)
{
- smp_entering_irq();
+ ipi_entering_ack_irq();
__smp_call_function_interrupt();
exiting_irq();
}
__visible void smp_trace_call_function_interrupt(struct pt_regs *regs)
{
- smp_entering_irq();
+ ipi_entering_ack_irq();
trace_call_function_entry(CALL_FUNCTION_VECTOR);
__smp_call_function_interrupt();
trace_call_function_exit(CALL_FUNCTION_VECTOR);
@@ -319,14 +312,14 @@ static inline void __smp_call_function_single_interrupt(void)
__visible void smp_call_function_single_interrupt(struct pt_regs *regs)
{
- smp_entering_irq();
+ ipi_entering_ack_irq();
__smp_call_function_single_interrupt();
exiting_irq();
}
__visible void smp_trace_call_function_single_interrupt(struct pt_regs *regs)
{
- smp_entering_irq();
+ ipi_entering_ack_irq();
trace_call_function_single_entry(CALL_FUNCTION_SINGLE_VECTOR);
__smp_call_function_single_interrupt();
trace_call_function_single_exit(CALL_FUNCTION_SINGLE_VECTOR);
diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c
index 6d4bfea25874..8add66b22f33 100644
--- a/arch/x86/kernel/smpboot.c
+++ b/arch/x86/kernel/smpboot.c
@@ -515,6 +515,40 @@ void __inquire_remote_apic(int apicid)
}
/*
+ * The Multiprocessor Specification 1.4 (1997) example code suggests
+ * that there should be a 10ms delay between the BSP asserting INIT
+ * and de-asserting INIT, when starting a remote processor.
+ * But that slows boot and resume on modern processors, which include
+ * many cores and don't require that delay.
+ *
+ * Cmdline "init_cpu_udelay=" is available to over-ride this delay.
+ * Modern processor families are quirked to remove the delay entirely.
+ */
+#define UDELAY_10MS_DEFAULT 10000
+
+static unsigned int init_udelay = UDELAY_10MS_DEFAULT;
+
+static int __init cpu_init_udelay(char *str)
+{
+ get_option(&str, &init_udelay);
+
+ return 0;
+}
+early_param("cpu_init_udelay", cpu_init_udelay);
+
+static void __init smp_quirk_init_udelay(void)
+{
+ /* if cmdline changed it from default, leave it alone */
+ if (init_udelay != UDELAY_10MS_DEFAULT)
+ return;
+
+ /* if modern processor, use no delay */
+ if (((boot_cpu_data.x86_vendor == X86_VENDOR_INTEL) && (boot_cpu_data.x86 == 6)) ||
+ ((boot_cpu_data.x86_vendor == X86_VENDOR_AMD) && (boot_cpu_data.x86 >= 0xF)))
+ init_udelay = 0;
+}
+
+/*
* Poke the other CPU in the eye via NMI to wake it up. Remember that the normal
* INIT, INIT, STARTUP sequence will reset the chip hard for us, and this
* won't ... remember to clear down the APIC, etc later.
@@ -556,7 +590,7 @@ wakeup_secondary_cpu_via_nmi(int apicid, unsigned long start_eip)
static int
wakeup_secondary_cpu_via_init(int phys_apicid, unsigned long start_eip)
{
- unsigned long send_status, accept_status = 0;
+ unsigned long send_status = 0, accept_status = 0;
int maxlvt, num_starts, j;
maxlvt = lapic_get_maxlvt();
@@ -584,7 +618,7 @@ wakeup_secondary_cpu_via_init(int phys_apicid, unsigned long start_eip)
pr_debug("Waiting for send to finish...\n");
send_status = safe_apic_wait_icr_idle();
- mdelay(10);
+ udelay(init_udelay);
pr_debug("Deasserting INIT\n");
@@ -652,6 +686,7 @@ wakeup_secondary_cpu_via_init(int phys_apicid, unsigned long start_eip)
* Give the other CPU some time to accept the IPI.
*/
udelay(200);
+
if (maxlvt > 3) /* Due to the Pentium erratum 3AP. */
apic_write(APIC_ESR, 0);
accept_status = (apic_read(APIC_ESR) & 0xEF);
@@ -793,8 +828,6 @@ void common_cpu_up(unsigned int cpu, struct task_struct *idle)
clear_tsk_thread_flag(idle, TIF_FORK);
initial_gs = per_cpu_offset(cpu);
#endif
- per_cpu(kernel_stack, cpu) =
- (unsigned long)task_stack_page(idle) + THREAD_SIZE;
}
/*
@@ -1177,6 +1210,8 @@ void __init native_smp_prepare_cpus(unsigned int max_cpus)
uv_system_init();
set_mtrr_aps_delayed_init();
+
+ smp_quirk_init_udelay();
}
void arch_enable_nonboot_cpus_begin(void)
diff --git a/arch/x86/kernel/syscall_32.c b/arch/x86/kernel/syscall_32.c
deleted file mode 100644
index 3777189c4a19..000000000000
--- a/arch/x86/kernel/syscall_32.c
+++ /dev/null
@@ -1,33 +0,0 @@
-/* System call table for i386. */
-
-#include <linux/linkage.h>
-#include <linux/sys.h>
-#include <linux/cache.h>
-#include <asm/asm-offsets.h>
-
-#ifdef CONFIG_IA32_EMULATION
-#define SYM(sym, compat) compat
-#else
-#define SYM(sym, compat) sym
-#define ia32_sys_call_table sys_call_table
-#define __NR_ia32_syscall_max __NR_syscall_max
-#endif
-
-#define __SYSCALL_I386(nr, sym, compat) extern asmlinkage void SYM(sym, compat)(void) ;
-#include <asm/syscalls_32.h>
-#undef __SYSCALL_I386
-
-#define __SYSCALL_I386(nr, sym, compat) [nr] = SYM(sym, compat),
-
-typedef asmlinkage void (*sys_call_ptr_t)(void);
-
-extern asmlinkage void sys_ni_syscall(void);
-
-__visible const sys_call_ptr_t ia32_sys_call_table[__NR_ia32_syscall_max+1] = {
- /*
- * Smells like a compiler bug -- it doesn't work
- * when the & below is removed.
- */
- [0 ... __NR_ia32_syscall_max] = &sys_ni_syscall,
-#include <asm/syscalls_32.h>
-};
diff --git a/arch/x86/kernel/syscall_64.c b/arch/x86/kernel/syscall_64.c
deleted file mode 100644
index 4ac730b37f0b..000000000000
--- a/arch/x86/kernel/syscall_64.c
+++ /dev/null
@@ -1,32 +0,0 @@
-/* System call table for x86-64. */
-
-#include <linux/linkage.h>
-#include <linux/sys.h>
-#include <linux/cache.h>
-#include <asm/asm-offsets.h>
-#include <asm/syscall.h>
-
-#define __SYSCALL_COMMON(nr, sym, compat) __SYSCALL_64(nr, sym, compat)
-
-#ifdef CONFIG_X86_X32_ABI
-# define __SYSCALL_X32(nr, sym, compat) __SYSCALL_64(nr, sym, compat)
-#else
-# define __SYSCALL_X32(nr, sym, compat) /* nothing */
-#endif
-
-#define __SYSCALL_64(nr, sym, compat) extern asmlinkage void sym(void) ;
-#include <asm/syscalls_64.h>
-#undef __SYSCALL_64
-
-#define __SYSCALL_64(nr, sym, compat) [nr] = sym,
-
-extern void sys_ni_syscall(void);
-
-asmlinkage const sys_call_ptr_t sys_call_table[__NR_syscall_max+1] = {
- /*
- * Smells like a compiler bug -- it doesn't work
- * when the & below is removed.
- */
- [0 ... __NR_syscall_max] = &sys_ni_syscall,
-#include <asm/syscalls_64.h>
-};
diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c
index 36cb15b7b367..f5791927aa64 100644
--- a/arch/x86/kernel/traps.c
+++ b/arch/x86/kernel/traps.c
@@ -73,8 +73,7 @@ gate_desc debug_idt_table[NR_VECTORS] __page_aligned_bss;
#else
#include <asm/processor-flags.h>
#include <asm/setup.h>
-
-asmlinkage int system_call(void);
+#include <asm/proto.h>
#endif
/* Must be page-aligned because the real IDT is used in a fixmap. */
@@ -769,18 +768,6 @@ dotraplinkage void
do_spurious_interrupt_bug(struct pt_regs *regs, long error_code)
{
conditional_sti(regs);
-#if 0
- /* No need to warn about this any longer. */
- pr_info("Ignoring P6 Local APIC Spurious Interrupt Bug...\n");
-#endif
-}
-
-asmlinkage __visible void __attribute__((weak)) smp_thermal_interrupt(void)
-{
-}
-
-asmlinkage __visible void __attribute__((weak)) smp_threshold_interrupt(void)
-{
}
dotraplinkage void
@@ -906,13 +893,13 @@ void __init trap_init(void)
set_bit(i, used_vectors);
#ifdef CONFIG_IA32_EMULATION
- set_system_intr_gate(IA32_SYSCALL_VECTOR, ia32_syscall);
+ set_system_intr_gate(IA32_SYSCALL_VECTOR, entry_INT80_compat);
set_bit(IA32_SYSCALL_VECTOR, used_vectors);
#endif
#ifdef CONFIG_X86_32
- set_system_trap_gate(SYSCALL_VECTOR, &system_call);
- set_bit(SYSCALL_VECTOR, used_vectors);
+ set_system_trap_gate(IA32_SYSCALL_VECTOR, entry_INT80_32);
+ set_bit(IA32_SYSCALL_VECTOR, used_vectors);
#endif
/*
diff --git a/arch/x86/kernel/vsyscall_64.c b/arch/x86/kernel/vsyscall_64.c
deleted file mode 100644
index 2dcc6ff6fdcc..000000000000
--- a/arch/x86/kernel/vsyscall_64.c
+++ /dev/null
@@ -1,335 +0,0 @@
-/*
- * Copyright (c) 2012-2014 Andy Lutomirski <luto@amacapital.net>
- *
- * Based on the original implementation which is:
- * Copyright (C) 2001 Andrea Arcangeli <andrea@suse.de> SuSE
- * Copyright 2003 Andi Kleen, SuSE Labs.
- *
- * Parts of the original code have been moved to arch/x86/vdso/vma.c
- *
- * This file implements vsyscall emulation. vsyscalls are a legacy ABI:
- * Userspace can request certain kernel services by calling fixed
- * addresses. This concept is problematic:
- *
- * - It interferes with ASLR.
- * - It's awkward to write code that lives in kernel addresses but is
- * callable by userspace at fixed addresses.
- * - The whole concept is impossible for 32-bit compat userspace.
- * - UML cannot easily virtualize a vsyscall.
- *
- * As of mid-2014, I believe that there is no new userspace code that
- * will use a vsyscall if the vDSO is present. I hope that there will
- * soon be no new userspace code that will ever use a vsyscall.
- *
- * The code in this file emulates vsyscalls when notified of a page
- * fault to a vsyscall address.
- */
-
-#include <linux/kernel.h>
-#include <linux/timer.h>
-#include <linux/syscalls.h>
-#include <linux/ratelimit.h>
-
-#include <asm/vsyscall.h>
-#include <asm/unistd.h>
-#include <asm/fixmap.h>
-#include <asm/traps.h>
-
-#define CREATE_TRACE_POINTS
-#include "vsyscall_trace.h"
-
-static enum { EMULATE, NATIVE, NONE } vsyscall_mode = EMULATE;
-
-static int __init vsyscall_setup(char *str)
-{
- if (str) {
- if (!strcmp("emulate", str))
- vsyscall_mode = EMULATE;
- else if (!strcmp("native", str))
- vsyscall_mode = NATIVE;
- else if (!strcmp("none", str))
- vsyscall_mode = NONE;
- else
- return -EINVAL;
-
- return 0;
- }
-
- return -EINVAL;
-}
-early_param("vsyscall", vsyscall_setup);
-
-static void warn_bad_vsyscall(const char *level, struct pt_regs *regs,
- const char *message)
-{
- if (!show_unhandled_signals)
- return;
-
- printk_ratelimited("%s%s[%d] %s ip:%lx cs:%lx sp:%lx ax:%lx si:%lx di:%lx\n",
- level, current->comm, task_pid_nr(current),
- message, regs->ip, regs->cs,
- regs->sp, regs->ax, regs->si, regs->di);
-}
-
-static int addr_to_vsyscall_nr(unsigned long addr)
-{
- int nr;
-
- if ((addr & ~0xC00UL) != VSYSCALL_ADDR)
- return -EINVAL;
-
- nr = (addr & 0xC00UL) >> 10;
- if (nr >= 3)
- return -EINVAL;
-
- return nr;
-}
-
-static bool write_ok_or_segv(unsigned long ptr, size_t size)
-{
- /*
- * XXX: if access_ok, get_user, and put_user handled
- * sig_on_uaccess_error, this could go away.
- */
-
- if (!access_ok(VERIFY_WRITE, (void __user *)ptr, size)) {
- siginfo_t info;
- struct thread_struct *thread = &current->thread;
-
- thread->error_code = 6; /* user fault, no page, write */
- thread->cr2 = ptr;
- thread->trap_nr = X86_TRAP_PF;
-
- memset(&info, 0, sizeof(info));
- info.si_signo = SIGSEGV;
- info.si_errno = 0;
- info.si_code = SEGV_MAPERR;
- info.si_addr = (void __user *)ptr;
-
- force_sig_info(SIGSEGV, &info, current);
- return false;
- } else {
- return true;
- }
-}
-
-bool emulate_vsyscall(struct pt_regs *regs, unsigned long address)
-{
- struct task_struct *tsk;
- unsigned long caller;
- int vsyscall_nr, syscall_nr, tmp;
- int prev_sig_on_uaccess_error;
- long ret;
-
- /*
- * No point in checking CS -- the only way to get here is a user mode
- * trap to a high address, which means that we're in 64-bit user code.
- */
-
- WARN_ON_ONCE(address != regs->ip);
-
- if (vsyscall_mode == NONE) {
- warn_bad_vsyscall(KERN_INFO, regs,
- "vsyscall attempted with vsyscall=none");
- return false;
- }
-
- vsyscall_nr = addr_to_vsyscall_nr(address);
-
- trace_emulate_vsyscall(vsyscall_nr);
-
- if (vsyscall_nr < 0) {
- warn_bad_vsyscall(KERN_WARNING, regs,
- "misaligned vsyscall (exploit attempt or buggy program) -- look up the vsyscall kernel parameter if you need a workaround");
- goto sigsegv;
- }
-
- if (get_user(caller, (unsigned long __user *)regs->sp) != 0) {
- warn_bad_vsyscall(KERN_WARNING, regs,
- "vsyscall with bad stack (exploit attempt?)");
- goto sigsegv;
- }
-
- tsk = current;
-
- /*
- * Check for access_ok violations and find the syscall nr.
- *
- * NULL is a valid user pointer (in the access_ok sense) on 32-bit and
- * 64-bit, so we don't need to special-case it here. For all the
- * vsyscalls, NULL means "don't write anything" not "write it at
- * address 0".
- */
- switch (vsyscall_nr) {
- case 0:
- if (!write_ok_or_segv(regs->di, sizeof(struct timeval)) ||
- !write_ok_or_segv(regs->si, sizeof(struct timezone))) {
- ret = -EFAULT;
- goto check_fault;
- }
-
- syscall_nr = __NR_gettimeofday;
- break;
-
- case 1:
- if (!write_ok_or_segv(regs->di, sizeof(time_t))) {
- ret = -EFAULT;
- goto check_fault;
- }
-
- syscall_nr = __NR_time;
- break;
-
- case 2:
- if (!write_ok_or_segv(regs->di, sizeof(unsigned)) ||
- !write_ok_or_segv(regs->si, sizeof(unsigned))) {
- ret = -EFAULT;
- goto check_fault;
- }
-
- syscall_nr = __NR_getcpu;
- break;
- }
-
- /*
- * Handle seccomp. regs->ip must be the original value.
- * See seccomp_send_sigsys and Documentation/prctl/seccomp_filter.txt.
- *
- * We could optimize the seccomp disabled case, but performance
- * here doesn't matter.
- */
- regs->orig_ax = syscall_nr;
- regs->ax = -ENOSYS;
- tmp = secure_computing();
- if ((!tmp && regs->orig_ax != syscall_nr) || regs->ip != address) {
- warn_bad_vsyscall(KERN_DEBUG, regs,
- "seccomp tried to change syscall nr or ip");
- do_exit(SIGSYS);
- }
- regs->orig_ax = -1;
- if (tmp)
- goto do_ret; /* skip requested */
-
- /*
- * With a real vsyscall, page faults cause SIGSEGV. We want to
- * preserve that behavior to make writing exploits harder.
- */
- prev_sig_on_uaccess_error = current_thread_info()->sig_on_uaccess_error;
- current_thread_info()->sig_on_uaccess_error = 1;
-
- ret = -EFAULT;
- switch (vsyscall_nr) {
- case 0:
- ret = sys_gettimeofday(
- (struct timeval __user *)regs->di,
- (struct timezone __user *)regs->si);
- break;
-
- case 1:
- ret = sys_time((time_t __user *)regs->di);
- break;
-
- case 2:
- ret = sys_getcpu((unsigned __user *)regs->di,
- (unsigned __user *)regs->si,
- NULL);
- break;
- }
-
- current_thread_info()->sig_on_uaccess_error = prev_sig_on_uaccess_error;
-
-check_fault:
- if (ret == -EFAULT) {
- /* Bad news -- userspace fed a bad pointer to a vsyscall. */
- warn_bad_vsyscall(KERN_INFO, regs,
- "vsyscall fault (exploit attempt?)");
-
- /*
- * If we failed to generate a signal for any reason,
- * generate one here. (This should be impossible.)
- */
- if (WARN_ON_ONCE(!sigismember(&tsk->pending.signal, SIGBUS) &&
- !sigismember(&tsk->pending.signal, SIGSEGV)))
- goto sigsegv;
-
- return true; /* Don't emulate the ret. */
- }
-
- regs->ax = ret;
-
-do_ret:
- /* Emulate a ret instruction. */
- regs->ip = caller;
- regs->sp += 8;
- return true;
-
-sigsegv:
- force_sig(SIGSEGV, current);
- return true;
-}
-
-/*
- * A pseudo VMA to allow ptrace access for the vsyscall page. This only
- * covers the 64bit vsyscall page now. 32bit has a real VMA now and does
- * not need special handling anymore:
- */
-static const char *gate_vma_name(struct vm_area_struct *vma)
-{
- return "[vsyscall]";
-}
-static struct vm_operations_struct gate_vma_ops = {
- .name = gate_vma_name,
-};
-static struct vm_area_struct gate_vma = {
- .vm_start = VSYSCALL_ADDR,
- .vm_end = VSYSCALL_ADDR + PAGE_SIZE,
- .vm_page_prot = PAGE_READONLY_EXEC,
- .vm_flags = VM_READ | VM_EXEC,
- .vm_ops = &gate_vma_ops,
-};
-
-struct vm_area_struct *get_gate_vma(struct mm_struct *mm)
-{
-#ifdef CONFIG_IA32_EMULATION
- if (!mm || mm->context.ia32_compat)
- return NULL;
-#endif
- if (vsyscall_mode == NONE)
- return NULL;
- return &gate_vma;
-}
-
-int in_gate_area(struct mm_struct *mm, unsigned long addr)
-{
- struct vm_area_struct *vma = get_gate_vma(mm);
-
- if (!vma)
- return 0;
-
- return (addr >= vma->vm_start) && (addr < vma->vm_end);
-}
-
-/*
- * Use this when you have no reliable mm, typically from interrupt
- * context. It is less reliable than using a task's mm and may give
- * false positives.
- */
-int in_gate_area_no_mm(unsigned long addr)
-{
- return vsyscall_mode != NONE && (addr & PAGE_MASK) == VSYSCALL_ADDR;
-}
-
-void __init map_vsyscall(void)
-{
- extern char __vsyscall_page;
- unsigned long physaddr_vsyscall = __pa_symbol(&__vsyscall_page);
-
- if (vsyscall_mode != NONE)
- __set_fixmap(VSYSCALL_PAGE, physaddr_vsyscall,
- vsyscall_mode == NATIVE
- ? PAGE_KERNEL_VSYSCALL
- : PAGE_KERNEL_VVAR);
-
- BUILD_BUG_ON((unsigned long)__fix_to_virt(VSYSCALL_PAGE) !=
- (unsigned long)VSYSCALL_ADDR);
-}
diff --git a/arch/x86/kernel/vsyscall_emu_64.S b/arch/x86/kernel/vsyscall_emu_64.S
deleted file mode 100644
index c9596a9af159..000000000000
--- a/arch/x86/kernel/vsyscall_emu_64.S
+++ /dev/null
@@ -1,37 +0,0 @@
-/*
- * vsyscall_emu_64.S: Vsyscall emulation page
- *
- * Copyright (c) 2011 Andy Lutomirski
- *
- * Subject to the GNU General Public License, version 2
- */
-
-#include <linux/linkage.h>
-
-#include <asm/irq_vectors.h>
-#include <asm/page_types.h>
-#include <asm/unistd_64.h>
-
-__PAGE_ALIGNED_DATA
- .globl __vsyscall_page
- .balign PAGE_SIZE, 0xcc
- .type __vsyscall_page, @object
-__vsyscall_page:
-
- mov $__NR_gettimeofday, %rax
- syscall
- ret
-
- .balign 1024, 0xcc
- mov $__NR_time, %rax
- syscall
- ret
-
- .balign 1024, 0xcc
- mov $__NR_getcpu, %rax
- syscall
- ret
-
- .balign 4096, 0xcc
-
- .size __vsyscall_page, 4096
diff --git a/arch/x86/kernel/vsyscall_gtod.c b/arch/x86/kernel/vsyscall_gtod.c
deleted file mode 100644
index 51e330416995..000000000000
--- a/arch/x86/kernel/vsyscall_gtod.c
+++ /dev/null
@@ -1,70 +0,0 @@
-/*
- * Copyright (C) 2001 Andrea Arcangeli <andrea@suse.de> SuSE
- * Copyright 2003 Andi Kleen, SuSE Labs.
- *
- * Modified for x86 32 bit architecture by
- * Stefani Seibold <stefani@seibold.net>
- * sponsored by Rohde & Schwarz GmbH & Co. KG Munich/Germany
- *
- * Thanks to hpa@transmeta.com for some useful hint.
- * Special thanks to Ingo Molnar for his early experience with
- * a different vsyscall implementation for Linux/IA32 and for the name.
- *
- */
-
-#include <linux/timekeeper_internal.h>
-#include <asm/vgtod.h>
-#include <asm/vvar.h>
-
-DEFINE_VVAR(struct vsyscall_gtod_data, vsyscall_gtod_data);
-
-void update_vsyscall_tz(void)
-{
- vsyscall_gtod_data.tz_minuteswest = sys_tz.tz_minuteswest;
- vsyscall_gtod_data.tz_dsttime = sys_tz.tz_dsttime;
-}
-
-void update_vsyscall(struct timekeeper *tk)
-{
- struct vsyscall_gtod_data *vdata = &vsyscall_gtod_data;
-
- gtod_write_begin(vdata);
-
- /* copy vsyscall data */
- vdata->vclock_mode = tk->tkr_mono.clock->archdata.vclock_mode;
- vdata->cycle_last = tk->tkr_mono.cycle_last;
- vdata->mask = tk->tkr_mono.mask;
- vdata->mult = tk->tkr_mono.mult;
- vdata->shift = tk->tkr_mono.shift;
-
- vdata->wall_time_sec = tk->xtime_sec;
- vdata->wall_time_snsec = tk->tkr_mono.xtime_nsec;
-
- vdata->monotonic_time_sec = tk->xtime_sec
- + tk->wall_to_monotonic.tv_sec;
- vdata->monotonic_time_snsec = tk->tkr_mono.xtime_nsec
- + ((u64)tk->wall_to_monotonic.tv_nsec
- << tk->tkr_mono.shift);
- while (vdata->monotonic_time_snsec >=
- (((u64)NSEC_PER_SEC) << tk->tkr_mono.shift)) {
- vdata->monotonic_time_snsec -=
- ((u64)NSEC_PER_SEC) << tk->tkr_mono.shift;
- vdata->monotonic_time_sec++;
- }
-
- vdata->wall_time_coarse_sec = tk->xtime_sec;
- vdata->wall_time_coarse_nsec = (long)(tk->tkr_mono.xtime_nsec >>
- tk->tkr_mono.shift);
-
- vdata->monotonic_time_coarse_sec =
- vdata->wall_time_coarse_sec + tk->wall_to_monotonic.tv_sec;
- vdata->monotonic_time_coarse_nsec =
- vdata->wall_time_coarse_nsec + tk->wall_to_monotonic.tv_nsec;
-
- while (vdata->monotonic_time_coarse_nsec >= NSEC_PER_SEC) {
- vdata->monotonic_time_coarse_nsec -= NSEC_PER_SEC;
- vdata->monotonic_time_coarse_sec++;
- }
-
- gtod_write_end(vdata);
-}
diff --git a/arch/x86/kernel/vsyscall_trace.h b/arch/x86/kernel/vsyscall_trace.h
deleted file mode 100644
index a8b2edec54fe..000000000000
--- a/arch/x86/kernel/vsyscall_trace.h
+++ /dev/null
@@ -1,29 +0,0 @@
-#undef TRACE_SYSTEM
-#define TRACE_SYSTEM vsyscall
-
-#if !defined(__VSYSCALL_TRACE_H) || defined(TRACE_HEADER_MULTI_READ)
-#define __VSYSCALL_TRACE_H
-
-#include <linux/tracepoint.h>
-
-TRACE_EVENT(emulate_vsyscall,
-
- TP_PROTO(int nr),
-
- TP_ARGS(nr),
-
- TP_STRUCT__entry(__field(int, nr)),
-
- TP_fast_assign(
- __entry->nr = nr;
- ),
-
- TP_printk("nr = %d", __entry->nr)
-);
-
-#endif
-
-#undef TRACE_INCLUDE_PATH
-#define TRACE_INCLUDE_PATH ../../arch/x86/kernel
-#define TRACE_INCLUDE_FILE vsyscall_trace
-#include <trace/define_trace.h>
diff --git a/arch/x86/kernel/x86_init.c b/arch/x86/kernel/x86_init.c
index 234b0722de53..3cee10abf01d 100644
--- a/arch/x86/kernel/x86_init.c
+++ b/arch/x86/kernel/x86_init.c
@@ -111,11 +111,9 @@ EXPORT_SYMBOL_GPL(x86_platform);
#if defined(CONFIG_PCI_MSI)
struct x86_msi_ops x86_msi = {
.setup_msi_irqs = native_setup_msi_irqs,
- .compose_msi_msg = native_compose_msi_msg,
.teardown_msi_irq = native_teardown_msi_irq,
.teardown_msi_irqs = default_teardown_msi_irqs,
.restore_msi_irqs = default_restore_msi_irqs,
- .setup_hpet_msi = default_setup_hpet_msi,
};
/* MSI arch specific hooks */
@@ -141,13 +139,6 @@ void arch_restore_msi_irqs(struct pci_dev *dev)
#endif
struct x86_io_apic_ops x86_io_apic_ops = {
- .init = native_io_apic_init_mappings,
.read = native_io_apic_read,
- .write = native_io_apic_write,
- .modify = native_io_apic_modify,
.disable = native_disable_io_apic,
- .print_entries = native_io_apic_print_entries,
- .set_affinity = native_ioapic_set_affinity,
- .setup_entry = native_setup_ioapic_entry,
- .eoi_ioapic_pin = native_eoi_ioapic_pin,
};