From 0f4fdb7fba0b3ec66df39d0e743e701404161fb8 Mon Sep 17 00:00:00 2001 From: ravikiran thirumalai Date: Mon, 26 Jun 2006 13:56:04 +0200 Subject: [PATCH] x86_64: Use local APIC ID from local APIC instead of CPUID vSMPowered systems use apic_cluster too. Forcing apic_physflat works on these systems too, but only if we change phys_pkg_id to use hard_smp_processor_id() instead of cpuid_ebx. I am guessing other multi-chassis cluster systems would need this too. Signed-off-by: ravikiran thirumalai Signed-off-by: Andi Kleen Signed-off-by: Linus Torvalds --- arch/x86_64/kernel/genapic_flat.c | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) (limited to 'arch/x86_64/kernel') diff --git a/arch/x86_64/kernel/genapic_flat.c b/arch/x86_64/kernel/genapic_flat.c index 1a2ab825be98..c66ca7b1d31a 100644 --- a/arch/x86_64/kernel/genapic_flat.c +++ b/arch/x86_64/kernel/genapic_flat.c @@ -108,10 +108,7 @@ static unsigned int flat_cpu_mask_to_apicid(cpumask_t cpumask) static unsigned int phys_pkg_id(int index_msb) { - u32 ebx; - - ebx = cpuid_ebx(1); - return ((ebx >> 24) & 0xFF) >> index_msb; + return hard_smp_processor_id() >> index_msb; } struct genapic apic_flat = { -- cgit v1.2.3 From faee9a5dc9d8399cc3b1b8e18b6d7ff7b17f1af1 Mon Sep 17 00:00:00 2001 From: Andi Kleen Date: Mon, 26 Jun 2006 13:56:10 +0200 Subject: [PATCH] i386/x86-64: Use new official CPUID to get APICID/core split on AMD platforms Previously the apicid<->coreid split was computed based on the max number of cores. Now use a new CPUID field that AMD defined for this purpose. On most systems right now it should be 0 and the old method will be used. Signed-off-by: Andi Kleen Signed-off-by: Linus Torvalds --- arch/x86_64/kernel/setup.c | 20 +++++++++++++------- 1 file changed, 13 insertions(+), 7 deletions(-) (limited to 'arch/x86_64/kernel') diff --git a/arch/x86_64/kernel/setup.c b/arch/x86_64/kernel/setup.c index fb850b52b4da..1cb3e21c571a 100644 --- a/arch/x86_64/kernel/setup.c +++ b/arch/x86_64/kernel/setup.c @@ -873,10 +873,18 @@ static void __init amd_detect_cmp(struct cpuinfo_x86 *c) int node = 0; unsigned apicid = hard_smp_processor_id(); #endif + unsigned ecx = cpuid_ecx(0x80000008); - bits = 0; - while ((1 << bits) < c->x86_max_cores) - bits++; + c->x86_max_cores = (ecx & 0xff) + 1; + + /* CPU telling us the core id bits shift? */ + bits = (ecx >> 12) & 0xF; + + /* Otherwise recompute */ + if (bits == 0) { + while ((1 << bits) < c->x86_max_cores) + bits++; + } /* Low order bits define the core id (index of core in socket) */ cpu_core_id[cpu] = phys_proc_id[cpu] & ((1 << bits)-1); @@ -964,11 +972,9 @@ static int __init init_amd(struct cpuinfo_x86 *c) if (c->x86_power & (1<<8)) set_bit(X86_FEATURE_CONSTANT_TSC, &c->x86_capability); - if (c->extended_cpuid_level >= 0x80000008) { - c->x86_max_cores = (cpuid_ecx(0x80000008) & 0xff) + 1; - + /* Multi core CPU? */ + if (c->extended_cpuid_level >= 0x80000008) amd_detect_cmp(c); - } return r; } -- cgit v1.2.3 From 240cd6a80642da528bfa382ec2ae4e3cb8991ea7 Mon Sep 17 00:00:00 2001 From: Andi Kleen Date: Mon, 26 Jun 2006 13:56:13 +0200 Subject: [PATCH] i386/x86-64: Emulate CPUID4 on AMD Intel systems report the cache level data from CPUID 4 in sysfs. Add a CPUID 4 emulation for AMD CPUs to report the same information for them. This allows programs to read this information in a uniform way. The AMD way to report this is less flexible so some assumptions are hardcoded (e.g.
no L3). Signed-off-by: Andi Kleen Signed-off-by: Linus Torvalds --- arch/x86_64/kernel/setup.c | 3 +++ 1 file changed, 3 insertions(+) (limited to 'arch/x86_64/kernel') diff --git a/arch/x86_64/kernel/setup.c b/arch/x86_64/kernel/setup.c index 1cb3e21c571a..4b7e02216970 100644 --- a/arch/x86_64/kernel/setup.c +++ b/arch/x86_64/kernel/setup.c @@ -976,6 +976,9 @@ static int __init init_amd(struct cpuinfo_x86 *c) if (c->extended_cpuid_level >= 0x80000008) amd_detect_cmp(c); + /* Fix cpuid4 emulation for more */ + num_cache_leaves = 3; + return r; } -- cgit v1.2.3 From d167a51877e94dda73dd656c51f363502309f713 Mon Sep 17 00:00:00 2001 From: Gerd Hoffmann Date: Mon, 26 Jun 2006 13:56:16 +0200 Subject: [PATCH] x86_64: x86_64 version of the smp alternative patch. Changes are largely identical to the i386 version: * alternative #defines are moved to the new alternative.h file. * one new elf section with pointers to the lock prefixes which can be nop'ed out for non-smp. * two new elf sections similar to the "classic" alternatives to replace SMP code with simpler UP code. * fixup headers to use alternative.h instead of defining their own LOCK / LOCK_PREFIX macros. The patch reuses the i386 version of the alternatives code to avoid code duplication. The code in alternatives.c was shuffled around a bit to reduce the number of #ifdefs needed. It also got some tweaks needed for x86_64 (vsyscall page handling) and new features (noreplacement option which was x86_64 only up to now). Debug printk's are changed from compile-time to runtime. Loosely based on an early version from Bastian Blank Signed-off-by: Gerd Hoffmann Signed-off-by: Andi Kleen Signed-off-by: Linus Torvalds --- arch/x86_64/kernel/Makefile | 4 ++- arch/x86_64/kernel/module.c | 38 +++++++++++++------- arch/x86_64/kernel/setup.c | 76 +--------------------------------------- arch/x86_64/kernel/smpboot.c | 4 +++ arch/x86_64/kernel/vmlinux.lds.S | 20 +++++++++++ 5 files changed, 53 insertions(+), 89 deletions(-) (limited to 'arch/x86_64/kernel') diff --git a/arch/x86_64/kernel/Makefile b/arch/x86_64/kernel/Makefile index 059c88313f4e..381bc6ad743e 100644 --- a/arch/x86_64/kernel/Makefile +++ b/arch/x86_64/kernel/Makefile @@ -8,7 +8,7 @@ obj-y := process.o signal.o entry.o traps.o irq.o \ ptrace.o time.o ioport.o ldt.o setup.o i8259.o sys_x86_64.o \ x8664_ksyms.o i387.o syscall.o vsyscall.o \ setup64.o bootflag.o e820.o reboot.o quirks.o i8237.o \ - pci-dma.o pci-nommu.o + pci-dma.o pci-nommu.o alternative.o obj-$(CONFIG_X86_MCE) += mce.o obj-$(CONFIG_X86_MCE_INTEL) += mce_intel.o @@ -49,3 +49,5 @@ intel_cacheinfo-y += ../../i386/kernel/cpu/intel_cacheinfo.o quirks-y += ../../i386/kernel/quirks.o i8237-y += ../../i386/kernel/i8237.o msr-$(subst m,y,$(CONFIG_X86_MSR)) += ../../i386/kernel/msr.o +alternative-y += ../../i386/kernel/alternative.o + diff --git a/arch/x86_64/kernel/module.c b/arch/x86_64/kernel/module.c index bac195c74bcc..9d0958ff547f 100644 --- a/arch/x86_64/kernel/module.c +++ b/arch/x86_64/kernel/module.c @@ -145,26 +145,38 @@ int apply_relocate(Elf_Shdr *sechdrs, return -ENOSYS; } -extern void apply_alternatives(void *start, void *end); - int module_finalize(const Elf_Ehdr *hdr, - const Elf_Shdr *sechdrs, - struct module *me) + const Elf_Shdr *sechdrs, + struct module *me) { - const Elf_Shdr *s; + const Elf_Shdr *s, *text = NULL, *alt = NULL, *locks = NULL; char *secstrings = (void *)hdr + sechdrs[hdr->e_shstrndx].sh_offset; - /* look for .altinstructions to patch */ - for (s = sechdrs; s < sechdrs + hdr->e_shnum; s++) { - void *seg;
- if (strcmp(".altinstructions", secstrings + s->sh_name)) - continue; - seg = (void *)s->sh_addr; - apply_alternatives(seg, seg + s->sh_size); - } + for (s = sechdrs; s < sechdrs + hdr->e_shnum; s++) { + if (!strcmp(".text", secstrings + s->sh_name)) + text = s; + if (!strcmp(".altinstructions", secstrings + s->sh_name)) + alt = s; + if (!strcmp(".smp_locks", secstrings + s->sh_name)) + locks= s; + } + + if (alt) { + /* patch .altinstructions */ + void *aseg = (void *)alt->sh_addr; + apply_alternatives(aseg, aseg + alt->sh_size); + } + if (locks && text) { + void *lseg = (void *)locks->sh_addr; + void *tseg = (void *)text->sh_addr; + alternatives_smp_module_add(me, me->name, + lseg, lseg + locks->sh_size, + tseg, tseg + text->sh_size); + } return 0; } void module_arch_cleanup(struct module *mod) { + alternatives_smp_module_del(mod); } diff --git a/arch/x86_64/kernel/setup.c b/arch/x86_64/kernel/setup.c index 4b7e02216970..64640c8f5eed 100644 --- a/arch/x86_64/kernel/setup.c +++ b/arch/x86_64/kernel/setup.c @@ -473,80 +473,6 @@ contig_initmem_init(unsigned long start_pfn, unsigned long end_pfn) } #endif -/* Use inline assembly to define this because the nops are defined - as inline assembly strings in the include files and we cannot - get them easily into strings. */ -asm("\t.data\nk8nops: " - K8_NOP1 K8_NOP2 K8_NOP3 K8_NOP4 K8_NOP5 K8_NOP6 - K8_NOP7 K8_NOP8); - -extern unsigned char k8nops[]; -static unsigned char *k8_nops[ASM_NOP_MAX+1] = { - NULL, - k8nops, - k8nops + 1, - k8nops + 1 + 2, - k8nops + 1 + 2 + 3, - k8nops + 1 + 2 + 3 + 4, - k8nops + 1 + 2 + 3 + 4 + 5, - k8nops + 1 + 2 + 3 + 4 + 5 + 6, - k8nops + 1 + 2 + 3 + 4 + 5 + 6 + 7, -}; - -extern char __vsyscall_0; - -/* Replace instructions with better alternatives for this CPU type. - - This runs before SMP is initialized to avoid SMP problems with - self modifying code. This implies that assymetric systems where - APs have less capabilities than the boot processor are not handled. - In this case boot with "noreplacement". */ -void apply_alternatives(void *start, void *end) -{ - struct alt_instr *a; - int diff, i, k; - for (a = start; (void *)a < end; a++) { - u8 *instr; - - if (!boot_cpu_has(a->cpuid)) - continue; - - BUG_ON(a->replacementlen > a->instrlen); - instr = a->instr; - /* vsyscall code is not mapped yet. resolve it manually. 
*/ - if (instr >= (u8 *)VSYSCALL_START && instr < (u8*)VSYSCALL_END) - instr = __va(instr - (u8*)VSYSCALL_START + (u8*)__pa_symbol(&__vsyscall_0)); - __inline_memcpy(instr, a->replacement, a->replacementlen); - diff = a->instrlen - a->replacementlen; - - /* Pad the rest with nops */ - for (i = a->replacementlen; diff > 0; diff -= k, i += k) { - k = diff; - if (k > ASM_NOP_MAX) - k = ASM_NOP_MAX; - __inline_memcpy(instr + i, k8_nops[k], k); - } - } -} - -static int no_replacement __initdata = 0; - -void __init alternative_instructions(void) -{ - extern struct alt_instr __alt_instructions[], __alt_instructions_end[]; - if (no_replacement) - return; - apply_alternatives(__alt_instructions, __alt_instructions_end); -} - -static int __init noreplacement_setup(char *s) -{ - no_replacement = 1; - return 1; -} - -__setup("noreplacement", noreplacement_setup); - #if defined(CONFIG_EDD) || defined(CONFIG_EDD_MODULE) struct edd edd; #ifdef CONFIG_EDD_MODULE @@ -1303,7 +1229,7 @@ static int show_cpuinfo(struct seq_file *m, void *v) /* Other (Linux-defined) */ "cxmmx", NULL, "cyrix_arr", "centaur_mcr", NULL, "constant_tsc", NULL, NULL, - NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + "up", NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, diff --git a/arch/x86_64/kernel/smpboot.c b/arch/x86_64/kernel/smpboot.c index 71a7222cf9ce..06535e7687ce 100644 --- a/arch/x86_64/kernel/smpboot.c +++ b/arch/x86_64/kernel/smpboot.c @@ -797,6 +797,8 @@ static int __cpuinit do_boot_cpu(int cpu, int apicid) } + alternatives_smp_switch(1); + c_idle.idle = get_idle_for_cpu(cpu); if (c_idle.idle) { @@ -1259,6 +1261,8 @@ void __cpu_die(unsigned int cpu) /* They ack this in play_dead by setting CPU_DEAD */ if (per_cpu(cpu_state, cpu) == CPU_DEAD) { printk ("CPU %d is now offline\n", cpu); + if (1 == num_online_cpus()) + alternatives_smp_switch(0); return; } msleep(100); diff --git a/arch/x86_64/kernel/vmlinux.lds.S b/arch/x86_64/kernel/vmlinux.lds.S index b81f473c4a19..5968c2415da9 100644 --- a/arch/x86_64/kernel/vmlinux.lds.S +++ b/arch/x86_64/kernel/vmlinux.lds.S @@ -131,6 +131,26 @@ SECTIONS *(.data.page_aligned) } + /* might get freed after init */ + . = ALIGN(4096); + __smp_alt_begin = .; + __smp_alt_instructions = .; + .smp_altinstructions : AT(ADDR(.smp_altinstructions) - LOAD_OFFSET) { + *(.smp_altinstructions) + } + __smp_alt_instructions_end = .; + . = ALIGN(8); + __smp_locks = .; + .smp_locks : AT(ADDR(.smp_locks) - LOAD_OFFSET) { + *(.smp_locks) + } + __smp_locks_end = .; + .smp_altinstr_replacement : AT(ADDR(.smp_altinstr_replacement) - LOAD_OFFSET) { + *(.smp_altinstr_replacement) + } + . = ALIGN(4096); + __smp_alt_end = .; + . 
= ALIGN(4096); /* Init code and data */ __init_begin = .; .init.text : AT(ADDR(.init.text) - LOAD_OFFSET) { -- cgit v1.2.3 From 9f2036f3e280ef1df072c5b2ca115a0e766c1d10 Mon Sep 17 00:00:00 2001 From: Jon Mason Date: Mon, 26 Jun 2006 13:56:19 +0200 Subject: [PATCH] x86_64: pci-dma.c clean-up - trivial Replace hard coded DMA masks with #defines from include/linux/dma-mapping.h Signed-off-by: Jon Mason Signed-off-by: Andi Kleen Signed-off-by: Linus Torvalds --- arch/x86_64/kernel/pci-dma.c | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) (limited to 'arch/x86_64/kernel') diff --git a/arch/x86_64/kernel/pci-dma.c b/arch/x86_64/kernel/pci-dma.c index a9275c9557cf..117bcebe9ebc 100644 --- a/arch/x86_64/kernel/pci-dma.c +++ b/arch/x86_64/kernel/pci-dma.c @@ -38,7 +38,7 @@ int force_iommu __read_mostly= 0; to i386. */ struct device fallback_dev = { .bus_id = "fallback device", - .coherent_dma_mask = 0xffffffff, + .coherent_dma_mask = DMA_32BIT_MASK, .dma_mask = &fallback_dev.coherent_dma_mask, }; @@ -77,7 +77,7 @@ dma_alloc_coherent(struct device *dev, size_t size, dma_addr_t *dma_handle, dev = &fallback_dev; dma_mask = dev->coherent_dma_mask; if (dma_mask == 0) - dma_mask = 0xffffffff; + dma_mask = DMA_32BIT_MASK; /* Don't invoke OOM killer */ gfp |= __GFP_NORETRY; @@ -90,7 +90,7 @@ dma_alloc_coherent(struct device *dev, size_t size, dma_addr_t *dma_handle, larger than 16MB and in this case we have a chance of finding fitting memory in the next higher zone first. If not retry with true GFP_DMA. -AK */ - if (dma_mask <= 0xffffffff) + if (dma_mask <= DMA_32BIT_MASK) gfp |= GFP_DMA32; again: @@ -111,7 +111,7 @@ dma_alloc_coherent(struct device *dev, size_t size, dma_addr_t *dma_handle, /* Don't use the 16MB ZONE_DMA unless absolutely needed. It's better to use remapping first. */ - if (dma_mask < 0xffffffff && !(gfp & GFP_DMA)) { + if (dma_mask < DMA_32BIT_MASK && !(gfp & GFP_DMA)) { gfp = (gfp & ~GFP_DMA32) | GFP_DMA; goto again; } @@ -174,7 +174,7 @@ int dma_supported(struct device *dev, u64 mask) /* Copied from i386. Doesn't make much sense, because it will only work for pci_alloc_coherent. The caller just has to use GFP_DMA in this case. */ - if (mask < 0x00ffffff) + if (mask < DMA_24BIT_MASK) return 0; /* Tell the device to use SAC when IOMMU force is on. This @@ -189,7 +189,7 @@ int dma_supported(struct device *dev, u64 mask) SAC for these. Assume all masks <= 40 bits are of this type. Normally this doesn't make any difference, but gives more gentle handling of IOMMU overflow. */ - if (iommu_sac_force && (mask >= 0xffffffffffULL)) { + if (iommu_sac_force && (mask >= DMA_40BIT_MASK)) { printk(KERN_INFO "%s: Force SAC with mask %Lx\n", dev->bus_id,mask); return 0; } -- cgit v1.2.3 From f5adc9c79d3a15478c0028139c54453ff3900488 Mon Sep 17 00:00:00 2001 From: Mike Waychison Date: Mon, 26 Jun 2006 13:56:31 +0200 Subject: [PATCH] x86_64: iommu_gart_bitmap search to cross next_bit Allow search for a contiguous block of iommu space to cross the next_bit marker if we have already committed ourselves to flushing the gart. There shouldn't be any reason why we'd restrict the search. 
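In effect alloc_iommu() becomes a simple wrap-around allocator; a sketch of the resulting logic (following the names used in pci-gart.c):

	offset = find_next_zero_string(iommu_gart_bitmap, next_bit,
				       iommu_pages, size);
	if (offset == -1) {
		/* Wrapping around means a GART flush is coming anyway,
		   so scan the whole bitmap, not just up to next_bit. */
		need_flush = 1;
		offset = find_next_zero_string(iommu_gart_bitmap, 0,
					       iommu_pages, size);
	}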
Signed-off-by: Mike Waychison Signed-off-by: Andi Kleen Signed-off-by: Linus Torvalds --- arch/x86_64/kernel/pci-gart.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86_64/kernel') diff --git a/arch/x86_64/kernel/pci-gart.c b/arch/x86_64/kernel/pci-gart.c index 82a7c9bfdfa0..e0198da7451d 100644 --- a/arch/x86_64/kernel/pci-gart.c +++ b/arch/x86_64/kernel/pci-gart.c @@ -93,7 +93,7 @@ static unsigned long alloc_iommu(int size) offset = find_next_zero_string(iommu_gart_bitmap,next_bit,iommu_pages,size); if (offset == -1) { need_flush = 1; - offset = find_next_zero_string(iommu_gart_bitmap,0,next_bit,size); + offset = find_next_zero_string(iommu_gart_bitmap,0,iommu_pages,size); } if (offset != -1) { set_bit_string(iommu_gart_bitmap, offset, size); -- cgit v1.2.3 From 7c2d9cd218916276e52a5dae827b84a159fe5c96 Mon Sep 17 00:00:00 2001 From: Jon Mason Date: Mon, 26 Jun 2006 13:56:37 +0200 Subject: [PATCH] x86_64: trivial gart clean-up A trivial change to have gart_unmap_sg call gart_unmap_single directly, instead of bouncing through the dma_unmap_single wrapper in dma-mapping.h. This change required moving gart_unmap_single above gart_unmap_sg, and under gart_map_single (which seems a more logical place than its current location IMHO). Signed-off-by: Jon Mason Signed-off-by: Andi Kleen Signed-off-by: Linus Torvalds --- arch/x86_64/kernel/pci-gart.c | 46 +++++++++++++++++++++---------------------- 1 file changed, 23 insertions(+), 23 deletions(-) (limited to 'arch/x86_64/kernel') diff --git a/arch/x86_64/kernel/pci-gart.c b/arch/x86_64/kernel/pci-gart.c index e0198da7451d..ea8f4041794e 100644 --- a/arch/x86_64/kernel/pci-gart.c +++ b/arch/x86_64/kernel/pci-gart.c @@ -288,6 +288,28 @@ dma_addr_t gart_map_single(struct device *dev, void *addr, size_t size, int dir) return bus; } +/* + * Free a DMA mapping. + */ +void gart_unmap_single(struct device *dev, dma_addr_t dma_addr, + size_t size, int direction) +{ + unsigned long iommu_page; + int npages; + int i; + + if (dma_addr < iommu_bus_base + EMERGENCY_PAGES*PAGE_SIZE || + dma_addr >= iommu_bus_base + iommu_size) + return; + iommu_page = (dma_addr - iommu_bus_base)>>PAGE_SHIFT; + npages = to_pages(dma_addr, size); + for (i = 0; i < npages; i++) { + iommu_gatt_base[iommu_page + i] = gart_unmapped_entry; + CLEAR_LEAK(iommu_page + i); + } + free_iommu(iommu_page, npages); +} + /* * Wrapper for pci_unmap_single working with scatterlists. */ @@ -299,7 +321,7 @@ void gart_unmap_sg(struct device *dev, struct scatterlist *sg, int nents, int di struct scatterlist *s = &sg[i]; if (!s->dma_length || !s->length) break; - dma_unmap_single(dev, s->dma_address, s->dma_length, dir); + gart_unmap_single(dev, s->dma_address, s->dma_length, dir); } } @@ -458,28 +480,6 @@ error: return 0; } -/* - * Free a DMA mapping.
- */ -void gart_unmap_single(struct device *dev, dma_addr_t dma_addr, - size_t size, int direction) -{ - unsigned long iommu_page; - int npages; - int i; - - if (dma_addr < iommu_bus_base + EMERGENCY_PAGES*PAGE_SIZE || - dma_addr >= iommu_bus_base + iommu_size) - return; - iommu_page = (dma_addr - iommu_bus_base)>>PAGE_SHIFT; - npages = to_pages(dma_addr, size); - for (i = 0; i < npages; i++) { - iommu_gatt_base[iommu_page + i] = gart_unmapped_entry; - CLEAR_LEAK(iommu_page + i); - } - free_iommu(iommu_page, npages); -} - static int no_agp; static __init unsigned long check_iommu_size(unsigned long aper, u64 aper_size) -- cgit v1.2.3 From a32073bffc656ca4bde6002b6cf7c1a8e0e22712 Mon Sep 17 00:00:00 2001 From: Andi Kleen Date: Mon, 26 Jun 2006 13:56:40 +0200 Subject: [PATCH] x86_64: Clean and enhance up K8 northbridge access code - Factor out the duplicated access/cache code into a single file * Shared between i386/x86-64. - Share flush code between AGP and IOMMU * Fix a bug: AGP didn't wait for end of flush before - Drop 8 northbridges limit and allocate dynamically - Add lock to serialize AGP and IOMMU GART flushes - Add PCI ID for next AMD northbridge - Random related cleanups The old K8 NUMA discovery code is unchanged. New systems should all use SRAT for this. Cc: "Navin Boppuri" Cc: Dave Jones Signed-off-by: Andi Kleen Signed-off-by: Linus Torvalds --- arch/x86_64/kernel/Makefile | 1 + arch/x86_64/kernel/aperture.c | 24 ++++----- arch/x86_64/kernel/k8.c | 118 ++++++++++++++++++++++++++++++++++++++++++ arch/x86_64/kernel/pci-gart.c | 93 ++++++++------------------------- 4 files changed, 152 insertions(+), 84 deletions(-) create mode 100644 arch/x86_64/kernel/k8.c (limited to 'arch/x86_64/kernel') diff --git a/arch/x86_64/kernel/Makefile b/arch/x86_64/kernel/Makefile index 381bc6ad743e..f927d11065fe 100644 --- a/arch/x86_64/kernel/Makefile +++ b/arch/x86_64/kernel/Makefile @@ -33,6 +33,7 @@ obj-$(CONFIG_SWIOTLB) += pci-swiotlb.o obj-$(CONFIG_KPROBES) += kprobes.o obj-$(CONFIG_X86_PM_TIMER) += pmtimer.o obj-$(CONFIG_X86_VSMP) += vsmp.o +obj-$(CONFIG_K8_NB) += k8.o obj-$(CONFIG_MODULES) += module.o diff --git a/arch/x86_64/kernel/aperture.c b/arch/x86_64/kernel/aperture.c index 70b9d21ed675..a7ad03ee98cf 100644 --- a/arch/x86_64/kernel/aperture.c +++ b/arch/x86_64/kernel/aperture.c @@ -24,6 +24,7 @@ #include #include #include +#include int iommu_aperture; int iommu_aperture_disabled __initdata = 0; @@ -37,8 +38,6 @@ int fix_aperture __initdata = 1; /* This code runs before the PCI subsystem is initialized, so just access the northbridge directly. */ -#define NB_ID_3 (PCI_VENDOR_ID_AMD | (0x1103<<16)) - static u32 __init allocate_aperture(void) { pg_data_t *nd0 = NODE_DATA(0); @@ -68,20 +67,20 @@ static u32 __init allocate_aperture(void) return (u32)__pa(p); } -static int __init aperture_valid(char *name, u64 aper_base, u32 aper_size) +static int __init aperture_valid(u64 aper_base, u32 aper_size) { if (!aper_base) return 0; if (aper_size < 64*1024*1024) { - printk("Aperture from %s too small (%d MB)\n", name, aper_size>>20); + printk("Aperture too small (%d MB)\n", aper_size>>20); return 0; } if (aper_base + aper_size >= 0xffffffff) { - printk("Aperture from %s beyond 4GB. Ignoring.\n",name); + printk("Aperture beyond 4GB. Ignoring.\n"); return 0; } if (e820_any_mapped(aper_base, aper_base + aper_size, E820_RAM)) { - printk("Aperture from %s pointing to e820 RAM. Ignoring.\n",name); + printk("Aperture pointing to e820 RAM. 
Ignoring.\n"); return 0; } return 1; @@ -140,7 +139,7 @@ static __u32 __init read_agp(int num, int slot, int func, int cap, u32 *order) printk("Aperture from AGP @ %Lx size %u MB (APSIZE %x)\n", aper, 32 << *order, apsizereg); - if (!aperture_valid("AGP bridge", aper, (32*1024*1024) << *order)) + if (!aperture_valid(aper, (32*1024*1024) << *order)) return 0; return (u32)aper; } @@ -208,9 +207,8 @@ void __init iommu_hole_init(void) fix = 0; for (num = 24; num < 32; num++) { - char name[30]; - if (read_pci_config(0, num, 3, 0x00) != NB_ID_3) - continue; + if (!early_is_k8_nb(read_pci_config(0, num, 3, 0x00))) + continue; iommu_aperture = 1; @@ -222,9 +220,7 @@ void __init iommu_hole_init(void) printk("CPU %d: aperture @ %Lx size %u MB\n", num-24, aper_base, aper_size>>20); - sprintf(name, "northbridge cpu %d", num-24); - - if (!aperture_valid(name, aper_base, aper_size)) { + if (!aperture_valid(aper_base, aper_size)) { fix = 1; break; } @@ -273,7 +269,7 @@ void __init iommu_hole_init(void) /* Fix up the north bridges */ for (num = 24; num < 32; num++) { - if (read_pci_config(0, num, 3, 0x00) != NB_ID_3) + if (!early_is_k8_nb(read_pci_config(0, num, 3, 0x00))) continue; /* Don't enable translation yet. That is done later. diff --git a/arch/x86_64/kernel/k8.c b/arch/x86_64/kernel/k8.c new file mode 100644 index 000000000000..6416682d33d0 --- /dev/null +++ b/arch/x86_64/kernel/k8.c @@ -0,0 +1,118 @@ +/* + * Shared support code for AMD K8 northbridges and derivates. + * Copyright 2006 Andi Kleen, SUSE Labs. Subject to GPLv2. + */ +#include +#include +#include +#include +#include +#include +#include + +int num_k8_northbridges; +EXPORT_SYMBOL(num_k8_northbridges); + +static u32 *flush_words; + +struct pci_device_id k8_nb_ids[] = { + { PCI_DEVICE(PCI_VENDOR_ID_AMD, 0x1103) }, + { PCI_DEVICE(PCI_VENDOR_ID_AMD, 0x1203) }, + {} +}; +EXPORT_SYMBOL(k8_nb_ids); + +struct pci_dev **k8_northbridges; +EXPORT_SYMBOL(k8_northbridges); + +static struct pci_dev *next_k8_northbridge(struct pci_dev *dev) +{ + do { + dev = pci_get_device(PCI_ANY_ID, PCI_ANY_ID, dev); + if (!dev) + break; + } while (!pci_match_id(&k8_nb_ids[0], dev)); + return dev; +} + +int cache_k8_northbridges(void) +{ + int i; + struct pci_dev *dev; + if (num_k8_northbridges) + return 0; + + num_k8_northbridges = 0; + dev = NULL; + while ((dev = next_k8_northbridge(dev)) != NULL) + num_k8_northbridges++; + + k8_northbridges = kmalloc((num_k8_northbridges + 1) * sizeof(void *), + GFP_KERNEL); + if (!k8_northbridges) + return -ENOMEM; + + flush_words = kmalloc(num_k8_northbridges * sizeof(u32), GFP_KERNEL); + if (!flush_words) { + kfree(k8_northbridges); + return -ENOMEM; + } + + dev = NULL; + i = 0; + while ((dev = next_k8_northbridge(dev)) != NULL) { + k8_northbridges[i++] = dev; + pci_read_config_dword(dev, 0x9c, &flush_words[i]); + } + k8_northbridges[i] = NULL; + return 0; +} +EXPORT_SYMBOL_GPL(cache_k8_northbridges); + +/* Ignores subdevice/subvendor but as far as I can figure out + they're useless anyways */ +int __init early_is_k8_nb(u32 device) +{ + struct pci_device_id *id; + u32 vendor = device & 0xffff; + device >>= 16; + for (id = k8_nb_ids; id->vendor; id++) + if (vendor == id->vendor && device == id->device) + return 1; + return 0; +} + +void k8_flush_garts(void) +{ + int flushed, i; + unsigned long flags; + static DEFINE_SPINLOCK(gart_lock); + + /* Avoid races between AGP and IOMMU. In theory it's not needed + but I'm not sure if the hardware won't lose flush requests + when another is pending. 
This whole thing is so expensive anyways + that it doesn't matter to serialize more. -AK */ + spin_lock_irqsave(&gart_lock, flags); + flushed = 0; + for (i = 0; i < num_k8_northbridges; i++) { + pci_write_config_dword(k8_northbridges[i], 0x9c, + flush_words[i]|1); + flushed++; + } + for (i = 0; i < num_k8_northbridges; i++) { + u32 w; + /* Make sure the hardware actually executed the flush*/ + for (;;) { + pci_read_config_dword(k8_northbridges[i], + 0x9c, &w); + if (!(w & 1)) + break; + cpu_relax(); + } + } + spin_unlock_irqrestore(&gart_lock, flags); + if (!flushed) + printk("nothing to flush?\n"); +} +EXPORT_SYMBOL_GPL(k8_flush_garts); + diff --git a/arch/x86_64/kernel/pci-gart.c b/arch/x86_64/kernel/pci-gart.c index ea8f4041794e..ded3af3bceec 100644 --- a/arch/x86_64/kernel/pci-gart.c +++ b/arch/x86_64/kernel/pci-gart.c @@ -32,6 +32,7 @@ #include #include #include +#include unsigned long iommu_bus_base; /* GART remapping area (physical) */ static unsigned long iommu_size; /* size of remapping area bytes */ @@ -46,8 +47,6 @@ u32 *iommu_gatt_base; /* Remapping table */ also seen with Qlogic at least). */ int iommu_fullflush = 1; -#define MAX_NB 8 - /* Allocation bitmap for the remapping area */ static DEFINE_SPINLOCK(iommu_bitmap_lock); static unsigned long *iommu_gart_bitmap; /* guarded by iommu_bitmap_lock */ @@ -63,13 +62,6 @@ static u32 gart_unmapped_entry; #define to_pages(addr,size) \ (round_up(((addr) & ~PAGE_MASK) + (size), PAGE_SIZE) >> PAGE_SHIFT) -#define for_all_nb(dev) \ - dev = NULL; \ - while ((dev = pci_get_device(PCI_VENDOR_ID_AMD, 0x1103, dev))!=NULL) - -static struct pci_dev *northbridges[MAX_NB]; -static u32 northbridge_flush_word[MAX_NB]; - #define EMERGENCY_PAGES 32 /* = 128KB */ #ifdef CONFIG_AGP @@ -120,44 +112,17 @@ static void free_iommu(unsigned long offset, int size) /* * Use global flush state to avoid races with multiple flushers. */ -static void flush_gart(struct device *dev) +static void flush_gart(void) { unsigned long flags; - int flushed = 0; - int i, max; - spin_lock_irqsave(&iommu_bitmap_lock, flags); - if (need_flush) { - max = 0; - for (i = 0; i < MAX_NB; i++) { - if (!northbridges[i]) - continue; - pci_write_config_dword(northbridges[i], 0x9c, - northbridge_flush_word[i] | 1); - flushed++; - max = i; - } - for (i = 0; i <= max; i++) { - u32 w; - if (!northbridges[i]) - continue; - /* Make sure the hardware actually executed the flush. 
*/ - for (;;) { - pci_read_config_dword(northbridges[i], 0x9c, &w); - if (!(w & 1)) - break; - cpu_relax(); - } - } - if (!flushed) - printk("nothing to flush?\n"); + if (need_flush) { + k8_flush_garts(); need_flush = 0; } spin_unlock_irqrestore(&iommu_bitmap_lock, flags); } - - #ifdef CONFIG_IOMMU_LEAK #define SET_LEAK(x) if (iommu_leak_tab) \ @@ -266,7 +231,7 @@ static dma_addr_t gart_map_simple(struct device *dev, char *buf, size_t size, int dir) { dma_addr_t map = dma_map_area(dev, virt_to_bus(buf), size, dir); - flush_gart(dev); + flush_gart(); return map; } @@ -351,7 +316,7 @@ static int dma_map_sg_nonforce(struct device *dev, struct scatterlist *sg, s->dma_address = addr; s->dma_length = s->length; } - flush_gart(dev); + flush_gart(); return nents; } @@ -458,13 +423,13 @@ int gart_map_sg(struct device *dev, struct scatterlist *sg, int nents, int dir) if (dma_map_cont(sg, start, i, sg+out, pages, need) < 0) goto error; out++; - flush_gart(dev); + flush_gart(); if (out < nents) sg[out].dma_length = 0; return out; error: - flush_gart(NULL); + flush_gart(); gart_unmap_sg(dev, sg, nents, dir); /* When it was forced or merged try again in a dumb way */ if (force_iommu || iommu_merge) { @@ -532,10 +497,13 @@ static __init int init_k8_gatt(struct agp_kern_info *info) void *gatt; unsigned aper_base, new_aper_base; unsigned aper_size, gatt_size, new_aper_size; - + int i; + printk(KERN_INFO "PCI-DMA: Disabling AGP.\n"); aper_size = aper_base = info->aper_size = 0; - for_all_nb(dev) { + dev = NULL; + for (i = 0; i < num_k8_northbridges; i++) { + dev = k8_northbridges[i]; new_aper_base = read_aperture(dev, &new_aper_size); if (!new_aper_base) goto nommu; @@ -558,11 +526,12 @@ static __init int init_k8_gatt(struct agp_kern_info *info) panic("Cannot allocate GATT table"); memset(gatt, 0, gatt_size); agp_gatt_table = gatt; - - for_all_nb(dev) { + + for (i = 0; i < num_k8_northbridges; i++) { u32 ctl; u32 gatt_reg; + dev = k8_northbridges[i]; gatt_reg = __pa(gatt) >> 12; gatt_reg <<= 4; pci_write_config_dword(dev, 0x98, gatt_reg); @@ -573,7 +542,7 @@ static __init int init_k8_gatt(struct agp_kern_info *info) pci_write_config_dword(dev, 0x90, ctl); } - flush_gart(NULL); + flush_gart(); printk("PCI-DMA: aperture base @ %x size %u KB\n",aper_base, aper_size>>10); return 0; @@ -607,10 +576,14 @@ static int __init pci_iommu_init(void) struct agp_kern_info info; unsigned long aper_size; unsigned long iommu_start; - struct pci_dev *dev; unsigned long scratch; long i; + if (cache_k8_northbridges() < 0 || num_k8_northbridges == 0) { + printk(KERN_INFO "PCI-GART: No AMD northbridge found.\n"); + return -1; + } + #ifndef CONFIG_AGP_AMD64 no_agp = 1; #else @@ -637,14 +610,6 @@ static int __init pci_iommu_init(void) return -1; } - i = 0; - for_all_nb(dev) - i++; - if (i > MAX_NB) { - printk(KERN_ERR "PCI-GART: Too many northbridges (%ld). 
Disabled\n", i); - return -1; - } - printk(KERN_INFO "PCI-DMA: using GART IOMMU.\n"); aper_size = info.aper_size * 1024 * 1024; iommu_size = check_iommu_size(info.aper_base, aper_size); @@ -707,20 +672,8 @@ static int __init pci_iommu_init(void) for (i = EMERGENCY_PAGES; i < iommu_pages; i++) iommu_gatt_base[i] = gart_unmapped_entry; - for_all_nb(dev) { - u32 flag; - int cpu = PCI_SLOT(dev->devfn) - 24; - if (cpu >= MAX_NB) - continue; - northbridges[cpu] = dev; - pci_read_config_dword(dev, 0x9c, &flag); /* cache flush word */ - northbridge_flush_word[cpu] = flag; - } - - flush_gart(NULL); - + flush_gart(); dma_ops = &gart_dma_ops; - return 0; } -- cgit v1.2.3 From 0a1ad60d7a7eb433095bc1b2c8b475f3f278f61d Mon Sep 17 00:00:00 2001 From: Jan Beulich Date: Mon, 26 Jun 2006 13:56:43 +0200 Subject: [PATCH] x86_64: serialize assign_irq_vector() use of static variables Since assign_irq_vector() can be called at runtime, its access of static variables should be protected by a lock. Signed-off-by: Jan Beulich Signed-off-by: Andi Kleen Signed-off-by: Linus Torvalds --- arch/x86_64/kernel/io_apic.c | 18 ++++++++++++++---- 1 file changed, 14 insertions(+), 4 deletions(-) (limited to 'arch/x86_64/kernel') diff --git a/arch/x86_64/kernel/io_apic.c b/arch/x86_64/kernel/io_apic.c index 9cc7031b7151..062b10daebac 100644 --- a/arch/x86_64/kernel/io_apic.c +++ b/arch/x86_64/kernel/io_apic.c @@ -56,6 +56,7 @@ int timer_over_8254 __initdata = 0; static struct { int pin, apic; } ioapic_i8259 = { -1, -1 }; static DEFINE_SPINLOCK(ioapic_lock); +static DEFINE_SPINLOCK(vector_lock); /* * # of IRQ routing registers @@ -834,10 +835,16 @@ u8 irq_vector[NR_IRQ_VECTORS] __read_mostly = { FIRST_DEVICE_VECTOR , 0 }; int assign_irq_vector(int irq) { static int current_vector = FIRST_DEVICE_VECTOR, offset = 0; + int vector; BUG_ON(irq != AUTO_ASSIGN && (unsigned)irq >= NR_IRQ_VECTORS); - if (irq != AUTO_ASSIGN && IO_APIC_VECTOR(irq) > 0) + + spin_lock(&vector_lock); + + if (irq != AUTO_ASSIGN && IO_APIC_VECTOR(irq) > 0) { + spin_unlock(&vector_lock); return IO_APIC_VECTOR(irq); + } next: current_vector += 8; if (current_vector == IA32_SYSCALL_VECTOR) @@ -849,11 +856,14 @@ next: current_vector = FIRST_DEVICE_VECTOR + offset; } - vector_irq[current_vector] = irq; + vector = current_vector; + vector_irq[vector] = irq; if (irq != AUTO_ASSIGN) - IO_APIC_VECTOR(irq) = current_vector; + IO_APIC_VECTOR(irq) = vector; + + spin_unlock(&vector_lock); - return current_vector; + return vector; } extern void (*interrupt[NR_IRQS])(void); -- cgit v1.2.3 From 6ebcc00e95fa78218a048a88d8d5ee491aa7d6d0 Mon Sep 17 00:00:00 2001 From: Jan Beulich Date: Mon, 26 Jun 2006 13:56:46 +0200 Subject: [PATCH] i386/x86-64: simplify ioapic_register_intr() Simplify (remove duplication of) code in ioapic_register_intr(). 
Signed-off-by: Jan Beulich Signed-off-by: Andi Kleen Signed-off-by: Linus Torvalds --- arch/x86_64/kernel/io_apic.c | 23 ++++++++--------------- 1 file changed, 8 insertions(+), 15 deletions(-) (limited to 'arch/x86_64/kernel') diff --git a/arch/x86_64/kernel/io_apic.c b/arch/x86_64/kernel/io_apic.c index 062b10daebac..15b2f6932de1 100644 --- a/arch/x86_64/kernel/io_apic.c +++ b/arch/x86_64/kernel/io_apic.c @@ -876,21 +876,14 @@ static struct hw_interrupt_type ioapic_edge_type; static inline void ioapic_register_intr(int irq, int vector, unsigned long trigger) { - if (use_pci_vector() && !platform_legacy_irq(irq)) { - if ((trigger == IOAPIC_AUTO && IO_APIC_irq_trigger(irq)) || - trigger == IOAPIC_LEVEL) - irq_desc[vector].handler = &ioapic_level_type; - else - irq_desc[vector].handler = &ioapic_edge_type; - set_intr_gate(vector, interrupt[vector]); - } else { - if ((trigger == IOAPIC_AUTO && IO_APIC_irq_trigger(irq)) || - trigger == IOAPIC_LEVEL) - irq_desc[irq].handler = &ioapic_level_type; - else - irq_desc[irq].handler = &ioapic_edge_type; - set_intr_gate(vector, interrupt[irq]); - } + unsigned idx = use_pci_vector() && !platform_legacy_irq(irq) ? vector : irq; + + if ((trigger == IOAPIC_AUTO && IO_APIC_irq_trigger(irq)) || + trigger == IOAPIC_LEVEL) + irq_desc[idx].handler = &ioapic_level_type; + else + irq_desc[idx].handler = &ioapic_edge_type; + set_intr_gate(vector, interrupt[idx]); } static void __init setup_IO_APIC_irqs(void) -- cgit v1.2.3 From f201611fcecdfa825471dc425ee007997228fae4 Mon Sep 17 00:00:00 2001 From: Andi Kleen Date: Mon, 26 Jun 2006 13:56:49 +0200 Subject: [PATCH] x86_64: Use -ENODEV in IOMMU initialization Fix initcall at 0xffffffff806c5b89: pci_iommu_init+0x0/0x53c(): returned with error code -1 Return -ENODEV instead when the IOMMU is not used. Signed-off-by: Andi Kleen Signed-off-by: Linus Torvalds --- arch/x86_64/kernel/pci-gart.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'arch/x86_64/kernel') diff --git a/arch/x86_64/kernel/pci-gart.c b/arch/x86_64/kernel/pci-gart.c index ded3af3bceec..82a346e6e2e4 100644 --- a/arch/x86_64/kernel/pci-gart.c +++ b/arch/x86_64/kernel/pci-gart.c @@ -581,7 +581,7 @@ static int __init pci_iommu_init(void) if (cache_k8_northbridges() < 0 || num_k8_northbridges == 0) { printk(KERN_INFO "PCI-GART: No AMD northbridge found.\n"); - return -1; + return -ENODEV; } #ifndef CONFIG_AGP_AMD64 @@ -595,7 +595,7 @@ static int __init pci_iommu_init(void) #endif if (swiotlb) - return -1; + return -ENODEV; if (no_iommu || (!force_iommu && end_pfn <= MAX_DMA32_PFN) || @@ -607,7 +607,7 @@ static int __init pci_iommu_init(void) "but IOMMU not available.\n" KERN_ERR "WARNING 32bit PCI may malfunction.\n"); } - return -1; + return -ENODEV; } printk(KERN_INFO "PCI-DMA: using GART IOMMU.\n"); -- cgit v1.2.3 From 4b787e0b831c71c6b09902b66575dadb2260a7c8 Mon Sep 17 00:00:00 2001 From: Jan Beulich Date: Mon, 26 Jun 2006 13:56:55 +0200 Subject: [PATCH] x86_64: add END()/ENDPROC() annotations to entry.S Since END()/ENDPROC() are now available, add respective annotations to x86_64's entry.S. This should help debugging activities. 
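END() records the size of an assembler symbol and ENDPROC() additionally marks it as a function; the patch uses ENDPROC() for the C-callable entries (load_gs_index, kernel_thread, execve, ...) and END() for everything else. The pattern, roughly (illustrative, not taken from the patch):

	ENTRY(some_entry)
		...
		ret
	END(some_entry)		/* emits .size so debuggers and
				   unwinders know where the code ends */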
Signed-off-by: Jan Beulich Signed-off-by: Andi Kleen Signed-off-by: Linus Torvalds --- arch/x86_64/kernel/entry.S | 45 ++++++++++++++++++++++++++++++++++++++++++++- 1 file changed, 44 insertions(+), 1 deletion(-) (limited to 'arch/x86_64/kernel') diff --git a/arch/x86_64/kernel/entry.S b/arch/x86_64/kernel/entry.S index 586b34c00c48..9999d703b6c4 100644 --- a/arch/x86_64/kernel/entry.S +++ b/arch/x86_64/kernel/entry.S @@ -154,6 +154,7 @@ rff_trace: GET_THREAD_INFO(%rcx) jmp rff_action CFI_ENDPROC +END(ret_from_fork) /* * System call entry. Upto 6 arguments in registers are supported. @@ -285,6 +286,7 @@ tracesys: /* Use IRET because user could have changed frame */ jmp int_ret_from_sys_call CFI_ENDPROC +END(system_call) /* * Syscall return path ending with IRET. @@ -364,6 +366,7 @@ int_restore_rest: cli jmp int_with_check CFI_ENDPROC +END(int_ret_from_sys_call) /* * Certain special system calls that need to save a complete full stack frame. @@ -375,6 +378,7 @@ int_restore_rest: leaq \func(%rip),%rax leaq -ARGOFFSET+8(%rsp),\arg /* 8 for return address */ jmp ptregscall_common +END(\label) .endm CFI_STARTPROC @@ -404,6 +408,7 @@ ENTRY(ptregscall_common) CFI_REL_OFFSET rip, 0 ret CFI_ENDPROC +END(ptregscall_common) ENTRY(stub_execve) CFI_STARTPROC @@ -418,6 +423,7 @@ ENTRY(stub_execve) RESTORE_REST jmp int_ret_from_sys_call CFI_ENDPROC +END(stub_execve) /* * sigreturn is special because it needs to restore all registers on return. @@ -435,6 +441,7 @@ ENTRY(stub_rt_sigreturn) RESTORE_REST jmp int_ret_from_sys_call CFI_ENDPROC +END(stub_rt_sigreturn) /* * initial frame state for interrupts and exceptions @@ -589,7 +596,9 @@ retint_kernel: call preempt_schedule_irq jmp exit_intr #endif + CFI_ENDPROC +END(common_interrupt) /* * APIC interrupts. @@ -605,17 +614,21 @@ retint_kernel: ENTRY(thermal_interrupt) apicinterrupt THERMAL_APIC_VECTOR,smp_thermal_interrupt +END(thermal_interrupt) ENTRY(threshold_interrupt) apicinterrupt THRESHOLD_APIC_VECTOR,mce_threshold_interrupt +END(threshold_interrupt) #ifdef CONFIG_SMP ENTRY(reschedule_interrupt) apicinterrupt RESCHEDULE_VECTOR,smp_reschedule_interrupt +END(reschedule_interrupt) .macro INVALIDATE_ENTRY num ENTRY(invalidate_interrupt\num) apicinterrupt INVALIDATE_TLB_VECTOR_START+\num,smp_invalidate_interrupt +END(invalidate_interrupt\num) .endm INVALIDATE_ENTRY 0 @@ -629,17 +642,21 @@ ENTRY(invalidate_interrupt\num) ENTRY(call_function_interrupt) apicinterrupt CALL_FUNCTION_VECTOR,smp_call_function_interrupt +END(call_function_interrupt) #endif #ifdef CONFIG_X86_LOCAL_APIC ENTRY(apic_timer_interrupt) apicinterrupt LOCAL_TIMER_VECTOR,smp_apic_timer_interrupt +END(apic_timer_interrupt) ENTRY(error_interrupt) apicinterrupt ERROR_APIC_VECTOR,smp_error_interrupt +END(error_interrupt) ENTRY(spurious_interrupt) apicinterrupt SPURIOUS_APIC_VECTOR,smp_spurious_interrupt +END(spurious_interrupt) #endif /* @@ -777,6 +794,7 @@ error_kernelspace: cmpq $gs_change,RIP(%rsp) je error_swapgs jmp error_sti +END(error_entry) /* Reload gs selector with exception handling */ /* edi: new selector */ @@ -794,6 +812,7 @@ gs_change: CFI_ADJUST_CFA_OFFSET -8 ret CFI_ENDPROC +ENDPROC(load_gs_index) .section __ex_table,"a" .align 8 @@ -847,7 +866,7 @@ ENTRY(kernel_thread) UNFAKE_STACK_FRAME ret CFI_ENDPROC - +ENDPROC(kernel_thread) child_rip: /* @@ -860,6 +879,7 @@ child_rip: # exit xorl %edi, %edi call do_exit +ENDPROC(child_rip) /* * execve(). This function needs to use IRET, not SYSRET, to set up all state properly. 
@@ -889,19 +909,24 @@ ENTRY(execve) UNFAKE_STACK_FRAME ret CFI_ENDPROC +ENDPROC(execve) KPROBE_ENTRY(page_fault) errorentry do_page_fault +END(page_fault) .previous .text ENTRY(coprocessor_error) zeroentry do_coprocessor_error +END(coprocessor_error) ENTRY(simd_coprocessor_error) zeroentry do_simd_coprocessor_error +END(simd_coprocessor_error) ENTRY(device_not_available) zeroentry math_state_restore +END(device_not_available) /* runs on exception stack */ KPROBE_ENTRY(debug) @@ -911,6 +936,7 @@ KPROBE_ENTRY(debug) paranoidentry do_debug, DEBUG_STACK jmp paranoid_exit CFI_ENDPROC +END(debug) .previous .text /* runs on exception stack */ @@ -961,6 +987,7 @@ paranoid_schedule: cli jmp paranoid_userspace CFI_ENDPROC +END(nmi) .previous .text KPROBE_ENTRY(int3) @@ -970,22 +997,28 @@ KPROBE_ENTRY(int3) paranoidentry do_int3, DEBUG_STACK jmp paranoid_exit CFI_ENDPROC +END(int3) .previous .text ENTRY(overflow) zeroentry do_overflow +END(overflow) ENTRY(bounds) zeroentry do_bounds +END(bounds) ENTRY(invalid_op) zeroentry do_invalid_op +END(invalid_op) ENTRY(coprocessor_segment_overrun) zeroentry do_coprocessor_segment_overrun +END(coprocessor_segment_overrun) ENTRY(reserved) zeroentry do_reserved +END(reserved) /* runs on exception stack */ ENTRY(double_fault) @@ -993,12 +1026,15 @@ ENTRY(double_fault) paranoidentry do_double_fault jmp paranoid_exit CFI_ENDPROC +END(double_fault) ENTRY(invalid_TSS) errorentry do_invalid_TSS +END(invalid_TSS) ENTRY(segment_not_present) errorentry do_segment_not_present +END(segment_not_present) /* runs on exception stack */ ENTRY(stack_segment) @@ -1006,19 +1042,24 @@ ENTRY(stack_segment) paranoidentry do_stack_segment jmp paranoid_exit CFI_ENDPROC +END(stack_segment) KPROBE_ENTRY(general_protection) errorentry do_general_protection +END(general_protection) .previous .text ENTRY(alignment_check) errorentry do_alignment_check +END(alignment_check) ENTRY(divide_error) zeroentry do_divide_error +END(divide_error) ENTRY(spurious_interrupt_bug) zeroentry do_spurious_interrupt_bug +END(spurious_interrupt_bug) #ifdef CONFIG_X86_MCE /* runs on exception stack */ @@ -1029,6 +1070,7 @@ ENTRY(machine_check) paranoidentry do_machine_check jmp paranoid_exit CFI_ENDPROC +END(machine_check) #endif ENTRY(call_softirq) @@ -1046,3 +1088,4 @@ ENTRY(call_softirq) decl %gs:pda_irqcount ret CFI_ENDPROC +ENDPROC(call_softirq) -- cgit v1.2.3 From 14118c3cdd46d72e503ee2f727b11d881f72f755 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Mon, 26 Jun 2006 13:56:58 +0200 Subject: [PATCH] x86_64: fix unlikely profiling & vsyscalls on x86_64 fix unlikely profiling in vsyscalls ... 
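(When likely()/unlikely() profiling is compiled in, the annotations expand to instrumentation that updates counters in normal kernel data, which code running from the user-mapped vsyscall page must not touch; hence the plain conditionals in the hunks below.)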
Signed-off-by: Ingo Molnar Signed-off-by: Andi Kleen Signed-off-by: Linus Torvalds --- arch/x86_64/kernel/vsyscall.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'arch/x86_64/kernel') diff --git a/arch/x86_64/kernel/vsyscall.c b/arch/x86_64/kernel/vsyscall.c index 9468fb20b0bc..f603037df162 100644 --- a/arch/x86_64/kernel/vsyscall.c +++ b/arch/x86_64/kernel/vsyscall.c @@ -107,7 +107,7 @@ static __always_inline long time_syscall(long *t) int __vsyscall(0) vgettimeofday(struct timeval * tv, struct timezone * tz) { - if (unlikely(!__sysctl_vsyscall)) + if (!__sysctl_vsyscall) return gettimeofday(tv,tz); if (tv) do_vgettimeofday(tv); @@ -120,7 +120,7 @@ int __vsyscall(0) vgettimeofday(struct timeval * tv, struct timezone * tz) * unlikely */ time_t __vsyscall(1) vtime(time_t *t) { - if (unlikely(!__sysctl_vsyscall)) + if (!__sysctl_vsyscall) return time_syscall(t); else if (t) *t = __xtime.tv_sec; -- cgit v1.2.3 From 3e4ff115740c28dea463561aa1405a3c0de0d2d0 Mon Sep 17 00:00:00 2001 From: Don Zickus Date: Mon, 26 Jun 2006 13:57:01 +0200 Subject: [PATCH] x86_64: nmi watchdog header cleanup Misc header cleanup for nmi watchdog. Signed-off-by: Don Zickus Signed-off-by: Andi Kleen Signed-off-by: Linus Torvalds --- arch/x86_64/kernel/io_apic.c | 1 + arch/x86_64/kernel/nmi.c | 8 -------- 2 files changed, 1 insertion(+), 8 deletions(-) (limited to 'arch/x86_64/kernel') diff --git a/arch/x86_64/kernel/io_apic.c b/arch/x86_64/kernel/io_apic.c index 15b2f6932de1..38a3ff30bde1 100644 --- a/arch/x86_64/kernel/io_apic.c +++ b/arch/x86_64/kernel/io_apic.c @@ -41,6 +41,7 @@ #include #include #include +#include #define __apicdebuginit __init diff --git a/arch/x86_64/kernel/nmi.c b/arch/x86_64/kernel/nmi.c index 4e6357fe0ec3..ab421e22fa67 100644 --- a/arch/x86_64/kernel/nmi.c +++ b/arch/x86_64/kernel/nmi.c @@ -15,11 +15,7 @@ #include #include #include -#include -#include #include -#include -#include #include #include #include @@ -27,13 +23,9 @@ #include #include -#include -#include #include -#include #include #include -#include #include /* -- cgit v1.2.3 From 5c0f80fab3724aa44b3352d88155fe0eaae0e54a Mon Sep 17 00:00:00 2001 From: Andi Kleen Date: Mon, 26 Jun 2006 13:57:04 +0200 Subject: [PATCH] x86_64: Remove long obsolete CVS Early development of x86-64 Linux was in CVS, but that hasn't been the case for a long time now. Remove the obsolete $Id$s. Signed-off-by: Andi Kleen Signed-off-by: Linus Torvalds --- arch/x86_64/kernel/aperture.c | 1 - arch/x86_64/kernel/e820.c | 1 - arch/x86_64/kernel/head64.c | 2 -- arch/x86_64/kernel/process.c | 1 - arch/x86_64/kernel/setup.c | 2 -- arch/x86_64/kernel/setup64.c | 1 - arch/x86_64/kernel/signal.c | 2 -- arch/x86_64/kernel/traps.c | 2 -- 8 files changed, 12 deletions(-) (limited to 'arch/x86_64/kernel') diff --git a/arch/x86_64/kernel/aperture.c b/arch/x86_64/kernel/aperture.c index a7ad03ee98cf..9ea723bab16c 100644 --- a/arch/x86_64/kernel/aperture.c +++ b/arch/x86_64/kernel/aperture.c @@ -8,7 +8,6 @@ * because only the bootmem allocator can allocate 32+MB. * * Copyright 2002 Andi Kleen, SuSE Labs. - * $Id: aperture.c,v 1.7 2003/08/01 03:36:18 ak Exp $ */ #include #include diff --git a/arch/x86_64/kernel/e820.c b/arch/x86_64/kernel/e820.c index 1ef6028f721e..d8ea250f3925 100644 --- a/arch/x86_64/kernel/e820.c +++ b/arch/x86_64/kernel/e820.c @@ -1,7 +1,6 @@ /* * Handle the memory map. * The functions here do the job until bootmem takes over. 
- * $Id: e820.c,v 1.4 2002/09/19 19:25:32 ak Exp $ * * Getting sanitize_e820_map() in sync with i386 version by applying change: * - Provisions for empty E820 memory regions (reported by certain BIOSes). diff --git a/arch/x86_64/kernel/head64.c b/arch/x86_64/kernel/head64.c index cea20a66c150..e6a71c9556d9 100644 --- a/arch/x86_64/kernel/head64.c +++ b/arch/x86_64/kernel/head64.c @@ -2,8 +2,6 @@ * linux/arch/x86_64/kernel/head64.c -- prepare to run common code * * Copyright (C) 2000 Andrea Arcangeli SuSE - * - * $Id: head64.c,v 1.22 2001/07/06 14:28:20 ak Exp $ */ #include diff --git a/arch/x86_64/kernel/process.c b/arch/x86_64/kernel/process.c index fb903e65e079..52c03f6c7731 100644 --- a/arch/x86_64/kernel/process.c +++ b/arch/x86_64/kernel/process.c @@ -10,7 +10,6 @@ * Andi Kleen. * * CPU hotplug support - ashok.raj@intel.com - * $Id: process.c,v 1.38 2002/01/15 10:08:03 ak Exp $ */ /* diff --git a/arch/x86_64/kernel/setup.c b/arch/x86_64/kernel/setup.c index 64640c8f5eed..a91f1d94f9e5 100644 --- a/arch/x86_64/kernel/setup.c +++ b/arch/x86_64/kernel/setup.c @@ -5,8 +5,6 @@ * * Nov 2001 Dave Jones * Forked from i386 setup code. - * - * $Id$ */ /* diff --git a/arch/x86_64/kernel/setup64.c b/arch/x86_64/kernel/setup64.c index 8a691fa6d393..e5bf22a01edb 100644 --- a/arch/x86_64/kernel/setup64.c +++ b/arch/x86_64/kernel/setup64.c @@ -3,7 +3,6 @@ * Copyright (C) 1995 Linus Torvalds * Copyright 2001, 2002, 2003 SuSE Labs / Andi Kleen. * See setup.c for older changelog. - * $Id: setup64.c,v 1.12 2002/03/21 10:09:17 ak Exp $ */ #include #include diff --git a/arch/x86_64/kernel/signal.c b/arch/x86_64/kernel/signal.c index e5f5ce7909a3..94b4b1128ac8 100644 --- a/arch/x86_64/kernel/signal.c +++ b/arch/x86_64/kernel/signal.c @@ -7,8 +7,6 @@ * 1997-11-28 Modified for POSIX.1b signals by Richard Henderson * 2000-06-20 Pentium III FXSR, SSE support by Gareth Hughes * 2000-2002 x86-64 support by Andi Kleen - * - * $Id: signal.c,v 1.18 2001/10/17 22:30:37 ak Exp $ */ #include diff --git a/arch/x86_64/kernel/traps.c b/arch/x86_64/kernel/traps.c index cea335e8746c..9f8f1eff4a6c 100644 --- a/arch/x86_64/kernel/traps.c +++ b/arch/x86_64/kernel/traps.c @@ -6,8 +6,6 @@ * * Pentium III FXSR, SSE support * Gareth Hughes , May 2000 - * - * $Id: traps.c,v 1.36 2002/03/24 11:09:10 ak Exp $ */ /* -- cgit v1.2.3 From 357c2b9056df447390b7df3e49960a4c609a89a9 Mon Sep 17 00:00:00 2001 From: Jon Mason Date: Mon, 26 Jun 2006 13:57:13 +0200 Subject: [PATCH] x86_64: remove unused gart header file include/asm-x86_64/gart-mapping.h is only ever used in arch/x86_64/kernel/setup.c and none of its contents are referenced. Looks to be leftover cruft not removed in the dma_ops patch. Signed-off-by: Jon Mason Signed-off-by: Andi Kleen Signed-off-by: Linus Torvalds --- arch/x86_64/kernel/setup.c | 1 - 1 file changed, 1 deletion(-) (limited to 'arch/x86_64/kernel') diff --git a/arch/x86_64/kernel/setup.c b/arch/x86_64/kernel/setup.c index a91f1d94f9e5..9dadb9a1db2a 100644 --- a/arch/x86_64/kernel/setup.c +++ b/arch/x86_64/kernel/setup.c @@ -65,7 +65,6 @@ #include #include #include -#include #include /* -- cgit v1.2.3 From 26a3c49cec96ffb9cfcc30dfa0cd05ccc25dcb3a Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Mon, 26 Jun 2006 13:57:16 +0200 Subject: [PATCH] x86_64: fix vector_lock deadlock in io_apic.c Fix a potential deadlock scenario introduced by io_apic.c's new vector_lock on i386 and x86_64. Found by the locking correctness validator. The patch was boot-tested on x86. 
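The fix (in the hunk at the end of this report) is to take vector_lock with spin_lock_irqsave()/spin_unlock_irqrestore() in assign_irq_vector(), making the lock hardirq-safe and removing the hard-safe -> hard-unsafe ordering.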
For details of the deadlock scenario, see the validator output: ====================================================== [ BUG: hard-safe -> hard-unsafe lock order detected! ] ------------------------------------------------------ idle/1 [HC0[0]:SC0[0]:HE0:SE1] is trying to acquire: (msi_lock){....}, at: [] startup_msi_irq_wo_maskbit+0x10/0x35 and this task is already holding: (&irq_desc[i].lock){++..}, at: [] probe_irq_on+0x36/0x107 which would create a new lock dependency: (&irq_desc[i].lock){++..} -> (msi_lock){....} but this new dependency connects a hard-irq-safe lock: (&irq_desc[i].lock){++..} ... which became hard-irq-safe at: [] lockdep_acquire+0x68/0x84 [] _spin_lock+0x21/0x2f [] __do_IRQ+0x3d/0x113 [] do_IRQ+0x8c/0xad to a hard-irq-unsafe lock: (vector_lock){--..} ... which became hard-irq-unsafe at: ... [] lockdep_acquire+0x68/0x84 [] _spin_lock+0x21/0x2f [] assign_irq_vector+0x34/0xc8 [] setup_IO_APIC+0x45a/0xcff [] smp_prepare_cpus+0x5ea/0x8aa [] init+0x32/0x2cb [] kernel_thread_helper+0x5/0xb which could potentially lead to deadlocks! other info that might help us debug this: 3 locks held by idle/1: #0: (port_mutex){--..}, at: [] uart_add_one_port+0x61/0x289 #1: (&state->mutex){--..}, at: [] uart_add_one_port+0x73/0x289 #2: (&irq_desc[i].lock){++..}, at: [] probe_irq_on+0x36/0x107 the hard-irq-safe lock's dependencies: -> (&irq_desc[i].lock){++..} ops: 9861 { initial-use at: [] lockdep_acquire+0x68/0x84 [] _spin_lock_irqsave+0x2a/0x3a [] setup_irq+0x9b/0x14d [] time_init_hook+0xf/0x11 [] time_init+0x44/0x46 [] start_kernel+0x191/0x38f [] 0xc0100210 in-hardirq-W at: [] lockdep_acquire+0x68/0x84 [] _spin_lock+0x21/0x2f [] __do_IRQ+0x3d/0x113 [] do_IRQ+0x8c/0xad in-softirq-W at: [] lockdep_acquire+0x68/0x84 [] _spin_lock+0x21/0x2f [] __do_IRQ+0x3d/0x113 [] do_IRQ+0x8c/0xad } ... key at: [] irq_desc_lock_type+0x0/0x20 -> (i8259A_lock){++..} ops: 5149 { initial-use at: [] lockdep_acquire+0x68/0x84 [] _spin_lock_irqsave+0x2a/0x3a [] init_8259A+0x11/0x8f [] init_ISA_irqs+0x12/0x4d [] pre_intr_init_hook+0x8/0xa [] init_IRQ+0xe/0x65 [] start_kernel+0x178/0x38f [] 0xc0100210 in-hardirq-W at: [] lockdep_acquire+0x68/0x84 [] _spin_lock_irqsave+0x2a/0x3a [] mask_and_ack_8259A+0x1b/0xcc [] __do_IRQ+0x4f/0x113 [] do_IRQ+0x8c/0xad in-softirq-W at: [] lockdep_acquire+0x68/0x84 [] _spin_lock_irqsave+0x2a/0x3a [] mask_and_ack_8259A+0x1b/0xcc [] __do_IRQ+0x4f/0x113 [] do_IRQ+0x8c/0xad } ... key at: [] i8259A_lock+0x14/0x40 ... acquired at: [] lockdep_acquire+0x68/0x84 [] _spin_lock_irqsave+0x2a/0x3a [] enable_8259A_irq+0x10/0x47 [] startup_8259A_irq+0x8/0xc [] setup_irq+0xe4/0x14d [] time_init_hook+0xf/0x11 [] time_init+0x44/0x46 [] start_kernel+0x191/0x38f [] 0xc0100210 -> (ioapic_lock){+...} ops: 122 { initial-use at: [] lockdep_acquire+0x68/0x84 [] _spin_lock_irqsave+0x2a/0x3a [] io_apic_get_version+0x16/0x55 [] mp_register_ioapic+0xc6/0x127 [] acpi_parse_ioapic+0x2d/0x39 [] acpi_table_parse_madt_family+0xb4/0x100 [] acpi_table_parse_madt+0x16/0x18 [] acpi_boot_init+0x132/0x251 [] setup_arch+0xd36/0xe37 [] start_kernel+0x66/0x38f [] 0xc0100210 in-hardirq-W at: [] lockdep_acquire+0x68/0x84 [] _spin_lock_irqsave+0x2a/0x3a [] mask_IO_APIC_irq+0x11/0x31 [] ack_edge_ioapic_vector+0x31/0x41 [] __do_IRQ+0x4f/0x113 [] do_IRQ+0x8c/0xad } ... 
key at: [] ioapic_lock+0x14/0x3c -> (i8259A_lock){++..} ops: 5149 { initial-use at: [] lockdep_acquire+0x68/0x84 [] _spin_lock_irqsave+0x2a/0x3a [] init_8259A+0x11/0x8f [] init_ISA_irqs+0x12/0x4d [] pre_intr_init_hook+0x8/0xa [] init_IRQ+0xe/0x65 [] start_kernel+0x178/0x38f [] 0xc0100210 in-hardirq-W at: [] lockdep_acquire+0x68/0x84 [] _spin_lock_irqsave+0x2a/0x3a [] mask_and_ack_8259A+0x1b/0xcc [] __do_IRQ+0x4f/0x113 [] do_IRQ+0x8c/0xad in-softirq-W at: [] lockdep_acquire+0x68/0x84 [] _spin_lock_irqsave+0x2a/0x3a [] mask_and_ack_8259A+0x1b/0xcc [] __do_IRQ+0x4f/0x113 [] do_IRQ+0x8c/0xad } ... key at: [] i8259A_lock+0x14/0x40 ... acquired at: [] lockdep_acquire+0x68/0x84 [] _spin_lock_irqsave+0x2a/0x3a [] disable_8259A_irq+0x10/0x47 [] startup_edge_ioapic_vector+0x31/0x58 [] setup_irq+0xe4/0x14d [] request_irq+0xda/0xf9 [] rtc_init+0x6a/0x1a7 [] init+0x14a/0x2cb [] kernel_thread_helper+0x5/0xb ... acquired at: [] lockdep_acquire+0x68/0x84 [] _spin_lock_irqsave+0x2a/0x3a [] mask_IO_APIC_irq+0x11/0x31 [] ack_edge_ioapic_vector+0x31/0x41 [] __do_IRQ+0x4f/0x113 [] do_IRQ+0x8c/0xad the hard-irq-unsafe lock's dependencies: -> (vector_lock){--..} ops: 31 { initial-use at: [] lockdep_acquire+0x68/0x84 [] _spin_lock+0x21/0x2f [] assign_irq_vector+0x34/0xc8 [] setup_IO_APIC+0x45a/0xcff [] smp_prepare_cpus+0x5ea/0x8aa [] init+0x32/0x2cb [] kernel_thread_helper+0x5/0xb softirq-on-W at: [] lockdep_acquire+0x68/0x84 [] _spin_lock+0x21/0x2f [] assign_irq_vector+0x34/0xc8 [] setup_IO_APIC+0x45a/0xcff [] smp_prepare_cpus+0x5ea/0x8aa [] init+0x32/0x2cb [] kernel_thread_helper+0x5/0xb hardirq-on-W at: [] lockdep_acquire+0x68/0x84 [] _spin_lock+0x21/0x2f [] assign_irq_vector+0x34/0xc8 [] setup_IO_APIC+0x45a/0xcff [] smp_prepare_cpus+0x5ea/0x8aa [] init+0x32/0x2cb [] kernel_thread_helper+0x5/0xb } ... 
key at: [] vector_lock+0x14/0x3c stack backtrace: [] show_trace+0xd/0xf [] dump_stack+0x17/0x19 [] check_usage+0x1f6/0x203 [] __lockdep_acquire+0x8c2/0xaa5 [] lockdep_acquire+0x68/0x84 [] _spin_lock_irqsave+0x2a/0x3a [] startup_msi_irq_wo_maskbit+0x10/0x35 [] probe_irq_on+0x44/0x107 [] serial8250_config_port+0x84b/0x986 [] uart_add_one_port+0x105/0x289 [] serial8250_init+0xc3/0x10a [] init+0x14a/0x2cb [] kernel_thread_helper+0x5/0xb Signed-off-by: Ingo Molnar Cc: Jan Beulich Cc: Andi Kleen Signed-off-by: Andrew Morton Signed-off-by: Andi Kleen Signed-off-by: Linus Torvalds --- arch/x86_64/kernel/io_apic.c | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) (limited to 'arch/x86_64/kernel') diff --git a/arch/x86_64/kernel/io_apic.c b/arch/x86_64/kernel/io_apic.c index 38a3ff30bde1..519cd4e6f9e7 100644 --- a/arch/x86_64/kernel/io_apic.c +++ b/arch/x86_64/kernel/io_apic.c @@ -836,14 +836,15 @@ u8 irq_vector[NR_IRQ_VECTORS] __read_mostly = { FIRST_DEVICE_VECTOR , 0 }; int assign_irq_vector(int irq) { static int current_vector = FIRST_DEVICE_VECTOR, offset = 0; + unsigned long flags; int vector; BUG_ON(irq != AUTO_ASSIGN && (unsigned)irq >= NR_IRQ_VECTORS); - spin_lock(&vector_lock); + spin_lock_irqsave(&vector_lock, flags); if (irq != AUTO_ASSIGN && IO_APIC_VECTOR(irq) > 0) { - spin_unlock(&vector_lock); + spin_unlock_irqrestore(&vector_lock, flags); return IO_APIC_VECTOR(irq); } next: @@ -862,7 +863,7 @@ next: if (irq != AUTO_ASSIGN) IO_APIC_VECTOR(irq) = vector; - spin_unlock(&vector_lock); + spin_unlock_irqrestore(&vector_lock, flags); return vector; } -- cgit v1.2.3 From 4d9bc79cd28b779610d9590b3a96a28a0f64a25a Mon Sep 17 00:00:00 2001 From: Andi Kleen Date: Mon, 26 Jun 2006 13:57:19 +0200 Subject: [PATCH] x86_64: Make sure is_compat_task works early Previously it would only work in the first 32bit system call, not during early process setup. Signed-off-by: Andi Kleen Signed-off-by: Linus Torvalds --- arch/x86_64/kernel/process.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) (limited to 'arch/x86_64/kernel') diff --git a/arch/x86_64/kernel/process.c b/arch/x86_64/kernel/process.c index 52c03f6c7731..dcb77b462348 100644 --- a/arch/x86_64/kernel/process.c +++ b/arch/x86_64/kernel/process.c @@ -364,8 +364,11 @@ void flush_thread(void) struct task_struct *tsk = current; struct thread_info *t = current_thread_info(); - if (t->flags & _TIF_ABI_PENDING) + if (t->flags & _TIF_ABI_PENDING) { t->flags ^= (_TIF_ABI_PENDING | _TIF_IA32); + if (t->flags & _TIF_IA32) + current_thread_info()->status |= TS_COMPAT; + } tsk->thread.debugreg0 = 0; tsk->thread.debugreg1 = 0; -- cgit v1.2.3 From a813ce432f27c4f5011c7b5ac9d2bbbfeb41d9a7 Mon Sep 17 00:00:00 2001 From: Andi Kleen Date: Mon, 26 Jun 2006 13:57:22 +0200 Subject: [PATCH] x86_64: Rename IOMMU option, fix help and mark option embedded. - Rename the GART_IOMMU option to IOMMU to make clear it's not just for AMD - Rewrite the help text to better emphasize this fact - Make it an embedded option because too many people get it wrong. To my astonishment I discovered the aacraid driver tests this symbol directly. This looks quite broken to me - it's an internal implementation detail of the PCI DMA API. Can the maintainer please clarify what this test was intended to do?
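(The portable alternative for drivers is to query DMA capabilities through the DMA API itself - e.g. pci_set_dma_mask()/dma_supported() - rather than testing the platform's IOMMU configuration symbol.)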
Cc: linux-scsi@vger.kernel.org Cc: alan@redhat.com Cc: markh@osdl.org Signed-off-by: Andi Kleen Signed-off-by: Linus Torvalds --- arch/x86_64/kernel/Makefile | 2 +- arch/x86_64/kernel/io_apic.c | 2 +- arch/x86_64/kernel/pci-dma.c | 2 +- arch/x86_64/kernel/setup.c | 2 +- 4 files changed, 4 insertions(+), 4 deletions(-) (limited to 'arch/x86_64/kernel') diff --git a/arch/x86_64/kernel/Makefile b/arch/x86_64/kernel/Makefile index f927d11065fe..fd106bdddd6d 100644 --- a/arch/x86_64/kernel/Makefile +++ b/arch/x86_64/kernel/Makefile @@ -28,7 +28,7 @@ obj-$(CONFIG_PM) += suspend.o obj-$(CONFIG_SOFTWARE_SUSPEND) += suspend_asm.o obj-$(CONFIG_CPU_FREQ) += cpufreq/ obj-$(CONFIG_EARLY_PRINTK) += early_printk.o -obj-$(CONFIG_GART_IOMMU) += pci-gart.o aperture.o +obj-$(CONFIG_IOMMU) += pci-gart.o aperture.o obj-$(CONFIG_SWIOTLB) += pci-swiotlb.o obj-$(CONFIG_KPROBES) += kprobes.o obj-$(CONFIG_X86_PM_TIMER) += pmtimer.o diff --git a/arch/x86_64/kernel/io_apic.c b/arch/x86_64/kernel/io_apic.c index 519cd4e6f9e7..c768d8a036d0 100644 --- a/arch/x86_64/kernel/io_apic.c +++ b/arch/x86_64/kernel/io_apic.c @@ -319,7 +319,7 @@ void __init check_ioapic(void) vendor &= 0xffff; switch (vendor) { case PCI_VENDOR_ID_VIA: -#ifdef CONFIG_GART_IOMMU +#ifdef CONFIG_IOMMU if ((end_pfn > MAX_DMA32_PFN || force_iommu) && !iommu_aperture_allowed) { diff --git a/arch/x86_64/kernel/pci-dma.c b/arch/x86_64/kernel/pci-dma.c index 117bcebe9ebc..33926c367250 100644 --- a/arch/x86_64/kernel/pci-dma.c +++ b/arch/x86_64/kernel/pci-dma.c @@ -266,7 +266,7 @@ __init int iommu_setup(char *p) swiotlb = 1; #endif -#ifdef CONFIG_GART_IOMMU +#ifdef CONFIG_IOMMU gart_parse_options(p); #endif diff --git a/arch/x86_64/kernel/setup.c b/arch/x86_64/kernel/setup.c index 9dadb9a1db2a..816203d57e11 100644 --- a/arch/x86_64/kernel/setup.c +++ b/arch/x86_64/kernel/setup.c @@ -702,7 +702,7 @@ void __init setup_arch(char **cmdline_p) e820_setup_gap(); -#ifdef CONFIG_GART_IOMMU +#ifdef CONFIG_IOMMU iommu_hole_init(); #endif -- cgit v1.2.3 From 2b28592b07223d7fc0691ce3fe57d495dc9cbe3a Mon Sep 17 00:00:00 2001 From: "bibo,mao" Date: Mon, 26 Jun 2006 13:57:25 +0200 Subject: [PATCH] x86_64: MSI missing one entry handler On the x86_64 architecture, if a device driver using MSI is given vector 0xee by assign_irq_vector(), the system will crash when that interrupt occurs, because the 0xee interrupt entry is empty. This patch fixes that. This patch is based on 2.6.17-rc6.
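The arithmetic behind the fix, as a standalone illustration (the vector layout is an assumption based on this kernel's hw_irq.h, where the system vectors start at 0xef):

    #include <stdio.h>

    /* Illustrative only, not kernel code.  Device vectors end just
     * below the first system vector (assumed 0xef), so the 0xe0 block
     * must provide stubs for 0xe0..0xee: fifteen entries.
     * BUILD_14_IRQS(0xe) emitted only fourteen, leaving vector 0xee
     * with no handler entry to dispatch through. */
    int main(void)
    {
            int block_start = 0xe0;
            int first_system_vector = 0xef;  /* assumption */

            printf("stubs needed in the 0xe block: %d\n",
                   first_system_vector - block_start);  /* prints 15 */
            return 0;
    }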
Signed-off-by: Andi Kleen Signed-off-by: Linus Torvalds --- arch/x86_64/kernel/i8259.c | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) (limited to 'arch/x86_64/kernel') diff --git a/arch/x86_64/kernel/i8259.c b/arch/x86_64/kernel/i8259.c index 5ecd34ab8c2b..9b1a4e147321 100644 --- a/arch/x86_64/kernel/i8259.c +++ b/arch/x86_64/kernel/i8259.c @@ -44,11 +44,11 @@ BI(x,8) BI(x,9) BI(x,a) BI(x,b) \ BI(x,c) BI(x,d) BI(x,e) BI(x,f) -#define BUILD_14_IRQS(x) \ +#define BUILD_15_IRQS(x) \ BI(x,0) BI(x,1) BI(x,2) BI(x,3) \ BI(x,4) BI(x,5) BI(x,6) BI(x,7) \ BI(x,8) BI(x,9) BI(x,a) BI(x,b) \ - BI(x,c) BI(x,d) + BI(x,c) BI(x,d) BI(x,e) /* * ISA PIC or low IO-APIC triggered (INTA-cycle or APIC) interrupts: @@ -73,13 +73,13 @@ BUILD_16_IRQS(0x8) BUILD_16_IRQS(0x9) BUILD_16_IRQS(0xa) BUILD_16_IRQS(0xb) BUILD_16_IRQS(0xc) BUILD_16_IRQS(0xd) #ifdef CONFIG_PCI_MSI - BUILD_14_IRQS(0xe) + BUILD_15_IRQS(0xe) #endif #endif #undef BUILD_16_IRQS -#undef BUILD_14_IRQS +#undef BUILD_15_IRQS #undef BI @@ -92,11 +92,11 @@ BUILD_16_IRQS(0xc) BUILD_16_IRQS(0xd) IRQ(x,8), IRQ(x,9), IRQ(x,a), IRQ(x,b), \ IRQ(x,c), IRQ(x,d), IRQ(x,e), IRQ(x,f) -#define IRQLIST_14(x) \ +#define IRQLIST_15(x) \ IRQ(x,0), IRQ(x,1), IRQ(x,2), IRQ(x,3), \ IRQ(x,4), IRQ(x,5), IRQ(x,6), IRQ(x,7), \ IRQ(x,8), IRQ(x,9), IRQ(x,a), IRQ(x,b), \ - IRQ(x,c), IRQ(x,d) + IRQ(x,c), IRQ(x,d), IRQ(x,e) void (*interrupt[NR_IRQS])(void) = { IRQLIST_16(0x0), @@ -108,7 +108,7 @@ void (*interrupt[NR_IRQS])(void) = { IRQLIST_16(0xc), IRQLIST_16(0xd) #ifdef CONFIG_PCI_MSI - , IRQLIST_14(0xe) + , IRQLIST_15(0xe) #endif #endif -- cgit v1.2.3 From b538ed278b80641ee6d7b75497d5e14de1bb1137 Mon Sep 17 00:00:00 2001 From: Jan Beulich Date: Mon, 26 Jun 2006 13:57:32 +0200 Subject: [PATCH] x86_64: reliable stack trace support (x86-64) These are the x86_64-specific pieces to enable reliable stack traces. The only restriction with this is that it currently cannot unwind across the interrupt->normal stack boundary, as that transition is lacking proper annotation. 
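For contrast with the annotation-based unwinder this patch adds, the following minimal user-space sketch shows a frame-pointer walk, the other way to get exact traces; the DWARF unwinder achieves the same precision from .eh_frame data without reserving %rbp (illustrative only, assumes x86-64 built with -O0 -fno-omit-frame-pointer):

    #include <stdio.h>

    /* Walk the chain of saved frame pointers: at each frame, slot 0
     * holds the caller's %rbp and slot 1 the return address.  The ABI
     * zeroes %rbp at process entry, which terminates the walk; the
     * depth cap is extra caution against foreign frames. */
    static void show_frames(void)
    {
            unsigned long *frame = __builtin_frame_address(0);
            int depth = 0;

            while (frame && depth++ < 16) {
                    printf("frame %p, return address %p\n",
                           (void *)frame, (void *)frame[1]);
                    frame = (unsigned long *)frame[0];
            }
    }

    int main(void)
    {
            show_frames();
            return 0;
    }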
Signed-off-by: Jan Beulich Signed-off-by: Andi Kleen Signed-off-by: Linus Torvalds --- arch/x86_64/kernel/entry.S | 33 ++++++++++++++++++++++++ arch/x86_64/kernel/process.c | 2 +- arch/x86_64/kernel/traps.c | 54 +++++++++++++++++++++++++++++++++++----- arch/x86_64/kernel/vmlinux.lds.S | 9 +++++++ 4 files changed, 91 insertions(+), 7 deletions(-) (limited to 'arch/x86_64/kernel') diff --git a/arch/x86_64/kernel/entry.S b/arch/x86_64/kernel/entry.S index 9999d703b6c4..6c68beec51b3 100644 --- a/arch/x86_64/kernel/entry.S +++ b/arch/x86_64/kernel/entry.S @@ -1089,3 +1089,36 @@ ENTRY(call_softirq) ret CFI_ENDPROC ENDPROC(call_softirq) + +#ifdef CONFIG_STACK_UNWIND +ENTRY(arch_unwind_init_running) + CFI_STARTPROC + movq %r15, R15(%rdi) + movq %r14, R14(%rdi) + xchgq %rsi, %rdx + movq %r13, R13(%rdi) + movq %r12, R12(%rdi) + xorl %eax, %eax + movq %rbp, RBP(%rdi) + movq %rbx, RBX(%rdi) + movq (%rsp), %rcx + movq %rax, R11(%rdi) + movq %rax, R10(%rdi) + movq %rax, R9(%rdi) + movq %rax, R8(%rdi) + movq %rax, RAX(%rdi) + movq %rax, RCX(%rdi) + movq %rax, RDX(%rdi) + movq %rax, RSI(%rdi) + movq %rax, RDI(%rdi) + movq %rax, ORIG_RAX(%rdi) + movq %rcx, RIP(%rdi) + leaq 8(%rsp), %rcx + movq $__KERNEL_CS, CS(%rdi) + movq %rax, EFLAGS(%rdi) + movq %rcx, RSP(%rdi) + movq $__KERNEL_DS, SS(%rdi) + jmpq *%rdx + CFI_ENDPROC +ENDPROC(arch_unwind_init_running) +#endif diff --git a/arch/x86_64/kernel/process.c b/arch/x86_64/kernel/process.c index dcb77b462348..d6fa41459c80 100644 --- a/arch/x86_64/kernel/process.c +++ b/arch/x86_64/kernel/process.c @@ -334,7 +334,7 @@ void show_regs(struct pt_regs *regs) { printk("CPU %d:", smp_processor_id()); __show_regs(regs); - show_trace(®s->rsp); + show_trace(NULL, regs, (void *)(regs + 1)); } /* diff --git a/arch/x86_64/kernel/traps.c b/arch/x86_64/kernel/traps.c index 9f8f1eff4a6c..eb1534ff1f5f 100644 --- a/arch/x86_64/kernel/traps.c +++ b/arch/x86_64/kernel/traps.c @@ -29,6 +29,7 @@ #include #include #include +#include #include #include @@ -39,7 +40,7 @@ #include #include #include - +#include #include #include #include @@ -189,6 +190,23 @@ static unsigned long *in_exception_stack(unsigned cpu, unsigned long stack, return NULL; } +static void show_trace_unwind(struct unwind_frame_info *info, void *context) +{ + int i = 11; + + while (unwind(info) == 0 && UNW_PC(info)) { + if (i > 50) { + printk("\n "); + i = 7; + } else + i += printk(" "); + i += printk_address(UNW_PC(info)); + if (arch_unw_user_mode(info)) + break; + } + printk("\n"); +} + /* * x86-64 can have upto three kernel stacks: * process stack @@ -196,15 +214,34 @@ static unsigned long *in_exception_stack(unsigned cpu, unsigned long stack, * severe exception (double fault, nmi, stack fault, debug, mce) hardware stack */ -void show_trace(unsigned long *stack) +void show_trace(struct task_struct *tsk, struct pt_regs *regs, unsigned long * stack) { const unsigned cpu = safe_smp_processor_id(); unsigned long *irqstack_end = (unsigned long *)cpu_pda(cpu)->irqstackptr; int i; unsigned used = 0; + struct unwind_frame_info info; printk("\nCall Trace:"); + if (!tsk) + tsk = current; + + if (regs) { + if (unwind_init_frame_info(&info, tsk, regs) == 0) { + show_trace_unwind(&info, NULL); + return; + } + } else if (tsk == current) { + if (unwind_init_running(&info, show_trace_unwind, NULL) == 0) + return; + } else { + if (unwind_init_blocked(&info, tsk) == 0) { + show_trace_unwind(&info, NULL); + return; + } + } + #define HANDLE_STACK(cond) \ do while (cond) { \ unsigned long addr = *stack++; \ @@ -262,7 +299,7 @@ void 
show_trace(unsigned long *stack) printk("\n"); } -void show_stack(struct task_struct *tsk, unsigned long * rsp) +static void _show_stack(struct task_struct *tsk, struct pt_regs *regs, unsigned long * rsp) { unsigned long *stack; int i; @@ -296,7 +333,12 @@ void show_stack(struct task_struct *tsk, unsigned long * rsp) printk("%016lx ", *stack++); touch_nmi_watchdog(); } - show_trace((unsigned long *)rsp); + show_trace(tsk, regs, rsp); +} + +void show_stack(struct task_struct *tsk, unsigned long * rsp) +{ + _show_stack(tsk, NULL, rsp); } /* @@ -305,7 +347,7 @@ void show_stack(struct task_struct *tsk, unsigned long * rsp) void dump_stack(void) { unsigned long dummy; - show_trace(&dummy); + show_trace(NULL, NULL, &dummy); } EXPORT_SYMBOL(dump_stack); @@ -332,7 +374,7 @@ void show_registers(struct pt_regs *regs) if (in_kernel) { printk("Stack: "); - show_stack(NULL, (unsigned long*)rsp); + _show_stack(NULL, regs, (unsigned long*)rsp); printk("\nCode: "); if (regs->rip < PAGE_OFFSET) diff --git a/arch/x86_64/kernel/vmlinux.lds.S b/arch/x86_64/kernel/vmlinux.lds.S index 5968c2415da9..1c6a5f322919 100644 --- a/arch/x86_64/kernel/vmlinux.lds.S +++ b/arch/x86_64/kernel/vmlinux.lds.S @@ -45,6 +45,15 @@ SECTIONS RODATA +#ifdef CONFIG_STACK_UNWIND + . = ALIGN(8); + .eh_frame : AT(ADDR(.eh_frame) - LOAD_OFFSET) { + __start_unwind = .; + *(.eh_frame) + __end_unwind = .; + } +#endif + /* Data */ .data : AT(ADDR(.data) - LOAD_OFFSET) { *(.data) -- cgit v1.2.3 From 1de9c3f67ec6fddfb85325bab551f4094f55f410 Mon Sep 17 00:00:00 2001 From: Jan Beulich Date: Mon, 26 Jun 2006 13:57:35 +0200 Subject: [PATCH] x86_64: reliable stack trace support (x86-64 IRQ stack Change the switching to/from the IRQ stack so that unwind annotations can be added for it without requiring CFA expressions. AK: I cleaned it up a bit, making it unconditional and removing the obsolete DEBUG_INFO full frame code. Signed-off-by: Jan Beulich Signed-off-by: Andi Kleen Signed-off-by: Linus Torvalds --- arch/x86_64/kernel/entry.S | 33 ++++++++------------------------- 1 file changed, 8 insertions(+), 25 deletions(-) (limited to 'arch/x86_64/kernel') diff --git a/arch/x86_64/kernel/entry.S b/arch/x86_64/kernel/entry.S index 6c68beec51b3..4f407ef6b865 100644 --- a/arch/x86_64/kernel/entry.S +++ b/arch/x86_64/kernel/entry.S @@ -473,29 +473,18 @@ END(stub_rt_sigreturn) /* 0(%rsp): interrupt number */ .macro interrupt func cld -#ifdef CONFIG_DEBUG_INFO - SAVE_ALL - movq %rsp,%rdi - /* - * Setup a stack frame pointer. This allows gdb to trace - * back to the original stack. - */ - movq %rsp,%rbp - CFI_DEF_CFA_REGISTER rbp -#else SAVE_ARGS leaq -ARGOFFSET(%rsp),%rdi # arg1 for handler -#endif + pushq %rbp + CFI_ADJUST_CFA_OFFSET 8 + CFI_REL_OFFSET rbp, 0 + movq %rsp,%rbp + CFI_DEF_CFA_REGISTER rbp testl $3,CS(%rdi) je 1f swapgs 1: incl %gs:pda_irqcount # RED-PEN should check preempt count - movq %gs:pda_irqstackptr,%rax - cmoveq %rax,%rsp /*todo This needs CFI annotation! */ - pushq %rdi # save old stack -#ifndef CONFIG_DEBUG_INFO - CFI_ADJUST_CFA_OFFSET 8 -#endif + cmoveq %gs:pda_irqstackptr,%rsp call \func .endm @@ -504,17 +493,11 @@ ENTRY(common_interrupt) interrupt do_IRQ /* 0(%rsp): oldrsp-ARGOFFSET */ ret_from_intr: - popq %rdi -#ifndef CONFIG_DEBUG_INFO - CFI_ADJUST_CFA_OFFSET -8 -#endif cli decl %gs:pda_irqcount -#ifdef CONFIG_DEBUG_INFO - movq RBP(%rdi),%rbp + leaveq CFI_DEF_CFA_REGISTER rsp -#endif - leaq ARGOFFSET(%rdi),%rsp /*todo This needs CFI annotation! 
*/ + CFI_ADJUST_CFA_OFFSET -8 exit_intr: GET_THREAD_INFO(%rcx) testl $3,CS-ARGOFFSET(%rsp) -- cgit v1.2.3 From dffead4e421e289c8434351400d24fd35723e874 Mon Sep 17 00:00:00 2001 From: Jan Beulich Date: Mon, 26 Jun 2006 13:57:38 +0200 Subject: [PATCH] x86_64: reliable stack trace support (x86-64 syscall Adjust the CFA offset for 64- and 32-bit syscall entries so that the five slots pre-subtracted from the stack pointer do not appear to reside outside of the current frame. Signed-off-by: Jan Beulich Signed-off-by: Andi Kleen Signed-off-by: Linus Torvalds --- arch/x86_64/kernel/entry.S | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86_64/kernel') diff --git a/arch/x86_64/kernel/entry.S b/arch/x86_64/kernel/entry.S index 4f407ef6b865..7290e72b9a34 100644 --- a/arch/x86_64/kernel/entry.S +++ b/arch/x86_64/kernel/entry.S @@ -189,7 +189,7 @@ END(ret_from_fork) ENTRY(system_call) CFI_STARTPROC simple - CFI_DEF_CFA rsp,0 + CFI_DEF_CFA rsp,PDA_STACKOFFSET CFI_REGISTER rip,rcx /*CFI_REGISTER rflags,r11*/ swapgs -- cgit v1.2.3 From c33bd9aac0597eeedaaa01ea5aafe456894b2f2b Mon Sep 17 00:00:00 2001 From: Jan Beulich Date: Mon, 26 Jun 2006 13:57:47 +0200 Subject: [PATCH] i386/x86-64: fall back to old-style call trace if no unwinding If no unwinding is possible at all for a certain exception instance, fall back to the old style call trace instead of not showing any trace at all. Also, allow setting the stack trace mode at the command line. Signed-off-by: Jan Beulich Signed-off-by: Andi Kleen Signed-off-by: Linus Torvalds --- arch/x86_64/kernel/traps.c | 51 +++++++++++++++++++++++++++++++--------------- 1 file changed, 35 insertions(+), 16 deletions(-) (limited to 'arch/x86_64/kernel') diff --git a/arch/x86_64/kernel/traps.c b/arch/x86_64/kernel/traps.c index eb1534ff1f5f..bd0891f4c2c7 100644 --- a/arch/x86_64/kernel/traps.c +++ b/arch/x86_64/kernel/traps.c @@ -107,6 +107,7 @@ static inline void preempt_conditional_cli(struct pt_regs *regs) } static int kstack_depth_to_print = 10; +static int call_trace = 1; #ifdef CONFIG_KALLSYMS #include @@ -190,11 +191,12 @@ static unsigned long *in_exception_stack(unsigned cpu, unsigned long stack, return NULL; } -static void show_trace_unwind(struct unwind_frame_info *info, void *context) +static int show_trace_unwind(struct unwind_frame_info *info, void *context) { - int i = 11; + int i = 11, n = 0; while (unwind(info) == 0 && UNW_PC(info)) { + ++n; if (i > 50) { printk("\n "); i = 7; @@ -205,6 +207,7 @@ static void show_trace_unwind(struct unwind_frame_info *info, void *context) break; } printk("\n"); + return n; } /* @@ -218,27 +221,32 @@ void show_trace(struct task_struct *tsk, struct pt_regs *regs, unsigned long * s { const unsigned cpu = safe_smp_processor_id(); unsigned long *irqstack_end = (unsigned long *)cpu_pda(cpu)->irqstackptr; - int i; + int i = 11; unsigned used = 0; - struct unwind_frame_info info; printk("\nCall Trace:"); if (!tsk) tsk = current; - if (regs) { - if (unwind_init_frame_info(&info, tsk, regs) == 0) { - show_trace_unwind(&info, NULL); - return; + if (call_trace >= 0) { + int unw_ret = 0; + struct unwind_frame_info info; + + if (regs) { + if (unwind_init_frame_info(&info, tsk, regs) == 0) + unw_ret = show_trace_unwind(&info, NULL); + } else if (tsk == current) + unw_ret = unwind_init_running(&info, show_trace_unwind, NULL); + else { + if (unwind_init_blocked(&info, tsk) == 0) + unw_ret = show_trace_unwind(&info, NULL); } - } else if (tsk == current) { - if (unwind_init_running(&info, show_trace_unwind, NULL) == 0) - 
return; - } else { - if (unwind_init_blocked(&info, tsk) == 0) { - show_trace_unwind(&info, NULL); - return; + if (unw_ret > 0) { + if (call_trace > 0) + return; + printk("Legacy call trace:"); + i = 18; } } @@ -264,7 +272,7 @@ void show_trace(struct task_struct *tsk, struct pt_regs *regs, unsigned long * s } \ } while (0) - for(i = 11; ; ) { + for(; ; ) { const char *id; unsigned long *estack_end; estack_end = in_exception_stack(cpu, (unsigned long)stack, @@ -1052,3 +1060,14 @@ static int __init kstack_setup(char *s) } __setup("kstack=", kstack_setup); +static int __init call_trace_setup(char *s) +{ + if (strcmp(s, "old") == 0) + call_trace = -1; + else if (strcmp(s, "both") == 0) + call_trace = 0; + else if (strcmp(s, "new") == 0) + call_trace = 1; + return 1; +} +__setup("call_trace=", call_trace_setup); -- cgit v1.2.3 From 46d13a384bc695ec61458e5dcbac1eee6d623a9b Mon Sep 17 00:00:00 2001 From: Jan Beulich Date: Mon, 26 Jun 2006 13:57:59 +0200 Subject: [PATCH] x86_64: use halt() instead of raw inline assembly Use abstractions whenever possible. Signed-off-by: Jan Beulich Signed-off-by: Andi Kleen Signed-off-by: Linus Torvalds --- arch/x86_64/kernel/crash.c | 2 +- arch/x86_64/kernel/smp.c | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) (limited to 'arch/x86_64/kernel') diff --git a/arch/x86_64/kernel/crash.c b/arch/x86_64/kernel/crash.c index 4e6c3b729e39..ec1c7431d5af 100644 --- a/arch/x86_64/kernel/crash.c +++ b/arch/x86_64/kernel/crash.c @@ -111,7 +111,7 @@ static int crash_nmi_callback(struct pt_regs *regs, int cpu) atomic_dec(&waiting_for_crash_ipi); /* Assume hlt works */ for(;;) - asm("hlt"); + halt(); return 1; } diff --git a/arch/x86_64/kernel/smp.c b/arch/x86_64/kernel/smp.c index 4a6628b14d99..fe906ba1cf45 100644 --- a/arch/x86_64/kernel/smp.c +++ b/arch/x86_64/kernel/smp.c @@ -460,7 +460,7 @@ static void smp_really_stop_cpu(void *dummy) { smp_stop_cpu(); for (;;) - asm("hlt"); + halt(); } void smp_send_stop(void) -- cgit v1.2.3 From 2bbc419f9d51b44f3fdeea12c5b786bdace82b8e Mon Sep 17 00:00:00 2001 From: Rohit Seth Date: Mon, 26 Jun 2006 13:58:02 +0200 Subject: [PATCH] x86_64: Change assembly to use regular cpuid_count macro Minor cleanup patch: Replacing the asm statement with cpuid_count macro(which already provides the same functionality). Signed-off-by: Rohit Seth Signed-off-by: Andi Kleen Signed-off-by: Linus Torvalds --- arch/x86_64/kernel/setup.c | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) (limited to 'arch/x86_64/kernel') diff --git a/arch/x86_64/kernel/setup.c b/arch/x86_64/kernel/setup.c index 816203d57e11..a9de8f02671f 100644 --- a/arch/x86_64/kernel/setup.c +++ b/arch/x86_64/kernel/setup.c @@ -957,15 +957,12 @@ static void __cpuinit detect_ht(struct cpuinfo_x86 *c) */ static int __cpuinit intel_num_cpu_cores(struct cpuinfo_x86 *c) { - unsigned int eax; + unsigned int eax, t; if (c->cpuid_level < 4) return 1; - __asm__("cpuid" - : "=a" (eax) - : "0" (4), "c" (0) - : "bx", "dx"); + cpuid_count(4, 0, &eax, &t, &t, &t); if (eax & 0x1f) return ((eax >> 26) + 1); -- cgit v1.2.3 From 8d4f6b93a4aaa6b56b600cd1165c971f4395e4b3 Mon Sep 17 00:00:00 2001 From: Jon Mason Date: Mon, 26 Jun 2006 13:58:05 +0200 Subject: [PATCH] x86_64: Calgary IOMMU - introduce iommu_detected swiotlb relies on the gart specific iommu_aperture variable to know if we discovered a hardware IOMMU before swiotlb initialization. Introduce iommu_detected to do the same thing, but in a HW IOMMU neutral manner, in preparation for adding the Calgary HW IOMMU. 
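Condensed, the hand-off this flag enables looks like the following standalone mock (the flag names mirror the diff below; the surrounding kernel logic is abbreviated):

    #include <stdio.h>

    /* Standalone mock, not kernel code: detectors raise iommu_detected;
     * swiotlb volunteers only when no HW IOMMU claimed the machine. */
    static int iommu_detected;  /* any HW IOMMU found? */
    static int no_iommu;        /* iommu=off on the command line */
    static int swiotlb;

    static void detect_gart_mock(int found_k8_northbridge)
    {
            if (found_k8_northbridge)
                    iommu_detected = 1;
    }

    static void pci_swiotlb_init_mock(int memory_above_4g)
    {
            if (!iommu_detected && !no_iommu && memory_above_4g)
                    swiotlb = 1;
    }

    int main(void)
    {
            detect_gart_mock(1);
            pci_swiotlb_init_mock(1);
            printf("swiotlb = %d (GART detected, so swiotlb stays off)\n",
                   swiotlb);
            return 0;
    }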
Signed-Off-By: Muli Ben-Yehuda Signed-Off-By: Jon Mason Signed-off-by: Andi Kleen Signed-off-by: Linus Torvalds --- arch/x86_64/kernel/aperture.c | 1 + arch/x86_64/kernel/pci-dma.c | 3 +++ arch/x86_64/kernel/pci-gart.c | 4 ++++ arch/x86_64/kernel/pci-swiotlb.c | 2 +- 4 files changed, 9 insertions(+), 1 deletion(-) (limited to 'arch/x86_64/kernel') diff --git a/arch/x86_64/kernel/aperture.c b/arch/x86_64/kernel/aperture.c index 9ea723bab16c..a195ef06ec55 100644 --- a/arch/x86_64/kernel/aperture.c +++ b/arch/x86_64/kernel/aperture.c @@ -209,6 +209,7 @@ void __init iommu_hole_init(void) if (!early_is_k8_nb(read_pci_config(0, num, 3, 0x00))) continue; + iommu_detected = 1; iommu_aperture = 1; aper_order = (read_pci_config(0, num, 3, 0x90) >> 1) & 7; diff --git a/arch/x86_64/kernel/pci-dma.c b/arch/x86_64/kernel/pci-dma.c index 33926c367250..7edd1a40fab3 100644 --- a/arch/x86_64/kernel/pci-dma.c +++ b/arch/x86_64/kernel/pci-dma.c @@ -33,6 +33,9 @@ int panic_on_overflow __read_mostly = 0; int force_iommu __read_mostly= 0; #endif +/* Set this to 1 if there is a HW IOMMU in the system */ +int iommu_detected __read_mostly = 0; + /* Dummy device used for NULL arguments (normally ISA). Better would be probably a smaller DMA mask, but this is bug-to-bug compatible to i386. */ diff --git a/arch/x86_64/kernel/pci-gart.c b/arch/x86_64/kernel/pci-gart.c index 82a346e6e2e4..4f67957d2b42 100644 --- a/arch/x86_64/kernel/pci-gart.c +++ b/arch/x86_64/kernel/pci-gart.c @@ -597,6 +597,10 @@ static int __init pci_iommu_init(void) if (swiotlb) return -ENODEV; + /* Did we detect a different HW IOMMU? */ + if (iommu_detected && !iommu_aperture) + return -1; + if (no_iommu || (!force_iommu && end_pfn <= MAX_DMA32_PFN) || !iommu_aperture || diff --git a/arch/x86_64/kernel/pci-swiotlb.c b/arch/x86_64/kernel/pci-swiotlb.c index 990ed67896f2..ebdb77fe2057 100644 --- a/arch/x86_64/kernel/pci-swiotlb.c +++ b/arch/x86_64/kernel/pci-swiotlb.c @@ -31,7 +31,7 @@ struct dma_mapping_ops swiotlb_dma_ops = { void pci_swiotlb_init(void) { /* don't initialize swiotlb if iommu=off (no_iommu=1) */ - if (!iommu_aperture && !no_iommu && + if (!iommu_detected && !no_iommu && (end_pfn > MAX_DMA32_PFN || force_iommu)) swiotlb = 1; if (swiotlb) { -- cgit v1.2.3 From 0dc243ae10c8309c170a3af9f1adad1924a9f217 Mon Sep 17 00:00:00 2001 From: Jon Mason Date: Mon, 26 Jun 2006 13:58:11 +0200 Subject: [PATCH] x86_64: Calgary IOMMU - IOMMU abstractions This patch creates a new interface for IOMMUs by adding a centralized location for IOMMU allocation (for translation tables/apertures) and IOMMU initialization. In creating these, code was moved around for abstraction, uniformity, and conciseness. Take note of the move of the iommu_setup bootarg parsing code to __setup. This is enabled by moving back the location of the aperture allocation/detection to mem init (which while ugly, was already the location of the swiotlb_init). While a slight departure from the previous patch, I believe this preserves the true intention of the previous versions of the patch that changed this code. It also makes the addition of the upcoming calgary code much cleaner than previous patches. [AK: Removed one broken change.
iommu_setup still has to be called early] Signed-off-by: Muli Ben-Yehuda Signed-off-by: Jon Mason Signed-off-by: Andi Kleen Signed-off-by: Linus Torvalds --- arch/x86_64/kernel/pci-dma.c | 29 +++++++++++++++++++++++++++++ arch/x86_64/kernel/pci-gart.c | 13 ++++++------- arch/x86_64/kernel/setup.c | 5 ----- 3 files changed, 35 insertions(+), 12 deletions(-) (limited to 'arch/x86_64/kernel') diff --git a/arch/x86_64/kernel/pci-dma.c b/arch/x86_64/kernel/pci-dma.c index 7edd1a40fab3..a45844c7e3a3 100644 --- a/arch/x86_64/kernel/pci-dma.c +++ b/arch/x86_64/kernel/pci-dma.c @@ -279,3 +279,32 @@ __init int iommu_setup(char *p) } return 1; } +__setup("iommu=", iommu_setup); + +void __init pci_iommu_alloc(void) +{ + /* + * The order of these functions is important for + * fall-back/fail-over reasons + */ +#ifdef CONFIG_IOMMU + iommu_hole_init(); +#endif + +#ifdef CONFIG_SWIOTLB + pci_swiotlb_init(); +#endif +} + +static int __init pci_iommu_init(void) +{ +#ifdef CONFIG_IOMMU + gart_iommu_init(); +#endif + + no_iommu_init(); + return 0; +} + +/* Must execute after PCI subsystem */ +fs_initcall(pci_iommu_init); diff --git a/arch/x86_64/kernel/pci-gart.c b/arch/x86_64/kernel/pci-gart.c index 4f67957d2b42..9a93954bed37 100644 --- a/arch/x86_64/kernel/pci-gart.c +++ b/arch/x86_64/kernel/pci-gart.c @@ -571,7 +571,7 @@ static struct dma_mapping_ops gart_dma_ops = { .unmap_sg = gart_unmap_sg, }; -static int __init pci_iommu_init(void) +void __init gart_iommu_init(void) { struct agp_kern_info info; unsigned long aper_size; @@ -581,7 +581,7 @@ static int __init pci_iommu_init(void) if (cache_k8_northbridges() < 0 || num_k8_northbridges == 0) { printk(KERN_INFO "PCI-GART: No AMD northbridge found.\n"); - return -ENODEV; + return; } #ifndef CONFIG_AGP_AMD64 @@ -595,11 +595,11 @@ static int __init pci_iommu_init(void) #endif if (swiotlb) - return -ENODEV; + return; /* Did we detect a different HW IOMMU? */ if (iommu_detected && !iommu_aperture) - return -1; + return; if (no_iommu || (!force_iommu && end_pfn <= MAX_DMA32_PFN) || @@ -611,7 +611,7 @@ static int __init pci_iommu_init(void) "but IOMMU not available.\n" KERN_ERR "WARNING 32bit PCI may malfunction.\n"); } - return -ENODEV; + return; } printk(KERN_INFO "PCI-DMA: using GART IOMMU.\n"); @@ -678,11 +678,10 @@ static int __init pci_iommu_init(void) flush_gart(); dma_ops = &gart_dma_ops; - return 0; } /* Must execute after PCI subsystem */ -fs_initcall(pci_iommu_init); +fs_initcall(gart_iommu_init); void gart_parse_options(char *p) { diff --git a/arch/x86_64/kernel/setup.c b/arch/x86_64/kernel/setup.c index a9de8f02671f..04b2d7b92d17 100644 --- a/arch/x86_64/kernel/setup.c +++ b/arch/x86_64/kernel/setup.c @@ -63,7 +63,6 @@ #include #include #include -#include #include #include @@ -702,10 +701,6 @@ void __init setup_arch(char **cmdline_p) e820_setup_gap(); -#ifdef CONFIG_IOMMU - iommu_hole_init(); -#endif - #ifdef CONFIG_VT #if defined(CONFIG_VGA_CONSOLE) conswitchp = &vga_con; -- cgit v1.2.3 From e465058d55a88feb4c7ecabe63eea7ea7147e206 Mon Sep 17 00:00:00 2001 From: Jon Mason Date: Mon, 26 Jun 2006 13:58:14 +0200 Subject: [PATCH] x86_64: Calgary IOMMU - Calgary specific bits This patch hooks Calgary into the build, the x86-64 IOMMU initialization paths, and introduces the Calgary specific bits. The implementation draws inspiration from both PPC (which has support for the same chip but requires firmware support which we don't have on x86-64) and gart. 
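As a standalone illustration of the central data structure before the details below: each TCE (translation control entry) packs permission bits and the real page number of the target memory, much as tce_build() in the new tce.c further down does (the bit positions here are assumptions standing in for the real TCE_* constants):

    #include <stdint.h>
    #include <stdio.h>

    /* Illustrative only, not kernel code: build one translation entry. */
    int main(void)
    {
            uint64_t rpn = 0x12345;  /* real (physical) page number */
            int writable = 1;
            uint64_t tce;

            tce = 1ULL << 0;         /* read permission (assumed bit 0) */
            if (writable)
                    tce |= 1ULL << 1;  /* write permission (assumed bit 1) */
            tce |= rpn << 12;        /* page number (assumed shift) */

            printf("tce = %#llx\n", (unsigned long long)tce);
            return 0;
    }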
Calgary is different from gart in that it supports a translation table per PHB, as opposed to the single gart aperture. Changes from previous version: * Addition of boot-time disablement for bus-level translation/isolation (e.g., enable userspace DMA for things like X) * Usage of newer IOMMU abstraction functions Signed-off-by: Muli Ben-Yehuda Signed-off-by: Jon Mason Signed-off-by: Andi Kleen Signed-off-by: Linus Torvalds --- arch/x86_64/kernel/Makefile | 1 + arch/x86_64/kernel/pci-calgary.c | 1018 ++++++++++++++++++++++++++++++++++++++ arch/x86_64/kernel/pci-dma.c | 9 + arch/x86_64/kernel/tce.c | 202 ++++++++ 4 files changed, 1230 insertions(+) create mode 100644 arch/x86_64/kernel/pci-calgary.c create mode 100644 arch/x86_64/kernel/tce.c (limited to 'arch/x86_64/kernel') diff --git a/arch/x86_64/kernel/Makefile b/arch/x86_64/kernel/Makefile index fd106bdddd6d..aeb9c560be88 100644 --- a/arch/x86_64/kernel/Makefile +++ b/arch/x86_64/kernel/Makefile @@ -29,6 +29,7 @@ obj-$(CONFIG_SOFTWARE_SUSPEND) += suspend_asm.o obj-$(CONFIG_CPU_FREQ) += cpufreq/ obj-$(CONFIG_EARLY_PRINTK) += early_printk.o obj-$(CONFIG_IOMMU) += pci-gart.o aperture.o +obj-$(CONFIG_CALGARY_IOMMU) += pci-calgary.o tce.o obj-$(CONFIG_SWIOTLB) += pci-swiotlb.o obj-$(CONFIG_KPROBES) += kprobes.o obj-$(CONFIG_X86_PM_TIMER) += pmtimer.o diff --git a/arch/x86_64/kernel/pci-calgary.c b/arch/x86_64/kernel/pci-calgary.c new file mode 100644 index 000000000000..d91cb843f54d --- /dev/null +++ b/arch/x86_64/kernel/pci-calgary.c @@ -0,0 +1,1018 @@ +/* + * Derived from arch/powerpc/kernel/iommu.c + * + * Copyright (C) 2006 Jon Mason , IBM Corporation + * Copyright (C) 2006 Muli Ben-Yehuda , IBM Corporation + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details.
+ * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#define PCI_DEVICE_ID_IBM_CALGARY 0x02a1 +#define PCI_VENDOR_DEVICE_ID_CALGARY \ + (PCI_VENDOR_ID_IBM | PCI_DEVICE_ID_IBM_CALGARY << 16) + +/* we need these for register space address calculation */ +#define START_ADDRESS 0xfe000000 +#define CHASSIS_BASE 0 +#define ONE_BASED_CHASSIS_NUM 1 + +/* register offsets inside the host bridge space */ +#define PHB_CSR_OFFSET 0x0110 +#define PHB_PLSSR_OFFSET 0x0120 +#define PHB_CONFIG_RW_OFFSET 0x0160 +#define PHB_IOBASE_BAR_LOW 0x0170 +#define PHB_IOBASE_BAR_HIGH 0x0180 +#define PHB_MEM_1_LOW 0x0190 +#define PHB_MEM_1_HIGH 0x01A0 +#define PHB_IO_ADDR_SIZE 0x01B0 +#define PHB_MEM_1_SIZE 0x01C0 +#define PHB_MEM_ST_OFFSET 0x01D0 +#define PHB_AER_OFFSET 0x0200 +#define PHB_CONFIG_0_HIGH 0x0220 +#define PHB_CONFIG_0_LOW 0x0230 +#define PHB_CONFIG_0_END 0x0240 +#define PHB_MEM_2_LOW 0x02B0 +#define PHB_MEM_2_HIGH 0x02C0 +#define PHB_MEM_2_SIZE_HIGH 0x02D0 +#define PHB_MEM_2_SIZE_LOW 0x02E0 +#define PHB_DOSHOLE_OFFSET 0x08E0 + +/* PHB_CONFIG_RW */ +#define PHB_TCE_ENABLE 0x20000000 +#define PHB_SLOT_DISABLE 0x1C000000 +#define PHB_DAC_DISABLE 0x01000000 +#define PHB_MEM2_ENABLE 0x00400000 +#define PHB_MCSR_ENABLE 0x00100000 +/* TAR (Table Address Register) */ +#define TAR_SW_BITS 0x0000ffffffff800fUL +#define TAR_VALID 0x0000000000000008UL +/* CSR (Channel/DMA Status Register) */ +#define CSR_AGENT_MASK 0xffe0ffff + +#define MAX_NUM_OF_PHBS 8 /* how many PHBs in total? */ +#define MAX_PHB_BUS_NUM (MAX_NUM_OF_PHBS * 2) /* max dev->bus->number */ +#define PHBS_PER_CALGARY 4 + +/* register offsets in Calgary's internal register space */ +static const unsigned long tar_offsets[] = { + 0x0580 /* TAR0 */, + 0x0588 /* TAR1 */, + 0x0590 /* TAR2 */, + 0x0598 /* TAR3 */ +}; + +static const unsigned long split_queue_offsets[] = { + 0x4870 /* SPLIT QUEUE 0 */, + 0x5870 /* SPLIT QUEUE 1 */, + 0x6870 /* SPLIT QUEUE 2 */, + 0x7870 /* SPLIT QUEUE 3 */ +}; + +static const unsigned long phb_offsets[] = { + 0x8000 /* PHB0 */, + 0x9000 /* PHB1 */, + 0xA000 /* PHB2 */, + 0xB000 /* PHB3 */ +}; + +void* tce_table_kva[MAX_NUM_OF_PHBS * MAX_NUMNODES]; +unsigned int specified_table_size = TCE_TABLE_SIZE_UNSPECIFIED; +static int translate_empty_slots __read_mostly = 0; +static int calgary_detected __read_mostly = 0; + +/* + * the bitmap of PHBs the user requested that we disable + * translation on. 
+ */ +static DECLARE_BITMAP(translation_disabled, MAX_NUMNODES * MAX_PHB_BUS_NUM); + +static void tce_cache_blast(struct iommu_table *tbl); + +/* enable this to stress test the chip's TCE cache */ +#ifdef CONFIG_IOMMU_DEBUG +static inline void tce_cache_blast_stress(struct iommu_table *tbl) +{ + tce_cache_blast(tbl); +} +#else +static inline void tce_cache_blast_stress(struct iommu_table *tbl) +{ +} +#endif /* BLAST_TCE_CACHE_ON_UNMAP */ + +static inline unsigned int num_dma_pages(unsigned long dma, unsigned int dmalen) +{ + unsigned int npages; + + npages = PAGE_ALIGN(dma + dmalen) - (dma & PAGE_MASK); + npages >>= PAGE_SHIFT; + + return npages; +} + +static inline int translate_phb(struct pci_dev* dev) +{ + int disabled = test_bit(dev->bus->number, translation_disabled); + return !disabled; +} + +static void iommu_range_reserve(struct iommu_table *tbl, + unsigned long start_addr, unsigned int npages) +{ + unsigned long index; + unsigned long end; + + index = start_addr >> PAGE_SHIFT; + + /* bail out if we're asked to reserve a region we don't cover */ + if (index >= tbl->it_size) + return; + + end = index + npages; + if (end > tbl->it_size) /* don't go off the table */ + end = tbl->it_size; + + while (index < end) { + if (test_bit(index, tbl->it_map)) + printk(KERN_ERR "Calgary: entry already allocated at " + "0x%lx tbl %p dma 0x%lx npages %u\n", + index, tbl, start_addr, npages); + ++index; + } + set_bit_string(tbl->it_map, start_addr >> PAGE_SHIFT, npages); +} + +static unsigned long iommu_range_alloc(struct iommu_table *tbl, + unsigned int npages) +{ + unsigned long offset; + + BUG_ON(npages == 0); + + offset = find_next_zero_string(tbl->it_map, tbl->it_hint, + tbl->it_size, npages); + if (offset == ~0UL) { + tce_cache_blast(tbl); + offset = find_next_zero_string(tbl->it_map, 0, + tbl->it_size, npages); + if (offset == ~0UL) { + printk(KERN_WARNING "Calgary: IOMMU full.\n"); + if (panic_on_overflow) + panic("Calgary: fix the allocator.\n"); + else + return bad_dma_address; + } + } + + set_bit_string(tbl->it_map, offset, npages); + tbl->it_hint = offset + npages; + BUG_ON(tbl->it_hint > tbl->it_size); + + return offset; +} + +static dma_addr_t iommu_alloc(struct iommu_table *tbl, void *vaddr, + unsigned int npages, int direction) +{ + unsigned long entry, flags; + dma_addr_t ret = bad_dma_address; + + spin_lock_irqsave(&tbl->it_lock, flags); + + entry = iommu_range_alloc(tbl, npages); + + if (unlikely(entry == bad_dma_address)) + goto error; + + /* set the return dma address */ + ret = (entry << PAGE_SHIFT) | ((unsigned long)vaddr & ~PAGE_MASK); + + /* put the TCEs in the HW table */ + tce_build(tbl, entry, npages, (unsigned long)vaddr & PAGE_MASK, + direction); + + spin_unlock_irqrestore(&tbl->it_lock, flags); + + return ret; + +error: + spin_unlock_irqrestore(&tbl->it_lock, flags); + printk(KERN_WARNING "Calgary: failed to allocate %u pages in " + "iommu %p\n", npages, tbl); + return bad_dma_address; +} + +static void __iommu_free(struct iommu_table *tbl, dma_addr_t dma_addr, + unsigned int npages) +{ + unsigned long entry; + unsigned long i; + + entry = dma_addr >> PAGE_SHIFT; + + BUG_ON(entry + npages > tbl->it_size); + + tce_free(tbl, entry, npages); + + for (i = 0; i < npages; ++i) { + if (!test_bit(entry + i, tbl->it_map)) + printk(KERN_ERR "Calgary: bit is off at 0x%lx " + "tbl %p dma 0x%Lx entry 0x%lx npages %u\n", + entry + i, tbl, dma_addr, entry, npages); + } + + __clear_bit_string(tbl->it_map, entry, npages); + + tce_cache_blast_stress(tbl); +} + +static void 
iommu_free(struct iommu_table *tbl, dma_addr_t dma_addr, + unsigned int npages) +{ + unsigned long flags; + + spin_lock_irqsave(&tbl->it_lock, flags); + + __iommu_free(tbl, dma_addr, npages); + + spin_unlock_irqrestore(&tbl->it_lock, flags); +} + +static void __calgary_unmap_sg(struct iommu_table *tbl, + struct scatterlist *sglist, int nelems, int direction) +{ + while (nelems--) { + unsigned int npages; + dma_addr_t dma = sglist->dma_address; + unsigned int dmalen = sglist->dma_length; + + if (dmalen == 0) + break; + + npages = num_dma_pages(dma, dmalen); + __iommu_free(tbl, dma, npages); + sglist++; + } +} + +void calgary_unmap_sg(struct device *dev, struct scatterlist *sglist, + int nelems, int direction) +{ + unsigned long flags; + struct iommu_table *tbl = to_pci_dev(dev)->bus->self->sysdata; + + if (!translate_phb(to_pci_dev(dev))) + return; + + spin_lock_irqsave(&tbl->it_lock, flags); + + __calgary_unmap_sg(tbl, sglist, nelems, direction); + + spin_unlock_irqrestore(&tbl->it_lock, flags); +} + +static int calgary_nontranslate_map_sg(struct device* dev, + struct scatterlist *sg, int nelems, int direction) +{ + int i; + + for (i = 0; i < nelems; i++ ) { + struct scatterlist *s = &sg[i]; + BUG_ON(!s->page); + s->dma_address = virt_to_bus(page_address(s->page) +s->offset); + s->dma_length = s->length; + } + return nelems; +} + +int calgary_map_sg(struct device *dev, struct scatterlist *sg, + int nelems, int direction) +{ + struct iommu_table *tbl = to_pci_dev(dev)->bus->self->sysdata; + unsigned long flags; + unsigned long vaddr; + unsigned int npages; + unsigned long entry; + int i; + + if (!translate_phb(to_pci_dev(dev))) + return calgary_nontranslate_map_sg(dev, sg, nelems, direction); + + spin_lock_irqsave(&tbl->it_lock, flags); + + for (i = 0; i < nelems; i++ ) { + struct scatterlist *s = &sg[i]; + BUG_ON(!s->page); + + vaddr = (unsigned long)page_address(s->page) + s->offset; + npages = num_dma_pages(vaddr, s->length); + + entry = iommu_range_alloc(tbl, npages); + if (entry == bad_dma_address) { + /* makes sure unmap knows to stop */ + s->dma_length = 0; + goto error; + } + + s->dma_address = (entry << PAGE_SHIFT) | s->offset; + + /* insert into HW table */ + tce_build(tbl, entry, npages, vaddr & PAGE_MASK, + direction); + + s->dma_length = s->length; + } + + spin_unlock_irqrestore(&tbl->it_lock, flags); + + return nelems; +error: + __calgary_unmap_sg(tbl, sg, nelems, direction); + for (i = 0; i < nelems; i++) { + sg[i].dma_address = bad_dma_address; + sg[i].dma_length = 0; + } + spin_unlock_irqrestore(&tbl->it_lock, flags); + return 0; +} + +dma_addr_t calgary_map_single(struct device *dev, void *vaddr, + size_t size, int direction) +{ + dma_addr_t dma_handle = bad_dma_address; + unsigned long uaddr; + unsigned int npages; + struct iommu_table *tbl = to_pci_dev(dev)->bus->self->sysdata; + + uaddr = (unsigned long)vaddr; + npages = num_dma_pages(uaddr, size); + + if (translate_phb(to_pci_dev(dev))) + dma_handle = iommu_alloc(tbl, vaddr, npages, direction); + else + dma_handle = virt_to_bus(vaddr); + + return dma_handle; +} + +void calgary_unmap_single(struct device *dev, dma_addr_t dma_handle, + size_t size, int direction) +{ + struct iommu_table *tbl = to_pci_dev(dev)->bus->self->sysdata; + unsigned int npages; + + if (!translate_phb(to_pci_dev(dev))) + return; + + npages = num_dma_pages(dma_handle, size); + iommu_free(tbl, dma_handle, npages); +} + +void* calgary_alloc_coherent(struct device *dev, size_t size, + dma_addr_t *dma_handle, gfp_t flag) +{ + void *ret = NULL; + 
dma_addr_t mapping; + unsigned int npages, order; + struct iommu_table *tbl; + + tbl = to_pci_dev(dev)->bus->self->sysdata; + + size = PAGE_ALIGN(size); /* size rounded up to full pages */ + npages = size >> PAGE_SHIFT; + order = get_order(size); + + /* alloc enough pages (and possibly more) */ + ret = (void *)__get_free_pages(flag, order); + if (!ret) + goto error; + memset(ret, 0, size); + + if (translate_phb(to_pci_dev(dev))) { + /* set up tces to cover the allocated range */ + mapping = iommu_alloc(tbl, ret, npages, DMA_BIDIRECTIONAL); + if (mapping == bad_dma_address) + goto free; + + *dma_handle = mapping; + } else /* non translated slot */ + *dma_handle = virt_to_bus(ret); + + return ret; + +free: + free_pages((unsigned long)ret, get_order(size)); + ret = NULL; +error: + return ret; +} + +static struct dma_mapping_ops calgary_dma_ops = { + .alloc_coherent = calgary_alloc_coherent, + .map_single = calgary_map_single, + .unmap_single = calgary_unmap_single, + .map_sg = calgary_map_sg, + .unmap_sg = calgary_unmap_sg, +}; + +static inline int busno_to_phbid(unsigned char num) +{ + return bus_to_phb(num) % PHBS_PER_CALGARY; +} + +static inline unsigned long split_queue_offset(unsigned char num) +{ + size_t idx = busno_to_phbid(num); + + return split_queue_offsets[idx]; +} + +static inline unsigned long tar_offset(unsigned char num) +{ + size_t idx = busno_to_phbid(num); + + return tar_offsets[idx]; +} + +static inline unsigned long phb_offset(unsigned char num) +{ + size_t idx = busno_to_phbid(num); + + return phb_offsets[idx]; +} + +static inline void __iomem* calgary_reg(void __iomem *bar, unsigned long offset) +{ + unsigned long target = ((unsigned long)bar) | offset; + return (void __iomem*)target; +} + +static void tce_cache_blast(struct iommu_table *tbl) +{ + u64 val; + u32 aer; + int i = 0; + void __iomem *bbar = tbl->bbar; + void __iomem *target; + + /* disable arbitration on the bus */ + target = calgary_reg(bbar, phb_offset(tbl->it_busno) | PHB_AER_OFFSET); + aer = readl(target); + writel(0, target); + + /* read plssr to ensure it got there */ + target = calgary_reg(bbar, phb_offset(tbl->it_busno) | PHB_PLSSR_OFFSET); + val = readl(target); + + /* poll split queues until all DMA activity is done */ + target = calgary_reg(bbar, split_queue_offset(tbl->it_busno)); + do { + val = readq(target); + i++; + } while ((val & 0xff) != 0xff && i < 100); + if (i == 100) + printk(KERN_WARNING "Calgary: PCI bus not quiesced, " + "continuing anyway\n"); + + /* invalidate TCE cache */ + target = calgary_reg(bbar, tar_offset(tbl->it_busno)); + writeq(tbl->tar_val, target); + + /* enable arbitration */ + target = calgary_reg(bbar, phb_offset(tbl->it_busno) | PHB_AER_OFFSET); + writel(aer, target); + (void)readl(target); /* flush */ +} + +static void __init calgary_reserve_mem_region(struct pci_dev *dev, u64 start, + u64 limit) +{ + unsigned int numpages; + + limit = limit | 0xfffff; + limit++; + + numpages = ((limit - start) >> PAGE_SHIFT); + iommu_range_reserve(dev->sysdata, start, numpages); +} + +static void __init calgary_reserve_peripheral_mem_1(struct pci_dev *dev) +{ + void __iomem *target; + u64 low, high, sizelow; + u64 start, limit; + struct iommu_table *tbl = dev->sysdata; + unsigned char busnum = dev->bus->number; + void __iomem *bbar = tbl->bbar; + + /* peripheral MEM_1 region */ + target = calgary_reg(bbar, phb_offset(busnum) | PHB_MEM_1_LOW); + low = be32_to_cpu(readl(target)); + target = calgary_reg(bbar, phb_offset(busnum) | PHB_MEM_1_HIGH); + high = be32_to_cpu(readl(target)); 
+ target = calgary_reg(bbar, phb_offset(busnum) | PHB_MEM_1_SIZE); + sizelow = be32_to_cpu(readl(target)); + + start = (high << 32) | low; + limit = sizelow; + + calgary_reserve_mem_region(dev, start, limit); +} + +static void __init calgary_reserve_peripheral_mem_2(struct pci_dev *dev) +{ + void __iomem *target; + u32 val32; + u64 low, high, sizelow, sizehigh; + u64 start, limit; + struct iommu_table *tbl = dev->sysdata; + unsigned char busnum = dev->bus->number; + void __iomem *bbar = tbl->bbar; + + /* is it enabled? */ + target = calgary_reg(bbar, phb_offset(busnum) | PHB_CONFIG_RW_OFFSET); + val32 = be32_to_cpu(readl(target)); + if (!(val32 & PHB_MEM2_ENABLE)) + return; + + target = calgary_reg(bbar, phb_offset(busnum) | PHB_MEM_2_LOW); + low = be32_to_cpu(readl(target)); + target = calgary_reg(bbar, phb_offset(busnum) | PHB_MEM_2_HIGH); + high = be32_to_cpu(readl(target)); + target = calgary_reg(bbar, phb_offset(busnum) | PHB_MEM_2_SIZE_LOW); + sizelow = be32_to_cpu(readl(target)); + target = calgary_reg(bbar, phb_offset(busnum) | PHB_MEM_2_SIZE_HIGH); + sizehigh = be32_to_cpu(readl(target)); + + start = (high << 32) | low; + limit = (sizehigh << 32) | sizelow; + + calgary_reserve_mem_region(dev, start, limit); +} + +/* + * some regions of the IO address space do not get translated, so we + * must not give devices IO addresses in those regions. The regions + * are the 640KB-1MB region and the two PCI peripheral memory holes. + * Reserve all of them in the IOMMU bitmap to avoid giving them out + * later. + */ +static void __init calgary_reserve_regions(struct pci_dev *dev) +{ + unsigned int npages; + void __iomem *bbar; + unsigned char busnum; + u64 start; + struct iommu_table *tbl = dev->sysdata; + + bbar = tbl->bbar; + busnum = dev->bus->number; + + /* reserve bad_dma_address in case it's a legal address */ + iommu_range_reserve(tbl, bad_dma_address, 1); + + /* avoid the BIOS/VGA first 640KB-1MB region */ + start = (640 * 1024); + npages = ((1024 - 640) * 1024) >> PAGE_SHIFT; + iommu_range_reserve(tbl, start, npages); + + /* reserve the two PCI peripheral memory regions in IO space */ + calgary_reserve_peripheral_mem_1(dev); + calgary_reserve_peripheral_mem_2(dev); +} + +static int __init calgary_setup_tar(struct pci_dev *dev, void __iomem *bbar) +{ + u64 val64; + u64 table_phys; + void __iomem *target; + int ret; + struct iommu_table *tbl; + + /* build TCE tables for each PHB */ + ret = build_tce_table(dev, bbar); + if (ret) + return ret; + + calgary_reserve_regions(dev); + + /* set TARs for each PHB */ + target = calgary_reg(bbar, tar_offset(dev->bus->number)); + val64 = be64_to_cpu(readq(target)); + + /* zero out all TAR bits under sw control */ + val64 &= ~TAR_SW_BITS; + + tbl = dev->sysdata; + table_phys = (u64)__pa(tbl->it_base); + val64 |= table_phys; + + BUG_ON(specified_table_size > TCE_TABLE_SIZE_8M); + val64 |= (u64) specified_table_size; + + tbl->tar_val = cpu_to_be64(val64); + writeq(tbl->tar_val, target); + readq(target); /* flush */ + + return 0; +} + +static void __init calgary_free_tar(struct pci_dev *dev) +{ + u64 val64; + struct iommu_table *tbl = dev->sysdata; + void __iomem *target; + + target = calgary_reg(tbl->bbar, tar_offset(dev->bus->number)); + val64 = be64_to_cpu(readq(target)); + val64 &= ~TAR_SW_BITS; + writeq(cpu_to_be64(val64), target); + readq(target); /* flush */ + + kfree(tbl); + dev->sysdata = NULL; +} + +static void calgary_watchdog(unsigned long data) +{ + struct pci_dev *dev = (struct pci_dev *)data; + struct iommu_table *tbl = dev->sysdata; + 
void __iomem *bbar = tbl->bbar; + u32 val32; + void __iomem *target; + + target = calgary_reg(bbar, phb_offset(tbl->it_busno) | PHB_CSR_OFFSET); + val32 = be32_to_cpu(readl(target)); + + /* If no error, the agent ID in the CSR is not valid */ + if (val32 & CSR_AGENT_MASK) { + printk(KERN_EMERG "calgary_watchdog: DMA error on bus %d, " + "CSR = %#x\n", dev->bus->number, val32); + writel(0, target); + + /* Disable bus that caused the error */ + target = calgary_reg(bbar, phb_offset(tbl->it_busno) | + PHB_CONFIG_RW_OFFSET); + val32 = be32_to_cpu(readl(target)); + val32 |= PHB_SLOT_DISABLE; + writel(cpu_to_be32(val32), target); + readl(target); /* flush */ + } else { + /* Reset the timer */ + mod_timer(&tbl->watchdog_timer, jiffies + 2 * HZ); + } +} + +static void __init calgary_enable_translation(struct pci_dev *dev) +{ + u32 val32; + unsigned char busnum; + void __iomem *target; + void __iomem *bbar; + struct iommu_table *tbl; + + busnum = dev->bus->number; + tbl = dev->sysdata; + bbar = tbl->bbar; + + /* enable TCE in PHB Config Register */ + target = calgary_reg(bbar, phb_offset(busnum) | PHB_CONFIG_RW_OFFSET); + val32 = be32_to_cpu(readl(target)); + val32 |= PHB_TCE_ENABLE | PHB_DAC_DISABLE | PHB_MCSR_ENABLE; + + printk(KERN_INFO "Calgary: enabling translation on PHB %d\n", busnum); + printk(KERN_INFO "Calgary: errant DMAs will now be prevented on this " + "bus.\n"); + + writel(cpu_to_be32(val32), target); + readl(target); /* flush */ + + init_timer(&tbl->watchdog_timer); + tbl->watchdog_timer.function = &calgary_watchdog; + tbl->watchdog_timer.data = (unsigned long)dev; + mod_timer(&tbl->watchdog_timer, jiffies); +} + +static void __init calgary_disable_translation(struct pci_dev *dev) +{ + u32 val32; + unsigned char busnum; + void __iomem *target; + void __iomem *bbar; + struct iommu_table *tbl; + + busnum = dev->bus->number; + tbl = dev->sysdata; + bbar = tbl->bbar; + + /* disable TCE in PHB Config Register */ + target = calgary_reg(bbar, phb_offset(busnum) | PHB_CONFIG_RW_OFFSET); + val32 = be32_to_cpu(readl(target)); + val32 &= ~(PHB_TCE_ENABLE | PHB_DAC_DISABLE | PHB_MCSR_ENABLE); + + printk(KERN_INFO "Calgary: disabling translation on PHB %d!\n", busnum); + writel(cpu_to_be32(val32), target); + readl(target); /* flush */ + + del_timer_sync(&tbl->watchdog_timer); +} + +static inline unsigned int __init locate_register_space(struct pci_dev *dev) +{ + int rionodeid; + u32 address; + + rionodeid = (dev->bus->number % 15 > 4) ? 
3 : 2; + /* + * register space address calculation as follows: + * FE0MB-8MB*OneBasedChassisNumber+1MB*(RioNodeId-ChassisBase) + * ChassisBase is always zero for x366/x260/x460 + * RioNodeId is 2 for first Calgary, 3 for second Calgary + */ + address = START_ADDRESS - + (0x800000 * (ONE_BASED_CHASSIS_NUM + dev->bus->number / 15)) + + (0x100000) * (rionodeid - CHASSIS_BASE); + return address; +} + +static int __init calgary_init_one_nontraslated(struct pci_dev *dev) +{ + dev->sysdata = NULL; + dev->bus->self = dev; + + return 0; +} + +static int __init calgary_init_one(struct pci_dev *dev) +{ + u32 address; + void __iomem *bbar; + int ret; + + address = locate_register_space(dev); + /* map entire 1MB of Calgary config space */ + bbar = ioremap_nocache(address, 1024 * 1024); + if (!bbar) { + ret = -ENODATA; + goto done; + } + + ret = calgary_setup_tar(dev, bbar); + if (ret) + goto iounmap; + + dev->bus->self = dev; + calgary_enable_translation(dev); + + return 0; + +iounmap: + iounmap(bbar); +done: + return ret; +} + +static int __init calgary_init(void) +{ + int i, ret = -ENODEV; + struct pci_dev *dev = NULL; + + for (i = 0; i <= num_online_nodes() * MAX_NUM_OF_PHBS; i++) { + dev = pci_get_device(PCI_VENDOR_ID_IBM, + PCI_DEVICE_ID_IBM_CALGARY, + dev); + if (!dev) + break; + if (!translate_phb(dev)) { + calgary_init_one_nontraslated(dev); + continue; + } + if (!tce_table_kva[i] && !translate_empty_slots) { + pci_dev_put(dev); + continue; + } + ret = calgary_init_one(dev); + if (ret) + goto error; + } + + return ret; + +error: + for (i--; i >= 0; i--) { + dev = pci_find_device_reverse(PCI_VENDOR_ID_IBM, + PCI_DEVICE_ID_IBM_CALGARY, + dev); + if (!translate_phb(dev)) { + pci_dev_put(dev); + continue; + } + if (!tce_table_kva[i] && !translate_empty_slots) + continue; + calgary_disable_translation(dev); + calgary_free_tar(dev); + pci_dev_put(dev); + } + + return ret; +} + +static inline int __init determine_tce_table_size(u64 ram) +{ + int ret; + + if (specified_table_size != TCE_TABLE_SIZE_UNSPECIFIED) + return specified_table_size; + + /* + * Table sizes are from 0 to 7 (TCE_TABLE_SIZE_64K to + * TCE_TABLE_SIZE_8M). Table size 0 has 8K entries and each + * larger table size has twice as many entries, so shift the + * max ram address by 13 to divide by 8K and then look at the + * order of the result to choose between 0-7. + */ + ret = get_order(ram >> 13); + if (ret > TCE_TABLE_SIZE_8M) + ret = TCE_TABLE_SIZE_8M; + + return ret; +} + +void __init detect_calgary(void) +{ + u32 val; + int bus, table_idx; + void *tbl; + int detected = 0; + + /* + * if the user specified iommu=off or iommu=soft or we found + * another HW IOMMU already, bail out. 
+ */ + if (swiotlb || no_iommu || iommu_detected) + return; + + specified_table_size = determine_tce_table_size(end_pfn * PAGE_SIZE); + + for (bus = 0, table_idx = 0; + bus <= num_online_nodes() * MAX_PHB_BUS_NUM; + bus++) { + BUG_ON(bus > MAX_NUMNODES * MAX_PHB_BUS_NUM); + if (read_pci_config(bus, 0, 0, 0) != PCI_VENDOR_DEVICE_ID_CALGARY) + continue; + if (test_bit(bus, translation_disabled)) { + printk(KERN_INFO "Calgary: translation is disabled for " + "PHB 0x%x\n", bus); + /* skip this phb, don't allocate a tbl for it */ + tce_table_kva[table_idx] = NULL; + table_idx++; + continue; + } + /* + * scan the first slot of the PCI bus to see if there + * are any devices present + */ + val = read_pci_config(bus, 1, 0, 0); + if (val != 0xffffffff || translate_empty_slots) { + tbl = alloc_tce_table(); + if (!tbl) + goto cleanup; + detected = 1; + } else + tbl = NULL; + + tce_table_kva[table_idx] = tbl; + table_idx++; + } + + if (detected) { + iommu_detected = 1; + calgary_detected = 1; + printk(KERN_INFO "PCI-DMA: Calgary IOMMU detected. " + "TCE table spec is %d.\n", specified_table_size); + } + return; + +cleanup: + for (--table_idx; table_idx >= 0; --table_idx) + if (tce_table_kva[table_idx]) + free_tce_table(tce_table_kva[table_idx]); +} + +int __init calgary_iommu_init(void) +{ + int ret; + + if (no_iommu || swiotlb) + return -ENODEV; + + if (!calgary_detected) + return -ENODEV; + + /* ok, we're trying to use Calgary - let's roll */ + printk(KERN_INFO "PCI-DMA: Using Calgary IOMMU\n"); + + ret = calgary_init(); + if (ret) { + printk(KERN_ERR "PCI-DMA: Calgary init failed %d, " + "falling back to no_iommu\n", ret); + if (end_pfn > MAX_DMA32_PFN) + printk(KERN_ERR "WARNING more than 4GB of memory, " + "32bit PCI may malfunction.\n"); + return ret; + } + + force_iommu = 1; + dma_ops = &calgary_dma_ops; + + return 0; +} + +static int __init calgary_parse_options(char *p) +{ + unsigned int bridge; + size_t len; + char* endp; + + while (*p) { + if (!strncmp(p, "64k", 3)) + specified_table_size = TCE_TABLE_SIZE_64K; + else if (!strncmp(p, "128k", 4)) + specified_table_size = TCE_TABLE_SIZE_128K; + else if (!strncmp(p, "256k", 4)) + specified_table_size = TCE_TABLE_SIZE_256K; + else if (!strncmp(p, "512k", 4)) + specified_table_size = TCE_TABLE_SIZE_512K; + else if (!strncmp(p, "1M", 2)) + specified_table_size = TCE_TABLE_SIZE_1M; + else if (!strncmp(p, "2M", 2)) + specified_table_size = TCE_TABLE_SIZE_2M; + else if (!strncmp(p, "4M", 2)) + specified_table_size = TCE_TABLE_SIZE_4M; + else if (!strncmp(p, "8M", 2)) + specified_table_size = TCE_TABLE_SIZE_8M; + + len = strlen("translate_empty_slots"); + if (!strncmp(p, "translate_empty_slots", len)) + translate_empty_slots = 1; + + len = strlen("disable"); + if (!strncmp(p, "disable", len)) { + p += len; + if (*p == '=') + ++p; + if (*p == '\0') + break; + bridge = simple_strtol(p, &endp, 0); + if (p == endp) + break; + + if (bridge <= (num_online_nodes() * MAX_PHB_BUS_NUM)) { + printk(KERN_INFO "Calgary: disabling " + "translation for PHB 0x%x\n", bridge); + set_bit(bridge, translation_disabled); + } + } + + p = strpbrk(p, ","); + if (!p) + break; + + p++; /* skip ',' */ + } + return 1; +} +__setup("calgary=", calgary_parse_options); diff --git a/arch/x86_64/kernel/pci-dma.c b/arch/x86_64/kernel/pci-dma.c index a45844c7e3a3..9c44f4f2433d 100644 --- a/arch/x86_64/kernel/pci-dma.c +++ b/arch/x86_64/kernel/pci-dma.c @@ -9,6 +9,7 @@ #include #include #include +#include int iommu_merge __read_mostly = 0; EXPORT_SYMBOL(iommu_merge); @@ -291,6 +292,10 @@ 
void __init pci_iommu_alloc(void) iommu_hole_init(); #endif +#ifdef CONFIG_CALGARY_IOMMU + detect_calgary(); +#endif + #ifdef CONFIG_SWIOTLB pci_swiotlb_init(); #endif @@ -298,6 +303,10 @@ void __init pci_iommu_alloc(void) static int __init pci_iommu_init(void) { +#ifdef CONFIG_CALGARY_IOMMU + calgary_iommu_init(); +#endif + #ifdef CONFIG_IOMMU gart_iommu_init(); #endif diff --git a/arch/x86_64/kernel/tce.c b/arch/x86_64/kernel/tce.c new file mode 100644 index 000000000000..8d4c67f61b8e --- /dev/null +++ b/arch/x86_64/kernel/tce.c @@ -0,0 +1,202 @@ +/* + * Derived from arch/powerpc/platforms/pseries/iommu.c + * + * Copyright (C) 2006 Jon Mason , IBM Corporation + * Copyright (C) 2006 Muli Ben-Yehuda , IBM Corporation + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +/* flush a tce at 'tceaddr' to main memory */ +static inline void flush_tce(void* tceaddr) +{ + /* a single tce can't cross a cache line */ + if (cpu_has_clflush) + asm volatile("clflush (%0)" :: "r" (tceaddr)); + else + asm volatile("wbinvd":::"memory"); +} + +void tce_build(struct iommu_table *tbl, unsigned long index, + unsigned int npages, unsigned long uaddr, int direction) +{ + u64* tp; + u64 t; + u64 rpn; + + t = (1 << TCE_READ_SHIFT); + if (direction != DMA_TO_DEVICE) + t |= (1 << TCE_WRITE_SHIFT); + + tp = ((u64*)tbl->it_base) + index; + + while (npages--) { + rpn = (virt_to_bus((void*)uaddr)) >> PAGE_SHIFT; + t &= ~TCE_RPN_MASK; + t |= (rpn << TCE_RPN_SHIFT); + + *tp = cpu_to_be64(t); + flush_tce(tp); + + uaddr += PAGE_SIZE; + tp++; + } +} + +void tce_free(struct iommu_table *tbl, long index, unsigned int npages) +{ + u64* tp; + + tp = ((u64*)tbl->it_base) + index; + + while (npages--) { + *tp = cpu_to_be64(0); + flush_tce(tp); + tp++; + } +} + +static inline unsigned int table_size_to_number_of_entries(unsigned char size) +{ + /* + * size is the order of the table, 0-7 + * smallest table is 8K entries, so shift result by 13 to + * multiply by 8K + */ + return (1 << size) << 13; +} + +static int tce_table_setparms(struct pci_dev *dev, struct iommu_table *tbl) +{ + unsigned int bitmapsz; + unsigned int tce_table_index; + unsigned long bmppages; + int ret; + + tbl->it_busno = dev->bus->number; + + /* set the tce table size - measured in entries */ + tbl->it_size = table_size_to_number_of_entries(specified_table_size); + + tce_table_index = bus_to_phb(tbl->it_busno); + tbl->it_base = (unsigned long)tce_table_kva[tce_table_index]; + if (!tbl->it_base) { + printk(KERN_ERR "Calgary: iommu_table_setparms: " + "no table allocated?!\n"); + ret = -ENOMEM; + goto done; + } + + /* + * number of bytes needed for the bitmap size in number of + * entries; we need one bit per entry + */ + bitmapsz = tbl->it_size / BITS_PER_BYTE; + bmppages = 
__get_free_pages(GFP_KERNEL, get_order(bitmapsz)); + if (!bmppages) { + printk(KERN_ERR "Calgary: cannot allocate bitmap\n"); + ret = -ENOMEM; + goto done; + } + + tbl->it_map = (unsigned long*)bmppages; + + memset(tbl->it_map, 0, bitmapsz); + + tbl->it_hint = 0; + + spin_lock_init(&tbl->it_lock); + + return 0; + +done: + return ret; +} + +int build_tce_table(struct pci_dev *dev, void __iomem *bbar) +{ + struct iommu_table *tbl; + int ret; + + if (dev->sysdata) { + printk(KERN_ERR "Calgary: dev %p has sysdata %p\n", + dev, dev->sysdata); + BUG(); + } + + tbl = kzalloc(sizeof(struct iommu_table), GFP_KERNEL); + if (!tbl) { + printk(KERN_ERR "Calgary: error allocating iommu_table\n"); + ret = -ENOMEM; + goto done; + } + + ret = tce_table_setparms(dev, tbl); + if (ret) + goto free_tbl; + + tce_free(tbl, 0, tbl->it_size); + + tbl->bbar = bbar; + + /* + * NUMA is already using the bus's sysdata pointer, so we use + * the bus's pci_dev's sysdata instead. + */ + dev->sysdata = tbl; + + return 0; + +free_tbl: + kfree(tbl); +done: + return ret; +} + +void* alloc_tce_table(void) +{ + unsigned int size; + + size = table_size_to_number_of_entries(specified_table_size); + size *= TCE_ENTRY_SIZE; + + return __alloc_bootmem_low(size, size, 0); +} + +void free_tce_table(void *tbl) +{ + unsigned int size; + + if (!tbl) + return; + + size = table_size_to_number_of_entries(specified_table_size); + size *= TCE_ENTRY_SIZE; + + free_bootmem(__pa(tbl), size); +} -- cgit v1.2.3 From f3fa8ebc25129bb69929e20b0c84049c39029d8d Mon Sep 17 00:00:00 2001 From: Rohit Seth Date: Mon, 26 Jun 2006 13:58:17 +0200 Subject: [PATCH] x86_64: moving phys_proc_id and cpu_core_id to cpuinfo_x86 Most of the fields of cpuinfo are defined in cpuinfo_x86 structure. This patch moves the phys_proc_id and cpu_core_id for each processor to cpuinfo_x86 structure as well. 
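The shape of the change, as a standalone mock (the structure and array names are trimmed-down stand-ins for the kernel's cpuinfo_x86 and cpu_data):

    #include <stdio.h>

    #define NR_CPUS 4

    /* before: topology lived in parallel per-cpu arrays
     *   u8 phys_proc_id[NR_CPUS];  u8 cpu_core_id[NR_CPUS];
     * after: each cpu's own structure carries its topology */
    struct cpuinfo_mock {
            unsigned char phys_proc_id;  /* physical package id */
            unsigned char cpu_core_id;   /* core index within the package */
    };

    static struct cpuinfo_mock cpu_data_mock[NR_CPUS];

    int main(void)
    {
            int cpu = 1;

            cpu_data_mock[cpu].phys_proc_id = 0;
            cpu_data_mock[cpu].cpu_core_id = 1;
            printf("cpu %d: package %u core %u\n", cpu,
                   cpu_data_mock[cpu].phys_proc_id,
                   cpu_data_mock[cpu].cpu_core_id);
            return 0;
    }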
Signed-off-by: Rohit Seth Signed-off-by: Andi Kleen Signed-off-by: Linus Torvalds --- arch/x86_64/kernel/mce_amd.c | 10 +++++----- arch/x86_64/kernel/pci-gart.c | 3 --- arch/x86_64/kernel/setup.c | 27 +++++++++++++-------------- arch/x86_64/kernel/smpboot.c | 14 +++++--------- 4 files changed, 23 insertions(+), 31 deletions(-) (limited to 'arch/x86_64/kernel') diff --git a/arch/x86_64/kernel/mce_amd.c b/arch/x86_64/kernel/mce_amd.c index d13b241ad094..86e1e022b20e 100644 --- a/arch/x86_64/kernel/mce_amd.c +++ b/arch/x86_64/kernel/mce_amd.c @@ -115,7 +115,7 @@ void __cpuinit mce_amd_feature_init(struct cpuinfo_x86 *c) per_cpu(bank_map, cpu) |= (1 << bank); #ifdef CONFIG_SMP - if (shared_bank[bank] && cpu_core_id[cpu]) + if (shared_bank[bank] && c->cpu_core_id) continue; #endif @@ -323,10 +323,10 @@ static __cpuinit int threshold_create_bank(unsigned int cpu, int bank) struct threshold_bank *b = NULL; #ifdef CONFIG_SMP - if (cpu_core_id[cpu] && shared_bank[bank]) { /* symlink */ + if (cpu_data[cpu].cpu_core_id && shared_bank[bank]) { /* symlink */ char name[16]; unsigned lcpu = first_cpu(cpu_core_map[cpu]); - if (cpu_core_id[lcpu]) + if (cpu_data[lcpu].cpu_core_id) goto out; /* first core not up yet */ b = per_cpu(threshold_banks, lcpu)[bank]; @@ -434,7 +434,7 @@ static __cpuinit int threshold_create_symlinks(unsigned int cpu) int bank, err = 0; unsigned int lcpu = 0; - if (cpu_core_id[cpu]) + if (cpu_data[cpu].cpu_core_id) return 0; for_each_cpu_mask(lcpu, cpu_core_map[cpu]) { if (lcpu == cpu) @@ -455,7 +455,7 @@ static __cpuinit void threshold_remove_symlinks(unsigned int cpu) { int bank; unsigned int lcpu = 0; - if (cpu_core_id[cpu]) + if (cpu_data[cpu].cpu_core_id) return; for_each_cpu_mask(lcpu, cpu_core_map[cpu]) { if (lcpu == cpu) diff --git a/arch/x86_64/kernel/pci-gart.c b/arch/x86_64/kernel/pci-gart.c index 9a93954bed37..4ca674d16b09 100644 --- a/arch/x86_64/kernel/pci-gart.c +++ b/arch/x86_64/kernel/pci-gart.c @@ -680,9 +680,6 @@ void __init gart_iommu_init(void) dma_ops = &gart_dma_ops; } -/* Must execute after PCI subsystem */ -fs_initcall(gart_iommu_init); - void gart_parse_options(char *p) { int arg; diff --git a/arch/x86_64/kernel/setup.c b/arch/x86_64/kernel/setup.c index 04b2d7b92d17..24aa25ee0d7d 100644 --- a/arch/x86_64/kernel/setup.c +++ b/arch/x86_64/kernel/setup.c @@ -785,9 +785,9 @@ static int nearby_node(int apicid) static void __init amd_detect_cmp(struct cpuinfo_x86 *c) { #ifdef CONFIG_SMP - int cpu = smp_processor_id(); unsigned bits; #ifdef CONFIG_NUMA + int cpu = smp_processor_id(); int node = 0; unsigned apicid = hard_smp_processor_id(); #endif @@ -805,12 +805,12 @@ static void __init amd_detect_cmp(struct cpuinfo_x86 *c) } /* Low order bits define the core id (index of core in socket) */ - cpu_core_id[cpu] = phys_proc_id[cpu] & ((1 << bits)-1); + c->cpu_core_id = c->phys_proc_id & ((1 << bits)-1); /* Convert the APIC ID into the socket ID */ - phys_proc_id[cpu] = phys_pkg_id(bits); + c->phys_proc_id = phys_pkg_id(bits); #ifdef CONFIG_NUMA - node = phys_proc_id[cpu]; + node = c->phys_proc_id; if (apicid_to_node[apicid] != NUMA_NO_NODE) node = apicid_to_node[apicid]; if (!node_online(node)) { @@ -823,7 +823,7 @@ static void __init amd_detect_cmp(struct cpuinfo_x86 *c) but in the same order as the HT nodeids. If that doesn't result in a usable node fall back to the path for the previous case. 
*/ - int ht_nodeid = apicid - (phys_proc_id[0] << bits); + int ht_nodeid = apicid - (cpu_data[0].phys_proc_id << bits); if (ht_nodeid >= 0 && apicid_to_node[ht_nodeid] != NUMA_NO_NODE) node = apicid_to_node[ht_nodeid]; @@ -834,7 +834,7 @@ static void __init amd_detect_cmp(struct cpuinfo_x86 *c) numa_set_node(cpu, node); printk(KERN_INFO "CPU %d/%x(%d) -> Node %d -> Core %d\n", - cpu, apicid, c->x86_max_cores, node, cpu_core_id[cpu]); + cpu, apicid, c->x86_max_cores, node, c->cpu_core_id); #endif #endif } @@ -905,7 +905,6 @@ static void __cpuinit detect_ht(struct cpuinfo_x86 *c) #ifdef CONFIG_SMP u32 eax, ebx, ecx, edx; int index_msb, core_bits; - int cpu = smp_processor_id(); cpuid(1, &eax, &ebx, &ecx, &edx); @@ -926,10 +925,10 @@ static void __cpuinit detect_ht(struct cpuinfo_x86 *c) } index_msb = get_count_order(smp_num_siblings); - phys_proc_id[cpu] = phys_pkg_id(index_msb); + c->phys_proc_id = phys_pkg_id(index_msb); printk(KERN_INFO "CPU: Physical Processor ID: %d\n", - phys_proc_id[cpu]); + c->phys_proc_id); smp_num_siblings = smp_num_siblings / c->x86_max_cores; @@ -937,12 +936,12 @@ static void __cpuinit detect_ht(struct cpuinfo_x86 *c) core_bits = get_count_order(c->x86_max_cores); - cpu_core_id[cpu] = phys_pkg_id(index_msb) & + c->cpu_core_id = phys_pkg_id(index_msb) & ((1 << core_bits) - 1); if (c->x86_max_cores > 1) printk(KERN_INFO "CPU: Processor Core ID: %d\n", - cpu_core_id[cpu]); + c->cpu_core_id); } #endif } @@ -1080,7 +1079,7 @@ void __cpuinit early_identify_cpu(struct cpuinfo_x86 *c) } #ifdef CONFIG_SMP - phys_proc_id[smp_processor_id()] = (cpuid_ebx(1) >> 24) & 0xff; + c->phys_proc_id = (cpuid_ebx(1) >> 24) & 0xff; #endif } @@ -1288,9 +1287,9 @@ static int show_cpuinfo(struct seq_file *m, void *v) #ifdef CONFIG_SMP if (smp_num_siblings * c->x86_max_cores > 1) { int cpu = c - cpu_data; - seq_printf(m, "physical id\t: %d\n", phys_proc_id[cpu]); + seq_printf(m, "physical id\t: %d\n", c->phys_proc_id); seq_printf(m, "siblings\t: %d\n", cpus_weight(cpu_core_map[cpu])); - seq_printf(m, "core id\t\t: %d\n", cpu_core_id[cpu]); + seq_printf(m, "core id\t\t: %d\n", c->cpu_core_id); seq_printf(m, "cpu cores\t: %d\n", c->booted_cores); } #endif diff --git a/arch/x86_64/kernel/smpboot.c b/arch/x86_64/kernel/smpboot.c index 06535e7687ce..b1c10b154bfe 100644 --- a/arch/x86_64/kernel/smpboot.c +++ b/arch/x86_64/kernel/smpboot.c @@ -63,10 +63,6 @@ /* Number of siblings per CPU package */ int smp_num_siblings = 1; -/* Package ID of each logical CPU */ -u8 phys_proc_id[NR_CPUS] __read_mostly = { [0 ... NR_CPUS-1] = BAD_APICID }; -/* core ID of each logical CPU */ -u8 cpu_core_id[NR_CPUS] __read_mostly = { [0 ... NR_CPUS-1] = BAD_APICID }; /* Last level cache ID of each logical CPU */ u8 cpu_llc_id[NR_CPUS] __cpuinitdata = {[0 ... 
NR_CPUS-1] = BAD_APICID}; @@ -472,8 +468,8 @@ static inline void set_cpu_sibling_map(int cpu) if (smp_num_siblings > 1) { for_each_cpu_mask(i, cpu_sibling_setup_map) { - if (phys_proc_id[cpu] == phys_proc_id[i] && - cpu_core_id[cpu] == cpu_core_id[i]) { + if (c[cpu].phys_proc_id == c[i].phys_proc_id && + c[cpu].cpu_core_id == c[i].cpu_core_id) { cpu_set(i, cpu_sibling_map[cpu]); cpu_set(cpu, cpu_sibling_map[i]); cpu_set(i, cpu_core_map[cpu]); @@ -500,7 +496,7 @@ static inline void set_cpu_sibling_map(int cpu) cpu_set(i, c[cpu].llc_shared_map); cpu_set(cpu, c[i].llc_shared_map); } - if (phys_proc_id[cpu] == phys_proc_id[i]) { + if (c[cpu].phys_proc_id == c[i].phys_proc_id) { cpu_set(i, cpu_core_map[cpu]); cpu_set(cpu, cpu_core_map[i]); /* @@ -1201,8 +1197,8 @@ static void remove_siblinginfo(int cpu) cpu_clear(cpu, cpu_sibling_map[sibling]); cpus_clear(cpu_sibling_map[cpu]); cpus_clear(cpu_core_map[cpu]); - phys_proc_id[cpu] = BAD_APICID; - cpu_core_id[cpu] = BAD_APICID; + c[cpu].phys_proc_id = 0; + c[cpu].cpu_core_id = 0; cpu_clear(cpu, cpu_sibling_setup_map); } -- cgit v1.2.3 From f8bf3c65a962530821fa9a9b4bad43d8c7e13574 Mon Sep 17 00:00:00 2001 From: Vojtech Pavlik Date: Mon, 26 Jun 2006 13:58:23 +0200 Subject: [PATCH] x86_64: Rename oem_force_hpet_timer to apic_is_clustered_box Rename oem_force_hpet_timer to apic_is_clustered_box, to give the function a better fitting name - it really isn't at all about HPET. Signed-off-by: Vojtech Pavlik Signed-off-by: Andi Kleen Signed-off-by: Linus Torvalds --- arch/x86_64/kernel/apic.c | 7 +++---- arch/x86_64/kernel/time.c | 4 ++-- 2 files changed, 5 insertions(+), 6 deletions(-) (limited to 'arch/x86_64/kernel') diff --git a/arch/x86_64/kernel/apic.c b/arch/x86_64/kernel/apic.c index 29ef99001e05..53fc17d894e8 100644 --- a/arch/x86_64/kernel/apic.c +++ b/arch/x86_64/kernel/apic.c @@ -983,7 +983,7 @@ void smp_apic_timer_interrupt(struct pt_regs *regs) } /* - * oem_force_hpet_timer -- force HPET mode for some boxes. + * apic_is_clustered_box() -- Check if we can expect good TSC * * Thus far, the major user of this is IBM's Summit2 series: * @@ -991,7 +991,7 @@ void smp_apic_timer_interrupt(struct pt_regs *regs) * multi-chassis. Use available data to take a good guess. * If in doubt, go HPET. */ -__cpuinit int oem_force_hpet_timer(void) +__cpuinit int apic_is_clustered_box(void) { int i, clusters, zeros; unsigned id; @@ -1022,8 +1022,7 @@ __cpuinit int oem_force_hpet_timer(void) } /* - * If clusters > 2, then should be multi-chassis. Return 1 for HPET. - * Else return 0 to use TSC. + * If clusters > 2, then should be multi-chassis. * May have to revisit this when multi-core + hyperthreaded CPUs come * out, but AFAIK this will work even for them. */ diff --git a/arch/x86_64/kernel/time.c b/arch/x86_64/kernel/time.c index 7392570f975d..54b14e3ef66f 100644 --- a/arch/x86_64/kernel/time.c +++ b/arch/x86_64/kernel/time.c @@ -956,10 +956,10 @@ void __init time_init(void) __cpuinit int unsynchronized_tsc(void) { #ifdef CONFIG_SMP - if (oem_force_hpet_timer()) + if (apic_is_clustered_box()) return 1; /* Intel systems are normally all synchronized. Exceptions - are handled in the OEM check above. */ + are handled in the check above. 
*/ if (boot_cpu_data.x86_vendor == X86_VENDOR_INTEL) return 0; #endif -- cgit v1.2.3 From 7b0e85012581a9a8b12a8191d365716bf3ca3d3c Mon Sep 17 00:00:00 2001 From: Vojtech Pavlik Date: Mon, 26 Jun 2006 13:58:26 +0200 Subject: [PATCH] x86_64: Add X86_FEATURE_RDTSCP, fix rdtscp in /proc/cpuinfo This patch adds the X86_FEATURE_RDTSCP #define, so that kernel code can check for the feature easily and also fixes the location of the "rdtscp" string in the cpuinfo tables. Signed-off-by: Vojtech Pavlik Signed-off-by: Andi Kleen Signed-off-by: Linus Torvalds --- arch/x86_64/kernel/setup.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86_64/kernel') diff --git a/arch/x86_64/kernel/setup.c b/arch/x86_64/kernel/setup.c index 24aa25ee0d7d..923b76fb0aa6 100644 --- a/arch/x86_64/kernel/setup.c +++ b/arch/x86_64/kernel/setup.c @@ -1206,7 +1206,7 @@ static int show_cpuinfo(struct seq_file *m, void *v) NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, "syscall", NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, "nx", NULL, "mmxext", NULL, - NULL, "fxsr_opt", "rdtscp", NULL, NULL, "lm", "3dnowext", "3dnow", + NULL, "fxsr_opt", NULL, "rdtscp", NULL, "lm", "3dnowext", "3dnow", /* Transmeta-defined */ "recovery", "longrun", NULL, "lrti", NULL, NULL, NULL, NULL, -- cgit v1.2.3 From e30db3e69948dafb616d121e52c8d62d38dab68c Mon Sep 17 00:00:00 2001 From: Vojtech Pavlik Date: Mon, 26 Jun 2006 13:58:29 +0200 Subject: [PATCH] x86_64: Remove hack to manually enable HPET on AMD8111 southbridges Remove #ifdefed code to manually enable HPET on AMD8111, where the BIOS doesn't have ACPI HPET tables and doesn't enable it for us. Signed-off-by: Vojtech Pavlik Signed-off-by: Andi Kleen Signed-off-by: Linus Torvalds --- arch/x86_64/kernel/time.c | 14 -------------- 1 file changed, 14 deletions(-) (limited to 'arch/x86_64/kernel') diff --git a/arch/x86_64/kernel/time.c b/arch/x86_64/kernel/time.c index 54b14e3ef66f..ab9bea82945f 100644 --- a/arch/x86_64/kernel/time.c +++ b/arch/x86_64/kernel/time.c @@ -56,8 +56,6 @@ DEFINE_SPINLOCK(i8253_lock); int nohpet __initdata = 0; static int notsc __initdata = 0; -#undef HPET_HACK_ENABLE_DANGEROUS - unsigned int cpu_khz; /* TSC clocks / usec, not used here */ static unsigned long hpet_period; /* fsecs / HPET clock */ unsigned long hpet_tick; /* HPET clocks / interrupt */ @@ -890,18 +888,6 @@ void __init time_init(void) char *timename; char *gtod; -#ifdef HPET_HACK_ENABLE_DANGEROUS - if (!vxtime.hpet_address) { - printk(KERN_WARNING "time.c: WARNING: Enabling HPET base " - "manually!\n"); - outl(0x800038a0, 0xcf8); - outl(0xff000001, 0xcfc); - outl(0x800038a0, 0xcf8); - vxtime.hpet_address = inl(0xcfc) & 0xfffffffe; - printk(KERN_WARNING "time.c: WARNING: Enabled HPET " - "at %#lx.\n", vxtime.hpet_address); - } -#endif if (nohpet) vxtime.hpet_address = 0; -- cgit v1.2.3 From 4221133845f81ab4428c79a89e37be2c87624c1a Mon Sep 17 00:00:00 2001 From: Vojtech Pavlik Date: Mon, 26 Jun 2006 13:58:32 +0200 Subject: [PATCH] x86_64: Make use of the *PER* constants in time.c This patch makes use of the newly added conversion constants in time.h to x86-64 time.c. The code gets significantly easier to understand. 
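As a sketch of the intent (illustrative only; the helper name is invented and HZ is assumed to divide the second evenly), the hpet_tick computation becomes a single rounded division by the HPET period in femtoseconds:

/* FSEC_PER_TICK = FSEC_PER_SEC / HZ; adding period/2 rounds to nearest */
static unsigned long fs_period_to_tick(unsigned long period_fs)
{
	return (FSEC_PER_TICK + period_fs / 2) / period_fs;
}

With HZ=1000 and a 14.318 MHz HPET (period 69841279 fs) this yields roughly 14318 HPET clocks per timer tick.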
Signed-off-by: Vojtech Pavlik Signed-off-by: Andi Kleen Signed-off-by: Linus Torvalds --- arch/x86_64/kernel/time.c | 63 ++++++++++++++++++++++++++--------------------- 1 file changed, 35 insertions(+), 28 deletions(-) (limited to 'arch/x86_64/kernel') diff --git a/arch/x86_64/kernel/time.c b/arch/x86_64/kernel/time.c index ab9bea82945f..51afb07bc14e 100644 --- a/arch/x86_64/kernel/time.c +++ b/arch/x86_64/kernel/time.c @@ -56,6 +56,13 @@ DEFINE_SPINLOCK(i8253_lock); int nohpet __initdata = 0; static int notsc __initdata = 0; +#define USEC_PER_TICK (USEC_PER_SEC / HZ) +#define NSEC_PER_TICK (NSEC_PER_SEC / HZ) +#define FSEC_PER_TICK (FSEC_PER_SEC / HZ) + +#define NS_SCALE 10 /* 2^10, carefully chosen */ +#define US_SCALE 32 /* 2^32, arbitralrily chosen */ + unsigned int cpu_khz; /* TSC clocks / usec, not used here */ static unsigned long hpet_period; /* fsecs / HPET clock */ unsigned long hpet_tick; /* HPET clocks / interrupt */ @@ -88,7 +95,7 @@ static inline unsigned int do_gettimeoffset_tsc(void) t = get_cycles_sync(); if (t < vxtime.last_tsc) t = vxtime.last_tsc; /* hack */ - x = ((t - vxtime.last_tsc) * vxtime.tsc_quot) >> 32; + x = ((t - vxtime.last_tsc) * vxtime.tsc_quot) >> US_SCALE; return x; } @@ -96,7 +103,7 @@ static inline unsigned int do_gettimeoffset_hpet(void) { /* cap counter read to one tick to avoid inconsistencies */ unsigned long counter = hpet_readl(HPET_COUNTER) - vxtime.last; - return (min(counter,hpet_tick) * vxtime.quot) >> 32; + return (min(counter,hpet_tick) * vxtime.quot) >> US_SCALE; } unsigned int (*do_gettimeoffset)(void) = do_gettimeoffset_tsc; @@ -116,7 +123,7 @@ void do_gettimeofday(struct timeval *tv) seq = read_seqbegin(&xtime_lock); sec = xtime.tv_sec; - usec = xtime.tv_nsec / 1000; + usec = xtime.tv_nsec / NSEC_PER_USEC; /* i386 does some correction here to keep the clock monotonous even when ntpd is fixing drift. @@ -127,14 +134,14 @@ void do_gettimeofday(struct timeval *tv) in arch/x86_64/kernel/vsyscall.c and export all needed variables in vmlinux.lds. -AK */ - t = (jiffies - wall_jiffies) * (1000000L / HZ) + + t = (jiffies - wall_jiffies) * USEC_PER_TICK + do_gettimeoffset(); usec += t; } while (read_seqretry(&xtime_lock, seq)); - tv->tv_sec = sec + usec / 1000000; - tv->tv_usec = usec % 1000000; + tv->tv_sec = sec + usec / USEC_PER_SEC; + tv->tv_usec = usec % USEC_PER_SEC; } EXPORT_SYMBOL(do_gettimeofday); @@ -155,8 +162,8 @@ int do_settimeofday(struct timespec *tv) write_seqlock_irq(&xtime_lock); - nsec -= do_gettimeoffset() * 1000 + - (jiffies - wall_jiffies) * (NSEC_PER_SEC/HZ); + nsec -= do_gettimeoffset() * NSEC_PER_USEC + + (jiffies - wall_jiffies) * NSEC_PER_TICK; wtm_sec = wall_to_monotonic.tv_sec + (xtime.tv_sec - sec); wtm_nsec = wall_to_monotonic.tv_nsec + (xtime.tv_nsec - nsec); @@ -286,7 +293,7 @@ unsigned long long monotonic_clock(void) this_offset = hpet_readl(HPET_COUNTER); } while (read_seqretry(&xtime_lock, seq)); offset = (this_offset - last_offset); - offset *= (NSEC_PER_SEC/HZ) / hpet_tick; + offset *= NSEC_PER_TICK / hpet_tick; } else { do { seq = read_seqbegin(&xtime_lock); @@ -295,7 +302,8 @@ unsigned long long monotonic_clock(void) base = monotonic_base; } while (read_seqretry(&xtime_lock, seq)); this_offset = get_cycles_sync(); - offset = (this_offset - last_offset)*1000 / cpu_khz; + /* FIXME: 1000 or 1000000? 
*/ + offset = (this_offset - last_offset)*1000 / cpu_khz; } return base + offset; } @@ -380,7 +388,7 @@ void main_timer_handler(struct pt_regs *regs) } monotonic_base += - (offset - vxtime.last)*(NSEC_PER_SEC/HZ) / hpet_tick; + (offset - vxtime.last) * NSEC_PER_TICK / hpet_tick; vxtime.last = offset; #ifdef CONFIG_X86_PM_TIMER @@ -389,24 +397,25 @@ void main_timer_handler(struct pt_regs *regs) #endif } else { offset = (((tsc - vxtime.last_tsc) * - vxtime.tsc_quot) >> 32) - (USEC_PER_SEC / HZ); + vxtime.tsc_quot) >> US_SCALE) - USEC_PER_TICK; if (offset < 0) offset = 0; - if (offset > (USEC_PER_SEC / HZ)) { - lost = offset / (USEC_PER_SEC / HZ); - offset %= (USEC_PER_SEC / HZ); + if (offset > USEC_PER_TICK) { + lost = offset / USEC_PER_TICK; + offset %= USEC_PER_TICK; } - monotonic_base += (tsc - vxtime.last_tsc)*1000000/cpu_khz ; + /* FIXME: 1000 or 1000000? */ + monotonic_base += (tsc - vxtime.last_tsc) * 1000000 / cpu_khz; vxtime.last_tsc = tsc - vxtime.quot * delay / vxtime.tsc_quot; if ((((tsc - vxtime.last_tsc) * - vxtime.tsc_quot) >> 32) < offset) + vxtime.tsc_quot) >> US_SCALE) < offset) vxtime.last_tsc = tsc - - (((long) offset << 32) / vxtime.tsc_quot) - 1; + (((long) offset << US_SCALE) / vxtime.tsc_quot) - 1; } if (lost > 0) { @@ -466,16 +475,15 @@ static irqreturn_t timer_interrupt(int irq, void *dev_id, struct pt_regs *regs) } static unsigned int cyc2ns_scale __read_mostly; -#define CYC2NS_SCALE_FACTOR 10 /* 2^10, carefully chosen */ static inline void set_cyc2ns_scale(unsigned long cpu_khz) { - cyc2ns_scale = (1000000 << CYC2NS_SCALE_FACTOR)/cpu_khz; + cyc2ns_scale = (NSEC_PER_MSEC << NS_SCALE) / cpu_khz; } static inline unsigned long long cycles_2_ns(unsigned long long cyc) { - return (cyc * cyc2ns_scale) >> CYC2NS_SCALE_FACTOR; + return (cyc * cyc2ns_scale) >> NS_SCALE; } unsigned long long sched_clock(void) @@ -488,7 +496,7 @@ unsigned long long sched_clock(void) Disadvantage is a small drift between CPUs in some configurations, but that should be tolerable. */ if (__vxtime.mode == VXTIME_HPET) - return (hpet_readl(HPET_COUNTER) * vxtime.quot) >> 32; + return (hpet_readl(HPET_COUNTER) * vxtime.quot) >> US_SCALE; #endif /* Could do CPU core sync here. 
Opteron can execute rdtsc speculatively, @@ -631,7 +639,7 @@ static int time_cpufreq_notifier(struct notifier_block *nb, unsigned long val, cpu_khz = cpufreq_scale(cpu_khz_ref, ref_freq, freq->new); if (!(freq->flags & CPUFREQ_CONST_LOOPS)) - vxtime.tsc_quot = (1000L << 32) / cpu_khz; + vxtime.tsc_quot = (USEC_PER_MSEC << US_SCALE) / cpu_khz; } set_cyc2ns_scale(cpu_khz_ref); @@ -823,8 +831,7 @@ static int hpet_init(void) if (hpet_period < 100000 || hpet_period > 100000000) return -1; - hpet_tick = (1000000000L * (USEC_PER_SEC / HZ) + hpet_period / 2) / - hpet_period; + hpet_tick = (FSEC_PER_TICK + hpet_period / 2) / hpet_period; hpet_use_timer = (id & HPET_ID_LEGSUP); @@ -898,7 +905,7 @@ void __init time_init(void) -xtime.tv_sec, -xtime.tv_nsec); if (!hpet_init()) - vxtime_hz = (1000000000000000L + hpet_period / 2) / hpet_period; + vxtime_hz = (FSEC_PER_SEC + hpet_period / 2) / hpet_period; else vxtime.hpet_address = 0; @@ -927,8 +934,8 @@ void __init time_init(void) vxtime_hz / 1000000, vxtime_hz % 1000000, timename, gtod); printk(KERN_INFO "time.c: Detected %d.%03d MHz processor.\n", cpu_khz / 1000, cpu_khz % 1000); - vxtime.quot = (1000000L << 32) / vxtime_hz; - vxtime.tsc_quot = (1000L << 32) / cpu_khz; + vxtime.quot = (USEC_PER_SEC << US_SCALE) / vxtime_hz; + vxtime.tsc_quot = (USEC_PER_MSEC << US_SCALE) / cpu_khz; vxtime.last_tsc = get_cycles_sync(); setup_irq(0, &irq0); -- cgit v1.2.3 From b2df3ddb68fc02e3bae78b7adaeca8561d02ea6d Mon Sep 17 00:00:00 2001 From: Vojtech Pavlik Date: Mon, 26 Jun 2006 13:58:35 +0200 Subject: [PATCH] x86_64: Explain why HPET T0_CMP register is written twice After writing the CFG register, the first value written to the T0_CMP register is the value at which next interrupt should be triggered, every value after that sets the period of the interrupt. For that reason, the code needs to write the value twice - to set both the phase and period. [AK: I had already figured it out by myself, but it's still useful to have a comment for this.] Signed-off-by: Vojtech Pavlik Signed-off-by: Andi Kleen Signed-off-by: Linus Torvalds --- arch/x86_64/kernel/time.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'arch/x86_64/kernel') diff --git a/arch/x86_64/kernel/time.c b/arch/x86_64/kernel/time.c index 51afb07bc14e..ee622ac0376b 100644 --- a/arch/x86_64/kernel/time.c +++ b/arch/x86_64/kernel/time.c @@ -795,8 +795,8 @@ static int hpet_timer_stop_set_go(unsigned long tick) if (hpet_use_timer) { hpet_writel(HPET_TN_ENABLE | HPET_TN_PERIODIC | HPET_TN_SETVAL | HPET_TN_32BIT, HPET_T0_CFG); - hpet_writel(hpet_tick, HPET_T0_CMP); - hpet_writel(hpet_tick, HPET_T0_CMP); /* AK: why twice? 
*/ + hpet_writel(hpet_tick, HPET_T0_CMP); /* next interrupt */ + hpet_writel(hpet_tick, HPET_T0_CMP); /* period */ cfg |= HPET_CFG_LEGACY; } /* -- cgit v1.2.3 From 2f82bde472184f989283c173c1eb95bab455a774 Mon Sep 17 00:00:00 2001 From: Vojtech Pavlik Date: Mon, 26 Jun 2006 13:58:38 +0200 Subject: [PATCH] x86_64: Update copyright in time.c Update my copyright dates in arch/x86-64/kernel/time.c Signed-off-by: Vojtech Pavlik Signed-off-by: Andi Kleen Signed-off-by: Linus Torvalds --- arch/x86_64/kernel/time.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86_64/kernel') diff --git a/arch/x86_64/kernel/time.c b/arch/x86_64/kernel/time.c index ee622ac0376b..2125d6c05ff7 100644 --- a/arch/x86_64/kernel/time.c +++ b/arch/x86_64/kernel/time.c @@ -8,7 +8,7 @@ * Copyright (c) 1995 Markus Kuhn * Copyright (c) 1996 Ingo Molnar * Copyright (c) 1998 Andrea Arcangeli - * Copyright (c) 2002 Vojtech Pavlik + * Copyright (c) 2002,2006 Vojtech Pavlik * Copyright (c) 2003 Andi Kleen * RTC support code taken from arch/i386/kernel/timers/time_hpet.c */ -- cgit v1.2.3 From 17fc14ff1bdbc393e1cf4f6fd1e1e53d72ab9fe5 Mon Sep 17 00:00:00 2001 From: Jacob Shin Date: Mon, 26 Jun 2006 13:58:47 +0200 Subject: [PATCH] x86_64: apic support for extended apic interrupt Add support for extended APIC LVT found in future AMD processors. Signed-off-by: Jacob Shin Signed-off-by: Andi Kleen Signed-off-by: Linus Torvalds --- arch/x86_64/kernel/apic.c | 10 ++++------ 1 file changed, 4 insertions(+), 6 deletions(-) (limited to 'arch/x86_64/kernel') diff --git a/arch/x86_64/kernel/apic.c b/arch/x86_64/kernel/apic.c index 53fc17d894e8..396e125cb212 100644 --- a/arch/x86_64/kernel/apic.c +++ b/arch/x86_64/kernel/apic.c @@ -909,15 +909,13 @@ int setup_profiling_timer(unsigned int multiplier) return -EINVAL; } -#ifdef CONFIG_X86_MCE_AMD -void setup_threshold_lvt(unsigned long lvt_off) +void setup_APIC_extened_lvt(unsigned char lvt_off, unsigned char vector, + unsigned char msg_type, unsigned char mask) { - unsigned int v = 0; - unsigned long reg = (lvt_off << 4) + 0x500; - v |= THRESHOLD_APIC_VECTOR; + unsigned long reg = (lvt_off << 4) + K8_APIC_EXT_LVT_BASE; + unsigned int v = (mask << 16) | (msg_type << 8) | vector; apic_write(reg, v); } -#endif /* CONFIG_X86_MCE_AMD */ #undef APIC_DIVISOR -- cgit v1.2.3 From fff2e89f11dd9b9b45e9212bc543154ca3d028a1 Mon Sep 17 00:00:00 2001 From: Jacob Shin Date: Mon, 26 Jun 2006 13:58:50 +0200 Subject: [PATCH] x86_64: mce_amd relocate sysfs files Get rid of /sys/devices/system/threshold directory and move mce_amd thresholding files into the machine sysfs directory -- /sys/devices/system/machinecheck. AK: Fixed warning Signed-off-by: Jacob Shin Signed-off-by: Andi Kleen Signed-off-by: Linus Torvalds --- arch/x86_64/kernel/mce.c | 2 +- arch/x86_64/kernel/mce_amd.c | 40 ++++++++++------------------------------ 2 files changed, 11 insertions(+), 31 deletions(-) (limited to 'arch/x86_64/kernel') diff --git a/arch/x86_64/kernel/mce.c b/arch/x86_64/kernel/mce.c index c69fc43cee7b..acd5816b1a6f 100644 --- a/arch/x86_64/kernel/mce.c +++ b/arch/x86_64/kernel/mce.c @@ -562,7 +562,7 @@ static struct sysdev_class mce_sysclass = { set_kset_name("machinecheck"), }; -static DEFINE_PER_CPU(struct sys_device, device_mce); +DEFINE_PER_CPU(struct sys_device, device_mce); /* Why are there no generic functions for this? 
*/ #define ACCESSOR(name, var, start) \ diff --git a/arch/x86_64/kernel/mce_amd.c b/arch/x86_64/kernel/mce_amd.c index 86e1e022b20e..b96682e5ff77 100644 --- a/arch/x86_64/kernel/mce_amd.c +++ b/arch/x86_64/kernel/mce_amd.c @@ -30,7 +30,7 @@ #include #define PFX "mce_threshold: " -#define VERSION "version 1.00.9" +#define VERSION "version 1.0.10" #define NR_BANKS 5 #define THRESHOLD_MAX 0xFFF #define INT_TYPE_APIC 0x00020000 @@ -166,12 +166,6 @@ asmlinkage void mce_threshold_interrupt(void) * Sysfs Interface */ -static struct sysdev_class threshold_sysclass = { - set_kset_name("threshold"), -}; - -static DEFINE_PER_CPU(struct sys_device, device_threshold); - struct threshold_attr { struct attribute attr; ssize_t(*show) (struct threshold_bank *, char *); @@ -332,8 +326,8 @@ static __cpuinit int threshold_create_bank(unsigned int cpu, int bank) b = per_cpu(threshold_banks, lcpu)[bank]; if (!b) goto out; - sprintf(name, "bank%i", bank); - err = sysfs_create_link(&per_cpu(device_threshold, cpu).kobj, + sprintf(name, "threshold_bank%i", bank); + err = sysfs_create_link(&per_cpu(device_mce, cpu).kobj, &b->kobj, name); if (err) goto out; @@ -353,8 +347,8 @@ static __cpuinit int threshold_create_bank(unsigned int cpu, int bank) b->bank = bank; b->interrupt_enable = 0; b->threshold_limit = THRESHOLD_MAX; - kobject_set_name(&b->kobj, "bank%i", bank); - b->kobj.parent = &per_cpu(device_threshold, cpu).kobj; + kobject_set_name(&b->kobj, "threshold_bank%i", bank); + b->kobj.parent = &per_cpu(device_mce, cpu).kobj; b->kobj.ktype = &threshold_ktype; err = kobject_register(&b->kobj); @@ -373,12 +367,6 @@ static __cpuinit int threshold_create_device(unsigned int cpu) int bank; int err = 0; - per_cpu(device_threshold, cpu).id = cpu; - per_cpu(device_threshold, cpu).cls = &threshold_sysclass; - err = sysdev_register(&per_cpu(device_threshold, cpu)); - if (err) - goto out; - for (bank = 0; bank < NR_BANKS; ++bank) { if (!(per_cpu(bank_map, cpu) & 1 << bank)) continue; @@ -407,8 +395,8 @@ static __cpuinit void threshold_remove_bank(unsigned int cpu, int bank) if (!b) return; if (shared_bank[bank] && atomic_read(&b->kobj.kref.refcount) > 2) { - sprintf(name, "bank%i", bank); - sysfs_remove_link(&per_cpu(device_threshold, cpu).kobj, name); + sprintf(name, "threshold_bank%i", bank); + sysfs_remove_link(&per_cpu(device_mce, cpu).kobj, name); per_cpu(threshold_banks, cpu)[bank] = NULL; } else { kobject_unregister(&b->kobj); @@ -425,7 +413,6 @@ static __cpuinit void threshold_remove_device(unsigned int cpu) continue; threshold_remove_bank(cpu, bank); } - sysdev_unregister(&per_cpu(device_threshold, cpu)); } /* link all existing siblings when first core comes up */ @@ -518,23 +505,16 @@ static struct notifier_block threshold_cpu_notifier = { static __init int threshold_init_device(void) { - int err; int lcpu = 0; - err = sysdev_class_register(&threshold_sysclass); - if (err) - goto out; - /* to hit CPUs online before the notifier is up */ for_each_online_cpu(lcpu) { - err = threshold_create_device(lcpu); + int err = threshold_create_device(lcpu); if (err) - goto out; + return err; } register_cpu_notifier(&threshold_cpu_notifier); - - out: - return err; + return 0; } device_initcall(threshold_init_device); -- cgit v1.2.3 From 95268664390b19962ed41a3506c5bc8149db71e8 Mon Sep 17 00:00:00 2001 From: Jacob Shin Date: Mon, 26 Jun 2006 13:58:53 +0200 Subject: [PATCH] x86_64: mce_amd support for family 0x10 processors Add support for mce threshold registers found in future AMD family 0x10 processors. 
Backwards compatible with family 0xF hardware. AK: fixed build on !SMP Signed-off-by: Jacob Shin Signed-off-by: Andi Kleen Signed-off-by: Linus Torvalds --- arch/x86_64/kernel/mce_amd.c | 362 ++++++++++++++++++++++++++++++++++--------- 1 file changed, 288 insertions(+), 74 deletions(-) (limited to 'arch/x86_64/kernel') diff --git a/arch/x86_64/kernel/mce_amd.c b/arch/x86_64/kernel/mce_amd.c index b96682e5ff77..10ffbe52939c 100644 --- a/arch/x86_64/kernel/mce_amd.c +++ b/arch/x86_64/kernel/mce_amd.c @@ -1,5 +1,5 @@ /* - * (c) 2005 Advanced Micro Devices, Inc. + * (c) 2005, 2006 Advanced Micro Devices, Inc. * Your use of this code is subject to the terms and conditions of the * GNU general public license version 2. See "COPYING" or * http://www.gnu.org/licenses/gpl.html @@ -8,9 +8,10 @@ * * Support : jacob.shin@amd.com * - * MC4_MISC0 DRAM ECC Error Threshold available under AMD K8 Rev F. - * MC4_MISC0 exists per physical processor. + * April 2006 + * - added support for AMD Family 0x10 processors * + * All MC4_MISCi registers are shared between multi-cores */ #include @@ -30,8 +31,9 @@ #include #define PFX "mce_threshold: " -#define VERSION "version 1.0.10" -#define NR_BANKS 5 +#define VERSION "version 1.1.0" +#define NR_BANKS 6 +#define NR_BLOCKS 9 #define THRESHOLD_MAX 0xFFF #define INT_TYPE_APIC 0x00020000 #define MASK_VALID_HI 0x80000000 @@ -40,21 +42,33 @@ #define MASK_INT_TYPE_HI 0x00060000 #define MASK_OVERFLOW_HI 0x00010000 #define MASK_ERR_COUNT_HI 0x00000FFF -#define MASK_OVERFLOW 0x0001000000000000L +#define MASK_BLKPTR_LO 0xFF000000 +#define MCG_XBLK_ADDR 0xC0000400 -struct threshold_bank { +struct threshold_block { + unsigned int block; + unsigned int bank; unsigned int cpu; - u8 bank; - u8 interrupt_enable; + u32 address; + u16 interrupt_enable; u16 threshold_limit; struct kobject kobj; + struct list_head miscj; }; -static struct threshold_bank threshold_defaults = { +/* defaults used early on boot */ +static struct threshold_block threshold_defaults = { .interrupt_enable = 0, .threshold_limit = THRESHOLD_MAX, }; +struct threshold_bank { + struct kobject kobj; + struct threshold_block *blocks; + cpumask_t cpus; +}; +static DEFINE_PER_CPU(struct threshold_bank *, threshold_banks[NR_BANKS]); + #ifdef CONFIG_SMP static unsigned char shared_bank[NR_BANKS] = { 0, 0, 0, 0, 1 @@ -68,12 +82,12 @@ static DEFINE_PER_CPU(unsigned char, bank_map); /* see which banks are on */ */ /* must be called with correct cpu affinity */ -static void threshold_restart_bank(struct threshold_bank *b, +static void threshold_restart_bank(struct threshold_block *b, int reset, u16 old_limit) { u32 mci_misc_hi, mci_misc_lo; - rdmsr(MSR_IA32_MC0_MISC + b->bank * 4, mci_misc_lo, mci_misc_hi); + rdmsr(b->address, mci_misc_lo, mci_misc_hi); if (b->threshold_limit < (mci_misc_hi & THRESHOLD_MAX)) reset = 1; /* limit cannot be lower than err count */ @@ -94,35 +108,57 @@ static void threshold_restart_bank(struct threshold_bank *b, (mci_misc_hi &= ~MASK_INT_TYPE_HI); mci_misc_hi |= MASK_COUNT_EN_HI; - wrmsr(MSR_IA32_MC0_MISC + b->bank * 4, mci_misc_lo, mci_misc_hi); + wrmsr(b->address, mci_misc_lo, mci_misc_hi); } +/* cpu init entry point, called from mce.c with preempt off */ void __cpuinit mce_amd_feature_init(struct cpuinfo_x86 *c) { - int bank; - u32 mci_misc_lo, mci_misc_hi; + unsigned int bank, block; unsigned int cpu = smp_processor_id(); + u32 low = 0, high = 0, address = 0; for (bank = 0; bank < NR_BANKS; ++bank) { - rdmsr(MSR_IA32_MC0_MISC + bank * 4, mci_misc_lo, mci_misc_hi); + for (block = 0; block < 
NR_BLOCKS; ++block) { + if (block == 0) + address = MSR_IA32_MC0_MISC + bank * 4; + else if (block == 1) + address = MCG_XBLK_ADDR + + ((low & MASK_BLKPTR_LO) >> 21); + else + ++address; + + if (rdmsr_safe(address, &low, &high)) + continue; - /* !valid, !counter present, bios locked */ - if (!(mci_misc_hi & MASK_VALID_HI) || - !(mci_misc_hi & MASK_VALID_HI >> 1) || - (mci_misc_hi & MASK_VALID_HI >> 2)) - continue; + if (!(high & MASK_VALID_HI)) { + if (block) + continue; + else + break; + } - per_cpu(bank_map, cpu) |= (1 << bank); + if (!(high & MASK_VALID_HI >> 1) || + (high & MASK_VALID_HI >> 2)) + continue; + if (!block) + per_cpu(bank_map, cpu) |= (1 << bank); #ifdef CONFIG_SMP - if (shared_bank[bank] && c->cpu_core_id) - continue; + if (shared_bank[bank] && c->cpu_core_id) + break; #endif + high &= ~MASK_LVTOFF_HI; + high |= K8_APIC_EXT_LVT_ENTRY_THRESHOLD << 20; + wrmsr(address, low, high); + + setup_APIC_extened_lvt(K8_APIC_EXT_LVT_ENTRY_THRESHOLD, + THRESHOLD_APIC_VECTOR, + K8_APIC_EXT_INT_MSG_FIX, 0); - setup_threshold_lvt((mci_misc_hi & MASK_LVTOFF_HI) >> 20); - threshold_defaults.cpu = cpu; - threshold_defaults.bank = bank; - threshold_restart_bank(&threshold_defaults, 0, 0); + threshold_defaults.address = address; + threshold_restart_bank(&threshold_defaults, 0, 0); + } } } @@ -137,8 +173,9 @@ void __cpuinit mce_amd_feature_init(struct cpuinfo_x86 *c) */ asmlinkage void mce_threshold_interrupt(void) { - int bank; + unsigned int bank, block; struct mce m; + u32 low = 0, high = 0, address = 0; ack_APIC_irq(); exit_idle(); @@ -150,12 +187,39 @@ asmlinkage void mce_threshold_interrupt(void) /* assume first bank caused it */ for (bank = 0; bank < NR_BANKS; ++bank) { - m.bank = MCE_THRESHOLD_BASE + bank; - rdmsrl(MSR_IA32_MC0_MISC + bank * 4, m.misc); + for (block = 0; block < NR_BLOCKS; ++block) { + if (block == 0) + address = MSR_IA32_MC0_MISC + bank * 4; + else if (block == 1) + address = MCG_XBLK_ADDR + + ((low & MASK_BLKPTR_LO) >> 21); + else + ++address; + + if (rdmsr_safe(address, &low, &high)) + continue; - if (m.misc & MASK_OVERFLOW) { - mce_log(&m); - goto out; + if (!(high & MASK_VALID_HI)) { + if (block) + continue; + else + break; + } + + if (!(high & MASK_VALID_HI >> 1) || + (high & MASK_VALID_HI >> 2)) + continue; + + if (high & MASK_OVERFLOW_HI) { + rdmsrl(address, m.misc); + rdmsrl(MSR_IA32_MC0_STATUS + bank * 4, + m.status); + m.bank = K8_MCE_THRESHOLD_BASE + + bank * NR_BLOCKS + + block; + mce_log(&m); + goto out; + } } } out: @@ -168,12 +232,10 @@ asmlinkage void mce_threshold_interrupt(void) struct threshold_attr { struct attribute attr; - ssize_t(*show) (struct threshold_bank *, char *); - ssize_t(*store) (struct threshold_bank *, const char *, size_t count); + ssize_t(*show) (struct threshold_block *, char *); + ssize_t(*store) (struct threshold_block *, const char *, size_t count); }; -static DEFINE_PER_CPU(struct threshold_bank *, threshold_banks[NR_BANKS]); - static cpumask_t affinity_set(unsigned int cpu) { cpumask_t oldmask = current->cpus_allowed; @@ -189,14 +251,14 @@ static void affinity_restore(cpumask_t oldmask) } #define SHOW_FIELDS(name) \ - static ssize_t show_ ## name(struct threshold_bank * b, char *buf) \ + static ssize_t show_ ## name(struct threshold_block * b, char *buf) \ { \ return sprintf(buf, "%lx\n", (unsigned long) b->name); \ } SHOW_FIELDS(interrupt_enable) SHOW_FIELDS(threshold_limit) -static ssize_t store_interrupt_enable(struct threshold_bank *b, +static ssize_t store_interrupt_enable(struct threshold_block *b, const char *buf, 
size_t count) { char *end; @@ -213,7 +275,7 @@ static ssize_t store_interrupt_enable(struct threshold_bank *b, return end - buf; } -static ssize_t store_threshold_limit(struct threshold_bank *b, +static ssize_t store_threshold_limit(struct threshold_block *b, const char *buf, size_t count) { char *end; @@ -236,18 +298,18 @@ static ssize_t store_threshold_limit(struct threshold_bank *b, return end - buf; } -static ssize_t show_error_count(struct threshold_bank *b, char *buf) +static ssize_t show_error_count(struct threshold_block *b, char *buf) { u32 high, low; cpumask_t oldmask; oldmask = affinity_set(b->cpu); - rdmsr(MSR_IA32_MC0_MISC + b->bank * 4, low, high); /* ignore low 32 */ + rdmsr(b->address, low, high); affinity_restore(oldmask); return sprintf(buf, "%x\n", (high & 0xFFF) - (THRESHOLD_MAX - b->threshold_limit)); } -static ssize_t store_error_count(struct threshold_bank *b, +static ssize_t store_error_count(struct threshold_block *b, const char *buf, size_t count) { cpumask_t oldmask; @@ -278,12 +340,12 @@ static struct attribute *default_attrs[] = { NULL }; -#define to_bank(k) container_of(k,struct threshold_bank,kobj) +#define to_block(k) container_of(k, struct threshold_block, kobj) #define to_attr(a) container_of(a,struct threshold_attr,attr) static ssize_t show(struct kobject *kobj, struct attribute *attr, char *buf) { - struct threshold_bank *b = to_bank(kobj); + struct threshold_block *b = to_block(kobj); struct threshold_attr *a = to_attr(attr); ssize_t ret; ret = a->show ? a->show(b, buf) : -EIO; @@ -293,7 +355,7 @@ static ssize_t show(struct kobject *kobj, struct attribute *attr, char *buf) static ssize_t store(struct kobject *kobj, struct attribute *attr, const char *buf, size_t count) { - struct threshold_bank *b = to_bank(kobj); + struct threshold_block *b = to_block(kobj); struct threshold_attr *a = to_attr(attr); ssize_t ret; ret = a->store ? 
a->store(b, buf, count) : -EIO; @@ -310,53 +372,164 @@ static struct kobj_type threshold_ktype = { .default_attrs = default_attrs, }; +static __cpuinit int allocate_threshold_blocks(unsigned int cpu, + unsigned int bank, + unsigned int block, + u32 address) +{ + int err; + u32 low, high; + struct threshold_block *b = NULL; + + if ((bank >= NR_BANKS) || (block >= NR_BLOCKS)) + return 0; + + if (rdmsr_safe(address, &low, &high)) + goto recurse; + + if (!(high & MASK_VALID_HI)) { + if (block) + goto recurse; + else + return 0; + } + + if (!(high & MASK_VALID_HI >> 1) || + (high & MASK_VALID_HI >> 2)) + goto recurse; + + b = kzalloc(sizeof(struct threshold_block), GFP_KERNEL); + if (!b) + return -ENOMEM; + memset(b, 0, sizeof(struct threshold_block)); + + b->block = block; + b->bank = bank; + b->cpu = cpu; + b->address = address; + b->interrupt_enable = 0; + b->threshold_limit = THRESHOLD_MAX; + + INIT_LIST_HEAD(&b->miscj); + + if (per_cpu(threshold_banks, cpu)[bank]->blocks) + list_add(&b->miscj, + &per_cpu(threshold_banks, cpu)[bank]->blocks->miscj); + else + per_cpu(threshold_banks, cpu)[bank]->blocks = b; + + kobject_set_name(&b->kobj, "misc%i", block); + b->kobj.parent = &per_cpu(threshold_banks, cpu)[bank]->kobj; + b->kobj.ktype = &threshold_ktype; + err = kobject_register(&b->kobj); + if (err) + goto out_free; +recurse: + if (!block) { + address = (low & MASK_BLKPTR_LO) >> 21; + if (!address) + return 0; + address += MCG_XBLK_ADDR; + } else + ++address; + + err = allocate_threshold_blocks(cpu, bank, ++block, address); + if (err) + goto out_free; + + return err; + +out_free: + if (b) { + kobject_unregister(&b->kobj); + kfree(b); + } + return err; +} + /* symlinks sibling shared banks to first core. first core owns dir/files. */ -static __cpuinit int threshold_create_bank(unsigned int cpu, int bank) +static __cpuinit int threshold_create_bank(unsigned int cpu, unsigned int bank) { - int err = 0; + int i, err = 0; struct threshold_bank *b = NULL; + cpumask_t oldmask = CPU_MASK_NONE; + char name[32]; + + sprintf(name, "threshold_bank%i", bank); #ifdef CONFIG_SMP if (cpu_data[cpu].cpu_core_id && shared_bank[bank]) { /* symlink */ - char name[16]; - unsigned lcpu = first_cpu(cpu_core_map[cpu]); - if (cpu_data[lcpu].cpu_core_id) - goto out; /* first core not up yet */ + i = first_cpu(cpu_core_map[cpu]); + + /* first core not up yet */ + if (cpu_data[i].cpu_core_id) + goto out; + + /* already linked */ + if (per_cpu(threshold_banks, cpu)[bank]) + goto out; + + b = per_cpu(threshold_banks, i)[bank]; - b = per_cpu(threshold_banks, lcpu)[bank]; if (!b) goto out; - sprintf(name, "threshold_bank%i", bank); + err = sysfs_create_link(&per_cpu(device_mce, cpu).kobj, &b->kobj, name); if (err) goto out; + + b->cpus = cpu_core_map[cpu]; per_cpu(threshold_banks, cpu)[bank] = b; goto out; } #endif - b = kmalloc(sizeof(struct threshold_bank), GFP_KERNEL); + b = kzalloc(sizeof(struct threshold_bank), GFP_KERNEL); if (!b) { err = -ENOMEM; goto out; } memset(b, 0, sizeof(struct threshold_bank)); - b->cpu = cpu; - b->bank = bank; - b->interrupt_enable = 0; - b->threshold_limit = THRESHOLD_MAX; kobject_set_name(&b->kobj, "threshold_bank%i", bank); b->kobj.parent = &per_cpu(device_mce, cpu).kobj; - b->kobj.ktype = &threshold_ktype; - +#ifndef CONFIG_SMP + b->cpus = CPU_MASK_ALL; +#else + b->cpus = cpu_core_map[cpu]; +#endif err = kobject_register(&b->kobj); - if (err) { - kfree(b); - goto out; - } + if (err) + goto out_free; + per_cpu(threshold_banks, cpu)[bank] = b; + + oldmask = affinity_set(cpu); + err = 
allocate_threshold_blocks(cpu, bank, 0, + MSR_IA32_MC0_MISC + bank * 4); + affinity_restore(oldmask); + + if (err) + goto out_free; + + for_each_cpu_mask(i, b->cpus) { + if (i == cpu) + continue; + + err = sysfs_create_link(&per_cpu(device_mce, i).kobj, + &b->kobj, name); + if (err) + goto out; + + per_cpu(threshold_banks, i)[bank] = b; + } + + goto out; + +out_free: + per_cpu(threshold_banks, cpu)[bank] = NULL; + kfree(b); out: return err; } @@ -385,23 +558,64 @@ static __cpuinit int threshold_create_device(unsigned int cpu) * of shared sysfs dir/files, and rest of the cores will be symlinked to it. */ -/* cpu hotplug call removes all symlinks before first core dies */ +static __cpuinit void deallocate_threshold_block(unsigned int cpu, + unsigned int bank) +{ + struct threshold_block *pos = NULL; + struct threshold_block *tmp = NULL; + struct threshold_bank *head = per_cpu(threshold_banks, cpu)[bank]; + + if (!head) + return; + + list_for_each_entry_safe(pos, tmp, &head->blocks->miscj, miscj) { + kobject_unregister(&pos->kobj); + list_del(&pos->miscj); + kfree(pos); + } + + kfree(per_cpu(threshold_banks, cpu)[bank]->blocks); + per_cpu(threshold_banks, cpu)[bank]->blocks = NULL; +} + static __cpuinit void threshold_remove_bank(unsigned int cpu, int bank) { + int i = 0; struct threshold_bank *b; - char name[16]; + char name[32]; b = per_cpu(threshold_banks, cpu)[bank]; + if (!b) return; - if (shared_bank[bank] && atomic_read(&b->kobj.kref.refcount) > 2) { - sprintf(name, "threshold_bank%i", bank); + + if (!b->blocks) + goto free_out; + + sprintf(name, "threshold_bank%i", bank); + + /* sibling symlink */ + if (shared_bank[bank] && b->blocks->cpu != cpu) { sysfs_remove_link(&per_cpu(device_mce, cpu).kobj, name); - per_cpu(threshold_banks, cpu)[bank] = NULL; - } else { - kobject_unregister(&b->kobj); - kfree(per_cpu(threshold_banks, cpu)[bank]); + per_cpu(threshold_banks, i)[bank] = NULL; + return; } + + /* remove all sibling symlinks before unregistering */ + for_each_cpu_mask(i, b->cpus) { + if (i == cpu) + continue; + + sysfs_remove_link(&per_cpu(device_mce, i).kobj, name); + per_cpu(threshold_banks, i)[bank] = NULL; + } + + deallocate_threshold_block(cpu, bank); + +free_out: + kobject_unregister(&b->kobj); + kfree(b); + per_cpu(threshold_banks, cpu)[bank] = NULL; } static __cpuinit void threshold_remove_device(unsigned int cpu) -- cgit v1.2.3 From 2903ee85ce462d66955b800a0c48e26e51de0aae Mon Sep 17 00:00:00 2001 From: Jacob Shin Date: Mon, 26 Jun 2006 13:58:56 +0200 Subject: [PATCH] x86_64: mce_amd cleanup Clean up mce_amd.c for readability and remove code no longer needed. 
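For reference, what one SHOW_FIELDS() instantiation in the reformatted file expands to, written out by hand (a sketch, not part of the patch):

static ssize_t show_threshold_limit(struct threshold_block *b, char *buf)
{
	return sprintf(buf, "%lx\n", (unsigned long) b->threshold_limit);
}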
Signed-off-by: Jacob Shin Signed-off-by: Andi Kleen Signed-off-by: Linus Torvalds --- arch/x86_64/kernel/mce_amd.c | 112 +++++++++++-------------------------------- 1 file changed, 29 insertions(+), 83 deletions(-) (limited to 'arch/x86_64/kernel') diff --git a/arch/x86_64/kernel/mce_amd.c b/arch/x86_64/kernel/mce_amd.c index 10ffbe52939c..3f967d4fe199 100644 --- a/arch/x86_64/kernel/mce_amd.c +++ b/arch/x86_64/kernel/mce_amd.c @@ -30,17 +30,17 @@ #include #include -#define PFX "mce_threshold: " -#define VERSION "version 1.1.0" -#define NR_BANKS 6 -#define NR_BLOCKS 9 -#define THRESHOLD_MAX 0xFFF -#define INT_TYPE_APIC 0x00020000 -#define MASK_VALID_HI 0x80000000 -#define MASK_LVTOFF_HI 0x00F00000 -#define MASK_COUNT_EN_HI 0x00080000 -#define MASK_INT_TYPE_HI 0x00060000 -#define MASK_OVERFLOW_HI 0x00010000 +#define PFX "mce_threshold: " +#define VERSION "version 1.1.1" +#define NR_BANKS 6 +#define NR_BLOCKS 9 +#define THRESHOLD_MAX 0xFFF +#define INT_TYPE_APIC 0x00020000 +#define MASK_VALID_HI 0x80000000 +#define MASK_LVTOFF_HI 0x00F00000 +#define MASK_COUNT_EN_HI 0x00080000 +#define MASK_INT_TYPE_HI 0x00060000 +#define MASK_OVERFLOW_HI 0x00010000 #define MASK_ERR_COUNT_HI 0x00000FFF #define MASK_BLKPTR_LO 0xFF000000 #define MCG_XBLK_ADDR 0xC0000400 @@ -222,7 +222,7 @@ asmlinkage void mce_threshold_interrupt(void) } } } - out: +out: irq_exit(); } @@ -231,7 +231,7 @@ asmlinkage void mce_threshold_interrupt(void) */ struct threshold_attr { - struct attribute attr; + struct attribute attr; ssize_t(*show) (struct threshold_block *, char *); ssize_t(*store) (struct threshold_block *, const char *, size_t count); }; @@ -250,11 +250,11 @@ static void affinity_restore(cpumask_t oldmask) set_cpus_allowed(current, oldmask); } -#define SHOW_FIELDS(name) \ - static ssize_t show_ ## name(struct threshold_block * b, char *buf) \ - { \ - return sprintf(buf, "%lx\n", (unsigned long) b->name); \ - } +#define SHOW_FIELDS(name) \ +static ssize_t show_ ## name(struct threshold_block * b, char *buf) \ +{ \ + return sprintf(buf, "%lx\n", (unsigned long) b->name); \ +} SHOW_FIELDS(interrupt_enable) SHOW_FIELDS(threshold_limit) @@ -325,13 +325,13 @@ static ssize_t store_error_count(struct threshold_block *b, .store = _store, \ }; -#define ATTR_FIELDS(name) \ - static struct threshold_attr name = \ +#define RW_ATTR(name) \ +static struct threshold_attr name = \ THRESHOLD_ATTR(name, 0644, show_## name, store_## name) -ATTR_FIELDS(interrupt_enable); -ATTR_FIELDS(threshold_limit); -ATTR_FIELDS(error_count); +RW_ATTR(interrupt_enable); +RW_ATTR(threshold_limit); +RW_ATTR(error_count); static struct attribute *default_attrs[] = { &interrupt_enable.attr, @@ -341,7 +341,7 @@ static struct attribute *default_attrs[] = { }; #define to_block(k) container_of(k, struct threshold_block, kobj) -#define to_attr(a) container_of(a,struct threshold_attr,attr) +#define to_attr(a) container_of(a, struct threshold_attr, attr) static ssize_t show(struct kobject *kobj, struct attribute *attr, char *buf) { @@ -530,14 +530,14 @@ static __cpuinit int threshold_create_bank(unsigned int cpu, unsigned int bank) out_free: per_cpu(threshold_banks, cpu)[bank] = NULL; kfree(b); - out: +out: return err; } /* create dir/files for all valid threshold banks */ static __cpuinit int threshold_create_device(unsigned int cpu) { - int bank; + unsigned int bank; int err = 0; for (bank = 0; bank < NR_BANKS; ++bank) { @@ -547,7 +547,7 @@ static __cpuinit int threshold_create_device(unsigned int cpu) if (err) goto out; } - out: +out: return err; } @@ -620,7 
+620,7 @@ free_out: static __cpuinit void threshold_remove_device(unsigned int cpu) { - int bank; + unsigned int bank; for (bank = 0; bank < NR_BANKS; ++bank) { if (!(per_cpu(bank_map, cpu) & 1 << bank)) @@ -629,54 +629,7 @@ static __cpuinit void threshold_remove_device(unsigned int cpu) } } -/* link all existing siblings when first core comes up */ -static __cpuinit int threshold_create_symlinks(unsigned int cpu) -{ - int bank, err = 0; - unsigned int lcpu = 0; - - if (cpu_data[cpu].cpu_core_id) - return 0; - for_each_cpu_mask(lcpu, cpu_core_map[cpu]) { - if (lcpu == cpu) - continue; - for (bank = 0; bank < NR_BANKS; ++bank) { - if (!(per_cpu(bank_map, cpu) & 1 << bank)) - continue; - if (!shared_bank[bank]) - continue; - err = threshold_create_bank(lcpu, bank); - } - } - return err; -} - -/* remove all symlinks before first core dies. */ -static __cpuinit void threshold_remove_symlinks(unsigned int cpu) -{ - int bank; - unsigned int lcpu = 0; - if (cpu_data[cpu].cpu_core_id) - return; - for_each_cpu_mask(lcpu, cpu_core_map[cpu]) { - if (lcpu == cpu) - continue; - for (bank = 0; bank < NR_BANKS; ++bank) { - if (!(per_cpu(bank_map, cpu) & 1 << bank)) - continue; - if (!shared_bank[bank]) - continue; - threshold_remove_bank(lcpu, bank); - } - } -} #else /* !CONFIG_HOTPLUG_CPU */ -static __cpuinit void threshold_create_symlinks(unsigned int cpu) -{ -} -static __cpuinit void threshold_remove_symlinks(unsigned int cpu) -{ -} static void threshold_remove_device(unsigned int cpu) { } @@ -695,13 +648,6 @@ static int threshold_cpu_callback(struct notifier_block *nfb, switch (action) { case CPU_ONLINE: threshold_create_device(cpu); - threshold_create_symlinks(cpu); - break; - case CPU_DOWN_PREPARE: - threshold_remove_symlinks(cpu); - break; - case CPU_DOWN_FAILED: - threshold_create_symlinks(cpu); break; case CPU_DEAD: threshold_remove_device(cpu); @@ -719,7 +665,7 @@ static struct notifier_block threshold_cpu_notifier = { static __init int threshold_init_device(void) { - int lcpu = 0; + unsigned lcpu = 0; /* to hit CPUs online before the notifier is up */ for_each_online_cpu(lcpu) { -- cgit v1.2.3 From b633237e9c1b91b86c431c5d41266b47408b4642 Mon Sep 17 00:00:00 2001 From: Andi Kleen Date: Mon, 26 Jun 2006 13:58:59 +0200 Subject: [PATCH] x86_64: Mark mce_amd cpu notifier __cpuinit/__cpuinitdata Cc: Jacob Shin Signed-off-by: Andi Kleen Signed-off-by: Linus Torvalds --- arch/x86_64/kernel/mce_amd.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'arch/x86_64/kernel') diff --git a/arch/x86_64/kernel/mce_amd.c b/arch/x86_64/kernel/mce_amd.c index 3f967d4fe199..335200aa2737 100644 --- a/arch/x86_64/kernel/mce_amd.c +++ b/arch/x86_64/kernel/mce_amd.c @@ -636,7 +636,7 @@ static void threshold_remove_device(unsigned int cpu) #endif /* get notified when a cpu comes on/off */ -static int threshold_cpu_callback(struct notifier_block *nfb, +static int __cpuinit threshold_cpu_callback(struct notifier_block *nfb, unsigned long action, void *hcpu) { /* cpu was unsigned int to begin with */ @@ -659,7 +659,7 @@ static int threshold_cpu_callback(struct notifier_block *nfb, return NOTIFY_OK; } -static struct notifier_block threshold_cpu_notifier = { +static struct notifier_block threshold_cpu_notifier __cpuinitdata = { .notifier_call = threshold_cpu_callback, }; -- cgit v1.2.3 From 8fa3d6fc5e952300c26490167a93bf502de03a99 Mon Sep 17 00:00:00 2001 From: Andrew Morton Date: Mon, 26 Jun 2006 13:59:05 +0200 Subject: [PATCH] x86_64: check_addr() cleanups - Use DMA_32BIT_MASK - Use %z for size_t - 
80-cols Cc: Andi Kleen Signed-off-by: Andrew Morton Signed-off-by: Andi Kleen Signed-off-by: Linus Torvalds --- arch/x86_64/kernel/pci-nommu.c | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) (limited to 'arch/x86_64/kernel') diff --git a/arch/x86_64/kernel/pci-nommu.c b/arch/x86_64/kernel/pci-nommu.c index 1f6ecc62061d..c4c3cc36ac5b 100644 --- a/arch/x86_64/kernel/pci-nommu.c +++ b/arch/x86_64/kernel/pci-nommu.c @@ -4,6 +4,8 @@ #include #include #include +#include + #include #include #include @@ -12,10 +14,11 @@ static int check_addr(char *name, struct device *hwdev, dma_addr_t bus, size_t size) { if (hwdev && bus + size > *hwdev->dma_mask) { - if (*hwdev->dma_mask >= 0xffffffffULL) + if (*hwdev->dma_mask >= DMA_32BIT_MASK) printk(KERN_ERR - "nommu_%s: overflow %Lx+%lu of device mask %Lx\n", - name, (long long)bus, size, (long long)*hwdev->dma_mask); + "nommu_%s: overflow %Lx+%zu of device mask %Lx\n", + name, (long long)bus, size, + (long long)*hwdev->dma_mask); return 0; } return 1; -- cgit v1.2.3 From d9005b52de7bc9b20afa93e06d3e15843bafc12f Mon Sep 17 00:00:00 2001 From: Andi Kleen Date: Mon, 26 Jun 2006 13:59:08 +0200 Subject: [PATCH] x86_64: Remove bogus RED-PEN comment in signal.c No red zone possible/needed on the alternative stack. It caused confusion. Signed-off-by: Andi Kleen Signed-off-by: Linus Torvalds --- arch/x86_64/kernel/signal.c | 1 - 1 file changed, 1 deletion(-) (limited to 'arch/x86_64/kernel') diff --git a/arch/x86_64/kernel/signal.c b/arch/x86_64/kernel/signal.c index 94b4b1128ac8..28161170fb0a 100644 --- a/arch/x86_64/kernel/signal.c +++ b/arch/x86_64/kernel/signal.c @@ -237,7 +237,6 @@ get_stack(struct k_sigaction *ka, struct pt_regs *regs, unsigned long size) rsp = regs->rsp - 128; /* This is the X/Open sanctioned signal stack switching. */ - /* RED-PEN: redzone on that stack? */ if (ka->sa.sa_flags & SA_ONSTACK) { if (sas_ss_flags(rsp) == 0) rsp = current->sas_ss_sp + current->sas_ss_size; -- cgit v1.2.3 From 495ab9c045e1b0e5c82951b762257fe1c9d81564 Mon Sep 17 00:00:00 2001 From: Andi Kleen Date: Mon, 26 Jun 2006 13:59:11 +0200 Subject: [PATCH] i386/x86-64/ia64: Move polling flag into thread_info_status During some profiling I noticed that default_idle causes a lot of memory traffic. I think that is caused by the atomic operations to clear/set the polling flag in thread_info. There is actually no reason to make this atomic - only the idle thread does it to itself, other CPUs only read it. So I moved it into ti->status. Converted i386/x86-64/ia64 for now because that was the easiest way to fix ACPI which also manipulates these flags in its idle function. 
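The resulting pattern, as a minimal sketch (simplified from the default_idle() hunk below; not a drop-in implementation):

static void idle_sketch(void)
{
	/* only the idle task writes its own ->status; other CPUs just read */
	current_thread_info()->status &= ~TS_POLLING;
	smp_mb__after_clear_bit();
	while (!need_resched())
		safe_halt();	/* woken by interrupt, not by polling */
	current_thread_info()->status |= TS_POLLING;
}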
Cc: Nick Piggin Cc: Tony Luck Cc: Len Brown Signed-off-by: Andi Kleen Signed-off-by: Linus Torvalds --- arch/x86_64/kernel/process.c | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) (limited to 'arch/x86_64/kernel') diff --git a/arch/x86_64/kernel/process.c b/arch/x86_64/kernel/process.c index d6fa41459c80..b596837a1527 100644 --- a/arch/x86_64/kernel/process.c +++ b/arch/x86_64/kernel/process.c @@ -110,7 +110,7 @@ static void default_idle(void) { local_irq_enable(); - clear_thread_flag(TIF_POLLING_NRFLAG); + current_thread_info()->status &= ~TS_POLLING; smp_mb__after_clear_bit(); while (!need_resched()) { local_irq_disable(); @@ -119,7 +119,7 @@ static void default_idle(void) else local_irq_enable(); } - set_thread_flag(TIF_POLLING_NRFLAG); + current_thread_info()->status |= TS_POLLING; } /* @@ -202,8 +202,7 @@ static inline void play_dead(void) */ void cpu_idle (void) { - set_thread_flag(TIF_POLLING_NRFLAG); - + current_thread_info()->status |= TS_POLLING; /* endless idle loop with no priority at all */ while (1) { while (!need_resched()) { -- cgit v1.2.3 From e42f94373764d825e2c0464092738f66f5d3effb Mon Sep 17 00:00:00 2001 From: Rohit Seth Date: Mon, 26 Jun 2006 13:59:14 +0200 Subject: [PATCH] x86_64: x86_64 setup.c - printing cmp related boottime information Getting phys_proc_id and cpu_core_id information to be printed at boot time for AMD processors. Also matching the Node related boot time information that gets printed for Intel and AMD processors for NUMA configurations. Signed-off-by: Rohit Seth Signed-off-by: Andi Kleen Signed-off-by: Linus Torvalds --- arch/x86_64/kernel/setup.c | 34 ++++++++++++++++------------------ 1 file changed, 16 insertions(+), 18 deletions(-) (limited to 'arch/x86_64/kernel') diff --git a/arch/x86_64/kernel/setup.c b/arch/x86_64/kernel/setup.c index 923b76fb0aa6..8ade23d7ae76 100644 --- a/arch/x86_64/kernel/setup.c +++ b/arch/x86_64/kernel/setup.c @@ -833,15 +833,13 @@ static void __init amd_detect_cmp(struct cpuinfo_x86 *c) } numa_set_node(cpu, node); - printk(KERN_INFO "CPU %d/%x(%d) -> Node %d -> Core %d\n", - cpu, apicid, c->x86_max_cores, node, c->cpu_core_id); + printk(KERN_INFO "CPU %d/%x -> Node %d\n", cpu, apicid, node); #endif #endif } -static int __init init_amd(struct cpuinfo_x86 *c) +static void __init init_amd(struct cpuinfo_x86 *c) { - int r; unsigned level; #ifdef CONFIG_SMP @@ -874,8 +872,8 @@ static int __init init_amd(struct cpuinfo_x86 *c) if (c->x86 >= 6) set_bit(X86_FEATURE_FXSAVE_LEAK, &c->x86_capability); - r = get_model_name(c); - if (!r) { + level = get_model_name(c); + if (!level) { switch (c->x86) { case 15: /* Should distinguish Models here, but this is only @@ -896,8 +894,6 @@ static int __init init_amd(struct cpuinfo_x86 *c) /* Fix cpuid4 emulation for more */ num_cache_leaves = 3; - - return r; } static void __cpuinit detect_ht(struct cpuinfo_x86 *c) @@ -909,8 +905,10 @@ static void __cpuinit detect_ht(struct cpuinfo_x86 *c) cpuid(1, &eax, &ebx, &ecx, &edx); - if (!cpu_has(c, X86_FEATURE_HT) || cpu_has(c, X86_FEATURE_CMP_LEGACY)) + if (!cpu_has(c, X86_FEATURE_HT)) return; + if (cpu_has(c, X86_FEATURE_CMP_LEGACY)) + goto out; smp_num_siblings = (ebx & 0xff0000) >> 16; @@ -927,9 +925,6 @@ static void __cpuinit detect_ht(struct cpuinfo_x86 *c) index_msb = get_count_order(smp_num_siblings); c->phys_proc_id = phys_pkg_id(index_msb); - printk(KERN_INFO "CPU: Physical Processor ID: %d\n", - c->phys_proc_id); - smp_num_siblings = smp_num_siblings / c->x86_max_cores; index_msb = get_count_order(smp_num_siblings) ; @@ 
-938,11 +933,13 @@ static void __cpuinit detect_ht(struct cpuinfo_x86 *c) c->cpu_core_id = phys_pkg_id(index_msb) & ((1 << core_bits) - 1); - - if (c->x86_max_cores > 1) - printk(KERN_INFO "CPU: Processor Core ID: %d\n", - c->cpu_core_id); } +out: + if ((c->x86_max_cores * smp_num_siblings) > 1) { + printk(KERN_INFO "CPU: Physical Processor ID: %d\n", c->phys_proc_id); + printk(KERN_INFO "CPU: Processor Core ID: %d\n", c->cpu_core_id); + } + #endif } @@ -969,16 +966,17 @@ static void srat_detect_node(void) #ifdef CONFIG_NUMA unsigned node; int cpu = smp_processor_id(); + int apicid = hard_smp_processor_id(); /* Don't do the funky fallback heuristics the AMD version employs for now. */ - node = apicid_to_node[hard_smp_processor_id()]; + node = apicid_to_node[apicid]; if (node == NUMA_NO_NODE) node = first_node(node_online_map); numa_set_node(cpu, node); if (acpi_numa > 0) - printk(KERN_INFO "CPU %d -> Node %d\n", cpu, node); + printk(KERN_INFO "CPU %d/%x -> Node %d\n", cpu, apicid, node); #endif } -- cgit v1.2.3 From 75bd665cc9f4d0f67164d9145e446ae554d4ed3f Mon Sep 17 00:00:00 2001 From: Andi Kleen Date: Mon, 26 Jun 2006 13:59:17 +0200 Subject: [PATCH] x86_64: Fix fast check in safe_smp_processor_id The APIC ID returned by hard_smp_processor_id can be beyond NR_CPUS and then overflow the x86_cpu_to_apic[] array. Add a check for overflow. If it happens then the slow loop below will catch. Bug pointed out by Doug Thompson Signed-off-by: Andi Kleen Signed-off-by: Linus Torvalds --- arch/x86_64/kernel/smp.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'arch/x86_64/kernel') diff --git a/arch/x86_64/kernel/smp.c b/arch/x86_64/kernel/smp.c index fe906ba1cf45..5885b8f044b2 100644 --- a/arch/x86_64/kernel/smp.c +++ b/arch/x86_64/kernel/smp.c @@ -520,13 +520,13 @@ asmlinkage void smp_call_function_interrupt(void) int safe_smp_processor_id(void) { - int apicid, i; + unsigned apicid, i; if (disable_apic) return 0; apicid = hard_smp_processor_id(); - if (x86_cpu_to_apicid[apicid] == apicid) + if (apicid < NR_CPUS && x86_cpu_to_apicid[apicid] == apicid) return apicid; for (i = 0; i < NR_CPUS; ++i) { -- cgit v1.2.3 From bdbdaa791fb5b97ba5b3124c7593ffe308e2afef Mon Sep 17 00:00:00 2001 From: Jan Beulich Date: Mon, 26 Jun 2006 13:59:23 +0200 Subject: [PATCH] i386/x86-64: adjust /proc/interrupts column headings With (significantly) more than 10 CPUs online, the column headings drifted off the positions of the column contents with growing CPU numbers. Signed-off-by: Jan Beulich Signed-off-by: Andi Kleen Signed-off-by: Linus Torvalds --- arch/x86_64/kernel/irq.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86_64/kernel') diff --git a/arch/x86_64/kernel/irq.c b/arch/x86_64/kernel/irq.c index d8bd0b345b1e..207ecdc39822 100644 --- a/arch/x86_64/kernel/irq.c +++ b/arch/x86_64/kernel/irq.c @@ -39,7 +39,7 @@ int show_interrupts(struct seq_file *p, void *v) if (i == 0) { seq_printf(p, " "); for_each_online_cpu(j) - seq_printf(p, "CPU%d ",j); + seq_printf(p, "CPU%-8d",j); seq_putc(p, '\n'); } -- cgit v1.2.3 From cab093b9d4b40c71c6261a11ba8f1ca027e09008 Mon Sep 17 00:00:00 2001 From: Jan Beulich Date: Mon, 26 Jun 2006 13:59:26 +0200 Subject: [PATCH] x86_64: adjust kstack_depth_to_print default Defaulting to a value not evenly divisible by four makes little sense, as four values are displayed per line (and hence the rest of the line would otherwise be wasted). 
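The layout constraint behind the new default, sketched below (an illustrative helper in the style of show_stack(), not the literal code): four stack words go on each output line, so a depth of 12 fills exactly three lines, where 10 left the last line half empty.

static void dump_words_sketch(unsigned long *stack, int depth)
{
	int i;

	for (i = 0; i < depth; i++) {
		if (i && (i % 4) == 0)	/* four entries per line */
			printk("\n       ");
		printk(" %016lx", stack[i]);
	}
	printk("\n");
}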
Signed-off-by: Jan Beulich Signed-off-by: Andi Kleen Signed-off-by: Linus Torvalds --- arch/x86_64/kernel/traps.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86_64/kernel') diff --git a/arch/x86_64/kernel/traps.c b/arch/x86_64/kernel/traps.c index bd0891f4c2c7..08f24359bfc1 100644 --- a/arch/x86_64/kernel/traps.c +++ b/arch/x86_64/kernel/traps.c @@ -106,7 +106,7 @@ static inline void preempt_conditional_cli(struct pt_regs *regs) preempt_enable_no_resched(); } -static int kstack_depth_to_print = 10; +static int kstack_depth_to_print = 12; static int call_trace = 1; #ifdef CONFIG_KALLSYMS -- cgit v1.2.3 From 9c63f8738734eb7e6d3f76ca03186f16ef88edf5 Mon Sep 17 00:00:00 2001 From: Piotr Kaczuba Date: Mon, 26 Jun 2006 13:59:38 +0200 Subject: [PATCH] x86_64: Fix modular pc speaker It turned out that the following change is needed when the speaker is compiled as a module. Signed-off-by: Andi Kleen Signed-off-by: Linus Torvalds --- arch/x86_64/kernel/setup.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86_64/kernel') diff --git a/arch/x86_64/kernel/setup.c b/arch/x86_64/kernel/setup.c index 8ade23d7ae76..0d2d4f67aa73 100644 --- a/arch/x86_64/kernel/setup.c +++ b/arch/x86_64/kernel/setup.c @@ -1361,7 +1361,7 @@ struct seq_operations cpuinfo_op = { .show = show_cpuinfo, }; -#ifdef CONFIG_INPUT_PCSPKR +#if defined(CONFIG_INPUT_PCSPKR) || defined(CONFIG_INPUT_PCSPKR_MODULE) #include static __init int add_pcspkr(void) { -- cgit v1.2.3 From 45486f81c9aa07218b73a38cbcf62ffa66e99088 Mon Sep 17 00:00:00 2001 From: Keith Owens Date: Mon, 26 Jun 2006 13:59:41 +0200 Subject: [PATCH] x86_64: Standardize i386/x86_64 handling of NMI_VECTOR x86_64 and i386 behave inconsistently when sending an IPI on vector 2 (NMI_VECTOR). Make both behave the same, so IPI 2 is sent as NMI. The crash code was abusing send_IPI_allbutself() by passing a code instead of a vector; it only worked because crash knew about the internal code of send_IPI_allbutself(). Change crash to use NMI_VECTOR instead, and remove the comment about how crash was abusing the function. This patch is a prerequisite for fixing the problem where sending an IPI as NMI would reboot some Dell Xeon systems. I cannot fix that problem while crash continues to abuse send_IPI_allbutself(). It also removes the inconsistency between i386 and x86_64 for NMI_VECTOR. That will simplify all the RAS code that needs to bring all the cpus to a clean stop, even when one or more cpus are spinning disabled.
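A short sketch of the distinction at issue (an editorial illustration, not part of the patch; constant values as defined in the i386/x86-64 headers of this era): NMI_VECTOR is a vector number, while APIC_DM_NMI is an ICR delivery-mode field, so passing the latter where a vector is expected only worked by accident.

        /* NMI_VECTOR is a (reserved) vector number; APIC_DM_NMI is a
         * delivery-mode bit pattern for the ICR. They live in different
         * fields of the register and must not be conflated. */
        #define NMI_VECTOR      0x02    /* IPI vector 2, delivered as NMI */
        #define APIC_DM_NMI     0x00400 /* ICR delivery-mode bits, not a vector */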
Signed-off-by: Keith Owens Signed-off-by: Andi Kleen Signed-off-by: Linus Torvalds --- arch/x86_64/kernel/crash.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86_64/kernel') diff --git a/arch/x86_64/kernel/crash.c b/arch/x86_64/kernel/crash.c index ec1c7431d5af..8ca04912b1cc 100644 --- a/arch/x86_64/kernel/crash.c +++ b/arch/x86_64/kernel/crash.c @@ -118,7 +118,7 @@ static int crash_nmi_callback(struct pt_regs *regs, int cpu) static void smp_send_nmi_allbutself(void) { - send_IPI_allbutself(APIC_DM_NMI); + send_IPI_allbutself(NMI_VECTOR); } /* -- cgit v1.2.3 From 2ee60e17896c65da1df5780d3196c050bccb7d10 Mon Sep 17 00:00:00 2001 From: Andi Kleen Date: Mon, 26 Jun 2006 13:59:44 +0200 Subject: [PATCH] x86_64: Move export symbols to their C functions Only exports for assembler files are left in x8664_ksyms.c Originally inspired by a patch from Al Viro Signed-off-by: Andi Kleen Signed-off-by: Linus Torvalds --- arch/x86_64/kernel/e820.c | 1 + arch/x86_64/kernel/process.c | 1 + arch/x86_64/kernel/reboot.c | 1 + arch/x86_64/kernel/setup.c | 2 + arch/x86_64/kernel/setup64.c | 2 + arch/x86_64/kernel/smp.c | 4 ++ arch/x86_64/kernel/smpboot.c | 5 ++ arch/x86_64/kernel/time.c | 2 + arch/x86_64/kernel/traps.c | 2 + arch/x86_64/kernel/x8664_ksyms.c | 114 ++------------------------------------- 10 files changed, 25 insertions(+), 109 deletions(-) (limited to 'arch/x86_64/kernel') diff --git a/arch/x86_64/kernel/e820.c b/arch/x86_64/kernel/e820.c index d8ea250f3925..9e94d834624b 100644 --- a/arch/x86_64/kernel/e820.c +++ b/arch/x86_64/kernel/e820.c @@ -620,6 +620,7 @@ void __init parse_memmapopt(char *p, char **from) } unsigned long pci_mem_start = 0xaeedbabe; +EXPORT_SYMBOL(pci_mem_start); /* * Search for the biggest gap in the low 32 bits of the e820 diff --git a/arch/x86_64/kernel/process.c b/arch/x86_64/kernel/process.c index b596837a1527..ca56e19b8b6e 100644 --- a/arch/x86_64/kernel/process.c +++ b/arch/x86_64/kernel/process.c @@ -63,6 +63,7 @@ EXPORT_SYMBOL(boot_option_idle_override); * Powermanagement idle function, if any.. 
*/ void (*pm_idle)(void); +EXPORT_SYMBOL(pm_idle); static DEFINE_PER_CPU(unsigned int, cpu_idle_state); static ATOMIC_NOTIFIER_HEAD(idle_notifier); diff --git a/arch/x86_64/kernel/reboot.c b/arch/x86_64/kernel/reboot.c index 57117b8beb2b..2d6769847456 100644 --- a/arch/x86_64/kernel/reboot.c +++ b/arch/x86_64/kernel/reboot.c @@ -20,6 +20,7 @@ * Power off function, if any */ void (*pm_power_off)(void); +EXPORT_SYMBOL(pm_power_off); static long no_idt[3]; static enum { diff --git a/arch/x86_64/kernel/setup.c b/arch/x86_64/kernel/setup.c index 0d2d4f67aa73..2a5fce0fd1c4 100644 --- a/arch/x86_64/kernel/setup.c +++ b/arch/x86_64/kernel/setup.c @@ -71,6 +71,7 @@ */ struct cpuinfo_x86 boot_cpu_data __read_mostly; +EXPORT_SYMBOL(boot_cpu_data); unsigned long mmu_cr4_features; @@ -99,6 +100,7 @@ char dmi_alloc_data[DMI_MAX_DATA]; * Setup options */ struct screen_info screen_info; +EXPORT_SYMBOL(screen_info); struct sys_desc_table_struct { unsigned short length; unsigned char table[0]; diff --git a/arch/x86_64/kernel/setup64.c b/arch/x86_64/kernel/setup64.c index e5bf22a01edb..f5934cb4a2b6 100644 --- a/arch/x86_64/kernel/setup64.c +++ b/arch/x86_64/kernel/setup64.c @@ -30,6 +30,7 @@ char x86_boot_params[BOOT_PARAM_SIZE] __initdata = {0,}; cpumask_t cpu_initialized __cpuinitdata = CPU_MASK_NONE; struct x8664_pda *_cpu_pda[NR_CPUS] __read_mostly; +EXPORT_SYMBOL(_cpu_pda); struct x8664_pda boot_cpu_pda[NR_CPUS] __cacheline_aligned; struct desc_ptr idt_descr = { 256 * 16 - 1, (unsigned long) idt_table }; @@ -37,6 +38,7 @@ struct desc_ptr idt_descr = { 256 * 16 - 1, (unsigned long) idt_table }; char boot_cpu_stack[IRQSTACKSIZE] __attribute__((section(".bss.page_aligned"))); unsigned long __supported_pte_mask __read_mostly = ~0UL; +EXPORT_SYMBOL(__supported_pte_mask); static int do_not_nx __cpuinitdata = 0; /* noexec=on|off diff --git a/arch/x86_64/kernel/smp.c b/arch/x86_64/kernel/smp.c index 5885b8f044b2..8188bae9c6d5 100644 --- a/arch/x86_64/kernel/smp.c +++ b/arch/x86_64/kernel/smp.c @@ -224,6 +224,7 @@ void flush_tlb_current_task(void) flush_tlb_others(cpu_mask, mm, FLUSH_ALL); preempt_enable(); } +EXPORT_SYMBOL(flush_tlb_current_task); void flush_tlb_mm (struct mm_struct * mm) { @@ -244,6 +245,7 @@ void flush_tlb_mm (struct mm_struct * mm) preempt_enable(); } +EXPORT_SYMBOL(flush_tlb_mm); void flush_tlb_page(struct vm_area_struct * vma, unsigned long va) { @@ -266,6 +268,7 @@ void flush_tlb_page(struct vm_area_struct * vma, unsigned long va) preempt_enable(); } +EXPORT_SYMBOL(flush_tlb_page); static void do_flush_tlb_all(void* info) { @@ -443,6 +446,7 @@ int smp_call_function (void (*func) (void *info), void *info, int nonatomic, spin_unlock(&call_lock); return 0; } +EXPORT_SYMBOL(smp_call_function); void smp_stop_cpu(void) { diff --git a/arch/x86_64/kernel/smpboot.c b/arch/x86_64/kernel/smpboot.c index b1c10b154bfe..4e9755179ecf 100644 --- a/arch/x86_64/kernel/smpboot.c +++ b/arch/x86_64/kernel/smpboot.c @@ -63,9 +63,11 @@ /* Number of siblings per CPU package */ int smp_num_siblings = 1; +EXPORT_SYMBOL(smp_num_siblings); /* Last level cache ID of each logical CPU */ u8 cpu_llc_id[NR_CPUS] __cpuinitdata = {[0 ... 
NR_CPUS-1] = BAD_APICID}; +EXPORT_SYMBOL(cpu_llc_id); /* Bitmask of currently online CPUs */ cpumask_t cpu_online_map __read_mostly; @@ -78,18 +80,21 @@ EXPORT_SYMBOL(cpu_online_map); */ cpumask_t cpu_callin_map; cpumask_t cpu_callout_map; +EXPORT_SYMBOL(cpu_callout_map); cpumask_t cpu_possible_map; EXPORT_SYMBOL(cpu_possible_map); /* Per CPU bogomips and other parameters */ struct cpuinfo_x86 cpu_data[NR_CPUS] __cacheline_aligned; +EXPORT_SYMBOL(cpu_data); /* Set when the idlers are all forked */ int smp_threads_ready; /* representing HT siblings of each logical CPU */ cpumask_t cpu_sibling_map[NR_CPUS] __read_mostly; +EXPORT_SYMBOL(cpu_sibling_map); /* representing HT and core siblings of each logical CPU */ cpumask_t cpu_core_map[NR_CPUS] __read_mostly; diff --git a/arch/x86_64/kernel/time.c b/arch/x86_64/kernel/time.c index 2125d6c05ff7..ebbee6f59ff5 100644 --- a/arch/x86_64/kernel/time.c +++ b/arch/x86_64/kernel/time.c @@ -51,6 +51,7 @@ extern int using_apic_timer; static char *time_init_gtod(void); DEFINE_SPINLOCK(rtc_lock); +EXPORT_SYMBOL(rtc_lock); DEFINE_SPINLOCK(i8253_lock); int nohpet __initdata = 0; @@ -64,6 +65,7 @@ static int notsc __initdata = 0; #define US_SCALE 32 /* 2^32, arbitralrily chosen */ unsigned int cpu_khz; /* TSC clocks / usec, not used here */ +EXPORT_SYMBOL(cpu_khz); static unsigned long hpet_period; /* fsecs / HPET clock */ unsigned long hpet_tick; /* HPET clocks / interrupt */ int hpet_use_timer; /* Use counter of hpet for time keeping, otherwise PIT */ diff --git a/arch/x86_64/kernel/traps.c b/arch/x86_64/kernel/traps.c index 08f24359bfc1..3d11a2fe45b7 100644 --- a/arch/x86_64/kernel/traps.c +++ b/arch/x86_64/kernel/traps.c @@ -70,6 +70,7 @@ asmlinkage void machine_check(void); asmlinkage void spurious_interrupt_bug(void); ATOMIC_NOTIFIER_HEAD(die_chain); +EXPORT_SYMBOL(die_chain); int register_die_notifier(struct notifier_block *nb) { @@ -431,6 +432,7 @@ void out_of_line_bug(void) { BUG(); } +EXPORT_SYMBOL(out_of_line_bug); #endif static DEFINE_SPINLOCK(die_lock); diff --git a/arch/x86_64/kernel/x8664_ksyms.c b/arch/x86_64/kernel/x8664_ksyms.c index 1def21c9f7cd..370952c4ff22 100644 --- a/arch/x86_64/kernel/x8664_ksyms.c +++ b/arch/x86_64/kernel/x8664_ksyms.c @@ -1,66 +1,21 @@ +/* Exports for assembly files. + All C exports should go in the respective C files. */ + #include #include #include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include #include #include -#include #include -#include -#include -#include -#include -#include -#include #include -#include -#include -#include -#include -#include -#include - -extern spinlock_t rtc_lock; -#ifdef CONFIG_SMP -extern void __write_lock_failed(rwlock_t *rw); -extern void __read_lock_failed(rwlock_t *rw); -#endif - -/* platform dependent support */ -EXPORT_SYMBOL(boot_cpu_data); -//EXPORT_SYMBOL(dump_fpu); -EXPORT_SYMBOL(__ioremap); -EXPORT_SYMBOL(ioremap_nocache); -EXPORT_SYMBOL(iounmap); EXPORT_SYMBOL(kernel_thread); -EXPORT_SYMBOL(pm_idle); -EXPORT_SYMBOL(pm_power_off); EXPORT_SYMBOL(__down_failed); EXPORT_SYMBOL(__down_failed_interruptible); EXPORT_SYMBOL(__down_failed_trylock); EXPORT_SYMBOL(__up_wakeup); -/* Networking helper routines. 
*/ EXPORT_SYMBOL(csum_partial_copy_nocheck); EXPORT_SYMBOL(ip_compute_csum); /* Delay loops */ EXPORT_SYMBOL(__udelay); EXPORT_SYMBOL(__ndelay); EXPORT_SYMBOL(__delay); EXPORT_SYMBOL(__const_udelay); EXPORT_SYMBOL(__get_user_1); EXPORT_SYMBOL(__get_user_2); @@ -71,42 +26,20 @@ EXPORT_SYMBOL(__put_user_2); EXPORT_SYMBOL(__put_user_4); EXPORT_SYMBOL(__put_user_8); -EXPORT_SYMBOL(strncpy_from_user); -EXPORT_SYMBOL(__strncpy_from_user); -EXPORT_SYMBOL(clear_user); -EXPORT_SYMBOL(__clear_user); EXPORT_SYMBOL(copy_user_generic); EXPORT_SYMBOL(copy_from_user); EXPORT_SYMBOL(copy_to_user); -EXPORT_SYMBOL(copy_in_user); -EXPORT_SYMBOL(strnlen_user); - -#ifdef CONFIG_PCI -EXPORT_SYMBOL(pci_mem_start); -#endif EXPORT_SYMBOL(copy_page); EXPORT_SYMBOL(clear_page); -EXPORT_SYMBOL(_cpu_pda); #ifdef CONFIG_SMP -EXPORT_SYMBOL(cpu_data); +extern void FASTCALL( __write_lock_failed(rwlock_t *rw)); +extern void FASTCALL( __read_lock_failed(rwlock_t *rw)); EXPORT_SYMBOL(__write_lock_failed); EXPORT_SYMBOL(__read_lock_failed); - -EXPORT_SYMBOL(smp_call_function); -EXPORT_SYMBOL(cpu_callout_map); -#endif - -#ifdef CONFIG_VT -EXPORT_SYMBOL(screen_info); #endif -EXPORT_SYMBOL(rtc_lock); - -EXPORT_SYMBOL_GPL(set_nmi_callback); -EXPORT_SYMBOL_GPL(unset_nmi_callback); - /* Export string functions. We normally rely on gcc builtin for most of these, but gcc sometimes decides not to inline them. */ #undef memcpy @@ -114,51 +47,14 @@ EXPORT_SYMBOL_GPL(unset_nmi_callback); #undef memmove extern void * memset(void *,int,__kernel_size_t); -extern size_t strlen(const char *); -extern void * memmove(void * dest,const void *src,size_t count); extern void * memcpy(void *,const void *,__kernel_size_t); extern void * __memcpy(void *,const void *,__kernel_size_t); EXPORT_SYMBOL(memset); -EXPORT_SYMBOL(memmove); EXPORT_SYMBOL(memcpy); EXPORT_SYMBOL(__memcpy); -#ifdef CONFIG_RWSEM_XCHGADD_ALGORITHM -/* prototypes are wrong, these are assembly with custom calling functions */ -extern void rwsem_down_read_failed_thunk(void); -extern void rwsem_wake_thunk(void); -extern void rwsem_downgrade_thunk(void); -extern void rwsem_down_write_failed_thunk(void); -EXPORT_SYMBOL(rwsem_down_read_failed_thunk); -EXPORT_SYMBOL(rwsem_wake_thunk); -EXPORT_SYMBOL(rwsem_downgrade_thunk); -EXPORT_SYMBOL(rwsem_down_write_failed_thunk); -#endif - EXPORT_SYMBOL(empty_zero_page); - -EXPORT_SYMBOL(die_chain); - -#ifdef CONFIG_SMP -EXPORT_SYMBOL(cpu_sibling_map); -EXPORT_SYMBOL(smp_num_siblings); -#endif - -#ifdef CONFIG_BUG -EXPORT_SYMBOL(out_of_line_bug); -#endif - EXPORT_SYMBOL(init_level4_pgt); - -extern unsigned long __supported_pte_mask; -EXPORT_SYMBOL(__supported_pte_mask); - -#ifdef CONFIG_SMP -EXPORT_SYMBOL(flush_tlb_page); -#endif - -EXPORT_SYMBOL(cpu_khz); - EXPORT_SYMBOL(load_gs_index); -- cgit v1.2.3 From 704fc59e1d056de80beaf30174bc8e0b1682efbb Mon Sep 17 00:00:00 2001 From: "Siddha, Suresh B" Date: Mon, 26 Jun 2006 13:59:53 +0200 Subject: [PATCH] x86_64: fix apic error on bootup The appended patch fixes the "APIC error on CPUX: 00(40)" observed during bootup. From the SDM Vol-3A "Valid Interrupt Vectors" section: "When an illegal vector value (0-15) is written to an LVT entry and the delivery mode is Fixed, the APIC may signal an illegal vector error, without regard to whether the mask bit is set or whether an interrupt is actually seen on input."
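The practical rule the SDM quote implies, as a sketch mirroring the fix in the diff below: when masking an LVT entry, keep a legal vector (16 or above) in the vector field so the APIC has no reason to flag an illegal-vector error.

        /* Mask the LVT timer entry while keeping a valid vector in the
         * low byte, instead of masking with whatever vector is there. */
        unsigned long v = apic_read(APIC_LVTT);
        v |= (APIC_LVT_MASKED | LOCAL_TIMER_VECTOR);
        apic_write(APIC_LVTT, v);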
Signed-off-by: Suresh Siddha Signed-off-by: Andi Kleen Signed-off-by: Linus Torvalds --- arch/x86_64/kernel/apic.c | 15 +++++++++++++-- 1 file changed, 13 insertions(+), 2 deletions(-) (limited to 'arch/x86_64/kernel') diff --git a/arch/x86_64/kernel/apic.c b/arch/x86_64/kernel/apic.c index 396e125cb212..b2ead91df218 100644 --- a/arch/x86_64/kernel/apic.c +++ b/arch/x86_64/kernel/apic.c @@ -100,7 +100,7 @@ void clear_local_APIC(void) maxlvt = get_maxlvt(); /* - * Masking an LVT entry on a P6 can trigger a local APIC error + * Masking an LVT entry can trigger a local APIC error * if the vector is zero. Mask LVTERR first to prevent this. */ if (maxlvt >= 3) { @@ -851,7 +851,18 @@ void disable_APIC_timer(void) unsigned long v; v = apic_read(APIC_LVTT); - apic_write(APIC_LVTT, v | APIC_LVT_MASKED); + /* + * When an illegal vector value (0-15) is written to an LVT + * entry and delivery mode is Fixed, the APIC may signal an + * illegal vector error, with out regard to whether the mask + * bit is set or whether an interrupt is actually seen on input. + * + * Boot sequence might call this function when the LVTT has + * '0' vector value. So make sure vector field is set to + * valid value. + */ + v |= (APIC_LVT_MASKED | LOCAL_TIMER_VECTOR); + apic_write(APIC_LVTT, v); } } -- cgit v1.2.3 From e77deacb7b078156fcadf27b838a4ce1a65eda04 Mon Sep 17 00:00:00 2001 From: Keith Owens Date: Mon, 26 Jun 2006 13:59:56 +0200 Subject: [PATCH] x86_64: Avoid broadcasting NMI IPIs On some i386/x86_64 systems, sending an NMI IPI as a broadcast will reset the system. This seems to be a BIOS bug which affects machines where one or more cpus are not under OS control. It occurs on HT systems with a version of the OS that is not compiled with HT support. It also occurs when a system is booted with max_cpus=n where 2 <= n < cpus known to the BIOS. The fix is to always send the NMI IPI as a mask instead of as a broadcast.
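In code terms, a sketch of the approach (distilled from the diff below): build an explicit mask of the online CPUs minus the sender rather than using the ALLBUT shortcut, so CPUs not under OS control never see the NMI.

        /* Send NMI IPIs to an explicit cpumask, never as a broadcast. */
        if (vector == NMI_VECTOR) {
                cpumask_t allbutme = cpu_online_map;

                cpu_clear(smp_processor_id(), allbutme);
                if (!cpus_empty(allbutme))
                        flat_send_IPI_mask(allbutme, vector);
        }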
Signed-off-by: Keith Owens Signed-off-by: Andi Kleen Signed-off-by: Linus Torvalds --- arch/x86_64/kernel/genapic_flat.c | 25 ++++++++++++++++--------- 1 file changed, 16 insertions(+), 9 deletions(-) (limited to 'arch/x86_64/kernel') diff --git a/arch/x86_64/kernel/genapic_flat.c b/arch/x86_64/kernel/genapic_flat.c index c66ca7b1d31a..21c7066e236a 100644 --- a/arch/x86_64/kernel/genapic_flat.c +++ b/arch/x86_64/kernel/genapic_flat.c @@ -78,22 +78,29 @@ static void flat_send_IPI_mask(cpumask_t cpumask, int vector) static void flat_send_IPI_allbutself(int vector) { -#ifndef CONFIG_HOTPLUG_CPU - if (((num_online_cpus()) - 1) >= 1) - __send_IPI_shortcut(APIC_DEST_ALLBUT, vector,APIC_DEST_LOGICAL); +#ifdef CONFIG_HOTPLUG_CPU + int hotplug = 1; #else - cpumask_t allbutme = cpu_online_map; + int hotplug = 0; +#endif + if (hotplug || vector == NMI_VECTOR) { + cpumask_t allbutme = cpu_online_map; - cpu_clear(smp_processor_id(), allbutme); + cpu_clear(smp_processor_id(), allbutme); - if (!cpus_empty(allbutme)) - flat_send_IPI_mask(allbutme, vector); -#endif + if (!cpus_empty(allbutme)) + flat_send_IPI_mask(allbutme, vector); + } else if (num_online_cpus() > 1) { + __send_IPI_shortcut(APIC_DEST_ALLBUT, vector,APIC_DEST_LOGICAL); + } } static void flat_send_IPI_all(int vector) { - __send_IPI_shortcut(APIC_DEST_ALLINC, vector, APIC_DEST_LOGICAL); + if (vector == NMI_VECTOR) + flat_send_IPI_mask(cpu_online_map, vector); + else + __send_IPI_shortcut(APIC_DEST_ALLINC, vector, APIC_DEST_LOGICAL); } static int flat_apic_id_registered(void) -- cgit v1.2.3 From 0080e667550db5ae8c9318181500c413b99ff164 Mon Sep 17 00:00:00 2001 From: Venkatesh Pallipadi Date: Mon, 26 Jun 2006 13:59:59 +0200 Subject: [PATCH] x86_64: i386/x86-64 Add nmi watchdog support for new Intel CPUs Intel now has support for Architectural Performance Monitoring Counters (refer to the IA-32 Intel Architecture Software Developer's Manual, http://www.intel.com/design/pentium4/manuals/253669.htm). This feature is present starting from Intel Core Duo and Intel Core Solo processors. This means the performance monitoring counters and some performance monitoring events are now defined in an architectural way (using cpuid), so there is no need to check for family/model etc. for these architectural events. Below is the patch to use these performance counters in the NMI watchdog driver. The patch handles both i386 and x86-64 kernels.
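As an illustration of "defined in an architectural way (using cpuid)": CPUID leaf 10 reports the perfmon version ID in EAX bits 7:0 and the number of general-purpose counters in EAX bits 15:8, and the setup.c hunk below keys the new feature bit off exactly this check.

        /* Detect architectural perfmon: a non-zero version ID and more
         * than one general-purpose counter. */
        unsigned eax = cpuid_eax(10);
        if ((eax & 0xff) && (((eax >> 8) & 0xff) > 1))
                set_bit(X86_FEATURE_ARCH_PERFMON, &c->x86_capability);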
Signed-off-by: Venkatesh Pallipadi Signed-off-by: Andi Kleen Signed-off-by: Linus Torvalds --- arch/x86_64/kernel/nmi.c | 81 +++++++++++++++++++++++++++++++++++++++++++--- arch/x86_64/kernel/setup.c | 7 ++++ 2 files changed, 83 insertions(+), 5 deletions(-) (limited to 'arch/x86_64/kernel') diff --git a/arch/x86_64/kernel/nmi.c b/arch/x86_64/kernel/nmi.c index ab421e22fa67..399489c93132 100644 --- a/arch/x86_64/kernel/nmi.c +++ b/arch/x86_64/kernel/nmi.c @@ -27,6 +27,7 @@ #include #include #include +#include /* * lapic_nmi_owner tracks the ownership of the lapic NMI hardware: @@ -66,6 +67,9 @@ static unsigned int nmi_p4_cccr_val; #define K7_EVENT_CYCLES_PROCESSOR_IS_RUNNING 0x76 #define K7_NMI_EVENT K7_EVENT_CYCLES_PROCESSOR_IS_RUNNING +#define ARCH_PERFMON_NMI_EVENT_SEL ARCH_PERFMON_UNHALTED_CORE_CYCLES_SEL +#define ARCH_PERFMON_NMI_EVENT_UMASK ARCH_PERFMON_UNHALTED_CORE_CYCLES_UMASK + #define MSR_P4_MISC_ENABLE 0x1A0 #define MSR_P4_MISC_ENABLE_PERF_AVAIL (1<<7) #define MSR_P4_MISC_ENABLE_PEBS_UNAVAIL (1<<12) @@ -97,7 +101,10 @@ static __cpuinit inline int nmi_known_cpu(void) case X86_VENDOR_AMD: return boot_cpu_data.x86 == 15; case X86_VENDOR_INTEL: - return boot_cpu_data.x86 == 15; + if (cpu_has(&boot_cpu_data, X86_FEATURE_ARCH_PERFMON)) + return 1; + else + return (boot_cpu_data.x86 == 15); } return 0; } @@ -203,6 +210,8 @@ int __init setup_nmi_watchdog(char *str) __setup("nmi_watchdog=", setup_nmi_watchdog); +static void disable_intel_arch_watchdog(void); + static void disable_lapic_nmi_watchdog(void) { if (nmi_active <= 0) @@ -215,6 +224,8 @@ static void disable_lapic_nmi_watchdog(void) if (boot_cpu_data.x86 == 15) { wrmsr(MSR_P4_IQ_CCCR0, 0, 0); wrmsr(MSR_P4_CRU_ESCR0, 0, 0); + } else if (cpu_has(&boot_cpu_data, X86_FEATURE_ARCH_PERFMON)) { + disable_intel_arch_watchdog(); } break; } @@ -367,6 +378,53 @@ static void setup_k7_watchdog(void) wrmsr(MSR_K7_EVNTSEL0, evntsel, 0); } +static void disable_intel_arch_watchdog(void) +{ + unsigned ebx; + + /* + * Check whether the Architectural PerfMon supports + * Unhalted Core Cycles Event or not. + * NOTE: Corresponding bit = 0 in ebp indicates event present. + */ + ebx = cpuid_ebx(10); + if (!(ebx & ARCH_PERFMON_UNHALTED_CORE_CYCLES_PRESENT)) + wrmsr(MSR_ARCH_PERFMON_EVENTSEL0, 0, 0); +} + +static int setup_intel_arch_watchdog(void) +{ + unsigned int evntsel; + unsigned ebx; + + /* + * Check whether the Architectural PerfMon supports + * Unhalted Core Cycles Event or not. + * NOTE: Corresponding bit = 0 in ebp indicates event present. 
+ */ + ebx = cpuid_ebx(10); + if ((ebx & ARCH_PERFMON_UNHALTED_CORE_CYCLES_PRESENT)) + return 0; + + nmi_perfctr_msr = MSR_ARCH_PERFMON_PERFCTR0; + + clear_msr_range(MSR_ARCH_PERFMON_EVENTSEL0, 2); + clear_msr_range(MSR_ARCH_PERFMON_PERFCTR0, 2); + + evntsel = ARCH_PERFMON_EVENTSEL_INT + | ARCH_PERFMON_EVENTSEL_OS + | ARCH_PERFMON_EVENTSEL_USR + | ARCH_PERFMON_NMI_EVENT_SEL + | ARCH_PERFMON_NMI_EVENT_UMASK; + + wrmsr(MSR_ARCH_PERFMON_EVENTSEL0, evntsel, 0); + wrmsrl(MSR_ARCH_PERFMON_PERFCTR0, -((u64)cpu_khz * 1000 / nmi_hz)); + apic_write(APIC_LVTPC, APIC_DM_NMI); + evntsel |= ARCH_PERFMON_EVENTSEL0_ENABLE; + wrmsr(MSR_ARCH_PERFMON_EVENTSEL0, evntsel, 0); + return 1; +} + static int setup_p4_watchdog(void) { @@ -420,10 +478,16 @@ void setup_apic_nmi_watchdog(void) setup_k7_watchdog(); break; case X86_VENDOR_INTEL: - if (boot_cpu_data.x86 != 15) - return; - if (!setup_p4_watchdog()) + if (cpu_has(&boot_cpu_data, X86_FEATURE_ARCH_PERFMON)) { + if (!setup_intel_arch_watchdog()) + return; + } else if (boot_cpu_data.x86 == 15) { + if (!setup_p4_watchdog()) + return; + } else { return; + } + break; default: @@ -508,7 +572,14 @@ void __kprobes nmi_watchdog_tick(struct pt_regs * regs, unsigned reason) */ wrmsr(MSR_P4_IQ_CCCR0, nmi_p4_cccr_val, 0); apic_write(APIC_LVTPC, APIC_DM_NMI); - } + } else if (nmi_perfctr_msr == MSR_ARCH_PERFMON_PERFCTR0) { + /* + * For Intel based architectural perfmon + * - LVTPC is masked on interrupt and must be + * unmasked by the LVTPC handler. + */ + apic_write(APIC_LVTPC, APIC_DM_NMI); + } wrmsrl(nmi_perfctr_msr, -((u64)cpu_khz * 1000 / nmi_hz)); } } diff --git a/arch/x86_64/kernel/setup.c b/arch/x86_64/kernel/setup.c index 2a5fce0fd1c4..594642a6c037 100644 --- a/arch/x86_64/kernel/setup.c +++ b/arch/x86_64/kernel/setup.c @@ -988,6 +988,13 @@ static void __cpuinit init_intel(struct cpuinfo_x86 *c) unsigned n; init_intel_cacheinfo(c); + if (c->cpuid_level > 9 ) { + unsigned eax = cpuid_eax(10); + /* Check for version and the number of counters */ + if ((eax & 0xff) && (((eax>>8) & 0xff) > 1)) + set_bit(X86_FEATURE_ARCH_PERFMON, &c->x86_capability); + } + n = c->extended_cpuid_level; if (n >= 0x80000008) { unsigned eax = cpuid_eax(0x80000008); -- cgit v1.2.3 From 4961f10e2205d0ededa291e12ec634efc58aa93c Mon Sep 17 00:00:00 2001 From: Eric Sandeen Date: Mon, 26 Jun 2006 14:00:05 +0200 Subject: [PATCH] x86_64: (resend) x86_64 stack overflow debugging Take two, now without spurious whitespace :( Applies to git & 2.6.17-rc6 CONFIG_DEBUG_STACKOVERFLOW existed for x86_64 in 2.4, but seems to have gone AWOL in 2.6. I've pretty much just copied this over from the 2.4 code, with appropriate tweaks for the 2.6 kernel, plus a bugfix. I'd personally rather see it printed out the way other arches do it, i.e. bytes-remaining-until-overflow, rather than having to do the subtraction yourself. Also, only 128 bytes remaining seems awfully late to issue a warning. 
But I'll start here :) Signed-off-by: Eric Sandeen Signed-off-by: Andi Kleen Signed-off-by: Linus Torvalds --- arch/x86_64/kernel/irq.c | 28 +++++++++++++++++++++++++++- 1 file changed, 27 insertions(+), 1 deletion(-) (limited to 'arch/x86_64/kernel') diff --git a/arch/x86_64/kernel/irq.c b/arch/x86_64/kernel/irq.c index 207ecdc39822..59518d4d4358 100644 --- a/arch/x86_64/kernel/irq.c +++ b/arch/x86_64/kernel/irq.c @@ -26,6 +26,30 @@ atomic_t irq_mis_count; #endif #endif +#ifdef CONFIG_DEBUG_STACKOVERFLOW +/* + * Probabilistic stack overflow check: + * + * Only check the stack in process context, because everything else + * runs on the big interrupt stacks. Checking reliably is too expensive, + * so we just check from interrupts. + */ +static inline void stack_overflow_check(struct pt_regs *regs) +{ + u64 curbase = (u64) current->thread_info; + static unsigned long warned = -60*HZ; + + if (regs->rsp >= curbase && regs->rsp <= curbase + THREAD_SIZE && + regs->rsp < curbase + sizeof(struct thread_info) + 128 && + time_after(jiffies, warned + 60*HZ)) { + printk("do_IRQ: %s near stack overflow (cur:%Lx,rsp:%lx)\n", + current->comm, curbase, regs->rsp); + show_stack(NULL,NULL); + warned = jiffies; + } +} +#endif + /* * Generic, controller-independent functions: */ @@ -96,7 +120,9 @@ asmlinkage unsigned int do_IRQ(struct pt_regs *regs) exit_idle(); irq_enter(); - +#ifdef CONFIG_DEBUG_STACKOVERFLOW + stack_overflow_check(regs); +#endif __do_IRQ(irq, regs); irq_exit(); -- cgit v1.2.3
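A closing sketch of the "bytes remaining" report the author of the stack-overflow patch says other arches print, under the same assumptions as that patch (the stack grows down from curbase + THREAD_SIZE toward the thread_info at curbase); this variant is hypothetical and not part of any commit above.

        /* Hypothetical variant: warn with the bytes left above the
         * thread_info rather than raw base/rsp addresses. */
        long left = regs->rsp - (curbase + sizeof(struct thread_info));
        if (left < 128)
                printk("do_IRQ: %s near stack overflow (%ld bytes left)\n",
                        current->comm, left);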