summaryrefslogtreecommitdiff
path: root/arch/x86/mm
diff options
context:
space:
mode:
Diffstat (limited to 'arch/x86/mm')
-rw-r--r--arch/x86/mm/Makefile16
-rw-r--r--arch/x86/mm/Makefile_329
-rw-r--r--arch/x86/mm/Makefile_649
-rw-r--r--arch/x86/mm/discontig_32.c8
-rw-r--r--arch/x86/mm/dump_pagetables.c354
-rw-r--r--arch/x86/mm/fault.c6
-rw-r--r--arch/x86/mm/init_32.c105
-rw-r--r--arch/x86/mm/init_64.c299
-rw-r--r--arch/x86/mm/ioremap.c191
-rw-r--r--arch/x86/mm/k8topology_64.c2
-rw-r--r--arch/x86/mm/numa_64.c67
-rw-r--r--arch/x86/mm/pageattr.c111
-rw-r--r--arch/x86/mm/pat.c596
-rw-r--r--arch/x86/mm/pgtable.c276
-rw-r--r--arch/x86/mm/pgtable_32.c208
-rw-r--r--arch/x86/mm/srat_64.c41
16 files changed, 1866 insertions, 432 deletions
diff --git a/arch/x86/mm/Makefile b/arch/x86/mm/Makefile
index 983291096848..b7b3e4c7cfc9 100644
--- a/arch/x86/mm/Makefile
+++ b/arch/x86/mm/Makefile
@@ -1,5 +1,17 @@
+obj-y := init_$(BITS).o fault.o ioremap.o extable.o pageattr.o mmap.o \
+ pat.o pgtable.o
+
+obj-$(CONFIG_X86_32) += pgtable_32.o
+
+obj-$(CONFIG_HUGETLB_PAGE) += hugetlbpage.o
+obj-$(CONFIG_X86_PTDUMP) += dump_pagetables.o
+
+obj-$(CONFIG_HIGHMEM) += highmem_32.o
+
ifeq ($(CONFIG_X86_32),y)
-include ${srctree}/arch/x86/mm/Makefile_32
+obj-$(CONFIG_NUMA) += discontig_32.o
else
-include ${srctree}/arch/x86/mm/Makefile_64
+obj-$(CONFIG_NUMA) += numa_64.o
+obj-$(CONFIG_K8_NUMA) += k8topology_64.o
+obj-$(CONFIG_ACPI_NUMA) += srat_64.o
endif
diff --git a/arch/x86/mm/Makefile_32 b/arch/x86/mm/Makefile_32
deleted file mode 100644
index c36ae88bb543..000000000000
--- a/arch/x86/mm/Makefile_32
+++ /dev/null
@@ -1,9 +0,0 @@
-#
-# Makefile for the linux i386-specific parts of the memory manager.
-#
-
-obj-y := init_32.o pgtable_32.o fault.o ioremap.o extable.o pageattr.o mmap.o
-
-obj-$(CONFIG_NUMA) += discontig_32.o
-obj-$(CONFIG_HUGETLB_PAGE) += hugetlbpage.o
-obj-$(CONFIG_HIGHMEM) += highmem_32.o
diff --git a/arch/x86/mm/Makefile_64 b/arch/x86/mm/Makefile_64
deleted file mode 100644
index 688c8c28ac8f..000000000000
--- a/arch/x86/mm/Makefile_64
+++ /dev/null
@@ -1,9 +0,0 @@
-#
-# Makefile for the linux x86_64-specific parts of the memory manager.
-#
-
-obj-y := init_64.o fault.o ioremap.o extable.o pageattr.o mmap.o
-obj-$(CONFIG_HUGETLB_PAGE) += hugetlbpage.o
-obj-$(CONFIG_NUMA) += numa_64.o
-obj-$(CONFIG_K8_NUMA) += k8topology_64.o
-obj-$(CONFIG_ACPI_NUMA) += srat_64.o
diff --git a/arch/x86/mm/discontig_32.c b/arch/x86/mm/discontig_32.c
index 8e25e06ff730..18378850e25a 100644
--- a/arch/x86/mm/discontig_32.c
+++ b/arch/x86/mm/discontig_32.c
@@ -37,7 +37,7 @@
#include <asm/e820.h>
#include <asm/setup.h>
#include <asm/mmzone.h>
-#include <bios_ebda.h>
+#include <asm/bios_ebda.h>
struct pglist_data *node_data[MAX_NUMNODES] __read_mostly;
EXPORT_SYMBOL(node_data);
@@ -120,7 +120,7 @@ int __init get_memcfg_numa_flat(void)
printk("NUMA - single node, flat memory mode\n");
/* Run the memory configuration and find the top of memory. */
- find_max_pfn();
+ propagate_e820_map();
node_start_pfn[0] = 0;
node_end_pfn[0] = max_pfn;
memory_present(0, 0, max_pfn);
@@ -134,7 +134,7 @@ int __init get_memcfg_numa_flat(void)
/*
* Find the highest page frame number we have available for the node
*/
-static void __init find_max_pfn_node(int nid)
+static void __init propagate_e820_map_node(int nid)
{
if (node_end_pfn[nid] > max_pfn)
node_end_pfn[nid] = max_pfn;
@@ -379,7 +379,7 @@ unsigned long __init setup_memory(void)
printk("High memory starts at vaddr %08lx\n",
(ulong) pfn_to_kaddr(highstart_pfn));
for_each_online_node(nid)
- find_max_pfn_node(nid);
+ propagate_e820_map_node(nid);
memset(NODE_DATA(0), 0, sizeof(struct pglist_data));
NODE_DATA(0)->bdata = &node0_bdata;
diff --git a/arch/x86/mm/dump_pagetables.c b/arch/x86/mm/dump_pagetables.c
new file mode 100644
index 000000000000..2c24bea92c66
--- /dev/null
+++ b/arch/x86/mm/dump_pagetables.c
@@ -0,0 +1,354 @@
+/*
+ * Debug helper to dump the current kernel pagetables of the system
+ * so that we can see what the various memory ranges are set to.
+ *
+ * (C) Copyright 2008 Intel Corporation
+ *
+ * Author: Arjan van de Ven <arjan@linux.intel.com>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; version 2
+ * of the License.
+ */
+
+#include <linux/debugfs.h>
+#include <linux/mm.h>
+#include <linux/module.h>
+#include <linux/seq_file.h>
+
+#include <asm/pgtable.h>
+
+/*
+ * The dumper groups pagetable entries of the same type into one, and for
+ * that it needs to keep some state when walking, and flush this state
+ * when a "break" in the continuity is found.
+ */
+struct pg_state {
+ int level;
+ pgprot_t current_prot;
+ unsigned long start_address;
+ unsigned long current_address;
+ const struct addr_marker *marker;
+};
+
+struct addr_marker {
+ unsigned long start_address;
+ const char *name;
+};
+
+/* Address space markers hints */
+static struct addr_marker address_markers[] = {
+ { 0, "User Space" },
+#ifdef CONFIG_X86_64
+ { 0x8000000000000000UL, "Kernel Space" },
+ { 0xffff810000000000UL, "Low Kernel Mapping" },
+ { VMALLOC_START, "vmalloc() Area" },
+ { VMEMMAP_START, "Vmemmap" },
+ { __START_KERNEL_map, "High Kernel Mapping" },
+ { MODULES_VADDR, "Modules" },
+ { MODULES_END, "End Modules" },
+#else
+ { PAGE_OFFSET, "Kernel Mapping" },
+ { 0/* VMALLOC_START */, "vmalloc() Area" },
+ { 0/*VMALLOC_END*/, "vmalloc() End" },
+# ifdef CONFIG_HIGHMEM
+ { 0/*PKMAP_BASE*/, "Persisent kmap() Area" },
+# endif
+ { 0/*FIXADDR_START*/, "Fixmap Area" },
+#endif
+ { -1, NULL } /* End of list */
+};
+
+/* Multipliers for offsets within the PTEs */
+#define PTE_LEVEL_MULT (PAGE_SIZE)
+#define PMD_LEVEL_MULT (PTRS_PER_PTE * PTE_LEVEL_MULT)
+#define PUD_LEVEL_MULT (PTRS_PER_PMD * PMD_LEVEL_MULT)
+#define PGD_LEVEL_MULT (PTRS_PER_PUD * PUD_LEVEL_MULT)
+
+/*
+ * Print a readable form of a pgprot_t to the seq_file
+ */
+static void printk_prot(struct seq_file *m, pgprot_t prot, int level)
+{
+ pgprotval_t pr = pgprot_val(prot);
+ static const char * const level_name[] =
+ { "cr3", "pgd", "pud", "pmd", "pte" };
+
+ if (!pgprot_val(prot)) {
+ /* Not present */
+ seq_printf(m, " ");
+ } else {
+ if (pr & _PAGE_USER)
+ seq_printf(m, "USR ");
+ else
+ seq_printf(m, " ");
+ if (pr & _PAGE_RW)
+ seq_printf(m, "RW ");
+ else
+ seq_printf(m, "ro ");
+ if (pr & _PAGE_PWT)
+ seq_printf(m, "PWT ");
+ else
+ seq_printf(m, " ");
+ if (pr & _PAGE_PCD)
+ seq_printf(m, "PCD ");
+ else
+ seq_printf(m, " ");
+
+ /* Bit 9 has a different meaning on level 3 vs 4 */
+ if (level <= 3) {
+ if (pr & _PAGE_PSE)
+ seq_printf(m, "PSE ");
+ else
+ seq_printf(m, " ");
+ } else {
+ if (pr & _PAGE_PAT)
+ seq_printf(m, "pat ");
+ else
+ seq_printf(m, " ");
+ }
+ if (pr & _PAGE_GLOBAL)
+ seq_printf(m, "GLB ");
+ else
+ seq_printf(m, " ");
+ if (pr & _PAGE_NX)
+ seq_printf(m, "NX ");
+ else
+ seq_printf(m, "x ");
+ }
+ seq_printf(m, "%s\n", level_name[level]);
+}
+
+/*
+ * On 64 bits, sign-extend the 48 bit address to 64 bit
+ */
+static unsigned long normalize_addr(unsigned long u)
+{
+#ifdef CONFIG_X86_64
+ return (signed long)(u << 16) >> 16;
+#else
+ return u;
+#endif
+}
+
+/*
+ * This function gets called on a break in a continuous series
+ * of PTE entries; the next one is different so we need to
+ * print what we collected so far.
+ */
+static void note_page(struct seq_file *m, struct pg_state *st,
+ pgprot_t new_prot, int level)
+{
+ pgprotval_t prot, cur;
+ static const char units[] = "KMGTPE";
+
+ /*
+ * If we have a "break" in the series, we need to flush the state that
+ * we have now. "break" is either changing perms, levels or
+ * address space marker.
+ */
+ prot = pgprot_val(new_prot) & ~(PTE_MASK);
+ cur = pgprot_val(st->current_prot) & ~(PTE_MASK);
+
+ if (!st->level) {
+ /* First entry */
+ st->current_prot = new_prot;
+ st->level = level;
+ st->marker = address_markers;
+ seq_printf(m, "---[ %s ]---\n", st->marker->name);
+ } else if (prot != cur || level != st->level ||
+ st->current_address >= st->marker[1].start_address) {
+ const char *unit = units;
+ unsigned long delta;
+
+ /*
+ * Now print the actual finished series
+ */
+ seq_printf(m, "0x%p-0x%p ",
+ (void *)st->start_address,
+ (void *)st->current_address);
+
+ delta = (st->current_address - st->start_address) >> 10;
+ while (!(delta & 1023) && unit[1]) {
+ delta >>= 10;
+ unit++;
+ }
+ seq_printf(m, "%9lu%c ", delta, *unit);
+ printk_prot(m, st->current_prot, st->level);
+
+ /*
+ * We print markers for special areas of address space,
+ * such as the start of vmalloc space etc.
+ * This helps in the interpretation.
+ */
+ if (st->current_address >= st->marker[1].start_address) {
+ st->marker++;
+ seq_printf(m, "---[ %s ]---\n", st->marker->name);
+ }
+
+ st->start_address = st->current_address;
+ st->current_prot = new_prot;
+ st->level = level;
+ }
+}
+
+static void walk_pte_level(struct seq_file *m, struct pg_state *st, pmd_t addr,
+ unsigned long P)
+{
+ int i;
+ pte_t *start;
+
+ start = (pte_t *) pmd_page_vaddr(addr);
+ for (i = 0; i < PTRS_PER_PTE; i++) {
+ pgprot_t prot = pte_pgprot(*start);
+
+ st->current_address = normalize_addr(P + i * PTE_LEVEL_MULT);
+ note_page(m, st, prot, 4);
+ start++;
+ }
+}
+
+#if PTRS_PER_PMD > 1
+
+static void walk_pmd_level(struct seq_file *m, struct pg_state *st, pud_t addr,
+ unsigned long P)
+{
+ int i;
+ pmd_t *start;
+
+ start = (pmd_t *) pud_page_vaddr(addr);
+ for (i = 0; i < PTRS_PER_PMD; i++) {
+ st->current_address = normalize_addr(P + i * PMD_LEVEL_MULT);
+ if (!pmd_none(*start)) {
+ pgprotval_t prot = pmd_val(*start) & ~PTE_MASK;
+
+ if (pmd_large(*start) || !pmd_present(*start))
+ note_page(m, st, __pgprot(prot), 3);
+ else
+ walk_pte_level(m, st, *start,
+ P + i * PMD_LEVEL_MULT);
+ } else
+ note_page(m, st, __pgprot(0), 3);
+ start++;
+ }
+}
+
+#else
+#define walk_pmd_level(m,s,a,p) walk_pte_level(m,s,__pmd(pud_val(a)),p)
+#define pud_large(a) pmd_large(__pmd(pud_val(a)))
+#define pud_none(a) pmd_none(__pmd(pud_val(a)))
+#endif
+
+#if PTRS_PER_PUD > 1
+
+static void walk_pud_level(struct seq_file *m, struct pg_state *st, pgd_t addr,
+ unsigned long P)
+{
+ int i;
+ pud_t *start;
+
+ start = (pud_t *) pgd_page_vaddr(addr);
+
+ for (i = 0; i < PTRS_PER_PUD; i++) {
+ st->current_address = normalize_addr(P + i * PUD_LEVEL_MULT);
+ if (!pud_none(*start)) {
+ pgprotval_t prot = pud_val(*start) & ~PTE_MASK;
+
+ if (pud_large(*start) || !pud_present(*start))
+ note_page(m, st, __pgprot(prot), 2);
+ else
+ walk_pmd_level(m, st, *start,
+ P + i * PUD_LEVEL_MULT);
+ } else
+ note_page(m, st, __pgprot(0), 2);
+
+ start++;
+ }
+}
+
+#else
+#define walk_pud_level(m,s,a,p) walk_pmd_level(m,s,__pud(pgd_val(a)),p)
+#define pgd_large(a) pud_large(__pud(pgd_val(a)))
+#define pgd_none(a) pud_none(__pud(pgd_val(a)))
+#endif
+
+static void walk_pgd_level(struct seq_file *m)
+{
+#ifdef CONFIG_X86_64
+ pgd_t *start = (pgd_t *) &init_level4_pgt;
+#else
+ pgd_t *start = swapper_pg_dir;
+#endif
+ int i;
+ struct pg_state st;
+
+ memset(&st, 0, sizeof(st));
+
+ for (i = 0; i < PTRS_PER_PGD; i++) {
+ st.current_address = normalize_addr(i * PGD_LEVEL_MULT);
+ if (!pgd_none(*start)) {
+ pgprotval_t prot = pgd_val(*start) & ~PTE_MASK;
+
+ if (pgd_large(*start) || !pgd_present(*start))
+ note_page(m, &st, __pgprot(prot), 1);
+ else
+ walk_pud_level(m, &st, *start,
+ i * PGD_LEVEL_MULT);
+ } else
+ note_page(m, &st, __pgprot(0), 1);
+
+ start++;
+ }
+
+ /* Flush out the last page */
+ st.current_address = normalize_addr(PTRS_PER_PGD*PGD_LEVEL_MULT);
+ note_page(m, &st, __pgprot(0), 0);
+}
+
+static int ptdump_show(struct seq_file *m, void *v)
+{
+ walk_pgd_level(m);
+ return 0;
+}
+
+static int ptdump_open(struct inode *inode, struct file *filp)
+{
+ return single_open(filp, ptdump_show, NULL);
+}
+
+static const struct file_operations ptdump_fops = {
+ .open = ptdump_open,
+ .read = seq_read,
+ .llseek = seq_lseek,
+ .release = single_release,
+};
+
+static int pt_dump_init(void)
+{
+ struct dentry *pe;
+
+#ifdef CONFIG_X86_32
+ /* Not a compile-time constant on x86-32 */
+ address_markers[2].start_address = VMALLOC_START;
+ address_markers[3].start_address = VMALLOC_END;
+# ifdef CONFIG_HIGHMEM
+ address_markers[4].start_address = PKMAP_BASE;
+ address_markers[5].start_address = FIXADDR_START;
+# else
+ address_markers[4].start_address = FIXADDR_START;
+# endif
+#endif
+
+ pe = debugfs_create_file("kernel_page_tables", 0600, NULL, NULL,
+ &ptdump_fops);
+ if (!pe)
+ return -ENOMEM;
+
+ return 0;
+}
+
+__initcall(pt_dump_init);
+MODULE_LICENSE("GPL");
+MODULE_AUTHOR("Arjan van de Ven <arjan@linux.intel.com>");
+MODULE_DESCRIPTION("Kernel debugging helper that dumps pagetables");
diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c
index ec08d8389850..fd7e1798c75a 100644
--- a/arch/x86/mm/fault.c
+++ b/arch/x86/mm/fault.c
@@ -639,7 +639,7 @@ void __kprobes do_page_fault(struct pt_regs *regs, unsigned long error_code)
#ifdef CONFIG_X86_32
/* It's safe to allow irq's after cr2 has been saved and the vmalloc
fault has been handled. */
- if (regs->flags & (X86_EFLAGS_IF|VM_MASK))
+ if (regs->flags & (X86_EFLAGS_IF | X86_VM_MASK))
local_irq_enable();
/*
@@ -976,9 +976,5 @@ void vmalloc_sync_all(void)
if (address == start)
start = address + PGDIR_SIZE;
}
- /* Check that there is no need to do the same for the modules area. */
- BUILD_BUG_ON(!(MODULES_VADDR > __START_KERNEL));
- BUILD_BUG_ON(!(((MODULES_END - 1) & PGDIR_MASK) ==
- (__START_KERNEL & PGDIR_MASK)));
#endif
}
diff --git a/arch/x86/mm/init_32.c b/arch/x86/mm/init_32.c
index ee1091a46964..de236e419cb5 100644
--- a/arch/x86/mm/init_32.c
+++ b/arch/x86/mm/init_32.c
@@ -1,5 +1,4 @@
/*
- * linux/arch/i386/mm/init.c
*
* Copyright (C) 1995 Linus Torvalds
*
@@ -51,6 +50,8 @@
unsigned int __VMALLOC_RESERVE = 128 << 20;
+unsigned long max_pfn_mapped;
+
DEFINE_PER_CPU(struct mmu_gather, mmu_gathers);
unsigned long highstart_pfn, highend_pfn;
@@ -70,7 +71,7 @@ static pmd_t * __init one_md_table_init(pgd_t *pgd)
if (!(pgd_val(*pgd) & _PAGE_PRESENT)) {
pmd_table = (pmd_t *) alloc_bootmem_low_pages(PAGE_SIZE);
- paravirt_alloc_pd(&init_mm, __pa(pmd_table) >> PAGE_SHIFT);
+ paravirt_alloc_pmd(&init_mm, __pa(pmd_table) >> PAGE_SHIFT);
set_pgd(pgd, __pgd(__pa(pmd_table) | _PAGE_PRESENT));
pud = pud_offset(pgd, 0);
BUG_ON(pmd_table != pmd_offset(pud, 0));
@@ -99,7 +100,7 @@ static pte_t * __init one_page_table_init(pmd_t *pmd)
(pte_t *)alloc_bootmem_low_pages(PAGE_SIZE);
}
- paravirt_alloc_pt(&init_mm, __pa(page_table) >> PAGE_SHIFT);
+ paravirt_alloc_pte(&init_mm, __pa(page_table) >> PAGE_SHIFT);
set_pmd(pmd, __pmd(__pa(page_table) | _PAGE_TABLE));
BUG_ON(page_table != pte_offset_kernel(pmd, 0));
}
@@ -179,8 +180,13 @@ static void __init kernel_physical_mapping_init(pgd_t *pgd_base)
/*
* Map with big pages if possible, otherwise
* create normal page tables:
+ *
+ * Don't use a large page for the first 2/4MB of memory
+ * because there are often fixed size MTRRs in there
+ * and overlapping MTRRs into large pages can cause
+ * slowdowns.
*/
- if (cpu_has_pse) {
+ if (cpu_has_pse && !(pgd_idx == 0 && pmd_idx == 0)) {
unsigned int addr2;
pgprot_t prot = PAGE_KERNEL_LARGE;
@@ -194,6 +200,7 @@ static void __init kernel_physical_mapping_init(pgd_t *pgd_base)
set_pmd(pmd, pfn_pmd(pfn, prot));
pfn += PTRS_PER_PTE;
+ max_pfn_mapped = pfn;
continue;
}
pte = one_page_table_init(pmd);
@@ -208,6 +215,7 @@ static void __init kernel_physical_mapping_init(pgd_t *pgd_base)
set_pte(pte, pfn_pte(pfn, prot));
}
+ max_pfn_mapped = pfn;
}
}
}
@@ -219,6 +227,25 @@ static inline int page_kills_ppro(unsigned long pagenr)
return 0;
}
+/*
+ * devmem_is_allowed() checks to see if /dev/mem access to a certain address
+ * is valid. The argument is a physical page number.
+ *
+ *
+ * On x86, access has to be given to the first megabyte of ram because that area
+ * contains bios code and data regions used by X and dosemu and similar apps.
+ * Access has to be given to non-kernel-ram areas as well, these contain the PCI
+ * mmio resources as well as potential bios/acpi data regions.
+ */
+int devmem_is_allowed(unsigned long pagenr)
+{
+ if (pagenr <= 256)
+ return 1;
+ if (!page_is_ram(pagenr))
+ return 1;
+ return 0;
+}
+
#ifdef CONFIG_HIGHMEM
pte_t *kmap_pte;
pgprot_t kmap_prot;
@@ -260,47 +287,17 @@ static void __init permanent_kmaps_init(pgd_t *pgd_base)
pkmap_page_table = pte;
}
-static void __meminit free_new_highpage(struct page *page)
-{
- init_page_count(page);
- __free_page(page);
- totalhigh_pages++;
-}
-
void __init add_one_highpage_init(struct page *page, int pfn, int bad_ppro)
{
if (page_is_ram(pfn) && !(bad_ppro && page_kills_ppro(pfn))) {
ClearPageReserved(page);
- free_new_highpage(page);
+ init_page_count(page);
+ __free_page(page);
+ totalhigh_pages++;
} else
SetPageReserved(page);
}
-static int __meminit
-add_one_highpage_hotplug(struct page *page, unsigned long pfn)
-{
- free_new_highpage(page);
- totalram_pages++;
-#ifdef CONFIG_FLATMEM
- max_mapnr = max(pfn, max_mapnr);
-#endif
- num_physpages++;
-
- return 0;
-}
-
-/*
- * Not currently handling the NUMA case.
- * Assuming single node and all memory that
- * has been added dynamically that would be
- * onlined here is in HIGHMEM.
- */
-void __meminit online_page(struct page *page)
-{
- ClearPageReserved(page);
- add_one_highpage_hotplug(page, page_to_pfn(page));
-}
-
#ifndef CONFIG_NUMA
static void __init set_highmem_pages_init(int bad_ppro)
{
@@ -357,7 +354,7 @@ void __init native_pagetable_setup_start(pgd_t *base)
pte_clear(NULL, va, pte);
}
- paravirt_alloc_pd(&init_mm, __pa(base) >> PAGE_SHIFT);
+ paravirt_alloc_pmd(&init_mm, __pa(base) >> PAGE_SHIFT);
}
void __init native_pagetable_setup_done(pgd_t *base)
@@ -449,7 +446,7 @@ void zap_low_mappings(void)
* Note that "pgd_clear()" doesn't do it for
* us, because pgd_clear() is a no-op on i386.
*/
- for (i = 0; i < USER_PTRS_PER_PGD; i++) {
+ for (i = 0; i < KERNEL_PGD_BOUNDARY; i++) {
#ifdef CONFIG_X86_PAE
set_pgd(swapper_pg_dir+i, __pgd(1 + __pa(empty_zero_page)));
#else
@@ -539,9 +536,9 @@ void __init paging_init(void)
/*
* Test if the WP bit works in supervisor mode. It isn't supported on 386's
- * and also on some strange 486's (NexGen etc.). All 586+'s are OK. This
- * used to involve black magic jumps to work around some nasty CPU bugs,
- * but fortunately the switch to using exceptions got rid of all that.
+ * and also on some strange 486's. All 586+'s are OK. This used to involve
+ * black magic jumps to work around some nasty CPU bugs, but fortunately the
+ * switch to using exceptions got rid of all that.
*/
static void __init test_wp_bit(void)
{
@@ -723,25 +720,17 @@ void mark_rodata_ro(void)
unsigned long start = PFN_ALIGN(_text);
unsigned long size = PFN_ALIGN(_etext) - start;
-#ifndef CONFIG_KPROBES
-#ifdef CONFIG_HOTPLUG_CPU
- /* It must still be possible to apply SMP alternatives. */
- if (num_possible_cpus() <= 1)
-#endif
- {
- set_pages_ro(virt_to_page(start), size >> PAGE_SHIFT);
- printk(KERN_INFO "Write protecting the kernel text: %luk\n",
- size >> 10);
+ set_pages_ro(virt_to_page(start), size >> PAGE_SHIFT);
+ printk(KERN_INFO "Write protecting the kernel text: %luk\n",
+ size >> 10);
#ifdef CONFIG_CPA_DEBUG
- printk(KERN_INFO "Testing CPA: Reverting %lx-%lx\n",
- start, start+size);
- set_pages_rw(virt_to_page(start), size>>PAGE_SHIFT);
+ printk(KERN_INFO "Testing CPA: Reverting %lx-%lx\n",
+ start, start+size);
+ set_pages_rw(virt_to_page(start), size>>PAGE_SHIFT);
- printk(KERN_INFO "Testing CPA: write protecting again\n");
- set_pages_ro(virt_to_page(start), size>>PAGE_SHIFT);
-#endif
- }
+ printk(KERN_INFO "Testing CPA: write protecting again\n");
+ set_pages_ro(virt_to_page(start), size>>PAGE_SHIFT);
#endif
start += size;
size = (unsigned long)__end_rodata - start;
diff --git a/arch/x86/mm/init_64.c b/arch/x86/mm/init_64.c
index a02a14f0f324..32ba13b0f818 100644
--- a/arch/x86/mm/init_64.c
+++ b/arch/x86/mm/init_64.c
@@ -47,13 +47,30 @@
#include <asm/numa.h>
#include <asm/cacheflush.h>
-const struct dma_mapping_ops *dma_ops;
-EXPORT_SYMBOL(dma_ops);
-
static unsigned long dma_reserve __initdata;
DEFINE_PER_CPU(struct mmu_gather, mmu_gathers);
+int direct_gbpages __meminitdata
+#ifdef CONFIG_DIRECT_GBPAGES
+ = 1
+#endif
+;
+
+static int __init parse_direct_gbpages_off(char *arg)
+{
+ direct_gbpages = 0;
+ return 0;
+}
+early_param("nogbpages", parse_direct_gbpages_off);
+
+static int __init parse_direct_gbpages_on(char *arg)
+{
+ direct_gbpages = 1;
+ return 0;
+}
+early_param("gbpages", parse_direct_gbpages_on);
+
/*
* NOTE: pagetable_init alloc all the fixmap pagetables contiguous on the
* physical space so we can cache the place of the first one and move
@@ -69,9 +86,6 @@ void show_mem(void)
printk(KERN_INFO "Mem-info:\n");
show_free_areas();
- printk(KERN_INFO "Free swap: %6ldkB\n",
- nr_swap_pages << (PAGE_SHIFT-10));
-
for_each_online_pgdat(pgdat) {
for (i = 0; i < pgdat->node_spanned_pages; ++i) {
/*
@@ -121,7 +135,7 @@ static __init void *spp_getpage(void)
return ptr;
}
-static __init void
+static void
set_pte_phys(unsigned long vaddr, unsigned long phys, pgprot_t prot)
{
pgd_t *pgd;
@@ -159,7 +173,7 @@ set_pte_phys(unsigned long vaddr, unsigned long phys, pgprot_t prot)
new_pte = pfn_pte(phys >> PAGE_SHIFT, prot);
pte = pte_offset_kernel(pmd, vaddr);
- if (!pte_none(*pte) &&
+ if (!pte_none(*pte) && pte_val(new_pte) &&
pte_val(*pte) != (pte_val(new_pte) & __supported_pte_mask))
pte_ERROR(*pte);
set_pte(pte, new_pte);
@@ -200,8 +214,7 @@ void __init cleanup_highmap(void)
}
/* NOTE: this is meant to be run only at boot */
-void __init
-__set_fixmap(enum fixed_addresses idx, unsigned long phys, pgprot_t prot)
+void __set_fixmap(enum fixed_addresses idx, unsigned long phys, pgprot_t prot)
{
unsigned long address = __fix_to_virt(idx);
@@ -296,7 +309,7 @@ __meminit void early_iounmap(void *addr, unsigned long size)
__flush_tlb_all();
}
-static void __meminit
+static unsigned long __meminit
phys_pmd_init(pmd_t *pmd_page, unsigned long address, unsigned long end)
{
int i = pmd_index(address);
@@ -318,21 +331,26 @@ phys_pmd_init(pmd_t *pmd_page, unsigned long address, unsigned long end)
set_pte((pte_t *)pmd,
pfn_pte(address >> PAGE_SHIFT, PAGE_KERNEL_LARGE));
}
+ return address;
}
-static void __meminit
+static unsigned long __meminit
phys_pmd_update(pud_t *pud, unsigned long address, unsigned long end)
{
pmd_t *pmd = pmd_offset(pud, 0);
+ unsigned long last_map_addr;
+
spin_lock(&init_mm.page_table_lock);
- phys_pmd_init(pmd, address, end);
+ last_map_addr = phys_pmd_init(pmd, address, end);
spin_unlock(&init_mm.page_table_lock);
__flush_tlb_all();
+ return last_map_addr;
}
-static void __meminit
+static unsigned long __meminit
phys_pud_init(pud_t *pud_page, unsigned long addr, unsigned long end)
{
+ unsigned long last_map_addr = end;
int i = pud_index(addr);
for (; i < PTRS_PER_PUD; i++, addr = (addr & PUD_MASK) + PUD_SIZE) {
@@ -350,7 +368,15 @@ phys_pud_init(pud_t *pud_page, unsigned long addr, unsigned long end)
}
if (pud_val(*pud)) {
- phys_pmd_update(pud, addr, end);
+ if (!pud_large(*pud))
+ last_map_addr = phys_pmd_update(pud, addr, end);
+ continue;
+ }
+
+ if (direct_gbpages) {
+ set_pte((pte_t *)pud,
+ pfn_pte(addr >> PAGE_SHIFT, PAGE_KERNEL_LARGE));
+ last_map_addr = (addr & PUD_MASK) + PUD_SIZE;
continue;
}
@@ -358,12 +384,14 @@ phys_pud_init(pud_t *pud_page, unsigned long addr, unsigned long end)
spin_lock(&init_mm.page_table_lock);
set_pud(pud, __pud(pmd_phys | _KERNPG_TABLE));
- phys_pmd_init(pmd, addr, end);
+ last_map_addr = phys_pmd_init(pmd, addr, end);
spin_unlock(&init_mm.page_table_lock);
unmap_low_page(pmd);
}
__flush_tlb_all();
+
+ return last_map_addr >> PAGE_SHIFT;
}
static void __init find_early_table_space(unsigned long end)
@@ -371,9 +399,11 @@ static void __init find_early_table_space(unsigned long end)
unsigned long puds, pmds, tables, start;
puds = (end + PUD_SIZE - 1) >> PUD_SHIFT;
- pmds = (end + PMD_SIZE - 1) >> PMD_SHIFT;
- tables = round_up(puds * sizeof(pud_t), PAGE_SIZE) +
- round_up(pmds * sizeof(pmd_t), PAGE_SIZE);
+ tables = round_up(puds * sizeof(pud_t), PAGE_SIZE);
+ if (!direct_gbpages) {
+ pmds = (end + PMD_SIZE - 1) >> PMD_SHIFT;
+ tables += round_up(pmds * sizeof(pmd_t), PAGE_SIZE);
+ }
/*
* RED-PEN putting page tables only on node 0 could
@@ -393,16 +423,135 @@ static void __init find_early_table_space(unsigned long end)
(table_start << PAGE_SHIFT) + tables);
}
+static void __init init_gbpages(void)
+{
+ if (direct_gbpages && cpu_has_gbpages)
+ printk(KERN_INFO "Using GB pages for direct mapping\n");
+ else
+ direct_gbpages = 0;
+}
+
+#ifdef CONFIG_MEMTEST_BOOTPARAM
+
+static void __init memtest(unsigned long start_phys, unsigned long size,
+ unsigned pattern)
+{
+ unsigned long i;
+ unsigned long *start;
+ unsigned long start_bad;
+ unsigned long last_bad;
+ unsigned long val;
+ unsigned long start_phys_aligned;
+ unsigned long count;
+ unsigned long incr;
+
+ switch (pattern) {
+ case 0:
+ val = 0UL;
+ break;
+ case 1:
+ val = -1UL;
+ break;
+ case 2:
+ val = 0x5555555555555555UL;
+ break;
+ case 3:
+ val = 0xaaaaaaaaaaaaaaaaUL;
+ break;
+ default:
+ return;
+ }
+
+ incr = sizeof(unsigned long);
+ start_phys_aligned = ALIGN(start_phys, incr);
+ count = (size - (start_phys_aligned - start_phys))/incr;
+ start = __va(start_phys_aligned);
+ start_bad = 0;
+ last_bad = 0;
+
+ for (i = 0; i < count; i++)
+ start[i] = val;
+ for (i = 0; i < count; i++, start++, start_phys_aligned += incr) {
+ if (*start != val) {
+ if (start_phys_aligned == last_bad + incr) {
+ last_bad += incr;
+ } else {
+ if (start_bad) {
+ printk(KERN_CONT "\n %016lx bad mem addr %016lx - %016lx reserved",
+ val, start_bad, last_bad + incr);
+ reserve_early(start_bad, last_bad - start_bad, "BAD RAM");
+ }
+ start_bad = last_bad = start_phys_aligned;
+ }
+ }
+ }
+ if (start_bad) {
+ printk(KERN_CONT "\n %016lx bad mem addr %016lx - %016lx reserved",
+ val, start_bad, last_bad + incr);
+ reserve_early(start_bad, last_bad - start_bad, "BAD RAM");
+ }
+
+}
+
+static int memtest_pattern __initdata = CONFIG_MEMTEST_BOOTPARAM_VALUE;
+
+static int __init parse_memtest(char *arg)
+{
+ if (arg)
+ memtest_pattern = simple_strtoul(arg, NULL, 0);
+ return 0;
+}
+
+early_param("memtest", parse_memtest);
+
+static void __init early_memtest(unsigned long start, unsigned long end)
+{
+ unsigned long t_start, t_size;
+ unsigned pattern;
+
+ if (!memtest_pattern)
+ return;
+
+ printk(KERN_INFO "early_memtest: pattern num %d", memtest_pattern);
+ for (pattern = 0; pattern < memtest_pattern; pattern++) {
+ t_start = start;
+ t_size = 0;
+ while (t_start < end) {
+ t_start = find_e820_area_size(t_start, &t_size, 1);
+
+ /* done ? */
+ if (t_start >= end)
+ break;
+ if (t_start + t_size > end)
+ t_size = end - t_start;
+
+ printk(KERN_CONT "\n %016lx - %016lx pattern %d",
+ t_start, t_start + t_size, pattern);
+
+ memtest(t_start, t_size, pattern);
+
+ t_start += t_size;
+ }
+ }
+ printk(KERN_CONT "\n");
+}
+#else
+static void __init early_memtest(unsigned long start, unsigned long end)
+{
+}
+#endif
+
/*
* Setup the direct mapping of the physical memory at PAGE_OFFSET.
* This runs before bootmem is initialized and gets pages directly from
* the physical memory. To access them they are temporarily mapped.
*/
-void __init_refok init_memory_mapping(unsigned long start, unsigned long end)
+unsigned long __init_refok init_memory_mapping(unsigned long start, unsigned long end)
{
- unsigned long next;
+ unsigned long next, last_map_addr = end;
+ unsigned long start_phys = start, end_phys = end;
- pr_debug("init_memory_mapping\n");
+ printk(KERN_INFO "init_memory_mapping\n");
/*
* Find space for the kernel direct mapping tables.
@@ -411,8 +560,10 @@ void __init_refok init_memory_mapping(unsigned long start, unsigned long end)
* memory mapped. Unfortunately this is done currently before the
* nodes are discovered.
*/
- if (!after_bootmem)
+ if (!after_bootmem) {
+ init_gbpages();
find_early_table_space(end);
+ }
start = (unsigned long)__va(start);
end = (unsigned long)__va(end);
@@ -430,7 +581,7 @@ void __init_refok init_memory_mapping(unsigned long start, unsigned long end)
next = start + PGDIR_SIZE;
if (next > end)
next = end;
- phys_pud_init(pud, __pa(start), __pa(next));
+ last_map_addr = phys_pud_init(pud, __pa(start), __pa(next));
if (!after_bootmem)
set_pgd(pgd_offset_k(start), mk_kernel_pgd(pud_phys));
unmap_low_page(pud);
@@ -443,6 +594,11 @@ void __init_refok init_memory_mapping(unsigned long start, unsigned long end)
if (!after_bootmem)
reserve_early(table_start << PAGE_SHIFT,
table_end << PAGE_SHIFT, "PGTABLE");
+
+ if (!after_bootmem)
+ early_memtest(start_phys, end_phys);
+
+ return last_map_addr;
}
#ifndef CONFIG_NUMA
@@ -464,15 +620,6 @@ void __init paging_init(void)
/*
* Memory hotplug specific functions
*/
-void online_page(struct page *page)
-{
- ClearPageReserved(page);
- init_page_count(page);
- __free_page(page);
- totalram_pages++;
- num_physpages++;
-}
-
#ifdef CONFIG_MEMORY_HOTPLUG
/*
* Memory is added always to NORMAL zone. This means you will never get
@@ -482,11 +629,13 @@ int arch_add_memory(int nid, u64 start, u64 size)
{
struct pglist_data *pgdat = NODE_DATA(nid);
struct zone *zone = pgdat->node_zones + ZONE_NORMAL;
- unsigned long start_pfn = start >> PAGE_SHIFT;
+ unsigned long last_mapped_pfn, start_pfn = start >> PAGE_SHIFT;
unsigned long nr_pages = size >> PAGE_SHIFT;
int ret;
- init_memory_mapping(start, start + size-1);
+ last_mapped_pfn = init_memory_mapping(start, start + size-1);
+ if (last_mapped_pfn > max_pfn_mapped)
+ max_pfn_mapped = last_mapped_pfn;
ret = __add_pages(zone, start_pfn, nr_pages);
WARN_ON(1);
@@ -505,6 +654,26 @@ EXPORT_SYMBOL_GPL(memory_add_physaddr_to_nid);
#endif /* CONFIG_MEMORY_HOTPLUG */
+/*
+ * devmem_is_allowed() checks to see if /dev/mem access to a certain address
+ * is valid. The argument is a physical page number.
+ *
+ *
+ * On x86, access has to be given to the first megabyte of ram because that area
+ * contains bios code and data regions used by X and dosemu and similar apps.
+ * Access has to be given to non-kernel-ram areas as well, these contain the PCI
+ * mmio resources as well as potential bios/acpi data regions.
+ */
+int devmem_is_allowed(unsigned long pagenr)
+{
+ if (pagenr <= 256)
+ return 1;
+ if (!page_is_ram(pagenr))
+ return 1;
+ return 0;
+}
+
+
static struct kcore_list kcore_mem, kcore_vmalloc, kcore_kernel,
kcore_modules, kcore_vsyscall;
@@ -596,24 +765,7 @@ EXPORT_SYMBOL_GPL(rodata_test_data);
void mark_rodata_ro(void)
{
- unsigned long start = (unsigned long)_stext, end;
-
-#ifdef CONFIG_HOTPLUG_CPU
- /* It must still be possible to apply SMP alternatives. */
- if (num_possible_cpus() > 1)
- start = (unsigned long)_etext;
-#endif
-
-#ifdef CONFIG_KPROBES
- start = (unsigned long)__start_rodata;
-#endif
-
- end = (unsigned long)__end_rodata;
- start = (start + PAGE_SIZE - 1) & PAGE_MASK;
- end &= PAGE_MASK;
- if (end <= start)
- return;
-
+ unsigned long start = PFN_ALIGN(_stext), end = PFN_ALIGN(__end_rodata);
printk(KERN_INFO "Write protecting the kernel read-only data: %luk\n",
(end - start) >> 10);
@@ -636,6 +788,7 @@ void mark_rodata_ro(void)
set_memory_ro(start, (end-start) >> PAGE_SHIFT);
#endif
}
+
#endif
#ifdef CONFIG_BLK_DEV_INITRD
@@ -648,7 +801,7 @@ void free_initrd_mem(unsigned long start, unsigned long end)
void __init reserve_bootmem_generic(unsigned long phys, unsigned len)
{
#ifdef CONFIG_NUMA
- int nid = phys_to_nid(phys);
+ int nid, next_nid;
#endif
unsigned long pfn = phys >> PAGE_SHIFT;
@@ -657,7 +810,7 @@ void __init reserve_bootmem_generic(unsigned long phys, unsigned len)
* This can happen with kdump kernels when accessing
* firmware tables:
*/
- if (pfn < end_pfn_map)
+ if (pfn < max_pfn_mapped)
return;
printk(KERN_ERR "reserve_bootmem: illegal reserve %lx %u\n",
@@ -667,10 +820,16 @@ void __init reserve_bootmem_generic(unsigned long phys, unsigned len)
/* Should check here against the e820 map to avoid double free */
#ifdef CONFIG_NUMA
- reserve_bootmem_node(NODE_DATA(nid), phys, len, BOOTMEM_DEFAULT);
+ nid = phys_to_nid(phys);
+ next_nid = phys_to_nid(phys + len - 1);
+ if (nid == next_nid)
+ reserve_bootmem_node(NODE_DATA(nid), phys, len, BOOTMEM_DEFAULT);
+ else
+ reserve_bootmem(phys, len, BOOTMEM_DEFAULT);
#else
reserve_bootmem(phys, len, BOOTMEM_DEFAULT);
#endif
+
if (phys+len <= MAX_DMA_PFN*PAGE_SIZE) {
dma_reserve += len / PAGE_SIZE;
set_dma_reserve(dma_reserve);
@@ -764,6 +923,10 @@ const char *arch_vma_name(struct vm_area_struct *vma)
/*
* Initialise the sparsemem vmemmap using huge-pages at the PMD level.
*/
+static long __meminitdata addr_start, addr_end;
+static void __meminitdata *p_start, *p_end;
+static int __meminitdata node_start;
+
int __meminit
vmemmap_populate(struct page *start_page, unsigned long size, int node)
{
@@ -798,12 +961,32 @@ vmemmap_populate(struct page *start_page, unsigned long size, int node)
PAGE_KERNEL_LARGE);
set_pmd(pmd, __pmd(pte_val(entry)));
- printk(KERN_DEBUG " [%lx-%lx] PMD ->%p on node %d\n",
- addr, addr + PMD_SIZE - 1, p, node);
+ /* check to see if we have contiguous blocks */
+ if (p_end != p || node_start != node) {
+ if (p_start)
+ printk(KERN_DEBUG " [%lx-%lx] PMD -> [%p-%p] on node %d\n",
+ addr_start, addr_end-1, p_start, p_end-1, node_start);
+ addr_start = addr;
+ node_start = node;
+ p_start = p;
+ }
+ addr_end = addr + PMD_SIZE;
+ p_end = p + PMD_SIZE;
} else {
vmemmap_verify((pte_t *)pmd, node, addr, next);
}
}
return 0;
}
+
+void __meminit vmemmap_populate_print_last(void)
+{
+ if (p_start) {
+ printk(KERN_DEBUG " [%lx-%lx] PMD -> [%p-%p] on node %d\n",
+ addr_start, addr_end-1, p_start, p_end-1, node_start);
+ p_start = NULL;
+ p_end = NULL;
+ node_start = 0;
+ }
+}
#endif
diff --git a/arch/x86/mm/ioremap.c b/arch/x86/mm/ioremap.c
index 794895c6dcc9..804de18abcc2 100644
--- a/arch/x86/mm/ioremap.c
+++ b/arch/x86/mm/ioremap.c
@@ -19,11 +19,7 @@
#include <asm/pgtable.h>
#include <asm/tlbflush.h>
#include <asm/pgalloc.h>
-
-enum ioremap_mode {
- IOR_MODE_UNCACHED,
- IOR_MODE_CACHED,
-};
+#include <asm/pat.h>
#ifdef CONFIG_X86_64
@@ -35,11 +31,23 @@ unsigned long __phys_addr(unsigned long x)
}
EXPORT_SYMBOL(__phys_addr);
+static inline int phys_addr_valid(unsigned long addr)
+{
+ return addr < (1UL << boot_cpu_data.x86_phys_bits);
+}
+
+#else
+
+static inline int phys_addr_valid(unsigned long addr)
+{
+ return 1;
+}
+
#endif
int page_is_ram(unsigned long pagenr)
{
- unsigned long addr, end;
+ resource_size_t addr, end;
int i;
/*
@@ -78,19 +86,22 @@ int page_is_ram(unsigned long pagenr)
* Fix up the linear direct mapping of the kernel to avoid cache attribute
* conflicts.
*/
-static int ioremap_change_attr(unsigned long vaddr, unsigned long size,
- enum ioremap_mode mode)
+int ioremap_change_attr(unsigned long vaddr, unsigned long size,
+ unsigned long prot_val)
{
unsigned long nrpages = size >> PAGE_SHIFT;
int err;
- switch (mode) {
- case IOR_MODE_UNCACHED:
+ switch (prot_val) {
+ case _PAGE_CACHE_UC:
default:
- err = set_memory_uc(vaddr, nrpages);
+ err = _set_memory_uc(vaddr, nrpages);
+ break;
+ case _PAGE_CACHE_WC:
+ err = _set_memory_wc(vaddr, nrpages);
break;
- case IOR_MODE_CACHED:
- err = set_memory_wb(vaddr, nrpages);
+ case _PAGE_CACHE_WB:
+ err = _set_memory_wb(vaddr, nrpages);
break;
}
@@ -106,18 +117,28 @@ static int ioremap_change_attr(unsigned long vaddr, unsigned long size,
* have to convert them into an offset in a page-aligned mapping, but the
* caller shouldn't need to know that small detail.
*/
-static void __iomem *__ioremap(resource_size_t phys_addr, unsigned long size,
- enum ioremap_mode mode)
+static void __iomem *__ioremap_caller(resource_size_t phys_addr,
+ unsigned long size, unsigned long prot_val, void *caller)
{
- unsigned long pfn, offset, last_addr, vaddr;
+ unsigned long pfn, offset, vaddr;
+ resource_size_t last_addr;
struct vm_struct *area;
+ unsigned long new_prot_val;
pgprot_t prot;
+ int retval;
/* Don't allow wraparound or zero size */
last_addr = phys_addr + size - 1;
if (!size || last_addr < phys_addr)
return NULL;
+ if (!phys_addr_valid(phys_addr)) {
+ printk(KERN_WARNING "ioremap: invalid physical address %llx\n",
+ (unsigned long long)phys_addr);
+ WARN_ON_ONCE(1);
+ return NULL;
+ }
+
/*
* Don't remap the low PCI/ISA area, it's always mapped..
*/
@@ -127,25 +148,14 @@ static void __iomem *__ioremap(resource_size_t phys_addr, unsigned long size,
/*
* Don't allow anybody to remap normal RAM that we're using..
*/
- for (pfn = phys_addr >> PAGE_SHIFT; pfn < max_pfn_mapped &&
- (pfn << PAGE_SHIFT) < last_addr; pfn++) {
- if (page_is_ram(pfn) && pfn_valid(pfn) &&
- !PageReserved(pfn_to_page(pfn)))
- return NULL;
- }
+ for (pfn = phys_addr >> PAGE_SHIFT;
+ (pfn << PAGE_SHIFT) < last_addr; pfn++) {
- switch (mode) {
- case IOR_MODE_UNCACHED:
- default:
- /*
- * FIXME: we will use UC MINUS for now, as video fb drivers
- * depend on it. Upcoming ioremap_wc() will fix this behavior.
- */
- prot = PAGE_KERNEL_UC_MINUS;
- break;
- case IOR_MODE_CACHED:
- prot = PAGE_KERNEL;
- break;
+ int is_ram = page_is_ram(pfn);
+
+ if (is_ram && pfn_valid(pfn) && !PageReserved(pfn_to_page(pfn)))
+ return NULL;
+ WARN_ON_ONCE(is_ram);
}
/*
@@ -155,20 +165,66 @@ static void __iomem *__ioremap(resource_size_t phys_addr, unsigned long size,
phys_addr &= PAGE_MASK;
size = PAGE_ALIGN(last_addr+1) - phys_addr;
+ retval = reserve_memtype(phys_addr, phys_addr + size,
+ prot_val, &new_prot_val);
+ if (retval) {
+ pr_debug("Warning: reserve_memtype returned %d\n", retval);
+ return NULL;
+ }
+
+ if (prot_val != new_prot_val) {
+ /*
+ * Do not fallback to certain memory types with certain
+ * requested type:
+ * - request is uncached, return cannot be write-back
+ * - request is uncached, return cannot be write-combine
+ * - request is write-combine, return cannot be write-back
+ */
+ if ((prot_val == _PAGE_CACHE_UC &&
+ (new_prot_val == _PAGE_CACHE_WB ||
+ new_prot_val == _PAGE_CACHE_WC)) ||
+ (prot_val == _PAGE_CACHE_WC &&
+ new_prot_val == _PAGE_CACHE_WB)) {
+ pr_debug(
+ "ioremap error for 0x%llx-0x%llx, requested 0x%lx, got 0x%lx\n",
+ (unsigned long long)phys_addr,
+ (unsigned long long)(phys_addr + size),
+ prot_val, new_prot_val);
+ free_memtype(phys_addr, phys_addr + size);
+ return NULL;
+ }
+ prot_val = new_prot_val;
+ }
+
+ switch (prot_val) {
+ case _PAGE_CACHE_UC:
+ default:
+ prot = PAGE_KERNEL_NOCACHE;
+ break;
+ case _PAGE_CACHE_WC:
+ prot = PAGE_KERNEL_WC;
+ break;
+ case _PAGE_CACHE_WB:
+ prot = PAGE_KERNEL;
+ break;
+ }
+
/*
* Ok, go for it..
*/
- area = get_vm_area(size, VM_IOREMAP);
+ area = get_vm_area_caller(size, VM_IOREMAP, caller);
if (!area)
return NULL;
area->phys_addr = phys_addr;
vaddr = (unsigned long) area->addr;
if (ioremap_page_range(vaddr, vaddr + size, phys_addr, prot)) {
+ free_memtype(phys_addr, phys_addr + size);
free_vm_area(area);
return NULL;
}
- if (ioremap_change_attr(vaddr, size, mode) < 0) {
+ if (ioremap_change_attr(vaddr, size, prot_val) < 0) {
+ free_memtype(phys_addr, phys_addr + size);
vunmap(area->addr);
return NULL;
}
@@ -199,13 +255,35 @@ static void __iomem *__ioremap(resource_size_t phys_addr, unsigned long size,
*/
void __iomem *ioremap_nocache(resource_size_t phys_addr, unsigned long size)
{
- return __ioremap(phys_addr, size, IOR_MODE_UNCACHED);
+ return __ioremap_caller(phys_addr, size, _PAGE_CACHE_UC,
+ __builtin_return_address(0));
}
EXPORT_SYMBOL(ioremap_nocache);
+/**
+ * ioremap_wc - map memory into CPU space write combined
+ * @offset: bus address of the memory
+ * @size: size of the resource to map
+ *
+ * This version of ioremap ensures that the memory is marked write combining.
+ * Write combining allows faster writes to some hardware devices.
+ *
+ * Must be freed with iounmap.
+ */
+void __iomem *ioremap_wc(unsigned long phys_addr, unsigned long size)
+{
+ if (pat_wc_enabled)
+ return __ioremap_caller(phys_addr, size, _PAGE_CACHE_WC,
+ __builtin_return_address(0));
+ else
+ return ioremap_nocache(phys_addr, size);
+}
+EXPORT_SYMBOL(ioremap_wc);
+
void __iomem *ioremap_cache(resource_size_t phys_addr, unsigned long size)
{
- return __ioremap(phys_addr, size, IOR_MODE_CACHED);
+ return __ioremap_caller(phys_addr, size, _PAGE_CACHE_WB,
+ __builtin_return_address(0));
}
EXPORT_SYMBOL(ioremap_cache);
@@ -252,6 +330,8 @@ void iounmap(volatile void __iomem *addr)
return;
}
+ free_memtype(p->phys_addr, p->phys_addr + get_vm_area_size(p));
+
/* Finally remove it */
o = remove_vm_area((void *)addr);
BUG_ON(p != o || o == NULL);
@@ -259,6 +339,35 @@ void iounmap(volatile void __iomem *addr)
}
EXPORT_SYMBOL(iounmap);
+/*
+ * Convert a physical pointer to a virtual kernel pointer for /dev/mem
+ * access
+ */
+void *xlate_dev_mem_ptr(unsigned long phys)
+{
+ void *addr;
+ unsigned long start = phys & PAGE_MASK;
+
+ /* If page is RAM, we can use __va. Otherwise ioremap and unmap. */
+ if (page_is_ram(start >> PAGE_SHIFT))
+ return __va(phys);
+
+ addr = (void *)ioremap(start, PAGE_SIZE);
+ if (addr)
+ addr = (void *)((unsigned long)addr | (phys & ~PAGE_MASK));
+
+ return addr;
+}
+
+void unxlate_dev_mem_ptr(unsigned long phys, void *addr)
+{
+ if (page_is_ram(phys >> PAGE_SHIFT))
+ return;
+
+ iounmap((void __iomem *)((unsigned long)addr & PAGE_MASK));
+ return;
+}
+
#ifdef CONFIG_X86_32
int __initdata early_ioremap_debug;
@@ -272,8 +381,8 @@ static int __init early_ioremap_debug_setup(char *str)
early_param("early_ioremap_debug", early_ioremap_debug_setup);
static __initdata int after_paging_init;
-static __initdata pte_t bm_pte[PAGE_SIZE/sizeof(pte_t)]
- __attribute__((aligned(PAGE_SIZE)));
+static pte_t bm_pte[PAGE_SIZE/sizeof(pte_t)]
+ __section(.bss.page_aligned);
static inline pmd_t * __init early_ioremap_pmd(unsigned long addr)
{
@@ -330,7 +439,7 @@ void __init early_ioremap_clear(void)
pmd = early_ioremap_pmd(fix_to_virt(FIX_BTMAP_BEGIN));
pmd_clear(pmd);
- paravirt_release_pt(__pa(bm_pte) >> PAGE_SHIFT);
+ paravirt_release_pte(__pa(bm_pte) >> PAGE_SHIFT);
__flush_tlb_all();
}
diff --git a/arch/x86/mm/k8topology_64.c b/arch/x86/mm/k8topology_64.c
index 7a2ebce87df5..86808e666f9c 100644
--- a/arch/x86/mm/k8topology_64.c
+++ b/arch/x86/mm/k8topology_64.c
@@ -164,7 +164,7 @@ int __init k8_scan_nodes(unsigned long start, unsigned long end)
if (!found)
return -1;
- memnode_shift = compute_hash_shift(nodes, 8);
+ memnode_shift = compute_hash_shift(nodes, 8, NULL);
if (memnode_shift < 0) {
printk(KERN_ERR "No NUMA node hash function found. Contact maintainer\n");
return -1;
diff --git a/arch/x86/mm/numa_64.c b/arch/x86/mm/numa_64.c
index 16b82ad34b96..c5066d519e5d 100644
--- a/arch/x86/mm/numa_64.c
+++ b/arch/x86/mm/numa_64.c
@@ -31,13 +31,15 @@ bootmem_data_t plat_node_bdata[MAX_NUMNODES];
struct memnode memnode;
+#ifdef CONFIG_SMP
int x86_cpu_to_node_map_init[NR_CPUS] = {
[0 ... NR_CPUS-1] = NUMA_NO_NODE
};
void *x86_cpu_to_node_map_early_ptr;
+EXPORT_SYMBOL(x86_cpu_to_node_map_early_ptr);
+#endif
DEFINE_PER_CPU(int, x86_cpu_to_node_map) = NUMA_NO_NODE;
EXPORT_PER_CPU_SYMBOL(x86_cpu_to_node_map);
-EXPORT_SYMBOL(x86_cpu_to_node_map_early_ptr);
s16 apicid_to_node[MAX_LOCAL_APIC] __cpuinitdata = {
[0 ... MAX_LOCAL_APIC-1] = NUMA_NO_NODE
@@ -58,7 +60,7 @@ unsigned long __initdata nodemap_size;
* -1 if node overlap or lost ram (shift too big)
*/
static int __init populate_memnodemap(const struct bootnode *nodes,
- int numnodes, int shift)
+ int numnodes, int shift, int *nodeids)
{
unsigned long addr, end;
int i, res = -1;
@@ -74,7 +76,12 @@ static int __init populate_memnodemap(const struct bootnode *nodes,
do {
if (memnodemap[addr >> shift] != NUMA_NO_NODE)
return -1;
- memnodemap[addr >> shift] = i;
+
+ if (!nodeids)
+ memnodemap[addr >> shift] = i;
+ else
+ memnodemap[addr >> shift] = nodeids[i];
+
addr += (1UL << shift);
} while (addr < end);
res = 1;
@@ -137,7 +144,8 @@ static int __init extract_lsb_from_nodes(const struct bootnode *nodes,
return i;
}
-int __init compute_hash_shift(struct bootnode *nodes, int numnodes)
+int __init compute_hash_shift(struct bootnode *nodes, int numnodes,
+ int *nodeids)
{
int shift;
@@ -147,7 +155,7 @@ int __init compute_hash_shift(struct bootnode *nodes, int numnodes)
printk(KERN_DEBUG "NUMA: Using %d for the hash shift.\n",
shift);
- if (populate_memnodemap(nodes, numnodes, shift) != 1) {
+ if (populate_memnodemap(nodes, numnodes, shift, nodeids) != 1) {
printk(KERN_INFO "Your memory is not aligned you need to "
"rebuild your kernel with a bigger NODEMAPSIZE "
"shift=%d\n", shift);
@@ -188,6 +196,7 @@ void __init setup_node_bootmem(int nodeid, unsigned long start,
unsigned long bootmap_start, nodedata_phys;
void *bootmap;
const int pgdat_size = round_up(sizeof(pg_data_t), PAGE_SIZE);
+ int nid;
start = round_up(start, ZONE_ALIGN);
@@ -210,9 +219,19 @@ void __init setup_node_bootmem(int nodeid, unsigned long start,
NODE_DATA(nodeid)->node_start_pfn = start_pfn;
NODE_DATA(nodeid)->node_spanned_pages = end_pfn - start_pfn;
- /* Find a place for the bootmem map */
+ /*
+ * Find a place for the bootmem map
+ * nodedata_phys could be on other nodes by alloc_bootmem,
+ * so need to sure bootmap_start not to be small, otherwise
+ * early_node_mem will get that with find_e820_area instead
+ * of alloc_bootmem, that could clash with reserved range
+ */
bootmap_pages = bootmem_bootmap_pages(end_pfn - start_pfn);
- bootmap_start = round_up(nodedata_phys + pgdat_size, PAGE_SIZE);
+ nid = phys_to_nid(nodedata_phys);
+ if (nid == nodeid)
+ bootmap_start = round_up(nodedata_phys + pgdat_size, PAGE_SIZE);
+ else
+ bootmap_start = round_up(start, PAGE_SIZE);
/*
* SMP_CAHCE_BYTES could be enough, but init_bootmem_node like
* to use that to align to PAGE_SIZE
@@ -237,10 +256,29 @@ void __init setup_node_bootmem(int nodeid, unsigned long start,
free_bootmem_with_active_regions(nodeid, end);
- reserve_bootmem_node(NODE_DATA(nodeid), nodedata_phys, pgdat_size,
- BOOTMEM_DEFAULT);
- reserve_bootmem_node(NODE_DATA(nodeid), bootmap_start,
- bootmap_pages<<PAGE_SHIFT, BOOTMEM_DEFAULT);
+ /*
+ * convert early reserve to bootmem reserve earlier
+ * otherwise early_node_mem could use early reserved mem
+ * on previous node
+ */
+ early_res_to_bootmem(start, end);
+
+ /*
+ * in some case early_node_mem could use alloc_bootmem
+ * to get range on other node, don't reserve that again
+ */
+ if (nid != nodeid)
+ printk(KERN_INFO " NODE_DATA(%d) on node %d\n", nodeid, nid);
+ else
+ reserve_bootmem_node(NODE_DATA(nodeid), nodedata_phys,
+ pgdat_size, BOOTMEM_DEFAULT);
+ nid = phys_to_nid(bootmap_start);
+ if (nid != nodeid)
+ printk(KERN_INFO " bootmap(%d) on node %d\n", nodeid, nid);
+ else
+ reserve_bootmem_node(NODE_DATA(nodeid), bootmap_start,
+ bootmap_pages<<PAGE_SHIFT, BOOTMEM_DEFAULT);
+
#ifdef CONFIG_ACPI_NUMA
srat_reserve_add_area(nodeid);
#endif
@@ -378,9 +416,10 @@ static int __init split_nodes_by_size(struct bootnode *nodes, u64 *addr,
* Sets up the system RAM area from start_pfn to end_pfn according to the
* numa=fake command-line option.
*/
+static struct bootnode nodes[MAX_NUMNODES] __initdata;
+
static int __init numa_emulation(unsigned long start_pfn, unsigned long end_pfn)
{
- struct bootnode nodes[MAX_NUMNODES];
u64 size, addr = start_pfn << PAGE_SHIFT;
u64 max_addr = end_pfn << PAGE_SHIFT;
int num_nodes = 0, num = 0, coeff_flag, coeff = -1, i;
@@ -460,7 +499,7 @@ done:
}
}
out:
- memnode_shift = compute_hash_shift(nodes, num_nodes);
+ memnode_shift = compute_hash_shift(nodes, num_nodes, NULL);
if (memnode_shift < 0) {
memnode_shift = 0;
printk(KERN_ERR "No NUMA hash function found. NUMA emulation "
@@ -548,8 +587,6 @@ void __cpuinit numa_set_node(int cpu, int node)
{
int *cpu_to_node_map = x86_cpu_to_node_map_early_ptr;
- cpu_pda(cpu)->nodenumber = node;
-
if(cpu_to_node_map)
cpu_to_node_map[cpu] = node;
else if(per_cpu_offset(cpu))
diff --git a/arch/x86/mm/pageattr.c b/arch/x86/mm/pageattr.c
index 7b79f6be4e7d..bd5e05c654dc 100644
--- a/arch/x86/mm/pageattr.c
+++ b/arch/x86/mm/pageattr.c
@@ -9,6 +9,8 @@
#include <linux/slab.h>
#include <linux/mm.h>
#include <linux/interrupt.h>
+#include <linux/seq_file.h>
+#include <linux/debugfs.h>
#include <asm/e820.h>
#include <asm/processor.h>
@@ -17,6 +19,7 @@
#include <asm/uaccess.h>
#include <asm/pgalloc.h>
#include <asm/proto.h>
+#include <asm/pat.h>
/*
* The current flushing context - we pass it instead of 5 arguments:
@@ -28,6 +31,7 @@ struct cpa_data {
int numpages;
int flushtlb;
unsigned long pfn;
+ unsigned force_split : 1;
};
#ifdef CONFIG_X86_64
@@ -259,6 +263,9 @@ try_preserve_large_page(pte_t *kpte, unsigned long address,
int i, do_split = 1;
unsigned int level;
+ if (cpa->force_split)
+ return 1;
+
spin_lock_irqsave(&pgd_lock, flags);
/*
* Check for races, another CPU might have split this page
@@ -476,9 +483,7 @@ static int split_large_page(pte_t *kpte, unsigned long address)
goto out_unlock;
pbase = (pte_t *)page_address(base);
-#ifdef CONFIG_X86_32
- paravirt_alloc_pt(&init_mm, page_to_pfn(base));
-#endif
+ paravirt_alloc_pte(&init_mm, page_to_pfn(base));
ref_prot = pte_pgprot(pte_clrhuge(*kpte));
#ifdef CONFIG_X86_64
@@ -535,7 +540,7 @@ static int __change_page_attr(struct cpa_data *cpa, int primary)
repeat:
kpte = lookup_address(address, &level);
if (!kpte)
- return primary ? -EINVAL : 0;
+ return 0;
old_pte = *kpte;
if (!pte_val(old_pte)) {
@@ -693,7 +698,8 @@ static inline int cache_attr(pgprot_t attr)
}
static int change_page_attr_set_clr(unsigned long addr, int numpages,
- pgprot_t mask_set, pgprot_t mask_clr)
+ pgprot_t mask_set, pgprot_t mask_clr,
+ int force_split)
{
struct cpa_data cpa;
int ret, cache, checkalias;
@@ -704,7 +710,7 @@ static int change_page_attr_set_clr(unsigned long addr, int numpages,
*/
mask_set = canon_pgprot(mask_set);
mask_clr = canon_pgprot(mask_clr);
- if (!pgprot_val(mask_set) && !pgprot_val(mask_clr))
+ if (!pgprot_val(mask_set) && !pgprot_val(mask_clr) && !force_split)
return 0;
/* Ensure we are PAGE_SIZE aligned */
@@ -721,6 +727,7 @@ static int change_page_attr_set_clr(unsigned long addr, int numpages,
cpa.mask_set = mask_set;
cpa.mask_clr = mask_clr;
cpa.flushtlb = 0;
+ cpa.force_split = force_split;
/* No alias checking for _NX bit modifications */
checkalias = (pgprot_val(mask_set) | pgprot_val(mask_clr)) != _PAGE_NX;
@@ -759,26 +766,61 @@ out:
static inline int change_page_attr_set(unsigned long addr, int numpages,
pgprot_t mask)
{
- return change_page_attr_set_clr(addr, numpages, mask, __pgprot(0));
+ return change_page_attr_set_clr(addr, numpages, mask, __pgprot(0), 0);
}
static inline int change_page_attr_clear(unsigned long addr, int numpages,
pgprot_t mask)
{
- return change_page_attr_set_clr(addr, numpages, __pgprot(0), mask);
+ return change_page_attr_set_clr(addr, numpages, __pgprot(0), mask, 0);
}
-int set_memory_uc(unsigned long addr, int numpages)
+int _set_memory_uc(unsigned long addr, int numpages)
{
return change_page_attr_set(addr, numpages,
- __pgprot(_PAGE_PCD));
+ __pgprot(_PAGE_CACHE_UC));
+}
+
+int set_memory_uc(unsigned long addr, int numpages)
+{
+ if (reserve_memtype(addr, addr + numpages * PAGE_SIZE,
+ _PAGE_CACHE_UC, NULL))
+ return -EINVAL;
+
+ return _set_memory_uc(addr, numpages);
}
EXPORT_SYMBOL(set_memory_uc);
-int set_memory_wb(unsigned long addr, int numpages)
+int _set_memory_wc(unsigned long addr, int numpages)
+{
+ return change_page_attr_set(addr, numpages,
+ __pgprot(_PAGE_CACHE_WC));
+}
+
+int set_memory_wc(unsigned long addr, int numpages)
+{
+ if (!pat_wc_enabled)
+ return set_memory_uc(addr, numpages);
+
+ if (reserve_memtype(addr, addr + numpages * PAGE_SIZE,
+ _PAGE_CACHE_WC, NULL))
+ return -EINVAL;
+
+ return _set_memory_wc(addr, numpages);
+}
+EXPORT_SYMBOL(set_memory_wc);
+
+int _set_memory_wb(unsigned long addr, int numpages)
{
return change_page_attr_clear(addr, numpages,
- __pgprot(_PAGE_PCD | _PAGE_PWT));
+ __pgprot(_PAGE_CACHE_MASK));
+}
+
+int set_memory_wb(unsigned long addr, int numpages)
+{
+ free_memtype(addr, addr + numpages * PAGE_SIZE);
+
+ return _set_memory_wb(addr, numpages);
}
EXPORT_SYMBOL(set_memory_wb);
@@ -809,6 +851,12 @@ int set_memory_np(unsigned long addr, int numpages)
return change_page_attr_clear(addr, numpages, __pgprot(_PAGE_PRESENT));
}
+int set_memory_4k(unsigned long addr, int numpages)
+{
+ return change_page_attr_set_clr(addr, numpages, __pgprot(0),
+ __pgprot(0), 1);
+}
+
int set_pages_uc(struct page *page, int numpages)
{
unsigned long addr = (unsigned long)page_address(page);
@@ -918,6 +966,45 @@ void kernel_map_pages(struct page *page, int numpages, int enable)
cpa_fill_pool(NULL);
}
+#ifdef CONFIG_DEBUG_FS
+static int dpa_show(struct seq_file *m, void *v)
+{
+ seq_puts(m, "DEBUG_PAGEALLOC\n");
+ seq_printf(m, "pool_size : %lu\n", pool_size);
+ seq_printf(m, "pool_pages : %lu\n", pool_pages);
+ seq_printf(m, "pool_low : %lu\n", pool_low);
+ seq_printf(m, "pool_used : %lu\n", pool_used);
+ seq_printf(m, "pool_failed : %lu\n", pool_failed);
+
+ return 0;
+}
+
+static int dpa_open(struct inode *inode, struct file *filp)
+{
+ return single_open(filp, dpa_show, NULL);
+}
+
+static const struct file_operations dpa_fops = {
+ .open = dpa_open,
+ .read = seq_read,
+ .llseek = seq_lseek,
+ .release = single_release,
+};
+
+static int __init debug_pagealloc_proc_init(void)
+{
+ struct dentry *de;
+
+ de = debugfs_create_file("debug_pagealloc", 0600, NULL, NULL,
+ &dpa_fops);
+ if (!de)
+ return -ENOMEM;
+
+ return 0;
+}
+__initcall(debug_pagealloc_proc_init);
+#endif
+
#ifdef CONFIG_HIBERNATION
bool kernel_page_present(struct page *page)
diff --git a/arch/x86/mm/pat.c b/arch/x86/mm/pat.c
new file mode 100644
index 000000000000..277446cd30b6
--- /dev/null
+++ b/arch/x86/mm/pat.c
@@ -0,0 +1,596 @@
+/*
+ * Handle caching attributes in page tables (PAT)
+ *
+ * Authors: Venkatesh Pallipadi <venkatesh.pallipadi@intel.com>
+ * Suresh B Siddha <suresh.b.siddha@intel.com>
+ *
+ * Loosely based on earlier PAT patchset from Eric Biederman and Andi Kleen.
+ */
+
+#include <linux/mm.h>
+#include <linux/kernel.h>
+#include <linux/gfp.h>
+#include <linux/fs.h>
+#include <linux/bootmem.h>
+
+#include <asm/msr.h>
+#include <asm/tlbflush.h>
+#include <asm/processor.h>
+#include <asm/page.h>
+#include <asm/pgtable.h>
+#include <asm/pat.h>
+#include <asm/e820.h>
+#include <asm/cacheflush.h>
+#include <asm/fcntl.h>
+#include <asm/mtrr.h>
+#include <asm/io.h>
+
+int pat_wc_enabled = 1;
+
+static u64 __read_mostly boot_pat_state;
+
+static int nopat(char *str)
+{
+ pat_wc_enabled = 0;
+ printk(KERN_INFO "x86: PAT support disabled.\n");
+
+ return 0;
+}
+early_param("nopat", nopat);
+
+static int pat_known_cpu(void)
+{
+ if (!pat_wc_enabled)
+ return 0;
+
+ if (cpu_has_pat)
+ return 1;
+
+ pat_wc_enabled = 0;
+ printk(KERN_INFO "CPU and/or kernel does not support PAT.\n");
+ return 0;
+}
+
+enum {
+ PAT_UC = 0, /* uncached */
+ PAT_WC = 1, /* Write combining */
+ PAT_WT = 4, /* Write Through */
+ PAT_WP = 5, /* Write Protected */
+ PAT_WB = 6, /* Write Back (default) */
+ PAT_UC_MINUS = 7, /* UC, but can be overriden by MTRR */
+};
+
+#define PAT(x,y) ((u64)PAT_ ## y << ((x)*8))
+
+void pat_init(void)
+{
+ u64 pat;
+
+#ifndef CONFIG_X86_PAT
+ nopat(NULL);
+#endif
+
+ /* Boot CPU enables PAT based on CPU feature */
+ if (!smp_processor_id() && !pat_known_cpu())
+ return;
+
+ /* APs enable PAT iff boot CPU has enabled it before */
+ if (smp_processor_id() && !pat_wc_enabled)
+ return;
+
+ /* Set PWT to Write-Combining. All other bits stay the same */
+ /*
+ * PTE encoding used in Linux:
+ * PAT
+ * |PCD
+ * ||PWT
+ * |||
+ * 000 WB _PAGE_CACHE_WB
+ * 001 WC _PAGE_CACHE_WC
+ * 010 UC- _PAGE_CACHE_UC_MINUS
+ * 011 UC _PAGE_CACHE_UC
+ * PAT bit unused
+ */
+ pat = PAT(0,WB) | PAT(1,WC) | PAT(2,UC_MINUS) | PAT(3,UC) |
+ PAT(4,WB) | PAT(5,WC) | PAT(6,UC_MINUS) | PAT(7,UC);
+
+ /* Boot CPU check */
+ if (!smp_processor_id()) {
+ rdmsrl(MSR_IA32_CR_PAT, boot_pat_state);
+ }
+
+ wrmsrl(MSR_IA32_CR_PAT, pat);
+ printk(KERN_INFO "x86 PAT enabled: cpu %d, old 0x%Lx, new 0x%Lx\n",
+ smp_processor_id(), boot_pat_state, pat);
+}
+
+#undef PAT
+
+static char *cattr_name(unsigned long flags)
+{
+ switch (flags & _PAGE_CACHE_MASK) {
+ case _PAGE_CACHE_UC: return "uncached";
+ case _PAGE_CACHE_UC_MINUS: return "uncached-minus";
+ case _PAGE_CACHE_WB: return "write-back";
+ case _PAGE_CACHE_WC: return "write-combining";
+ default: return "broken";
+ }
+}
+
+/*
+ * The global memtype list keeps track of memory type for specific
+ * physical memory areas. Conflicting memory types in different
+ * mappings can cause CPU cache corruption. To avoid this we keep track.
+ *
+ * The list is sorted based on starting address and can contain multiple
+ * entries for each address (this allows reference counting for overlapping
+ * areas). All the aliases have the same cache attributes of course.
+ * Zero attributes are represented as holes.
+ *
+ * Currently the data structure is a list because the number of mappings
+ * are expected to be relatively small. If this should be a problem
+ * it could be changed to a rbtree or similar.
+ *
+ * memtype_lock protects the whole list.
+ */
+
+struct memtype {
+ u64 start;
+ u64 end;
+ unsigned long type;
+ struct list_head nd;
+};
+
+static LIST_HEAD(memtype_list);
+static DEFINE_SPINLOCK(memtype_lock); /* protects memtype list */
+
+/*
+ * Does intersection of PAT memory type and MTRR memory type and returns
+ * the resulting memory type as PAT understands it.
+ * (Type in pat and mtrr will not have same value)
+ * The intersection is based on "Effective Memory Type" tables in IA-32
+ * SDM vol 3a
+ */
+static int pat_x_mtrr_type(u64 start, u64 end, unsigned long prot,
+ unsigned long *ret_prot)
+{
+ unsigned long pat_type;
+ u8 mtrr_type;
+
+ mtrr_type = mtrr_type_lookup(start, end);
+ if (mtrr_type == 0xFF) { /* MTRR not enabled */
+ *ret_prot = prot;
+ return 0;
+ }
+ if (mtrr_type == 0xFE) { /* MTRR match error */
+ *ret_prot = _PAGE_CACHE_UC;
+ return -1;
+ }
+ if (mtrr_type != MTRR_TYPE_UNCACHABLE &&
+ mtrr_type != MTRR_TYPE_WRBACK &&
+ mtrr_type != MTRR_TYPE_WRCOMB) { /* MTRR type unhandled */
+ *ret_prot = _PAGE_CACHE_UC;
+ return -1;
+ }
+
+ pat_type = prot & _PAGE_CACHE_MASK;
+ prot &= (~_PAGE_CACHE_MASK);
+
+ /* Currently doing intersection by hand. Optimize it later. */
+ if (pat_type == _PAGE_CACHE_WC) {
+ *ret_prot = prot | _PAGE_CACHE_WC;
+ } else if (pat_type == _PAGE_CACHE_UC_MINUS) {
+ *ret_prot = prot | _PAGE_CACHE_UC_MINUS;
+ } else if (pat_type == _PAGE_CACHE_UC ||
+ mtrr_type == MTRR_TYPE_UNCACHABLE) {
+ *ret_prot = prot | _PAGE_CACHE_UC;
+ } else if (mtrr_type == MTRR_TYPE_WRCOMB) {
+ *ret_prot = prot | _PAGE_CACHE_WC;
+ } else {
+ *ret_prot = prot | _PAGE_CACHE_WB;
+ }
+
+ return 0;
+}
+
+/*
+ * req_type typically has one of the:
+ * - _PAGE_CACHE_WB
+ * - _PAGE_CACHE_WC
+ * - _PAGE_CACHE_UC_MINUS
+ * - _PAGE_CACHE_UC
+ *
+ * req_type will have a special case value '-1', when requester want to inherit
+ * the memory type from mtrr (if WB), existing PAT, defaulting to UC_MINUS.
+ *
+ * If ret_type is NULL, function will return an error if it cannot reserve the
+ * region with req_type. If ret_type is non-null, function will return
+ * available type in ret_type in case of no error. In case of any error
+ * it will return a negative return value.
+ */
+int reserve_memtype(u64 start, u64 end, unsigned long req_type,
+ unsigned long *ret_type)
+{
+ struct memtype *new_entry = NULL;
+ struct memtype *parse;
+ unsigned long actual_type;
+ int err = 0;
+
+ /* Only track when pat_wc_enabled */
+ if (!pat_wc_enabled) {
+ /* This is identical to page table setting without PAT */
+ if (ret_type) {
+ if (req_type == -1) {
+ *ret_type = _PAGE_CACHE_WB;
+ } else {
+ *ret_type = req_type;
+ }
+ }
+ return 0;
+ }
+
+ /* Low ISA region is always mapped WB in page table. No need to track */
+ if (start >= ISA_START_ADDRESS && (end - 1) <= ISA_END_ADDRESS) {
+ if (ret_type)
+ *ret_type = _PAGE_CACHE_WB;
+
+ return 0;
+ }
+
+ if (req_type == -1) {
+ /*
+ * Special case where caller wants to inherit from mtrr or
+ * existing pat mapping, defaulting to UC_MINUS in case of
+ * no match.
+ */
+ u8 mtrr_type = mtrr_type_lookup(start, end);
+ if (mtrr_type == 0xFE) { /* MTRR match error */
+ err = -1;
+ }
+
+ if (mtrr_type == MTRR_TYPE_WRBACK) {
+ req_type = _PAGE_CACHE_WB;
+ actual_type = _PAGE_CACHE_WB;
+ } else {
+ req_type = _PAGE_CACHE_UC_MINUS;
+ actual_type = _PAGE_CACHE_UC_MINUS;
+ }
+ } else {
+ req_type &= _PAGE_CACHE_MASK;
+ err = pat_x_mtrr_type(start, end, req_type, &actual_type);
+ }
+
+ if (err) {
+ if (ret_type)
+ *ret_type = actual_type;
+
+ return -EINVAL;
+ }
+
+ new_entry = kmalloc(sizeof(struct memtype), GFP_KERNEL);
+ if (!new_entry)
+ return -ENOMEM;
+
+ new_entry->start = start;
+ new_entry->end = end;
+ new_entry->type = actual_type;
+
+ if (ret_type)
+ *ret_type = actual_type;
+
+ spin_lock(&memtype_lock);
+
+ /* Search for existing mapping that overlaps the current range */
+ list_for_each_entry(parse, &memtype_list, nd) {
+ struct memtype *saved_ptr;
+
+ if (parse->start >= end) {
+ pr_debug("New Entry\n");
+ list_add(&new_entry->nd, parse->nd.prev);
+ new_entry = NULL;
+ break;
+ }
+
+ if (start <= parse->start && end >= parse->start) {
+ if (actual_type != parse->type && ret_type) {
+ actual_type = parse->type;
+ *ret_type = actual_type;
+ new_entry->type = actual_type;
+ }
+
+ if (actual_type != parse->type) {
+ printk(
+ KERN_INFO "%s:%d conflicting memory types %Lx-%Lx %s<->%s\n",
+ current->comm, current->pid,
+ start, end,
+ cattr_name(actual_type),
+ cattr_name(parse->type));
+ err = -EBUSY;
+ break;
+ }
+
+ saved_ptr = parse;
+ /*
+ * Check to see whether the request overlaps more
+ * than one entry in the list
+ */
+ list_for_each_entry_continue(parse, &memtype_list, nd) {
+ if (end <= parse->start) {
+ break;
+ }
+
+ if (actual_type != parse->type) {
+ printk(
+ KERN_INFO "%s:%d conflicting memory types %Lx-%Lx %s<->%s\n",
+ current->comm, current->pid,
+ start, end,
+ cattr_name(actual_type),
+ cattr_name(parse->type));
+ err = -EBUSY;
+ break;
+ }
+ }
+
+ if (err) {
+ break;
+ }
+
+ pr_debug("Overlap at 0x%Lx-0x%Lx\n",
+ saved_ptr->start, saved_ptr->end);
+ /* No conflict. Go ahead and add this new entry */
+ list_add(&new_entry->nd, saved_ptr->nd.prev);
+ new_entry = NULL;
+ break;
+ }
+
+ if (start < parse->end) {
+ if (actual_type != parse->type && ret_type) {
+ actual_type = parse->type;
+ *ret_type = actual_type;
+ new_entry->type = actual_type;
+ }
+
+ if (actual_type != parse->type) {
+ printk(
+ KERN_INFO "%s:%d conflicting memory types %Lx-%Lx %s<->%s\n",
+ current->comm, current->pid,
+ start, end,
+ cattr_name(actual_type),
+ cattr_name(parse->type));
+ err = -EBUSY;
+ break;
+ }
+
+ saved_ptr = parse;
+ /*
+ * Check to see whether the request overlaps more
+ * than one entry in the list
+ */
+ list_for_each_entry_continue(parse, &memtype_list, nd) {
+ if (end <= parse->start) {
+ break;
+ }
+
+ if (actual_type != parse->type) {
+ printk(
+ KERN_INFO "%s:%d conflicting memory types %Lx-%Lx %s<->%s\n",
+ current->comm, current->pid,
+ start, end,
+ cattr_name(actual_type),
+ cattr_name(parse->type));
+ err = -EBUSY;
+ break;
+ }
+ }
+
+ if (err) {
+ break;
+ }
+
+ pr_debug(KERN_INFO "Overlap at 0x%Lx-0x%Lx\n",
+ saved_ptr->start, saved_ptr->end);
+ /* No conflict. Go ahead and add this new entry */
+ list_add(&new_entry->nd, &saved_ptr->nd);
+ new_entry = NULL;
+ break;
+ }
+ }
+
+ if (err) {
+ printk(KERN_INFO
+ "reserve_memtype failed 0x%Lx-0x%Lx, track %s, req %s\n",
+ start, end, cattr_name(new_entry->type),
+ cattr_name(req_type));
+ kfree(new_entry);
+ spin_unlock(&memtype_lock);
+ return err;
+ }
+
+ if (new_entry) {
+ /* No conflict. Not yet added to the list. Add to the tail */
+ list_add_tail(&new_entry->nd, &memtype_list);
+ pr_debug("New Entry\n");
+ }
+
+ if (ret_type) {
+ pr_debug(
+ "reserve_memtype added 0x%Lx-0x%Lx, track %s, req %s, ret %s\n",
+ start, end, cattr_name(actual_type),
+ cattr_name(req_type), cattr_name(*ret_type));
+ } else {
+ pr_debug(
+ "reserve_memtype added 0x%Lx-0x%Lx, track %s, req %s\n",
+ start, end, cattr_name(actual_type),
+ cattr_name(req_type));
+ }
+
+ spin_unlock(&memtype_lock);
+ return err;
+}
+
+int free_memtype(u64 start, u64 end)
+{
+ struct memtype *ml;
+ int err = -EINVAL;
+
+ /* Only track when pat_wc_enabled */
+ if (!pat_wc_enabled) {
+ return 0;
+ }
+
+ /* Low ISA region is always mapped WB. No need to track */
+ if (start >= ISA_START_ADDRESS && end <= ISA_END_ADDRESS) {
+ return 0;
+ }
+
+ spin_lock(&memtype_lock);
+ list_for_each_entry(ml, &memtype_list, nd) {
+ if (ml->start == start && ml->end == end) {
+ list_del(&ml->nd);
+ kfree(ml);
+ err = 0;
+ break;
+ }
+ }
+ spin_unlock(&memtype_lock);
+
+ if (err) {
+ printk(KERN_INFO "%s:%d freeing invalid memtype %Lx-%Lx\n",
+ current->comm, current->pid, start, end);
+ }
+
+ pr_debug("free_memtype request 0x%Lx-0x%Lx\n", start, end);
+ return err;
+}
+
+
+/*
+ * /dev/mem mmap interface. The memtype used for mapping varies:
+ * - Use UC for mappings with O_SYNC flag
+ * - Without O_SYNC flag, if there is any conflict in reserve_memtype,
+ * inherit the memtype from existing mapping.
+ * - Else use UC_MINUS memtype (for backward compatibility with existing
+ * X drivers.
+ */
+pgprot_t phys_mem_access_prot(struct file *file, unsigned long pfn,
+ unsigned long size, pgprot_t vma_prot)
+{
+ return vma_prot;
+}
+
+#ifdef CONFIG_NONPROMISC_DEVMEM
+/* This check is done in drivers/char/mem.c in case of NONPROMISC_DEVMEM*/
+static inline int range_is_allowed(unsigned long pfn, unsigned long size)
+{
+ return 1;
+}
+#else
+static inline int range_is_allowed(unsigned long pfn, unsigned long size)
+{
+ u64 from = ((u64)pfn) << PAGE_SHIFT;
+ u64 to = from + size;
+ u64 cursor = from;
+
+ while (cursor < to) {
+ if (!devmem_is_allowed(pfn)) {
+ printk(KERN_INFO
+ "Program %s tried to access /dev/mem between %Lx->%Lx.\n",
+ current->comm, from, to);
+ return 0;
+ }
+ cursor += PAGE_SIZE;
+ pfn++;
+ }
+ return 1;
+}
+#endif /* CONFIG_NONPROMISC_DEVMEM */
+
+int phys_mem_access_prot_allowed(struct file *file, unsigned long pfn,
+ unsigned long size, pgprot_t *vma_prot)
+{
+ u64 offset = ((u64) pfn) << PAGE_SHIFT;
+ unsigned long flags = _PAGE_CACHE_UC_MINUS;
+ int retval;
+
+ if (!range_is_allowed(pfn, size))
+ return 0;
+
+ if (file->f_flags & O_SYNC) {
+ flags = _PAGE_CACHE_UC;
+ }
+
+#ifdef CONFIG_X86_32
+ /*
+ * On the PPro and successors, the MTRRs are used to set
+ * memory types for physical addresses outside main memory,
+ * so blindly setting UC or PWT on those pages is wrong.
+ * For Pentiums and earlier, the surround logic should disable
+ * caching for the high addresses through the KEN pin, but
+ * we maintain the tradition of paranoia in this code.
+ */
+ if (!pat_wc_enabled &&
+ ! ( test_bit(X86_FEATURE_MTRR, boot_cpu_data.x86_capability) ||
+ test_bit(X86_FEATURE_K6_MTRR, boot_cpu_data.x86_capability) ||
+ test_bit(X86_FEATURE_CYRIX_ARR, boot_cpu_data.x86_capability) ||
+ test_bit(X86_FEATURE_CENTAUR_MCR, boot_cpu_data.x86_capability)) &&
+ (pfn << PAGE_SHIFT) >= __pa(high_memory)) {
+ flags = _PAGE_CACHE_UC;
+ }
+#endif
+
+ /*
+ * With O_SYNC, we can only take UC mapping. Fail if we cannot.
+ * Without O_SYNC, we want to get
+ * - WB for WB-able memory and no other conflicting mappings
+ * - UC_MINUS for non-WB-able memory with no other conflicting mappings
+ * - Inherit from confliting mappings otherwise
+ */
+ if (flags != _PAGE_CACHE_UC_MINUS) {
+ retval = reserve_memtype(offset, offset + size, flags, NULL);
+ } else {
+ retval = reserve_memtype(offset, offset + size, -1, &flags);
+ }
+
+ if (retval < 0)
+ return 0;
+
+ if (pfn <= max_pfn_mapped &&
+ ioremap_change_attr((unsigned long)__va(offset), size, flags) < 0) {
+ free_memtype(offset, offset + size);
+ printk(KERN_INFO
+ "%s:%d /dev/mem ioremap_change_attr failed %s for %Lx-%Lx\n",
+ current->comm, current->pid,
+ cattr_name(flags),
+ offset, offset + size);
+ return 0;
+ }
+
+ *vma_prot = __pgprot((pgprot_val(*vma_prot) & ~_PAGE_CACHE_MASK) |
+ flags);
+ return 1;
+}
+
+void map_devmem(unsigned long pfn, unsigned long size, pgprot_t vma_prot)
+{
+ u64 addr = (u64)pfn << PAGE_SHIFT;
+ unsigned long flags;
+ unsigned long want_flags = (pgprot_val(vma_prot) & _PAGE_CACHE_MASK);
+
+ reserve_memtype(addr, addr + size, want_flags, &flags);
+ if (flags != want_flags) {
+ printk(KERN_INFO
+ "%s:%d /dev/mem expected mapping type %s for %Lx-%Lx, got %s\n",
+ current->comm, current->pid,
+ cattr_name(want_flags),
+ addr, addr + size,
+ cattr_name(flags));
+ }
+}
+
+void unmap_devmem(unsigned long pfn, unsigned long size, pgprot_t vma_prot)
+{
+ u64 addr = (u64)pfn << PAGE_SHIFT;
+
+ free_memtype(addr, addr + size);
+}
+
diff --git a/arch/x86/mm/pgtable.c b/arch/x86/mm/pgtable.c
new file mode 100644
index 000000000000..50159764f694
--- /dev/null
+++ b/arch/x86/mm/pgtable.c
@@ -0,0 +1,276 @@
+#include <linux/mm.h>
+#include <asm/pgalloc.h>
+#include <asm/pgtable.h>
+#include <asm/tlb.h>
+
+pte_t *pte_alloc_one_kernel(struct mm_struct *mm, unsigned long address)
+{
+ return (pte_t *)__get_free_page(GFP_KERNEL|__GFP_REPEAT|__GFP_ZERO);
+}
+
+pgtable_t pte_alloc_one(struct mm_struct *mm, unsigned long address)
+{
+ struct page *pte;
+
+#ifdef CONFIG_HIGHPTE
+ pte = alloc_pages(GFP_KERNEL|__GFP_HIGHMEM|__GFP_REPEAT|__GFP_ZERO, 0);
+#else
+ pte = alloc_pages(GFP_KERNEL|__GFP_REPEAT|__GFP_ZERO, 0);
+#endif
+ if (pte)
+ pgtable_page_ctor(pte);
+ return pte;
+}
+
+void __pte_free_tlb(struct mmu_gather *tlb, struct page *pte)
+{
+ pgtable_page_dtor(pte);
+ paravirt_release_pte(page_to_pfn(pte));
+ tlb_remove_page(tlb, pte);
+}
+
+#if PAGETABLE_LEVELS > 2
+void __pmd_free_tlb(struct mmu_gather *tlb, pmd_t *pmd)
+{
+ paravirt_release_pmd(__pa(pmd) >> PAGE_SHIFT);
+ tlb_remove_page(tlb, virt_to_page(pmd));
+}
+
+#if PAGETABLE_LEVELS > 3
+void __pud_free_tlb(struct mmu_gather *tlb, pud_t *pud)
+{
+ paravirt_release_pud(__pa(pud) >> PAGE_SHIFT);
+ tlb_remove_page(tlb, virt_to_page(pud));
+}
+#endif /* PAGETABLE_LEVELS > 3 */
+#endif /* PAGETABLE_LEVELS > 2 */
+
+static inline void pgd_list_add(pgd_t *pgd)
+{
+ struct page *page = virt_to_page(pgd);
+
+ list_add(&page->lru, &pgd_list);
+}
+
+static inline void pgd_list_del(pgd_t *pgd)
+{
+ struct page *page = virt_to_page(pgd);
+
+ list_del(&page->lru);
+}
+
+#define UNSHARED_PTRS_PER_PGD \
+ (SHARED_KERNEL_PMD ? KERNEL_PGD_BOUNDARY : PTRS_PER_PGD)
+
+static void pgd_ctor(void *p)
+{
+ pgd_t *pgd = p;
+ unsigned long flags;
+
+ /* Clear usermode parts of PGD */
+ memset(pgd, 0, KERNEL_PGD_BOUNDARY*sizeof(pgd_t));
+
+ spin_lock_irqsave(&pgd_lock, flags);
+
+ /* If the pgd points to a shared pagetable level (either the
+ ptes in non-PAE, or shared PMD in PAE), then just copy the
+ references from swapper_pg_dir. */
+ if (PAGETABLE_LEVELS == 2 ||
+ (PAGETABLE_LEVELS == 3 && SHARED_KERNEL_PMD) ||
+ PAGETABLE_LEVELS == 4) {
+ clone_pgd_range(pgd + KERNEL_PGD_BOUNDARY,
+ swapper_pg_dir + KERNEL_PGD_BOUNDARY,
+ KERNEL_PGD_PTRS);
+ paravirt_alloc_pmd_clone(__pa(pgd) >> PAGE_SHIFT,
+ __pa(swapper_pg_dir) >> PAGE_SHIFT,
+ KERNEL_PGD_BOUNDARY,
+ KERNEL_PGD_PTRS);
+ }
+
+ /* list required to sync kernel mapping updates */
+ if (!SHARED_KERNEL_PMD)
+ pgd_list_add(pgd);
+
+ spin_unlock_irqrestore(&pgd_lock, flags);
+}
+
+static void pgd_dtor(void *pgd)
+{
+ unsigned long flags; /* can be called from interrupt context */
+
+ if (SHARED_KERNEL_PMD)
+ return;
+
+ spin_lock_irqsave(&pgd_lock, flags);
+ pgd_list_del(pgd);
+ spin_unlock_irqrestore(&pgd_lock, flags);
+}
+
+/*
+ * List of all pgd's needed for non-PAE so it can invalidate entries
+ * in both cached and uncached pgd's; not needed for PAE since the
+ * kernel pmd is shared. If PAE were not to share the pmd a similar
+ * tactic would be needed. This is essentially codepath-based locking
+ * against pageattr.c; it is the unique case in which a valid change
+ * of kernel pagetables can't be lazily synchronized by vmalloc faults.
+ * vmalloc faults work because attached pagetables are never freed.
+ * -- wli
+ */
+
+#ifdef CONFIG_X86_PAE
+/*
+ * Mop up any pmd pages which may still be attached to the pgd.
+ * Normally they will be freed by munmap/exit_mmap, but any pmd we
+ * preallocate which never got a corresponding vma will need to be
+ * freed manually.
+ */
+static void pgd_mop_up_pmds(struct mm_struct *mm, pgd_t *pgdp)
+{
+ int i;
+
+ for(i = 0; i < UNSHARED_PTRS_PER_PGD; i++) {
+ pgd_t pgd = pgdp[i];
+
+ if (pgd_val(pgd) != 0) {
+ pmd_t *pmd = (pmd_t *)pgd_page_vaddr(pgd);
+
+ pgdp[i] = native_make_pgd(0);
+
+ paravirt_release_pmd(pgd_val(pgd) >> PAGE_SHIFT);
+ pmd_free(mm, pmd);
+ }
+ }
+}
+
+/*
+ * In PAE mode, we need to do a cr3 reload (=tlb flush) when
+ * updating the top-level pagetable entries to guarantee the
+ * processor notices the update. Since this is expensive, and
+ * all 4 top-level entries are used almost immediately in a
+ * new process's life, we just pre-populate them here.
+ *
+ * Also, if we're in a paravirt environment where the kernel pmd is
+ * not shared between pagetables (!SHARED_KERNEL_PMDS), we allocate
+ * and initialize the kernel pmds here.
+ */
+static int pgd_prepopulate_pmd(struct mm_struct *mm, pgd_t *pgd)
+{
+ pud_t *pud;
+ unsigned long addr;
+ int i;
+
+ pud = pud_offset(pgd, 0);
+ for (addr = i = 0; i < UNSHARED_PTRS_PER_PGD;
+ i++, pud++, addr += PUD_SIZE) {
+ pmd_t *pmd = pmd_alloc_one(mm, addr);
+
+ if (!pmd) {
+ pgd_mop_up_pmds(mm, pgd);
+ return 0;
+ }
+
+ if (i >= KERNEL_PGD_BOUNDARY)
+ memcpy(pmd, (pmd_t *)pgd_page_vaddr(swapper_pg_dir[i]),
+ sizeof(pmd_t) * PTRS_PER_PMD);
+
+ pud_populate(mm, pud, pmd);
+ }
+
+ return 1;
+}
+
+void pud_populate(struct mm_struct *mm, pud_t *pudp, pmd_t *pmd)
+{
+ paravirt_alloc_pmd(mm, __pa(pmd) >> PAGE_SHIFT);
+
+ /* Note: almost everything apart from _PAGE_PRESENT is
+ reserved at the pmd (PDPT) level. */
+ set_pud(pudp, __pud(__pa(pmd) | _PAGE_PRESENT));
+
+ /*
+ * According to Intel App note "TLBs, Paging-Structure Caches,
+ * and Their Invalidation", April 2007, document 317080-001,
+ * section 8.1: in PAE mode we explicitly have to flush the
+ * TLB via cr3 if the top-level pgd is changed...
+ */
+ if (mm == current->active_mm)
+ write_cr3(read_cr3());
+}
+#else /* !CONFIG_X86_PAE */
+/* No need to prepopulate any pagetable entries in non-PAE modes. */
+static int pgd_prepopulate_pmd(struct mm_struct *mm, pgd_t *pgd)
+{
+ return 1;
+}
+
+static void pgd_mop_up_pmds(struct mm_struct *mm, pgd_t *pgd)
+{
+}
+#endif /* CONFIG_X86_PAE */
+
+pgd_t *pgd_alloc(struct mm_struct *mm)
+{
+ pgd_t *pgd = (pgd_t *)__get_free_page(GFP_KERNEL | __GFP_ZERO);
+
+ /* so that alloc_pmd can use it */
+ mm->pgd = pgd;
+ if (pgd)
+ pgd_ctor(pgd);
+
+ if (pgd && !pgd_prepopulate_pmd(mm, pgd)) {
+ pgd_dtor(pgd);
+ free_page((unsigned long)pgd);
+ pgd = NULL;
+ }
+
+ return pgd;
+}
+
+void pgd_free(struct mm_struct *mm, pgd_t *pgd)
+{
+ pgd_mop_up_pmds(mm, pgd);
+ pgd_dtor(pgd);
+ free_page((unsigned long)pgd);
+}
+
+int ptep_set_access_flags(struct vm_area_struct *vma,
+ unsigned long address, pte_t *ptep,
+ pte_t entry, int dirty)
+{
+ int changed = !pte_same(*ptep, entry);
+
+ if (changed && dirty) {
+ *ptep = entry;
+ pte_update_defer(vma->vm_mm, address, ptep);
+ flush_tlb_page(vma, address);
+ }
+
+ return changed;
+}
+
+int ptep_test_and_clear_young(struct vm_area_struct *vma,
+ unsigned long addr, pte_t *ptep)
+{
+ int ret = 0;
+
+ if (pte_young(*ptep))
+ ret = test_and_clear_bit(_PAGE_BIT_ACCESSED,
+ &ptep->pte);
+
+ if (ret)
+ pte_update(vma->vm_mm, addr, ptep);
+
+ return ret;
+}
+
+int ptep_clear_flush_young(struct vm_area_struct *vma,
+ unsigned long address, pte_t *ptep)
+{
+ int young;
+
+ young = ptep_test_and_clear_young(vma, address, ptep);
+ if (young)
+ flush_tlb_page(vma, address);
+
+ return young;
+}
diff --git a/arch/x86/mm/pgtable_32.c b/arch/x86/mm/pgtable_32.c
index 2f9e9afcb9f4..9ee007be9142 100644
--- a/arch/x86/mm/pgtable_32.c
+++ b/arch/x86/mm/pgtable_32.c
@@ -1,7 +1,3 @@
-/*
- * linux/arch/i386/mm/pgtable.c
- */
-
#include <linux/sched.h>
#include <linux/kernel.h>
#include <linux/errno.h>
@@ -36,7 +32,6 @@ void show_mem(void)
printk(KERN_INFO "Mem-info:\n");
show_free_areas();
- printk(KERN_INFO "Free swap: %6ldkB\n", nr_swap_pages<<(PAGE_SHIFT-10));
for_each_online_pgdat(pgdat) {
pgdat_resize_lock(pgdat, &flags);
for (i = 0; i < pgdat->node_spanned_pages; ++i) {
@@ -178,206 +173,9 @@ void reserve_top_address(unsigned long reserve)
__VMALLOC_RESERVE += reserve;
}
-pte_t *pte_alloc_one_kernel(struct mm_struct *mm, unsigned long address)
-{
- return (pte_t *)__get_free_page(GFP_KERNEL|__GFP_REPEAT|__GFP_ZERO);
-}
-
-pgtable_t pte_alloc_one(struct mm_struct *mm, unsigned long address)
+int pmd_bad(pmd_t pmd)
{
- struct page *pte;
+ WARN_ON_ONCE(pmd_bad_v1(pmd) != pmd_bad_v2(pmd));
-#ifdef CONFIG_HIGHPTE
- pte = alloc_pages(GFP_KERNEL|__GFP_HIGHMEM|__GFP_REPEAT|__GFP_ZERO, 0);
-#else
- pte = alloc_pages(GFP_KERNEL|__GFP_REPEAT|__GFP_ZERO, 0);
-#endif
- if (pte)
- pgtable_page_ctor(pte);
- return pte;
+ return pmd_bad_v1(pmd);
}
-
-/*
- * List of all pgd's needed for non-PAE so it can invalidate entries
- * in both cached and uncached pgd's; not needed for PAE since the
- * kernel pmd is shared. If PAE were not to share the pmd a similar
- * tactic would be needed. This is essentially codepath-based locking
- * against pageattr.c; it is the unique case in which a valid change
- * of kernel pagetables can't be lazily synchronized by vmalloc faults.
- * vmalloc faults work because attached pagetables are never freed.
- * -- wli
- */
-static inline void pgd_list_add(pgd_t *pgd)
-{
- struct page *page = virt_to_page(pgd);
-
- list_add(&page->lru, &pgd_list);
-}
-
-static inline void pgd_list_del(pgd_t *pgd)
-{
- struct page *page = virt_to_page(pgd);
-
- list_del(&page->lru);
-}
-
-#define UNSHARED_PTRS_PER_PGD \
- (SHARED_KERNEL_PMD ? USER_PTRS_PER_PGD : PTRS_PER_PGD)
-
-static void pgd_ctor(void *p)
-{
- pgd_t *pgd = p;
- unsigned long flags;
-
- /* Clear usermode parts of PGD */
- memset(pgd, 0, USER_PTRS_PER_PGD*sizeof(pgd_t));
-
- spin_lock_irqsave(&pgd_lock, flags);
-
- /* If the pgd points to a shared pagetable level (either the
- ptes in non-PAE, or shared PMD in PAE), then just copy the
- references from swapper_pg_dir. */
- if (PAGETABLE_LEVELS == 2 ||
- (PAGETABLE_LEVELS == 3 && SHARED_KERNEL_PMD)) {
- clone_pgd_range(pgd + USER_PTRS_PER_PGD,
- swapper_pg_dir + USER_PTRS_PER_PGD,
- KERNEL_PGD_PTRS);
- paravirt_alloc_pd_clone(__pa(pgd) >> PAGE_SHIFT,
- __pa(swapper_pg_dir) >> PAGE_SHIFT,
- USER_PTRS_PER_PGD,
- KERNEL_PGD_PTRS);
- }
-
- /* list required to sync kernel mapping updates */
- if (!SHARED_KERNEL_PMD)
- pgd_list_add(pgd);
-
- spin_unlock_irqrestore(&pgd_lock, flags);
-}
-
-static void pgd_dtor(void *pgd)
-{
- unsigned long flags; /* can be called from interrupt context */
-
- if (SHARED_KERNEL_PMD)
- return;
-
- spin_lock_irqsave(&pgd_lock, flags);
- pgd_list_del(pgd);
- spin_unlock_irqrestore(&pgd_lock, flags);
-}
-
-#ifdef CONFIG_X86_PAE
-/*
- * Mop up any pmd pages which may still be attached to the pgd.
- * Normally they will be freed by munmap/exit_mmap, but any pmd we
- * preallocate which never got a corresponding vma will need to be
- * freed manually.
- */
-static void pgd_mop_up_pmds(struct mm_struct *mm, pgd_t *pgdp)
-{
- int i;
-
- for(i = 0; i < UNSHARED_PTRS_PER_PGD; i++) {
- pgd_t pgd = pgdp[i];
-
- if (pgd_val(pgd) != 0) {
- pmd_t *pmd = (pmd_t *)pgd_page_vaddr(pgd);
-
- pgdp[i] = native_make_pgd(0);
-
- paravirt_release_pd(pgd_val(pgd) >> PAGE_SHIFT);
- pmd_free(mm, pmd);
- }
- }
-}
-
-/*
- * In PAE mode, we need to do a cr3 reload (=tlb flush) when
- * updating the top-level pagetable entries to guarantee the
- * processor notices the update. Since this is expensive, and
- * all 4 top-level entries are used almost immediately in a
- * new process's life, we just pre-populate them here.
- *
- * Also, if we're in a paravirt environment where the kernel pmd is
- * not shared between pagetables (!SHARED_KERNEL_PMDS), we allocate
- * and initialize the kernel pmds here.
- */
-static int pgd_prepopulate_pmd(struct mm_struct *mm, pgd_t *pgd)
-{
- pud_t *pud;
- unsigned long addr;
- int i;
-
- pud = pud_offset(pgd, 0);
- for (addr = i = 0; i < UNSHARED_PTRS_PER_PGD;
- i++, pud++, addr += PUD_SIZE) {
- pmd_t *pmd = pmd_alloc_one(mm, addr);
-
- if (!pmd) {
- pgd_mop_up_pmds(mm, pgd);
- return 0;
- }
-
- if (i >= USER_PTRS_PER_PGD)
- memcpy(pmd, (pmd_t *)pgd_page_vaddr(swapper_pg_dir[i]),
- sizeof(pmd_t) * PTRS_PER_PMD);
-
- pud_populate(mm, pud, pmd);
- }
-
- return 1;
-}
-#else /* !CONFIG_X86_PAE */
-/* No need to prepopulate any pagetable entries in non-PAE modes. */
-static int pgd_prepopulate_pmd(struct mm_struct *mm, pgd_t *pgd)
-{
- return 1;
-}
-
-static void pgd_mop_up_pmds(struct mm_struct *mm, pgd_t *pgdp)
-{
-}
-#endif /* CONFIG_X86_PAE */
-
-pgd_t *pgd_alloc(struct mm_struct *mm)
-{
- pgd_t *pgd = (pgd_t *)__get_free_page(GFP_KERNEL | __GFP_ZERO);
-
- /* so that alloc_pd can use it */
- mm->pgd = pgd;
- if (pgd)
- pgd_ctor(pgd);
-
- if (pgd && !pgd_prepopulate_pmd(mm, pgd)) {
- pgd_dtor(pgd);
- free_page((unsigned long)pgd);
- pgd = NULL;
- }
-
- return pgd;
-}
-
-void pgd_free(struct mm_struct *mm, pgd_t *pgd)
-{
- pgd_mop_up_pmds(mm, pgd);
- pgd_dtor(pgd);
- free_page((unsigned long)pgd);
-}
-
-void __pte_free_tlb(struct mmu_gather *tlb, struct page *pte)
-{
- pgtable_page_dtor(pte);
- paravirt_release_pt(page_to_pfn(pte));
- tlb_remove_page(tlb, pte);
-}
-
-#ifdef CONFIG_X86_PAE
-
-void __pmd_free_tlb(struct mmu_gather *tlb, pmd_t *pmd)
-{
- paravirt_release_pd(__pa(pmd) >> PAGE_SHIFT);
- tlb_remove_page(tlb, virt_to_page(pmd));
-}
-
-#endif
diff --git a/arch/x86/mm/srat_64.c b/arch/x86/mm/srat_64.c
index 845001c617cc..3890234e5b26 100644
--- a/arch/x86/mm/srat_64.c
+++ b/arch/x86/mm/srat_64.c
@@ -20,6 +20,7 @@
#include <asm/proto.h>
#include <asm/numa.h>
#include <asm/e820.h>
+#include <asm/genapic.h>
int acpi_numa __initdata;
@@ -31,6 +32,10 @@ static struct bootnode nodes_add[MAX_NUMNODES];
static int found_add_area __initdata;
int hotadd_percent __initdata = 0;
+static int num_node_memblks __initdata;
+static struct bootnode node_memblk_range[NR_NODE_MEMBLKS] __initdata;
+static int memblk_nodeid[NR_NODE_MEMBLKS] __initdata;
+
/* Too small nodes confuse the VM badly. Usually they result
from BIOS bugs. */
#define NODE_MIN_SIZE (4*1024*1024)
@@ -40,17 +45,17 @@ static __init int setup_node(int pxm)
return acpi_map_pxm_to_node(pxm);
}
-static __init int conflicting_nodes(unsigned long start, unsigned long end)
+static __init int conflicting_memblks(unsigned long start, unsigned long end)
{
int i;
- for_each_node_mask(i, nodes_parsed) {
- struct bootnode *nd = &nodes[i];
+ for (i = 0; i < num_node_memblks; i++) {
+ struct bootnode *nd = &node_memblk_range[i];
if (nd->start == nd->end)
continue;
if (nd->end > start && nd->start < end)
- return i;
+ return memblk_nodeid[i];
if (nd->end == end && nd->start == start)
- return i;
+ return memblk_nodeid[i];
}
return -1;
}
@@ -132,7 +137,6 @@ acpi_numa_processor_affinity_init(struct acpi_srat_cpu_affinity *pa)
int pxm, node;
int apic_id;
- apic_id = pa->apic_id;
if (srat_disabled())
return;
if (pa->header.length != sizeof(struct acpi_srat_cpu_affinity)) {
@@ -148,13 +152,18 @@ acpi_numa_processor_affinity_init(struct acpi_srat_cpu_affinity *pa)
bad_srat();
return;
}
+
+ if (is_uv_system())
+ apic_id = (pa->apic_id << 8) | pa->local_sapic_eid;
+ else
+ apic_id = pa->apic_id;
apicid_to_node[apic_id] = node;
acpi_numa = 1;
printk(KERN_INFO "SRAT: PXM %u -> APIC %u -> Node %u\n",
pxm, apic_id, node);
}
-int update_end_of_memory(unsigned long end) {return -1;}
+static int update_end_of_memory(unsigned long end) {return -1;}
static int hotadd_enough_memory(struct bootnode *nd) {return 1;}
#ifdef CONFIG_MEMORY_HOTPLUG_SPARSE
static inline int save_add_info(void) {return 1;}
@@ -253,7 +262,7 @@ acpi_numa_memory_affinity_init(struct acpi_srat_mem_affinity *ma)
bad_srat();
return;
}
- i = conflicting_nodes(start, end);
+ i = conflicting_memblks(start, end);
if (i == node) {
printk(KERN_WARNING
"SRAT: Warning: PXM %d (%lx-%lx) overlaps with itself (%Lx-%Lx)\n",
@@ -278,10 +287,10 @@ acpi_numa_memory_affinity_init(struct acpi_srat_mem_affinity *ma)
nd->end = end;
}
- printk(KERN_INFO "SRAT: Node %u PXM %u %Lx-%Lx\n", node, pxm,
- nd->start, nd->end);
- e820_register_active_regions(node, nd->start >> PAGE_SHIFT,
- nd->end >> PAGE_SHIFT);
+ printk(KERN_INFO "SRAT: Node %u PXM %u %lx-%lx\n", node, pxm,
+ start, end);
+ e820_register_active_regions(node, start >> PAGE_SHIFT,
+ end >> PAGE_SHIFT);
push_node_boundaries(node, nd->start >> PAGE_SHIFT,
nd->end >> PAGE_SHIFT);
@@ -293,6 +302,11 @@ acpi_numa_memory_affinity_init(struct acpi_srat_mem_affinity *ma)
if ((nd->start | nd->end) == 0)
node_clear(node, nodes_parsed);
}
+
+ node_memblk_range[num_node_memblks].start = start;
+ node_memblk_range[num_node_memblks].end = end;
+ memblk_nodeid[num_node_memblks] = node;
+ num_node_memblks++;
}
/* Sanity check to catch more bad SRATs (they are amazingly common).
@@ -363,7 +377,8 @@ int __init acpi_scan_nodes(unsigned long start, unsigned long end)
return -1;
}
- memnode_shift = compute_hash_shift(nodes, MAX_NUMNODES);
+ memnode_shift = compute_hash_shift(node_memblk_range, num_node_memblks,
+ memblk_nodeid);
if (memnode_shift < 0) {
printk(KERN_ERR
"SRAT: No NUMA node hash function found. Contact maintainer\n");