diff options
Diffstat (limited to 'arch/x86/mm')
-rw-r--r-- | arch/x86/mm/Makefile | 16 | ||||
-rw-r--r-- | arch/x86/mm/Makefile_32 | 9 | ||||
-rw-r--r-- | arch/x86/mm/Makefile_64 | 9 | ||||
-rw-r--r-- | arch/x86/mm/discontig_32.c | 8 | ||||
-rw-r--r-- | arch/x86/mm/dump_pagetables.c | 354 | ||||
-rw-r--r-- | arch/x86/mm/fault.c | 6 | ||||
-rw-r--r-- | arch/x86/mm/init_32.c | 105 | ||||
-rw-r--r-- | arch/x86/mm/init_64.c | 299 | ||||
-rw-r--r-- | arch/x86/mm/ioremap.c | 191 | ||||
-rw-r--r-- | arch/x86/mm/k8topology_64.c | 2 | ||||
-rw-r--r-- | arch/x86/mm/numa_64.c | 67 | ||||
-rw-r--r-- | arch/x86/mm/pageattr.c | 111 | ||||
-rw-r--r-- | arch/x86/mm/pat.c | 596 | ||||
-rw-r--r-- | arch/x86/mm/pgtable.c | 276 | ||||
-rw-r--r-- | arch/x86/mm/pgtable_32.c | 208 | ||||
-rw-r--r-- | arch/x86/mm/srat_64.c | 41 |
16 files changed, 1866 insertions, 432 deletions
diff --git a/arch/x86/mm/Makefile b/arch/x86/mm/Makefile index 983291096848..b7b3e4c7cfc9 100644 --- a/arch/x86/mm/Makefile +++ b/arch/x86/mm/Makefile @@ -1,5 +1,17 @@ +obj-y := init_$(BITS).o fault.o ioremap.o extable.o pageattr.o mmap.o \ + pat.o pgtable.o + +obj-$(CONFIG_X86_32) += pgtable_32.o + +obj-$(CONFIG_HUGETLB_PAGE) += hugetlbpage.o +obj-$(CONFIG_X86_PTDUMP) += dump_pagetables.o + +obj-$(CONFIG_HIGHMEM) += highmem_32.o + ifeq ($(CONFIG_X86_32),y) -include ${srctree}/arch/x86/mm/Makefile_32 +obj-$(CONFIG_NUMA) += discontig_32.o else -include ${srctree}/arch/x86/mm/Makefile_64 +obj-$(CONFIG_NUMA) += numa_64.o +obj-$(CONFIG_K8_NUMA) += k8topology_64.o +obj-$(CONFIG_ACPI_NUMA) += srat_64.o endif diff --git a/arch/x86/mm/Makefile_32 b/arch/x86/mm/Makefile_32 deleted file mode 100644 index c36ae88bb543..000000000000 --- a/arch/x86/mm/Makefile_32 +++ /dev/null @@ -1,9 +0,0 @@ -# -# Makefile for the linux i386-specific parts of the memory manager. -# - -obj-y := init_32.o pgtable_32.o fault.o ioremap.o extable.o pageattr.o mmap.o - -obj-$(CONFIG_NUMA) += discontig_32.o -obj-$(CONFIG_HUGETLB_PAGE) += hugetlbpage.o -obj-$(CONFIG_HIGHMEM) += highmem_32.o diff --git a/arch/x86/mm/Makefile_64 b/arch/x86/mm/Makefile_64 deleted file mode 100644 index 688c8c28ac8f..000000000000 --- a/arch/x86/mm/Makefile_64 +++ /dev/null @@ -1,9 +0,0 @@ -# -# Makefile for the linux x86_64-specific parts of the memory manager. -# - -obj-y := init_64.o fault.o ioremap.o extable.o pageattr.o mmap.o -obj-$(CONFIG_HUGETLB_PAGE) += hugetlbpage.o -obj-$(CONFIG_NUMA) += numa_64.o -obj-$(CONFIG_K8_NUMA) += k8topology_64.o -obj-$(CONFIG_ACPI_NUMA) += srat_64.o diff --git a/arch/x86/mm/discontig_32.c b/arch/x86/mm/discontig_32.c index 8e25e06ff730..18378850e25a 100644 --- a/arch/x86/mm/discontig_32.c +++ b/arch/x86/mm/discontig_32.c @@ -37,7 +37,7 @@ #include <asm/e820.h> #include <asm/setup.h> #include <asm/mmzone.h> -#include <bios_ebda.h> +#include <asm/bios_ebda.h> struct pglist_data *node_data[MAX_NUMNODES] __read_mostly; EXPORT_SYMBOL(node_data); @@ -120,7 +120,7 @@ int __init get_memcfg_numa_flat(void) printk("NUMA - single node, flat memory mode\n"); /* Run the memory configuration and find the top of memory. */ - find_max_pfn(); + propagate_e820_map(); node_start_pfn[0] = 0; node_end_pfn[0] = max_pfn; memory_present(0, 0, max_pfn); @@ -134,7 +134,7 @@ int __init get_memcfg_numa_flat(void) /* * Find the highest page frame number we have available for the node */ -static void __init find_max_pfn_node(int nid) +static void __init propagate_e820_map_node(int nid) { if (node_end_pfn[nid] > max_pfn) node_end_pfn[nid] = max_pfn; @@ -379,7 +379,7 @@ unsigned long __init setup_memory(void) printk("High memory starts at vaddr %08lx\n", (ulong) pfn_to_kaddr(highstart_pfn)); for_each_online_node(nid) - find_max_pfn_node(nid); + propagate_e820_map_node(nid); memset(NODE_DATA(0), 0, sizeof(struct pglist_data)); NODE_DATA(0)->bdata = &node0_bdata; diff --git a/arch/x86/mm/dump_pagetables.c b/arch/x86/mm/dump_pagetables.c new file mode 100644 index 000000000000..2c24bea92c66 --- /dev/null +++ b/arch/x86/mm/dump_pagetables.c @@ -0,0 +1,354 @@ +/* + * Debug helper to dump the current kernel pagetables of the system + * so that we can see what the various memory ranges are set to. + * + * (C) Copyright 2008 Intel Corporation + * + * Author: Arjan van de Ven <arjan@linux.intel.com> + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; version 2 + * of the License. + */ + +#include <linux/debugfs.h> +#include <linux/mm.h> +#include <linux/module.h> +#include <linux/seq_file.h> + +#include <asm/pgtable.h> + +/* + * The dumper groups pagetable entries of the same type into one, and for + * that it needs to keep some state when walking, and flush this state + * when a "break" in the continuity is found. + */ +struct pg_state { + int level; + pgprot_t current_prot; + unsigned long start_address; + unsigned long current_address; + const struct addr_marker *marker; +}; + +struct addr_marker { + unsigned long start_address; + const char *name; +}; + +/* Address space markers hints */ +static struct addr_marker address_markers[] = { + { 0, "User Space" }, +#ifdef CONFIG_X86_64 + { 0x8000000000000000UL, "Kernel Space" }, + { 0xffff810000000000UL, "Low Kernel Mapping" }, + { VMALLOC_START, "vmalloc() Area" }, + { VMEMMAP_START, "Vmemmap" }, + { __START_KERNEL_map, "High Kernel Mapping" }, + { MODULES_VADDR, "Modules" }, + { MODULES_END, "End Modules" }, +#else + { PAGE_OFFSET, "Kernel Mapping" }, + { 0/* VMALLOC_START */, "vmalloc() Area" }, + { 0/*VMALLOC_END*/, "vmalloc() End" }, +# ifdef CONFIG_HIGHMEM + { 0/*PKMAP_BASE*/, "Persisent kmap() Area" }, +# endif + { 0/*FIXADDR_START*/, "Fixmap Area" }, +#endif + { -1, NULL } /* End of list */ +}; + +/* Multipliers for offsets within the PTEs */ +#define PTE_LEVEL_MULT (PAGE_SIZE) +#define PMD_LEVEL_MULT (PTRS_PER_PTE * PTE_LEVEL_MULT) +#define PUD_LEVEL_MULT (PTRS_PER_PMD * PMD_LEVEL_MULT) +#define PGD_LEVEL_MULT (PTRS_PER_PUD * PUD_LEVEL_MULT) + +/* + * Print a readable form of a pgprot_t to the seq_file + */ +static void printk_prot(struct seq_file *m, pgprot_t prot, int level) +{ + pgprotval_t pr = pgprot_val(prot); + static const char * const level_name[] = + { "cr3", "pgd", "pud", "pmd", "pte" }; + + if (!pgprot_val(prot)) { + /* Not present */ + seq_printf(m, " "); + } else { + if (pr & _PAGE_USER) + seq_printf(m, "USR "); + else + seq_printf(m, " "); + if (pr & _PAGE_RW) + seq_printf(m, "RW "); + else + seq_printf(m, "ro "); + if (pr & _PAGE_PWT) + seq_printf(m, "PWT "); + else + seq_printf(m, " "); + if (pr & _PAGE_PCD) + seq_printf(m, "PCD "); + else + seq_printf(m, " "); + + /* Bit 9 has a different meaning on level 3 vs 4 */ + if (level <= 3) { + if (pr & _PAGE_PSE) + seq_printf(m, "PSE "); + else + seq_printf(m, " "); + } else { + if (pr & _PAGE_PAT) + seq_printf(m, "pat "); + else + seq_printf(m, " "); + } + if (pr & _PAGE_GLOBAL) + seq_printf(m, "GLB "); + else + seq_printf(m, " "); + if (pr & _PAGE_NX) + seq_printf(m, "NX "); + else + seq_printf(m, "x "); + } + seq_printf(m, "%s\n", level_name[level]); +} + +/* + * On 64 bits, sign-extend the 48 bit address to 64 bit + */ +static unsigned long normalize_addr(unsigned long u) +{ +#ifdef CONFIG_X86_64 + return (signed long)(u << 16) >> 16; +#else + return u; +#endif +} + +/* + * This function gets called on a break in a continuous series + * of PTE entries; the next one is different so we need to + * print what we collected so far. + */ +static void note_page(struct seq_file *m, struct pg_state *st, + pgprot_t new_prot, int level) +{ + pgprotval_t prot, cur; + static const char units[] = "KMGTPE"; + + /* + * If we have a "break" in the series, we need to flush the state that + * we have now. "break" is either changing perms, levels or + * address space marker. + */ + prot = pgprot_val(new_prot) & ~(PTE_MASK); + cur = pgprot_val(st->current_prot) & ~(PTE_MASK); + + if (!st->level) { + /* First entry */ + st->current_prot = new_prot; + st->level = level; + st->marker = address_markers; + seq_printf(m, "---[ %s ]---\n", st->marker->name); + } else if (prot != cur || level != st->level || + st->current_address >= st->marker[1].start_address) { + const char *unit = units; + unsigned long delta; + + /* + * Now print the actual finished series + */ + seq_printf(m, "0x%p-0x%p ", + (void *)st->start_address, + (void *)st->current_address); + + delta = (st->current_address - st->start_address) >> 10; + while (!(delta & 1023) && unit[1]) { + delta >>= 10; + unit++; + } + seq_printf(m, "%9lu%c ", delta, *unit); + printk_prot(m, st->current_prot, st->level); + + /* + * We print markers for special areas of address space, + * such as the start of vmalloc space etc. + * This helps in the interpretation. + */ + if (st->current_address >= st->marker[1].start_address) { + st->marker++; + seq_printf(m, "---[ %s ]---\n", st->marker->name); + } + + st->start_address = st->current_address; + st->current_prot = new_prot; + st->level = level; + } +} + +static void walk_pte_level(struct seq_file *m, struct pg_state *st, pmd_t addr, + unsigned long P) +{ + int i; + pte_t *start; + + start = (pte_t *) pmd_page_vaddr(addr); + for (i = 0; i < PTRS_PER_PTE; i++) { + pgprot_t prot = pte_pgprot(*start); + + st->current_address = normalize_addr(P + i * PTE_LEVEL_MULT); + note_page(m, st, prot, 4); + start++; + } +} + +#if PTRS_PER_PMD > 1 + +static void walk_pmd_level(struct seq_file *m, struct pg_state *st, pud_t addr, + unsigned long P) +{ + int i; + pmd_t *start; + + start = (pmd_t *) pud_page_vaddr(addr); + for (i = 0; i < PTRS_PER_PMD; i++) { + st->current_address = normalize_addr(P + i * PMD_LEVEL_MULT); + if (!pmd_none(*start)) { + pgprotval_t prot = pmd_val(*start) & ~PTE_MASK; + + if (pmd_large(*start) || !pmd_present(*start)) + note_page(m, st, __pgprot(prot), 3); + else + walk_pte_level(m, st, *start, + P + i * PMD_LEVEL_MULT); + } else + note_page(m, st, __pgprot(0), 3); + start++; + } +} + +#else +#define walk_pmd_level(m,s,a,p) walk_pte_level(m,s,__pmd(pud_val(a)),p) +#define pud_large(a) pmd_large(__pmd(pud_val(a))) +#define pud_none(a) pmd_none(__pmd(pud_val(a))) +#endif + +#if PTRS_PER_PUD > 1 + +static void walk_pud_level(struct seq_file *m, struct pg_state *st, pgd_t addr, + unsigned long P) +{ + int i; + pud_t *start; + + start = (pud_t *) pgd_page_vaddr(addr); + + for (i = 0; i < PTRS_PER_PUD; i++) { + st->current_address = normalize_addr(P + i * PUD_LEVEL_MULT); + if (!pud_none(*start)) { + pgprotval_t prot = pud_val(*start) & ~PTE_MASK; + + if (pud_large(*start) || !pud_present(*start)) + note_page(m, st, __pgprot(prot), 2); + else + walk_pmd_level(m, st, *start, + P + i * PUD_LEVEL_MULT); + } else + note_page(m, st, __pgprot(0), 2); + + start++; + } +} + +#else +#define walk_pud_level(m,s,a,p) walk_pmd_level(m,s,__pud(pgd_val(a)),p) +#define pgd_large(a) pud_large(__pud(pgd_val(a))) +#define pgd_none(a) pud_none(__pud(pgd_val(a))) +#endif + +static void walk_pgd_level(struct seq_file *m) +{ +#ifdef CONFIG_X86_64 + pgd_t *start = (pgd_t *) &init_level4_pgt; +#else + pgd_t *start = swapper_pg_dir; +#endif + int i; + struct pg_state st; + + memset(&st, 0, sizeof(st)); + + for (i = 0; i < PTRS_PER_PGD; i++) { + st.current_address = normalize_addr(i * PGD_LEVEL_MULT); + if (!pgd_none(*start)) { + pgprotval_t prot = pgd_val(*start) & ~PTE_MASK; + + if (pgd_large(*start) || !pgd_present(*start)) + note_page(m, &st, __pgprot(prot), 1); + else + walk_pud_level(m, &st, *start, + i * PGD_LEVEL_MULT); + } else + note_page(m, &st, __pgprot(0), 1); + + start++; + } + + /* Flush out the last page */ + st.current_address = normalize_addr(PTRS_PER_PGD*PGD_LEVEL_MULT); + note_page(m, &st, __pgprot(0), 0); +} + +static int ptdump_show(struct seq_file *m, void *v) +{ + walk_pgd_level(m); + return 0; +} + +static int ptdump_open(struct inode *inode, struct file *filp) +{ + return single_open(filp, ptdump_show, NULL); +} + +static const struct file_operations ptdump_fops = { + .open = ptdump_open, + .read = seq_read, + .llseek = seq_lseek, + .release = single_release, +}; + +static int pt_dump_init(void) +{ + struct dentry *pe; + +#ifdef CONFIG_X86_32 + /* Not a compile-time constant on x86-32 */ + address_markers[2].start_address = VMALLOC_START; + address_markers[3].start_address = VMALLOC_END; +# ifdef CONFIG_HIGHMEM + address_markers[4].start_address = PKMAP_BASE; + address_markers[5].start_address = FIXADDR_START; +# else + address_markers[4].start_address = FIXADDR_START; +# endif +#endif + + pe = debugfs_create_file("kernel_page_tables", 0600, NULL, NULL, + &ptdump_fops); + if (!pe) + return -ENOMEM; + + return 0; +} + +__initcall(pt_dump_init); +MODULE_LICENSE("GPL"); +MODULE_AUTHOR("Arjan van de Ven <arjan@linux.intel.com>"); +MODULE_DESCRIPTION("Kernel debugging helper that dumps pagetables"); diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c index ec08d8389850..fd7e1798c75a 100644 --- a/arch/x86/mm/fault.c +++ b/arch/x86/mm/fault.c @@ -639,7 +639,7 @@ void __kprobes do_page_fault(struct pt_regs *regs, unsigned long error_code) #ifdef CONFIG_X86_32 /* It's safe to allow irq's after cr2 has been saved and the vmalloc fault has been handled. */ - if (regs->flags & (X86_EFLAGS_IF|VM_MASK)) + if (regs->flags & (X86_EFLAGS_IF | X86_VM_MASK)) local_irq_enable(); /* @@ -976,9 +976,5 @@ void vmalloc_sync_all(void) if (address == start) start = address + PGDIR_SIZE; } - /* Check that there is no need to do the same for the modules area. */ - BUILD_BUG_ON(!(MODULES_VADDR > __START_KERNEL)); - BUILD_BUG_ON(!(((MODULES_END - 1) & PGDIR_MASK) == - (__START_KERNEL & PGDIR_MASK))); #endif } diff --git a/arch/x86/mm/init_32.c b/arch/x86/mm/init_32.c index ee1091a46964..de236e419cb5 100644 --- a/arch/x86/mm/init_32.c +++ b/arch/x86/mm/init_32.c @@ -1,5 +1,4 @@ /* - * linux/arch/i386/mm/init.c * * Copyright (C) 1995 Linus Torvalds * @@ -51,6 +50,8 @@ unsigned int __VMALLOC_RESERVE = 128 << 20; +unsigned long max_pfn_mapped; + DEFINE_PER_CPU(struct mmu_gather, mmu_gathers); unsigned long highstart_pfn, highend_pfn; @@ -70,7 +71,7 @@ static pmd_t * __init one_md_table_init(pgd_t *pgd) if (!(pgd_val(*pgd) & _PAGE_PRESENT)) { pmd_table = (pmd_t *) alloc_bootmem_low_pages(PAGE_SIZE); - paravirt_alloc_pd(&init_mm, __pa(pmd_table) >> PAGE_SHIFT); + paravirt_alloc_pmd(&init_mm, __pa(pmd_table) >> PAGE_SHIFT); set_pgd(pgd, __pgd(__pa(pmd_table) | _PAGE_PRESENT)); pud = pud_offset(pgd, 0); BUG_ON(pmd_table != pmd_offset(pud, 0)); @@ -99,7 +100,7 @@ static pte_t * __init one_page_table_init(pmd_t *pmd) (pte_t *)alloc_bootmem_low_pages(PAGE_SIZE); } - paravirt_alloc_pt(&init_mm, __pa(page_table) >> PAGE_SHIFT); + paravirt_alloc_pte(&init_mm, __pa(page_table) >> PAGE_SHIFT); set_pmd(pmd, __pmd(__pa(page_table) | _PAGE_TABLE)); BUG_ON(page_table != pte_offset_kernel(pmd, 0)); } @@ -179,8 +180,13 @@ static void __init kernel_physical_mapping_init(pgd_t *pgd_base) /* * Map with big pages if possible, otherwise * create normal page tables: + * + * Don't use a large page for the first 2/4MB of memory + * because there are often fixed size MTRRs in there + * and overlapping MTRRs into large pages can cause + * slowdowns. */ - if (cpu_has_pse) { + if (cpu_has_pse && !(pgd_idx == 0 && pmd_idx == 0)) { unsigned int addr2; pgprot_t prot = PAGE_KERNEL_LARGE; @@ -194,6 +200,7 @@ static void __init kernel_physical_mapping_init(pgd_t *pgd_base) set_pmd(pmd, pfn_pmd(pfn, prot)); pfn += PTRS_PER_PTE; + max_pfn_mapped = pfn; continue; } pte = one_page_table_init(pmd); @@ -208,6 +215,7 @@ static void __init kernel_physical_mapping_init(pgd_t *pgd_base) set_pte(pte, pfn_pte(pfn, prot)); } + max_pfn_mapped = pfn; } } } @@ -219,6 +227,25 @@ static inline int page_kills_ppro(unsigned long pagenr) return 0; } +/* + * devmem_is_allowed() checks to see if /dev/mem access to a certain address + * is valid. The argument is a physical page number. + * + * + * On x86, access has to be given to the first megabyte of ram because that area + * contains bios code and data regions used by X and dosemu and similar apps. + * Access has to be given to non-kernel-ram areas as well, these contain the PCI + * mmio resources as well as potential bios/acpi data regions. + */ +int devmem_is_allowed(unsigned long pagenr) +{ + if (pagenr <= 256) + return 1; + if (!page_is_ram(pagenr)) + return 1; + return 0; +} + #ifdef CONFIG_HIGHMEM pte_t *kmap_pte; pgprot_t kmap_prot; @@ -260,47 +287,17 @@ static void __init permanent_kmaps_init(pgd_t *pgd_base) pkmap_page_table = pte; } -static void __meminit free_new_highpage(struct page *page) -{ - init_page_count(page); - __free_page(page); - totalhigh_pages++; -} - void __init add_one_highpage_init(struct page *page, int pfn, int bad_ppro) { if (page_is_ram(pfn) && !(bad_ppro && page_kills_ppro(pfn))) { ClearPageReserved(page); - free_new_highpage(page); + init_page_count(page); + __free_page(page); + totalhigh_pages++; } else SetPageReserved(page); } -static int __meminit -add_one_highpage_hotplug(struct page *page, unsigned long pfn) -{ - free_new_highpage(page); - totalram_pages++; -#ifdef CONFIG_FLATMEM - max_mapnr = max(pfn, max_mapnr); -#endif - num_physpages++; - - return 0; -} - -/* - * Not currently handling the NUMA case. - * Assuming single node and all memory that - * has been added dynamically that would be - * onlined here is in HIGHMEM. - */ -void __meminit online_page(struct page *page) -{ - ClearPageReserved(page); - add_one_highpage_hotplug(page, page_to_pfn(page)); -} - #ifndef CONFIG_NUMA static void __init set_highmem_pages_init(int bad_ppro) { @@ -357,7 +354,7 @@ void __init native_pagetable_setup_start(pgd_t *base) pte_clear(NULL, va, pte); } - paravirt_alloc_pd(&init_mm, __pa(base) >> PAGE_SHIFT); + paravirt_alloc_pmd(&init_mm, __pa(base) >> PAGE_SHIFT); } void __init native_pagetable_setup_done(pgd_t *base) @@ -449,7 +446,7 @@ void zap_low_mappings(void) * Note that "pgd_clear()" doesn't do it for * us, because pgd_clear() is a no-op on i386. */ - for (i = 0; i < USER_PTRS_PER_PGD; i++) { + for (i = 0; i < KERNEL_PGD_BOUNDARY; i++) { #ifdef CONFIG_X86_PAE set_pgd(swapper_pg_dir+i, __pgd(1 + __pa(empty_zero_page))); #else @@ -539,9 +536,9 @@ void __init paging_init(void) /* * Test if the WP bit works in supervisor mode. It isn't supported on 386's - * and also on some strange 486's (NexGen etc.). All 586+'s are OK. This - * used to involve black magic jumps to work around some nasty CPU bugs, - * but fortunately the switch to using exceptions got rid of all that. + * and also on some strange 486's. All 586+'s are OK. This used to involve + * black magic jumps to work around some nasty CPU bugs, but fortunately the + * switch to using exceptions got rid of all that. */ static void __init test_wp_bit(void) { @@ -723,25 +720,17 @@ void mark_rodata_ro(void) unsigned long start = PFN_ALIGN(_text); unsigned long size = PFN_ALIGN(_etext) - start; -#ifndef CONFIG_KPROBES -#ifdef CONFIG_HOTPLUG_CPU - /* It must still be possible to apply SMP alternatives. */ - if (num_possible_cpus() <= 1) -#endif - { - set_pages_ro(virt_to_page(start), size >> PAGE_SHIFT); - printk(KERN_INFO "Write protecting the kernel text: %luk\n", - size >> 10); + set_pages_ro(virt_to_page(start), size >> PAGE_SHIFT); + printk(KERN_INFO "Write protecting the kernel text: %luk\n", + size >> 10); #ifdef CONFIG_CPA_DEBUG - printk(KERN_INFO "Testing CPA: Reverting %lx-%lx\n", - start, start+size); - set_pages_rw(virt_to_page(start), size>>PAGE_SHIFT); + printk(KERN_INFO "Testing CPA: Reverting %lx-%lx\n", + start, start+size); + set_pages_rw(virt_to_page(start), size>>PAGE_SHIFT); - printk(KERN_INFO "Testing CPA: write protecting again\n"); - set_pages_ro(virt_to_page(start), size>>PAGE_SHIFT); -#endif - } + printk(KERN_INFO "Testing CPA: write protecting again\n"); + set_pages_ro(virt_to_page(start), size>>PAGE_SHIFT); #endif start += size; size = (unsigned long)__end_rodata - start; diff --git a/arch/x86/mm/init_64.c b/arch/x86/mm/init_64.c index a02a14f0f324..32ba13b0f818 100644 --- a/arch/x86/mm/init_64.c +++ b/arch/x86/mm/init_64.c @@ -47,13 +47,30 @@ #include <asm/numa.h> #include <asm/cacheflush.h> -const struct dma_mapping_ops *dma_ops; -EXPORT_SYMBOL(dma_ops); - static unsigned long dma_reserve __initdata; DEFINE_PER_CPU(struct mmu_gather, mmu_gathers); +int direct_gbpages __meminitdata +#ifdef CONFIG_DIRECT_GBPAGES + = 1 +#endif +; + +static int __init parse_direct_gbpages_off(char *arg) +{ + direct_gbpages = 0; + return 0; +} +early_param("nogbpages", parse_direct_gbpages_off); + +static int __init parse_direct_gbpages_on(char *arg) +{ + direct_gbpages = 1; + return 0; +} +early_param("gbpages", parse_direct_gbpages_on); + /* * NOTE: pagetable_init alloc all the fixmap pagetables contiguous on the * physical space so we can cache the place of the first one and move @@ -69,9 +86,6 @@ void show_mem(void) printk(KERN_INFO "Mem-info:\n"); show_free_areas(); - printk(KERN_INFO "Free swap: %6ldkB\n", - nr_swap_pages << (PAGE_SHIFT-10)); - for_each_online_pgdat(pgdat) { for (i = 0; i < pgdat->node_spanned_pages; ++i) { /* @@ -121,7 +135,7 @@ static __init void *spp_getpage(void) return ptr; } -static __init void +static void set_pte_phys(unsigned long vaddr, unsigned long phys, pgprot_t prot) { pgd_t *pgd; @@ -159,7 +173,7 @@ set_pte_phys(unsigned long vaddr, unsigned long phys, pgprot_t prot) new_pte = pfn_pte(phys >> PAGE_SHIFT, prot); pte = pte_offset_kernel(pmd, vaddr); - if (!pte_none(*pte) && + if (!pte_none(*pte) && pte_val(new_pte) && pte_val(*pte) != (pte_val(new_pte) & __supported_pte_mask)) pte_ERROR(*pte); set_pte(pte, new_pte); @@ -200,8 +214,7 @@ void __init cleanup_highmap(void) } /* NOTE: this is meant to be run only at boot */ -void __init -__set_fixmap(enum fixed_addresses idx, unsigned long phys, pgprot_t prot) +void __set_fixmap(enum fixed_addresses idx, unsigned long phys, pgprot_t prot) { unsigned long address = __fix_to_virt(idx); @@ -296,7 +309,7 @@ __meminit void early_iounmap(void *addr, unsigned long size) __flush_tlb_all(); } -static void __meminit +static unsigned long __meminit phys_pmd_init(pmd_t *pmd_page, unsigned long address, unsigned long end) { int i = pmd_index(address); @@ -318,21 +331,26 @@ phys_pmd_init(pmd_t *pmd_page, unsigned long address, unsigned long end) set_pte((pte_t *)pmd, pfn_pte(address >> PAGE_SHIFT, PAGE_KERNEL_LARGE)); } + return address; } -static void __meminit +static unsigned long __meminit phys_pmd_update(pud_t *pud, unsigned long address, unsigned long end) { pmd_t *pmd = pmd_offset(pud, 0); + unsigned long last_map_addr; + spin_lock(&init_mm.page_table_lock); - phys_pmd_init(pmd, address, end); + last_map_addr = phys_pmd_init(pmd, address, end); spin_unlock(&init_mm.page_table_lock); __flush_tlb_all(); + return last_map_addr; } -static void __meminit +static unsigned long __meminit phys_pud_init(pud_t *pud_page, unsigned long addr, unsigned long end) { + unsigned long last_map_addr = end; int i = pud_index(addr); for (; i < PTRS_PER_PUD; i++, addr = (addr & PUD_MASK) + PUD_SIZE) { @@ -350,7 +368,15 @@ phys_pud_init(pud_t *pud_page, unsigned long addr, unsigned long end) } if (pud_val(*pud)) { - phys_pmd_update(pud, addr, end); + if (!pud_large(*pud)) + last_map_addr = phys_pmd_update(pud, addr, end); + continue; + } + + if (direct_gbpages) { + set_pte((pte_t *)pud, + pfn_pte(addr >> PAGE_SHIFT, PAGE_KERNEL_LARGE)); + last_map_addr = (addr & PUD_MASK) + PUD_SIZE; continue; } @@ -358,12 +384,14 @@ phys_pud_init(pud_t *pud_page, unsigned long addr, unsigned long end) spin_lock(&init_mm.page_table_lock); set_pud(pud, __pud(pmd_phys | _KERNPG_TABLE)); - phys_pmd_init(pmd, addr, end); + last_map_addr = phys_pmd_init(pmd, addr, end); spin_unlock(&init_mm.page_table_lock); unmap_low_page(pmd); } __flush_tlb_all(); + + return last_map_addr >> PAGE_SHIFT; } static void __init find_early_table_space(unsigned long end) @@ -371,9 +399,11 @@ static void __init find_early_table_space(unsigned long end) unsigned long puds, pmds, tables, start; puds = (end + PUD_SIZE - 1) >> PUD_SHIFT; - pmds = (end + PMD_SIZE - 1) >> PMD_SHIFT; - tables = round_up(puds * sizeof(pud_t), PAGE_SIZE) + - round_up(pmds * sizeof(pmd_t), PAGE_SIZE); + tables = round_up(puds * sizeof(pud_t), PAGE_SIZE); + if (!direct_gbpages) { + pmds = (end + PMD_SIZE - 1) >> PMD_SHIFT; + tables += round_up(pmds * sizeof(pmd_t), PAGE_SIZE); + } /* * RED-PEN putting page tables only on node 0 could @@ -393,16 +423,135 @@ static void __init find_early_table_space(unsigned long end) (table_start << PAGE_SHIFT) + tables); } +static void __init init_gbpages(void) +{ + if (direct_gbpages && cpu_has_gbpages) + printk(KERN_INFO "Using GB pages for direct mapping\n"); + else + direct_gbpages = 0; +} + +#ifdef CONFIG_MEMTEST_BOOTPARAM + +static void __init memtest(unsigned long start_phys, unsigned long size, + unsigned pattern) +{ + unsigned long i; + unsigned long *start; + unsigned long start_bad; + unsigned long last_bad; + unsigned long val; + unsigned long start_phys_aligned; + unsigned long count; + unsigned long incr; + + switch (pattern) { + case 0: + val = 0UL; + break; + case 1: + val = -1UL; + break; + case 2: + val = 0x5555555555555555UL; + break; + case 3: + val = 0xaaaaaaaaaaaaaaaaUL; + break; + default: + return; + } + + incr = sizeof(unsigned long); + start_phys_aligned = ALIGN(start_phys, incr); + count = (size - (start_phys_aligned - start_phys))/incr; + start = __va(start_phys_aligned); + start_bad = 0; + last_bad = 0; + + for (i = 0; i < count; i++) + start[i] = val; + for (i = 0; i < count; i++, start++, start_phys_aligned += incr) { + if (*start != val) { + if (start_phys_aligned == last_bad + incr) { + last_bad += incr; + } else { + if (start_bad) { + printk(KERN_CONT "\n %016lx bad mem addr %016lx - %016lx reserved", + val, start_bad, last_bad + incr); + reserve_early(start_bad, last_bad - start_bad, "BAD RAM"); + } + start_bad = last_bad = start_phys_aligned; + } + } + } + if (start_bad) { + printk(KERN_CONT "\n %016lx bad mem addr %016lx - %016lx reserved", + val, start_bad, last_bad + incr); + reserve_early(start_bad, last_bad - start_bad, "BAD RAM"); + } + +} + +static int memtest_pattern __initdata = CONFIG_MEMTEST_BOOTPARAM_VALUE; + +static int __init parse_memtest(char *arg) +{ + if (arg) + memtest_pattern = simple_strtoul(arg, NULL, 0); + return 0; +} + +early_param("memtest", parse_memtest); + +static void __init early_memtest(unsigned long start, unsigned long end) +{ + unsigned long t_start, t_size; + unsigned pattern; + + if (!memtest_pattern) + return; + + printk(KERN_INFO "early_memtest: pattern num %d", memtest_pattern); + for (pattern = 0; pattern < memtest_pattern; pattern++) { + t_start = start; + t_size = 0; + while (t_start < end) { + t_start = find_e820_area_size(t_start, &t_size, 1); + + /* done ? */ + if (t_start >= end) + break; + if (t_start + t_size > end) + t_size = end - t_start; + + printk(KERN_CONT "\n %016lx - %016lx pattern %d", + t_start, t_start + t_size, pattern); + + memtest(t_start, t_size, pattern); + + t_start += t_size; + } + } + printk(KERN_CONT "\n"); +} +#else +static void __init early_memtest(unsigned long start, unsigned long end) +{ +} +#endif + /* * Setup the direct mapping of the physical memory at PAGE_OFFSET. * This runs before bootmem is initialized and gets pages directly from * the physical memory. To access them they are temporarily mapped. */ -void __init_refok init_memory_mapping(unsigned long start, unsigned long end) +unsigned long __init_refok init_memory_mapping(unsigned long start, unsigned long end) { - unsigned long next; + unsigned long next, last_map_addr = end; + unsigned long start_phys = start, end_phys = end; - pr_debug("init_memory_mapping\n"); + printk(KERN_INFO "init_memory_mapping\n"); /* * Find space for the kernel direct mapping tables. @@ -411,8 +560,10 @@ void __init_refok init_memory_mapping(unsigned long start, unsigned long end) * memory mapped. Unfortunately this is done currently before the * nodes are discovered. */ - if (!after_bootmem) + if (!after_bootmem) { + init_gbpages(); find_early_table_space(end); + } start = (unsigned long)__va(start); end = (unsigned long)__va(end); @@ -430,7 +581,7 @@ void __init_refok init_memory_mapping(unsigned long start, unsigned long end) next = start + PGDIR_SIZE; if (next > end) next = end; - phys_pud_init(pud, __pa(start), __pa(next)); + last_map_addr = phys_pud_init(pud, __pa(start), __pa(next)); if (!after_bootmem) set_pgd(pgd_offset_k(start), mk_kernel_pgd(pud_phys)); unmap_low_page(pud); @@ -443,6 +594,11 @@ void __init_refok init_memory_mapping(unsigned long start, unsigned long end) if (!after_bootmem) reserve_early(table_start << PAGE_SHIFT, table_end << PAGE_SHIFT, "PGTABLE"); + + if (!after_bootmem) + early_memtest(start_phys, end_phys); + + return last_map_addr; } #ifndef CONFIG_NUMA @@ -464,15 +620,6 @@ void __init paging_init(void) /* * Memory hotplug specific functions */ -void online_page(struct page *page) -{ - ClearPageReserved(page); - init_page_count(page); - __free_page(page); - totalram_pages++; - num_physpages++; -} - #ifdef CONFIG_MEMORY_HOTPLUG /* * Memory is added always to NORMAL zone. This means you will never get @@ -482,11 +629,13 @@ int arch_add_memory(int nid, u64 start, u64 size) { struct pglist_data *pgdat = NODE_DATA(nid); struct zone *zone = pgdat->node_zones + ZONE_NORMAL; - unsigned long start_pfn = start >> PAGE_SHIFT; + unsigned long last_mapped_pfn, start_pfn = start >> PAGE_SHIFT; unsigned long nr_pages = size >> PAGE_SHIFT; int ret; - init_memory_mapping(start, start + size-1); + last_mapped_pfn = init_memory_mapping(start, start + size-1); + if (last_mapped_pfn > max_pfn_mapped) + max_pfn_mapped = last_mapped_pfn; ret = __add_pages(zone, start_pfn, nr_pages); WARN_ON(1); @@ -505,6 +654,26 @@ EXPORT_SYMBOL_GPL(memory_add_physaddr_to_nid); #endif /* CONFIG_MEMORY_HOTPLUG */ +/* + * devmem_is_allowed() checks to see if /dev/mem access to a certain address + * is valid. The argument is a physical page number. + * + * + * On x86, access has to be given to the first megabyte of ram because that area + * contains bios code and data regions used by X and dosemu and similar apps. + * Access has to be given to non-kernel-ram areas as well, these contain the PCI + * mmio resources as well as potential bios/acpi data regions. + */ +int devmem_is_allowed(unsigned long pagenr) +{ + if (pagenr <= 256) + return 1; + if (!page_is_ram(pagenr)) + return 1; + return 0; +} + + static struct kcore_list kcore_mem, kcore_vmalloc, kcore_kernel, kcore_modules, kcore_vsyscall; @@ -596,24 +765,7 @@ EXPORT_SYMBOL_GPL(rodata_test_data); void mark_rodata_ro(void) { - unsigned long start = (unsigned long)_stext, end; - -#ifdef CONFIG_HOTPLUG_CPU - /* It must still be possible to apply SMP alternatives. */ - if (num_possible_cpus() > 1) - start = (unsigned long)_etext; -#endif - -#ifdef CONFIG_KPROBES - start = (unsigned long)__start_rodata; -#endif - - end = (unsigned long)__end_rodata; - start = (start + PAGE_SIZE - 1) & PAGE_MASK; - end &= PAGE_MASK; - if (end <= start) - return; - + unsigned long start = PFN_ALIGN(_stext), end = PFN_ALIGN(__end_rodata); printk(KERN_INFO "Write protecting the kernel read-only data: %luk\n", (end - start) >> 10); @@ -636,6 +788,7 @@ void mark_rodata_ro(void) set_memory_ro(start, (end-start) >> PAGE_SHIFT); #endif } + #endif #ifdef CONFIG_BLK_DEV_INITRD @@ -648,7 +801,7 @@ void free_initrd_mem(unsigned long start, unsigned long end) void __init reserve_bootmem_generic(unsigned long phys, unsigned len) { #ifdef CONFIG_NUMA - int nid = phys_to_nid(phys); + int nid, next_nid; #endif unsigned long pfn = phys >> PAGE_SHIFT; @@ -657,7 +810,7 @@ void __init reserve_bootmem_generic(unsigned long phys, unsigned len) * This can happen with kdump kernels when accessing * firmware tables: */ - if (pfn < end_pfn_map) + if (pfn < max_pfn_mapped) return; printk(KERN_ERR "reserve_bootmem: illegal reserve %lx %u\n", @@ -667,10 +820,16 @@ void __init reserve_bootmem_generic(unsigned long phys, unsigned len) /* Should check here against the e820 map to avoid double free */ #ifdef CONFIG_NUMA - reserve_bootmem_node(NODE_DATA(nid), phys, len, BOOTMEM_DEFAULT); + nid = phys_to_nid(phys); + next_nid = phys_to_nid(phys + len - 1); + if (nid == next_nid) + reserve_bootmem_node(NODE_DATA(nid), phys, len, BOOTMEM_DEFAULT); + else + reserve_bootmem(phys, len, BOOTMEM_DEFAULT); #else reserve_bootmem(phys, len, BOOTMEM_DEFAULT); #endif + if (phys+len <= MAX_DMA_PFN*PAGE_SIZE) { dma_reserve += len / PAGE_SIZE; set_dma_reserve(dma_reserve); @@ -764,6 +923,10 @@ const char *arch_vma_name(struct vm_area_struct *vma) /* * Initialise the sparsemem vmemmap using huge-pages at the PMD level. */ +static long __meminitdata addr_start, addr_end; +static void __meminitdata *p_start, *p_end; +static int __meminitdata node_start; + int __meminit vmemmap_populate(struct page *start_page, unsigned long size, int node) { @@ -798,12 +961,32 @@ vmemmap_populate(struct page *start_page, unsigned long size, int node) PAGE_KERNEL_LARGE); set_pmd(pmd, __pmd(pte_val(entry))); - printk(KERN_DEBUG " [%lx-%lx] PMD ->%p on node %d\n", - addr, addr + PMD_SIZE - 1, p, node); + /* check to see if we have contiguous blocks */ + if (p_end != p || node_start != node) { + if (p_start) + printk(KERN_DEBUG " [%lx-%lx] PMD -> [%p-%p] on node %d\n", + addr_start, addr_end-1, p_start, p_end-1, node_start); + addr_start = addr; + node_start = node; + p_start = p; + } + addr_end = addr + PMD_SIZE; + p_end = p + PMD_SIZE; } else { vmemmap_verify((pte_t *)pmd, node, addr, next); } } return 0; } + +void __meminit vmemmap_populate_print_last(void) +{ + if (p_start) { + printk(KERN_DEBUG " [%lx-%lx] PMD -> [%p-%p] on node %d\n", + addr_start, addr_end-1, p_start, p_end-1, node_start); + p_start = NULL; + p_end = NULL; + node_start = 0; + } +} #endif diff --git a/arch/x86/mm/ioremap.c b/arch/x86/mm/ioremap.c index 794895c6dcc9..804de18abcc2 100644 --- a/arch/x86/mm/ioremap.c +++ b/arch/x86/mm/ioremap.c @@ -19,11 +19,7 @@ #include <asm/pgtable.h> #include <asm/tlbflush.h> #include <asm/pgalloc.h> - -enum ioremap_mode { - IOR_MODE_UNCACHED, - IOR_MODE_CACHED, -}; +#include <asm/pat.h> #ifdef CONFIG_X86_64 @@ -35,11 +31,23 @@ unsigned long __phys_addr(unsigned long x) } EXPORT_SYMBOL(__phys_addr); +static inline int phys_addr_valid(unsigned long addr) +{ + return addr < (1UL << boot_cpu_data.x86_phys_bits); +} + +#else + +static inline int phys_addr_valid(unsigned long addr) +{ + return 1; +} + #endif int page_is_ram(unsigned long pagenr) { - unsigned long addr, end; + resource_size_t addr, end; int i; /* @@ -78,19 +86,22 @@ int page_is_ram(unsigned long pagenr) * Fix up the linear direct mapping of the kernel to avoid cache attribute * conflicts. */ -static int ioremap_change_attr(unsigned long vaddr, unsigned long size, - enum ioremap_mode mode) +int ioremap_change_attr(unsigned long vaddr, unsigned long size, + unsigned long prot_val) { unsigned long nrpages = size >> PAGE_SHIFT; int err; - switch (mode) { - case IOR_MODE_UNCACHED: + switch (prot_val) { + case _PAGE_CACHE_UC: default: - err = set_memory_uc(vaddr, nrpages); + err = _set_memory_uc(vaddr, nrpages); + break; + case _PAGE_CACHE_WC: + err = _set_memory_wc(vaddr, nrpages); break; - case IOR_MODE_CACHED: - err = set_memory_wb(vaddr, nrpages); + case _PAGE_CACHE_WB: + err = _set_memory_wb(vaddr, nrpages); break; } @@ -106,18 +117,28 @@ static int ioremap_change_attr(unsigned long vaddr, unsigned long size, * have to convert them into an offset in a page-aligned mapping, but the * caller shouldn't need to know that small detail. */ -static void __iomem *__ioremap(resource_size_t phys_addr, unsigned long size, - enum ioremap_mode mode) +static void __iomem *__ioremap_caller(resource_size_t phys_addr, + unsigned long size, unsigned long prot_val, void *caller) { - unsigned long pfn, offset, last_addr, vaddr; + unsigned long pfn, offset, vaddr; + resource_size_t last_addr; struct vm_struct *area; + unsigned long new_prot_val; pgprot_t prot; + int retval; /* Don't allow wraparound or zero size */ last_addr = phys_addr + size - 1; if (!size || last_addr < phys_addr) return NULL; + if (!phys_addr_valid(phys_addr)) { + printk(KERN_WARNING "ioremap: invalid physical address %llx\n", + (unsigned long long)phys_addr); + WARN_ON_ONCE(1); + return NULL; + } + /* * Don't remap the low PCI/ISA area, it's always mapped.. */ @@ -127,25 +148,14 @@ static void __iomem *__ioremap(resource_size_t phys_addr, unsigned long size, /* * Don't allow anybody to remap normal RAM that we're using.. */ - for (pfn = phys_addr >> PAGE_SHIFT; pfn < max_pfn_mapped && - (pfn << PAGE_SHIFT) < last_addr; pfn++) { - if (page_is_ram(pfn) && pfn_valid(pfn) && - !PageReserved(pfn_to_page(pfn))) - return NULL; - } + for (pfn = phys_addr >> PAGE_SHIFT; + (pfn << PAGE_SHIFT) < last_addr; pfn++) { - switch (mode) { - case IOR_MODE_UNCACHED: - default: - /* - * FIXME: we will use UC MINUS for now, as video fb drivers - * depend on it. Upcoming ioremap_wc() will fix this behavior. - */ - prot = PAGE_KERNEL_UC_MINUS; - break; - case IOR_MODE_CACHED: - prot = PAGE_KERNEL; - break; + int is_ram = page_is_ram(pfn); + + if (is_ram && pfn_valid(pfn) && !PageReserved(pfn_to_page(pfn))) + return NULL; + WARN_ON_ONCE(is_ram); } /* @@ -155,20 +165,66 @@ static void __iomem *__ioremap(resource_size_t phys_addr, unsigned long size, phys_addr &= PAGE_MASK; size = PAGE_ALIGN(last_addr+1) - phys_addr; + retval = reserve_memtype(phys_addr, phys_addr + size, + prot_val, &new_prot_val); + if (retval) { + pr_debug("Warning: reserve_memtype returned %d\n", retval); + return NULL; + } + + if (prot_val != new_prot_val) { + /* + * Do not fallback to certain memory types with certain + * requested type: + * - request is uncached, return cannot be write-back + * - request is uncached, return cannot be write-combine + * - request is write-combine, return cannot be write-back + */ + if ((prot_val == _PAGE_CACHE_UC && + (new_prot_val == _PAGE_CACHE_WB || + new_prot_val == _PAGE_CACHE_WC)) || + (prot_val == _PAGE_CACHE_WC && + new_prot_val == _PAGE_CACHE_WB)) { + pr_debug( + "ioremap error for 0x%llx-0x%llx, requested 0x%lx, got 0x%lx\n", + (unsigned long long)phys_addr, + (unsigned long long)(phys_addr + size), + prot_val, new_prot_val); + free_memtype(phys_addr, phys_addr + size); + return NULL; + } + prot_val = new_prot_val; + } + + switch (prot_val) { + case _PAGE_CACHE_UC: + default: + prot = PAGE_KERNEL_NOCACHE; + break; + case _PAGE_CACHE_WC: + prot = PAGE_KERNEL_WC; + break; + case _PAGE_CACHE_WB: + prot = PAGE_KERNEL; + break; + } + /* * Ok, go for it.. */ - area = get_vm_area(size, VM_IOREMAP); + area = get_vm_area_caller(size, VM_IOREMAP, caller); if (!area) return NULL; area->phys_addr = phys_addr; vaddr = (unsigned long) area->addr; if (ioremap_page_range(vaddr, vaddr + size, phys_addr, prot)) { + free_memtype(phys_addr, phys_addr + size); free_vm_area(area); return NULL; } - if (ioremap_change_attr(vaddr, size, mode) < 0) { + if (ioremap_change_attr(vaddr, size, prot_val) < 0) { + free_memtype(phys_addr, phys_addr + size); vunmap(area->addr); return NULL; } @@ -199,13 +255,35 @@ static void __iomem *__ioremap(resource_size_t phys_addr, unsigned long size, */ void __iomem *ioremap_nocache(resource_size_t phys_addr, unsigned long size) { - return __ioremap(phys_addr, size, IOR_MODE_UNCACHED); + return __ioremap_caller(phys_addr, size, _PAGE_CACHE_UC, + __builtin_return_address(0)); } EXPORT_SYMBOL(ioremap_nocache); +/** + * ioremap_wc - map memory into CPU space write combined + * @offset: bus address of the memory + * @size: size of the resource to map + * + * This version of ioremap ensures that the memory is marked write combining. + * Write combining allows faster writes to some hardware devices. + * + * Must be freed with iounmap. + */ +void __iomem *ioremap_wc(unsigned long phys_addr, unsigned long size) +{ + if (pat_wc_enabled) + return __ioremap_caller(phys_addr, size, _PAGE_CACHE_WC, + __builtin_return_address(0)); + else + return ioremap_nocache(phys_addr, size); +} +EXPORT_SYMBOL(ioremap_wc); + void __iomem *ioremap_cache(resource_size_t phys_addr, unsigned long size) { - return __ioremap(phys_addr, size, IOR_MODE_CACHED); + return __ioremap_caller(phys_addr, size, _PAGE_CACHE_WB, + __builtin_return_address(0)); } EXPORT_SYMBOL(ioremap_cache); @@ -252,6 +330,8 @@ void iounmap(volatile void __iomem *addr) return; } + free_memtype(p->phys_addr, p->phys_addr + get_vm_area_size(p)); + /* Finally remove it */ o = remove_vm_area((void *)addr); BUG_ON(p != o || o == NULL); @@ -259,6 +339,35 @@ void iounmap(volatile void __iomem *addr) } EXPORT_SYMBOL(iounmap); +/* + * Convert a physical pointer to a virtual kernel pointer for /dev/mem + * access + */ +void *xlate_dev_mem_ptr(unsigned long phys) +{ + void *addr; + unsigned long start = phys & PAGE_MASK; + + /* If page is RAM, we can use __va. Otherwise ioremap and unmap. */ + if (page_is_ram(start >> PAGE_SHIFT)) + return __va(phys); + + addr = (void *)ioremap(start, PAGE_SIZE); + if (addr) + addr = (void *)((unsigned long)addr | (phys & ~PAGE_MASK)); + + return addr; +} + +void unxlate_dev_mem_ptr(unsigned long phys, void *addr) +{ + if (page_is_ram(phys >> PAGE_SHIFT)) + return; + + iounmap((void __iomem *)((unsigned long)addr & PAGE_MASK)); + return; +} + #ifdef CONFIG_X86_32 int __initdata early_ioremap_debug; @@ -272,8 +381,8 @@ static int __init early_ioremap_debug_setup(char *str) early_param("early_ioremap_debug", early_ioremap_debug_setup); static __initdata int after_paging_init; -static __initdata pte_t bm_pte[PAGE_SIZE/sizeof(pte_t)] - __attribute__((aligned(PAGE_SIZE))); +static pte_t bm_pte[PAGE_SIZE/sizeof(pte_t)] + __section(.bss.page_aligned); static inline pmd_t * __init early_ioremap_pmd(unsigned long addr) { @@ -330,7 +439,7 @@ void __init early_ioremap_clear(void) pmd = early_ioremap_pmd(fix_to_virt(FIX_BTMAP_BEGIN)); pmd_clear(pmd); - paravirt_release_pt(__pa(bm_pte) >> PAGE_SHIFT); + paravirt_release_pte(__pa(bm_pte) >> PAGE_SHIFT); __flush_tlb_all(); } diff --git a/arch/x86/mm/k8topology_64.c b/arch/x86/mm/k8topology_64.c index 7a2ebce87df5..86808e666f9c 100644 --- a/arch/x86/mm/k8topology_64.c +++ b/arch/x86/mm/k8topology_64.c @@ -164,7 +164,7 @@ int __init k8_scan_nodes(unsigned long start, unsigned long end) if (!found) return -1; - memnode_shift = compute_hash_shift(nodes, 8); + memnode_shift = compute_hash_shift(nodes, 8, NULL); if (memnode_shift < 0) { printk(KERN_ERR "No NUMA node hash function found. Contact maintainer\n"); return -1; diff --git a/arch/x86/mm/numa_64.c b/arch/x86/mm/numa_64.c index 16b82ad34b96..c5066d519e5d 100644 --- a/arch/x86/mm/numa_64.c +++ b/arch/x86/mm/numa_64.c @@ -31,13 +31,15 @@ bootmem_data_t plat_node_bdata[MAX_NUMNODES]; struct memnode memnode; +#ifdef CONFIG_SMP int x86_cpu_to_node_map_init[NR_CPUS] = { [0 ... NR_CPUS-1] = NUMA_NO_NODE }; void *x86_cpu_to_node_map_early_ptr; +EXPORT_SYMBOL(x86_cpu_to_node_map_early_ptr); +#endif DEFINE_PER_CPU(int, x86_cpu_to_node_map) = NUMA_NO_NODE; EXPORT_PER_CPU_SYMBOL(x86_cpu_to_node_map); -EXPORT_SYMBOL(x86_cpu_to_node_map_early_ptr); s16 apicid_to_node[MAX_LOCAL_APIC] __cpuinitdata = { [0 ... MAX_LOCAL_APIC-1] = NUMA_NO_NODE @@ -58,7 +60,7 @@ unsigned long __initdata nodemap_size; * -1 if node overlap or lost ram (shift too big) */ static int __init populate_memnodemap(const struct bootnode *nodes, - int numnodes, int shift) + int numnodes, int shift, int *nodeids) { unsigned long addr, end; int i, res = -1; @@ -74,7 +76,12 @@ static int __init populate_memnodemap(const struct bootnode *nodes, do { if (memnodemap[addr >> shift] != NUMA_NO_NODE) return -1; - memnodemap[addr >> shift] = i; + + if (!nodeids) + memnodemap[addr >> shift] = i; + else + memnodemap[addr >> shift] = nodeids[i]; + addr += (1UL << shift); } while (addr < end); res = 1; @@ -137,7 +144,8 @@ static int __init extract_lsb_from_nodes(const struct bootnode *nodes, return i; } -int __init compute_hash_shift(struct bootnode *nodes, int numnodes) +int __init compute_hash_shift(struct bootnode *nodes, int numnodes, + int *nodeids) { int shift; @@ -147,7 +155,7 @@ int __init compute_hash_shift(struct bootnode *nodes, int numnodes) printk(KERN_DEBUG "NUMA: Using %d for the hash shift.\n", shift); - if (populate_memnodemap(nodes, numnodes, shift) != 1) { + if (populate_memnodemap(nodes, numnodes, shift, nodeids) != 1) { printk(KERN_INFO "Your memory is not aligned you need to " "rebuild your kernel with a bigger NODEMAPSIZE " "shift=%d\n", shift); @@ -188,6 +196,7 @@ void __init setup_node_bootmem(int nodeid, unsigned long start, unsigned long bootmap_start, nodedata_phys; void *bootmap; const int pgdat_size = round_up(sizeof(pg_data_t), PAGE_SIZE); + int nid; start = round_up(start, ZONE_ALIGN); @@ -210,9 +219,19 @@ void __init setup_node_bootmem(int nodeid, unsigned long start, NODE_DATA(nodeid)->node_start_pfn = start_pfn; NODE_DATA(nodeid)->node_spanned_pages = end_pfn - start_pfn; - /* Find a place for the bootmem map */ + /* + * Find a place for the bootmem map + * nodedata_phys could be on other nodes by alloc_bootmem, + * so need to sure bootmap_start not to be small, otherwise + * early_node_mem will get that with find_e820_area instead + * of alloc_bootmem, that could clash with reserved range + */ bootmap_pages = bootmem_bootmap_pages(end_pfn - start_pfn); - bootmap_start = round_up(nodedata_phys + pgdat_size, PAGE_SIZE); + nid = phys_to_nid(nodedata_phys); + if (nid == nodeid) + bootmap_start = round_up(nodedata_phys + pgdat_size, PAGE_SIZE); + else + bootmap_start = round_up(start, PAGE_SIZE); /* * SMP_CAHCE_BYTES could be enough, but init_bootmem_node like * to use that to align to PAGE_SIZE @@ -237,10 +256,29 @@ void __init setup_node_bootmem(int nodeid, unsigned long start, free_bootmem_with_active_regions(nodeid, end); - reserve_bootmem_node(NODE_DATA(nodeid), nodedata_phys, pgdat_size, - BOOTMEM_DEFAULT); - reserve_bootmem_node(NODE_DATA(nodeid), bootmap_start, - bootmap_pages<<PAGE_SHIFT, BOOTMEM_DEFAULT); + /* + * convert early reserve to bootmem reserve earlier + * otherwise early_node_mem could use early reserved mem + * on previous node + */ + early_res_to_bootmem(start, end); + + /* + * in some case early_node_mem could use alloc_bootmem + * to get range on other node, don't reserve that again + */ + if (nid != nodeid) + printk(KERN_INFO " NODE_DATA(%d) on node %d\n", nodeid, nid); + else + reserve_bootmem_node(NODE_DATA(nodeid), nodedata_phys, + pgdat_size, BOOTMEM_DEFAULT); + nid = phys_to_nid(bootmap_start); + if (nid != nodeid) + printk(KERN_INFO " bootmap(%d) on node %d\n", nodeid, nid); + else + reserve_bootmem_node(NODE_DATA(nodeid), bootmap_start, + bootmap_pages<<PAGE_SHIFT, BOOTMEM_DEFAULT); + #ifdef CONFIG_ACPI_NUMA srat_reserve_add_area(nodeid); #endif @@ -378,9 +416,10 @@ static int __init split_nodes_by_size(struct bootnode *nodes, u64 *addr, * Sets up the system RAM area from start_pfn to end_pfn according to the * numa=fake command-line option. */ +static struct bootnode nodes[MAX_NUMNODES] __initdata; + static int __init numa_emulation(unsigned long start_pfn, unsigned long end_pfn) { - struct bootnode nodes[MAX_NUMNODES]; u64 size, addr = start_pfn << PAGE_SHIFT; u64 max_addr = end_pfn << PAGE_SHIFT; int num_nodes = 0, num = 0, coeff_flag, coeff = -1, i; @@ -460,7 +499,7 @@ done: } } out: - memnode_shift = compute_hash_shift(nodes, num_nodes); + memnode_shift = compute_hash_shift(nodes, num_nodes, NULL); if (memnode_shift < 0) { memnode_shift = 0; printk(KERN_ERR "No NUMA hash function found. NUMA emulation " @@ -548,8 +587,6 @@ void __cpuinit numa_set_node(int cpu, int node) { int *cpu_to_node_map = x86_cpu_to_node_map_early_ptr; - cpu_pda(cpu)->nodenumber = node; - if(cpu_to_node_map) cpu_to_node_map[cpu] = node; else if(per_cpu_offset(cpu)) diff --git a/arch/x86/mm/pageattr.c b/arch/x86/mm/pageattr.c index 7b79f6be4e7d..bd5e05c654dc 100644 --- a/arch/x86/mm/pageattr.c +++ b/arch/x86/mm/pageattr.c @@ -9,6 +9,8 @@ #include <linux/slab.h> #include <linux/mm.h> #include <linux/interrupt.h> +#include <linux/seq_file.h> +#include <linux/debugfs.h> #include <asm/e820.h> #include <asm/processor.h> @@ -17,6 +19,7 @@ #include <asm/uaccess.h> #include <asm/pgalloc.h> #include <asm/proto.h> +#include <asm/pat.h> /* * The current flushing context - we pass it instead of 5 arguments: @@ -28,6 +31,7 @@ struct cpa_data { int numpages; int flushtlb; unsigned long pfn; + unsigned force_split : 1; }; #ifdef CONFIG_X86_64 @@ -259,6 +263,9 @@ try_preserve_large_page(pte_t *kpte, unsigned long address, int i, do_split = 1; unsigned int level; + if (cpa->force_split) + return 1; + spin_lock_irqsave(&pgd_lock, flags); /* * Check for races, another CPU might have split this page @@ -476,9 +483,7 @@ static int split_large_page(pte_t *kpte, unsigned long address) goto out_unlock; pbase = (pte_t *)page_address(base); -#ifdef CONFIG_X86_32 - paravirt_alloc_pt(&init_mm, page_to_pfn(base)); -#endif + paravirt_alloc_pte(&init_mm, page_to_pfn(base)); ref_prot = pte_pgprot(pte_clrhuge(*kpte)); #ifdef CONFIG_X86_64 @@ -535,7 +540,7 @@ static int __change_page_attr(struct cpa_data *cpa, int primary) repeat: kpte = lookup_address(address, &level); if (!kpte) - return primary ? -EINVAL : 0; + return 0; old_pte = *kpte; if (!pte_val(old_pte)) { @@ -693,7 +698,8 @@ static inline int cache_attr(pgprot_t attr) } static int change_page_attr_set_clr(unsigned long addr, int numpages, - pgprot_t mask_set, pgprot_t mask_clr) + pgprot_t mask_set, pgprot_t mask_clr, + int force_split) { struct cpa_data cpa; int ret, cache, checkalias; @@ -704,7 +710,7 @@ static int change_page_attr_set_clr(unsigned long addr, int numpages, */ mask_set = canon_pgprot(mask_set); mask_clr = canon_pgprot(mask_clr); - if (!pgprot_val(mask_set) && !pgprot_val(mask_clr)) + if (!pgprot_val(mask_set) && !pgprot_val(mask_clr) && !force_split) return 0; /* Ensure we are PAGE_SIZE aligned */ @@ -721,6 +727,7 @@ static int change_page_attr_set_clr(unsigned long addr, int numpages, cpa.mask_set = mask_set; cpa.mask_clr = mask_clr; cpa.flushtlb = 0; + cpa.force_split = force_split; /* No alias checking for _NX bit modifications */ checkalias = (pgprot_val(mask_set) | pgprot_val(mask_clr)) != _PAGE_NX; @@ -759,26 +766,61 @@ out: static inline int change_page_attr_set(unsigned long addr, int numpages, pgprot_t mask) { - return change_page_attr_set_clr(addr, numpages, mask, __pgprot(0)); + return change_page_attr_set_clr(addr, numpages, mask, __pgprot(0), 0); } static inline int change_page_attr_clear(unsigned long addr, int numpages, pgprot_t mask) { - return change_page_attr_set_clr(addr, numpages, __pgprot(0), mask); + return change_page_attr_set_clr(addr, numpages, __pgprot(0), mask, 0); } -int set_memory_uc(unsigned long addr, int numpages) +int _set_memory_uc(unsigned long addr, int numpages) { return change_page_attr_set(addr, numpages, - __pgprot(_PAGE_PCD)); + __pgprot(_PAGE_CACHE_UC)); +} + +int set_memory_uc(unsigned long addr, int numpages) +{ + if (reserve_memtype(addr, addr + numpages * PAGE_SIZE, + _PAGE_CACHE_UC, NULL)) + return -EINVAL; + + return _set_memory_uc(addr, numpages); } EXPORT_SYMBOL(set_memory_uc); -int set_memory_wb(unsigned long addr, int numpages) +int _set_memory_wc(unsigned long addr, int numpages) +{ + return change_page_attr_set(addr, numpages, + __pgprot(_PAGE_CACHE_WC)); +} + +int set_memory_wc(unsigned long addr, int numpages) +{ + if (!pat_wc_enabled) + return set_memory_uc(addr, numpages); + + if (reserve_memtype(addr, addr + numpages * PAGE_SIZE, + _PAGE_CACHE_WC, NULL)) + return -EINVAL; + + return _set_memory_wc(addr, numpages); +} +EXPORT_SYMBOL(set_memory_wc); + +int _set_memory_wb(unsigned long addr, int numpages) { return change_page_attr_clear(addr, numpages, - __pgprot(_PAGE_PCD | _PAGE_PWT)); + __pgprot(_PAGE_CACHE_MASK)); +} + +int set_memory_wb(unsigned long addr, int numpages) +{ + free_memtype(addr, addr + numpages * PAGE_SIZE); + + return _set_memory_wb(addr, numpages); } EXPORT_SYMBOL(set_memory_wb); @@ -809,6 +851,12 @@ int set_memory_np(unsigned long addr, int numpages) return change_page_attr_clear(addr, numpages, __pgprot(_PAGE_PRESENT)); } +int set_memory_4k(unsigned long addr, int numpages) +{ + return change_page_attr_set_clr(addr, numpages, __pgprot(0), + __pgprot(0), 1); +} + int set_pages_uc(struct page *page, int numpages) { unsigned long addr = (unsigned long)page_address(page); @@ -918,6 +966,45 @@ void kernel_map_pages(struct page *page, int numpages, int enable) cpa_fill_pool(NULL); } +#ifdef CONFIG_DEBUG_FS +static int dpa_show(struct seq_file *m, void *v) +{ + seq_puts(m, "DEBUG_PAGEALLOC\n"); + seq_printf(m, "pool_size : %lu\n", pool_size); + seq_printf(m, "pool_pages : %lu\n", pool_pages); + seq_printf(m, "pool_low : %lu\n", pool_low); + seq_printf(m, "pool_used : %lu\n", pool_used); + seq_printf(m, "pool_failed : %lu\n", pool_failed); + + return 0; +} + +static int dpa_open(struct inode *inode, struct file *filp) +{ + return single_open(filp, dpa_show, NULL); +} + +static const struct file_operations dpa_fops = { + .open = dpa_open, + .read = seq_read, + .llseek = seq_lseek, + .release = single_release, +}; + +static int __init debug_pagealloc_proc_init(void) +{ + struct dentry *de; + + de = debugfs_create_file("debug_pagealloc", 0600, NULL, NULL, + &dpa_fops); + if (!de) + return -ENOMEM; + + return 0; +} +__initcall(debug_pagealloc_proc_init); +#endif + #ifdef CONFIG_HIBERNATION bool kernel_page_present(struct page *page) diff --git a/arch/x86/mm/pat.c b/arch/x86/mm/pat.c new file mode 100644 index 000000000000..277446cd30b6 --- /dev/null +++ b/arch/x86/mm/pat.c @@ -0,0 +1,596 @@ +/* + * Handle caching attributes in page tables (PAT) + * + * Authors: Venkatesh Pallipadi <venkatesh.pallipadi@intel.com> + * Suresh B Siddha <suresh.b.siddha@intel.com> + * + * Loosely based on earlier PAT patchset from Eric Biederman and Andi Kleen. + */ + +#include <linux/mm.h> +#include <linux/kernel.h> +#include <linux/gfp.h> +#include <linux/fs.h> +#include <linux/bootmem.h> + +#include <asm/msr.h> +#include <asm/tlbflush.h> +#include <asm/processor.h> +#include <asm/page.h> +#include <asm/pgtable.h> +#include <asm/pat.h> +#include <asm/e820.h> +#include <asm/cacheflush.h> +#include <asm/fcntl.h> +#include <asm/mtrr.h> +#include <asm/io.h> + +int pat_wc_enabled = 1; + +static u64 __read_mostly boot_pat_state; + +static int nopat(char *str) +{ + pat_wc_enabled = 0; + printk(KERN_INFO "x86: PAT support disabled.\n"); + + return 0; +} +early_param("nopat", nopat); + +static int pat_known_cpu(void) +{ + if (!pat_wc_enabled) + return 0; + + if (cpu_has_pat) + return 1; + + pat_wc_enabled = 0; + printk(KERN_INFO "CPU and/or kernel does not support PAT.\n"); + return 0; +} + +enum { + PAT_UC = 0, /* uncached */ + PAT_WC = 1, /* Write combining */ + PAT_WT = 4, /* Write Through */ + PAT_WP = 5, /* Write Protected */ + PAT_WB = 6, /* Write Back (default) */ + PAT_UC_MINUS = 7, /* UC, but can be overriden by MTRR */ +}; + +#define PAT(x,y) ((u64)PAT_ ## y << ((x)*8)) + +void pat_init(void) +{ + u64 pat; + +#ifndef CONFIG_X86_PAT + nopat(NULL); +#endif + + /* Boot CPU enables PAT based on CPU feature */ + if (!smp_processor_id() && !pat_known_cpu()) + return; + + /* APs enable PAT iff boot CPU has enabled it before */ + if (smp_processor_id() && !pat_wc_enabled) + return; + + /* Set PWT to Write-Combining. All other bits stay the same */ + /* + * PTE encoding used in Linux: + * PAT + * |PCD + * ||PWT + * ||| + * 000 WB _PAGE_CACHE_WB + * 001 WC _PAGE_CACHE_WC + * 010 UC- _PAGE_CACHE_UC_MINUS + * 011 UC _PAGE_CACHE_UC + * PAT bit unused + */ + pat = PAT(0,WB) | PAT(1,WC) | PAT(2,UC_MINUS) | PAT(3,UC) | + PAT(4,WB) | PAT(5,WC) | PAT(6,UC_MINUS) | PAT(7,UC); + + /* Boot CPU check */ + if (!smp_processor_id()) { + rdmsrl(MSR_IA32_CR_PAT, boot_pat_state); + } + + wrmsrl(MSR_IA32_CR_PAT, pat); + printk(KERN_INFO "x86 PAT enabled: cpu %d, old 0x%Lx, new 0x%Lx\n", + smp_processor_id(), boot_pat_state, pat); +} + +#undef PAT + +static char *cattr_name(unsigned long flags) +{ + switch (flags & _PAGE_CACHE_MASK) { + case _PAGE_CACHE_UC: return "uncached"; + case _PAGE_CACHE_UC_MINUS: return "uncached-minus"; + case _PAGE_CACHE_WB: return "write-back"; + case _PAGE_CACHE_WC: return "write-combining"; + default: return "broken"; + } +} + +/* + * The global memtype list keeps track of memory type for specific + * physical memory areas. Conflicting memory types in different + * mappings can cause CPU cache corruption. To avoid this we keep track. + * + * The list is sorted based on starting address and can contain multiple + * entries for each address (this allows reference counting for overlapping + * areas). All the aliases have the same cache attributes of course. + * Zero attributes are represented as holes. + * + * Currently the data structure is a list because the number of mappings + * are expected to be relatively small. If this should be a problem + * it could be changed to a rbtree or similar. + * + * memtype_lock protects the whole list. + */ + +struct memtype { + u64 start; + u64 end; + unsigned long type; + struct list_head nd; +}; + +static LIST_HEAD(memtype_list); +static DEFINE_SPINLOCK(memtype_lock); /* protects memtype list */ + +/* + * Does intersection of PAT memory type and MTRR memory type and returns + * the resulting memory type as PAT understands it. + * (Type in pat and mtrr will not have same value) + * The intersection is based on "Effective Memory Type" tables in IA-32 + * SDM vol 3a + */ +static int pat_x_mtrr_type(u64 start, u64 end, unsigned long prot, + unsigned long *ret_prot) +{ + unsigned long pat_type; + u8 mtrr_type; + + mtrr_type = mtrr_type_lookup(start, end); + if (mtrr_type == 0xFF) { /* MTRR not enabled */ + *ret_prot = prot; + return 0; + } + if (mtrr_type == 0xFE) { /* MTRR match error */ + *ret_prot = _PAGE_CACHE_UC; + return -1; + } + if (mtrr_type != MTRR_TYPE_UNCACHABLE && + mtrr_type != MTRR_TYPE_WRBACK && + mtrr_type != MTRR_TYPE_WRCOMB) { /* MTRR type unhandled */ + *ret_prot = _PAGE_CACHE_UC; + return -1; + } + + pat_type = prot & _PAGE_CACHE_MASK; + prot &= (~_PAGE_CACHE_MASK); + + /* Currently doing intersection by hand. Optimize it later. */ + if (pat_type == _PAGE_CACHE_WC) { + *ret_prot = prot | _PAGE_CACHE_WC; + } else if (pat_type == _PAGE_CACHE_UC_MINUS) { + *ret_prot = prot | _PAGE_CACHE_UC_MINUS; + } else if (pat_type == _PAGE_CACHE_UC || + mtrr_type == MTRR_TYPE_UNCACHABLE) { + *ret_prot = prot | _PAGE_CACHE_UC; + } else if (mtrr_type == MTRR_TYPE_WRCOMB) { + *ret_prot = prot | _PAGE_CACHE_WC; + } else { + *ret_prot = prot | _PAGE_CACHE_WB; + } + + return 0; +} + +/* + * req_type typically has one of the: + * - _PAGE_CACHE_WB + * - _PAGE_CACHE_WC + * - _PAGE_CACHE_UC_MINUS + * - _PAGE_CACHE_UC + * + * req_type will have a special case value '-1', when requester want to inherit + * the memory type from mtrr (if WB), existing PAT, defaulting to UC_MINUS. + * + * If ret_type is NULL, function will return an error if it cannot reserve the + * region with req_type. If ret_type is non-null, function will return + * available type in ret_type in case of no error. In case of any error + * it will return a negative return value. + */ +int reserve_memtype(u64 start, u64 end, unsigned long req_type, + unsigned long *ret_type) +{ + struct memtype *new_entry = NULL; + struct memtype *parse; + unsigned long actual_type; + int err = 0; + + /* Only track when pat_wc_enabled */ + if (!pat_wc_enabled) { + /* This is identical to page table setting without PAT */ + if (ret_type) { + if (req_type == -1) { + *ret_type = _PAGE_CACHE_WB; + } else { + *ret_type = req_type; + } + } + return 0; + } + + /* Low ISA region is always mapped WB in page table. No need to track */ + if (start >= ISA_START_ADDRESS && (end - 1) <= ISA_END_ADDRESS) { + if (ret_type) + *ret_type = _PAGE_CACHE_WB; + + return 0; + } + + if (req_type == -1) { + /* + * Special case where caller wants to inherit from mtrr or + * existing pat mapping, defaulting to UC_MINUS in case of + * no match. + */ + u8 mtrr_type = mtrr_type_lookup(start, end); + if (mtrr_type == 0xFE) { /* MTRR match error */ + err = -1; + } + + if (mtrr_type == MTRR_TYPE_WRBACK) { + req_type = _PAGE_CACHE_WB; + actual_type = _PAGE_CACHE_WB; + } else { + req_type = _PAGE_CACHE_UC_MINUS; + actual_type = _PAGE_CACHE_UC_MINUS; + } + } else { + req_type &= _PAGE_CACHE_MASK; + err = pat_x_mtrr_type(start, end, req_type, &actual_type); + } + + if (err) { + if (ret_type) + *ret_type = actual_type; + + return -EINVAL; + } + + new_entry = kmalloc(sizeof(struct memtype), GFP_KERNEL); + if (!new_entry) + return -ENOMEM; + + new_entry->start = start; + new_entry->end = end; + new_entry->type = actual_type; + + if (ret_type) + *ret_type = actual_type; + + spin_lock(&memtype_lock); + + /* Search for existing mapping that overlaps the current range */ + list_for_each_entry(parse, &memtype_list, nd) { + struct memtype *saved_ptr; + + if (parse->start >= end) { + pr_debug("New Entry\n"); + list_add(&new_entry->nd, parse->nd.prev); + new_entry = NULL; + break; + } + + if (start <= parse->start && end >= parse->start) { + if (actual_type != parse->type && ret_type) { + actual_type = parse->type; + *ret_type = actual_type; + new_entry->type = actual_type; + } + + if (actual_type != parse->type) { + printk( + KERN_INFO "%s:%d conflicting memory types %Lx-%Lx %s<->%s\n", + current->comm, current->pid, + start, end, + cattr_name(actual_type), + cattr_name(parse->type)); + err = -EBUSY; + break; + } + + saved_ptr = parse; + /* + * Check to see whether the request overlaps more + * than one entry in the list + */ + list_for_each_entry_continue(parse, &memtype_list, nd) { + if (end <= parse->start) { + break; + } + + if (actual_type != parse->type) { + printk( + KERN_INFO "%s:%d conflicting memory types %Lx-%Lx %s<->%s\n", + current->comm, current->pid, + start, end, + cattr_name(actual_type), + cattr_name(parse->type)); + err = -EBUSY; + break; + } + } + + if (err) { + break; + } + + pr_debug("Overlap at 0x%Lx-0x%Lx\n", + saved_ptr->start, saved_ptr->end); + /* No conflict. Go ahead and add this new entry */ + list_add(&new_entry->nd, saved_ptr->nd.prev); + new_entry = NULL; + break; + } + + if (start < parse->end) { + if (actual_type != parse->type && ret_type) { + actual_type = parse->type; + *ret_type = actual_type; + new_entry->type = actual_type; + } + + if (actual_type != parse->type) { + printk( + KERN_INFO "%s:%d conflicting memory types %Lx-%Lx %s<->%s\n", + current->comm, current->pid, + start, end, + cattr_name(actual_type), + cattr_name(parse->type)); + err = -EBUSY; + break; + } + + saved_ptr = parse; + /* + * Check to see whether the request overlaps more + * than one entry in the list + */ + list_for_each_entry_continue(parse, &memtype_list, nd) { + if (end <= parse->start) { + break; + } + + if (actual_type != parse->type) { + printk( + KERN_INFO "%s:%d conflicting memory types %Lx-%Lx %s<->%s\n", + current->comm, current->pid, + start, end, + cattr_name(actual_type), + cattr_name(parse->type)); + err = -EBUSY; + break; + } + } + + if (err) { + break; + } + + pr_debug(KERN_INFO "Overlap at 0x%Lx-0x%Lx\n", + saved_ptr->start, saved_ptr->end); + /* No conflict. Go ahead and add this new entry */ + list_add(&new_entry->nd, &saved_ptr->nd); + new_entry = NULL; + break; + } + } + + if (err) { + printk(KERN_INFO + "reserve_memtype failed 0x%Lx-0x%Lx, track %s, req %s\n", + start, end, cattr_name(new_entry->type), + cattr_name(req_type)); + kfree(new_entry); + spin_unlock(&memtype_lock); + return err; + } + + if (new_entry) { + /* No conflict. Not yet added to the list. Add to the tail */ + list_add_tail(&new_entry->nd, &memtype_list); + pr_debug("New Entry\n"); + } + + if (ret_type) { + pr_debug( + "reserve_memtype added 0x%Lx-0x%Lx, track %s, req %s, ret %s\n", + start, end, cattr_name(actual_type), + cattr_name(req_type), cattr_name(*ret_type)); + } else { + pr_debug( + "reserve_memtype added 0x%Lx-0x%Lx, track %s, req %s\n", + start, end, cattr_name(actual_type), + cattr_name(req_type)); + } + + spin_unlock(&memtype_lock); + return err; +} + +int free_memtype(u64 start, u64 end) +{ + struct memtype *ml; + int err = -EINVAL; + + /* Only track when pat_wc_enabled */ + if (!pat_wc_enabled) { + return 0; + } + + /* Low ISA region is always mapped WB. No need to track */ + if (start >= ISA_START_ADDRESS && end <= ISA_END_ADDRESS) { + return 0; + } + + spin_lock(&memtype_lock); + list_for_each_entry(ml, &memtype_list, nd) { + if (ml->start == start && ml->end == end) { + list_del(&ml->nd); + kfree(ml); + err = 0; + break; + } + } + spin_unlock(&memtype_lock); + + if (err) { + printk(KERN_INFO "%s:%d freeing invalid memtype %Lx-%Lx\n", + current->comm, current->pid, start, end); + } + + pr_debug("free_memtype request 0x%Lx-0x%Lx\n", start, end); + return err; +} + + +/* + * /dev/mem mmap interface. The memtype used for mapping varies: + * - Use UC for mappings with O_SYNC flag + * - Without O_SYNC flag, if there is any conflict in reserve_memtype, + * inherit the memtype from existing mapping. + * - Else use UC_MINUS memtype (for backward compatibility with existing + * X drivers. + */ +pgprot_t phys_mem_access_prot(struct file *file, unsigned long pfn, + unsigned long size, pgprot_t vma_prot) +{ + return vma_prot; +} + +#ifdef CONFIG_NONPROMISC_DEVMEM +/* This check is done in drivers/char/mem.c in case of NONPROMISC_DEVMEM*/ +static inline int range_is_allowed(unsigned long pfn, unsigned long size) +{ + return 1; +} +#else +static inline int range_is_allowed(unsigned long pfn, unsigned long size) +{ + u64 from = ((u64)pfn) << PAGE_SHIFT; + u64 to = from + size; + u64 cursor = from; + + while (cursor < to) { + if (!devmem_is_allowed(pfn)) { + printk(KERN_INFO + "Program %s tried to access /dev/mem between %Lx->%Lx.\n", + current->comm, from, to); + return 0; + } + cursor += PAGE_SIZE; + pfn++; + } + return 1; +} +#endif /* CONFIG_NONPROMISC_DEVMEM */ + +int phys_mem_access_prot_allowed(struct file *file, unsigned long pfn, + unsigned long size, pgprot_t *vma_prot) +{ + u64 offset = ((u64) pfn) << PAGE_SHIFT; + unsigned long flags = _PAGE_CACHE_UC_MINUS; + int retval; + + if (!range_is_allowed(pfn, size)) + return 0; + + if (file->f_flags & O_SYNC) { + flags = _PAGE_CACHE_UC; + } + +#ifdef CONFIG_X86_32 + /* + * On the PPro and successors, the MTRRs are used to set + * memory types for physical addresses outside main memory, + * so blindly setting UC or PWT on those pages is wrong. + * For Pentiums and earlier, the surround logic should disable + * caching for the high addresses through the KEN pin, but + * we maintain the tradition of paranoia in this code. + */ + if (!pat_wc_enabled && + ! ( test_bit(X86_FEATURE_MTRR, boot_cpu_data.x86_capability) || + test_bit(X86_FEATURE_K6_MTRR, boot_cpu_data.x86_capability) || + test_bit(X86_FEATURE_CYRIX_ARR, boot_cpu_data.x86_capability) || + test_bit(X86_FEATURE_CENTAUR_MCR, boot_cpu_data.x86_capability)) && + (pfn << PAGE_SHIFT) >= __pa(high_memory)) { + flags = _PAGE_CACHE_UC; + } +#endif + + /* + * With O_SYNC, we can only take UC mapping. Fail if we cannot. + * Without O_SYNC, we want to get + * - WB for WB-able memory and no other conflicting mappings + * - UC_MINUS for non-WB-able memory with no other conflicting mappings + * - Inherit from confliting mappings otherwise + */ + if (flags != _PAGE_CACHE_UC_MINUS) { + retval = reserve_memtype(offset, offset + size, flags, NULL); + } else { + retval = reserve_memtype(offset, offset + size, -1, &flags); + } + + if (retval < 0) + return 0; + + if (pfn <= max_pfn_mapped && + ioremap_change_attr((unsigned long)__va(offset), size, flags) < 0) { + free_memtype(offset, offset + size); + printk(KERN_INFO + "%s:%d /dev/mem ioremap_change_attr failed %s for %Lx-%Lx\n", + current->comm, current->pid, + cattr_name(flags), + offset, offset + size); + return 0; + } + + *vma_prot = __pgprot((pgprot_val(*vma_prot) & ~_PAGE_CACHE_MASK) | + flags); + return 1; +} + +void map_devmem(unsigned long pfn, unsigned long size, pgprot_t vma_prot) +{ + u64 addr = (u64)pfn << PAGE_SHIFT; + unsigned long flags; + unsigned long want_flags = (pgprot_val(vma_prot) & _PAGE_CACHE_MASK); + + reserve_memtype(addr, addr + size, want_flags, &flags); + if (flags != want_flags) { + printk(KERN_INFO + "%s:%d /dev/mem expected mapping type %s for %Lx-%Lx, got %s\n", + current->comm, current->pid, + cattr_name(want_flags), + addr, addr + size, + cattr_name(flags)); + } +} + +void unmap_devmem(unsigned long pfn, unsigned long size, pgprot_t vma_prot) +{ + u64 addr = (u64)pfn << PAGE_SHIFT; + + free_memtype(addr, addr + size); +} + diff --git a/arch/x86/mm/pgtable.c b/arch/x86/mm/pgtable.c new file mode 100644 index 000000000000..50159764f694 --- /dev/null +++ b/arch/x86/mm/pgtable.c @@ -0,0 +1,276 @@ +#include <linux/mm.h> +#include <asm/pgalloc.h> +#include <asm/pgtable.h> +#include <asm/tlb.h> + +pte_t *pte_alloc_one_kernel(struct mm_struct *mm, unsigned long address) +{ + return (pte_t *)__get_free_page(GFP_KERNEL|__GFP_REPEAT|__GFP_ZERO); +} + +pgtable_t pte_alloc_one(struct mm_struct *mm, unsigned long address) +{ + struct page *pte; + +#ifdef CONFIG_HIGHPTE + pte = alloc_pages(GFP_KERNEL|__GFP_HIGHMEM|__GFP_REPEAT|__GFP_ZERO, 0); +#else + pte = alloc_pages(GFP_KERNEL|__GFP_REPEAT|__GFP_ZERO, 0); +#endif + if (pte) + pgtable_page_ctor(pte); + return pte; +} + +void __pte_free_tlb(struct mmu_gather *tlb, struct page *pte) +{ + pgtable_page_dtor(pte); + paravirt_release_pte(page_to_pfn(pte)); + tlb_remove_page(tlb, pte); +} + +#if PAGETABLE_LEVELS > 2 +void __pmd_free_tlb(struct mmu_gather *tlb, pmd_t *pmd) +{ + paravirt_release_pmd(__pa(pmd) >> PAGE_SHIFT); + tlb_remove_page(tlb, virt_to_page(pmd)); +} + +#if PAGETABLE_LEVELS > 3 +void __pud_free_tlb(struct mmu_gather *tlb, pud_t *pud) +{ + paravirt_release_pud(__pa(pud) >> PAGE_SHIFT); + tlb_remove_page(tlb, virt_to_page(pud)); +} +#endif /* PAGETABLE_LEVELS > 3 */ +#endif /* PAGETABLE_LEVELS > 2 */ + +static inline void pgd_list_add(pgd_t *pgd) +{ + struct page *page = virt_to_page(pgd); + + list_add(&page->lru, &pgd_list); +} + +static inline void pgd_list_del(pgd_t *pgd) +{ + struct page *page = virt_to_page(pgd); + + list_del(&page->lru); +} + +#define UNSHARED_PTRS_PER_PGD \ + (SHARED_KERNEL_PMD ? KERNEL_PGD_BOUNDARY : PTRS_PER_PGD) + +static void pgd_ctor(void *p) +{ + pgd_t *pgd = p; + unsigned long flags; + + /* Clear usermode parts of PGD */ + memset(pgd, 0, KERNEL_PGD_BOUNDARY*sizeof(pgd_t)); + + spin_lock_irqsave(&pgd_lock, flags); + + /* If the pgd points to a shared pagetable level (either the + ptes in non-PAE, or shared PMD in PAE), then just copy the + references from swapper_pg_dir. */ + if (PAGETABLE_LEVELS == 2 || + (PAGETABLE_LEVELS == 3 && SHARED_KERNEL_PMD) || + PAGETABLE_LEVELS == 4) { + clone_pgd_range(pgd + KERNEL_PGD_BOUNDARY, + swapper_pg_dir + KERNEL_PGD_BOUNDARY, + KERNEL_PGD_PTRS); + paravirt_alloc_pmd_clone(__pa(pgd) >> PAGE_SHIFT, + __pa(swapper_pg_dir) >> PAGE_SHIFT, + KERNEL_PGD_BOUNDARY, + KERNEL_PGD_PTRS); + } + + /* list required to sync kernel mapping updates */ + if (!SHARED_KERNEL_PMD) + pgd_list_add(pgd); + + spin_unlock_irqrestore(&pgd_lock, flags); +} + +static void pgd_dtor(void *pgd) +{ + unsigned long flags; /* can be called from interrupt context */ + + if (SHARED_KERNEL_PMD) + return; + + spin_lock_irqsave(&pgd_lock, flags); + pgd_list_del(pgd); + spin_unlock_irqrestore(&pgd_lock, flags); +} + +/* + * List of all pgd's needed for non-PAE so it can invalidate entries + * in both cached and uncached pgd's; not needed for PAE since the + * kernel pmd is shared. If PAE were not to share the pmd a similar + * tactic would be needed. This is essentially codepath-based locking + * against pageattr.c; it is the unique case in which a valid change + * of kernel pagetables can't be lazily synchronized by vmalloc faults. + * vmalloc faults work because attached pagetables are never freed. + * -- wli + */ + +#ifdef CONFIG_X86_PAE +/* + * Mop up any pmd pages which may still be attached to the pgd. + * Normally they will be freed by munmap/exit_mmap, but any pmd we + * preallocate which never got a corresponding vma will need to be + * freed manually. + */ +static void pgd_mop_up_pmds(struct mm_struct *mm, pgd_t *pgdp) +{ + int i; + + for(i = 0; i < UNSHARED_PTRS_PER_PGD; i++) { + pgd_t pgd = pgdp[i]; + + if (pgd_val(pgd) != 0) { + pmd_t *pmd = (pmd_t *)pgd_page_vaddr(pgd); + + pgdp[i] = native_make_pgd(0); + + paravirt_release_pmd(pgd_val(pgd) >> PAGE_SHIFT); + pmd_free(mm, pmd); + } + } +} + +/* + * In PAE mode, we need to do a cr3 reload (=tlb flush) when + * updating the top-level pagetable entries to guarantee the + * processor notices the update. Since this is expensive, and + * all 4 top-level entries are used almost immediately in a + * new process's life, we just pre-populate them here. + * + * Also, if we're in a paravirt environment where the kernel pmd is + * not shared between pagetables (!SHARED_KERNEL_PMDS), we allocate + * and initialize the kernel pmds here. + */ +static int pgd_prepopulate_pmd(struct mm_struct *mm, pgd_t *pgd) +{ + pud_t *pud; + unsigned long addr; + int i; + + pud = pud_offset(pgd, 0); + for (addr = i = 0; i < UNSHARED_PTRS_PER_PGD; + i++, pud++, addr += PUD_SIZE) { + pmd_t *pmd = pmd_alloc_one(mm, addr); + + if (!pmd) { + pgd_mop_up_pmds(mm, pgd); + return 0; + } + + if (i >= KERNEL_PGD_BOUNDARY) + memcpy(pmd, (pmd_t *)pgd_page_vaddr(swapper_pg_dir[i]), + sizeof(pmd_t) * PTRS_PER_PMD); + + pud_populate(mm, pud, pmd); + } + + return 1; +} + +void pud_populate(struct mm_struct *mm, pud_t *pudp, pmd_t *pmd) +{ + paravirt_alloc_pmd(mm, __pa(pmd) >> PAGE_SHIFT); + + /* Note: almost everything apart from _PAGE_PRESENT is + reserved at the pmd (PDPT) level. */ + set_pud(pudp, __pud(__pa(pmd) | _PAGE_PRESENT)); + + /* + * According to Intel App note "TLBs, Paging-Structure Caches, + * and Their Invalidation", April 2007, document 317080-001, + * section 8.1: in PAE mode we explicitly have to flush the + * TLB via cr3 if the top-level pgd is changed... + */ + if (mm == current->active_mm) + write_cr3(read_cr3()); +} +#else /* !CONFIG_X86_PAE */ +/* No need to prepopulate any pagetable entries in non-PAE modes. */ +static int pgd_prepopulate_pmd(struct mm_struct *mm, pgd_t *pgd) +{ + return 1; +} + +static void pgd_mop_up_pmds(struct mm_struct *mm, pgd_t *pgd) +{ +} +#endif /* CONFIG_X86_PAE */ + +pgd_t *pgd_alloc(struct mm_struct *mm) +{ + pgd_t *pgd = (pgd_t *)__get_free_page(GFP_KERNEL | __GFP_ZERO); + + /* so that alloc_pmd can use it */ + mm->pgd = pgd; + if (pgd) + pgd_ctor(pgd); + + if (pgd && !pgd_prepopulate_pmd(mm, pgd)) { + pgd_dtor(pgd); + free_page((unsigned long)pgd); + pgd = NULL; + } + + return pgd; +} + +void pgd_free(struct mm_struct *mm, pgd_t *pgd) +{ + pgd_mop_up_pmds(mm, pgd); + pgd_dtor(pgd); + free_page((unsigned long)pgd); +} + +int ptep_set_access_flags(struct vm_area_struct *vma, + unsigned long address, pte_t *ptep, + pte_t entry, int dirty) +{ + int changed = !pte_same(*ptep, entry); + + if (changed && dirty) { + *ptep = entry; + pte_update_defer(vma->vm_mm, address, ptep); + flush_tlb_page(vma, address); + } + + return changed; +} + +int ptep_test_and_clear_young(struct vm_area_struct *vma, + unsigned long addr, pte_t *ptep) +{ + int ret = 0; + + if (pte_young(*ptep)) + ret = test_and_clear_bit(_PAGE_BIT_ACCESSED, + &ptep->pte); + + if (ret) + pte_update(vma->vm_mm, addr, ptep); + + return ret; +} + +int ptep_clear_flush_young(struct vm_area_struct *vma, + unsigned long address, pte_t *ptep) +{ + int young; + + young = ptep_test_and_clear_young(vma, address, ptep); + if (young) + flush_tlb_page(vma, address); + + return young; +} diff --git a/arch/x86/mm/pgtable_32.c b/arch/x86/mm/pgtable_32.c index 2f9e9afcb9f4..9ee007be9142 100644 --- a/arch/x86/mm/pgtable_32.c +++ b/arch/x86/mm/pgtable_32.c @@ -1,7 +1,3 @@ -/* - * linux/arch/i386/mm/pgtable.c - */ - #include <linux/sched.h> #include <linux/kernel.h> #include <linux/errno.h> @@ -36,7 +32,6 @@ void show_mem(void) printk(KERN_INFO "Mem-info:\n"); show_free_areas(); - printk(KERN_INFO "Free swap: %6ldkB\n", nr_swap_pages<<(PAGE_SHIFT-10)); for_each_online_pgdat(pgdat) { pgdat_resize_lock(pgdat, &flags); for (i = 0; i < pgdat->node_spanned_pages; ++i) { @@ -178,206 +173,9 @@ void reserve_top_address(unsigned long reserve) __VMALLOC_RESERVE += reserve; } -pte_t *pte_alloc_one_kernel(struct mm_struct *mm, unsigned long address) -{ - return (pte_t *)__get_free_page(GFP_KERNEL|__GFP_REPEAT|__GFP_ZERO); -} - -pgtable_t pte_alloc_one(struct mm_struct *mm, unsigned long address) +int pmd_bad(pmd_t pmd) { - struct page *pte; + WARN_ON_ONCE(pmd_bad_v1(pmd) != pmd_bad_v2(pmd)); -#ifdef CONFIG_HIGHPTE - pte = alloc_pages(GFP_KERNEL|__GFP_HIGHMEM|__GFP_REPEAT|__GFP_ZERO, 0); -#else - pte = alloc_pages(GFP_KERNEL|__GFP_REPEAT|__GFP_ZERO, 0); -#endif - if (pte) - pgtable_page_ctor(pte); - return pte; + return pmd_bad_v1(pmd); } - -/* - * List of all pgd's needed for non-PAE so it can invalidate entries - * in both cached and uncached pgd's; not needed for PAE since the - * kernel pmd is shared. If PAE were not to share the pmd a similar - * tactic would be needed. This is essentially codepath-based locking - * against pageattr.c; it is the unique case in which a valid change - * of kernel pagetables can't be lazily synchronized by vmalloc faults. - * vmalloc faults work because attached pagetables are never freed. - * -- wli - */ -static inline void pgd_list_add(pgd_t *pgd) -{ - struct page *page = virt_to_page(pgd); - - list_add(&page->lru, &pgd_list); -} - -static inline void pgd_list_del(pgd_t *pgd) -{ - struct page *page = virt_to_page(pgd); - - list_del(&page->lru); -} - -#define UNSHARED_PTRS_PER_PGD \ - (SHARED_KERNEL_PMD ? USER_PTRS_PER_PGD : PTRS_PER_PGD) - -static void pgd_ctor(void *p) -{ - pgd_t *pgd = p; - unsigned long flags; - - /* Clear usermode parts of PGD */ - memset(pgd, 0, USER_PTRS_PER_PGD*sizeof(pgd_t)); - - spin_lock_irqsave(&pgd_lock, flags); - - /* If the pgd points to a shared pagetable level (either the - ptes in non-PAE, or shared PMD in PAE), then just copy the - references from swapper_pg_dir. */ - if (PAGETABLE_LEVELS == 2 || - (PAGETABLE_LEVELS == 3 && SHARED_KERNEL_PMD)) { - clone_pgd_range(pgd + USER_PTRS_PER_PGD, - swapper_pg_dir + USER_PTRS_PER_PGD, - KERNEL_PGD_PTRS); - paravirt_alloc_pd_clone(__pa(pgd) >> PAGE_SHIFT, - __pa(swapper_pg_dir) >> PAGE_SHIFT, - USER_PTRS_PER_PGD, - KERNEL_PGD_PTRS); - } - - /* list required to sync kernel mapping updates */ - if (!SHARED_KERNEL_PMD) - pgd_list_add(pgd); - - spin_unlock_irqrestore(&pgd_lock, flags); -} - -static void pgd_dtor(void *pgd) -{ - unsigned long flags; /* can be called from interrupt context */ - - if (SHARED_KERNEL_PMD) - return; - - spin_lock_irqsave(&pgd_lock, flags); - pgd_list_del(pgd); - spin_unlock_irqrestore(&pgd_lock, flags); -} - -#ifdef CONFIG_X86_PAE -/* - * Mop up any pmd pages which may still be attached to the pgd. - * Normally they will be freed by munmap/exit_mmap, but any pmd we - * preallocate which never got a corresponding vma will need to be - * freed manually. - */ -static void pgd_mop_up_pmds(struct mm_struct *mm, pgd_t *pgdp) -{ - int i; - - for(i = 0; i < UNSHARED_PTRS_PER_PGD; i++) { - pgd_t pgd = pgdp[i]; - - if (pgd_val(pgd) != 0) { - pmd_t *pmd = (pmd_t *)pgd_page_vaddr(pgd); - - pgdp[i] = native_make_pgd(0); - - paravirt_release_pd(pgd_val(pgd) >> PAGE_SHIFT); - pmd_free(mm, pmd); - } - } -} - -/* - * In PAE mode, we need to do a cr3 reload (=tlb flush) when - * updating the top-level pagetable entries to guarantee the - * processor notices the update. Since this is expensive, and - * all 4 top-level entries are used almost immediately in a - * new process's life, we just pre-populate them here. - * - * Also, if we're in a paravirt environment where the kernel pmd is - * not shared between pagetables (!SHARED_KERNEL_PMDS), we allocate - * and initialize the kernel pmds here. - */ -static int pgd_prepopulate_pmd(struct mm_struct *mm, pgd_t *pgd) -{ - pud_t *pud; - unsigned long addr; - int i; - - pud = pud_offset(pgd, 0); - for (addr = i = 0; i < UNSHARED_PTRS_PER_PGD; - i++, pud++, addr += PUD_SIZE) { - pmd_t *pmd = pmd_alloc_one(mm, addr); - - if (!pmd) { - pgd_mop_up_pmds(mm, pgd); - return 0; - } - - if (i >= USER_PTRS_PER_PGD) - memcpy(pmd, (pmd_t *)pgd_page_vaddr(swapper_pg_dir[i]), - sizeof(pmd_t) * PTRS_PER_PMD); - - pud_populate(mm, pud, pmd); - } - - return 1; -} -#else /* !CONFIG_X86_PAE */ -/* No need to prepopulate any pagetable entries in non-PAE modes. */ -static int pgd_prepopulate_pmd(struct mm_struct *mm, pgd_t *pgd) -{ - return 1; -} - -static void pgd_mop_up_pmds(struct mm_struct *mm, pgd_t *pgdp) -{ -} -#endif /* CONFIG_X86_PAE */ - -pgd_t *pgd_alloc(struct mm_struct *mm) -{ - pgd_t *pgd = (pgd_t *)__get_free_page(GFP_KERNEL | __GFP_ZERO); - - /* so that alloc_pd can use it */ - mm->pgd = pgd; - if (pgd) - pgd_ctor(pgd); - - if (pgd && !pgd_prepopulate_pmd(mm, pgd)) { - pgd_dtor(pgd); - free_page((unsigned long)pgd); - pgd = NULL; - } - - return pgd; -} - -void pgd_free(struct mm_struct *mm, pgd_t *pgd) -{ - pgd_mop_up_pmds(mm, pgd); - pgd_dtor(pgd); - free_page((unsigned long)pgd); -} - -void __pte_free_tlb(struct mmu_gather *tlb, struct page *pte) -{ - pgtable_page_dtor(pte); - paravirt_release_pt(page_to_pfn(pte)); - tlb_remove_page(tlb, pte); -} - -#ifdef CONFIG_X86_PAE - -void __pmd_free_tlb(struct mmu_gather *tlb, pmd_t *pmd) -{ - paravirt_release_pd(__pa(pmd) >> PAGE_SHIFT); - tlb_remove_page(tlb, virt_to_page(pmd)); -} - -#endif diff --git a/arch/x86/mm/srat_64.c b/arch/x86/mm/srat_64.c index 845001c617cc..3890234e5b26 100644 --- a/arch/x86/mm/srat_64.c +++ b/arch/x86/mm/srat_64.c @@ -20,6 +20,7 @@ #include <asm/proto.h> #include <asm/numa.h> #include <asm/e820.h> +#include <asm/genapic.h> int acpi_numa __initdata; @@ -31,6 +32,10 @@ static struct bootnode nodes_add[MAX_NUMNODES]; static int found_add_area __initdata; int hotadd_percent __initdata = 0; +static int num_node_memblks __initdata; +static struct bootnode node_memblk_range[NR_NODE_MEMBLKS] __initdata; +static int memblk_nodeid[NR_NODE_MEMBLKS] __initdata; + /* Too small nodes confuse the VM badly. Usually they result from BIOS bugs. */ #define NODE_MIN_SIZE (4*1024*1024) @@ -40,17 +45,17 @@ static __init int setup_node(int pxm) return acpi_map_pxm_to_node(pxm); } -static __init int conflicting_nodes(unsigned long start, unsigned long end) +static __init int conflicting_memblks(unsigned long start, unsigned long end) { int i; - for_each_node_mask(i, nodes_parsed) { - struct bootnode *nd = &nodes[i]; + for (i = 0; i < num_node_memblks; i++) { + struct bootnode *nd = &node_memblk_range[i]; if (nd->start == nd->end) continue; if (nd->end > start && nd->start < end) - return i; + return memblk_nodeid[i]; if (nd->end == end && nd->start == start) - return i; + return memblk_nodeid[i]; } return -1; } @@ -132,7 +137,6 @@ acpi_numa_processor_affinity_init(struct acpi_srat_cpu_affinity *pa) int pxm, node; int apic_id; - apic_id = pa->apic_id; if (srat_disabled()) return; if (pa->header.length != sizeof(struct acpi_srat_cpu_affinity)) { @@ -148,13 +152,18 @@ acpi_numa_processor_affinity_init(struct acpi_srat_cpu_affinity *pa) bad_srat(); return; } + + if (is_uv_system()) + apic_id = (pa->apic_id << 8) | pa->local_sapic_eid; + else + apic_id = pa->apic_id; apicid_to_node[apic_id] = node; acpi_numa = 1; printk(KERN_INFO "SRAT: PXM %u -> APIC %u -> Node %u\n", pxm, apic_id, node); } -int update_end_of_memory(unsigned long end) {return -1;} +static int update_end_of_memory(unsigned long end) {return -1;} static int hotadd_enough_memory(struct bootnode *nd) {return 1;} #ifdef CONFIG_MEMORY_HOTPLUG_SPARSE static inline int save_add_info(void) {return 1;} @@ -253,7 +262,7 @@ acpi_numa_memory_affinity_init(struct acpi_srat_mem_affinity *ma) bad_srat(); return; } - i = conflicting_nodes(start, end); + i = conflicting_memblks(start, end); if (i == node) { printk(KERN_WARNING "SRAT: Warning: PXM %d (%lx-%lx) overlaps with itself (%Lx-%Lx)\n", @@ -278,10 +287,10 @@ acpi_numa_memory_affinity_init(struct acpi_srat_mem_affinity *ma) nd->end = end; } - printk(KERN_INFO "SRAT: Node %u PXM %u %Lx-%Lx\n", node, pxm, - nd->start, nd->end); - e820_register_active_regions(node, nd->start >> PAGE_SHIFT, - nd->end >> PAGE_SHIFT); + printk(KERN_INFO "SRAT: Node %u PXM %u %lx-%lx\n", node, pxm, + start, end); + e820_register_active_regions(node, start >> PAGE_SHIFT, + end >> PAGE_SHIFT); push_node_boundaries(node, nd->start >> PAGE_SHIFT, nd->end >> PAGE_SHIFT); @@ -293,6 +302,11 @@ acpi_numa_memory_affinity_init(struct acpi_srat_mem_affinity *ma) if ((nd->start | nd->end) == 0) node_clear(node, nodes_parsed); } + + node_memblk_range[num_node_memblks].start = start; + node_memblk_range[num_node_memblks].end = end; + memblk_nodeid[num_node_memblks] = node; + num_node_memblks++; } /* Sanity check to catch more bad SRATs (they are amazingly common). @@ -363,7 +377,8 @@ int __init acpi_scan_nodes(unsigned long start, unsigned long end) return -1; } - memnode_shift = compute_hash_shift(nodes, MAX_NUMNODES); + memnode_shift = compute_hash_shift(node_memblk_range, num_node_memblks, + memblk_nodeid); if (memnode_shift < 0) { printk(KERN_ERR "SRAT: No NUMA node hash function found. Contact maintainer\n"); |