summaryrefslogtreecommitdiff
path: root/arch/i386/mm
diff options
context:
space:
mode:
Diffstat (limited to 'arch/i386/mm')
-rw-r--r--arch/i386/mm/fault.c60
-rw-r--r--arch/i386/mm/highmem.c10
-rw-r--r--arch/i386/mm/init.c196
-rw-r--r--arch/i386/mm/pageattr.c6
-rw-r--r--arch/i386/mm/pgtable.c94
5 files changed, 252 insertions, 114 deletions
diff --git a/arch/i386/mm/fault.c b/arch/i386/mm/fault.c
index b8c4e259fc8b..f534c29e80b2 100644
--- a/arch/i386/mm/fault.c
+++ b/arch/i386/mm/fault.c
@@ -20,6 +20,7 @@
#include <linux/tty.h>
#include <linux/vt_kern.h> /* For unblank_screen() */
#include <linux/highmem.h>
+#include <linux/bootmem.h> /* for max_low_pfn */
#include <linux/module.h>
#include <linux/kprobes.h>
#include <linux/uaccess.h>
@@ -301,7 +302,6 @@ fastcall void __kprobes do_page_fault(struct pt_regs *regs,
struct mm_struct *mm;
struct vm_area_struct * vma;
unsigned long address;
- unsigned long page;
int write, si_code;
/* get the address */
@@ -510,7 +510,9 @@ no_context:
bust_spinlocks(1);
if (oops_may_print()) {
- #ifdef CONFIG_X86_PAE
+ __typeof__(pte_val(__pte(0))) page;
+
+#ifdef CONFIG_X86_PAE
if (error_code & 16) {
pte_t *pte = lookup_address(address);
@@ -519,7 +521,7 @@ no_context:
"NX-protected page - exploit attempt? "
"(uid: %d)\n", current->uid);
}
- #endif
+#endif
if (address < PAGE_SIZE)
printk(KERN_ALERT "BUG: unable to handle kernel NULL "
"pointer dereference");
@@ -529,25 +531,38 @@ no_context:
printk(" at virtual address %08lx\n",address);
printk(KERN_ALERT " printing eip:\n");
printk("%08lx\n", regs->eip);
- }
- page = read_cr3();
- page = ((unsigned long *) __va(page))[address >> 22];
- if (oops_may_print())
+
+ page = read_cr3();
+ page = ((__typeof__(page) *) __va(page))[address >> PGDIR_SHIFT];
+#ifdef CONFIG_X86_PAE
+ printk(KERN_ALERT "*pdpt = %016Lx\n", page);
+ if ((page >> PAGE_SHIFT) < max_low_pfn
+ && page & _PAGE_PRESENT) {
+ page &= PAGE_MASK;
+ page = ((__typeof__(page) *) __va(page))[(address >> PMD_SHIFT)
+ & (PTRS_PER_PMD - 1)];
+ printk(KERN_ALERT "*pde = %016Lx\n", page);
+ page &= ~_PAGE_NX;
+ }
+#else
printk(KERN_ALERT "*pde = %08lx\n", page);
- /*
- * We must not directly access the pte in the highpte
- * case, the page table might be allocated in highmem.
- * And lets rather not kmap-atomic the pte, just in case
- * it's allocated already.
- */
-#ifndef CONFIG_HIGHPTE
- if ((page & 1) && oops_may_print()) {
- page &= PAGE_MASK;
- address &= 0x003ff000;
- page = ((unsigned long *) __va(page))[address >> PAGE_SHIFT];
- printk(KERN_ALERT "*pte = %08lx\n", page);
- }
#endif
+
+ /*
+ * We must not directly access the pte in the highpte
+ * case if the page table is located in highmem.
+ * And let's rather not kmap-atomic the pte, just in case
+ * it's allocated already.
+ */
+ if ((page >> PAGE_SHIFT) < max_low_pfn
+ && (page & _PAGE_PRESENT)) {
+ page &= PAGE_MASK;
+ page = ((__typeof__(page) *) __va(page))[(address >> PAGE_SHIFT)
+ & (PTRS_PER_PTE - 1)];
+ printk(KERN_ALERT "*pte = %0*Lx\n", sizeof(page)*2, (u64)page);
+ }
+ }
+
tsk->thread.cr2 = address;
tsk->thread.trap_no = 14;
tsk->thread.error_code = error_code;
@@ -588,7 +603,6 @@ do_sigbus:
force_sig_info_fault(SIGBUS, BUS_ADRERR, address, tsk);
}
-#ifndef CONFIG_X86_PAE
void vmalloc_sync_all(void)
{
/*
@@ -601,6 +615,9 @@ void vmalloc_sync_all(void)
static unsigned long start = TASK_SIZE;
unsigned long address;
+ if (SHARED_KERNEL_PMD)
+ return;
+
BUILD_BUG_ON(TASK_SIZE & ~PGDIR_MASK);
for (address = start; address >= TASK_SIZE; address += PGDIR_SIZE) {
if (!test_bit(pgd_index(address), insync)) {
@@ -623,4 +640,3 @@ void vmalloc_sync_all(void)
start = address + PGDIR_SIZE;
}
}
-#endif
diff --git a/arch/i386/mm/highmem.c b/arch/i386/mm/highmem.c
index ac70d09df7ee..ad8d86cc683e 100644
--- a/arch/i386/mm/highmem.c
+++ b/arch/i386/mm/highmem.c
@@ -26,7 +26,7 @@ void kunmap(struct page *page)
* However when holding an atomic kmap is is not legal to sleep, so atomic
* kmaps are appropriate for short, tight code paths only.
*/
-void *kmap_atomic(struct page *page, enum km_type type)
+void *kmap_atomic_prot(struct page *page, enum km_type type, pgprot_t prot)
{
enum fixed_addresses idx;
unsigned long vaddr;
@@ -41,12 +41,17 @@ void *kmap_atomic(struct page *page, enum km_type type)
return page_address(page);
vaddr = __fix_to_virt(FIX_KMAP_BEGIN + idx);
- set_pte(kmap_pte-idx, mk_pte(page, kmap_prot));
+ set_pte(kmap_pte-idx, mk_pte(page, prot));
arch_flush_lazy_mmu_mode();
return (void*) vaddr;
}
+void *kmap_atomic(struct page *page, enum km_type type)
+{
+ return kmap_atomic_prot(page, type, kmap_prot);
+}
+
void kunmap_atomic(void *kvaddr, enum km_type type)
{
unsigned long vaddr = (unsigned long) kvaddr & PAGE_MASK;
@@ -67,6 +72,7 @@ void kunmap_atomic(void *kvaddr, enum km_type type)
#endif
}
+ arch_flush_lazy_mmu_mode();
pagefault_enable();
}
diff --git a/arch/i386/mm/init.c b/arch/i386/mm/init.c
index ae436882af7a..dbe16f63a566 100644
--- a/arch/i386/mm/init.c
+++ b/arch/i386/mm/init.c
@@ -22,6 +22,7 @@
#include <linux/init.h>
#include <linux/highmem.h>
#include <linux/pagemap.h>
+#include <linux/pfn.h>
#include <linux/poison.h>
#include <linux/bootmem.h>
#include <linux/slab.h>
@@ -42,6 +43,7 @@
#include <asm/tlb.h>
#include <asm/tlbflush.h>
#include <asm/sections.h>
+#include <asm/paravirt.h>
unsigned int __VMALLOC_RESERVE = 128 << 20;
@@ -61,17 +63,18 @@ static pmd_t * __init one_md_table_init(pgd_t *pgd)
pmd_t *pmd_table;
#ifdef CONFIG_X86_PAE
- pmd_table = (pmd_t *) alloc_bootmem_low_pages(PAGE_SIZE);
- paravirt_alloc_pd(__pa(pmd_table) >> PAGE_SHIFT);
- set_pgd(pgd, __pgd(__pa(pmd_table) | _PAGE_PRESENT));
- pud = pud_offset(pgd, 0);
- if (pmd_table != pmd_offset(pud, 0))
- BUG();
-#else
+ if (!(pgd_val(*pgd) & _PAGE_PRESENT)) {
+ pmd_table = (pmd_t *) alloc_bootmem_low_pages(PAGE_SIZE);
+
+ paravirt_alloc_pd(__pa(pmd_table) >> PAGE_SHIFT);
+ set_pgd(pgd, __pgd(__pa(pmd_table) | _PAGE_PRESENT));
+ pud = pud_offset(pgd, 0);
+ if (pmd_table != pmd_offset(pud, 0))
+ BUG();
+ }
+#endif
pud = pud_offset(pgd, 0);
pmd_table = pmd_offset(pud, 0);
-#endif
-
return pmd_table;
}
@@ -81,14 +84,12 @@ static pmd_t * __init one_md_table_init(pgd_t *pgd)
*/
static pte_t * __init one_page_table_init(pmd_t *pmd)
{
- if (pmd_none(*pmd)) {
+ if (!(pmd_val(*pmd) & _PAGE_PRESENT)) {
pte_t *page_table = (pte_t *) alloc_bootmem_low_pages(PAGE_SIZE);
+
paravirt_alloc_pt(__pa(page_table) >> PAGE_SHIFT);
set_pmd(pmd, __pmd(__pa(page_table) | _PAGE_TABLE));
- if (page_table != pte_offset_kernel(pmd, 0))
- BUG();
-
- return page_table;
+ BUG_ON(page_table != pte_offset_kernel(pmd, 0));
}
return pte_offset_kernel(pmd, 0);
@@ -108,7 +109,6 @@ static pte_t * __init one_page_table_init(pmd_t *pmd)
static void __init page_table_range_init (unsigned long start, unsigned long end, pgd_t *pgd_base)
{
pgd_t *pgd;
- pud_t *pud;
pmd_t *pmd;
int pgd_idx, pmd_idx;
unsigned long vaddr;
@@ -119,13 +119,10 @@ static void __init page_table_range_init (unsigned long start, unsigned long end
pgd = pgd_base + pgd_idx;
for ( ; (pgd_idx < PTRS_PER_PGD) && (vaddr != end); pgd++, pgd_idx++) {
- if (pgd_none(*pgd))
- one_md_table_init(pgd);
- pud = pud_offset(pgd, vaddr);
- pmd = pmd_offset(pud, vaddr);
+ pmd = one_md_table_init(pgd);
+ pmd = pmd + pmd_index(vaddr);
for (; (pmd_idx < PTRS_PER_PMD) && (vaddr != end); pmd++, pmd_idx++) {
- if (pmd_none(*pmd))
- one_page_table_init(pmd);
+ one_page_table_init(pmd);
vaddr += PMD_SIZE;
}
@@ -167,20 +164,22 @@ static void __init kernel_physical_mapping_init(pgd_t *pgd_base)
/* Map with big pages if possible, otherwise create normal page tables. */
if (cpu_has_pse) {
unsigned int address2 = (pfn + PTRS_PER_PTE - 1) * PAGE_SIZE + PAGE_OFFSET + PAGE_SIZE-1;
-
if (is_kernel_text(address) || is_kernel_text(address2))
set_pmd(pmd, pfn_pmd(pfn, PAGE_KERNEL_LARGE_EXEC));
else
set_pmd(pmd, pfn_pmd(pfn, PAGE_KERNEL_LARGE));
+
pfn += PTRS_PER_PTE;
} else {
pte = one_page_table_init(pmd);
- for (pte_ofs = 0; pte_ofs < PTRS_PER_PTE && pfn < max_low_pfn; pte++, pfn++, pte_ofs++) {
- if (is_kernel_text(address))
- set_pte(pte, pfn_pte(pfn, PAGE_KERNEL_EXEC));
- else
- set_pte(pte, pfn_pte(pfn, PAGE_KERNEL));
+ for (pte_ofs = 0;
+ pte_ofs < PTRS_PER_PTE && pfn < max_low_pfn;
+ pte++, pfn++, pte_ofs++, address += PAGE_SIZE) {
+ if (is_kernel_text(address))
+ set_pte(pte, pfn_pte(pfn, PAGE_KERNEL_EXEC));
+ else
+ set_pte(pte, pfn_pte(pfn, PAGE_KERNEL));
}
}
}
@@ -337,24 +336,78 @@ extern void __init remap_numa_kva(void);
#define remap_numa_kva() do {} while (0)
#endif
-static void __init pagetable_init (void)
+void __init native_pagetable_setup_start(pgd_t *base)
{
- unsigned long vaddr;
- pgd_t *pgd_base = swapper_pg_dir;
-
#ifdef CONFIG_X86_PAE
int i;
- /* Init entries of the first-level page table to the zero page */
- for (i = 0; i < PTRS_PER_PGD; i++)
- set_pgd(pgd_base + i, __pgd(__pa(empty_zero_page) | _PAGE_PRESENT));
+
+ /*
+ * Init entries of the first-level page table to the
+ * zero page, if they haven't already been set up.
+ *
+ * In a normal native boot, we'll be running on a
+ * pagetable rooted in swapper_pg_dir, but not in PAE
+ * mode, so this will end up clobbering the mappings
+ * for the lower 24Mbytes of the address space,
+ * without affecting the kernel address space.
+ */
+ for (i = 0; i < USER_PTRS_PER_PGD; i++)
+ set_pgd(&base[i],
+ __pgd(__pa(empty_zero_page) | _PAGE_PRESENT));
+
+ /* Make sure kernel address space is empty so that a pagetable
+ will be allocated for it. */
+ memset(&base[USER_PTRS_PER_PGD], 0,
+ KERNEL_PGD_PTRS * sizeof(pgd_t));
#else
paravirt_alloc_pd(__pa(swapper_pg_dir) >> PAGE_SHIFT);
#endif
+}
+
+void __init native_pagetable_setup_done(pgd_t *base)
+{
+#ifdef CONFIG_X86_PAE
+ /*
+ * Add low memory identity-mappings - SMP needs it when
+ * starting up on an AP from real-mode. In the non-PAE
+ * case we already have these mappings through head.S.
+ * All user-space mappings are explicitly cleared after
+ * SMP startup.
+ */
+ set_pgd(&base[0], base[USER_PTRS_PER_PGD]);
+#endif
+}
+
+/*
+ * Build a proper pagetable for the kernel mappings. Up until this
+ * point, we've been running on some set of pagetables constructed by
+ * the boot process.
+ *
+ * If we're booting on native hardware, this will be a pagetable
+ * constructed in arch/i386/kernel/head.S, and not running in PAE mode
+ * (even if we'll end up running in PAE). The root of the pagetable
+ * will be swapper_pg_dir.
+ *
+ * If we're booting paravirtualized under a hypervisor, then there are
+ * more options: we may already be running PAE, and the pagetable may
+ * or may not be based in swapper_pg_dir. In any case,
+ * paravirt_pagetable_setup_start() will set up swapper_pg_dir
+ * appropriately for the rest of the initialization to work.
+ *
+ * In general, pagetable_init() assumes that the pagetable may already
+ * be partially populated, and so it avoids stomping on any existing
+ * mappings.
+ */
+static void __init pagetable_init (void)
+{
+ unsigned long vaddr, end;
+ pgd_t *pgd_base = swapper_pg_dir;
+
+ paravirt_pagetable_setup_start(pgd_base);
/* Enable PSE if available */
- if (cpu_has_pse) {
+ if (cpu_has_pse)
set_in_cr4(X86_CR4_PSE);
- }
/* Enable PGE if available */
if (cpu_has_pge) {
@@ -371,20 +424,12 @@ static void __init pagetable_init (void)
* created - mappings will be set by set_fixmap():
*/
vaddr = __fix_to_virt(__end_of_fixed_addresses - 1) & PMD_MASK;
- page_table_range_init(vaddr, 0, pgd_base);
+ end = (FIXADDR_TOP + PMD_SIZE - 1) & PMD_MASK;
+ page_table_range_init(vaddr, end, pgd_base);
permanent_kmaps_init(pgd_base);
-#ifdef CONFIG_X86_PAE
- /*
- * Add low memory identity-mappings - SMP needs it when
- * starting up on an AP from real-mode. In the non-PAE
- * case we already have these mappings through head.S.
- * All user-space mappings are explicitly cleared after
- * SMP startup.
- */
- set_pgd(&pgd_base[0], pgd_base[USER_PTRS_PER_PGD]);
-#endif
+ paravirt_pagetable_setup_done(pgd_base);
}
#if defined(CONFIG_SOFTWARE_SUSPEND) || defined(CONFIG_ACPI_SLEEP)
@@ -700,6 +745,8 @@ struct kmem_cache *pmd_cache;
void __init pgtable_cache_init(void)
{
+ size_t pgd_size = PTRS_PER_PGD*sizeof(pgd_t);
+
if (PTRS_PER_PMD > 1) {
pmd_cache = kmem_cache_create("pmd",
PTRS_PER_PMD*sizeof(pmd_t),
@@ -709,13 +756,23 @@ void __init pgtable_cache_init(void)
NULL);
if (!pmd_cache)
panic("pgtable_cache_init(): cannot create pmd cache");
+
+ if (!SHARED_KERNEL_PMD) {
+ /* If we're in PAE mode and have a non-shared
+ kernel pmd, then the pgd size must be a
+ page size. This is because the pgd_list
+ links through the page structure, so there
+ can only be one pgd per page for this to
+ work. */
+ pgd_size = PAGE_SIZE;
+ }
}
pgd_cache = kmem_cache_create("pgd",
- PTRS_PER_PGD*sizeof(pgd_t),
- PTRS_PER_PGD*sizeof(pgd_t),
+ pgd_size,
+ pgd_size,
0,
pgd_ctor,
- PTRS_PER_PMD == 1 ? pgd_dtor : NULL);
+ (!SHARED_KERNEL_PMD) ? pgd_dtor : NULL);
if (!pgd_cache)
panic("pgtable_cache_init(): Cannot create pgd cache");
}
@@ -751,13 +808,25 @@ static int noinline do_test_wp_bit(void)
void mark_rodata_ro(void)
{
- unsigned long addr = (unsigned long)__start_rodata;
+ unsigned long start = PFN_ALIGN(_text);
+ unsigned long size = PFN_ALIGN(_etext) - start;
- for (; addr < (unsigned long)__end_rodata; addr += PAGE_SIZE)
- change_page_attr(virt_to_page(addr), 1, PAGE_KERNEL_RO);
+#ifdef CONFIG_HOTPLUG_CPU
+ /* It must still be possible to apply SMP alternatives. */
+ if (num_possible_cpus() <= 1)
+#endif
+ {
+ change_page_attr(virt_to_page(start),
+ size >> PAGE_SHIFT, PAGE_KERNEL_RX);
+ printk("Write protecting the kernel text: %luk\n", size >> 10);
+ }
- printk("Write protecting the kernel read-only data: %uk\n",
- (__end_rodata - __start_rodata) >> 10);
+ start += size;
+ size = (unsigned long)__end_rodata - start;
+ change_page_attr(virt_to_page(start),
+ size >> PAGE_SHIFT, PAGE_KERNEL_RO);
+ printk("Write protecting the kernel read-only data: %luk\n",
+ size >> 10);
/*
* change_page_attr() requires a global_flush_tlb() call after it.
@@ -774,26 +843,27 @@ void free_init_pages(char *what, unsigned long begin, unsigned long end)
unsigned long addr;
for (addr = begin; addr < end; addr += PAGE_SIZE) {
- ClearPageReserved(virt_to_page(addr));
- init_page_count(virt_to_page(addr));
- memset((void *)addr, POISON_FREE_INITMEM, PAGE_SIZE);
- free_page(addr);
+ struct page *page = pfn_to_page(addr >> PAGE_SHIFT);
+ ClearPageReserved(page);
+ init_page_count(page);
+ memset(page_address(page), POISON_FREE_INITMEM, PAGE_SIZE);
+ __free_page(page);
totalram_pages++;
}
- printk(KERN_INFO "Freeing %s: %ldk freed\n", what, (end - begin) >> 10);
+ printk(KERN_INFO "Freeing %s: %luk freed\n", what, (end - begin) >> 10);
}
void free_initmem(void)
{
free_init_pages("unused kernel memory",
- (unsigned long)(&__init_begin),
- (unsigned long)(&__init_end));
+ __pa_symbol(&__init_begin),
+ __pa_symbol(&__init_end));
}
#ifdef CONFIG_BLK_DEV_INITRD
void free_initrd_mem(unsigned long start, unsigned long end)
{
- free_init_pages("initrd memory", start, end);
+ free_init_pages("initrd memory", __pa(start), __pa(end));
}
#endif
diff --git a/arch/i386/mm/pageattr.c b/arch/i386/mm/pageattr.c
index 412ebbd8adb0..47bd477c8ecc 100644
--- a/arch/i386/mm/pageattr.c
+++ b/arch/i386/mm/pageattr.c
@@ -91,7 +91,7 @@ static void set_pmd_pte(pte_t *kpte, unsigned long address, pte_t pte)
unsigned long flags;
set_pte_atomic(kpte, pte); /* change init_mm */
- if (PTRS_PER_PMD > 1)
+ if (SHARED_KERNEL_PMD)
return;
spin_lock_irqsave(&pgd_lock, flags);
@@ -142,7 +142,7 @@ __change_page_attr(struct page *page, pgprot_t prot)
return -EINVAL;
kpte_page = virt_to_page(kpte);
if (pgprot_val(prot) != pgprot_val(PAGE_KERNEL)) {
- if ((pte_val(*kpte) & _PAGE_PSE) == 0) {
+ if (!pte_huge(*kpte)) {
set_pte_atomic(kpte, mk_pte(page, prot));
} else {
pgprot_t ref_prot;
@@ -158,7 +158,7 @@ __change_page_attr(struct page *page, pgprot_t prot)
kpte_page = split;
}
page_private(kpte_page)++;
- } else if ((pte_val(*kpte) & _PAGE_PSE) == 0) {
+ } else if (!pte_huge(*kpte)) {
set_pte_atomic(kpte, mk_pte(page, PAGE_KERNEL));
BUG_ON(page_private(kpte_page) == 0);
page_private(kpte_page)--;
diff --git a/arch/i386/mm/pgtable.c b/arch/i386/mm/pgtable.c
index fa0cfbd551e1..9a96c1647428 100644
--- a/arch/i386/mm/pgtable.c
+++ b/arch/i386/mm/pgtable.c
@@ -144,10 +144,8 @@ void set_pmd_pfn(unsigned long vaddr, unsigned long pfn, pgprot_t flags)
}
static int fixmaps;
-#ifndef CONFIG_COMPAT_VDSO
unsigned long __FIXADDR_TOP = 0xfffff000;
EXPORT_SYMBOL(__FIXADDR_TOP);
-#endif
void __set_fixmap (enum fixed_addresses idx, unsigned long phys, pgprot_t flags)
{
@@ -173,12 +171,8 @@ void reserve_top_address(unsigned long reserve)
BUG_ON(fixmaps > 0);
printk(KERN_INFO "Reserving virtual address space above 0x%08x\n",
(int)-reserve);
-#ifdef CONFIG_COMPAT_VDSO
- BUG_ON(reserve != 0);
-#else
__FIXADDR_TOP = -reserve - PAGE_SIZE;
__VMALLOC_RESERVE += reserve;
-#endif
}
pte_t *pte_alloc_one_kernel(struct mm_struct *mm, unsigned long address)
@@ -238,42 +232,92 @@ static inline void pgd_list_del(pgd_t *pgd)
set_page_private(next, (unsigned long)pprev);
}
+#if (PTRS_PER_PMD == 1)
+/* Non-PAE pgd constructor */
void pgd_ctor(void *pgd, struct kmem_cache *cache, unsigned long unused)
{
unsigned long flags;
- if (PTRS_PER_PMD == 1) {
- memset(pgd, 0, USER_PTRS_PER_PGD*sizeof(pgd_t));
- spin_lock_irqsave(&pgd_lock, flags);
- }
+ /* !PAE, no pagetable sharing */
+ memset(pgd, 0, USER_PTRS_PER_PGD*sizeof(pgd_t));
+
+ spin_lock_irqsave(&pgd_lock, flags);
+ /* must happen under lock */
clone_pgd_range((pgd_t *)pgd + USER_PTRS_PER_PGD,
swapper_pg_dir + USER_PTRS_PER_PGD,
KERNEL_PGD_PTRS);
-
- if (PTRS_PER_PMD > 1)
- return;
-
- /* must happen under lock */
paravirt_alloc_pd_clone(__pa(pgd) >> PAGE_SHIFT,
- __pa(swapper_pg_dir) >> PAGE_SHIFT,
- USER_PTRS_PER_PGD, PTRS_PER_PGD - USER_PTRS_PER_PGD);
-
+ __pa(swapper_pg_dir) >> PAGE_SHIFT,
+ USER_PTRS_PER_PGD,
+ KERNEL_PGD_PTRS);
pgd_list_add(pgd);
spin_unlock_irqrestore(&pgd_lock, flags);
}
+#else /* PTRS_PER_PMD > 1 */
+/* PAE pgd constructor */
+void pgd_ctor(void *pgd, struct kmem_cache *cache, unsigned long unused)
+{
+ /* PAE, kernel PMD may be shared */
+
+ if (SHARED_KERNEL_PMD) {
+ clone_pgd_range((pgd_t *)pgd + USER_PTRS_PER_PGD,
+ swapper_pg_dir + USER_PTRS_PER_PGD,
+ KERNEL_PGD_PTRS);
+ } else {
+ unsigned long flags;
+
+ memset(pgd, 0, USER_PTRS_PER_PGD*sizeof(pgd_t));
+ spin_lock_irqsave(&pgd_lock, flags);
+ pgd_list_add(pgd);
+ spin_unlock_irqrestore(&pgd_lock, flags);
+ }
+}
+#endif /* PTRS_PER_PMD */
-/* never called when PTRS_PER_PMD > 1 */
void pgd_dtor(void *pgd, struct kmem_cache *cache, unsigned long unused)
{
unsigned long flags; /* can be called from interrupt context */
+ BUG_ON(SHARED_KERNEL_PMD);
+
paravirt_release_pd(__pa(pgd) >> PAGE_SHIFT);
spin_lock_irqsave(&pgd_lock, flags);
pgd_list_del(pgd);
spin_unlock_irqrestore(&pgd_lock, flags);
}
+#define UNSHARED_PTRS_PER_PGD \
+ (SHARED_KERNEL_PMD ? USER_PTRS_PER_PGD : PTRS_PER_PGD)
+
+/* If we allocate a pmd for part of the kernel address space, then
+ make sure its initialized with the appropriate kernel mappings.
+ Otherwise use a cached zeroed pmd. */
+static pmd_t *pmd_cache_alloc(int idx)
+{
+ pmd_t *pmd;
+
+ if (idx >= USER_PTRS_PER_PGD) {
+ pmd = (pmd_t *)__get_free_page(GFP_KERNEL);
+
+ if (pmd)
+ memcpy(pmd,
+ (void *)pgd_page_vaddr(swapper_pg_dir[idx]),
+ sizeof(pmd_t) * PTRS_PER_PMD);
+ } else
+ pmd = kmem_cache_alloc(pmd_cache, GFP_KERNEL);
+
+ return pmd;
+}
+
+static void pmd_cache_free(pmd_t *pmd, int idx)
+{
+ if (idx >= USER_PTRS_PER_PGD)
+ free_page((unsigned long)pmd);
+ else
+ kmem_cache_free(pmd_cache, pmd);
+}
+
pgd_t *pgd_alloc(struct mm_struct *mm)
{
int i;
@@ -282,10 +326,12 @@ pgd_t *pgd_alloc(struct mm_struct *mm)
if (PTRS_PER_PMD == 1 || !pgd)
return pgd;
- for (i = 0; i < USER_PTRS_PER_PGD; ++i) {
- pmd_t *pmd = kmem_cache_alloc(pmd_cache, GFP_KERNEL);
+ for (i = 0; i < UNSHARED_PTRS_PER_PGD; ++i) {
+ pmd_t *pmd = pmd_cache_alloc(i);
+
if (!pmd)
goto out_oom;
+
paravirt_alloc_pd(__pa(pmd) >> PAGE_SHIFT);
set_pgd(&pgd[i], __pgd(1 + __pa(pmd)));
}
@@ -296,7 +342,7 @@ out_oom:
pgd_t pgdent = pgd[i];
void* pmd = (void *)__va(pgd_val(pgdent)-1);
paravirt_release_pd(__pa(pmd) >> PAGE_SHIFT);
- kmem_cache_free(pmd_cache, pmd);
+ pmd_cache_free(pmd, i);
}
kmem_cache_free(pgd_cache, pgd);
return NULL;
@@ -308,11 +354,11 @@ void pgd_free(pgd_t *pgd)
/* in the PAE case user pgd entries are overwritten before usage */
if (PTRS_PER_PMD > 1)
- for (i = 0; i < USER_PTRS_PER_PGD; ++i) {
+ for (i = 0; i < UNSHARED_PTRS_PER_PGD; ++i) {
pgd_t pgdent = pgd[i];
void* pmd = (void *)__va(pgd_val(pgdent)-1);
paravirt_release_pd(__pa(pmd) >> PAGE_SHIFT);
- kmem_cache_free(pmd_cache, pmd);
+ pmd_cache_free(pmd, i);
}
/* in the non-PAE case, free_pgtables() clears user pgd entries */
kmem_cache_free(pgd_cache, pgd);