diff options
author | Linus Torvalds <torvalds@linux-foundation.org> | 2009-12-12 14:27:24 -0800 |
---|---|---|
committer | Linus Torvalds <torvalds@linux-foundation.org> | 2009-12-12 14:27:24 -0800 |
commit | 09cea96caa59fabab3030c53bd698b9b568d959a (patch) | |
tree | a991cdc0c887fdcda37f4b751ee98d3db9559f4e /arch/powerpc/mm | |
parent | 6eb7365db6f3a4a9d8d9922bb0b800f9cbaad641 (diff) | |
parent | e090aa80321b64c3b793f3b047e31ecf1af9538d (diff) |
Merge branch 'next' of git://git.kernel.org/pub/scm/linux/kernel/git/benh/powerpc
* 'next' of git://git.kernel.org/pub/scm/linux/kernel/git/benh/powerpc: (151 commits)
powerpc: Fix usage of 64-bit instruction in 32-bit altivec code
MAINTAINERS: Add PowerPC patterns
powerpc/pseries: Track previous CPPR values to correctly EOI interrupts
powerpc/pseries: Correct pseries/dlpar.c build break without CONFIG_SMP
powerpc: Make "intspec" pointers in irq_host->xlate() const
powerpc/8xx: DTLB Miss cleanup
powerpc/8xx: Remove DIRTY pte handling in DTLB Error.
powerpc/8xx: Start using dcbX instructions in various copy routines
powerpc/8xx: Restore _PAGE_WRITETHRU
powerpc/8xx: Add missing Guarded setting in DTLB Error.
powerpc/8xx: Fixup DAR from buggy dcbX instructions.
powerpc/8xx: Tag DAR with 0x00f0 to catch buggy instructions.
powerpc/8xx: Update TLB asm so it behaves as linux mm expects.
powerpc/8xx: Invalidate non present TLBs
powerpc/pseries: Serialize cpu hotplug operations during deactivate Vs deallocate
pseries/pseries: Add code to online/offline CPUs of a DLPAR node
powerpc: stop_this_cpu: remove the cpu from the online map.
powerpc/pseries: Add kernel based CPU DLPAR handling
sysfs/cpu: Add probe/release files
powerpc/pseries: Kernel DLPAR Infrastructure
...
Diffstat (limited to 'arch/powerpc/mm')
-rw-r--r-- | arch/powerpc/mm/Makefile | 5 | ||||
-rw-r--r-- | arch/powerpc/mm/fault.c | 8 | ||||
-rw-r--r-- | arch/powerpc/mm/fsl_booke_mmu.c | 132 | ||||
-rw-r--r-- | arch/powerpc/mm/gup.c | 149 | ||||
-rw-r--r-- | arch/powerpc/mm/hash_utils_64.c | 46 | ||||
-rw-r--r-- | arch/powerpc/mm/hugetlbpage-hash64.c | 139 | ||||
-rw-r--r-- | arch/powerpc/mm/hugetlbpage.c | 792 | ||||
-rw-r--r-- | arch/powerpc/mm/init_64.c | 76 | ||||
-rw-r--r-- | arch/powerpc/mm/mem.c | 17 | ||||
-rw-r--r-- | arch/powerpc/mm/mmu_context_hash64.c | 26 | ||||
-rw-r--r-- | arch/powerpc/mm/mmu_decl.h | 11 | ||||
-rw-r--r-- | arch/powerpc/mm/pgtable.c | 25 | ||||
-rw-r--r-- | arch/powerpc/mm/subpage-prot.c | 15 | ||||
-rw-r--r-- | arch/powerpc/mm/tlb_hash64.c | 8 |
14 files changed, 681 insertions, 768 deletions
diff --git a/arch/powerpc/mm/Makefile b/arch/powerpc/mm/Makefile index 6fb8fc8d2fea..ce68708bbad5 100644 --- a/arch/powerpc/mm/Makefile +++ b/arch/powerpc/mm/Makefile @@ -28,7 +28,10 @@ obj-$(CONFIG_44x) += 44x_mmu.o obj-$(CONFIG_FSL_BOOKE) += fsl_booke_mmu.o obj-$(CONFIG_NEED_MULTIPLE_NODES) += numa.o obj-$(CONFIG_PPC_MM_SLICES) += slice.o -obj-$(CONFIG_HUGETLB_PAGE) += hugetlbpage.o +ifeq ($(CONFIG_HUGETLB_PAGE),y) +obj-y += hugetlbpage.o +obj-$(CONFIG_PPC_STD_MMU_64) += hugetlbpage-hash64.o +endif obj-$(CONFIG_PPC_SUBPAGE_PROT) += subpage-prot.o obj-$(CONFIG_NOT_COHERENT_CACHE) += dma-noncoherent.o obj-$(CONFIG_HIGHMEM) += highmem.o diff --git a/arch/powerpc/mm/fault.c b/arch/powerpc/mm/fault.c index e7dae82c1285..26fb6b990b0a 100644 --- a/arch/powerpc/mm/fault.c +++ b/arch/powerpc/mm/fault.c @@ -40,7 +40,7 @@ #include <asm/uaccess.h> #include <asm/tlbflush.h> #include <asm/siginfo.h> - +#include <mm/mmu_decl.h> #ifdef CONFIG_KPROBES static inline int notify_page_fault(struct pt_regs *regs) @@ -246,6 +246,12 @@ good_area: goto bad_area; #endif /* CONFIG_6xx */ #if defined(CONFIG_8xx) + /* 8xx sometimes need to load a invalid/non-present TLBs. + * These must be invalidated separately as linux mm don't. + */ + if (error_code & 0x40000000) /* no translation? */ + _tlbil_va(address, 0, 0, 0); + /* The MPC8xx seems to always set 0x80000000, which is * "undefined". Of those that can be set, this is the only * one which seems bad. diff --git a/arch/powerpc/mm/fsl_booke_mmu.c b/arch/powerpc/mm/fsl_booke_mmu.c index dc93e95b256e..fcfcb6e976c7 100644 --- a/arch/powerpc/mm/fsl_booke_mmu.c +++ b/arch/powerpc/mm/fsl_booke_mmu.c @@ -54,26 +54,35 @@ #include "mmu_decl.h" -extern void loadcam_entry(unsigned int index); unsigned int tlbcam_index; -static unsigned long cam[CONFIG_LOWMEM_CAM_NUM]; -#define NUM_TLBCAMS (16) +#define NUM_TLBCAMS (64) #if defined(CONFIG_LOWMEM_CAM_NUM_BOOL) && (CONFIG_LOWMEM_CAM_NUM >= NUM_TLBCAMS) #error "LOWMEM_CAM_NUM must be less than NUM_TLBCAMS" #endif -struct tlbcam TLBCAM[NUM_TLBCAMS]; +struct tlbcam { + u32 MAS0; + u32 MAS1; + unsigned long MAS2; + u32 MAS3; + u32 MAS7; +} TLBCAM[NUM_TLBCAMS]; struct tlbcamrange { - unsigned long start; + unsigned long start; unsigned long limit; phys_addr_t phys; } tlbcam_addrs[NUM_TLBCAMS]; extern unsigned int tlbcam_index; +unsigned long tlbcam_sz(int idx) +{ + return tlbcam_addrs[idx].limit - tlbcam_addrs[idx].start + 1; +} + /* * Return PA for this VA if it is mapped by a CAM, or 0 */ @@ -94,23 +103,36 @@ unsigned long p_mapped_by_tlbcam(phys_addr_t pa) int b; for (b = 0; b < tlbcam_index; ++b) if (pa >= tlbcam_addrs[b].phys - && pa < (tlbcam_addrs[b].limit-tlbcam_addrs[b].start) + && pa < (tlbcam_addrs[b].limit-tlbcam_addrs[b].start) +tlbcam_addrs[b].phys) return tlbcam_addrs[b].start+(pa-tlbcam_addrs[b].phys); return 0; } +void loadcam_entry(int idx) +{ + mtspr(SPRN_MAS0, TLBCAM[idx].MAS0); + mtspr(SPRN_MAS1, TLBCAM[idx].MAS1); + mtspr(SPRN_MAS2, TLBCAM[idx].MAS2); + mtspr(SPRN_MAS3, TLBCAM[idx].MAS3); + + if (cur_cpu_spec->cpu_features & MMU_FTR_BIG_PHYS) + mtspr(SPRN_MAS7, TLBCAM[idx].MAS7); + + asm volatile("isync;tlbwe;isync" : : : "memory"); +} + /* * Set up one of the I/D BAT (block address translation) register pairs. * The parameters are not checked; in particular size must be a power * of 4 between 4k and 256M. */ -void settlbcam(int index, unsigned long virt, phys_addr_t phys, - unsigned int size, int flags, unsigned int pid) +static void settlbcam(int index, unsigned long virt, phys_addr_t phys, + unsigned long size, unsigned long flags, unsigned int pid) { unsigned int tsize, lz; - asm ("cntlzw %0,%1" : "=r" (lz) : "r" (size)); + asm (PPC_CNTLZL "%0,%1" : "=r" (lz) : "r" (size)); tsize = 21 - lz; #ifdef CONFIG_SMP @@ -128,8 +150,10 @@ void settlbcam(int index, unsigned long virt, phys_addr_t phys, TLBCAM[index].MAS2 |= (flags & _PAGE_GUARDED) ? MAS2_G : 0; TLBCAM[index].MAS2 |= (flags & _PAGE_ENDIAN) ? MAS2_E : 0; - TLBCAM[index].MAS3 = (phys & PAGE_MASK) | MAS3_SX | MAS3_SR; + TLBCAM[index].MAS3 = (phys & MAS3_RPN) | MAS3_SX | MAS3_SR; TLBCAM[index].MAS3 |= ((flags & _PAGE_RW) ? MAS3_SW : 0); + if (cur_cpu_spec->cpu_features & MMU_FTR_BIG_PHYS) + TLBCAM[index].MAS7 = (u64)phys >> 32; #ifndef CONFIG_KGDB /* want user access for breakpoints */ if (flags & _PAGE_USER) { @@ -148,27 +172,44 @@ void settlbcam(int index, unsigned long virt, phys_addr_t phys, loadcam_entry(index); } -void invalidate_tlbcam_entry(int index) -{ - TLBCAM[index].MAS0 = MAS0_TLBSEL(1) | MAS0_ESEL(index); - TLBCAM[index].MAS1 = ~MAS1_VALID; - - loadcam_entry(index); -} - -unsigned long __init mmu_mapin_ram(void) +unsigned long map_mem_in_cams(unsigned long ram, int max_cam_idx) { + int i; unsigned long virt = PAGE_OFFSET; phys_addr_t phys = memstart_addr; + unsigned long amount_mapped = 0; + unsigned long max_cam = (mfspr(SPRN_TLB1CFG) >> 16) & 0xf; + + /* Convert (4^max) kB to (2^max) bytes */ + max_cam = max_cam * 2 + 10; - while (tlbcam_index < ARRAY_SIZE(cam) && cam[tlbcam_index]) { - settlbcam(tlbcam_index, virt, phys, cam[tlbcam_index], PAGE_KERNEL_X, 0); - virt += cam[tlbcam_index]; - phys += cam[tlbcam_index]; - tlbcam_index++; + /* Calculate CAM values */ + for (i = 0; ram && i < max_cam_idx; i++) { + unsigned int camsize = __ilog2(ram) & ~1U; + unsigned int align = __ffs(virt | phys) & ~1U; + unsigned long cam_sz; + + if (camsize > align) + camsize = align; + if (camsize > max_cam) + camsize = max_cam; + + cam_sz = 1UL << camsize; + settlbcam(i, virt, phys, cam_sz, PAGE_KERNEL_X, 0); + + ram -= cam_sz; + amount_mapped += cam_sz; + virt += cam_sz; + phys += cam_sz; } + tlbcam_index = i; + + return amount_mapped; +} - return virt - PAGE_OFFSET; +unsigned long __init mmu_mapin_ram(void) +{ + return tlbcam_addrs[tlbcam_index - 1].limit - PAGE_OFFSET + 1; } /* @@ -179,46 +220,21 @@ void __init MMU_init_hw(void) flush_instruction_cache(); } -void __init -adjust_total_lowmem(void) +void __init adjust_total_lowmem(void) { - phys_addr_t ram; - unsigned int max_cam = (mfspr(SPRN_TLB1CFG) >> 16) & 0xff; - char buf[ARRAY_SIZE(cam) * 5 + 1], *p = buf; + unsigned long ram; int i; - unsigned long virt = PAGE_OFFSET & 0xffffffffUL; - unsigned long phys = memstart_addr & 0xffffffffUL; - - /* Convert (4^max) kB to (2^max) bytes */ - max_cam = max_cam * 2 + 10; /* adjust lowmem size to __max_low_memory */ ram = min((phys_addr_t)__max_low_memory, (phys_addr_t)total_lowmem); - /* Calculate CAM values */ - __max_low_memory = 0; - for (i = 0; ram && i < ARRAY_SIZE(cam); i++) { - unsigned int camsize = __ilog2(ram) & ~1U; - unsigned int align = __ffs(virt | phys) & ~1U; + __max_low_memory = map_mem_in_cams(ram, CONFIG_LOWMEM_CAM_NUM); - if (camsize > align) - camsize = align; - if (camsize > max_cam) - camsize = max_cam; - - cam[i] = 1UL << camsize; - ram -= cam[i]; - __max_low_memory += cam[i]; - virt += cam[i]; - phys += cam[i]; - - p += sprintf(p, "%lu/", cam[i] >> 20); - } - for (; i < ARRAY_SIZE(cam); i++) - p += sprintf(p, "0/"); - p[-1] = '\0'; - - pr_info("Memory CAM mapping: %s Mb, residual: %dMb\n", buf, + pr_info("Memory CAM mapping: "); + for (i = 0; i < tlbcam_index - 1; i++) + pr_cont("%lu/", tlbcam_sz(i) >> 20); + pr_cont("%lu Mb, residual: %dMb\n", tlbcam_sz(tlbcam_index - 1) >> 20, (unsigned int)((total_lowmem - __max_low_memory) >> 20)); + __initial_memory_limit_addr = memstart_addr + __max_low_memory; } diff --git a/arch/powerpc/mm/gup.c b/arch/powerpc/mm/gup.c index bc122a120bf0..d7efdbf640c7 100644 --- a/arch/powerpc/mm/gup.c +++ b/arch/powerpc/mm/gup.c @@ -55,57 +55,6 @@ static noinline int gup_pte_range(pmd_t pmd, unsigned long addr, return 1; } -#ifdef CONFIG_HUGETLB_PAGE -static noinline int gup_huge_pte(pte_t *ptep, struct hstate *hstate, - unsigned long *addr, unsigned long end, - int write, struct page **pages, int *nr) -{ - unsigned long mask; - unsigned long pte_end; - struct page *head, *page; - pte_t pte; - int refs; - - pte_end = (*addr + huge_page_size(hstate)) & huge_page_mask(hstate); - if (pte_end < end) - end = pte_end; - - pte = *ptep; - mask = _PAGE_PRESENT|_PAGE_USER; - if (write) - mask |= _PAGE_RW; - if ((pte_val(pte) & mask) != mask) - return 0; - /* hugepages are never "special" */ - VM_BUG_ON(!pfn_valid(pte_pfn(pte))); - - refs = 0; - head = pte_page(pte); - page = head + ((*addr & ~huge_page_mask(hstate)) >> PAGE_SHIFT); - do { - VM_BUG_ON(compound_head(page) != head); - pages[*nr] = page; - (*nr)++; - page++; - refs++; - } while (*addr += PAGE_SIZE, *addr != end); - - if (!page_cache_add_speculative(head, refs)) { - *nr -= refs; - return 0; - } - if (unlikely(pte_val(pte) != pte_val(*ptep))) { - /* Could be optimized better */ - while (*nr) { - put_page(page); - (*nr)--; - } - } - - return 1; -} -#endif /* CONFIG_HUGETLB_PAGE */ - static int gup_pmd_range(pud_t pud, unsigned long addr, unsigned long end, int write, struct page **pages, int *nr) { @@ -119,7 +68,11 @@ static int gup_pmd_range(pud_t pud, unsigned long addr, unsigned long end, next = pmd_addr_end(addr, end); if (pmd_none(pmd)) return 0; - if (!gup_pte_range(pmd, addr, next, write, pages, nr)) + if (is_hugepd(pmdp)) { + if (!gup_hugepd((hugepd_t *)pmdp, PMD_SHIFT, + addr, next, write, pages, nr)) + return 0; + } else if (!gup_pte_range(pmd, addr, next, write, pages, nr)) return 0; } while (pmdp++, addr = next, addr != end); @@ -139,7 +92,11 @@ static int gup_pud_range(pgd_t pgd, unsigned long addr, unsigned long end, next = pud_addr_end(addr, end); if (pud_none(pud)) return 0; - if (!gup_pmd_range(pud, addr, next, write, pages, nr)) + if (is_hugepd(pudp)) { + if (!gup_hugepd((hugepd_t *)pudp, PUD_SHIFT, + addr, next, write, pages, nr)) + return 0; + } else if (!gup_pmd_range(pud, addr, next, write, pages, nr)) return 0; } while (pudp++, addr = next, addr != end); @@ -154,10 +111,6 @@ int get_user_pages_fast(unsigned long start, int nr_pages, int write, unsigned long next; pgd_t *pgdp; int nr = 0; -#ifdef CONFIG_PPC64 - unsigned int shift; - int psize; -#endif pr_devel("%s(%lx,%x,%s)\n", __func__, start, nr_pages, write ? "write" : "read"); @@ -172,25 +125,6 @@ int get_user_pages_fast(unsigned long start, int nr_pages, int write, pr_devel(" aligned: %lx .. %lx\n", start, end); -#ifdef CONFIG_HUGETLB_PAGE - /* We bail out on slice boundary crossing when hugetlb is - * enabled in order to not have to deal with two different - * page table formats - */ - if (addr < SLICE_LOW_TOP) { - if (end > SLICE_LOW_TOP) - goto slow_irqon; - - if (unlikely(GET_LOW_SLICE_INDEX(addr) != - GET_LOW_SLICE_INDEX(end - 1))) - goto slow_irqon; - } else { - if (unlikely(GET_HIGH_SLICE_INDEX(addr) != - GET_HIGH_SLICE_INDEX(end - 1))) - goto slow_irqon; - } -#endif /* CONFIG_HUGETLB_PAGE */ - /* * XXX: batch / limit 'nr', to avoid large irq off latency * needs some instrumenting to determine the common sizes used by @@ -210,54 +144,23 @@ int get_user_pages_fast(unsigned long start, int nr_pages, int write, */ local_irq_disable(); -#ifdef CONFIG_PPC64 - /* Those bits are related to hugetlbfs implementation and only exist - * on 64-bit for now - */ - psize = get_slice_psize(mm, addr); - shift = mmu_psize_defs[psize].shift; -#endif /* CONFIG_PPC64 */ - -#ifdef CONFIG_HUGETLB_PAGE - if (unlikely(mmu_huge_psizes[psize])) { - pte_t *ptep; - unsigned long a = addr; - unsigned long sz = ((1UL) << shift); - struct hstate *hstate = size_to_hstate(sz); - - BUG_ON(!hstate); - /* - * XXX: could be optimized to avoid hstate - * lookup entirely (just use shift) - */ - - do { - VM_BUG_ON(shift != mmu_psize_defs[get_slice_psize(mm, a)].shift); - ptep = huge_pte_offset(mm, a); - pr_devel(" %016lx: huge ptep %p\n", a, ptep); - if (!ptep || !gup_huge_pte(ptep, hstate, &a, end, write, pages, - &nr)) - goto slow; - } while (a != end); - } else -#endif /* CONFIG_HUGETLB_PAGE */ - { - pgdp = pgd_offset(mm, addr); - do { - pgd_t pgd = *pgdp; - -#ifdef CONFIG_PPC64 - VM_BUG_ON(shift != mmu_psize_defs[get_slice_psize(mm, addr)].shift); -#endif - pr_devel(" %016lx: normal pgd %p\n", addr, - (void *)pgd_val(pgd)); - next = pgd_addr_end(addr, end); - if (pgd_none(pgd)) - goto slow; - if (!gup_pud_range(pgd, addr, next, write, pages, &nr)) + pgdp = pgd_offset(mm, addr); + do { + pgd_t pgd = *pgdp; + + pr_devel(" %016lx: normal pgd %p\n", addr, + (void *)pgd_val(pgd)); + next = pgd_addr_end(addr, end); + if (pgd_none(pgd)) + goto slow; + if (is_hugepd(pgdp)) { + if (!gup_hugepd((hugepd_t *)pgdp, PGDIR_SHIFT, + addr, next, write, pages, &nr)) goto slow; - } while (pgdp++, addr = next, addr != end); - } + } else if (!gup_pud_range(pgd, addr, next, write, pages, &nr)) + goto slow; + } while (pgdp++, addr = next, addr != end); + local_irq_enable(); VM_BUG_ON(nr != (end - start) >> PAGE_SHIFT); diff --git a/arch/powerpc/mm/hash_utils_64.c b/arch/powerpc/mm/hash_utils_64.c index 1ade7eb6ae00..50f867d657df 100644 --- a/arch/powerpc/mm/hash_utils_64.c +++ b/arch/powerpc/mm/hash_utils_64.c @@ -92,6 +92,7 @@ struct mmu_psize_def mmu_psize_defs[MMU_PAGE_COUNT]; struct hash_pte *htab_address; unsigned long htab_size_bytes; unsigned long htab_hash_mask; +EXPORT_SYMBOL_GPL(htab_hash_mask); int mmu_linear_psize = MMU_PAGE_4K; int mmu_virtual_psize = MMU_PAGE_4K; int mmu_vmalloc_psize = MMU_PAGE_4K; @@ -102,6 +103,7 @@ int mmu_io_psize = MMU_PAGE_4K; int mmu_kernel_ssize = MMU_SEGSIZE_256M; int mmu_highuser_ssize = MMU_SEGSIZE_256M; u16 mmu_slb_size = 64; +EXPORT_SYMBOL_GPL(mmu_slb_size); #ifdef CONFIG_HUGETLB_PAGE unsigned int HPAGE_SHIFT; #endif @@ -481,16 +483,6 @@ static void __init htab_init_page_sizes(void) #ifdef CONFIG_HUGETLB_PAGE /* Reserve 16G huge page memory sections for huge pages */ of_scan_flat_dt(htab_dt_scan_hugepage_blocks, NULL); - -/* Set default large page size. Currently, we pick 16M or 1M depending - * on what is available - */ - if (mmu_psize_defs[MMU_PAGE_16M].shift) - HPAGE_SHIFT = mmu_psize_defs[MMU_PAGE_16M].shift; - /* With 4k/4level pagetables, we can't (for now) cope with a - * huge page size < PMD_SIZE */ - else if (mmu_psize_defs[MMU_PAGE_1M].shift) - HPAGE_SHIFT = mmu_psize_defs[MMU_PAGE_1M].shift; #endif /* CONFIG_HUGETLB_PAGE */ } @@ -785,7 +777,7 @@ unsigned int hash_page_do_lazy_icache(unsigned int pp, pte_t pte, int trap) /* page is dirty */ if (!test_bit(PG_arch_1, &page->flags) && !PageReserved(page)) { if (trap == 0x400) { - __flush_dcache_icache(page_address(page)); + flush_dcache_icache_page(page); set_bit(PG_arch_1, &page->flags); } else pp |= HPTE_R_N; @@ -843,9 +835,9 @@ void demote_segment_4k(struct mm_struct *mm, unsigned long addr) * Result is 0: full permissions, _PAGE_RW: read-only, * _PAGE_USER or _PAGE_USER|_PAGE_RW: no access. */ -static int subpage_protection(pgd_t *pgdir, unsigned long ea) +static int subpage_protection(struct mm_struct *mm, unsigned long ea) { - struct subpage_prot_table *spt = pgd_subpage_prot(pgdir); + struct subpage_prot_table *spt = &mm->context.spt; u32 spp = 0; u32 **sbpm, *sbpp; @@ -873,7 +865,7 @@ static int subpage_protection(pgd_t *pgdir, unsigned long ea) } #else /* CONFIG_PPC_SUBPAGE_PROT */ -static inline int subpage_protection(pgd_t *pgdir, unsigned long ea) +static inline int subpage_protection(struct mm_struct *mm, unsigned long ea) { return 0; } @@ -891,6 +883,7 @@ int hash_page(unsigned long ea, unsigned long access, unsigned long trap) unsigned long vsid; struct mm_struct *mm; pte_t *ptep; + unsigned hugeshift; const struct cpumask *tmp; int rc, user_region = 0, local = 0; int psize, ssize; @@ -943,30 +936,31 @@ int hash_page(unsigned long ea, unsigned long access, unsigned long trap) if (user_region && cpumask_equal(mm_cpumask(mm), tmp)) local = 1; -#ifdef CONFIG_HUGETLB_PAGE - /* Handle hugepage regions */ - if (HPAGE_SHIFT && mmu_huge_psizes[psize]) { - DBG_LOW(" -> huge page !\n"); - return hash_huge_page(mm, access, ea, vsid, local, trap); - } -#endif /* CONFIG_HUGETLB_PAGE */ - #ifndef CONFIG_PPC_64K_PAGES - /* If we use 4K pages and our psize is not 4K, then we are hitting - * a special driver mapping, we need to align the address before - * we fetch the PTE + /* If we use 4K pages and our psize is not 4K, then we might + * be hitting a special driver mapping, and need to align the + * address before we fetch the PTE. + * + * It could also be a hugepage mapping, in which case this is + * not necessary, but it's not harmful, either. */ if (psize != MMU_PAGE_4K) ea &= ~((1ul << mmu_psize_defs[psize].shift) - 1); #endif /* CONFIG_PPC_64K_PAGES */ /* Get PTE and page size from page tables */ - ptep = find_linux_pte(pgdir, ea); + ptep = find_linux_pte_or_hugepte(pgdir, ea, &hugeshift); if (ptep == NULL || !pte_present(*ptep)) { DBG_LOW(" no PTE !\n"); return 1; } +#ifdef CONFIG_HUGETLB_PAGE + if (hugeshift) + return __hash_page_huge(ea, access, vsid, ptep, trap, local, + ssize, hugeshift, psize); +#endif /* CONFIG_HUGETLB_PAGE */ + #ifndef CONFIG_PPC_64K_PAGES DBG_LOW(" i-pte: %016lx\n", pte_val(*ptep)); #else diff --git a/arch/powerpc/mm/hugetlbpage-hash64.c b/arch/powerpc/mm/hugetlbpage-hash64.c new file mode 100644 index 000000000000..199539882f92 --- /dev/null +++ b/arch/powerpc/mm/hugetlbpage-hash64.c @@ -0,0 +1,139 @@ +/* + * PPC64 Huge TLB Page Support for hash based MMUs (POWER4 and later) + * + * Copyright (C) 2003 David Gibson, IBM Corporation. + * + * Based on the IA-32 version: + * Copyright (C) 2002, Rohit Seth <rohit.seth@intel.com> + */ + +#include <linux/mm.h> +#include <linux/hugetlb.h> +#include <asm/pgtable.h> +#include <asm/pgalloc.h> +#include <asm/cacheflush.h> +#include <asm/machdep.h> + +int __hash_page_huge(unsigned long ea, unsigned long access, unsigned long vsid, + pte_t *ptep, unsigned long trap, int local, int ssize, + unsigned int shift, unsigned int mmu_psize) +{ + unsigned long old_pte, new_pte; + unsigned long va, rflags, pa, sz; + long slot; + int err = 1; + + BUG_ON(shift != mmu_psize_defs[mmu_psize].shift); + + /* Search the Linux page table for a match with va */ + va = hpt_va(ea, vsid, ssize); + + /* + * Check the user's access rights to the page. If access should be + * prevented then send the problem up to do_page_fault. + */ + if (unlikely(access & ~pte_val(*ptep))) + goto out; + /* + * At this point, we have a pte (old_pte) which can be used to build + * or update an HPTE. There are 2 cases: + * + * 1. There is a valid (present) pte with no associated HPTE (this is + * the most common case) + * 2. There is a valid (present) pte with an associated HPTE. The + * current values of the pp bits in the HPTE prevent access + * because we are doing software DIRTY bit management and the + * page is currently not DIRTY. + */ + + + do { + old_pte = pte_val(*ptep); + if (old_pte & _PAGE_BUSY) + goto out; + new_pte = old_pte | _PAGE_BUSY | _PAGE_ACCESSED; + } while(old_pte != __cmpxchg_u64((unsigned long *)ptep, + old_pte, new_pte)); + + rflags = 0x2 | (!(new_pte & _PAGE_RW)); + /* _PAGE_EXEC -> HW_NO_EXEC since it's inverted */ + rflags |= ((new_pte & _PAGE_EXEC) ? 0 : HPTE_R_N); + sz = ((1UL) << shift); + if (!cpu_has_feature(CPU_FTR_COHERENT_ICACHE)) + /* No CPU has hugepages but lacks no execute, so we + * don't need to worry about that case */ + rflags = hash_page_do_lazy_icache(rflags, __pte(old_pte), trap); + + /* Check if pte already has an hpte (case 2) */ + if (unlikely(old_pte & _PAGE_HASHPTE)) { + /* There MIGHT be an HPTE for this pte */ + unsigned long hash, slot; + + hash = hpt_hash(va, shift, ssize); + if (old_pte & _PAGE_F_SECOND) + hash = ~hash; + slot = (hash & htab_hash_mask) * HPTES_PER_GROUP; + slot += (old_pte & _PAGE_F_GIX) >> 12; + + if (ppc_md.hpte_updatepp(slot, rflags, va, mmu_psize, + ssize, local) == -1) + old_pte &= ~_PAGE_HPTEFLAGS; + } + + if (likely(!(old_pte & _PAGE_HASHPTE))) { + unsigned long hash = hpt_hash(va, shift, ssize); + unsigned long hpte_group; + + pa = pte_pfn(__pte(old_pte)) << PAGE_SHIFT; + +repeat: + hpte_group = ((hash & htab_hash_mask) * + HPTES_PER_GROUP) & ~0x7UL; + + /* clear HPTE slot informations in new PTE */ +#ifdef CONFIG_PPC_64K_PAGES + new_pte = (new_pte & ~_PAGE_HPTEFLAGS) | _PAGE_HPTE_SUB0; +#else + new_pte = (new_pte & ~_PAGE_HPTEFLAGS) | _PAGE_HASHPTE; +#endif + /* Add in WIMG bits */ + rflags |= (new_pte & (_PAGE_WRITETHRU | _PAGE_NO_CACHE | + _PAGE_COHERENT | _PAGE_GUARDED)); + + /* Insert into the hash table, primary slot */ + slot = ppc_md.hpte_insert(hpte_group, va, pa, rflags, 0, + mmu_psize, ssize); + + /* Primary is full, try the secondary */ + if (unlikely(slot == -1)) { + hpte_group = ((~hash & htab_hash_mask) * + HPTES_PER_GROUP) & ~0x7UL; + slot = ppc_md.hpte_insert(hpte_group, va, pa, rflags, + HPTE_V_SECONDARY, + mmu_psize, ssize); + if (slot == -1) { + if (mftb() & 0x1) + hpte_group = ((hash & htab_hash_mask) * + HPTES_PER_GROUP)&~0x7UL; + + ppc_md.hpte_remove(hpte_group); + goto repeat; + } + } + + if (unlikely(slot == -2)) + panic("hash_huge_page: pte_insert failed\n"); + + new_pte |= (slot << 12) & (_PAGE_F_SECOND | _PAGE_F_GIX); + } + + /* + * No need to use ldarx/stdcx here + */ + *ptep = __pte(new_pte & ~_PAGE_BUSY); + + err = 0; + + out: + return err; +} diff --git a/arch/powerpc/mm/hugetlbpage.c b/arch/powerpc/mm/hugetlbpage.c index 90df6ffe3a43..123f7070238a 100644 --- a/arch/powerpc/mm/hugetlbpage.c +++ b/arch/powerpc/mm/hugetlbpage.c @@ -7,29 +7,17 @@ * Copyright (C) 2002, Rohit Seth <rohit.seth@intel.com> */ -#include <linux/init.h> -#include <linux/fs.h> #include <linux/mm.h> +#include <linux/io.h> #include <linux/hugetlb.h> -#include <linux/pagemap.h> -#include <linux/slab.h> -#include <linux/err.h> -#include <linux/sysctl.h> -#include <asm/mman.h> +#include <asm/pgtable.h> #include <asm/pgalloc.h> #include <asm/tlb.h> -#include <asm/tlbflush.h> -#include <asm/mmu_context.h> -#include <asm/machdep.h> -#include <asm/cputable.h> -#include <asm/spu.h> #define PAGE_SHIFT_64K 16 #define PAGE_SHIFT_16M 24 #define PAGE_SHIFT_16G 34 -#define NUM_LOW_AREAS (0x100000000UL >> SID_SHIFT) -#define NUM_HIGH_AREAS (PGTABLE_RANGE >> HTLB_AREA_SHIFT) #define MAX_NUMBER_GPAGES 1024 /* Tracks the 16G pages after the device tree is scanned and before the @@ -37,53 +25,17 @@ static unsigned long gpage_freearray[MAX_NUMBER_GPAGES]; static unsigned nr_gpages; -/* Array of valid huge page sizes - non-zero value(hugepte_shift) is - * stored for the huge page sizes that are valid. - */ -unsigned int mmu_huge_psizes[MMU_PAGE_COUNT] = { }; /* initialize all to 0 */ - -#define hugepte_shift mmu_huge_psizes -#define PTRS_PER_HUGEPTE(psize) (1 << hugepte_shift[psize]) -#define HUGEPTE_TABLE_SIZE(psize) (sizeof(pte_t) << hugepte_shift[psize]) - -#define HUGEPD_SHIFT(psize) (mmu_psize_to_shift(psize) \ - + hugepte_shift[psize]) -#define HUGEPD_SIZE(psize) (1UL << HUGEPD_SHIFT(psize)) -#define HUGEPD_MASK(psize) (~(HUGEPD_SIZE(psize)-1)) - -/* Subtract one from array size because we don't need a cache for 4K since - * is not a huge page size */ -#define HUGE_PGTABLE_INDEX(psize) (HUGEPTE_CACHE_NUM + psize - 1) -#define HUGEPTE_CACHE_NAME(psize) (huge_pgtable_cache_name[psize]) - -static const char *huge_pgtable_cache_name[MMU_PAGE_COUNT] = { - [MMU_PAGE_64K] = "hugepte_cache_64K", - [MMU_PAGE_1M] = "hugepte_cache_1M", - [MMU_PAGE_16M] = "hugepte_cache_16M", - [MMU_PAGE_16G] = "hugepte_cache_16G", -}; - /* Flag to mark huge PD pointers. This means pmd_bad() and pud_bad() * will choke on pointers to hugepte tables, which is handy for * catching screwups early. */ -#define HUGEPD_OK 0x1 - -typedef struct { unsigned long pd; } hugepd_t; - -#define hugepd_none(hpd) ((hpd).pd == 0) static inline int shift_to_mmu_psize(unsigned int shift) { - switch (shift) { -#ifndef CONFIG_PPC_64K_PAGES - case PAGE_SHIFT_64K: - return MMU_PAGE_64K; -#endif - case PAGE_SHIFT_16M: - return MMU_PAGE_16M; - case PAGE_SHIFT_16G: - return MMU_PAGE_16G; - } + int psize; + + for (psize = 0; psize < MMU_PAGE_COUNT; ++psize) + if (mmu_psize_defs[psize].shift == shift) + return psize; return -1; } @@ -94,71 +46,126 @@ static inline unsigned int mmu_psize_to_shift(unsigned int mmu_psize) BUG(); } +#define hugepd_none(hpd) ((hpd).pd == 0) + static inline pte_t *hugepd_page(hugepd_t hpd) { - BUG_ON(!(hpd.pd & HUGEPD_OK)); - return (pte_t *)(hpd.pd & ~HUGEPD_OK); + BUG_ON(!hugepd_ok(hpd)); + return (pte_t *)((hpd.pd & ~HUGEPD_SHIFT_MASK) | 0xc000000000000000); +} + +static inline unsigned int hugepd_shift(hugepd_t hpd) +{ + return hpd.pd & HUGEPD_SHIFT_MASK; } -static inline pte_t *hugepte_offset(hugepd_t *hpdp, unsigned long addr, - struct hstate *hstate) +static inline pte_t *hugepte_offset(hugepd_t *hpdp, unsigned long addr, unsigned pdshift) { - unsigned int shift = huge_page_shift(hstate); - int psize = shift_to_mmu_psize(shift); - unsigned long idx = ((addr >> shift) & (PTRS_PER_HUGEPTE(psize)-1)); + unsigned long idx = (addr & ((1UL << pdshift) - 1)) >> hugepd_shift(*hpdp); pte_t *dir = hugepd_page(*hpdp); return dir + idx; } +pte_t *find_linux_pte_or_hugepte(pgd_t *pgdir, unsigned long ea, unsigned *shift) +{ + pgd_t *pg; + pud_t *pu; + pmd_t *pm; + hugepd_t *hpdp = NULL; + unsigned pdshift = PGDIR_SHIFT; + + if (shift) + *shift = 0; + + pg = pgdir + pgd_index(ea); + if (is_hugepd(pg)) { + hpdp = (hugepd_t *)pg; + } else if (!pgd_none(*pg)) { + pdshift = PUD_SHIFT; + pu = pud_offset(pg, ea); + if (is_hugepd(pu)) + hpdp = (hugepd_t *)pu; + else if (!pud_none(*pu)) { + pdshift = PMD_SHIFT; + pm = pmd_offset(pu, ea); + if (is_hugepd(pm)) + hpdp = (hugepd_t *)pm; + else if (!pmd_none(*pm)) { + return pte_offset_map(pm, ea); + } + } + } + + if (!hpdp) + return NULL; + + if (shift) + *shift = hugepd_shift(*hpdp); + return hugepte_offset(hpdp, ea, pdshift); +} + +pte_t *huge_pte_offset(struct mm_struct *mm, unsigned long addr) +{ + return find_linux_pte_or_hugepte(mm->pgd, addr, NULL); +} + static int __hugepte_alloc(struct mm_struct *mm, hugepd_t *hpdp, - unsigned long address, unsigned int psize) + unsigned long address, unsigned pdshift, unsigned pshift) { - pte_t *new = kmem_cache_zalloc(pgtable_cache[HUGE_PGTABLE_INDEX(psize)], - GFP_KERNEL|__GFP_REPEAT); + pte_t *new = kmem_cache_zalloc(PGT_CACHE(pdshift - pshift), + GFP_KERNEL|__GFP_REPEAT); + + BUG_ON(pshift > HUGEPD_SHIFT_MASK); + BUG_ON((unsigned long)new & HUGEPD_SHIFT_MASK); if (! new) return -ENOMEM; spin_lock(&mm->page_table_lock); if (!hugepd_none(*hpdp)) - kmem_cache_free(pgtable_cache[HUGE_PGTABLE_INDEX(psize)], new); + kmem_cache_free(PGT_CACHE(pdshift - pshift), new); else - hpdp->pd = (unsigned long)new | HUGEPD_OK; + hpdp->pd = ((unsigned long)new & ~0x8000000000000000) | pshift; spin_unlock(&mm->page_table_lock); return 0; } - -static pud_t *hpud_offset(pgd_t *pgd, unsigned long addr, struct hstate *hstate) -{ - if (huge_page_shift(hstate) < PUD_SHIFT) - return pud_offset(pgd, addr); - else - return (pud_t *) pgd; -} -static pud_t *hpud_alloc(struct mm_struct *mm, pgd_t *pgd, unsigned long addr, - struct hstate *hstate) +pte_t *huge_pte_alloc(struct mm_struct *mm, unsigned long addr, unsigned long sz) { - if (huge_page_shift(hstate) < PUD_SHIFT) - return pud_alloc(mm, pgd, addr); - else - return (pud_t *) pgd; -} -static pmd_t *hpmd_offset(pud_t *pud, unsigned long addr, struct hstate *hstate) -{ - if (huge_page_shift(hstate) < PMD_SHIFT) - return pmd_offset(pud, addr); - else - return (pmd_t *) pud; -} -static pmd_t *hpmd_alloc(struct mm_struct *mm, pud_t *pud, unsigned long addr, - struct hstate *hstate) -{ - if (huge_page_shift(hstate) < PMD_SHIFT) - return pmd_alloc(mm, pud, addr); - else - return (pmd_t *) pud; + pgd_t *pg; + pud_t *pu; + pmd_t *pm; + hugepd_t *hpdp = NULL; + unsigned pshift = __ffs(sz); + unsigned pdshift = PGDIR_SHIFT; + + addr &= ~(sz-1); + + pg = pgd_offset(mm, addr); + if (pshift >= PUD_SHIFT) { + hpdp = (hugepd_t *)pg; + } else { + pdshift = PUD_SHIFT; + pu = pud_alloc(mm, pg, addr); + if (pshift >= PMD_SHIFT) { + hpdp = (hugepd_t *)pu; + } else { + pdshift = PMD_SHIFT; + pm = pmd_alloc(mm, pu, addr); + hpdp = (hugepd_t *)pm; + } + } + + if (!hpdp) + return NULL; + + BUG_ON(!hugepd_none(*hpdp) && !hugepd_ok(*hpdp)); + + if (hugepd_none(*hpdp) && __hugepte_alloc(mm, hpdp, addr, pdshift, pshift)) + return NULL; + + return hugepte_offset(hpdp, addr, pdshift); } /* Build list of addresses of gigantic pages. This function is used in early @@ -192,94 +199,38 @@ int alloc_bootmem_huge_page(struct hstate *hstate) return 1; } - -/* Modelled after find_linux_pte() */ -pte_t *huge_pte_offset(struct mm_struct *mm, unsigned long addr) -{ - pgd_t *pg; - pud_t *pu; - pmd_t *pm; - - unsigned int psize; - unsigned int shift; - unsigned long sz; - struct hstate *hstate; - psize = get_slice_psize(mm, addr); - shift = mmu_psize_to_shift(psize); - sz = ((1UL) << shift); - hstate = size_to_hstate(sz); - - addr &= hstate->mask; - - pg = pgd_offset(mm, addr); - if (!pgd_none(*pg)) { - pu = hpud_offset(pg, addr, hstate); - if (!pud_none(*pu)) { - pm = hpmd_offset(pu, addr, hstate); - if (!pmd_none(*pm)) - return hugepte_offset((hugepd_t *)pm, addr, - hstate); - } - } - - return NULL; -} - -pte_t *huge_pte_alloc(struct mm_struct *mm, - unsigned long addr, unsigned long sz) -{ - pgd_t *pg; - pud_t *pu; - pmd_t *pm; - hugepd_t *hpdp = NULL; - struct hstate *hstate; - unsigned int psize; - hstate = size_to_hstate(sz); - - psize = get_slice_psize(mm, addr); - BUG_ON(!mmu_huge_psizes[psize]); - - addr &= hstate->mask; - - pg = pgd_offset(mm, addr); - pu = hpud_alloc(mm, pg, addr, hstate); - - if (pu) { - pm = hpmd_alloc(mm, pu, addr, hstate); - if (pm) - hpdp = (hugepd_t *)pm; - } - - if (! hpdp) - return NULL; - - if (hugepd_none(*hpdp) && __hugepte_alloc(mm, hpdp, addr, psize)) - return NULL; - - return hugepte_offset(hpdp, addr, hstate); -} - int huge_pmd_unshare(struct mm_struct *mm, unsigned long *addr, pte_t *ptep) { return 0; } -static void free_hugepte_range(struct mmu_gather *tlb, hugepd_t *hpdp, - unsigned int psize) +static void free_hugepd_range(struct mmu_gather *tlb, hugepd_t *hpdp, int pdshift, + unsigned long start, unsigned long end, + unsigned long floor, unsigned long ceiling) { pte_t *hugepte = hugepd_page(*hpdp); + unsigned shift = hugepd_shift(*hpdp); + unsigned long pdmask = ~((1UL << pdshift) - 1); + + start &= pdmask; + if (start < floor) + return; + if (ceiling) { + ceiling &= pdmask; + if (! ceiling) + return; + } + if (end - 1 > ceiling - 1) + return; hpdp->pd = 0; tlb->need_flush = 1; - pgtable_free_tlb(tlb, pgtable_free_cache(hugepte, - HUGEPTE_CACHE_NUM+psize-1, - PGF_CACHENUM_MASK)); + pgtable_free_tlb(tlb, hugepte, pdshift - shift); } static void hugetlb_free_pmd_range(struct mmu_gather *tlb, pud_t *pud, unsigned long addr, unsigned long end, - unsigned long floor, unsigned long ceiling, - unsigned int psize) + unsigned long floor, unsigned long ceiling) { pmd_t *pmd; unsigned long next; @@ -291,7 +242,8 @@ static void hugetlb_free_pmd_range(struct mmu_gather *tlb, pud_t *pud, next = pmd_addr_end(addr, end); if (pmd_none(*pmd)) continue; - free_hugepte_range(tlb, (hugepd_t *)pmd, psize); + free_hugepd_range(tlb, (hugepd_t *)pmd, PMD_SHIFT, + addr, next, floor, ceiling); } while (pmd++, addr = next, addr != end); start &= PUD_MASK; @@ -317,23 +269,19 @@ static void hugetlb_free_pud_range(struct mmu_gather *tlb, pgd_t *pgd, pud_t *pud; unsigned long next; unsigned long start; - unsigned int shift; - unsigned int psize = get_slice_psize(tlb->mm, addr); - shift = mmu_psize_to_shift(psize); start = addr; pud = pud_offset(pgd, addr); do { next = pud_addr_end(addr, end); - if (shift < PMD_SHIFT) { + if (!is_hugepd(pud)) { if (pud_none_or_clear_bad(pud)) continue; hugetlb_free_pmd_range(tlb, pud, addr, next, floor, - ceiling, psize); + ceiling); } else { - if (pud_none(*pud)) - continue; - free_hugepte_range(tlb, (hugepd_t *)pud, psize); + free_hugepd_range(tlb, (hugepd_t *)pud, PUD_SHIFT, + addr, next, floor, ceiling); } } while (pud++, addr = next, addr != end); @@ -364,121 +312,56 @@ void hugetlb_free_pgd_range(struct mmu_gather *tlb, { pgd_t *pgd; unsigned long next; - unsigned long start; /* - * Comments below take from the normal free_pgd_range(). They - * apply here too. The tests against HUGEPD_MASK below are - * essential, because we *don't* test for this at the bottom - * level. Without them we'll attempt to free a hugepte table - * when we unmap just part of it, even if there are other - * active mappings using it. - * - * The next few lines have given us lots of grief... - * - * Why are we testing HUGEPD* at this top level? Because - * often there will be no work to do at all, and we'd prefer - * not to go all the way down to the bottom just to discover - * that. - * - * Why all these "- 1"s? Because 0 represents both the bottom - * of the address space and the top of it (using -1 for the - * top wouldn't help much: the masks would do the wrong thing). - * The rule is that addr 0 and floor 0 refer to the bottom of - * the address space, but end 0 and ceiling 0 refer to the top - * Comparisons need to use "end - 1" and "ceiling - 1" (though - * that end 0 case should be mythical). + * Because there are a number of different possible pagetable + * layouts for hugepage ranges, we limit knowledge of how + * things should be laid out to the allocation path + * (huge_pte_alloc(), above). Everything else works out the + * structure as it goes from information in the hugepd + * pointers. That means that we can't here use the + * optimization used in the normal page free_pgd_range(), of + * checking whether we're actually covering a large enough + * range to have to do anything at the top level of the walk + * instead of at the bottom. * - * Wherever addr is brought up or ceiling brought down, we - * must be careful to reject "the opposite 0" before it - * confuses the subsequent tests. But what about where end is - * brought down by HUGEPD_SIZE below? no, end can't go down to - * 0 there. - * - * Whereas we round start (addr) and ceiling down, by different - * masks at different levels, in order to test whether a table - * now has no other vmas using it, so can be freed, we don't - * bother to round floor or end up - the tests don't need that. + * To make sense of this, you should probably go read the big + * block comment at the top of the normal free_pgd_range(), + * too. */ - unsigned int psize = get_slice_psize(tlb->mm, addr); - - addr &= HUGEPD_MASK(psize); - if (addr < floor) { - addr += HUGEPD_SIZE(psize); - if (!addr) - return; - } - if (ceiling) { - ceiling &= HUGEPD_MASK(psize); - if (!ceiling) - return; - } - if (end - 1 > ceiling - 1) - end -= HUGEPD_SIZE(psize); - if (addr > end - 1) - return; - start = addr; pgd = pgd_offset(tlb->mm, addr); do { - psize = get_slice_psize(tlb->mm, addr); - BUG_ON(!mmu_huge_psizes[psize]); next = pgd_addr_end(addr, end); - if (mmu_psize_to_shift(psize) < PUD_SHIFT) { + if (!is_hugepd(pgd)) { if (pgd_none_or_clear_bad(pgd)) continue; hugetlb_free_pud_range(tlb, pgd, addr, next, floor, ceiling); } else { - if (pgd_none(*pgd)) - continue; - free_hugepte_range(tlb, (hugepd_t *)pgd, psize); + free_hugepd_range(tlb, (hugepd_t *)pgd, PGDIR_SHIFT, + addr, next, floor, ceiling); } } while (pgd++, addr = next, addr != end); } -void set_huge_pte_at(struct mm_struct *mm, unsigned long addr, - pte_t *ptep, pte_t pte) -{ - if (pte_present(*ptep)) { - /* We open-code pte_clear because we need to pass the right - * argument to hpte_need_flush (huge / !huge). Might not be - * necessary anymore if we make hpte_need_flush() get the - * page size from the slices - */ - unsigned int psize = get_slice_psize(mm, addr); - unsigned int shift = mmu_psize_to_shift(psize); - unsigned long sz = ((1UL) << shift); - struct hstate *hstate = size_to_hstate(sz); - pte_update(mm, addr & hstate->mask, ptep, ~0UL, 1); - } - *ptep = __pte(pte_val(pte) & ~_PAGE_HPTEFLAGS); -} - -pte_t huge_ptep_get_and_clear(struct mm_struct *mm, unsigned long addr, - pte_t *ptep) -{ - unsigned long old = pte_update(mm, addr, ptep, ~0UL, 1); - return __pte(old); -} - struct page * follow_huge_addr(struct mm_struct *mm, unsigned long address, int write) { pte_t *ptep; struct page *page; - unsigned int mmu_psize = get_slice_psize(mm, address); + unsigned shift; + unsigned long mask; + + ptep = find_linux_pte_or_hugepte(mm->pgd, address, &shift); /* Verify it is a huge page else bail. */ - if (!mmu_huge_psizes[mmu_psize]) + if (!ptep || !shift) return ERR_PTR(-EINVAL); - ptep = huge_pte_offset(mm, address); + mask = (1UL << shift) - 1; page = pte_page(*ptep); - if (page) { - unsigned int shift = mmu_psize_to_shift(mmu_psize); - unsigned long sz = ((1UL) << shift); - page += (address % sz) / PAGE_SIZE; - } + if (page) + page += (address & mask) / PAGE_SIZE; return page; } @@ -501,6 +384,82 @@ follow_huge_pmd(struct mm_struct *mm, unsigned long address, return NULL; } +static noinline int gup_hugepte(pte_t *ptep, unsigned long sz, unsigned long addr, + unsigned long end, int write, struct page **pages, int *nr) +{ + unsigned long mask; + unsigned long pte_end; + struct page *head, *page; + pte_t pte; + int refs; + + pte_end = (addr + sz) & ~(sz-1); + if (pte_end < end) + end = pte_end; + + pte = *ptep; + mask = _PAGE_PRESENT | _PAGE_USER; + if (write) + mask |= _PAGE_RW; + + if ((pte_val(pte) & mask) != mask) + return 0; + + /* hugepages are never "special" */ + VM_BUG_ON(!pfn_valid(pte_pfn(pte))); + + refs = 0; + head = pte_page(pte); + + page = head + ((addr & (sz-1)) >> PAGE_SHIFT); + do { + VM_BUG_ON(compound_head(page) != head); + pages[*nr] = page; + (*nr)++; + page++; + refs++; + } while (addr += PAGE_SIZE, addr != end); + + if (!page_cache_add_speculative(head, refs)) { + *nr -= refs; + return 0; + } + + if (unlikely(pte_val(pte) != pte_val(*ptep))) { + /* Could be optimized better */ + while (*nr) { + put_page(page); + (*nr)--; + } + } + + return 1; +} + +static unsigned long hugepte_addr_end(unsigned long addr, unsigned long end, + unsigned long sz) +{ + unsigned long __boundary = (addr + sz) & ~(sz-1); + return (__boundary - 1 < end - 1) ? __boundary : end; +} + +int gup_hugepd(hugepd_t *hugepd, unsigned pdshift, + unsigned long addr, unsigned long end, + int write, struct page **pages, int *nr) +{ + pte_t *ptep; + unsigned long sz = 1UL << hugepd_shift(*hugepd); + unsigned long next; + + ptep = hugepte_offset(hugepd, addr, pdshift); + do { + next = hugepte_addr_end(addr, end, sz); + if (!gup_hugepte(ptep, sz, addr, end, write, pages, nr)) + return 0; + } while (ptep++, addr = next, addr != end); + + return 1; +} unsigned long hugetlb_get_unmapped_area(struct file *file, unsigned long addr, unsigned long len, unsigned long pgoff, @@ -509,8 +468,6 @@ unsigned long hugetlb_get_unmapped_area(struct file *file, unsigned long addr, struct hstate *hstate = hstate_file(file); int mmu_psize = shift_to_mmu_psize(huge_page_shift(hstate)); - if (!mmu_huge_psizes[mmu_psize]) - return -EINVAL; return slice_get_unmapped_area(addr, len, flags, mmu_psize, 1, 0); } @@ -521,229 +478,46 @@ unsigned long vma_mmu_pagesize(struct vm_area_struct *vma) return 1UL << mmu_psize_to_shift(psize); } -/* - * Called by asm hashtable.S for doing lazy icache flush - */ -static unsigned int hash_huge_page_do_lazy_icache(unsigned long rflags, - pte_t pte, int trap, unsigned long sz) +static int __init add_huge_page_size(unsigned long long size) { - struct page *page; - int i; - - if (!pfn_valid(pte_pfn(pte))) - return rflags; - - page = pte_page(pte); - - /* page is dirty */ - if (!test_bit(PG_arch_1, &page->flags) && !PageReserved(page)) { - if (trap == 0x400) { - for (i = 0; i < (sz / PAGE_SIZE); i++) - __flush_dcache_icache(page_address(page+i)); - set_bit(PG_arch_1, &page->flags); - } else { - rflags |= HPTE_R_N; - } - } - return rflags; -} + int shift = __ffs(size); + int mmu_psize; -int hash_huge_page(struct mm_struct *mm, unsigned long access, - unsigned long ea, unsigned long vsid, int local, - unsigned long trap) -{ - pte_t *ptep; - unsigned long old_pte, new_pte; - unsigned long va, rflags, pa, sz; - long slot; - int err = 1; - int ssize = user_segment_size(ea); - unsigned int mmu_psize; - int shift; - mmu_psize = get_slice_psize(mm, ea); - - if (!mmu_huge_psizes[mmu_psize]) - goto out; - ptep = huge_pte_offset(mm, ea); - - /* Search the Linux page table for a match with va */ - va = hpt_va(ea, vsid, ssize); + /* Check that it is a page size supported by the hardware and + * that it fits within pagetable and slice limits. */ + if (!is_power_of_2(size) + || (shift > SLICE_HIGH_SHIFT) || (shift <= PAGE_SHIFT)) + return -EINVAL; - /* - * If no pte found or not present, send the problem up to - * do_page_fault - */ - if (unlikely(!ptep || pte_none(*ptep))) - goto out; + if ((mmu_psize = shift_to_mmu_psize(shift)) < 0) + return -EINVAL; - /* - * Check the user's access rights to the page. If access should be - * prevented then send the problem up to do_page_fault. +#ifdef CONFIG_SPU_FS_64K_LS + /* Disable support for 64K huge pages when 64K SPU local store + * support is enabled as the current implementation conflicts. */ - if (unlikely(access & ~pte_val(*ptep))) - goto out; - /* - * At this point, we have a pte (old_pte) which can be used to build - * or update an HPTE. There are 2 cases: - * - * 1. There is a valid (present) pte with no associated HPTE (this is - * the most common case) - * 2. There is a valid (present) pte with an associated HPTE. The - * current values of the pp bits in the HPTE prevent access - * because we are doing software DIRTY bit management and the - * page is currently not DIRTY. - */ - - - do { - old_pte = pte_val(*ptep); - if (old_pte & _PAGE_BUSY) - goto out; - new_pte = old_pte | _PAGE_BUSY | _PAGE_ACCESSED; - } while(old_pte != __cmpxchg_u64((unsigned long *)ptep, - old_pte, new_pte)); - - rflags = 0x2 | (!(new_pte & _PAGE_RW)); - /* _PAGE_EXEC -> HW_NO_EXEC since it's inverted */ - rflags |= ((new_pte & _PAGE_EXEC) ? 0 : HPTE_R_N); - shift = mmu_psize_to_shift(mmu_psize); - sz = ((1UL) << shift); - if (!cpu_has_feature(CPU_FTR_COHERENT_ICACHE)) - /* No CPU has hugepages but lacks no execute, so we - * don't need to worry about that case */ - rflags = hash_huge_page_do_lazy_icache(rflags, __pte(old_pte), - trap, sz); - - /* Check if pte already has an hpte (case 2) */ - if (unlikely(old_pte & _PAGE_HASHPTE)) { - /* There MIGHT be an HPTE for this pte */ - unsigned long hash, slot; - - hash = hpt_hash(va, shift, ssize); - if (old_pte & _PAGE_F_SECOND) - hash = ~hash; - slot = (hash & htab_hash_mask) * HPTES_PER_GROUP; - slot += (old_pte & _PAGE_F_GIX) >> 12; - - if (ppc_md.hpte_updatepp(slot, rflags, va, mmu_psize, - ssize, local) == -1) - old_pte &= ~_PAGE_HPTEFLAGS; - } - - if (likely(!(old_pte & _PAGE_HASHPTE))) { - unsigned long hash = hpt_hash(va, shift, ssize); - unsigned long hpte_group; - - pa = pte_pfn(__pte(old_pte)) << PAGE_SHIFT; - -repeat: - hpte_group = ((hash & htab_hash_mask) * - HPTES_PER_GROUP) & ~0x7UL; - - /* clear HPTE slot informations in new PTE */ -#ifdef CONFIG_PPC_64K_PAGES - new_pte = (new_pte & ~_PAGE_HPTEFLAGS) | _PAGE_HPTE_SUB0; -#else - new_pte = (new_pte & ~_PAGE_HPTEFLAGS) | _PAGE_HASHPTE; -#endif - /* Add in WIMG bits */ - rflags |= (new_pte & (_PAGE_WRITETHRU | _PAGE_NO_CACHE | - _PAGE_COHERENT | _PAGE_GUARDED)); - - /* Insert into the hash table, primary slot */ - slot = ppc_md.hpte_insert(hpte_group, va, pa, rflags, 0, - mmu_psize, ssize); - - /* Primary is full, try the secondary */ - if (unlikely(slot == -1)) { - hpte_group = ((~hash & htab_hash_mask) * - HPTES_PER_GROUP) & ~0x7UL; - slot = ppc_md.hpte_insert(hpte_group, va, pa, rflags, - HPTE_V_SECONDARY, - mmu_psize, ssize); - if (slot == -1) { - if (mftb() & 0x1) - hpte_group = ((hash & htab_hash_mask) * - HPTES_PER_GROUP)&~0x7UL; - - ppc_md.hpte_remove(hpte_group); - goto repeat; - } - } - - if (unlikely(slot == -2)) - panic("hash_huge_page: pte_insert failed\n"); - - new_pte |= (slot << 12) & (_PAGE_F_SECOND | _PAGE_F_GIX); - } + if (shift == PAGE_SHIFT_64K) + return -EINVAL; +#endif /* CONFIG_SPU_FS_64K_LS */ - /* - * No need to use ldarx/stdcx here - */ - *ptep = __pte(new_pte & ~_PAGE_BUSY); + BUG_ON(mmu_psize_defs[mmu_psize].shift != shift); - err = 0; + /* Return if huge page size has already been setup */ + if (size_to_hstate(size)) + return 0; - out: - return err; -} + hugetlb_add_hstate(shift - PAGE_SHIFT); -static void __init set_huge_psize(int psize) -{ - /* Check that it is a page size supported by the hardware and - * that it fits within pagetable limits. */ - if (mmu_psize_defs[psize].shift && - mmu_psize_defs[psize].shift < SID_SHIFT_1T && - (mmu_psize_defs[psize].shift > MIN_HUGEPTE_SHIFT || - mmu_psize_defs[psize].shift == PAGE_SHIFT_64K || - mmu_psize_defs[psize].shift == PAGE_SHIFT_16G)) { - /* Return if huge page size has already been setup or is the - * same as the base page size. */ - if (mmu_huge_psizes[psize] || - mmu_psize_defs[psize].shift == PAGE_SHIFT) - return; - if (WARN_ON(HUGEPTE_CACHE_NAME(psize) == NULL)) - return; - hugetlb_add_hstate(mmu_psize_defs[psize].shift - PAGE_SHIFT); - - switch (mmu_psize_defs[psize].shift) { - case PAGE_SHIFT_64K: - /* We only allow 64k hpages with 4k base page, - * which was checked above, and always put them - * at the PMD */ - hugepte_shift[psize] = PMD_SHIFT; - break; - case PAGE_SHIFT_16M: - /* 16M pages can be at two different levels - * of pagestables based on base page size */ - if (PAGE_SHIFT == PAGE_SHIFT_64K) - hugepte_shift[psize] = PMD_SHIFT; - else /* 4k base page */ - hugepte_shift[psize] = PUD_SHIFT; - break; - case PAGE_SHIFT_16G: - /* 16G pages are always at PGD level */ - hugepte_shift[psize] = PGDIR_SHIFT; - break; - } - hugepte_shift[psize] -= mmu_psize_defs[psize].shift; - } else - hugepte_shift[psize] = 0; + return 0; } static int __init hugepage_setup_sz(char *str) { unsigned long long size; - int mmu_psize; - int shift; size = memparse(str, &str); - shift = __ffs(size); - mmu_psize = shift_to_mmu_psize(shift); - if (mmu_psize >= 0 && mmu_psize_defs[mmu_psize].shift) - set_huge_psize(mmu_psize); - else + if (add_huge_page_size(size) != 0) printk(KERN_WARNING "Invalid huge page size specified(%llu)\n", size); return 1; @@ -752,41 +526,55 @@ __setup("hugepagesz=", hugepage_setup_sz); static int __init hugetlbpage_init(void) { - unsigned int psize; + int psize; if (!cpu_has_feature(CPU_FTR_16M_PAGE)) return -ENODEV; - /* Add supported huge page sizes. Need to change HUGE_MAX_HSTATE - * and adjust PTE_NONCACHE_NUM if the number of supported huge page - * sizes changes. - */ - set_huge_psize(MMU_PAGE_16M); - set_huge_psize(MMU_PAGE_16G); + for (psize = 0; psize < MMU_PAGE_COUNT; ++psize) { + unsigned shift; + unsigned pdshift; - /* Temporarily disable support for 64K huge pages when 64K SPU local - * store support is enabled as the current implementation conflicts. - */ -#ifndef CONFIG_SPU_FS_64K_LS - set_huge_psize(MMU_PAGE_64K); -#endif + if (!mmu_psize_defs[psize].shift) + continue; - for (psize = 0; psize < MMU_PAGE_COUNT; ++psize) { - if (mmu_huge_psizes[psize]) { - pgtable_cache[HUGE_PGTABLE_INDEX(psize)] = - kmem_cache_create( - HUGEPTE_CACHE_NAME(psize), - HUGEPTE_TABLE_SIZE(psize), - HUGEPTE_TABLE_SIZE(psize), - 0, - NULL); - if (!pgtable_cache[HUGE_PGTABLE_INDEX(psize)]) - panic("hugetlbpage_init(): could not create %s"\ - "\n", HUGEPTE_CACHE_NAME(psize)); - } + shift = mmu_psize_to_shift(psize); + + if (add_huge_page_size(1ULL << shift) < 0) + continue; + + if (shift < PMD_SHIFT) + pdshift = PMD_SHIFT; + else if (shift < PUD_SHIFT) + pdshift = PUD_SHIFT; + else + pdshift = PGDIR_SHIFT; + + pgtable_cache_add(pdshift - shift, NULL); + if (!PGT_CACHE(pdshift - shift)) + panic("hugetlbpage_init(): could not create " + "pgtable cache for %d bit pagesize\n", shift); } + /* Set default large page size. Currently, we pick 16M or 1M + * depending on what is available + */ + if (mmu_psize_defs[MMU_PAGE_16M].shift) + HPAGE_SHIFT = mmu_psize_defs[MMU_PAGE_16M].shift; + else if (mmu_psize_defs[MMU_PAGE_1M].shift) + HPAGE_SHIFT = mmu_psize_defs[MMU_PAGE_1M].shift; + return 0; } module_init(hugetlbpage_init); + +void flush_dcache_icache_hugepage(struct page *page) +{ + int i; + + BUG_ON(!PageCompound(page)); + + for (i = 0; i < (1UL << compound_order(page)); i++) + __flush_dcache_icache(page_address(page+i)); +} diff --git a/arch/powerpc/mm/init_64.c b/arch/powerpc/mm/init_64.c index 335c578b9cc3..776f28d02b6b 100644 --- a/arch/powerpc/mm/init_64.c +++ b/arch/powerpc/mm/init_64.c @@ -41,6 +41,7 @@ #include <linux/module.h> #include <linux/poison.h> #include <linux/lmb.h> +#include <linux/hugetlb.h> #include <asm/pgalloc.h> #include <asm/page.h> @@ -119,30 +120,63 @@ static void pmd_ctor(void *addr) memset(addr, 0, PMD_TABLE_SIZE); } -static const unsigned int pgtable_cache_size[2] = { - PGD_TABLE_SIZE, PMD_TABLE_SIZE -}; -static const char *pgtable_cache_name[ARRAY_SIZE(pgtable_cache_size)] = { -#ifdef CONFIG_PPC_64K_PAGES - "pgd_cache", "pmd_cache", -#else - "pgd_cache", "pud_pmd_cache", -#endif /* CONFIG_PPC_64K_PAGES */ -}; - -#ifdef CONFIG_HUGETLB_PAGE -/* Hugepages need an extra cache per hugepagesize, initialized in - * hugetlbpage.c. We can't put into the tables above, because HPAGE_SHIFT - * is not compile time constant. */ -struct kmem_cache *pgtable_cache[ARRAY_SIZE(pgtable_cache_size)+MMU_PAGE_COUNT]; -#else -struct kmem_cache *pgtable_cache[ARRAY_SIZE(pgtable_cache_size)]; -#endif +struct kmem_cache *pgtable_cache[MAX_PGTABLE_INDEX_SIZE]; + +/* + * Create a kmem_cache() for pagetables. This is not used for PTE + * pages - they're linked to struct page, come from the normal free + * pages pool and have a different entry size (see real_pte_t) to + * everything else. Caches created by this function are used for all + * the higher level pagetables, and for hugepage pagetables. + */ +void pgtable_cache_add(unsigned shift, void (*ctor)(void *)) +{ + char *name; + unsigned long table_size = sizeof(void *) << shift; + unsigned long align = table_size; + + /* When batching pgtable pointers for RCU freeing, we store + * the index size in the low bits. Table alignment must be + * big enough to fit it. + * + * Likewise, hugeapge pagetable pointers contain a (different) + * shift value in the low bits. All tables must be aligned so + * as to leave enough 0 bits in the address to contain it. */ + unsigned long minalign = max(MAX_PGTABLE_INDEX_SIZE + 1, + HUGEPD_SHIFT_MASK + 1); + struct kmem_cache *new; + + /* It would be nice if this was a BUILD_BUG_ON(), but at the + * moment, gcc doesn't seem to recognize is_power_of_2 as a + * constant expression, so so much for that. */ + BUG_ON(!is_power_of_2(minalign)); + BUG_ON((shift < 1) || (shift > MAX_PGTABLE_INDEX_SIZE)); + + if (PGT_CACHE(shift)) + return; /* Already have a cache of this size */ + + align = max_t(unsigned long, align, minalign); + name = kasprintf(GFP_KERNEL, "pgtable-2^%d", shift); + new = kmem_cache_create(name, table_size, align, 0, ctor); + PGT_CACHE(shift) = new; + + pr_debug("Allocated pgtable cache for order %d\n", shift); +} + void pgtable_cache_init(void) { - pgtable_cache[0] = kmem_cache_create(pgtable_cache_name[0], PGD_TABLE_SIZE, PGD_TABLE_SIZE, SLAB_PANIC, pgd_ctor); - pgtable_cache[1] = kmem_cache_create(pgtable_cache_name[1], PMD_TABLE_SIZE, PMD_TABLE_SIZE, SLAB_PANIC, pmd_ctor); + pgtable_cache_add(PGD_INDEX_SIZE, pgd_ctor); + pgtable_cache_add(PMD_INDEX_SIZE, pmd_ctor); + if (!PGT_CACHE(PGD_INDEX_SIZE) || !PGT_CACHE(PMD_INDEX_SIZE)) + panic("Couldn't allocate pgtable caches"); + + /* In all current configs, when the PUD index exists it's the + * same size as either the pgd or pmd index. Verify that the + * initialization above has also created a PUD cache. This + * will need re-examiniation if we add new possibilities for + * the pagetable layout. */ + BUG_ON(PUD_INDEX_SIZE && !PGT_CACHE(PUD_INDEX_SIZE)); } #ifdef CONFIG_SPARSEMEM_VMEMMAP diff --git a/arch/powerpc/mm/mem.c b/arch/powerpc/mm/mem.c index 59736317bf0e..b9b152558f9c 100644 --- a/arch/powerpc/mm/mem.c +++ b/arch/powerpc/mm/mem.c @@ -32,6 +32,7 @@ #include <linux/pagemap.h> #include <linux/suspend.h> #include <linux/lmb.h> +#include <linux/hugetlb.h> #include <asm/pgalloc.h> #include <asm/prom.h> @@ -417,18 +418,26 @@ EXPORT_SYMBOL(flush_dcache_page); void flush_dcache_icache_page(struct page *page) { +#ifdef CONFIG_HUGETLB_PAGE + if (PageCompound(page)) { + flush_dcache_icache_hugepage(page); + return; + } +#endif #ifdef CONFIG_BOOKE - void *start = kmap_atomic(page, KM_PPC_SYNC_ICACHE); - __flush_dcache_icache(start); - kunmap_atomic(start, KM_PPC_SYNC_ICACHE); + { + void *start = kmap_atomic(page, KM_PPC_SYNC_ICACHE); + __flush_dcache_icache(start); + kunmap_atomic(start, KM_PPC_SYNC_ICACHE); + } #elif defined(CONFIG_8xx) || defined(CONFIG_PPC64) /* On 8xx there is no need to kmap since highmem is not supported */ __flush_dcache_icache(page_address(page)); #else __flush_dcache_icache_phys(page_to_pfn(page) << PAGE_SHIFT); #endif - } + void clear_user_page(void *page, unsigned long vaddr, struct page *pg) { clear_page(page); diff --git a/arch/powerpc/mm/mmu_context_hash64.c b/arch/powerpc/mm/mmu_context_hash64.c index dbeb86ac90cd..b910d37aea1a 100644 --- a/arch/powerpc/mm/mmu_context_hash64.c +++ b/arch/powerpc/mm/mmu_context_hash64.c @@ -18,6 +18,7 @@ #include <linux/mm.h> #include <linux/spinlock.h> #include <linux/idr.h> +#include <linux/module.h> #include <asm/mmu_context.h> @@ -32,7 +33,7 @@ static DEFINE_IDR(mmu_context_idr); #define NO_CONTEXT 0 #define MAX_CONTEXT ((1UL << 19) - 1) -int init_new_context(struct task_struct *tsk, struct mm_struct *mm) +int __init_new_context(void) { int index; int err; @@ -57,22 +58,41 @@ again: return -ENOMEM; } + return index; +} +EXPORT_SYMBOL_GPL(__init_new_context); + +int init_new_context(struct task_struct *tsk, struct mm_struct *mm) +{ + int index; + + index = __init_new_context(); + if (index < 0) + return index; + /* The old code would re-promote on fork, we don't do that * when using slices as it could cause problem promoting slices * that have been forced down to 4K */ if (slice_mm_new_context(mm)) slice_set_user_psize(mm, mmu_virtual_psize); + subpage_prot_init_new_context(mm); mm->context.id = index; return 0; } -void destroy_context(struct mm_struct *mm) +void __destroy_context(int context_id) { spin_lock(&mmu_context_lock); - idr_remove(&mmu_context_idr, mm->context.id); + idr_remove(&mmu_context_idr, context_id); spin_unlock(&mmu_context_lock); +} +EXPORT_SYMBOL_GPL(__destroy_context); +void destroy_context(struct mm_struct *mm) +{ + __destroy_context(mm->context.id); + subpage_prot_free(mm); mm->context.id = NO_CONTEXT; } diff --git a/arch/powerpc/mm/mmu_decl.h b/arch/powerpc/mm/mmu_decl.h index d2e5321d5ea6..e27a990af42d 100644 --- a/arch/powerpc/mm/mmu_decl.h +++ b/arch/powerpc/mm/mmu_decl.h @@ -98,21 +98,10 @@ extern void _tlbia(void); #ifdef CONFIG_PPC32 -struct tlbcam { - u32 MAS0; - u32 MAS1; - u32 MAS2; - u32 MAS3; - u32 MAS7; -}; - extern void mapin_ram(void); extern int map_page(unsigned long va, phys_addr_t pa, int flags); extern void setbat(int index, unsigned long virt, phys_addr_t phys, unsigned int size, int flags); -extern void settlbcam(int index, unsigned long virt, phys_addr_t phys, - unsigned int size, int flags, unsigned int pid); -extern void invalidate_tlbcam_entry(int index); extern int __map_without_bats; extern unsigned long ioremap_base; diff --git a/arch/powerpc/mm/pgtable.c b/arch/powerpc/mm/pgtable.c index 53040931de32..99df697c601a 100644 --- a/arch/powerpc/mm/pgtable.c +++ b/arch/powerpc/mm/pgtable.c @@ -49,12 +49,12 @@ struct pte_freelist_batch { struct rcu_head rcu; unsigned int index; - pgtable_free_t tables[0]; + unsigned long tables[0]; }; #define PTE_FREELIST_SIZE \ ((PAGE_SIZE - sizeof(struct pte_freelist_batch)) \ - / sizeof(pgtable_free_t)) + / sizeof(unsigned long)) static void pte_free_smp_sync(void *arg) { @@ -64,13 +64,13 @@ static void pte_free_smp_sync(void *arg) /* This is only called when we are critically out of memory * (and fail to get a page in pte_free_tlb). */ -static void pgtable_free_now(pgtable_free_t pgf) +static void pgtable_free_now(void *table, unsigned shift) { pte_freelist_forced_free++; smp_call_function(pte_free_smp_sync, NULL, 1); - pgtable_free(pgf); + pgtable_free(table, shift); } static void pte_free_rcu_callback(struct rcu_head *head) @@ -79,8 +79,12 @@ static void pte_free_rcu_callback(struct rcu_head *head) container_of(head, struct pte_freelist_batch, rcu); unsigned int i; - for (i = 0; i < batch->index; i++) - pgtable_free(batch->tables[i]); + for (i = 0; i < batch->index; i++) { + void *table = (void *)(batch->tables[i] & ~MAX_PGTABLE_INDEX_SIZE); + unsigned shift = batch->tables[i] & MAX_PGTABLE_INDEX_SIZE; + + pgtable_free(table, shift); + } free_page((unsigned long)batch); } @@ -91,25 +95,28 @@ static void pte_free_submit(struct pte_freelist_batch *batch) call_rcu(&batch->rcu, pte_free_rcu_callback); } -void pgtable_free_tlb(struct mmu_gather *tlb, pgtable_free_t pgf) +void pgtable_free_tlb(struct mmu_gather *tlb, void *table, unsigned shift) { /* This is safe since tlb_gather_mmu has disabled preemption */ struct pte_freelist_batch **batchp = &__get_cpu_var(pte_freelist_cur); + unsigned long pgf; if (atomic_read(&tlb->mm->mm_users) < 2 || cpumask_equal(mm_cpumask(tlb->mm), cpumask_of(smp_processor_id()))){ - pgtable_free(pgf); + pgtable_free(table, shift); return; } if (*batchp == NULL) { *batchp = (struct pte_freelist_batch *)__get_free_page(GFP_ATOMIC); if (*batchp == NULL) { - pgtable_free_now(pgf); + pgtable_free_now(table, shift); return; } (*batchp)->index = 0; } + BUG_ON(shift > MAX_PGTABLE_INDEX_SIZE); + pgf = (unsigned long)table | shift; (*batchp)->tables[(*batchp)->index++] = pgf; if ((*batchp)->index == PTE_FREELIST_SIZE) { pte_free_submit(*batchp); diff --git a/arch/powerpc/mm/subpage-prot.c b/arch/powerpc/mm/subpage-prot.c index 4cafc0c33d0a..a040b81e93bd 100644 --- a/arch/powerpc/mm/subpage-prot.c +++ b/arch/powerpc/mm/subpage-prot.c @@ -24,9 +24,9 @@ * Also makes sure that the subpage_prot_table structure is * reinitialized for the next user. */ -void subpage_prot_free(pgd_t *pgd) +void subpage_prot_free(struct mm_struct *mm) { - struct subpage_prot_table *spt = pgd_subpage_prot(pgd); + struct subpage_prot_table *spt = &mm->context.spt; unsigned long i, j, addr; u32 **p; @@ -51,6 +51,13 @@ void subpage_prot_free(pgd_t *pgd) spt->maxaddr = 0; } +void subpage_prot_init_new_context(struct mm_struct *mm) +{ + struct subpage_prot_table *spt = &mm->context.spt; + + memset(spt, 0, sizeof(*spt)); +} + static void hpte_flush_range(struct mm_struct *mm, unsigned long addr, int npages) { @@ -87,7 +94,7 @@ static void hpte_flush_range(struct mm_struct *mm, unsigned long addr, static void subpage_prot_clear(unsigned long addr, unsigned long len) { struct mm_struct *mm = current->mm; - struct subpage_prot_table *spt = pgd_subpage_prot(mm->pgd); + struct subpage_prot_table *spt = &mm->context.spt; u32 **spm, *spp; int i, nw; unsigned long next, limit; @@ -136,7 +143,7 @@ static void subpage_prot_clear(unsigned long addr, unsigned long len) long sys_subpage_prot(unsigned long addr, unsigned long len, u32 __user *map) { struct mm_struct *mm = current->mm; - struct subpage_prot_table *spt = pgd_subpage_prot(mm->pgd); + struct subpage_prot_table *spt = &mm->context.spt; u32 **spm, *spp; int i, nw; unsigned long next, limit; diff --git a/arch/powerpc/mm/tlb_hash64.c b/arch/powerpc/mm/tlb_hash64.c index 2b2f35f6985e..282d9306361f 100644 --- a/arch/powerpc/mm/tlb_hash64.c +++ b/arch/powerpc/mm/tlb_hash64.c @@ -53,11 +53,6 @@ void hpte_need_flush(struct mm_struct *mm, unsigned long addr, i = batch->index; - /* We mask the address for the base page size. Huge pages will - * have applied their own masking already - */ - addr &= PAGE_MASK; - /* Get page size (maybe move back to caller). * * NOTE: when using special 64K mappings in 4K environment like @@ -75,6 +70,9 @@ void hpte_need_flush(struct mm_struct *mm, unsigned long addr, } else psize = pte_pagesize_index(mm, addr, pte); + /* Mask the address for the correct page size */ + addr &= ~((1UL << mmu_psize_defs[psize].shift) - 1); + /* Build full vaddr */ if (!is_kernel_addr(addr)) { ssize = user_segment_size(addr); |