diff options
Diffstat (limited to 'arch/powerpc/mm')
-rw-r--r-- | arch/powerpc/mm/Makefile | 1 | ||||
-rw-r--r-- | arch/powerpc/mm/fsl_booke_mmu.c | 43 | ||||
-rw-r--r-- | arch/powerpc/mm/gup.c | 12 | ||||
-rw-r--r-- | arch/powerpc/mm/hash_utils_64.c | 9 | ||||
-rw-r--r-- | arch/powerpc/mm/hugetlbpage-book3e.c | 121 | ||||
-rw-r--r-- | arch/powerpc/mm/hugetlbpage.c | 400 | ||||
-rw-r--r-- | arch/powerpc/mm/init_32.c | 9 | ||||
-rw-r--r-- | arch/powerpc/mm/mem.c | 8 | ||||
-rw-r--r-- | arch/powerpc/mm/mmu_context_hash64.c | 12 | ||||
-rw-r--r-- | arch/powerpc/mm/mmu_context_nohash.c | 5 | ||||
-rw-r--r-- | arch/powerpc/mm/mmu_decl.h | 2 | ||||
-rw-r--r-- | arch/powerpc/mm/numa.c | 20 | ||||
-rw-r--r-- | arch/powerpc/mm/pgtable.c | 3 | ||||
-rw-r--r-- | arch/powerpc/mm/tlb_low_64e.S | 24 | ||||
-rw-r--r-- | arch/powerpc/mm/tlb_nohash.c | 67 |
15 files changed, 619 insertions, 117 deletions
diff --git a/arch/powerpc/mm/Makefile b/arch/powerpc/mm/Makefile index bdca46e08382..991ee813d2a8 100644 --- a/arch/powerpc/mm/Makefile +++ b/arch/powerpc/mm/Makefile @@ -29,6 +29,7 @@ obj-$(CONFIG_PPC_MM_SLICES) += slice.o ifeq ($(CONFIG_HUGETLB_PAGE),y) obj-y += hugetlbpage.o obj-$(CONFIG_PPC_STD_MMU_64) += hugetlbpage-hash64.o +obj-$(CONFIG_PPC_BOOK3E_MMU) += hugetlbpage-book3e.o endif obj-$(CONFIG_PPC_SUBPAGE_PROT) += subpage-prot.o obj-$(CONFIG_NOT_COHERENT_CACHE) += dma-noncoherent.o diff --git a/arch/powerpc/mm/fsl_booke_mmu.c b/arch/powerpc/mm/fsl_booke_mmu.c index f7802c8bba0a..66a6fd38e9cd 100644 --- a/arch/powerpc/mm/fsl_booke_mmu.c +++ b/arch/powerpc/mm/fsl_booke_mmu.c @@ -101,17 +101,17 @@ unsigned long p_mapped_by_tlbcam(phys_addr_t pa) /* * Set up a variable-size TLB entry (tlbcam). The parameters are not checked; - * in particular size must be a power of 4 between 4k and 256M (or 1G, for cpus - * that support extended page sizes). Note that while some cpus support a - * page size of 4G, we don't allow its use here. + * in particular size must be a power of 4 between 4k and the max supported by + * an implementation; max may further be limited by what can be represented in + * an unsigned long (for example, 32-bit implementations cannot support a 4GB + * size). */ static void settlbcam(int index, unsigned long virt, phys_addr_t phys, unsigned long size, unsigned long flags, unsigned int pid) { - unsigned int tsize, lz; + unsigned int tsize; - asm (PPC_CNTLZL "%0,%1" : "=r" (lz) : "r" (size)); - tsize = 21 - lz; + tsize = __ilog2(size) - 10; #ifdef CONFIG_SMP if ((flags & _PAGE_NO_CACHE) == 0) @@ -146,29 +146,36 @@ static void settlbcam(int index, unsigned long virt, phys_addr_t phys, loadcam_entry(index); } +unsigned long calc_cam_sz(unsigned long ram, unsigned long virt, + phys_addr_t phys) +{ + unsigned int camsize = __ilog2(ram) & ~1U; + unsigned int align = __ffs(virt | phys) & ~1U; + unsigned long max_cam = (mfspr(SPRN_TLB1CFG) >> 16) & 0xf; + + /* Convert (4^max) kB to (2^max) bytes */ + max_cam = max_cam * 2 + 10; + + if (camsize > align) + camsize = align; + if (camsize > max_cam) + camsize = max_cam; + + return 1UL << camsize; +} + unsigned long map_mem_in_cams(unsigned long ram, int max_cam_idx) { int i; unsigned long virt = PAGE_OFFSET; phys_addr_t phys = memstart_addr; unsigned long amount_mapped = 0; - unsigned long max_cam = (mfspr(SPRN_TLB1CFG) >> 16) & 0xf; - - /* Convert (4^max) kB to (2^max) bytes */ - max_cam = max_cam * 2 + 10; /* Calculate CAM values */ for (i = 0; ram && i < max_cam_idx; i++) { - unsigned int camsize = __ilog2(ram) & ~1U; - unsigned int align = __ffs(virt | phys) & ~1U; unsigned long cam_sz; - if (camsize > align) - camsize = align; - if (camsize > max_cam) - camsize = max_cam; - - cam_sz = 1UL << camsize; + cam_sz = calc_cam_sz(ram, virt, phys); settlbcam(i, virt, phys, cam_sz, PAGE_KERNEL_X, 0); ram -= cam_sz; diff --git a/arch/powerpc/mm/gup.c b/arch/powerpc/mm/gup.c index fec13200868f..d7efdbf640c7 100644 --- a/arch/powerpc/mm/gup.c +++ b/arch/powerpc/mm/gup.c @@ -16,16 +16,6 @@ #ifdef __HAVE_ARCH_PTE_SPECIAL -static inline void get_huge_page_tail(struct page *page) -{ - /* - * __split_huge_page_refcount() cannot run - * from under us. - */ - VM_BUG_ON(atomic_read(&page->_count) < 0); - atomic_inc(&page->_count); -} - /* * The performance critical leaf functions are made noinline otherwise gcc * inlines everything into a single function which results in too much @@ -57,8 +47,6 @@ static noinline int gup_pte_range(pmd_t pmd, unsigned long addr, put_page(page); return 0; } - if (PageTail(page)) - get_huge_page_tail(page); pages[*nr] = page; (*nr)++; diff --git a/arch/powerpc/mm/hash_utils_64.c b/arch/powerpc/mm/hash_utils_64.c index 8352db050192..2d282186cb45 100644 --- a/arch/powerpc/mm/hash_utils_64.c +++ b/arch/powerpc/mm/hash_utils_64.c @@ -106,9 +106,6 @@ int mmu_kernel_ssize = MMU_SEGSIZE_256M; int mmu_highuser_ssize = MMU_SEGSIZE_256M; u16 mmu_slb_size = 64; EXPORT_SYMBOL_GPL(mmu_slb_size); -#ifdef CONFIG_HUGETLB_PAGE -unsigned int HPAGE_SHIFT; -#endif #ifdef CONFIG_PPC_64K_PAGES int mmu_ci_restrictions; #endif @@ -535,11 +532,11 @@ static unsigned long __init htab_get_table_size(void) } #ifdef CONFIG_MEMORY_HOTPLUG -void create_section_mapping(unsigned long start, unsigned long end) +int create_section_mapping(unsigned long start, unsigned long end) { - BUG_ON(htab_bolt_mapping(start, end, __pa(start), + return htab_bolt_mapping(start, end, __pa(start), pgprot_val(PAGE_KERNEL), mmu_linear_psize, - mmu_kernel_ssize)); + mmu_kernel_ssize); } int remove_section_mapping(unsigned long start, unsigned long end) diff --git a/arch/powerpc/mm/hugetlbpage-book3e.c b/arch/powerpc/mm/hugetlbpage-book3e.c new file mode 100644 index 000000000000..343ad0b87261 --- /dev/null +++ b/arch/powerpc/mm/hugetlbpage-book3e.c @@ -0,0 +1,121 @@ +/* + * PPC Huge TLB Page Support for Book3E MMU + * + * Copyright (C) 2009 David Gibson, IBM Corporation. + * Copyright (C) 2011 Becky Bruce, Freescale Semiconductor + * + */ +#include <linux/mm.h> +#include <linux/hugetlb.h> + +static inline int mmu_get_tsize(int psize) +{ + return mmu_psize_defs[psize].enc; +} + +static inline int book3e_tlb_exists(unsigned long ea, unsigned long pid) +{ + int found = 0; + + mtspr(SPRN_MAS6, pid << 16); + if (mmu_has_feature(MMU_FTR_USE_TLBRSRV)) { + asm volatile( + "li %0,0\n" + "tlbsx. 0,%1\n" + "bne 1f\n" + "li %0,1\n" + "1:\n" + : "=&r"(found) : "r"(ea)); + } else { + asm volatile( + "tlbsx 0,%1\n" + "mfspr %0,0x271\n" + "srwi %0,%0,31\n" + : "=&r"(found) : "r"(ea)); + } + + return found; +} + +void book3e_hugetlb_preload(struct mm_struct *mm, unsigned long ea, pte_t pte) +{ + unsigned long mas1, mas2; + u64 mas7_3; + unsigned long psize, tsize, shift; + unsigned long flags; + +#ifdef CONFIG_PPC_FSL_BOOK3E + int index, lz, ncams; + struct vm_area_struct *vma; +#endif + + if (unlikely(is_kernel_addr(ea))) + return; + +#ifdef CONFIG_PPC_MM_SLICES + psize = mmu_get_tsize(get_slice_psize(mm, ea)); + tsize = mmu_get_psize(psize); + shift = mmu_psize_defs[psize].shift; +#else + vma = find_vma(mm, ea); + psize = vma_mmu_pagesize(vma); /* returns actual size in bytes */ + asm (PPC_CNTLZL "%0,%1" : "=r" (lz) : "r" (psize)); + shift = 31 - lz; + tsize = 21 - lz; +#endif + + /* + * We can't be interrupted while we're setting up the MAS + * regusters or after we've confirmed that no tlb exists. + */ + local_irq_save(flags); + + if (unlikely(book3e_tlb_exists(ea, mm->context.id))) { + local_irq_restore(flags); + return; + } + +#ifdef CONFIG_PPC_FSL_BOOK3E + ncams = mfspr(SPRN_TLB1CFG) & TLBnCFG_N_ENTRY; + + /* We have to use the CAM(TLB1) on FSL parts for hugepages */ + index = __get_cpu_var(next_tlbcam_idx); + mtspr(SPRN_MAS0, MAS0_ESEL(index) | MAS0_TLBSEL(1)); + + /* Just round-robin the entries and wrap when we hit the end */ + if (unlikely(index == ncams - 1)) + __get_cpu_var(next_tlbcam_idx) = tlbcam_index; + else + __get_cpu_var(next_tlbcam_idx)++; +#endif + mas1 = MAS1_VALID | MAS1_TID(mm->context.id) | MAS1_TSIZE(tsize); + mas2 = ea & ~((1UL << shift) - 1); + mas2 |= (pte_val(pte) >> PTE_WIMGE_SHIFT) & MAS2_WIMGE_MASK; + mas7_3 = (u64)pte_pfn(pte) << PAGE_SHIFT; + mas7_3 |= (pte_val(pte) >> PTE_BAP_SHIFT) & MAS3_BAP_MASK; + if (!pte_dirty(pte)) + mas7_3 &= ~(MAS3_SW|MAS3_UW); + + mtspr(SPRN_MAS1, mas1); + mtspr(SPRN_MAS2, mas2); + + if (mmu_has_feature(MMU_FTR_USE_PAIRED_MAS)) { + mtspr(SPRN_MAS7_MAS3, mas7_3); + } else { + mtspr(SPRN_MAS7, upper_32_bits(mas7_3)); + mtspr(SPRN_MAS3, lower_32_bits(mas7_3)); + } + + asm volatile ("tlbwe"); + + local_irq_restore(flags); +} + +void flush_hugetlb_page(struct vm_area_struct *vma, unsigned long vmaddr) +{ + struct hstate *hstate = hstate_file(vma->vm_file); + unsigned long tsize = huge_page_shift(hstate) - 10; + + __flush_tlb_page(vma ? vma->vm_mm : NULL, vmaddr, tsize, 0); + +} diff --git a/arch/powerpc/mm/hugetlbpage.c b/arch/powerpc/mm/hugetlbpage.c index 0b9a5c1901b9..5964371303ac 100644 --- a/arch/powerpc/mm/hugetlbpage.c +++ b/arch/powerpc/mm/hugetlbpage.c @@ -1,7 +1,8 @@ /* - * PPC64 (POWER4) Huge TLB Page Support for Kernel. + * PPC Huge TLB Page Support for Kernel. * * Copyright (C) 2003 David Gibson, IBM Corporation. + * Copyright (C) 2011 Becky Bruce, Freescale Semiconductor * * Based on the IA-32 version: * Copyright (C) 2002, Rohit Seth <rohit.seth@intel.com> @@ -11,24 +12,39 @@ #include <linux/io.h> #include <linux/slab.h> #include <linux/hugetlb.h> +#include <linux/of_fdt.h> +#include <linux/memblock.h> +#include <linux/bootmem.h> #include <asm/pgtable.h> #include <asm/pgalloc.h> #include <asm/tlb.h> +#include <asm/setup.h> #define PAGE_SHIFT_64K 16 #define PAGE_SHIFT_16M 24 #define PAGE_SHIFT_16G 34 -#define MAX_NUMBER_GPAGES 1024 +unsigned int HPAGE_SHIFT; -/* Tracks the 16G pages after the device tree is scanned and before the - * huge_boot_pages list is ready. */ -static unsigned long gpage_freearray[MAX_NUMBER_GPAGES]; +/* + * Tracks gpages after the device tree is scanned and before the + * huge_boot_pages list is ready. On 64-bit implementations, this is + * just used to track 16G pages and so is a single array. 32-bit + * implementations may have more than one gpage size due to limitations + * of the memory allocators, so we need multiple arrays + */ +#ifdef CONFIG_PPC64 +#define MAX_NUMBER_GPAGES 1024 +static u64 gpage_freearray[MAX_NUMBER_GPAGES]; static unsigned nr_gpages; - -/* Flag to mark huge PD pointers. This means pmd_bad() and pud_bad() - * will choke on pointers to hugepte tables, which is handy for - * catching screwups early. */ +#else +#define MAX_NUMBER_GPAGES 128 +struct psize_gpages { + u64 gpage_list[MAX_NUMBER_GPAGES]; + unsigned int nr_gpages; +}; +static struct psize_gpages gpage_freearray[MMU_PAGE_COUNT]; +#endif static inline int shift_to_mmu_psize(unsigned int shift) { @@ -49,25 +65,6 @@ static inline unsigned int mmu_psize_to_shift(unsigned int mmu_psize) #define hugepd_none(hpd) ((hpd).pd == 0) -static inline pte_t *hugepd_page(hugepd_t hpd) -{ - BUG_ON(!hugepd_ok(hpd)); - return (pte_t *)((hpd.pd & ~HUGEPD_SHIFT_MASK) | 0xc000000000000000); -} - -static inline unsigned int hugepd_shift(hugepd_t hpd) -{ - return hpd.pd & HUGEPD_SHIFT_MASK; -} - -static inline pte_t *hugepte_offset(hugepd_t *hpdp, unsigned long addr, unsigned pdshift) -{ - unsigned long idx = (addr & ((1UL << pdshift) - 1)) >> hugepd_shift(*hpdp); - pte_t *dir = hugepd_page(*hpdp); - - return dir + idx; -} - pte_t *find_linux_pte_or_hugepte(pgd_t *pgdir, unsigned long ea, unsigned *shift) { pgd_t *pg; @@ -93,7 +90,7 @@ pte_t *find_linux_pte_or_hugepte(pgd_t *pgdir, unsigned long ea, unsigned *shift if (is_hugepd(pm)) hpdp = (hugepd_t *)pm; else if (!pmd_none(*pm)) { - return pte_offset_map(pm, ea); + return pte_offset_kernel(pm, ea); } } } @@ -114,8 +111,18 @@ pte_t *huge_pte_offset(struct mm_struct *mm, unsigned long addr) static int __hugepte_alloc(struct mm_struct *mm, hugepd_t *hpdp, unsigned long address, unsigned pdshift, unsigned pshift) { - pte_t *new = kmem_cache_zalloc(PGT_CACHE(pdshift - pshift), - GFP_KERNEL|__GFP_REPEAT); + struct kmem_cache *cachep; + pte_t *new; + +#ifdef CONFIG_PPC64 + cachep = PGT_CACHE(pdshift - pshift); +#else + int i; + int num_hugepd = 1 << (pshift - pdshift); + cachep = hugepte_cache; +#endif + + new = kmem_cache_zalloc(cachep, GFP_KERNEL|__GFP_REPEAT); BUG_ON(pshift > HUGEPD_SHIFT_MASK); BUG_ON((unsigned long)new & HUGEPD_SHIFT_MASK); @@ -124,10 +131,31 @@ static int __hugepte_alloc(struct mm_struct *mm, hugepd_t *hpdp, return -ENOMEM; spin_lock(&mm->page_table_lock); +#ifdef CONFIG_PPC64 if (!hugepd_none(*hpdp)) - kmem_cache_free(PGT_CACHE(pdshift - pshift), new); + kmem_cache_free(cachep, new); else - hpdp->pd = ((unsigned long)new & ~0x8000000000000000) | pshift; + hpdp->pd = ((unsigned long)new & ~PD_HUGE) | pshift; +#else + /* + * We have multiple higher-level entries that point to the same + * actual pte location. Fill in each as we go and backtrack on error. + * We need all of these so the DTLB pgtable walk code can find the + * right higher-level entry without knowing if it's a hugepage or not. + */ + for (i = 0; i < num_hugepd; i++, hpdp++) { + if (unlikely(!hugepd_none(*hpdp))) + break; + else + hpdp->pd = ((unsigned long)new & ~PD_HUGE) | pshift; + } + /* If we bailed from the for loop early, an error occurred, clean up */ + if (i < num_hugepd) { + for (i = i - 1 ; i >= 0; i--, hpdp--) + hpdp->pd = 0; + kmem_cache_free(cachep, new); + } +#endif spin_unlock(&mm->page_table_lock); return 0; } @@ -169,11 +197,132 @@ pte_t *huge_pte_alloc(struct mm_struct *mm, unsigned long addr, unsigned long sz return hugepte_offset(hpdp, addr, pdshift); } +#ifdef CONFIG_PPC32 +/* Build list of addresses of gigantic pages. This function is used in early + * boot before the buddy or bootmem allocator is setup. + */ +void add_gpage(u64 addr, u64 page_size, unsigned long number_of_pages) +{ + unsigned int idx = shift_to_mmu_psize(__ffs(page_size)); + int i; + + if (addr == 0) + return; + + gpage_freearray[idx].nr_gpages = number_of_pages; + + for (i = 0; i < number_of_pages; i++) { + gpage_freearray[idx].gpage_list[i] = addr; + addr += page_size; + } +} + +/* + * Moves the gigantic page addresses from the temporary list to the + * huge_boot_pages list. + */ +int alloc_bootmem_huge_page(struct hstate *hstate) +{ + struct huge_bootmem_page *m; + int idx = shift_to_mmu_psize(hstate->order + PAGE_SHIFT); + int nr_gpages = gpage_freearray[idx].nr_gpages; + + if (nr_gpages == 0) + return 0; + +#ifdef CONFIG_HIGHMEM + /* + * If gpages can be in highmem we can't use the trick of storing the + * data structure in the page; allocate space for this + */ + m = alloc_bootmem(sizeof(struct huge_bootmem_page)); + m->phys = gpage_freearray[idx].gpage_list[--nr_gpages]; +#else + m = phys_to_virt(gpage_freearray[idx].gpage_list[--nr_gpages]); +#endif + + list_add(&m->list, &huge_boot_pages); + gpage_freearray[idx].nr_gpages = nr_gpages; + gpage_freearray[idx].gpage_list[nr_gpages] = 0; + m->hstate = hstate; + + return 1; +} +/* + * Scan the command line hugepagesz= options for gigantic pages; store those in + * a list that we use to allocate the memory once all options are parsed. + */ + +unsigned long gpage_npages[MMU_PAGE_COUNT]; + +static int __init do_gpage_early_setup(char *param, char *val) +{ + static phys_addr_t size; + unsigned long npages; + + /* + * The hugepagesz and hugepages cmdline options are interleaved. We + * use the size variable to keep track of whether or not this was done + * properly and skip over instances where it is incorrect. Other + * command-line parsing code will issue warnings, so we don't need to. + * + */ + if ((strcmp(param, "default_hugepagesz") == 0) || + (strcmp(param, "hugepagesz") == 0)) { + size = memparse(val, NULL); + } else if (strcmp(param, "hugepages") == 0) { + if (size != 0) { + if (sscanf(val, "%lu", &npages) <= 0) + npages = 0; + gpage_npages[shift_to_mmu_psize(__ffs(size))] = npages; + size = 0; + } + } + return 0; +} + + +/* + * This function allocates physical space for pages that are larger than the + * buddy allocator can handle. We want to allocate these in highmem because + * the amount of lowmem is limited. This means that this function MUST be + * called before lowmem_end_addr is set up in MMU_init() in order for the lmb + * allocate to grab highmem. + */ +void __init reserve_hugetlb_gpages(void) +{ + static __initdata char cmdline[COMMAND_LINE_SIZE]; + phys_addr_t size, base; + int i; + + strlcpy(cmdline, boot_command_line, COMMAND_LINE_SIZE); + parse_args("hugetlb gpages", cmdline, NULL, 0, &do_gpage_early_setup); + + /* + * Walk gpage list in reverse, allocating larger page sizes first. + * Skip over unsupported sizes, or sizes that have 0 gpages allocated. + * When we reach the point in the list where pages are no longer + * considered gpages, we're done. + */ + for (i = MMU_PAGE_COUNT-1; i >= 0; i--) { + if (mmu_psize_defs[i].shift == 0 || gpage_npages[i] == 0) + continue; + else if (mmu_psize_to_shift(i) < (MAX_ORDER + PAGE_SHIFT)) + break; + + size = (phys_addr_t)(1ULL << mmu_psize_to_shift(i)); + base = memblock_alloc_base(size * gpage_npages[i], size, + MEMBLOCK_ALLOC_ANYWHERE); + add_gpage(base, size, gpage_npages[i]); + } +} + +#else /* PPC64 */ + /* Build list of addresses of gigantic pages. This function is used in early * boot before the buddy or bootmem allocator is setup. */ -void add_gpage(unsigned long addr, unsigned long page_size, - unsigned long number_of_pages) +void add_gpage(u64 addr, u64 page_size, unsigned long number_of_pages) { if (!addr) return; @@ -199,19 +348,79 @@ int alloc_bootmem_huge_page(struct hstate *hstate) m->hstate = hstate; return 1; } +#endif int huge_pmd_unshare(struct mm_struct *mm, unsigned long *addr, pte_t *ptep) { return 0; } +#ifdef CONFIG_PPC32 +#define HUGEPD_FREELIST_SIZE \ + ((PAGE_SIZE - sizeof(struct hugepd_freelist)) / sizeof(pte_t)) + +struct hugepd_freelist { + struct rcu_head rcu; + unsigned int index; + void *ptes[0]; +}; + +static DEFINE_PER_CPU(struct hugepd_freelist *, hugepd_freelist_cur); + +static void hugepd_free_rcu_callback(struct rcu_head *head) +{ + struct hugepd_freelist *batch = + container_of(head, struct hugepd_freelist, rcu); + unsigned int i; + + for (i = 0; i < batch->index; i++) + kmem_cache_free(hugepte_cache, batch->ptes[i]); + + free_page((unsigned long)batch); +} + +static void hugepd_free(struct mmu_gather *tlb, void *hugepte) +{ + struct hugepd_freelist **batchp; + + batchp = &__get_cpu_var(hugepd_freelist_cur); + + if (atomic_read(&tlb->mm->mm_users) < 2 || + cpumask_equal(mm_cpumask(tlb->mm), + cpumask_of(smp_processor_id()))) { + kmem_cache_free(hugepte_cache, hugepte); + return; + } + + if (*batchp == NULL) { + *batchp = (struct hugepd_freelist *)__get_free_page(GFP_ATOMIC); + (*batchp)->index = 0; + } + + (*batchp)->ptes[(*batchp)->index++] = hugepte; + if ((*batchp)->index == HUGEPD_FREELIST_SIZE) { + call_rcu_sched(&(*batchp)->rcu, hugepd_free_rcu_callback); + *batchp = NULL; + } +} +#endif + static void free_hugepd_range(struct mmu_gather *tlb, hugepd_t *hpdp, int pdshift, unsigned long start, unsigned long end, unsigned long floor, unsigned long ceiling) { pte_t *hugepte = hugepd_page(*hpdp); - unsigned shift = hugepd_shift(*hpdp); + int i; + unsigned long pdmask = ~((1UL << pdshift) - 1); + unsigned int num_hugepd = 1; + +#ifdef CONFIG_PPC64 + unsigned int shift = hugepd_shift(*hpdp); +#else + /* Note: On 32-bit the hpdp may be the first of several */ + num_hugepd = (1 << (hugepd_shift(*hpdp) - pdshift)); +#endif start &= pdmask; if (start < floor) @@ -224,9 +433,15 @@ static void free_hugepd_range(struct mmu_gather *tlb, hugepd_t *hpdp, int pdshif if (end - 1 > ceiling - 1) return; - hpdp->pd = 0; + for (i = 0; i < num_hugepd; i++, hpdp++) + hpdp->pd = 0; + tlb->need_flush = 1; +#ifdef CONFIG_PPC64 pgtable_free_tlb(tlb, hugepte, pdshift - shift); +#else + hugepd_free(tlb, hugepte); +#endif } static void hugetlb_free_pmd_range(struct mmu_gather *tlb, pud_t *pud, @@ -331,18 +546,27 @@ void hugetlb_free_pgd_range(struct mmu_gather *tlb, * too. */ - pgd = pgd_offset(tlb->mm, addr); do { next = pgd_addr_end(addr, end); + pgd = pgd_offset(tlb->mm, addr); if (!is_hugepd(pgd)) { if (pgd_none_or_clear_bad(pgd)) continue; hugetlb_free_pud_range(tlb, pgd, addr, next, floor, ceiling); } else { +#ifdef CONFIG_PPC32 + /* + * Increment next by the size of the huge mapping since + * on 32-bit there may be more than one entry at the pgd + * level for a single hugepage, but all of them point to + * the same kmem cache that holds the hugepte. + */ + next = addr + (1 << hugepd_shift(*(hugepd_t *)pgd)); +#endif free_hugepd_range(tlb, (hugepd_t *)pgd, PGDIR_SHIFT, addr, next, floor, ceiling); } - } while (pgd++, addr = next, addr != end); + } while (addr = next, addr != end); } struct page * @@ -390,7 +614,7 @@ static noinline int gup_hugepte(pte_t *ptep, unsigned long sz, unsigned long add { unsigned long mask; unsigned long pte_end; - struct page *head, *page; + struct page *head, *page, *tail; pte_t pte; int refs; @@ -413,6 +637,7 @@ static noinline int gup_hugepte(pte_t *ptep, unsigned long sz, unsigned long add head = pte_page(pte); page = head + ((addr & (sz-1)) >> PAGE_SHIFT); + tail = page; do { VM_BUG_ON(compound_head(page) != head); pages[*nr] = page; @@ -428,10 +653,20 @@ static noinline int gup_hugepte(pte_t *ptep, unsigned long sz, unsigned long add if (unlikely(pte_val(pte) != pte_val(*ptep))) { /* Could be optimized better */ - while (*nr) { - put_page(page); - (*nr)--; - } + *nr -= refs; + while (refs--) + put_page(head); + return 0; + } + + /* + * Any tail page need their mapcount reference taken before we + * return. + */ + while (refs--) { + if (PageTail(tail)) + get_huge_page_tail(tail); + tail++; } return 1; @@ -466,17 +701,35 @@ unsigned long hugetlb_get_unmapped_area(struct file *file, unsigned long addr, unsigned long len, unsigned long pgoff, unsigned long flags) { +#ifdef CONFIG_PPC_MM_SLICES struct hstate *hstate = hstate_file(file); int mmu_psize = shift_to_mmu_psize(huge_page_shift(hstate)); return slice_get_unmapped_area(addr, len, flags, mmu_psize, 1, 0); +#else + return get_unmapped_area(file, addr, len, pgoff, flags); +#endif } unsigned long vma_mmu_pagesize(struct vm_area_struct *vma) { +#ifdef CONFIG_PPC_MM_SLICES unsigned int psize = get_slice_psize(vma->vm_mm, vma->vm_start); return 1UL << mmu_psize_to_shift(psize); +#else + if (!is_vm_hugetlb_page(vma)) + return PAGE_SIZE; + + return huge_page_size(hstate_vma(vma)); +#endif +} + +static inline bool is_power_of_4(unsigned long x) +{ + if (is_power_of_2(x)) + return (__ilog2(x) % 2) ? false : true; + return false; } static int __init add_huge_page_size(unsigned long long size) @@ -486,9 +739,14 @@ static int __init add_huge_page_size(unsigned long long size) /* Check that it is a page size supported by the hardware and * that it fits within pagetable and slice limits. */ +#ifdef CONFIG_PPC_FSL_BOOK3E + if ((size < PAGE_SIZE) || !is_power_of_4(size)) + return -EINVAL; +#else if (!is_power_of_2(size) || (shift > SLICE_HIGH_SHIFT) || (shift <= PAGE_SHIFT)) return -EINVAL; +#endif if ((mmu_psize = shift_to_mmu_psize(shift)) < 0) return -EINVAL; @@ -525,6 +783,46 @@ static int __init hugepage_setup_sz(char *str) } __setup("hugepagesz=", hugepage_setup_sz); +#ifdef CONFIG_FSL_BOOKE +struct kmem_cache *hugepte_cache; +static int __init hugetlbpage_init(void) +{ + int psize; + + for (psize = 0; psize < MMU_PAGE_COUNT; ++psize) { + unsigned shift; + + if (!mmu_psize_defs[psize].shift) + continue; + + shift = mmu_psize_to_shift(psize); + + /* Don't treat normal page sizes as huge... */ + if (shift != PAGE_SHIFT) + if (add_huge_page_size(1ULL << shift) < 0) + continue; + } + + /* + * Create a kmem cache for hugeptes. The bottom bits in the pte have + * size information encoded in them, so align them to allow this + */ + hugepte_cache = kmem_cache_create("hugepte-cache", sizeof(pte_t), + HUGEPD_SHIFT_MASK + 1, 0, NULL); + if (hugepte_cache == NULL) + panic("%s: Unable to create kmem cache for hugeptes\n", + __func__); + + /* Default hpage size = 4M */ + if (mmu_psize_defs[MMU_PAGE_4M].shift) + HPAGE_SHIFT = mmu_psize_defs[MMU_PAGE_4M].shift; + else + panic("%s: Unable to set default huge page size\n", __func__); + + + return 0; +} +#else static int __init hugetlbpage_init(void) { int psize; @@ -567,15 +865,23 @@ static int __init hugetlbpage_init(void) return 0; } - +#endif module_init(hugetlbpage_init); void flush_dcache_icache_hugepage(struct page *page) { int i; + void *start; BUG_ON(!PageCompound(page)); - for (i = 0; i < (1UL << compound_order(page)); i++) - __flush_dcache_icache(page_address(page+i)); + for (i = 0; i < (1UL << compound_order(page)); i++) { + if (!PageHighMem(page)) { + __flush_dcache_icache(page_address(page+i)); + } else { + start = kmap_atomic(page+i, KM_PPC_SYNC_ICACHE); + __flush_dcache_icache(start); + kunmap_atomic(start, KM_PPC_SYNC_ICACHE); + } + } } diff --git a/arch/powerpc/mm/init_32.c b/arch/powerpc/mm/init_32.c index c77fef56dad6..161cefde5c15 100644 --- a/arch/powerpc/mm/init_32.c +++ b/arch/powerpc/mm/init_32.c @@ -32,6 +32,8 @@ #include <linux/pagemap.h> #include <linux/memblock.h> #include <linux/gfp.h> +#include <linux/slab.h> +#include <linux/hugetlb.h> #include <asm/pgalloc.h> #include <asm/prom.h> @@ -44,6 +46,7 @@ #include <asm/tlb.h> #include <asm/sections.h> #include <asm/system.h> +#include <asm/hugetlb.h> #include "mmu_decl.h" @@ -123,6 +126,12 @@ void __init MMU_init(void) /* parse args from command line */ MMU_setup(); + /* + * Reserve gigantic pages for hugetlb. This MUST occur before + * lowmem_end_addr is initialized below. + */ + reserve_hugetlb_gpages(); + if (memblock.memory.cnt > 1) { #ifndef CONFIG_WII memblock.memory.cnt = 1; diff --git a/arch/powerpc/mm/mem.c b/arch/powerpc/mm/mem.c index 507963ee9aa4..16da595ff402 100644 --- a/arch/powerpc/mm/mem.c +++ b/arch/powerpc/mm/mem.c @@ -123,7 +123,8 @@ int arch_add_memory(int nid, u64 start, u64 size) pgdata = NODE_DATA(nid); start = (unsigned long)__va(start); - create_section_mapping(start, start + size); + if (create_section_mapping(start, start + size)) + return -EINVAL; /* this should work for most non-highmem platforms */ zone = pgdata->node_zones; @@ -548,4 +549,9 @@ void update_mmu_cache(struct vm_area_struct *vma, unsigned long address, return; hash_preload(vma->vm_mm, address, access, trap); #endif /* CONFIG_PPC_STD_MMU */ +#if (defined(CONFIG_PPC_BOOK3E_64) || defined(CONFIG_PPC_FSL_BOOK3E)) \ + && defined(CONFIG_HUGETLB_PAGE) + if (is_vm_hugetlb_page(vma)) + book3e_hugetlb_preload(vma->vm_mm, address, *ptep); +#endif } diff --git a/arch/powerpc/mm/mmu_context_hash64.c b/arch/powerpc/mm/mmu_context_hash64.c index f599f91255ce..ca988a3d5fb2 100644 --- a/arch/powerpc/mm/mmu_context_hash64.c +++ b/arch/powerpc/mm/mmu_context_hash64.c @@ -136,8 +136,8 @@ int use_cop(unsigned long acop, struct mm_struct *mm) if (!mm || !acop) return -EINVAL; - /* We need to make sure mm_users doesn't change */ - down_read(&mm->mmap_sem); + /* The page_table_lock ensures mm_users won't change under us */ + spin_lock(&mm->page_table_lock); spin_lock(mm->context.cop_lockp); if (mm->context.cop_pid == COP_PID_NONE) { @@ -164,7 +164,7 @@ int use_cop(unsigned long acop, struct mm_struct *mm) out: spin_unlock(mm->context.cop_lockp); - up_read(&mm->mmap_sem); + spin_unlock(&mm->page_table_lock); return ret; } @@ -185,8 +185,8 @@ void drop_cop(unsigned long acop, struct mm_struct *mm) if (WARN_ON_ONCE(!mm)) return; - /* We need to make sure mm_users doesn't change */ - down_read(&mm->mmap_sem); + /* The page_table_lock ensures mm_users won't change under us */ + spin_lock(&mm->page_table_lock); spin_lock(mm->context.cop_lockp); mm->context.acop &= ~acop; @@ -213,7 +213,7 @@ void drop_cop(unsigned long acop, struct mm_struct *mm) } spin_unlock(mm->context.cop_lockp); - up_read(&mm->mmap_sem); + spin_unlock(&mm->page_table_lock); } EXPORT_SYMBOL_GPL(drop_cop); diff --git a/arch/powerpc/mm/mmu_context_nohash.c b/arch/powerpc/mm/mmu_context_nohash.c index 336807de550e..5b63bd3da4a9 100644 --- a/arch/powerpc/mm/mmu_context_nohash.c +++ b/arch/powerpc/mm/mmu_context_nohash.c @@ -292,6 +292,11 @@ int init_new_context(struct task_struct *t, struct mm_struct *mm) mm->context.id = MMU_NO_CONTEXT; mm->context.active = 0; +#ifdef CONFIG_PPC_MM_SLICES + if (slice_mm_new_context(mm)) + slice_set_user_psize(mm, mmu_virtual_psize); +#endif + return 0; } diff --git a/arch/powerpc/mm/mmu_decl.h b/arch/powerpc/mm/mmu_decl.h index dd0a2589591d..83eb5d5f53d5 100644 --- a/arch/powerpc/mm/mmu_decl.h +++ b/arch/powerpc/mm/mmu_decl.h @@ -142,6 +142,8 @@ extern unsigned long mmu_mapin_ram(unsigned long top); #elif defined(CONFIG_PPC_FSL_BOOK3E) extern unsigned long map_mem_in_cams(unsigned long ram, int max_cam_idx); +extern unsigned long calc_cam_sz(unsigned long ram, unsigned long virt, + phys_addr_t phys); #ifdef CONFIG_PPC32 extern void MMU_init_hw(void); extern unsigned long mmu_mapin_ram(unsigned long top); diff --git a/arch/powerpc/mm/numa.c b/arch/powerpc/mm/numa.c index 1a7c44d8d669..c7dd4dec4df8 100644 --- a/arch/powerpc/mm/numa.c +++ b/arch/powerpc/mm/numa.c @@ -709,8 +709,7 @@ static void __init parse_drconf_memory(struct device_node *memory) static int __init parse_numa_properties(void) { - struct device_node *cpu = NULL; - struct device_node *memory = NULL; + struct device_node *memory; int default_nid = 0; unsigned long i; @@ -732,6 +731,7 @@ static int __init parse_numa_properties(void) * each node to be onlined must have NODE_DATA etc backing it. */ for_each_present_cpu(i) { + struct device_node *cpu; int nid; cpu = of_get_cpu_node(i, NULL); @@ -750,8 +750,8 @@ static int __init parse_numa_properties(void) } get_n_mem_cells(&n_mem_addr_cells, &n_mem_size_cells); - memory = NULL; - while ((memory = of_find_node_by_type(memory, "memory")) != NULL) { + + for_each_node_by_type(memory, "memory") { unsigned long start; unsigned long size; int nid; @@ -800,8 +800,9 @@ new_range: } /* - * Now do the same thing for each MEMBLOCK listed in the ibm,dynamic-memory - * property in the ibm,dynamic-reconfiguration-memory node. + * Now do the same thing for each MEMBLOCK listed in the + * ibm,dynamic-memory property in the + * ibm,dynamic-reconfiguration-memory node. */ memory = of_find_node_by_path("/ibm,dynamic-reconfiguration-memory"); if (memory) @@ -1187,10 +1188,10 @@ static int hot_add_drconf_scn_to_nid(struct device_node *memory, */ int hot_add_node_scn_to_nid(unsigned long scn_addr) { - struct device_node *memory = NULL; + struct device_node *memory; int nid = -1; - while ((memory = of_find_node_by_type(memory, "memory")) != NULL) { + for_each_node_by_type(memory, "memory") { unsigned long start, size; int ranges; const unsigned int *memcell_buf; @@ -1214,11 +1215,12 @@ int hot_add_node_scn_to_nid(unsigned long scn_addr) break; } - of_node_put(memory); if (nid >= 0) break; } + of_node_put(memory); + return nid; } diff --git a/arch/powerpc/mm/pgtable.c b/arch/powerpc/mm/pgtable.c index af40c8768a78..214130a4edc6 100644 --- a/arch/powerpc/mm/pgtable.c +++ b/arch/powerpc/mm/pgtable.c @@ -27,6 +27,7 @@ #include <linux/init.h> #include <linux/percpu.h> #include <linux/hardirq.h> +#include <linux/hugetlb.h> #include <asm/pgalloc.h> #include <asm/tlbflush.h> #include <asm/tlb.h> @@ -212,7 +213,7 @@ int ptep_set_access_flags(struct vm_area_struct *vma, unsigned long address, entry = set_access_flags_filter(entry, vma, dirty); changed = !pte_same(*(ptep), entry); if (changed) { - if (!(vma->vm_flags & VM_HUGETLB)) + if (!is_vm_hugetlb_page(vma)) assert_pte_locked(vma->vm_mm, address); __ptep_set_access_flags(ptep, entry); flush_tlb_page_nohash(vma, address); diff --git a/arch/powerpc/mm/tlb_low_64e.S b/arch/powerpc/mm/tlb_low_64e.S index 4ebb34bc01d6..dc4a5f385e41 100644 --- a/arch/powerpc/mm/tlb_low_64e.S +++ b/arch/powerpc/mm/tlb_low_64e.S @@ -553,24 +553,24 @@ END_MMU_FTR_SECTION_IFSET(MMU_FTR_USE_TLBRSRV) rldicl r11,r16,64-VPTE_PGD_SHIFT,64-PGD_INDEX_SIZE-3 clrrdi r10,r11,3 ldx r15,r10,r15 - cmpldi cr0,r15,0 - beq virt_page_table_tlb_miss_fault + cmpdi cr0,r15,0 + bge virt_page_table_tlb_miss_fault #ifndef CONFIG_PPC_64K_PAGES /* Get to PUD entry */ rldicl r11,r16,64-VPTE_PUD_SHIFT,64-PUD_INDEX_SIZE-3 clrrdi r10,r11,3 ldx r15,r10,r15 - cmpldi cr0,r15,0 - beq virt_page_table_tlb_miss_fault + cmpdi cr0,r15,0 + bge virt_page_table_tlb_miss_fault #endif /* CONFIG_PPC_64K_PAGES */ /* Get to PMD entry */ rldicl r11,r16,64-VPTE_PMD_SHIFT,64-PMD_INDEX_SIZE-3 clrrdi r10,r11,3 ldx r15,r10,r15 - cmpldi cr0,r15,0 - beq virt_page_table_tlb_miss_fault + cmpdi cr0,r15,0 + bge virt_page_table_tlb_miss_fault /* Ok, we're all right, we can now create a kernel translation for * a 4K or 64K page from r16 -> r15. @@ -802,24 +802,24 @@ htw_tlb_miss: rldicl r11,r16,64-(PGDIR_SHIFT-3),64-PGD_INDEX_SIZE-3 clrrdi r10,r11,3 ldx r15,r10,r15 - cmpldi cr0,r15,0 - beq htw_tlb_miss_fault + cmpdi cr0,r15,0 + bge htw_tlb_miss_fault #ifndef CONFIG_PPC_64K_PAGES /* Get to PUD entry */ rldicl r11,r16,64-(PUD_SHIFT-3),64-PUD_INDEX_SIZE-3 clrrdi r10,r11,3 ldx r15,r10,r15 - cmpldi cr0,r15,0 - beq htw_tlb_miss_fault + cmpdi cr0,r15,0 + bge htw_tlb_miss_fault #endif /* CONFIG_PPC_64K_PAGES */ /* Get to PMD entry */ rldicl r11,r16,64-(PMD_SHIFT-3),64-PMD_INDEX_SIZE-3 clrrdi r10,r11,3 ldx r15,r10,r15 - cmpldi cr0,r15,0 - beq htw_tlb_miss_fault + cmpdi cr0,r15,0 + bge htw_tlb_miss_fault /* Ok, we're all right, we can now create an indirect entry for * a 1M or 256M page. diff --git a/arch/powerpc/mm/tlb_nohash.c b/arch/powerpc/mm/tlb_nohash.c index 0ff8ab5702a9..4e13d6f9023e 100644 --- a/arch/powerpc/mm/tlb_nohash.c +++ b/arch/powerpc/mm/tlb_nohash.c @@ -37,14 +37,49 @@ #include <linux/spinlock.h> #include <linux/memblock.h> #include <linux/of_fdt.h> +#include <linux/hugetlb.h> #include <asm/tlbflush.h> #include <asm/tlb.h> #include <asm/code-patching.h> +#include <asm/hugetlb.h> #include "mmu_decl.h" -#ifdef CONFIG_PPC_BOOK3E +/* + * This struct lists the sw-supported page sizes. The hardawre MMU may support + * other sizes not listed here. The .ind field is only used on MMUs that have + * indirect page table entries. + */ +#ifdef CONFIG_PPC_BOOK3E_MMU +#ifdef CONFIG_FSL_BOOKE +struct mmu_psize_def mmu_psize_defs[MMU_PAGE_COUNT] = { + [MMU_PAGE_4K] = { + .shift = 12, + .enc = BOOK3E_PAGESZ_4K, + }, + [MMU_PAGE_4M] = { + .shift = 22, + .enc = BOOK3E_PAGESZ_4M, + }, + [MMU_PAGE_16M] = { + .shift = 24, + .enc = BOOK3E_PAGESZ_16M, + }, + [MMU_PAGE_64M] = { + .shift = 26, + .enc = BOOK3E_PAGESZ_64M, + }, + [MMU_PAGE_256M] = { + .shift = 28, + .enc = BOOK3E_PAGESZ_256M, + }, + [MMU_PAGE_1G] = { + .shift = 30, + .enc = BOOK3E_PAGESZ_1GB, + }, +}; +#else struct mmu_psize_def mmu_psize_defs[MMU_PAGE_COUNT] = { [MMU_PAGE_4K] = { .shift = 12, @@ -78,6 +113,8 @@ struct mmu_psize_def mmu_psize_defs[MMU_PAGE_COUNT] = { .enc = BOOK3E_PAGESZ_1GB, }, }; +#endif /* CONFIG_FSL_BOOKE */ + static inline int mmu_get_tsize(int psize) { return mmu_psize_defs[psize].enc; @@ -88,7 +125,7 @@ static inline int mmu_get_tsize(int psize) /* This isn't used on !Book3E for now */ return 0; } -#endif +#endif /* CONFIG_PPC_BOOK3E_MMU */ /* The variables below are currently only used on 64-bit Book3E * though this will probably be made common with other nohash @@ -267,6 +304,11 @@ void __flush_tlb_page(struct mm_struct *mm, unsigned long vmaddr, void flush_tlb_page(struct vm_area_struct *vma, unsigned long vmaddr) { +#ifdef CONFIG_HUGETLB_PAGE + if (is_vm_hugetlb_page(vma)) + flush_hugetlb_page(vma, vmaddr); +#endif + __flush_tlb_page(vma ? vma->vm_mm : NULL, vmaddr, mmu_get_tsize(mmu_virtual_psize), 0); } @@ -601,13 +643,28 @@ void __cpuinit early_init_mmu_secondary(void) void setup_initial_memory_limit(phys_addr_t first_memblock_base, phys_addr_t first_memblock_size) { - /* On Embedded 64-bit, we adjust the RMA size to match + /* On non-FSL Embedded 64-bit, we adjust the RMA size to match * the bolted TLB entry. We know for now that only 1G * entries are supported though that may eventually - * change. We crop it to the size of the first MEMBLOCK to + * change. + * + * on FSL Embedded 64-bit, we adjust the RMA size to match the + * first bolted TLB entry size. We still limit max to 1G even if + * the TLB could cover more. This is due to what the early init + * code is setup to do. + * + * We crop it to the size of the first MEMBLOCK to * avoid going over total available memory just in case... */ - ppc64_rma_size = min_t(u64, first_memblock_size, 0x40000000); +#ifdef CONFIG_PPC_FSL_BOOK3E + if (mmu_has_feature(MMU_FTR_TYPE_FSL_E)) { + unsigned long linear_sz; + linear_sz = calc_cam_sz(first_memblock_size, PAGE_OFFSET, + first_memblock_base); + ppc64_rma_size = min_t(u64, linear_sz, 0x40000000); + } else +#endif + ppc64_rma_size = min_t(u64, first_memblock_size, 0x40000000); /* Finally limit subsequent allocations */ memblock_set_current_limit(first_memblock_base + ppc64_rma_size); |