From 61f77eda9bbf0d2e922197ed2dcf88638a639ce5 Mon Sep 17 00:00:00 2001 From: Naoya Horiguchi Date: Wed, 11 Feb 2015 15:25:15 -0800 Subject: mm/hugetlb: reduce arch dependent code around follow_huge_* Currently we have many duplicates in definitions around follow_huge_addr(), follow_huge_pmd(), and follow_huge_pud(), so this patch tries to remove the m. The basic idea is to put the default implementation for these functions in mm/hugetlb.c as weak symbols (regardless of CONFIG_ARCH_WANT_GENERAL_HUGETL B), and to implement arch-specific code only when the arch needs it. For follow_huge_addr(), only powerpc and ia64 have their own implementation, and in all other architectures this function just returns ERR_PTR(-EINVAL). So this patch sets returning ERR_PTR(-EINVAL) as default. As for follow_huge_(pmd|pud)(), if (pmd|pud)_huge() is implemented to always return 0 in your architecture (like in ia64 or sparc,) it's never called (the callsite is optimized away) no matter how implemented it is. So in such architectures, we don't need arch-specific implementation. In some architecture (like mips, s390 and tile,) their current arch-specific follow_huge_(pmd|pud)() are effectively identical with the common code, so this patch lets these architecture use the common code. One exception is metag, where pmd_huge() could return non-zero but it expects follow_huge_pmd() to always return NULL. This means that we need arch-specific implementation which returns NULL. This behavior looks strange to me (because non-zero pmd_huge() implies that the architecture supports PMD-based hugepage, so follow_huge_pmd() can/should return some relevant value,) but that's beyond this cleanup patch, so let's keep it. Justification of non-trivial changes: - in s390, follow_huge_pmd() checks !MACHINE_HAS_HPAGE at first, and this patch removes the check. This is OK because we can assume MACHINE_HAS_HPAGE is true when follow_huge_pmd() can be called (note that pmd_huge() has the same check and always returns 0 for !MACHINE_HAS_HPAGE.) - in s390 and mips, we use HPAGE_MASK instead of PMD_MASK as done in common code. This patch forces these archs use PMD_MASK, but it's OK because they are identical in both archs. In s390, both of HPAGE_SHIFT and PMD_SHIFT are 20. In mips, HPAGE_SHIFT is defined as (PAGE_SHIFT + PAGE_SHIFT - 3) and PMD_SHIFT is define as (PAGE_SHIFT + PAGE_SHIFT + PTE_ORDER - 3), but PTE_ORDER is always 0, so these are identical. Signed-off-by: Naoya Horiguchi Acked-by: Hugh Dickins Cc: James Hogan Cc: David Rientjes Cc: Mel Gorman Cc: Johannes Weiner Cc: Michal Hocko Cc: Rik van Riel Cc: Andrea Arcangeli Cc: Luiz Capitulino Cc: Nishanth Aravamudan Cc: Lee Schermerhorn Cc: Steve Capper Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/arm/mm/hugetlbpage.c | 6 ------ arch/arm64/mm/hugetlbpage.c | 6 ------ arch/ia64/mm/hugetlbpage.c | 6 ------ arch/metag/mm/hugetlbpage.c | 6 ------ arch/mips/mm/hugetlbpage.c | 18 ------------------ arch/powerpc/mm/hugetlbpage.c | 8 ++++++++ arch/s390/mm/hugetlbpage.c | 20 -------------------- arch/sh/mm/hugetlbpage.c | 12 ------------ arch/sparc/mm/hugetlbpage.c | 12 ------------ arch/tile/mm/hugetlbpage.c | 28 ---------------------------- arch/x86/mm/hugetlbpage.c | 12 ------------ 11 files changed, 8 insertions(+), 126 deletions(-) (limited to 'arch') diff --git a/arch/arm/mm/hugetlbpage.c b/arch/arm/mm/hugetlbpage.c index 66781bf34077..c72412415093 100644 --- a/arch/arm/mm/hugetlbpage.c +++ b/arch/arm/mm/hugetlbpage.c @@ -36,12 +36,6 @@ * of type casting from pmd_t * to pte_t *. */ -struct page *follow_huge_addr(struct mm_struct *mm, unsigned long address, - int write) -{ - return ERR_PTR(-EINVAL); -} - int pud_huge(pud_t pud) { return 0; diff --git a/arch/arm64/mm/hugetlbpage.c b/arch/arm64/mm/hugetlbpage.c index 023747bf4dd7..2de9d2e59d96 100644 --- a/arch/arm64/mm/hugetlbpage.c +++ b/arch/arm64/mm/hugetlbpage.c @@ -38,12 +38,6 @@ int huge_pmd_unshare(struct mm_struct *mm, unsigned long *addr, pte_t *ptep) } #endif -struct page *follow_huge_addr(struct mm_struct *mm, unsigned long address, - int write) -{ - return ERR_PTR(-EINVAL); -} - int pmd_huge(pmd_t pmd) { return !(pmd_val(pmd) & PMD_TABLE_BIT); diff --git a/arch/ia64/mm/hugetlbpage.c b/arch/ia64/mm/hugetlbpage.c index 76069c18ee42..52b7604b5215 100644 --- a/arch/ia64/mm/hugetlbpage.c +++ b/arch/ia64/mm/hugetlbpage.c @@ -114,12 +114,6 @@ int pud_huge(pud_t pud) return 0; } -struct page * -follow_huge_pmd(struct mm_struct *mm, unsigned long address, pmd_t *pmd, int write) -{ - return NULL; -} - void hugetlb_free_pgd_range(struct mmu_gather *tlb, unsigned long addr, unsigned long end, unsigned long floor, unsigned long ceiling) diff --git a/arch/metag/mm/hugetlbpage.c b/arch/metag/mm/hugetlbpage.c index 3c32075d2945..7ca80ac42ed5 100644 --- a/arch/metag/mm/hugetlbpage.c +++ b/arch/metag/mm/hugetlbpage.c @@ -94,12 +94,6 @@ int huge_pmd_unshare(struct mm_struct *mm, unsigned long *addr, pte_t *ptep) return 0; } -struct page *follow_huge_addr(struct mm_struct *mm, - unsigned long address, int write) -{ - return ERR_PTR(-EINVAL); -} - int pmd_huge(pmd_t pmd) { return pmd_page_shift(pmd) > PAGE_SHIFT; diff --git a/arch/mips/mm/hugetlbpage.c b/arch/mips/mm/hugetlbpage.c index 4ec8ee10d371..06e0f421b41b 100644 --- a/arch/mips/mm/hugetlbpage.c +++ b/arch/mips/mm/hugetlbpage.c @@ -68,12 +68,6 @@ int is_aligned_hugepage_range(unsigned long addr, unsigned long len) return 0; } -struct page * -follow_huge_addr(struct mm_struct *mm, unsigned long address, int write) -{ - return ERR_PTR(-EINVAL); -} - int pmd_huge(pmd_t pmd) { return (pmd_val(pmd) & _PAGE_HUGE) != 0; @@ -83,15 +77,3 @@ int pud_huge(pud_t pud) { return (pud_val(pud) & _PAGE_HUGE) != 0; } - -struct page * -follow_huge_pmd(struct mm_struct *mm, unsigned long address, - pmd_t *pmd, int write) -{ - struct page *page; - - page = pte_page(*(pte_t *)pmd); - if (page) - page += ((address & ~HPAGE_MASK) >> PAGE_SHIFT); - return page; -} diff --git a/arch/powerpc/mm/hugetlbpage.c b/arch/powerpc/mm/hugetlbpage.c index 5ff4e07d920a..cf0464f4284f 100644 --- a/arch/powerpc/mm/hugetlbpage.c +++ b/arch/powerpc/mm/hugetlbpage.c @@ -714,6 +714,14 @@ follow_huge_pmd(struct mm_struct *mm, unsigned long address, return NULL; } +struct page * +follow_huge_pud(struct mm_struct *mm, unsigned long address, + pud_t *pud, int write) +{ + BUG(); + return NULL; +} + static unsigned long hugepte_addr_end(unsigned long addr, unsigned long end, unsigned long sz) { diff --git a/arch/s390/mm/hugetlbpage.c b/arch/s390/mm/hugetlbpage.c index 3c80d2e38f03..210ffede0153 100644 --- a/arch/s390/mm/hugetlbpage.c +++ b/arch/s390/mm/hugetlbpage.c @@ -192,12 +192,6 @@ int huge_pmd_unshare(struct mm_struct *mm, unsigned long *addr, pte_t *ptep) return 0; } -struct page *follow_huge_addr(struct mm_struct *mm, unsigned long address, - int write) -{ - return ERR_PTR(-EINVAL); -} - int pmd_huge(pmd_t pmd) { if (!MACHINE_HAS_HPAGE) @@ -210,17 +204,3 @@ int pud_huge(pud_t pud) { return 0; } - -struct page *follow_huge_pmd(struct mm_struct *mm, unsigned long address, - pmd_t *pmdp, int write) -{ - struct page *page; - - if (!MACHINE_HAS_HPAGE) - return NULL; - - page = pmd_page(*pmdp); - if (page) - page += ((address & ~HPAGE_MASK) >> PAGE_SHIFT); - return page; -} diff --git a/arch/sh/mm/hugetlbpage.c b/arch/sh/mm/hugetlbpage.c index d7762349ea48..534bc978af8a 100644 --- a/arch/sh/mm/hugetlbpage.c +++ b/arch/sh/mm/hugetlbpage.c @@ -67,12 +67,6 @@ int huge_pmd_unshare(struct mm_struct *mm, unsigned long *addr, pte_t *ptep) return 0; } -struct page *follow_huge_addr(struct mm_struct *mm, - unsigned long address, int write) -{ - return ERR_PTR(-EINVAL); -} - int pmd_huge(pmd_t pmd) { return 0; @@ -82,9 +76,3 @@ int pud_huge(pud_t pud) { return 0; } - -struct page *follow_huge_pmd(struct mm_struct *mm, unsigned long address, - pmd_t *pmd, int write) -{ - return NULL; -} diff --git a/arch/sparc/mm/hugetlbpage.c b/arch/sparc/mm/hugetlbpage.c index d329537739c6..4242eab12e10 100644 --- a/arch/sparc/mm/hugetlbpage.c +++ b/arch/sparc/mm/hugetlbpage.c @@ -215,12 +215,6 @@ pte_t huge_ptep_get_and_clear(struct mm_struct *mm, unsigned long addr, return entry; } -struct page *follow_huge_addr(struct mm_struct *mm, - unsigned long address, int write) -{ - return ERR_PTR(-EINVAL); -} - int pmd_huge(pmd_t pmd) { return 0; @@ -230,9 +224,3 @@ int pud_huge(pud_t pud) { return 0; } - -struct page *follow_huge_pmd(struct mm_struct *mm, unsigned long address, - pmd_t *pmd, int write) -{ - return NULL; -} diff --git a/arch/tile/mm/hugetlbpage.c b/arch/tile/mm/hugetlbpage.c index 3270e0019266..8416240c322c 100644 --- a/arch/tile/mm/hugetlbpage.c +++ b/arch/tile/mm/hugetlbpage.c @@ -150,12 +150,6 @@ pte_t *huge_pte_offset(struct mm_struct *mm, unsigned long addr) return NULL; } -struct page *follow_huge_addr(struct mm_struct *mm, unsigned long address, - int write) -{ - return ERR_PTR(-EINVAL); -} - int pmd_huge(pmd_t pmd) { return !!(pmd_val(pmd) & _PAGE_HUGE_PAGE); @@ -166,28 +160,6 @@ int pud_huge(pud_t pud) return !!(pud_val(pud) & _PAGE_HUGE_PAGE); } -struct page *follow_huge_pmd(struct mm_struct *mm, unsigned long address, - pmd_t *pmd, int write) -{ - struct page *page; - - page = pte_page(*(pte_t *)pmd); - if (page) - page += ((address & ~PMD_MASK) >> PAGE_SHIFT); - return page; -} - -struct page *follow_huge_pud(struct mm_struct *mm, unsigned long address, - pud_t *pud, int write) -{ - struct page *page; - - page = pte_page(*(pte_t *)pud); - if (page) - page += ((address & ~PUD_MASK) >> PAGE_SHIFT); - return page; -} - int huge_pmd_unshare(struct mm_struct *mm, unsigned long *addr, pte_t *ptep) { return 0; diff --git a/arch/x86/mm/hugetlbpage.c b/arch/x86/mm/hugetlbpage.c index bca0aa3a003f..f48423f10141 100644 --- a/arch/x86/mm/hugetlbpage.c +++ b/arch/x86/mm/hugetlbpage.c @@ -52,20 +52,8 @@ int pud_huge(pud_t pud) return 0; } -struct page * -follow_huge_pmd(struct mm_struct *mm, unsigned long address, - pmd_t *pmd, int write) -{ - return NULL; -} #else -struct page * -follow_huge_addr(struct mm_struct *mm, unsigned long address, int write) -{ - return ERR_PTR(-EINVAL); -} - int pmd_huge(pmd_t pmd) { return !!(pmd_val(pmd) & _PAGE_PSE); -- cgit v1.2.3 From cbef8478bee55775ac312a574aad48af7bb9cf9f Mon Sep 17 00:00:00 2001 From: Naoya Horiguchi Date: Wed, 11 Feb 2015 15:25:19 -0800 Subject: mm/hugetlb: pmd_huge() returns true for non-present hugepage Migrating hugepages and hwpoisoned hugepages are considered as non-present hugepages, and they are referenced via migration entries and hwpoison entries in their page table slots. This behavior causes race condition because pmd_huge() doesn't tell non-huge pages from migrating/hwpoisoned hugepages. follow_page_mask() is one example where the kernel would call follow_page_pte() for such hugepage while this function is supposed to handle only normal pages. To avoid this, this patch makes pmd_huge() return true when pmd_none() is true *and* pmd_present() is false. We don't have to worry about mixing up non-present pmd entry with normal pmd (pointing to leaf level pte entry) because pmd_present() is true in normal pmd. The same race condition could happen in (x86-specific) gup_pmd_range(), where this patch simply adds pmd_present() check instead of pmd_huge(). This is because gup_pmd_range() is fast path. If we have non-present hugepage in this function, we will go into gup_huge_pmd(), then return 0 at flag mask check, and finally fall back to the slow path. Fixes: 290408d4a2 ("hugetlb: hugepage migration core") Signed-off-by: Naoya Horiguchi Cc: Hugh Dickins Cc: James Hogan Cc: David Rientjes Cc: Mel Gorman Cc: Johannes Weiner Cc: Michal Hocko Cc: Rik van Riel Cc: Andrea Arcangeli Cc: Luiz Capitulino Cc: Nishanth Aravamudan Cc: Lee Schermerhorn Cc: Steve Capper Cc: [2.6.36+] Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/x86/mm/gup.c | 2 +- arch/x86/mm/hugetlbpage.c | 8 +++++++- 2 files changed, 8 insertions(+), 2 deletions(-) (limited to 'arch') diff --git a/arch/x86/mm/gup.c b/arch/x86/mm/gup.c index d7547824e763..224b14235e96 100644 --- a/arch/x86/mm/gup.c +++ b/arch/x86/mm/gup.c @@ -172,7 +172,7 @@ static int gup_pmd_range(pud_t pud, unsigned long addr, unsigned long end, */ if (pmd_none(pmd) || pmd_trans_splitting(pmd)) return 0; - if (unlikely(pmd_large(pmd))) { + if (unlikely(pmd_large(pmd) || !pmd_present(pmd))) { /* * NUMA hinting faults need to be handled in the GUP * slowpath for accounting purposes and so that they diff --git a/arch/x86/mm/hugetlbpage.c b/arch/x86/mm/hugetlbpage.c index f48423f10141..42982b26e32b 100644 --- a/arch/x86/mm/hugetlbpage.c +++ b/arch/x86/mm/hugetlbpage.c @@ -54,9 +54,15 @@ int pud_huge(pud_t pud) #else +/* + * pmd_huge() returns 1 if @pmd is hugetlb related entry, that is normal + * hugetlb entry or non-present (migration or hwpoisoned) hugetlb entry. + * Otherwise, returns 0. + */ int pmd_huge(pmd_t pmd) { - return !!(pmd_val(pmd) & _PAGE_PSE); + return !pmd_none(pmd) && + (pmd_val(pmd) & (_PAGE_PRESENT|_PAGE_PSE)) != _PAGE_PRESENT; } int pud_huge(pud_t pud) -- cgit v1.2.3 From 4ecf886045152d2ddf98ae74e39f789868ac1f98 Mon Sep 17 00:00:00 2001 From: "Kirill A. Shutemov" Date: Wed, 11 Feb 2015 15:25:35 -0800 Subject: sparc32: fix broken set_pte() 32-bit sparc uses swap instruction to implement set_pte(). It called using GCC inline assembler. But it misses the "memory" clobber to indicate that pte value will be updated in memory. As result GCC doesn't know that it cannot postpone pte pointer dereference which occurs before set_pte() to post-set_pte() time. It leads to real-world bugs -- [1]. In this situation we have code: ptent = ptep_modify_prot_start(mm, addr, pte); ptent = pte_modify(ptent, newprot); ... ptep_modify_prot_commit(mm, addr, pte, ptent); ptep_modify_prot_start() in sparc case is just 'pte' dereference plus pte_clear(). pte_clear() calls broken set_pte(). GCC thinks it's valid to dereference 'pte' again on pte_modify() and gets cleared pte. ptep_modify_prot_commit() puts 'pteent' with pfn==0 back to page table, which eventually leads to the crash. [1] http://lkml.kernel.org/r/54C06B19.8060305@roeck-us.net Signed-off-by: Kirill A. Shutemov Reported-by: Guenter Roeck Tested-by: Guenter Roeck Cc: Paul Moore Cc: Joonsoo Kim Cc: David Miller Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/sparc/include/asm/pgtable_32.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'arch') diff --git a/arch/sparc/include/asm/pgtable_32.h b/arch/sparc/include/asm/pgtable_32.h index b2f7dc46a7d1..9912eb0b499a 100644 --- a/arch/sparc/include/asm/pgtable_32.h +++ b/arch/sparc/include/asm/pgtable_32.h @@ -102,7 +102,8 @@ extern unsigned long empty_zero_page; */ static inline unsigned long srmmu_swap(unsigned long *addr, unsigned long value) { - __asm__ __volatile__("swap [%2], %0" : "=&r" (value) : "0" (value), "r" (addr)); + __asm__ __volatile__("swap [%2], %0" : + "=&r" (value) : "0" (value), "r" (addr) : "memory"); return value; } -- cgit v1.2.3 From 3ae3ad4e639234a43fd3997887524d2e5345fa76 Mon Sep 17 00:00:00 2001 From: "Kirill A. Shutemov" Date: Wed, 11 Feb 2015 15:26:38 -0800 Subject: microblaze: define __PAGETABLE_PMD_FOLDED Microblaze uses custom implementation of PMD folding, but doesn't define __PAGETABLE_PMD_FOLDED, which generic code expects to see. Let's fix it. Defining __PAGETABLE_PMD_FOLDED will drop out unused __pmd_alloc(). It also fixes problems with recently-introduced pmd accounting. Signed-off-by: Kirill A. Shutemov Reported-by: Guenter Roeck Tested-by: Guenter Roeck Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/microblaze/include/asm/pgtable.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'arch') diff --git a/arch/microblaze/include/asm/pgtable.h b/arch/microblaze/include/asm/pgtable.h index 91b9b46fbb5d..c6b6af4ca2a0 100644 --- a/arch/microblaze/include/asm/pgtable.h +++ b/arch/microblaze/include/asm/pgtable.h @@ -61,6 +61,8 @@ extern int mem_init_done; #include +#define __PAGETABLE_PMD_FOLDED + #ifdef __KERNEL__ #ifndef __ASSEMBLY__ -- cgit v1.2.3 From d016bf7ece53b2b947bfd769e0842fd2feb7556b Mon Sep 17 00:00:00 2001 From: "Kirill A. Shutemov" Date: Wed, 11 Feb 2015 15:26:41 -0800 Subject: mm: make FIRST_USER_ADDRESS unsigned long on all archs LKP has triggered a compiler warning after my recent patch "mm: account pmd page tables to the process": mm/mmap.c: In function 'exit_mmap': >> mm/mmap.c:2857:2: warning: right shift count >= width of type [enabled by default] The code: > 2857 WARN_ON(mm_nr_pmds(mm) > 2858 round_up(FIRST_USER_ADDRESS, PUD_SIZE) >> PUD_SHIFT); In this, on tile, we have FIRST_USER_ADDRESS defined as 0. round_up() has the same type -- int. PUD_SHIFT. I think the best way to fix it is to define FIRST_USER_ADDRESS as unsigned long. On every arch for consistency. Signed-off-by: Kirill A. Shutemov Reported-by: Wu Fengguang Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/alpha/include/asm/pgtable.h | 2 +- arch/arc/include/asm/pgtable.h | 2 +- arch/arm/include/asm/pgtable-nommu.h | 2 +- arch/arm64/include/asm/pgtable.h | 2 +- arch/avr32/include/asm/pgtable.h | 2 +- arch/cris/include/asm/pgtable.h | 2 +- arch/frv/include/asm/pgtable.h | 2 +- arch/hexagon/include/asm/pgtable.h | 2 +- arch/ia64/include/asm/pgtable.h | 2 +- arch/m32r/include/asm/pgtable.h | 2 +- arch/m68k/include/asm/pgtable_mm.h | 2 +- arch/microblaze/include/asm/pgtable.h | 2 +- arch/mips/include/asm/pgtable-32.h | 2 +- arch/mn10300/include/asm/pgtable.h | 2 +- arch/nios2/include/asm/pgtable.h | 2 +- arch/openrisc/include/asm/pgtable.h | 2 +- arch/parisc/include/asm/pgtable.h | 2 +- arch/powerpc/include/asm/pgtable-ppc32.h | 2 +- arch/powerpc/include/asm/pgtable-ppc64.h | 2 +- arch/s390/include/asm/pgtable.h | 2 +- arch/score/include/asm/pgtable.h | 2 +- arch/sh/include/asm/pgtable.h | 2 +- arch/sparc/include/asm/pgtable_32.h | 2 +- arch/sparc/include/asm/pgtable_64.h | 2 +- arch/tile/include/asm/pgtable.h | 2 +- arch/um/include/asm/pgtable-2level.h | 2 +- arch/um/include/asm/pgtable-3level.h | 2 +- arch/x86/include/asm/pgtable_types.h | 2 +- arch/xtensa/include/asm/pgtable.h | 2 +- 29 files changed, 29 insertions(+), 29 deletions(-) (limited to 'arch') diff --git a/arch/alpha/include/asm/pgtable.h b/arch/alpha/include/asm/pgtable.h index fce22cf88ee9..a9a119592372 100644 --- a/arch/alpha/include/asm/pgtable.h +++ b/arch/alpha/include/asm/pgtable.h @@ -45,7 +45,7 @@ struct vm_area_struct; #define PTRS_PER_PMD (1UL << (PAGE_SHIFT-3)) #define PTRS_PER_PGD (1UL << (PAGE_SHIFT-3)) #define USER_PTRS_PER_PGD (TASK_SIZE / PGDIR_SIZE) -#define FIRST_USER_ADDRESS 0 +#define FIRST_USER_ADDRESS 0UL /* Number of pointers that fit on a page: this will go away. */ #define PTRS_PER_PAGE (1UL << (PAGE_SHIFT-3)) diff --git a/arch/arc/include/asm/pgtable.h b/arch/arc/include/asm/pgtable.h index bdc8ccaf390d..ffed3b2cf313 100644 --- a/arch/arc/include/asm/pgtable.h +++ b/arch/arc/include/asm/pgtable.h @@ -211,7 +211,7 @@ * No special requirements for lowest virtual address we permit any user space * mapping to be mapped at. */ -#define FIRST_USER_ADDRESS 0 +#define FIRST_USER_ADDRESS 0UL /**************************************************************** diff --git a/arch/arm/include/asm/pgtable-nommu.h b/arch/arm/include/asm/pgtable-nommu.h index c35e53ee6663..add094d09e3e 100644 --- a/arch/arm/include/asm/pgtable-nommu.h +++ b/arch/arm/include/asm/pgtable-nommu.h @@ -85,7 +85,7 @@ extern unsigned int kobjsize(const void *objp); #define VMALLOC_START 0UL #define VMALLOC_END 0xffffffffUL -#define FIRST_USER_ADDRESS (0) +#define FIRST_USER_ADDRESS 0UL #include diff --git a/arch/arm64/include/asm/pgtable.h b/arch/arm64/include/asm/pgtable.h index 4c445057169d..3e4d3c43632a 100644 --- a/arch/arm64/include/asm/pgtable.h +++ b/arch/arm64/include/asm/pgtable.h @@ -45,7 +45,7 @@ #define vmemmap ((struct page *)(VMALLOC_END + SZ_64K)) -#define FIRST_USER_ADDRESS 0 +#define FIRST_USER_ADDRESS 0UL #ifndef __ASSEMBLY__ extern void __pte_error(const char *file, int line, unsigned long val); diff --git a/arch/avr32/include/asm/pgtable.h b/arch/avr32/include/asm/pgtable.h index ac7a817e2126..35800664076e 100644 --- a/arch/avr32/include/asm/pgtable.h +++ b/arch/avr32/include/asm/pgtable.h @@ -30,7 +30,7 @@ #define PGDIR_MASK (~(PGDIR_SIZE-1)) #define USER_PTRS_PER_PGD (TASK_SIZE / PGDIR_SIZE) -#define FIRST_USER_ADDRESS 0 +#define FIRST_USER_ADDRESS 0UL #ifndef __ASSEMBLY__ extern pgd_t swapper_pg_dir[PTRS_PER_PGD]; diff --git a/arch/cris/include/asm/pgtable.h b/arch/cris/include/asm/pgtable.h index e824257971c4..ceefc314d64d 100644 --- a/arch/cris/include/asm/pgtable.h +++ b/arch/cris/include/asm/pgtable.h @@ -67,7 +67,7 @@ extern void paging_init(void); */ #define USER_PTRS_PER_PGD (TASK_SIZE/PGDIR_SIZE) -#define FIRST_USER_ADDRESS 0 +#define FIRST_USER_ADDRESS 0UL /* zero page used for uninitialized stuff */ #ifndef __ASSEMBLY__ diff --git a/arch/frv/include/asm/pgtable.h b/arch/frv/include/asm/pgtable.h index c49699d5902d..93bcf2abd1a1 100644 --- a/arch/frv/include/asm/pgtable.h +++ b/arch/frv/include/asm/pgtable.h @@ -140,7 +140,7 @@ extern unsigned long empty_zero_page; #define PTRS_PER_PTE 4096 #define USER_PGDS_IN_LAST_PML4 (TASK_SIZE / PGDIR_SIZE) -#define FIRST_USER_ADDRESS 0 +#define FIRST_USER_ADDRESS 0UL #define USER_PGD_PTRS (PAGE_OFFSET >> PGDIR_SHIFT) #define KERNEL_PGD_PTRS (PTRS_PER_PGD - USER_PGD_PTRS) diff --git a/arch/hexagon/include/asm/pgtable.h b/arch/hexagon/include/asm/pgtable.h index 6e35e71d2aea..49eab8136ec3 100644 --- a/arch/hexagon/include/asm/pgtable.h +++ b/arch/hexagon/include/asm/pgtable.h @@ -171,7 +171,7 @@ extern unsigned long _dflt_cache_att; extern pgd_t swapper_pg_dir[PTRS_PER_PGD]; /* located in head.S */ /* Seems to be zero even in architectures where the zero page is firewalled? */ -#define FIRST_USER_ADDRESS 0 +#define FIRST_USER_ADDRESS 0UL #define pte_special(pte) 0 #define pte_mkspecial(pte) (pte) diff --git a/arch/ia64/include/asm/pgtable.h b/arch/ia64/include/asm/pgtable.h index 2f07bb3dda91..7b6f8801df57 100644 --- a/arch/ia64/include/asm/pgtable.h +++ b/arch/ia64/include/asm/pgtable.h @@ -127,7 +127,7 @@ #define PTRS_PER_PGD_SHIFT PTRS_PER_PTD_SHIFT #define PTRS_PER_PGD (1UL << PTRS_PER_PGD_SHIFT) #define USER_PTRS_PER_PGD (5*PTRS_PER_PGD/8) /* regions 0-4 are user regions */ -#define FIRST_USER_ADDRESS 0 +#define FIRST_USER_ADDRESS 0UL /* * All the normal masks have the "page accessed" bits on, as any time diff --git a/arch/m32r/include/asm/pgtable.h b/arch/m32r/include/asm/pgtable.h index 050f7a686e3d..8c1fb902a9ce 100644 --- a/arch/m32r/include/asm/pgtable.h +++ b/arch/m32r/include/asm/pgtable.h @@ -53,7 +53,7 @@ extern unsigned long empty_zero_page[1024]; #define PGDIR_MASK (~(PGDIR_SIZE - 1)) #define USER_PTRS_PER_PGD (TASK_SIZE / PGDIR_SIZE) -#define FIRST_USER_ADDRESS 0 +#define FIRST_USER_ADDRESS 0UL #ifndef __ASSEMBLY__ /* Just any arbitrary offset to the start of the vmalloc VM area: the diff --git a/arch/m68k/include/asm/pgtable_mm.h b/arch/m68k/include/asm/pgtable_mm.h index 9f5abbda1ea7..28a145bfbb71 100644 --- a/arch/m68k/include/asm/pgtable_mm.h +++ b/arch/m68k/include/asm/pgtable_mm.h @@ -66,7 +66,7 @@ #define PTRS_PER_PGD 128 #endif #define USER_PTRS_PER_PGD (TASK_SIZE/PGDIR_SIZE) -#define FIRST_USER_ADDRESS 0 +#define FIRST_USER_ADDRESS 0UL /* Virtual address region for use by kernel_map() */ #ifdef CONFIG_SUN3 diff --git a/arch/microblaze/include/asm/pgtable.h b/arch/microblaze/include/asm/pgtable.h index c6b6af4ca2a0..e53b8532353c 100644 --- a/arch/microblaze/include/asm/pgtable.h +++ b/arch/microblaze/include/asm/pgtable.h @@ -72,7 +72,7 @@ extern int mem_init_done; #include #include -#define FIRST_USER_ADDRESS 0 +#define FIRST_USER_ADDRESS 0UL extern unsigned long va_to_phys(unsigned long address); extern pte_t *va_to_pte(unsigned long address); diff --git a/arch/mips/include/asm/pgtable-32.h b/arch/mips/include/asm/pgtable-32.h index 16aa9f23e17b..a6be006b6f75 100644 --- a/arch/mips/include/asm/pgtable-32.h +++ b/arch/mips/include/asm/pgtable-32.h @@ -57,7 +57,7 @@ extern int add_temporary_entry(unsigned long entrylo0, unsigned long entrylo1, #define PTRS_PER_PTE ((PAGE_SIZE << PTE_ORDER) / sizeof(pte_t)) #define USER_PTRS_PER_PGD (0x80000000UL/PGDIR_SIZE) -#define FIRST_USER_ADDRESS 0 +#define FIRST_USER_ADDRESS 0UL #define VMALLOC_START MAP_BASE diff --git a/arch/mn10300/include/asm/pgtable.h b/arch/mn10300/include/asm/pgtable.h index 629181ae111e..afab728ab65e 100644 --- a/arch/mn10300/include/asm/pgtable.h +++ b/arch/mn10300/include/asm/pgtable.h @@ -65,7 +65,7 @@ extern void paging_init(void); #define PGDIR_MASK (~(PGDIR_SIZE - 1)) #define USER_PTRS_PER_PGD (TASK_SIZE / PGDIR_SIZE) -#define FIRST_USER_ADDRESS 0 +#define FIRST_USER_ADDRESS 0UL #define USER_PGD_PTRS (PAGE_OFFSET >> PGDIR_SHIFT) #define KERNEL_PGD_PTRS (PTRS_PER_PGD - USER_PGD_PTRS) diff --git a/arch/nios2/include/asm/pgtable.h b/arch/nios2/include/asm/pgtable.h index 7b292e3a3138..a213e8c9aad0 100644 --- a/arch/nios2/include/asm/pgtable.h +++ b/arch/nios2/include/asm/pgtable.h @@ -24,7 +24,7 @@ #include #include -#define FIRST_USER_ADDRESS 0 +#define FIRST_USER_ADDRESS 0UL #define VMALLOC_START CONFIG_NIOS2_KERNEL_MMU_REGION_BASE #define VMALLOC_END (CONFIG_NIOS2_KERNEL_REGION_BASE - 1) diff --git a/arch/openrisc/include/asm/pgtable.h b/arch/openrisc/include/asm/pgtable.h index 18994ccb1185..69c7df0e1420 100644 --- a/arch/openrisc/include/asm/pgtable.h +++ b/arch/openrisc/include/asm/pgtable.h @@ -77,7 +77,7 @@ extern void paging_init(void); */ #define USER_PTRS_PER_PGD (TASK_SIZE/PGDIR_SIZE) -#define FIRST_USER_ADDRESS 0 +#define FIRST_USER_ADDRESS 0UL /* * Kernels own virtual memory area. diff --git a/arch/parisc/include/asm/pgtable.h b/arch/parisc/include/asm/pgtable.h index 1d49a4a7749b..8c966b2270aa 100644 --- a/arch/parisc/include/asm/pgtable.h +++ b/arch/parisc/include/asm/pgtable.h @@ -134,7 +134,7 @@ extern void purge_tlb_entries(struct mm_struct *, unsigned long); * pgd entries used up by user/kernel: */ -#define FIRST_USER_ADDRESS 0 +#define FIRST_USER_ADDRESS 0UL /* NB: The tlb miss handlers make certain assumptions about the order */ /* of the following bits, so be careful (One example, bits 25-31 */ diff --git a/arch/powerpc/include/asm/pgtable-ppc32.h b/arch/powerpc/include/asm/pgtable-ppc32.h index 234e07c47803..e48e3292e713 100644 --- a/arch/powerpc/include/asm/pgtable-ppc32.h +++ b/arch/powerpc/include/asm/pgtable-ppc32.h @@ -45,7 +45,7 @@ extern int icache_44x_need_flush; #define PTRS_PER_PGD (1 << (32 - PGDIR_SHIFT)) #define USER_PTRS_PER_PGD (TASK_SIZE / PGDIR_SIZE) -#define FIRST_USER_ADDRESS 0 +#define FIRST_USER_ADDRESS 0UL #define pte_ERROR(e) \ pr_err("%s:%d: bad pte %llx.\n", __FILE__, __LINE__, \ diff --git a/arch/powerpc/include/asm/pgtable-ppc64.h b/arch/powerpc/include/asm/pgtable-ppc64.h index b9dcc936e2d1..d46532ccc386 100644 --- a/arch/powerpc/include/asm/pgtable-ppc64.h +++ b/arch/powerpc/include/asm/pgtable-ppc64.h @@ -12,7 +12,7 @@ #endif #include -#define FIRST_USER_ADDRESS 0 +#define FIRST_USER_ADDRESS 0UL /* * Size of EA range mapped by our pagetables. diff --git a/arch/s390/include/asm/pgtable.h b/arch/s390/include/asm/pgtable.h index ffb1d8ce97ae..aabcd3f62d3b 100644 --- a/arch/s390/include/asm/pgtable.h +++ b/arch/s390/include/asm/pgtable.h @@ -99,7 +99,7 @@ extern unsigned long zero_page_mask; #endif /* CONFIG_64BIT */ #define PTRS_PER_PGD 2048 -#define FIRST_USER_ADDRESS 0 +#define FIRST_USER_ADDRESS 0UL #define pte_ERROR(e) \ printk("%s:%d: bad pte %p.\n", __FILE__, __LINE__, (void *) pte_val(e)) diff --git a/arch/score/include/asm/pgtable.h b/arch/score/include/asm/pgtable.h index 5170ffdea643..0553e5cd5985 100644 --- a/arch/score/include/asm/pgtable.h +++ b/arch/score/include/asm/pgtable.h @@ -27,7 +27,7 @@ extern pte_t invalid_pte_table[PAGE_SIZE/sizeof(pte_t)]; #define PTRS_PER_PTE 1024 #define USER_PTRS_PER_PGD (0x80000000UL/PGDIR_SIZE) -#define FIRST_USER_ADDRESS 0 +#define FIRST_USER_ADDRESS 0UL #define VMALLOC_START (0xc0000000UL) diff --git a/arch/sh/include/asm/pgtable.h b/arch/sh/include/asm/pgtable.h index cf434c64408d..89c513a982fc 100644 --- a/arch/sh/include/asm/pgtable.h +++ b/arch/sh/include/asm/pgtable.h @@ -62,7 +62,7 @@ static inline unsigned long long neff_sign_extend(unsigned long val) /* Entries per level */ #define PTRS_PER_PTE (PAGE_SIZE / (1 << PTE_MAGNITUDE)) -#define FIRST_USER_ADDRESS 0 +#define FIRST_USER_ADDRESS 0UL #define PHYS_ADDR_MASK29 0x1fffffff #define PHYS_ADDR_MASK32 0xffffffff diff --git a/arch/sparc/include/asm/pgtable_32.h b/arch/sparc/include/asm/pgtable_32.h index 9912eb0b499a..f06b36a00a3b 100644 --- a/arch/sparc/include/asm/pgtable_32.h +++ b/arch/sparc/include/asm/pgtable_32.h @@ -44,7 +44,7 @@ unsigned long __init bootmem_init(unsigned long *pages_avail); #define PTRS_PER_PMD SRMMU_PTRS_PER_PMD #define PTRS_PER_PGD SRMMU_PTRS_PER_PGD #define USER_PTRS_PER_PGD PAGE_OFFSET / SRMMU_PGDIR_SIZE -#define FIRST_USER_ADDRESS 0 +#define FIRST_USER_ADDRESS 0UL #define PTE_SIZE (PTRS_PER_PTE*4) #define PAGE_NONE SRMMU_PAGE_NONE diff --git a/arch/sparc/include/asm/pgtable_64.h b/arch/sparc/include/asm/pgtable_64.h index 2ac7873ad6fd..dc165ebdf05a 100644 --- a/arch/sparc/include/asm/pgtable_64.h +++ b/arch/sparc/include/asm/pgtable_64.h @@ -93,7 +93,7 @@ bool kern_addr_valid(unsigned long addr); #define PTRS_PER_PGD (1UL << PGDIR_BITS) /* Kernel has a separate 44bit address space. */ -#define FIRST_USER_ADDRESS 0 +#define FIRST_USER_ADDRESS 0UL #define pmd_ERROR(e) \ pr_err("%s:%d: bad pmd %p(%016lx) seen at (%pS)\n", \ diff --git a/arch/tile/include/asm/pgtable.h b/arch/tile/include/asm/pgtable.h index bc75b6ef2e79..95a4f19d16c5 100644 --- a/arch/tile/include/asm/pgtable.h +++ b/arch/tile/include/asm/pgtable.h @@ -67,7 +67,7 @@ extern void pgtable_cache_init(void); extern void paging_init(void); extern void set_page_homes(void); -#define FIRST_USER_ADDRESS 0 +#define FIRST_USER_ADDRESS 0UL #define _PAGE_PRESENT HV_PTE_PRESENT #define _PAGE_HUGE_PAGE HV_PTE_PAGE diff --git a/arch/um/include/asm/pgtable-2level.h b/arch/um/include/asm/pgtable-2level.h index 7afe86035fa7..cfbe59752469 100644 --- a/arch/um/include/asm/pgtable-2level.h +++ b/arch/um/include/asm/pgtable-2level.h @@ -23,7 +23,7 @@ #define PTRS_PER_PTE 1024 #define USER_PTRS_PER_PGD ((TASK_SIZE + (PGDIR_SIZE - 1)) / PGDIR_SIZE) #define PTRS_PER_PGD 1024 -#define FIRST_USER_ADDRESS 0 +#define FIRST_USER_ADDRESS 0UL #define pte_ERROR(e) \ printk("%s:%d: bad pte %p(%08lx).\n", __FILE__, __LINE__, &(e), \ diff --git a/arch/um/include/asm/pgtable-3level.h b/arch/um/include/asm/pgtable-3level.h index 344c559c0a17..2b4274e7c095 100644 --- a/arch/um/include/asm/pgtable-3level.h +++ b/arch/um/include/asm/pgtable-3level.h @@ -41,7 +41,7 @@ #endif #define USER_PTRS_PER_PGD ((TASK_SIZE + (PGDIR_SIZE - 1)) / PGDIR_SIZE) -#define FIRST_USER_ADDRESS 0 +#define FIRST_USER_ADDRESS 0UL #define pte_ERROR(e) \ printk("%s:%d: bad pte %p(%016lx).\n", __FILE__, __LINE__, &(e), \ diff --git a/arch/x86/include/asm/pgtable_types.h b/arch/x86/include/asm/pgtable_types.h index 5185a4f599ec..3e0230c94cff 100644 --- a/arch/x86/include/asm/pgtable_types.h +++ b/arch/x86/include/asm/pgtable_types.h @@ -4,7 +4,7 @@ #include #include -#define FIRST_USER_ADDRESS 0 +#define FIRST_USER_ADDRESS 0UL #define _PAGE_BIT_PRESENT 0 /* is present */ #define _PAGE_BIT_RW 1 /* writeable */ diff --git a/arch/xtensa/include/asm/pgtable.h b/arch/xtensa/include/asm/pgtable.h index 01b80dce9d65..a5e929a10c20 100644 --- a/arch/xtensa/include/asm/pgtable.h +++ b/arch/xtensa/include/asm/pgtable.h @@ -57,7 +57,7 @@ #define PTRS_PER_PGD 1024 #define PGD_ORDER 0 #define USER_PTRS_PER_PGD (TASK_SIZE/PGDIR_SIZE) -#define FIRST_USER_ADDRESS 0 +#define FIRST_USER_ADDRESS 0UL #define FIRST_USER_PGD_NR (FIRST_USER_ADDRESS >> PGDIR_SHIFT) /* -- cgit v1.2.3 From 8aa76875dc15b2dd21fa74eb7c12dc3c75f4b6b6 Mon Sep 17 00:00:00 2001 From: "Kirill A. Shutemov" Date: Wed, 11 Feb 2015 15:26:47 -0800 Subject: arm: define __PAGETABLE_PMD_FOLDED for !LPAE ARM uses custom implementation of PMD folding in 2-level page table case. Generic code expects to see __PAGETABLE_PMD_FOLDED to be defined if PMD is folded, but ARM doesn't do this. Let's fix it. Defining __PAGETABLE_PMD_FOLDED will drop out unused __pmd_alloc(). It also fixes problems with recently-introduced pmd accounting on ARM without LPAE. Signed-off-by: Kirill A. Shutemov Reported-by: Nishanth Menon Reported-by: Simon Horman Tested-by: Simon Horman Tested-by: Fabio Estevam Tested-by: Felipe Balbi Tested-by: Nishanth Menon Tested-by: Peter Ujfalusi Tested-by: Krzysztof Kozlowski Tested-by: Geert Uytterhoeven Cc: Dave Hansen Cc: Hugh Dickins Cc: Cyrill Gorcunov Cc: Pavel Emelyanov Cc: David Rientjes Cc: Russell King Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/arm/include/asm/pgtable-2level.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'arch') diff --git a/arch/arm/include/asm/pgtable-2level.h b/arch/arm/include/asm/pgtable-2level.h index bcc5e300413f..bfd662e49a25 100644 --- a/arch/arm/include/asm/pgtable-2level.h +++ b/arch/arm/include/asm/pgtable-2level.h @@ -10,6 +10,8 @@ #ifndef _ASM_PGTABLE_2LEVEL_H #define _ASM_PGTABLE_2LEVEL_H +#define __PAGETABLE_PMD_FOLDED + /* * Hardware-wise, we have a two level page table structure, where the first * level has 4096 entries, and the second level has 256 entries. Each entry -- cgit v1.2.3 From dc6c9a35b66b520cf67e05d8ca60ebecad3b0479 Mon Sep 17 00:00:00 2001 From: "Kirill A. Shutemov" Date: Wed, 11 Feb 2015 15:26:50 -0800 Subject: mm: account pmd page tables to the process Dave noticed that unprivileged process can allocate significant amount of memory -- >500 MiB on x86_64 -- and stay unnoticed by oom-killer and memory cgroup. The trick is to allocate a lot of PMD page tables. Linux kernel doesn't account PMD tables to the process, only PTE. The use-cases below use few tricks to allocate a lot of PMD page tables while keeping VmRSS and VmPTE low. oom_score for the process will be 0. #include #include #include #include #include #include #define PUD_SIZE (1UL << 30) #define PMD_SIZE (1UL << 21) #define NR_PUD 130000 int main(void) { char *addr = NULL; unsigned long i; prctl(PR_SET_THP_DISABLE); for (i = 0; i < NR_PUD ; i++) { addr = mmap(addr + PUD_SIZE, PUD_SIZE, PROT_WRITE|PROT_READ, MAP_ANONYMOUS|MAP_PRIVATE, -1, 0); if (addr == MAP_FAILED) { perror("mmap"); break; } *addr = 'x'; munmap(addr, PMD_SIZE); mmap(addr, PMD_SIZE, PROT_WRITE|PROT_READ, MAP_ANONYMOUS|MAP_PRIVATE|MAP_FIXED, -1, 0); if (addr == MAP_FAILED) perror("re-mmap"), exit(1); } printf("PID %d consumed %lu KiB in PMD page tables\n", getpid(), i * 4096 >> 10); return pause(); } The patch addresses the issue by account PMD tables to the process the same way we account PTE. The main place where PMD tables is accounted is __pmd_alloc() and free_pmd_range(). But there're few corner cases: - HugeTLB can share PMD page tables. The patch handles by accounting the table to all processes who share it. - x86 PAE pre-allocates few PMD tables on fork. - Architectures with FIRST_USER_ADDRESS > 0. We need to adjust sanity check on exit(2). Accounting only happens on configuration where PMD page table's level is present (PMD is not folded). As with nr_ptes we use per-mm counter. The counter value is used to calculate baseline for badness score by oom-killer. Signed-off-by: Kirill A. Shutemov Reported-by: Dave Hansen Cc: Hugh Dickins Reviewed-by: Cyrill Gorcunov Cc: Pavel Emelyanov Cc: David Rientjes Tested-by: Sedat Dilek Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/x86/mm/pgtable.c | 14 +++++++++----- 1 file changed, 9 insertions(+), 5 deletions(-) (limited to 'arch') diff --git a/arch/x86/mm/pgtable.c b/arch/x86/mm/pgtable.c index 6fb6927f9e76..7b22adaad4f1 100644 --- a/arch/x86/mm/pgtable.c +++ b/arch/x86/mm/pgtable.c @@ -190,7 +190,7 @@ void pud_populate(struct mm_struct *mm, pud_t *pudp, pmd_t *pmd) #endif /* CONFIG_X86_PAE */ -static void free_pmds(pmd_t *pmds[]) +static void free_pmds(struct mm_struct *mm, pmd_t *pmds[]) { int i; @@ -198,10 +198,11 @@ static void free_pmds(pmd_t *pmds[]) if (pmds[i]) { pgtable_pmd_page_dtor(virt_to_page(pmds[i])); free_page((unsigned long)pmds[i]); + mm_dec_nr_pmds(mm); } } -static int preallocate_pmds(pmd_t *pmds[]) +static int preallocate_pmds(struct mm_struct *mm, pmd_t *pmds[]) { int i; bool failed = false; @@ -215,11 +216,13 @@ static int preallocate_pmds(pmd_t *pmds[]) pmd = NULL; failed = true; } + if (pmd) + mm_inc_nr_pmds(mm); pmds[i] = pmd; } if (failed) { - free_pmds(pmds); + free_pmds(mm, pmds); return -ENOMEM; } @@ -246,6 +249,7 @@ static void pgd_mop_up_pmds(struct mm_struct *mm, pgd_t *pgdp) paravirt_release_pmd(pgd_val(pgd) >> PAGE_SHIFT); pmd_free(mm, pmd); + mm_dec_nr_pmds(mm); } } } @@ -283,7 +287,7 @@ pgd_t *pgd_alloc(struct mm_struct *mm) mm->pgd = pgd; - if (preallocate_pmds(pmds) != 0) + if (preallocate_pmds(mm, pmds) != 0) goto out_free_pgd; if (paravirt_pgd_alloc(mm) != 0) @@ -304,7 +308,7 @@ pgd_t *pgd_alloc(struct mm_struct *mm) return pgd; out_free_pmds: - free_pmds(pmds); + free_pmds(mm, pmds); out_free_pgd: free_page((unsigned long)pgd); out: -- cgit v1.2.3 From b30fe6c7ced70f62862c3d09357e7e8084e98d9f Mon Sep 17 00:00:00 2001 From: "Kirill A. Shutemov" Date: Wed, 11 Feb 2015 15:26:53 -0800 Subject: mm: fix false-positive warning on exit due mm_nr_pmds(mm) The problem is that we check nr_ptes/nr_pmds in exit_mmap() which happens *before* pgd_free(). And if an arch does pte/pmd allocation in pgd_alloc() and frees them in pgd_free() we see offset in counters by the time of the checks. We tried to workaround this by offsetting expected counter value according to FIRST_USER_ADDRESS for both nr_pte and nr_pmd in exit_mmap(). But it doesn't work in some cases: 1. ARM with LPAE enabled also has non-zero USER_PGTABLES_CEILING, but upper addresses occupied with huge pmd entries, so the trick with offsetting expected counter value will get really ugly: we will have to apply it nr_pmds, but not nr_ptes. 2. Metag has non-zero FIRST_USER_ADDRESS, but doesn't do allocation pte/pmd page tables allocation in pgd_alloc(), just setup a pgd entry which is allocated at boot and shared accross all processes. The proposal is to move the check to check_mm() which happens *after* pgd_free() and do proper accounting during pgd_alloc() and pgd_free() which would bring counters to zero if nothing leaked. Signed-off-by: Kirill A. Shutemov Reported-by: Tyler Baker Tested-by: Tyler Baker Tested-by: Nishanth Menon Cc: Russell King Cc: James Hogan Cc: Guan Xuetao Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/arm/mm/pgd.c | 4 ++++ arch/unicore32/mm/pgd.c | 3 +++ 2 files changed, 7 insertions(+) (limited to 'arch') diff --git a/arch/arm/mm/pgd.c b/arch/arm/mm/pgd.c index 249379535be2..a3681f11dd9f 100644 --- a/arch/arm/mm/pgd.c +++ b/arch/arm/mm/pgd.c @@ -97,6 +97,7 @@ pgd_t *pgd_alloc(struct mm_struct *mm) no_pte: pmd_free(mm, new_pmd); + mm_dec_nr_pmds(mm); no_pmd: pud_free(mm, new_pud); no_pud: @@ -130,9 +131,11 @@ void pgd_free(struct mm_struct *mm, pgd_t *pgd_base) pte = pmd_pgtable(*pmd); pmd_clear(pmd); pte_free(mm, pte); + atomic_long_dec(&mm->nr_ptes); no_pmd: pud_clear(pud); pmd_free(mm, pmd); + mm_dec_nr_pmds(mm); no_pud: pgd_clear(pgd); pud_free(mm, pud); @@ -152,6 +155,7 @@ no_pgd: pmd = pmd_offset(pud, 0); pud_clear(pud); pmd_free(mm, pmd); + mm_dec_nr_pmds(mm); pgd_clear(pgd); pud_free(mm, pud); } diff --git a/arch/unicore32/mm/pgd.c b/arch/unicore32/mm/pgd.c index 08b8d4295e70..2ade20d8eab3 100644 --- a/arch/unicore32/mm/pgd.c +++ b/arch/unicore32/mm/pgd.c @@ -69,6 +69,7 @@ pgd_t *get_pgd_slow(struct mm_struct *mm) no_pte: pmd_free(mm, new_pmd); + mm_dec_nr_pmds(mm); no_pmd: free_pages((unsigned long)new_pgd, 0); no_pgd: @@ -96,7 +97,9 @@ void free_pgd_slow(struct mm_struct *mm, pgd_t *pgd) pte = pmd_pgtable(*pmd); pmd_clear(pmd); pte_free(mm, pte); + atomic_long_dec(&mm->nr_ptes); pmd_free(mm, pmd); + mm_dec_nr_pmds(mm); free: free_pages((unsigned long) pgd, 0); } -- cgit v1.2.3 From a7b780750e1a1c7833812681e1f8fa30bbb06802 Mon Sep 17 00:00:00 2001 From: Andrea Arcangeli Date: Wed, 11 Feb 2015 15:27:23 -0800 Subject: mm: gup: use get_user_pages_unlocked within get_user_pages_fast This allows the get_user_pages_fast slow path to release the mmap_sem before blocking. Signed-off-by: Andrea Arcangeli Reviewed-by: Kirill A. Shutemov Cc: Andres Lagar-Cavilla Cc: Peter Feiner Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/mips/mm/gup.c | 8 +++----- arch/s390/mm/gup.c | 6 ++---- arch/sh/mm/gup.c | 6 ++---- arch/sparc/mm/gup.c | 6 ++---- arch/x86/mm/gup.c | 7 +++---- 5 files changed, 12 insertions(+), 21 deletions(-) (limited to 'arch') diff --git a/arch/mips/mm/gup.c b/arch/mips/mm/gup.c index 70795a67a276..349995d19c7f 100644 --- a/arch/mips/mm/gup.c +++ b/arch/mips/mm/gup.c @@ -301,11 +301,9 @@ slow_irqon: start += nr << PAGE_SHIFT; pages += nr; - down_read(&mm->mmap_sem); - ret = get_user_pages(current, mm, start, - (end - start) >> PAGE_SHIFT, - write, 0, pages, NULL); - up_read(&mm->mmap_sem); + ret = get_user_pages_unlocked(current, mm, start, + (end - start) >> PAGE_SHIFT, + write, 0, pages); /* Have to be a bit careful with return values */ if (nr > 0) { diff --git a/arch/s390/mm/gup.c b/arch/s390/mm/gup.c index 639fce464008..5c586c78ca8d 100644 --- a/arch/s390/mm/gup.c +++ b/arch/s390/mm/gup.c @@ -235,10 +235,8 @@ int get_user_pages_fast(unsigned long start, int nr_pages, int write, /* Try to get the remaining pages with get_user_pages */ start += nr << PAGE_SHIFT; pages += nr; - down_read(&mm->mmap_sem); - ret = get_user_pages(current, mm, start, - nr_pages - nr, write, 0, pages, NULL); - up_read(&mm->mmap_sem); + ret = get_user_pages_unlocked(current, mm, start, + nr_pages - nr, write, 0, pages); /* Have to be a bit careful with return values */ if (nr > 0) ret = (ret < 0) ? nr : ret + nr; diff --git a/arch/sh/mm/gup.c b/arch/sh/mm/gup.c index 37458f38b220..e15f52a17b6c 100644 --- a/arch/sh/mm/gup.c +++ b/arch/sh/mm/gup.c @@ -257,10 +257,8 @@ slow_irqon: start += nr << PAGE_SHIFT; pages += nr; - down_read(&mm->mmap_sem); - ret = get_user_pages(current, mm, start, - (end - start) >> PAGE_SHIFT, write, 0, pages, NULL); - up_read(&mm->mmap_sem); + ret = get_user_pages_unlocked(current, mm, start, + (end - start) >> PAGE_SHIFT, write, 0, pages); /* Have to be a bit careful with return values */ if (nr > 0) { diff --git a/arch/sparc/mm/gup.c b/arch/sparc/mm/gup.c index ae6ce383d4df..2e5c4fc2daa9 100644 --- a/arch/sparc/mm/gup.c +++ b/arch/sparc/mm/gup.c @@ -249,10 +249,8 @@ slow: start += nr << PAGE_SHIFT; pages += nr; - down_read(&mm->mmap_sem); - ret = get_user_pages(current, mm, start, - (end - start) >> PAGE_SHIFT, write, 0, pages, NULL); - up_read(&mm->mmap_sem); + ret = get_user_pages_unlocked(current, mm, start, + (end - start) >> PAGE_SHIFT, write, 0, pages); /* Have to be a bit careful with return values */ if (nr > 0) { diff --git a/arch/x86/mm/gup.c b/arch/x86/mm/gup.c index 224b14235e96..89df70e0caa6 100644 --- a/arch/x86/mm/gup.c +++ b/arch/x86/mm/gup.c @@ -388,10 +388,9 @@ slow_irqon: start += nr << PAGE_SHIFT; pages += nr; - down_read(&mm->mmap_sem); - ret = get_user_pages(current, mm, start, - (end - start) >> PAGE_SHIFT, write, 0, pages, NULL); - up_read(&mm->mmap_sem); + ret = get_user_pages_unlocked(current, mm, start, + (end - start) >> PAGE_SHIFT, + write, 0, pages); /* Have to be a bit careful with return values */ if (nr > 0) { -- cgit v1.2.3 From 1757bbd9c5918a9c1a8757141763a23f2e446caa Mon Sep 17 00:00:00 2001 From: Naoya Horiguchi Date: Wed, 11 Feb 2015 15:28:00 -0800 Subject: arch/powerpc/mm/subpage-prot.c: use walk->vma and walk_page_vma() We don't have to use mm_walk->private to pass vma to the callback function because of mm_walk->vma. And walk_page_vma() is useful if we walk over a single vma. Signed-off-by: Naoya Horiguchi Acked-by: Kirill A. Shutemov Cc: "Kirill A. Shutemov" Cc: Andrea Arcangeli Cc: Cyrill Gorcunov Cc: Dave Hansen Cc: Pavel Emelyanov Cc: Benjamin Herrenschmidt Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/powerpc/mm/subpage-prot.c | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) (limited to 'arch') diff --git a/arch/powerpc/mm/subpage-prot.c b/arch/powerpc/mm/subpage-prot.c index 6c0b1f5f8d2c..fa9fb5b4c66c 100644 --- a/arch/powerpc/mm/subpage-prot.c +++ b/arch/powerpc/mm/subpage-prot.c @@ -134,7 +134,7 @@ static void subpage_prot_clear(unsigned long addr, unsigned long len) static int subpage_walk_pmd_entry(pmd_t *pmd, unsigned long addr, unsigned long end, struct mm_walk *walk) { - struct vm_area_struct *vma = walk->private; + struct vm_area_struct *vma = walk->vma; split_huge_page_pmd(vma, addr, pmd); return 0; } @@ -163,9 +163,7 @@ static void subpage_mark_vma_nohuge(struct mm_struct *mm, unsigned long addr, if (vma->vm_start >= (addr + len)) break; vma->vm_flags |= VM_NOHUGEPAGE; - subpage_proto_walk.private = vma; - walk_page_range(vma->vm_start, vma->vm_end, - &subpage_proto_walk); + walk_page_vma(vma, &subpage_proto_walk); vma = vma->vm_next; } } -- cgit v1.2.3