From d8c1bdeb5d6b62b34a78391206a5e55e4a02d58f Mon Sep 17 00:00:00 2001 From: "Kirill A. Shutemov" Date: Fri, 15 Jan 2016 16:51:13 -0800 Subject: page-flags: trivial cleanup for PageTrans* helpers MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Use TESTPAGEFLAG_FALSE() to make it a bit cleaner. Signed-off-by: Kirill A. Shutemov Cc: Andrea Arcangeli Cc: Hugh Dickins Cc: Dave Hansen Cc: Mel Gorman Cc: Rik van Riel Cc: Vlastimil Babka Cc: Christoph Lameter Cc: Naoya Horiguchi Cc: Steve Capper Cc: "Aneesh Kumar K.V" Cc: Johannes Weiner Cc: Michal Hocko Cc: Jerome Marchand Cc: Jérôme Glisse Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/page-flags.h | 18 +++--------------- 1 file changed, 3 insertions(+), 15 deletions(-) (limited to 'include/linux') diff --git a/include/linux/page-flags.h b/include/linux/page-flags.h index bb53c7b86315..45792d01edea 100644 --- a/include/linux/page-flags.h +++ b/include/linux/page-flags.h @@ -485,21 +485,9 @@ static inline int PageTransTail(struct page *page) } #else - -static inline int PageTransHuge(struct page *page) -{ - return 0; -} - -static inline int PageTransCompound(struct page *page) -{ - return 0; -} - -static inline int PageTransTail(struct page *page) -{ - return 0; -} +TESTPAGEFLAG_FALSE(TransHuge) +TESTPAGEFLAG_FALSE(TransCompound) +TESTPAGEFLAG_FALSE(TransTail) #endif /* -- cgit v1.2.3 From 0e6d31a7336f41ef0375f5398c79e54de8e219b6 Mon Sep 17 00:00:00 2001 From: "Kirill A. Shutemov" Date: Fri, 15 Jan 2016 16:51:17 -0800 Subject: page-flags: move code around MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The preparation patch: we are going to use compound_head(), PageTail() and PageCompound() to define page-flags helpers. Let's define them before the macros. We cannot use the PageHead() helper in PageCompound() as it's not yet defined -- use test_bit(PG_head, &page->flags) instead. Signed-off-by: Kirill A.
Shutemov Cc: Andrea Arcangeli Cc: Hugh Dickins Cc: Dave Hansen Cc: Mel Gorman Cc: Rik van Riel Cc: Vlastimil Babka Cc: Christoph Lameter Cc: Naoya Horiguchi Cc: Steve Capper Cc: "Aneesh Kumar K.V" Cc: Johannes Weiner Cc: Michal Hocko Cc: Jerome Marchand Cc: Jérôme Glisse Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/page-flags.h | 41 +++++++++++++++++++++-------------------- 1 file changed, 21 insertions(+), 20 deletions(-) (limited to 'include/linux') diff --git a/include/linux/page-flags.h b/include/linux/page-flags.h index 45792d01edea..83161a22509c 100644 --- a/include/linux/page-flags.h +++ b/include/linux/page-flags.h @@ -133,6 +133,27 @@ enum pageflags { #ifndef __GENERATING_BOUNDS_H +struct page; /* forward declaration */ + +static inline struct page *compound_head(struct page *page) +{ + unsigned long head = READ_ONCE(page->compound_head); + + if (unlikely(head & 1)) + return (struct page *) (head - 1); + return page; +} + +static inline int PageTail(struct page *page) +{ + return READ_ONCE(page->compound_head) & 1; +} + +static inline int PageCompound(struct page *page) +{ + return test_bit(PG_head, &page->flags) || PageTail(page); +} + /* * Macros to create function definitions for page flags */ @@ -204,7 +225,6 @@ static inline int __TestClearPage##uname(struct page *page) { return 0; } #define TESTSCFLAG_FALSE(uname) \ TESTSETFLAG_FALSE(uname) TESTCLEARFLAG_FALSE(uname) -struct page; /* forward declaration */ TESTPAGEFLAG(Locked, locked) PAGEFLAG(Error, error) TESTCLEARFLAG(Error, error) @@ -395,11 +415,6 @@ static inline void set_page_writeback_keepwrite(struct page *page) __PAGEFLAG(Head, head) CLEARPAGEFLAG(Head, head) -static inline int PageTail(struct page *page) -{ - return READ_ONCE(page->compound_head) & 1; -} - static inline void set_compound_head(struct page *page, struct page *head) { WRITE_ONCE(page->compound_head, (unsigned long)head + 1); @@ -410,20 +425,6 @@ static inline void clear_compound_head(struct page *page) WRITE_ONCE(page->compound_head, 0); } -static inline struct page *compound_head(struct page *page) -{ - unsigned long head = READ_ONCE(page->compound_head); - - if (unlikely(head & 1)) - return (struct page *) (head - 1); - return page; -} - -static inline int PageCompound(struct page *page) -{ - return PageHead(page) || PageTail(page); - -} #ifdef CONFIG_TRANSPARENT_HUGEPAGE static inline void ClearPageCompound(struct page *page) { -- cgit v1.2.3 From 95ad97554ac81b31139d4fe5ed8757a07087cd90 Mon Sep 17 00:00:00 2001 From: "Kirill A. Shutemov" Date: Fri, 15 Jan 2016 16:51:21 -0800 Subject: page-flags: introduce page flags policies wrt compound pages MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This patch adds a third argument to macros which create function definitions for page flags. This argument defines how page-flags helpers behave on compound pages. For now we define four policies: - PF_ANY: the helper function operates on the page it gets, regardless if it's non-compound, head or tail. - PF_HEAD: the helper function operates on the head page of the compound page if it gets a tail page. - PF_NO_TAIL: only head and non-compound pages are acceptable for this helper function. - PF_NO_COMPOUND: only non-compound pages are acceptable for this helper function. For now we use policy PF_ANY for all helpers, which matches current behaviour. We do not enforce the policy for TESTPAGEFLAG, because we have flags checked for random pages all over the kernel.
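For illustration only (this expansion is a sketch reconstructed from the macros introduced below; PageFoo/PG_foo is a hypothetical flag, not one defined by this series), a declaration like PAGEFLAG(Foo, foo, PF_HEAD) generates roughly:

/* PF_HEAD: every access is redirected to the head page of a compound page */
static inline int PageFoo(struct page *page)
	{ return test_bit(PG_foo, &compound_head(page)->flags); }
static inline void SetPageFoo(struct page *page)
	{ set_bit(PG_foo, &compound_head(page)->flags); }
static inline void ClearPageFoo(struct page *page)
	{ clear_bit(PG_foo, &compound_head(page)->flags); }

The PF_NO_TAIL and PF_NO_COMPOUND policies additionally call VM_BUG_ON_PGFLAGS() on a tail (respectively any compound) page, but only when the second 'enforce' argument is non-zero: the test helpers pass 0 and the modifying helpers pass 1, which is how the policy is left unenforced for plain flag tests.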
Noticeable exception to this is PageTransHuge() which triggers VM_BUG_ON() for tail page. Signed-off-by: Kirill A. Shutemov Cc: Andrea Arcangeli Cc: Hugh Dickins Cc: Dave Hansen Cc: Mel Gorman Cc: Rik van Riel Cc: Vlastimil Babka Cc: Christoph Lameter Cc: Naoya Horiguchi Cc: Steve Capper Cc: "Aneesh Kumar K.V" Cc: Johannes Weiner Cc: Michal Hocko Cc: Jerome Marchand Cc: Jérôme Glisse Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/mmdebug.h | 6 ++ include/linux/page-flags.h | 166 ++++++++++++++++++++++++++++----------------- 2 files changed, 108 insertions(+), 64 deletions(-) (limited to 'include/linux') diff --git a/include/linux/mmdebug.h b/include/linux/mmdebug.h index 772362adf471..053824b0a412 100644 --- a/include/linux/mmdebug.h +++ b/include/linux/mmdebug.h @@ -56,4 +56,10 @@ void dump_mm(const struct mm_struct *mm); #define VIRTUAL_BUG_ON(cond) do { } while (0) #endif +#ifdef CONFIG_DEBUG_VM_PGFLAGS +#define VM_BUG_ON_PGFLAGS(cond, page) VM_BUG_ON_PAGE(cond, page) +#else +#define VM_BUG_ON_PGFLAGS(cond, page) BUILD_BUG_ON_INVALID(cond) +#endif + #endif diff --git a/include/linux/page-flags.h b/include/linux/page-flags.h index 83161a22509c..12ab023b67f2 100644 --- a/include/linux/page-flags.h +++ b/include/linux/page-flags.h @@ -154,49 +154,80 @@ static inline int PageCompound(struct page *page) return test_bit(PG_head, &page->flags) || PageTail(page); } +/* + * Page flags policies wrt compound pages + * + * PF_ANY: + * the page flag is relevant for small, head and tail pages. + * + * PF_HEAD: + * for compound page all operations related to the page flag applied to + * head page. + * + * PF_NO_TAIL: + * modifications of the page flag must be done on small or head pages, + * checks can be done on tail pages too. + * + * PF_NO_COMPOUND: + * the page flag is not relevant for compound pages. 
+ */ +#define PF_ANY(page, enforce) page +#define PF_HEAD(page, enforce) compound_head(page) +#define PF_NO_TAIL(page, enforce) ({ \ + VM_BUG_ON_PGFLAGS(enforce && PageTail(page), page); \ + compound_head(page);}) +#define PF_NO_COMPOUND(page, enforce) ({ \ + VM_BUG_ON_PGFLAGS(enforce && PageCompound(page), page); \ + page;}) + /* * Macros to create function definitions for page flags */ -#define TESTPAGEFLAG(uname, lname) \ -static inline int Page##uname(const struct page *page) \ - { return test_bit(PG_##lname, &page->flags); } +#define TESTPAGEFLAG(uname, lname, policy) \ +static inline int Page##uname(struct page *page) \ + { return test_bit(PG_##lname, &policy(page, 0)->flags); } -#define SETPAGEFLAG(uname, lname) \ +#define SETPAGEFLAG(uname, lname, policy) \ static inline void SetPage##uname(struct page *page) \ - { set_bit(PG_##lname, &page->flags); } + { set_bit(PG_##lname, &policy(page, 1)->flags); } -#define CLEARPAGEFLAG(uname, lname) \ +#define CLEARPAGEFLAG(uname, lname, policy) \ static inline void ClearPage##uname(struct page *page) \ - { clear_bit(PG_##lname, &page->flags); } + { clear_bit(PG_##lname, &policy(page, 1)->flags); } -#define __SETPAGEFLAG(uname, lname) \ +#define __SETPAGEFLAG(uname, lname, policy) \ static inline void __SetPage##uname(struct page *page) \ - { __set_bit(PG_##lname, &page->flags); } + { __set_bit(PG_##lname, &policy(page, 1)->flags); } -#define __CLEARPAGEFLAG(uname, lname) \ +#define __CLEARPAGEFLAG(uname, lname, policy) \ static inline void __ClearPage##uname(struct page *page) \ - { __clear_bit(PG_##lname, &page->flags); } + { __clear_bit(PG_##lname, &policy(page, 1)->flags); } -#define TESTSETFLAG(uname, lname) \ +#define TESTSETFLAG(uname, lname, policy) \ static inline int TestSetPage##uname(struct page *page) \ - { return test_and_set_bit(PG_##lname, &page->flags); } + { return test_and_set_bit(PG_##lname, &policy(page, 1)->flags); } -#define TESTCLEARFLAG(uname, lname) \ +#define TESTCLEARFLAG(uname, lname, policy) \ static inline int TestClearPage##uname(struct page *page) \ - { return test_and_clear_bit(PG_##lname, &page->flags); } + { return test_and_clear_bit(PG_##lname, &policy(page, 1)->flags); } -#define __TESTCLEARFLAG(uname, lname) \ +#define __TESTCLEARFLAG(uname, lname, policy) \ static inline int __TestClearPage##uname(struct page *page) \ - { return __test_and_clear_bit(PG_##lname, &page->flags); } + { return __test_and_clear_bit(PG_##lname, &policy(page, 1)->flags); } -#define PAGEFLAG(uname, lname) TESTPAGEFLAG(uname, lname) \ - SETPAGEFLAG(uname, lname) CLEARPAGEFLAG(uname, lname) +#define PAGEFLAG(uname, lname, policy) \ + TESTPAGEFLAG(uname, lname, policy) \ + SETPAGEFLAG(uname, lname, policy) \ + CLEARPAGEFLAG(uname, lname, policy) -#define __PAGEFLAG(uname, lname) TESTPAGEFLAG(uname, lname) \ - __SETPAGEFLAG(uname, lname) __CLEARPAGEFLAG(uname, lname) +#define __PAGEFLAG(uname, lname, policy) \ + TESTPAGEFLAG(uname, lname, policy) \ + __SETPAGEFLAG(uname, lname, policy) \ + __CLEARPAGEFLAG(uname, lname, policy) -#define TESTSCFLAG(uname, lname) \ - TESTSETFLAG(uname, lname) TESTCLEARFLAG(uname, lname) +#define TESTSCFLAG(uname, lname, policy) \ + TESTSETFLAG(uname, lname, policy) \ + TESTCLEARFLAG(uname, lname, policy) #define TESTPAGEFLAG_FALSE(uname) \ static inline int Page##uname(const struct page *page) { return 0; } @@ -225,46 +256,48 @@ static inline int __TestClearPage##uname(struct page *page) { return 0; } #define TESTSCFLAG_FALSE(uname) \ TESTSETFLAG_FALSE(uname) TESTCLEARFLAG_FALSE(uname) - 
-TESTPAGEFLAG(Locked, locked) -PAGEFLAG(Error, error) TESTCLEARFLAG(Error, error) -PAGEFLAG(Referenced, referenced) TESTCLEARFLAG(Referenced, referenced) - __SETPAGEFLAG(Referenced, referenced) -PAGEFLAG(Dirty, dirty) TESTSCFLAG(Dirty, dirty) __CLEARPAGEFLAG(Dirty, dirty) -PAGEFLAG(LRU, lru) __CLEARPAGEFLAG(LRU, lru) -PAGEFLAG(Active, active) __CLEARPAGEFLAG(Active, active) - TESTCLEARFLAG(Active, active) -__PAGEFLAG(Slab, slab) -PAGEFLAG(Checked, checked) /* Used by some filesystems */ -PAGEFLAG(Pinned, pinned) TESTSCFLAG(Pinned, pinned) /* Xen */ -PAGEFLAG(SavePinned, savepinned); /* Xen */ -PAGEFLAG(Foreign, foreign); /* Xen */ -PAGEFLAG(Reserved, reserved) __CLEARPAGEFLAG(Reserved, reserved) -PAGEFLAG(SwapBacked, swapbacked) __CLEARPAGEFLAG(SwapBacked, swapbacked) - __SETPAGEFLAG(SwapBacked, swapbacked) - -__PAGEFLAG(SlobFree, slob_free) +TESTPAGEFLAG(Locked, locked, PF_ANY) +PAGEFLAG(Error, error, PF_ANY) TESTCLEARFLAG(Error, error, PF_ANY) +PAGEFLAG(Referenced, referenced, PF_ANY) TESTCLEARFLAG(Referenced, referenced, PF_ANY) + __SETPAGEFLAG(Referenced, referenced, PF_ANY) +PAGEFLAG(Dirty, dirty, PF_ANY) TESTSCFLAG(Dirty, dirty, PF_ANY) + __CLEARPAGEFLAG(Dirty, dirty, PF_ANY) +PAGEFLAG(LRU, lru, PF_ANY) __CLEARPAGEFLAG(LRU, lru, PF_ANY) +PAGEFLAG(Active, active, PF_ANY) __CLEARPAGEFLAG(Active, active, PF_ANY) + TESTCLEARFLAG(Active, active, PF_ANY) +__PAGEFLAG(Slab, slab, PF_ANY) +PAGEFLAG(Checked, checked, PF_ANY) /* Used by some filesystems */ +PAGEFLAG(Pinned, pinned, PF_ANY) TESTSCFLAG(Pinned, pinned, PF_ANY) /* Xen */ +PAGEFLAG(SavePinned, savepinned, PF_ANY); /* Xen */ +PAGEFLAG(Foreign, foreign, PF_ANY); /* Xen */ +PAGEFLAG(Reserved, reserved, PF_ANY) __CLEARPAGEFLAG(Reserved, reserved, PF_ANY) +PAGEFLAG(SwapBacked, swapbacked, PF_ANY) + __CLEARPAGEFLAG(SwapBacked, swapbacked, PF_ANY) + __SETPAGEFLAG(SwapBacked, swapbacked, PF_ANY) + +__PAGEFLAG(SlobFree, slob_free, PF_ANY) /* * Private page markings that may be used by the filesystem that owns the page * for its own purposes. * - PG_private and PG_private_2 cause releasepage() and co to be invoked */ -PAGEFLAG(Private, private) __SETPAGEFLAG(Private, private) - __CLEARPAGEFLAG(Private, private) -PAGEFLAG(Private2, private_2) TESTSCFLAG(Private2, private_2) -PAGEFLAG(OwnerPriv1, owner_priv_1) TESTCLEARFLAG(OwnerPriv1, owner_priv_1) +PAGEFLAG(Private, private, PF_ANY) __SETPAGEFLAG(Private, private, PF_ANY) + __CLEARPAGEFLAG(Private, private, PF_ANY) +PAGEFLAG(Private2, private_2, PF_ANY) TESTSCFLAG(Private2, private_2, PF_ANY) +PAGEFLAG(OwnerPriv1, owner_priv_1, PF_ANY) + TESTCLEARFLAG(OwnerPriv1, owner_priv_1, PF_ANY) /* * Only test-and-set exist for PG_writeback. The unconditional operators are * risky: they bypass page accounting. 
*/ -TESTPAGEFLAG(Writeback, writeback) TESTSCFLAG(Writeback, writeback) -PAGEFLAG(MappedToDisk, mappedtodisk) +TESTPAGEFLAG(Writeback, writeback, PF_ANY) TESTSCFLAG(Writeback, writeback, PF_ANY) +PAGEFLAG(MappedToDisk, mappedtodisk, PF_ANY) /* PG_readahead is only used for reads; PG_reclaim is only for writes */ -PAGEFLAG(Reclaim, reclaim) TESTCLEARFLAG(Reclaim, reclaim) -PAGEFLAG(Readahead, reclaim) TESTCLEARFLAG(Readahead, reclaim) +PAGEFLAG(Reclaim, reclaim, PF_ANY) TESTCLEARFLAG(Reclaim, reclaim, PF_ANY) +PAGEFLAG(Readahead, reclaim, PF_ANY) TESTCLEARFLAG(Readahead, reclaim, PF_ANY) #ifdef CONFIG_HIGHMEM /* @@ -277,31 +310,32 @@ PAGEFLAG_FALSE(HighMem) #endif #ifdef CONFIG_SWAP -PAGEFLAG(SwapCache, swapcache) +PAGEFLAG(SwapCache, swapcache, PF_ANY) #else PAGEFLAG_FALSE(SwapCache) #endif -PAGEFLAG(Unevictable, unevictable) __CLEARPAGEFLAG(Unevictable, unevictable) - TESTCLEARFLAG(Unevictable, unevictable) +PAGEFLAG(Unevictable, unevictable, PF_ANY) + __CLEARPAGEFLAG(Unevictable, unevictable, PF_ANY) + TESTCLEARFLAG(Unevictable, unevictable, PF_ANY) #ifdef CONFIG_MMU -PAGEFLAG(Mlocked, mlocked) __CLEARPAGEFLAG(Mlocked, mlocked) - TESTSCFLAG(Mlocked, mlocked) __TESTCLEARFLAG(Mlocked, mlocked) +PAGEFLAG(Mlocked, mlocked, PF_ANY) __CLEARPAGEFLAG(Mlocked, mlocked, PF_ANY) + TESTSCFLAG(Mlocked, mlocked, PF_ANY) __TESTCLEARFLAG(Mlocked, mlocked, PF_ANY) #else PAGEFLAG_FALSE(Mlocked) __CLEARPAGEFLAG_NOOP(Mlocked) TESTSCFLAG_FALSE(Mlocked) __TESTCLEARFLAG_FALSE(Mlocked) #endif #ifdef CONFIG_ARCH_USES_PG_UNCACHED -PAGEFLAG(Uncached, uncached) +PAGEFLAG(Uncached, uncached, PF_ANY) #else PAGEFLAG_FALSE(Uncached) #endif #ifdef CONFIG_MEMORY_FAILURE -PAGEFLAG(HWPoison, hwpoison) -TESTSCFLAG(HWPoison, hwpoison) +PAGEFLAG(HWPoison, hwpoison, PF_ANY) +TESTSCFLAG(HWPoison, hwpoison, PF_ANY) #define __PG_HWPOISON (1UL << PG_hwpoison) #else PAGEFLAG_FALSE(HWPoison) @@ -309,10 +343,10 @@ PAGEFLAG_FALSE(HWPoison) #endif #if defined(CONFIG_IDLE_PAGE_TRACKING) && defined(CONFIG_64BIT) -TESTPAGEFLAG(Young, young) -SETPAGEFLAG(Young, young) -TESTCLEARFLAG(Young, young) -PAGEFLAG(Idle, idle) +TESTPAGEFLAG(Young, young, PF_ANY) +SETPAGEFLAG(Young, young, PF_ANY) +TESTCLEARFLAG(Young, young, PF_ANY) +PAGEFLAG(Idle, idle, PF_ANY) #endif /* @@ -393,7 +427,7 @@ static inline void SetPageUptodate(struct page *page) set_bit(PG_uptodate, &(page)->flags); } -CLEARPAGEFLAG(Uptodate, uptodate) +CLEARPAGEFLAG(Uptodate, uptodate, PF_ANY) int test_clear_page_writeback(struct page *page); int __test_set_page_writeback(struct page *page, bool keep_write); @@ -413,7 +447,7 @@ static inline void set_page_writeback_keepwrite(struct page *page) test_set_page_writeback_keepwrite(page); } -__PAGEFLAG(Head, head) CLEARPAGEFLAG(Head, head) +__PAGEFLAG(Head, head, PF_ANY) CLEARPAGEFLAG(Head, head, PF_ANY) static inline void set_compound_head(struct page *page, struct page *head) { @@ -615,6 +649,10 @@ static inline int page_has_private(struct page *page) return !!(page->flags & PAGE_FLAGS_PRIVATE); } +#undef PF_ANY +#undef PF_HEAD +#undef PF_NO_TAIL +#undef PF_NO_COMPOUND #endif /* !__GENERATING_BOUNDS_H */ #endif /* PAGE_FLAGS_H */ -- cgit v1.2.3 From 48c935ad88f5be20eb5445a77c171351b1eb5111 Mon Sep 17 00:00:00 2001 From: "Kirill A. Shutemov" Date: Fri, 15 Jan 2016 16:51:24 -0800 Subject: page-flags: define PG_locked behavior on compound pages MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit lock_page() must operate on the whole compound page. 
It doesn't make much sense to lock only part of a compound page. Change the code to use the head page's PG_locked if a tail page is passed. This patch also gets rid of the custom helper functions -- __set_page_locked() and __clear_page_locked(). They are replaced with helpers generated by __SETPAGEFLAG/__CLEARPAGEFLAG. Passing a tail page to these helpers would trigger VM_BUG_ON(). SLUB uses PG_locked as a bit spin lock. IIUC, tail pages should never appear there. VM_BUG_ON() is added to make sure that this assumption is correct. [akpm@linux-foundation.org: fix fs/cifs/file.c] Signed-off-by: Kirill A. Shutemov Cc: Andrea Arcangeli Cc: Hugh Dickins Cc: Dave Hansen Cc: Mel Gorman Cc: Rik van Riel Cc: Vlastimil Babka Cc: Christoph Lameter Cc: Naoya Horiguchi Cc: Steve Capper Cc: "Aneesh Kumar K.V" Cc: Johannes Weiner Cc: Michal Hocko Cc: Jerome Marchand Cc: Jérôme Glisse Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/page-flags.h | 2 +- include/linux/pagemap.h | 25 ++++++++----------------- 2 files changed, 9 insertions(+), 18 deletions(-) (limited to 'include/linux') diff --git a/include/linux/page-flags.h b/include/linux/page-flags.h index 12ab023b67f2..32c87eb470cb 100644 --- a/include/linux/page-flags.h +++ b/include/linux/page-flags.h @@ -256,7 +256,7 @@ static inline int __TestClearPage##uname(struct page *page) { return 0; } #define TESTSCFLAG_FALSE(uname) \ TESTSETFLAG_FALSE(uname) TESTCLEARFLAG_FALSE(uname) -TESTPAGEFLAG(Locked, locked, PF_ANY) +__PAGEFLAG(Locked, locked, PF_NO_TAIL) PAGEFLAG(Error, error, PF_ANY) TESTCLEARFLAG(Error, error, PF_ANY) PAGEFLAG(Referenced, referenced, PF_ANY) TESTCLEARFLAG(Referenced, referenced, PF_ANY) __SETPAGEFLAG(Referenced, referenced, PF_ANY) diff --git a/include/linux/pagemap.h b/include/linux/pagemap.h index 26eabf5ec718..df214a4b886d 100644 --- a/include/linux/pagemap.h +++ b/include/linux/pagemap.h @@ -433,18 +433,9 @@ extern int __lock_page_or_retry(struct page *page, struct mm_struct *mm, unsigned int flags); extern void unlock_page(struct page *page); -static inline void __set_page_locked(struct page *page) -{ - __set_bit(PG_locked, &page->flags); -} - -static inline void __clear_page_locked(struct page *page) -{ - __clear_bit(PG_locked, &page->flags); -} - static inline int trylock_page(struct page *page) { + page = compound_head(page); return (likely(!test_and_set_bit_lock(PG_locked, &page->flags))); } @@ -497,9 +488,9 @@ extern int wait_on_page_bit_killable_timeout(struct page *page, static inline int wait_on_page_locked_killable(struct page *page) { - if (PageLocked(page)) - return wait_on_page_bit_killable(page, PG_locked); - return 0; + if (!PageLocked(page)) + return 0; + return wait_on_page_bit_killable(compound_head(page), PG_locked); } extern wait_queue_head_t *page_waitqueue(struct page *page); @@ -518,7 +509,7 @@ static inline void wake_up_page(struct page *page, int bit) static inline void wait_on_page_locked(struct page *page) { if (PageLocked(page)) - wait_on_page_bit(page, PG_locked); + wait_on_page_bit(compound_head(page), PG_locked); } /* @@ -664,17 +655,17 @@ int replace_page_cache_page(struct page *old, struct page *new, gfp_t gfp_mask); /* * Like add_to_page_cache_locked, but used to add newly allocated pages: - * the page is new, so we can just run __set_page_locked() against it. + * the page is new, so we can just run __SetPageLocked() against it.
*/ static inline int add_to_page_cache(struct page *page, struct address_space *mapping, pgoff_t offset, gfp_t gfp_mask) { int error; - __set_page_locked(page); + __SetPageLocked(page); error = add_to_page_cache_locked(page, mapping, offset, gfp_mask); if (unlikely(error)) - __clear_page_locked(page); + __ClearPageLocked(page); return error; } -- cgit v1.2.3 From df8c94d13c7e30f4471f8faa8d544809a0e52865 Mon Sep 17 00:00:00 2001 From: "Kirill A. Shutemov" Date: Fri, 15 Jan 2016 16:51:28 -0800 Subject: page-flags: define behavior of FS/IO-related flags on compound pages MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit It seems we don't have compound page on FS/IO path currently. Use PF_NO_COMPOUND to catch if we have. The odd exception is PG_dirty: sound uses compound pages and maps them with PTEs. PF_NO_COMPOUND triggers VM_BUG_ON() in set_page_dirty() on handling shared fault. Let's use PF_HEAD for PG_dirty. Signed-off-by: Kirill A. Shutemov Cc: Andrea Arcangeli Cc: Hugh Dickins Cc: Dave Hansen Cc: Mel Gorman Cc: Rik van Riel Cc: Vlastimil Babka Cc: Christoph Lameter Cc: Naoya Horiguchi Cc: Steve Capper Cc: "Aneesh Kumar K.V" Cc: Johannes Weiner Cc: Michal Hocko Cc: Jerome Marchand Cc: Jérôme Glisse Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/page-flags.h | 23 +++++++++++++---------- 1 file changed, 13 insertions(+), 10 deletions(-) (limited to 'include/linux') diff --git a/include/linux/page-flags.h b/include/linux/page-flags.h index 32c87eb470cb..2493f80b949b 100644 --- a/include/linux/page-flags.h +++ b/include/linux/page-flags.h @@ -257,16 +257,16 @@ static inline int __TestClearPage##uname(struct page *page) { return 0; } TESTSETFLAG_FALSE(uname) TESTCLEARFLAG_FALSE(uname) __PAGEFLAG(Locked, locked, PF_NO_TAIL) -PAGEFLAG(Error, error, PF_ANY) TESTCLEARFLAG(Error, error, PF_ANY) +PAGEFLAG(Error, error, PF_NO_COMPOUND) TESTCLEARFLAG(Error, error, PF_NO_COMPOUND) PAGEFLAG(Referenced, referenced, PF_ANY) TESTCLEARFLAG(Referenced, referenced, PF_ANY) __SETPAGEFLAG(Referenced, referenced, PF_ANY) -PAGEFLAG(Dirty, dirty, PF_ANY) TESTSCFLAG(Dirty, dirty, PF_ANY) - __CLEARPAGEFLAG(Dirty, dirty, PF_ANY) +PAGEFLAG(Dirty, dirty, PF_HEAD) TESTSCFLAG(Dirty, dirty, PF_HEAD) + __CLEARPAGEFLAG(Dirty, dirty, PF_HEAD) PAGEFLAG(LRU, lru, PF_ANY) __CLEARPAGEFLAG(LRU, lru, PF_ANY) PAGEFLAG(Active, active, PF_ANY) __CLEARPAGEFLAG(Active, active, PF_ANY) TESTCLEARFLAG(Active, active, PF_ANY) __PAGEFLAG(Slab, slab, PF_ANY) -PAGEFLAG(Checked, checked, PF_ANY) /* Used by some filesystems */ +PAGEFLAG(Checked, checked, PF_NO_COMPOUND) /* Used by some filesystems */ PAGEFLAG(Pinned, pinned, PF_ANY) TESTSCFLAG(Pinned, pinned, PF_ANY) /* Xen */ PAGEFLAG(SavePinned, savepinned, PF_ANY); /* Xen */ PAGEFLAG(Foreign, foreign, PF_ANY); /* Xen */ @@ -292,12 +292,15 @@ PAGEFLAG(OwnerPriv1, owner_priv_1, PF_ANY) * Only test-and-set exist for PG_writeback. The unconditional operators are * risky: they bypass page accounting. 
*/ -TESTPAGEFLAG(Writeback, writeback, PF_ANY) TESTSCFLAG(Writeback, writeback, PF_ANY) -PAGEFLAG(MappedToDisk, mappedtodisk, PF_ANY) +TESTPAGEFLAG(Writeback, writeback, PF_NO_COMPOUND) + TESTSCFLAG(Writeback, writeback, PF_NO_COMPOUND) +PAGEFLAG(MappedToDisk, mappedtodisk, PF_NO_COMPOUND) /* PG_readahead is only used for reads; PG_reclaim is only for writes */ -PAGEFLAG(Reclaim, reclaim, PF_ANY) TESTCLEARFLAG(Reclaim, reclaim, PF_ANY) -PAGEFLAG(Readahead, reclaim, PF_ANY) TESTCLEARFLAG(Readahead, reclaim, PF_ANY) +PAGEFLAG(Reclaim, reclaim, PF_NO_COMPOUND) + TESTCLEARFLAG(Reclaim, reclaim, PF_NO_COMPOUND) +PAGEFLAG(Readahead, reclaim, PF_NO_COMPOUND) + TESTCLEARFLAG(Readahead, reclaim, PF_NO_COMPOUND) #ifdef CONFIG_HIGHMEM /* @@ -413,7 +416,7 @@ static inline int PageUptodate(struct page *page) static inline void __SetPageUptodate(struct page *page) { smp_wmb(); - __set_bit(PG_uptodate, &(page)->flags); + __set_bit(PG_uptodate, &page->flags); } static inline void SetPageUptodate(struct page *page) @@ -424,7 +427,7 @@ static inline void SetPageUptodate(struct page *page) * uptodate are actually visible before PageUptodate becomes true. */ smp_wmb(); - set_bit(PG_uptodate, &(page)->flags); + set_bit(PG_uptodate, &page->flags); } CLEARPAGEFLAG(Uptodate, uptodate, PF_ANY) -- cgit v1.2.3 From 8cb38fabb6bc1ba8bcec83eaf04848d886b54d28 Mon Sep 17 00:00:00 2001 From: "Kirill A. Shutemov" Date: Fri, 15 Jan 2016 16:51:32 -0800 Subject: page-flags: define behavior of LRU-related flags on compound pages MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Only head pages are ever on LRU. Let's use PF_HEAD policy to avoid any confusion for all LRU-related flags. Signed-off-by: Kirill A. Shutemov Cc: Andrea Arcangeli Cc: Hugh Dickins Cc: Dave Hansen Cc: Mel Gorman Cc: Rik van Riel Cc: Vlastimil Babka Cc: Christoph Lameter Cc: Naoya Horiguchi Cc: Steve Capper Cc: "Aneesh Kumar K.V" Cc: Johannes Weiner Cc: Michal Hocko Cc: Jerome Marchand Cc: Jérôme Glisse Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/page-flags.h | 17 +++++++++-------- 1 file changed, 9 insertions(+), 8 deletions(-) (limited to 'include/linux') diff --git a/include/linux/page-flags.h b/include/linux/page-flags.h index 2493f80b949b..88a3bcba57d6 100644 --- a/include/linux/page-flags.h +++ b/include/linux/page-flags.h @@ -258,13 +258,14 @@ static inline int __TestClearPage##uname(struct page *page) { return 0; } __PAGEFLAG(Locked, locked, PF_NO_TAIL) PAGEFLAG(Error, error, PF_NO_COMPOUND) TESTCLEARFLAG(Error, error, PF_NO_COMPOUND) -PAGEFLAG(Referenced, referenced, PF_ANY) TESTCLEARFLAG(Referenced, referenced, PF_ANY) - __SETPAGEFLAG(Referenced, referenced, PF_ANY) +PAGEFLAG(Referenced, referenced, PF_HEAD) + TESTCLEARFLAG(Referenced, referenced, PF_HEAD) + __SETPAGEFLAG(Referenced, referenced, PF_HEAD) PAGEFLAG(Dirty, dirty, PF_HEAD) TESTSCFLAG(Dirty, dirty, PF_HEAD) __CLEARPAGEFLAG(Dirty, dirty, PF_HEAD) -PAGEFLAG(LRU, lru, PF_ANY) __CLEARPAGEFLAG(LRU, lru, PF_ANY) -PAGEFLAG(Active, active, PF_ANY) __CLEARPAGEFLAG(Active, active, PF_ANY) - TESTCLEARFLAG(Active, active, PF_ANY) +PAGEFLAG(LRU, lru, PF_HEAD) __CLEARPAGEFLAG(LRU, lru, PF_HEAD) +PAGEFLAG(Active, active, PF_HEAD) __CLEARPAGEFLAG(Active, active, PF_HEAD) + TESTCLEARFLAG(Active, active, PF_HEAD) __PAGEFLAG(Slab, slab, PF_ANY) PAGEFLAG(Checked, checked, PF_NO_COMPOUND) /* Used by some filesystems */ PAGEFLAG(Pinned, pinned, PF_ANY) TESTSCFLAG(Pinned, pinned, PF_ANY) /* Xen */ @@ -318,9 +319,9 @@ 
PAGEFLAG(SwapCache, swapcache, PF_ANY) PAGEFLAG_FALSE(SwapCache) #endif -PAGEFLAG(Unevictable, unevictable, PF_ANY) - __CLEARPAGEFLAG(Unevictable, unevictable, PF_ANY) - TESTCLEARFLAG(Unevictable, unevictable, PF_ANY) +PAGEFLAG(Unevictable, unevictable, PF_HEAD) + __CLEARPAGEFLAG(Unevictable, unevictable, PF_HEAD) + TESTCLEARFLAG(Unevictable, unevictable, PF_HEAD) #ifdef CONFIG_MMU PAGEFLAG(Mlocked, mlocked, PF_ANY) __CLEARPAGEFLAG(Mlocked, mlocked, PF_ANY) -- cgit v1.2.3 From dcb351cd095a3a1e1100b74f15a0100cf9a0c700 Mon Sep 17 00:00:00 2001 From: "Kirill A. Shutemov" Date: Fri, 15 Jan 2016 16:51:35 -0800 Subject: page-flags: define behavior SL*B-related flags on compound pages MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit SL*B uses compound pages and marks head pages with PG_slab. __SetPageSlab() and __ClearPageSlab() are never called for tail pages. The same situation with PG_slob_free in SLOB allocator. PF_NO_TAIL is appropriate for these flags. Signed-off-by: Kirill A. Shutemov Cc: Andrea Arcangeli Cc: Hugh Dickins Cc: Dave Hansen Cc: Mel Gorman Cc: Rik van Riel Cc: Vlastimil Babka Cc: Christoph Lameter Cc: Naoya Horiguchi Cc: Steve Capper Cc: "Aneesh Kumar K.V" Cc: Johannes Weiner Cc: Michal Hocko Cc: Jerome Marchand Cc: Jérôme Glisse Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/page-flags.h | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/page-flags.h b/include/linux/page-flags.h index 88a3bcba57d6..29d8805aaa23 100644 --- a/include/linux/page-flags.h +++ b/include/linux/page-flags.h @@ -266,7 +266,8 @@ PAGEFLAG(Dirty, dirty, PF_HEAD) TESTSCFLAG(Dirty, dirty, PF_HEAD) PAGEFLAG(LRU, lru, PF_HEAD) __CLEARPAGEFLAG(LRU, lru, PF_HEAD) PAGEFLAG(Active, active, PF_HEAD) __CLEARPAGEFLAG(Active, active, PF_HEAD) TESTCLEARFLAG(Active, active, PF_HEAD) -__PAGEFLAG(Slab, slab, PF_ANY) +__PAGEFLAG(Slab, slab, PF_NO_TAIL) +__PAGEFLAG(SlobFree, slob_free, PF_NO_TAIL) PAGEFLAG(Checked, checked, PF_NO_COMPOUND) /* Used by some filesystems */ PAGEFLAG(Pinned, pinned, PF_ANY) TESTSCFLAG(Pinned, pinned, PF_ANY) /* Xen */ PAGEFLAG(SavePinned, savepinned, PF_ANY); /* Xen */ @@ -276,8 +277,6 @@ PAGEFLAG(SwapBacked, swapbacked, PF_ANY) __CLEARPAGEFLAG(SwapBacked, swapbacked, PF_ANY) __SETPAGEFLAG(SwapBacked, swapbacked, PF_ANY) -__PAGEFLAG(SlobFree, slob_free, PF_ANY) - /* * Private page markings that may be used by the filesystem that owns the page * for its own purposes. -- cgit v1.2.3 From c13985fa800312fdcd4c7d67a1f55abcbc2f6b7d Mon Sep 17 00:00:00 2001 From: "Kirill A. Shutemov" Date: Fri, 15 Jan 2016 16:51:39 -0800 Subject: page-flags: define behavior of Xen-related flags on compound pages MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit PG_pinned and PG_savepinned are about page table's pages which are never compound. I'm not so sure about PG_foreign, but it seems we shouldn't see compound pages there too. Let's use PF_NO_COMPOUND for all of them. Signed-off-by: Kirill A. 
Shutemov Cc: Andrea Arcangeli Cc: Hugh Dickins Cc: Dave Hansen Cc: Mel Gorman Cc: Rik van Riel Cc: Vlastimil Babka Cc: Christoph Lameter Cc: Naoya Horiguchi Cc: Steve Capper Cc: "Aneesh Kumar K.V" Cc: Johannes Weiner Cc: Michal Hocko Cc: Jerome Marchand Cc: Jérôme Glisse Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/page-flags.h | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/page-flags.h b/include/linux/page-flags.h index 29d8805aaa23..6e7c7c66b6ca 100644 --- a/include/linux/page-flags.h +++ b/include/linux/page-flags.h @@ -269,9 +269,13 @@ PAGEFLAG(Active, active, PF_HEAD) __CLEARPAGEFLAG(Active, active, PF_HEAD) __PAGEFLAG(Slab, slab, PF_NO_TAIL) __PAGEFLAG(SlobFree, slob_free, PF_NO_TAIL) PAGEFLAG(Checked, checked, PF_NO_COMPOUND) /* Used by some filesystems */ -PAGEFLAG(Pinned, pinned, PF_ANY) TESTSCFLAG(Pinned, pinned, PF_ANY) /* Xen */ -PAGEFLAG(SavePinned, savepinned, PF_ANY); /* Xen */ -PAGEFLAG(Foreign, foreign, PF_ANY); /* Xen */ + +/* Xen */ +PAGEFLAG(Pinned, pinned, PF_NO_COMPOUND) + TESTSCFLAG(Pinned, pinned, PF_NO_COMPOUND) +PAGEFLAG(SavePinned, savepinned, PF_NO_COMPOUND); +PAGEFLAG(Foreign, foreign, PF_NO_COMPOUND); + PAGEFLAG(Reserved, reserved, PF_ANY) __CLEARPAGEFLAG(Reserved, reserved, PF_ANY) PAGEFLAG(SwapBacked, swapbacked, PF_ANY) __CLEARPAGEFLAG(SwapBacked, swapbacked, PF_ANY) -- cgit v1.2.3 From de09d31dd38a50fdce106c15abd68432eebbd014 Mon Sep 17 00:00:00 2001 From: "Kirill A. Shutemov" Date: Fri, 15 Jan 2016 16:51:42 -0800 Subject: page-flags: define PG_reserved behavior on compound pages MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit As far as I can see there's no users of PG_reserved on compound pages. Let's use PF_NO_COMPOUND here. Signed-off-by: Kirill A. Shutemov Cc: Andrea Arcangeli Cc: Hugh Dickins Cc: Dave Hansen Cc: Mel Gorman Cc: Rik van Riel Cc: Vlastimil Babka Cc: Christoph Lameter Cc: Naoya Horiguchi Cc: Steve Capper Cc: "Aneesh Kumar K.V" Cc: Johannes Weiner Cc: Michal Hocko Cc: Jerome Marchand Cc: Jérôme Glisse Cc: Hillf Danton Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/page-flags.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/page-flags.h b/include/linux/page-flags.h index 6e7c7c66b6ca..dbfd8f325f98 100644 --- a/include/linux/page-flags.h +++ b/include/linux/page-flags.h @@ -276,7 +276,8 @@ PAGEFLAG(Pinned, pinned, PF_NO_COMPOUND) PAGEFLAG(SavePinned, savepinned, PF_NO_COMPOUND); PAGEFLAG(Foreign, foreign, PF_NO_COMPOUND); -PAGEFLAG(Reserved, reserved, PF_ANY) __CLEARPAGEFLAG(Reserved, reserved, PF_ANY) +PAGEFLAG(Reserved, reserved, PF_NO_COMPOUND) + __CLEARPAGEFLAG(Reserved, reserved, PF_NO_COMPOUND) PAGEFLAG(SwapBacked, swapbacked, PF_ANY) __CLEARPAGEFLAG(SwapBacked, swapbacked, PF_ANY) __SETPAGEFLAG(SwapBacked, swapbacked, PF_ANY) -- cgit v1.2.3 From da5efc408baefd686b0ee2cbd1353eb10ec71a0f Mon Sep 17 00:00:00 2001 From: "Kirill A. Shutemov" Date: Fri, 15 Jan 2016 16:51:46 -0800 Subject: page-flags: define PG_swapbacked behavior on compound pages MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit PG_swapbacked is used for transparent huge pages. For head pages only. Let's use PF_NO_TAIL policy. Signed-off-by: Kirill A. 
Shutemov Cc: Andrea Arcangeli Cc: Hugh Dickins Cc: Dave Hansen Cc: Mel Gorman Cc: Rik van Riel Cc: Vlastimil Babka Cc: Christoph Lameter Cc: Naoya Horiguchi Cc: Steve Capper Cc: "Aneesh Kumar K.V" Cc: Johannes Weiner Cc: Michal Hocko Cc: Jerome Marchand Cc: Jérôme Glisse Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/page-flags.h | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/page-flags.h b/include/linux/page-flags.h index dbfd8f325f98..eda487ecc01c 100644 --- a/include/linux/page-flags.h +++ b/include/linux/page-flags.h @@ -278,9 +278,9 @@ PAGEFLAG(Foreign, foreign, PF_NO_COMPOUND); PAGEFLAG(Reserved, reserved, PF_NO_COMPOUND) __CLEARPAGEFLAG(Reserved, reserved, PF_NO_COMPOUND) -PAGEFLAG(SwapBacked, swapbacked, PF_ANY) - __CLEARPAGEFLAG(SwapBacked, swapbacked, PF_ANY) - __SETPAGEFLAG(SwapBacked, swapbacked, PF_ANY) +PAGEFLAG(SwapBacked, swapbacked, PF_NO_TAIL) + __CLEARPAGEFLAG(SwapBacked, swapbacked, PF_NO_TAIL) + __SETPAGEFLAG(SwapBacked, swapbacked, PF_NO_TAIL) /* * Private page markings that may be used by the filesystem that owns the page * for its own purposes. -- cgit v1.2.3 From 50ea78d676d4282a403f956229505b9fddf69f3a Mon Sep 17 00:00:00 2001 From: "Kirill A. Shutemov" Date: Fri, 15 Jan 2016 16:51:49 -0800 Subject: page-flags: define PG_swapcache behavior on compound pages MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Swap cannot handle compound pages so far. Transparent huge pages are split on the way to swap. Signed-off-by: Kirill A. Shutemov Cc: Andrea Arcangeli Cc: Hugh Dickins Cc: Dave Hansen Cc: Mel Gorman Cc: Rik van Riel Cc: Vlastimil Babka Cc: Christoph Lameter Cc: Naoya Horiguchi Cc: Steve Capper Cc: "Aneesh Kumar K.V" Cc: Johannes Weiner Cc: Michal Hocko Cc: Jerome Marchand Cc: Jérôme Glisse Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/page-flags.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/page-flags.h b/include/linux/page-flags.h index eda487ecc01c..7fc2ea83cbd5 100644 --- a/include/linux/page-flags.h +++ b/include/linux/page-flags.h @@ -318,7 +318,7 @@ PAGEFLAG_FALSE(HighMem) #endif #ifdef CONFIG_SWAP -PAGEFLAG(SwapCache, swapcache, PF_ANY) +PAGEFLAG(SwapCache, swapcache, PF_NO_COMPOUND) #else PAGEFLAG_FALSE(SwapCache) #endif -- cgit v1.2.3 From e4f87d5d752d259b274681420b010c65006301a6 Mon Sep 17 00:00:00 2001 From: "Kirill A. Shutemov" Date: Fri, 15 Jan 2016 16:51:53 -0800 Subject: page-flags: define PG_mlocked behavior on compound pages MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Transparent huge pages can be mlocked -- the whole compound page at once. Something has gone wrong if we're trying to mlock() a tail page. Let's use PF_NO_TAIL. Signed-off-by: Kirill A.
Shutemov Cc: Andrea Arcangeli Cc: Hugh Dickins Cc: Dave Hansen Cc: Mel Gorman Cc: Rik van Riel Cc: Vlastimil Babka Cc: Christoph Lameter Cc: Naoya Horiguchi Cc: Steve Capper Cc: "Aneesh Kumar K.V" Cc: Johannes Weiner Cc: Michal Hocko Cc: Jerome Marchand Cc: Jérôme Glisse Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/page-flags.h | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/page-flags.h b/include/linux/page-flags.h index 7fc2ea83cbd5..43b7acb092ff 100644 --- a/include/linux/page-flags.h +++ b/include/linux/page-flags.h @@ -328,8 +328,10 @@ PAGEFLAG(Unevictable, unevictable, PF_HEAD) TESTCLEARFLAG(Unevictable, unevictable, PF_HEAD) #ifdef CONFIG_MMU -PAGEFLAG(Mlocked, mlocked, PF_ANY) __CLEARPAGEFLAG(Mlocked, mlocked, PF_ANY) - TESTSCFLAG(Mlocked, mlocked, PF_ANY) __TESTCLEARFLAG(Mlocked, mlocked, PF_ANY) +PAGEFLAG(Mlocked, mlocked, PF_NO_TAIL) + __CLEARPAGEFLAG(Mlocked, mlocked, PF_NO_TAIL) + TESTSCFLAG(Mlocked, mlocked, PF_NO_TAIL) + __TESTCLEARFLAG(Mlocked, mlocked, PF_NO_TAIL) #else PAGEFLAG_FALSE(Mlocked) __CLEARPAGEFLAG_NOOP(Mlocked) TESTSCFLAG_FALSE(Mlocked) __TESTCLEARFLAG_FALSE(Mlocked) -- cgit v1.2.3 From b9d418170aefb020ebd4c60040d69c4399851aa3 Mon Sep 17 00:00:00 2001 From: "Kirill A. Shutemov" Date: Fri, 15 Jan 2016 16:51:56 -0800 Subject: page-flags: define PG_uncached behavior on compound pages MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit So far, only IA64 uses PG_uncached and only on non-compound pages. Signed-off-by: Kirill A. Shutemov Cc: Andrea Arcangeli Cc: Hugh Dickins Cc: Dave Hansen Cc: Mel Gorman Cc: Rik van Riel Cc: Vlastimil Babka Cc: Christoph Lameter Cc: Naoya Horiguchi Cc: Steve Capper Cc: "Aneesh Kumar K.V" Cc: Johannes Weiner Cc: Michal Hocko Cc: Jerome Marchand Cc: Jérôme Glisse Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/page-flags.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/page-flags.h b/include/linux/page-flags.h index 43b7acb092ff..dff90852fbc6 100644 --- a/include/linux/page-flags.h +++ b/include/linux/page-flags.h @@ -338,7 +338,7 @@ PAGEFLAG_FALSE(Mlocked) __CLEARPAGEFLAG_NOOP(Mlocked) #endif #ifdef CONFIG_ARCH_USES_PG_UNCACHED -PAGEFLAG(Uncached, uncached, PF_ANY) +PAGEFLAG(Uncached, uncached, PF_NO_COMPOUND) #else PAGEFLAG_FALSE(Uncached) #endif -- cgit v1.2.3 From d2998c4de2937b964ea63aa2c08183f28462d532 Mon Sep 17 00:00:00 2001 From: "Kirill A. Shutemov" Date: Fri, 15 Jan 2016 16:52:00 -0800 Subject: page-flags: define PG_uptodate behavior on compound pages MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit We use PG_uptodate on head pages on transparent huge page. Let's use PF_NO_TAIL. Signed-off-by: Kirill A. 
Shutemov Cc: Andrea Arcangeli Cc: Hugh Dickins Cc: Dave Hansen Cc: Mel Gorman Cc: Rik van Riel Cc: Vlastimil Babka Cc: Christoph Lameter Cc: Naoya Horiguchi Cc: Steve Capper Cc: "Aneesh Kumar K.V" Cc: Johannes Weiner Cc: Michal Hocko Cc: Jerome Marchand Cc: Jérôme Glisse Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/page-flags.h | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/page-flags.h b/include/linux/page-flags.h index dff90852fbc6..818fa39538a9 100644 --- a/include/linux/page-flags.h +++ b/include/linux/page-flags.h @@ -404,8 +404,9 @@ u64 stable_page_flags(struct page *page); static inline int PageUptodate(struct page *page) { - int ret = test_bit(PG_uptodate, &(page)->flags); - + int ret; + page = compound_head(page); + ret = test_bit(PG_uptodate, &(page)->flags); /* * Must ensure that the data we read out of the page is loaded * _after_ we've loaded page->flags to check for PageUptodate. @@ -422,12 +423,14 @@ static inline int PageUptodate(struct page *page) static inline void __SetPageUptodate(struct page *page) { + VM_BUG_ON_PAGE(PageTail(page), page); smp_wmb(); __set_bit(PG_uptodate, &page->flags); } static inline void SetPageUptodate(struct page *page) { + VM_BUG_ON_PAGE(PageTail(page), page); /* * Memory barrier must be issued before setting the PG_uptodate bit, * so that all previous stores issued in order to bring the page @@ -437,7 +440,7 @@ static inline void SetPageUptodate(struct page *page) set_bit(PG_uptodate, &page->flags); } -CLEARPAGEFLAG(Uptodate, uptodate, PF_ANY) +CLEARPAGEFLAG(Uptodate, uptodate, PF_NO_TAIL) int test_clear_page_writeback(struct page *page); int __test_set_page_writeback(struct page *page, bool keep_write); -- cgit v1.2.3 From 822cdd1152265d87fcfc974e06c3b68f762987fd Mon Sep 17 00:00:00 2001 From: "Kirill A. Shutemov" Date: Fri, 15 Jan 2016 16:52:03 -0800 Subject: page-flags: look at head page if the flag is encoded in page->mapping MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit PageAnon() and PageKsm() look at lower bits of page->mapping to check if the page is Anon or KSM. page->mapping can be overloaded in tail pages. Let's always look at head page to avoid false-positives. Signed-off-by: Kirill A. 
Shutemov Cc: Andrea Arcangeli Cc: Hugh Dickins Cc: Dave Hansen Cc: Mel Gorman Cc: Rik van Riel Cc: Vlastimil Babka Cc: Christoph Lameter Cc: Naoya Horiguchi Cc: Steve Capper Cc: "Aneesh Kumar K.V" Cc: Johannes Weiner Cc: Michal Hocko Cc: Jerome Marchand Cc: Jérôme Glisse Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/page-flags.h | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/page-flags.h b/include/linux/page-flags.h index 818fa39538a9..190f1915a097 100644 --- a/include/linux/page-flags.h +++ b/include/linux/page-flags.h @@ -176,7 +176,7 @@ static inline int PageCompound(struct page *page) #define PF_NO_TAIL(page, enforce) ({ \ VM_BUG_ON_PGFLAGS(enforce && PageTail(page), page); \ compound_head(page);}) -#define PF_NO_COMPOUND(page, enforce) ({ \ +#define PF_NO_COMPOUND(page, enforce) ({ \ VM_BUG_ON_PGFLAGS(enforce && PageCompound(page), page); \ page;}) @@ -381,6 +381,7 @@ PAGEFLAG(Idle, idle, PF_ANY) static inline int PageAnon(struct page *page) { + page = compound_head(page); return ((unsigned long)page->mapping & PAGE_MAPPING_ANON) != 0; } @@ -393,6 +394,7 @@ static inline int PageAnon(struct page *page) */ static inline int PageKsm(struct page *page) { + page = compound_head(page); return ((unsigned long)page->mapping & PAGE_MAPPING_FLAGS) == (PAGE_MAPPING_ANON | PAGE_MAPPING_KSM); } -- cgit v1.2.3 From 1c290f642101e64f379e38ea0361d097c08e824d Mon Sep 17 00:00:00 2001 From: "Kirill A. Shutemov" Date: Fri, 15 Jan 2016 16:52:07 -0800 Subject: mm: sanitize page->mapping for tail pages MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit We don't define the meaning of page->mapping for tail pages. Currently it's always NULL, which can be inconsistent with the head page and potentially lead to problems. Let's poison the pointer to catch all illegal uses. page_rmapping(), page_mapping() and page_anon_vma() are changed to look at the head page. The only illegal use I've caught so far is __GFP_COMP pages from the sound subsystem, mapped with PTEs. do_shared_fault() is changed to use page_rmapping() instead of direct access to fault_page->mapping. Signed-off-by: Kirill A. Shutemov Reviewed-by: Jérôme Glisse Cc: Andrea Arcangeli Cc: Hugh Dickins Cc: Dave Hansen Cc: Mel Gorman Cc: Rik van Riel Cc: Vlastimil Babka Cc: Christoph Lameter Cc: Naoya Horiguchi Cc: Steve Capper Cc: "Aneesh Kumar K.V" Cc: Johannes Weiner Cc: Michal Hocko Cc: Jerome Marchand Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/poison.h | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'include/linux') diff --git a/include/linux/poison.h b/include/linux/poison.h index 317e16de09e5..76c3b6c38c16 100644 --- a/include/linux/poison.h +++ b/include/linux/poison.h @@ -32,6 +32,10 @@ /********** mm/debug-pagealloc.c **********/ #define PAGE_POISON 0xaa +/********** mm/page_alloc.c ************/ + +#define TAIL_MAPPING ((void *) 0x01014A11 + POISON_POINTER_DELTA) + /********** mm/slab.c **********/ /* * Magic nums for obj red zoning. -- cgit v1.2.3 From 685eaade56c66c806dbe8102f12e2926cf4ec870 Mon Sep 17 00:00:00 2001 From: "Kirill A. Shutemov" Date: Fri, 15 Jan 2016 16:52:10 -0800 Subject: page-flags: drop __TestClearPage*() helpers Nobody uses them. Signed-off-by: Kirill A.
Shutemov Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/page-flags.h | 10 +--------- 1 file changed, 1 insertion(+), 9 deletions(-) (limited to 'include/linux') diff --git a/include/linux/page-flags.h b/include/linux/page-flags.h index 190f1915a097..7bc7fd9c4c5c 100644 --- a/include/linux/page-flags.h +++ b/include/linux/page-flags.h @@ -211,10 +211,6 @@ static inline int TestSetPage##uname(struct page *page) \ static inline int TestClearPage##uname(struct page *page) \ { return test_and_clear_bit(PG_##lname, &policy(page, 1)->flags); } -#define __TESTCLEARFLAG(uname, lname, policy) \ -static inline int __TestClearPage##uname(struct page *page) \ - { return __test_and_clear_bit(PG_##lname, &policy(page, 1)->flags); } - #define PAGEFLAG(uname, lname, policy) \ TESTPAGEFLAG(uname, lname, policy) \ SETPAGEFLAG(uname, lname, policy) \ @@ -247,9 +243,6 @@ static inline int TestSetPage##uname(struct page *page) { return 0; } #define TESTCLEARFLAG_FALSE(uname) \ static inline int TestClearPage##uname(struct page *page) { return 0; } -#define __TESTCLEARFLAG_FALSE(uname) \ -static inline int __TestClearPage##uname(struct page *page) { return 0; } - #define PAGEFLAG_FALSE(uname) TESTPAGEFLAG_FALSE(uname) \ SETPAGEFLAG_NOOP(uname) CLEARPAGEFLAG_NOOP(uname) @@ -331,10 +324,9 @@ PAGEFLAG(Unevictable, unevictable, PF_HEAD) PAGEFLAG(Mlocked, mlocked, PF_NO_TAIL) __CLEARPAGEFLAG(Mlocked, mlocked, PF_NO_TAIL) TESTSCFLAG(Mlocked, mlocked, PF_NO_TAIL) - __TESTCLEARFLAG(Mlocked, mlocked, PF_NO_TAIL) #else PAGEFLAG_FALSE(Mlocked) __CLEARPAGEFLAG_NOOP(Mlocked) - TESTSCFLAG_FALSE(Mlocked) __TESTCLEARFLAG_FALSE(Mlocked) + TESTSCFLAG_FALSE(Mlocked) #endif #ifdef CONFIG_ARCH_USES_PG_UNCACHED -- cgit v1.2.3 From d281ee6145183594788ab6d5b55f8d144e69eace Mon Sep 17 00:00:00 2001 From: "Kirill A. Shutemov" Date: Fri, 15 Jan 2016 16:52:16 -0800 Subject: rmap: add argument to charge compound page We're going to allow mapping of individual 4k pages of THP compound page. It means we cannot rely on PageTransHuge() check to decide if map/unmap small page or THP. The patch adds new argument to rmap functions to indicate whether we want to operate on whole compound page or only the small page. [n-horiguchi@ah.jp.nec.com: fix mapcount mismatch in hugepage migration] Signed-off-by: Kirill A. 
Shutemov Tested-by: Sasha Levin Tested-by: Aneesh Kumar K.V Acked-by: Vlastimil Babka Acked-by: Jerome Marchand Cc: Andrea Arcangeli Cc: Hugh Dickins Cc: Dave Hansen Cc: Mel Gorman Cc: Rik van Riel Cc: Steve Capper Cc: Johannes Weiner Cc: Michal Hocko Cc: Christoph Lameter Cc: David Rientjes Signed-off-by: Naoya Horiguchi Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/rmap.h | 12 +++++++++--- 1 file changed, 9 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/rmap.h b/include/linux/rmap.h index 29446aeef36e..038b6e704d9b 100644 --- a/include/linux/rmap.h +++ b/include/linux/rmap.h @@ -161,16 +161,22 @@ static inline void anon_vma_merge(struct vm_area_struct *vma, struct anon_vma *page_get_anon_vma(struct page *page); +/* bitflags for do_page_add_anon_rmap() */ +#define RMAP_EXCLUSIVE 0x01 +#define RMAP_COMPOUND 0x02 + /* * rmap interfaces called when adding or removing pte of page */ void page_move_anon_rmap(struct page *, struct vm_area_struct *, unsigned long); -void page_add_anon_rmap(struct page *, struct vm_area_struct *, unsigned long); +void page_add_anon_rmap(struct page *, struct vm_area_struct *, + unsigned long, bool); void do_page_add_anon_rmap(struct page *, struct vm_area_struct *, unsigned long, int); -void page_add_new_anon_rmap(struct page *, struct vm_area_struct *, unsigned long); +void page_add_new_anon_rmap(struct page *, struct vm_area_struct *, + unsigned long, bool); void page_add_file_rmap(struct page *); -void page_remove_rmap(struct page *); +void page_remove_rmap(struct page *, bool); void hugepage_add_anon_rmap(struct page *, struct vm_area_struct *, unsigned long); -- cgit v1.2.3 From f627c2f53786b0445abca47f6aa84c96a1fffec2 Mon Sep 17 00:00:00 2001 From: "Kirill A. Shutemov" Date: Fri, 15 Jan 2016 16:52:20 -0800 Subject: memcg: adjust to support new THP refcounting As with rmap, with new refcounting we cannot rely on PageTransHuge() to check if we need to charge the size of the huge page to the cgroup. We need to get information from the caller to know whether it was mapped with PMD or PTE. We do uncharge when the last reference on the page is gone. At that point, if we see PageTransHuge() it means we need to uncharge the whole huge page. The tricky part is partial unmap -- when we try to unmap part of the huge page. We don't do any special handling of this situation, meaning we don't uncharge the part of the huge page unless the last user is gone or split_huge_page() is triggered. If cgroup memory pressure happens, the partially unmapped page will be split through the shrinker. This should be good enough. Signed-off-by: Kirill A.
Shutemov Tested-by: Sasha Levin Tested-by: Aneesh Kumar K.V Acked-by: Vlastimil Babka Acked-by: Jerome Marchand Cc: Andrea Arcangeli Cc: Hugh Dickins Cc: Dave Hansen Cc: Mel Gorman Cc: Rik van Riel Cc: Naoya Horiguchi Cc: Steve Capper Cc: Johannes Weiner Cc: Michal Hocko Cc: Christoph Lameter Cc: David Rientjes Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/memcontrol.h | 16 ++++++++++------ 1 file changed, 10 insertions(+), 6 deletions(-) (limited to 'include/linux') diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h index 2292468f2a30..189f04d4d2ec 100644 --- a/include/linux/memcontrol.h +++ b/include/linux/memcontrol.h @@ -280,10 +280,12 @@ static inline void mem_cgroup_events(struct mem_cgroup *memcg, bool mem_cgroup_low(struct mem_cgroup *root, struct mem_cgroup *memcg); int mem_cgroup_try_charge(struct page *page, struct mm_struct *mm, - gfp_t gfp_mask, struct mem_cgroup **memcgp); + gfp_t gfp_mask, struct mem_cgroup **memcgp, + bool compound); void mem_cgroup_commit_charge(struct page *page, struct mem_cgroup *memcg, - bool lrucare); -void mem_cgroup_cancel_charge(struct page *page, struct mem_cgroup *memcg); + bool lrucare, bool compound); +void mem_cgroup_cancel_charge(struct page *page, struct mem_cgroup *memcg, + bool compound); void mem_cgroup_uncharge(struct page *page); void mem_cgroup_uncharge_list(struct list_head *page_list); @@ -515,7 +517,8 @@ static inline bool mem_cgroup_low(struct mem_cgroup *root, static inline int mem_cgroup_try_charge(struct page *page, struct mm_struct *mm, gfp_t gfp_mask, - struct mem_cgroup **memcgp) + struct mem_cgroup **memcgp, + bool compound) { *memcgp = NULL; return 0; @@ -523,12 +526,13 @@ static inline int mem_cgroup_try_charge(struct page *page, struct mm_struct *mm, static inline void mem_cgroup_commit_charge(struct page *page, struct mem_cgroup *memcg, - bool lrucare) + bool lrucare, bool compound) { } static inline void mem_cgroup_cancel_charge(struct page *page, - struct mem_cgroup *memcg) + struct mem_cgroup *memcg, + bool compound) { } -- cgit v1.2.3 From 1f25fe20a76af0d960172fb104d4b13697cafa84 Mon Sep 17 00:00:00 2001 From: "Kirill A. Shutemov" Date: Fri, 15 Jan 2016 16:52:24 -0800 Subject: mm, thp: adjust conditions when we can reuse the page on WP fault With new refcounting we will be able map the same compound page with PTEs and PMDs. It requires adjustment to conditions when we can reuse the page on write-protection fault. For PTE fault we can't reuse the page if it's part of huge page. For PMD we can only reuse the page if nobody else maps the huge page or it's part. We can do it by checking page_mapcount() on each sub-page, but it's expensive. The cheaper way is to check page_count() to be equal 1: every mapcount takes page reference, so this way we can guarantee, that the PMD is the only mapping. This approach can give false negative if somebody pinned the page, but that doesn't affect correctness. Signed-off-by: Kirill A. 
Shutemov Tested-by: Sasha Levin Tested-by: Aneesh Kumar K.V Acked-by: Jerome Marchand Acked-by: Vlastimil Babka Cc: Andrea Arcangeli Cc: Hugh Dickins Cc: Dave Hansen Cc: Mel Gorman Cc: Rik van Riel Cc: Naoya Horiguchi Cc: Steve Capper Cc: Johannes Weiner Cc: Michal Hocko Cc: Christoph Lameter Cc: David Rientjes Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/swap.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/swap.h b/include/linux/swap.h index 066bd21765ad..a282933c5bc6 100644 --- a/include/linux/swap.h +++ b/include/linux/swap.h @@ -538,7 +538,8 @@ static inline int swp_swapcount(swp_entry_t entry) return 0; } -#define reuse_swap_page(page) (page_mapcount(page) == 1) +#define reuse_swap_page(page) \ + (!PageTransCompound(page) && page_mapcount(page) == 1) static inline int try_to_free_swap(struct page *page) { -- cgit v1.2.3 From 78ddc53473419073ffb2e91178001e87bc513524 Mon Sep 17 00:00:00 2001 From: "Kirill A. Shutemov" Date: Fri, 15 Jan 2016 16:52:42 -0800 Subject: thp: rename split_huge_page_pmd() to split_huge_pmd() We are going to decouple splitting THP PMD from splitting underlying compound page. This patch renames split_huge_page_pmd*() functions to split_huge_pmd*() to reflect the fact that it doesn't imply page splitting, only PMD. Signed-off-by: Kirill A. Shutemov Tested-by: Sasha Levin Tested-by: Aneesh Kumar K.V Acked-by: Jerome Marchand Acked-by: Vlastimil Babka Cc: Andrea Arcangeli Cc: Hugh Dickins Cc: Dave Hansen Cc: Mel Gorman Cc: Rik van Riel Cc: Naoya Horiguchi Cc: Steve Capper Cc: Johannes Weiner Cc: Michal Hocko Cc: Christoph Lameter Cc: David Rientjes Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/huge_mm.h | 8 ++------ 1 file changed, 2 insertions(+), 6 deletions(-) (limited to 'include/linux') diff --git a/include/linux/huge_mm.h b/include/linux/huge_mm.h index ecb080d6ff42..805c7ae42280 100644 --- a/include/linux/huge_mm.h +++ b/include/linux/huge_mm.h @@ -102,7 +102,7 @@ static inline int split_huge_page(struct page *page) } extern void __split_huge_page_pmd(struct vm_area_struct *vma, unsigned long address, pmd_t *pmd); -#define split_huge_page_pmd(__vma, __address, __pmd) \ +#define split_huge_pmd(__vma, __pmd, __address) \ do { \ pmd_t *____pmd = (__pmd); \ if (unlikely(pmd_trans_huge(*____pmd))) \ @@ -117,8 +117,6 @@ extern void __split_huge_page_pmd(struct vm_area_struct *vma, BUG_ON(pmd_trans_splitting(*____pmd) || \ pmd_trans_huge(*____pmd)); \ } while (0) -extern void split_huge_page_pmd_mm(struct mm_struct *mm, unsigned long address, - pmd_t *pmd); #if HPAGE_PMD_ORDER >= MAX_ORDER #error "hugepages can't be allocated by the buddy allocator" #endif @@ -183,11 +181,9 @@ static inline int split_huge_page(struct page *page) { return 0; } -#define split_huge_page_pmd(__vma, __address, __pmd) \ - do { } while (0) #define wait_split_huge_page(__anon_vma, __pmd) \ do { } while (0) -#define split_huge_page_pmd_mm(__mm, __address, __pmd) \ +#define split_huge_pmd(__vma, __pmd, __address) \ do { } while (0) static inline int hugepage_madvise(struct vm_area_struct *vma, unsigned long *vm_flags, int advice) -- cgit v1.2.3 From 122afea9626ab3f717b250a8dd3d5ebf57cdb56c Mon Sep 17 00:00:00 2001 From: "Kirill A. Shutemov" Date: Fri, 15 Jan 2016 16:52:46 -0800 Subject: mm, vmstats: new THP splitting event The patch replaces THP_SPLIT with tree events: THP_SPLIT_PAGE, THP_SPLIT_PAGE_FAILED and THP_SPLIT_PMD. 
It reflects the fact that we are going to be able split PMD without the compound page and that split_huge_page() can fail. Signed-off-by: Kirill A. Shutemov Acked-by: Christoph Lameter Tested-by: Sasha Levin Tested-by: Aneesh Kumar K.V Acked-by: Jerome Marchand Acked-by: Vlastimil Babka Cc: Andrea Arcangeli Cc: Hugh Dickins Cc: Dave Hansen Cc: Mel Gorman Cc: Rik van Riel Cc: Naoya Horiguchi Cc: Steve Capper Cc: Johannes Weiner Cc: Michal Hocko Cc: David Rientjes Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/vm_event_item.h | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/vm_event_item.h b/include/linux/vm_event_item.h index e623d392db0c..e1f8c993e73b 100644 --- a/include/linux/vm_event_item.h +++ b/include/linux/vm_event_item.h @@ -68,7 +68,9 @@ enum vm_event_item { PGPGIN, PGPGOUT, PSWPIN, PSWPOUT, THP_FAULT_FALLBACK, THP_COLLAPSE_ALLOC, THP_COLLAPSE_ALLOC_FAILED, - THP_SPLIT, + THP_SPLIT_PAGE, + THP_SPLIT_PAGE_FAILED, + THP_SPLIT_PMD, THP_ZERO_PAGE_ALLOC, THP_ZERO_PAGE_ALLOC_FAILED, #endif -- cgit v1.2.3 From ad0bed24e98bcae9952c2d1f663ec7cb6344a387 Mon Sep 17 00:00:00 2001 From: "Kirill A. Shutemov" Date: Fri, 15 Jan 2016 16:52:53 -0800 Subject: thp: drop all split_huge_page()-related code We will re-introduce new version with new refcounting later in patchset. Signed-off-by: Kirill A. Shutemov Tested-by: Sasha Levin Tested-by: Aneesh Kumar K.V Acked-by: Jerome Marchand Cc: Vlastimil Babka Cc: Andrea Arcangeli Cc: Hugh Dickins Cc: Dave Hansen Cc: Mel Gorman Cc: Rik van Riel Cc: Naoya Horiguchi Cc: Steve Capper Cc: Johannes Weiner Cc: Michal Hocko Cc: Christoph Lameter Cc: David Rientjes Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/huge_mm.h | 28 ++++++---------------------- 1 file changed, 6 insertions(+), 22 deletions(-) (limited to 'include/linux') diff --git a/include/linux/huge_mm.h b/include/linux/huge_mm.h index 805c7ae42280..9df5802faadf 100644 --- a/include/linux/huge_mm.h +++ b/include/linux/huge_mm.h @@ -95,28 +95,12 @@ extern bool is_vma_temporary_stack(struct vm_area_struct *vma); #endif /* CONFIG_DEBUG_VM */ extern unsigned long transparent_hugepage_flags; -extern int split_huge_page_to_list(struct page *page, struct list_head *list); -static inline int split_huge_page(struct page *page) -{ - return split_huge_page_to_list(page, NULL); -} -extern void __split_huge_page_pmd(struct vm_area_struct *vma, - unsigned long address, pmd_t *pmd); -#define split_huge_pmd(__vma, __pmd, __address) \ - do { \ - pmd_t *____pmd = (__pmd); \ - if (unlikely(pmd_trans_huge(*____pmd))) \ - __split_huge_page_pmd(__vma, __address, \ - ____pmd); \ - } while (0) -#define wait_split_huge_page(__anon_vma, __pmd) \ - do { \ - pmd_t *____pmd = (__pmd); \ - anon_vma_lock_write(__anon_vma); \ - anon_vma_unlock_write(__anon_vma); \ - BUG_ON(pmd_trans_splitting(*____pmd) || \ - pmd_trans_huge(*____pmd)); \ - } while (0) + +#define split_huge_page_to_list(page, list) BUILD_BUG() +#define split_huge_page(page) BUILD_BUG() +#define split_huge_pmd(__vma, __pmd, __address) BUILD_BUG() + +#define wait_split_huge_page(__anon_vma, __pmd) BUILD_BUG() #if HPAGE_PMD_ORDER >= MAX_ORDER #error "hugepages can't be allocated by the buddy allocator" #endif -- cgit v1.2.3 From ddc58f27f9eee9117219936f77e90ad5b2e00e96 Mon Sep 17 00:00:00 2001 From: "Kirill A. 
Shutemov" Date: Fri, 15 Jan 2016 16:52:56 -0800 Subject: mm: drop tail page refcounting Tail page refcounting is utterly complicated and painful to support. It uses ->_mapcount on tail pages to store how many times this page is pinned. get_page() bumps ->_mapcount on tail page in addition to ->_count on head. This information is required by split_huge_page() to be able to distribute pins from head of compound page to tails during the split. We will need ->_mapcount to account PTE mappings of subpages of the compound page. We eliminate need in current meaning of ->_mapcount in tail pages by forbidding split entirely if the page is pinned. The only user of tail page refcounting is THP which is marked BROKEN for now. Let's drop all this mess. It makes get_page() and put_page() much simpler. Signed-off-by: Kirill A. Shutemov Tested-by: Sasha Levin Tested-by: Aneesh Kumar K.V Acked-by: Vlastimil Babka Acked-by: Jerome Marchand Cc: Andrea Arcangeli Cc: Hugh Dickins Cc: Dave Hansen Cc: Mel Gorman Cc: Rik van Riel Cc: Naoya Horiguchi Cc: Steve Capper Cc: Johannes Weiner Cc: Michal Hocko Cc: Christoph Lameter Cc: David Rientjes Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/mm.h | 47 ++++++++++------------------------------------- include/linux/mm_types.h | 17 +++-------------- 2 files changed, 13 insertions(+), 51 deletions(-) (limited to 'include/linux') diff --git a/include/linux/mm.h b/include/linux/mm.h index 839d9e9a1c38..34387351930c 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -466,44 +466,9 @@ static inline int page_count(struct page *page) return atomic_read(&compound_head(page)->_count); } -static inline bool __compound_tail_refcounted(struct page *page) -{ - return PageAnon(page) && !PageSlab(page) && !PageHeadHuge(page); -} - -/* - * This takes a head page as parameter and tells if the - * tail page reference counting can be skipped. - * - * For this to be safe, PageSlab and PageHeadHuge must remain true on - * any given page where they return true here, until all tail pins - * have been released. - */ -static inline bool compound_tail_refcounted(struct page *page) -{ - VM_BUG_ON_PAGE(!PageHead(page), page); - return __compound_tail_refcounted(page); -} - -static inline void get_huge_page_tail(struct page *page) -{ - /* - * __split_huge_page_refcount() cannot run from under us. - */ - VM_BUG_ON_PAGE(!PageTail(page), page); - VM_BUG_ON_PAGE(page_mapcount(page) < 0, page); - VM_BUG_ON_PAGE(atomic_read(&page->_count) != 0, page); - if (compound_tail_refcounted(compound_head(page))) - atomic_inc(&page->_mapcount); -} - -extern bool __get_page_tail(struct page *page); - static inline void get_page(struct page *page) { - if (unlikely(PageTail(page))) - if (likely(__get_page_tail(page))) - return; + page = compound_head(page); /* * Getting a normal page or the head of a compound page * requires to already have an elevated page->_count. 
@@ -528,7 +493,15 @@ static inline void init_page_count(struct page *page) atomic_set(&page->_count, 1); } -void put_page(struct page *page); +void __put_page(struct page *page); + +static inline void put_page(struct page *page) +{ + page = compound_head(page); + if (put_page_testzero(page)) + __put_page(page); +} + void put_pages_list(struct list_head *pages); void split_page(struct page *page, unsigned int order); diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h index 6bc9a0ce2253..faf6fe88d6b3 100644 --- a/include/linux/mm_types.h +++ b/include/linux/mm_types.h @@ -81,20 +81,9 @@ struct page { union { /* - * Count of ptes mapped in - * mms, to show when page is - * mapped & limit reverse map - * searches. - * - * Used also for tail pages - * refcounting instead of - * _count. Tail pages cannot - * be mapped and keeping the - * tail page _count zero at - * all times guarantees - * get_page_unless_zero() will - * never succeed on tail - * pages. + * Count of ptes mapped in mms, to show + * when page is mapped & limit reverse + * map searches. */ atomic_t _mapcount; -- cgit v1.2.3 From 3ac808fdd2b835547af81de75c813cf7ba2ab58f Mon Sep 17 00:00:00 2001 From: "Kirill A. Shutemov" Date: Fri, 15 Jan 2016 16:53:07 -0800 Subject: mm, thp: remove compound_lock() We are going to use migration entries to stabilize page counts. It means we don't need compound_lock() for that. Signed-off-by: Kirill A. Shutemov Tested-by: Sasha Levin Tested-by: Aneesh Kumar K.V Acked-by: Vlastimil Babka Acked-by: Jerome Marchand Cc: Andrea Arcangeli Cc: Hugh Dickins Cc: Dave Hansen Cc: Mel Gorman Cc: Rik van Riel Cc: Naoya Horiguchi Cc: Steve Capper Cc: Johannes Weiner Cc: Michal Hocko Cc: Christoph Lameter Cc: David Rientjes Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/mm.h | 35 ----------------------------------- include/linux/page-flags.h | 12 +----------- 2 files changed, 1 insertion(+), 46 deletions(-) (limited to 'include/linux') diff --git a/include/linux/mm.h b/include/linux/mm.h index 34387351930c..70f59de2e288 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -410,41 +410,6 @@ static inline int is_vmalloc_or_module_addr(const void *x) extern void kvfree(const void *addr); -static inline void compound_lock(struct page *page) -{ -#ifdef CONFIG_TRANSPARENT_HUGEPAGE - VM_BUG_ON_PAGE(PageSlab(page), page); - bit_spin_lock(PG_compound_lock, &page->flags); -#endif -} - -static inline void compound_unlock(struct page *page) -{ -#ifdef CONFIG_TRANSPARENT_HUGEPAGE - VM_BUG_ON_PAGE(PageSlab(page), page); - bit_spin_unlock(PG_compound_lock, &page->flags); -#endif -} - -static inline unsigned long compound_lock_irqsave(struct page *page) -{ - unsigned long uninitialized_var(flags); -#ifdef CONFIG_TRANSPARENT_HUGEPAGE - local_irq_save(flags); - compound_lock(page); -#endif - return flags; -} - -static inline void compound_unlock_irqrestore(struct page *page, - unsigned long flags) -{ -#ifdef CONFIG_TRANSPARENT_HUGEPAGE - compound_unlock(page); - local_irq_restore(flags); -#endif -} - /* * The atomic page->_mapcount, starts from -1: so that transitions * both from it and to it can be tracked, using atomic_inc_and_test diff --git a/include/linux/page-flags.h b/include/linux/page-flags.h index 7bc7fd9c4c5c..0c42acca0338 100644 --- a/include/linux/page-flags.h +++ b/include/linux/page-flags.h @@ -101,9 +101,6 @@ enum pageflags { #ifdef CONFIG_MEMORY_FAILURE PG_hwpoison, /* hardware poisoned page. 
Don't touch */ #endif -#ifdef CONFIG_TRANSPARENT_HUGEPAGE - PG_compound_lock, -#endif #if defined(CONFIG_IDLE_PAGE_TRACKING) && defined(CONFIG_64BIT) PG_young, PG_idle, @@ -613,12 +610,6 @@ static inline void ClearPageSlabPfmemalloc(struct page *page) #define __PG_MLOCKED 0 #endif -#ifdef CONFIG_TRANSPARENT_HUGEPAGE -#define __PG_COMPOUND_LOCK (1 << PG_compound_lock) -#else -#define __PG_COMPOUND_LOCK 0 -#endif - /* * Flags checked when a page is freed. Pages being freed should not have * these flags set. It they are, there is a problem. @@ -628,8 +619,7 @@ static inline void ClearPageSlabPfmemalloc(struct page *page) 1 << PG_private | 1 << PG_private_2 | \ 1 << PG_writeback | 1 << PG_reserved | \ 1 << PG_slab | 1 << PG_swapcache | 1 << PG_active | \ - 1 << PG_unevictable | __PG_MLOCKED | \ - __PG_COMPOUND_LOCK) + 1 << PG_unevictable | __PG_MLOCKED) /* * Flags checked when a page is prepped for return by the page allocator. -- cgit v1.2.3 From 4b471e8898c3d0f5c97a3c73ac32d0549fe01c87 Mon Sep 17 00:00:00 2001 From: "Kirill A. Shutemov" Date: Fri, 15 Jan 2016 16:53:39 -0800 Subject: mm, thp: remove infrastructure for handling splitting PMDs With new refcounting we don't need to mark PMDs splitting. Let's drop code to handle this. Signed-off-by: Kirill A. Shutemov Tested-by: Sasha Levin Tested-by: Aneesh Kumar K.V Acked-by: Vlastimil Babka Acked-by: Jerome Marchand Cc: Andrea Arcangeli Cc: Hugh Dickins Cc: Dave Hansen Cc: Mel Gorman Cc: Rik van Riel Cc: Naoya Horiguchi Cc: Steve Capper Cc: Johannes Weiner Cc: Michal Hocko Cc: Christoph Lameter Cc: David Rientjes Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/huge_mm.h | 21 ++++++--------------- 1 file changed, 6 insertions(+), 15 deletions(-) (limited to 'include/linux') diff --git a/include/linux/huge_mm.h b/include/linux/huge_mm.h index 9df5802faadf..333b058b1e3d 100644 --- a/include/linux/huge_mm.h +++ b/include/linux/huge_mm.h @@ -25,7 +25,7 @@ extern int zap_huge_pmd(struct mmu_gather *tlb, extern int mincore_huge_pmd(struct vm_area_struct *vma, pmd_t *pmd, unsigned long addr, unsigned long end, unsigned char *vec); -extern int move_huge_pmd(struct vm_area_struct *vma, +extern bool move_huge_pmd(struct vm_area_struct *vma, struct vm_area_struct *new_vma, unsigned long old_addr, unsigned long new_addr, unsigned long old_end, @@ -48,15 +48,9 @@ enum transparent_hugepage_flag { #endif }; -enum page_check_address_pmd_flag { - PAGE_CHECK_ADDRESS_PMD_FLAG, - PAGE_CHECK_ADDRESS_PMD_NOTSPLITTING_FLAG, - PAGE_CHECK_ADDRESS_PMD_SPLITTING_FLAG, -}; extern pmd_t *page_check_address_pmd(struct page *page, struct mm_struct *mm, unsigned long address, - enum page_check_address_pmd_flag flag, spinlock_t **ptl); #define HPAGE_PMD_ORDER (HPAGE_PMD_SHIFT-PAGE_SHIFT) @@ -100,7 +94,6 @@ extern unsigned long transparent_hugepage_flags; #define split_huge_page(page) BUILD_BUG() #define split_huge_pmd(__vma, __pmd, __address) BUILD_BUG() -#define wait_split_huge_page(__anon_vma, __pmd) BUILD_BUG() #if HPAGE_PMD_ORDER >= MAX_ORDER #error "hugepages can't be allocated by the buddy allocator" #endif @@ -110,17 +103,17 @@ extern void vma_adjust_trans_huge(struct vm_area_struct *vma, unsigned long start, unsigned long end, long adjust_next); -extern int __pmd_trans_huge_lock(pmd_t *pmd, struct vm_area_struct *vma, +extern bool __pmd_trans_huge_lock(pmd_t *pmd, struct vm_area_struct *vma, spinlock_t **ptl); /* mmap_sem must be held on entry */ -static inline int pmd_trans_huge_lock(pmd_t *pmd, struct vm_area_struct *vma, +static inline 
bool pmd_trans_huge_lock(pmd_t *pmd, struct vm_area_struct *vma, spinlock_t **ptl) { VM_BUG_ON_VMA(!rwsem_is_locked(&vma->vm_mm->mmap_sem), vma); if (pmd_trans_huge(*pmd)) return __pmd_trans_huge_lock(pmd, vma, ptl); else - return 0; + return false; } static inline int hpage_nr_pages(struct page *page) { @@ -165,8 +158,6 @@ static inline int split_huge_page(struct page *page) { return 0; } -#define wait_split_huge_page(__anon_vma, __pmd) \ - do { } while (0) #define split_huge_pmd(__vma, __pmd, __address) \ do { } while (0) static inline int hugepage_madvise(struct vm_area_struct *vma, @@ -181,10 +172,10 @@ static inline void vma_adjust_trans_huge(struct vm_area_struct *vma, long adjust_next) { } -static inline int pmd_trans_huge_lock(pmd_t *pmd, struct vm_area_struct *vma, +static inline bool pmd_trans_huge_lock(pmd_t *pmd, struct vm_area_struct *vma, spinlock_t **ptl) { - return 0; + return false; } static inline int do_huge_pmd_numa_page(struct mm_struct *mm, struct vm_area_struct *vma, -- cgit v1.2.3 From 53f9263baba69fc1630e3c780c4d11b72643f962 Mon Sep 17 00:00:00 2001 From: "Kirill A. Shutemov" Date: Fri, 15 Jan 2016 16:53:42 -0800 Subject: mm: rework mapcount accounting to enable 4k mapping of THPs We're going to allow mapping of individual 4k pages of a THP compound page. It means we need to track the mapcount on a per-small-page basis. The straightforward approach is to use ->_mapcount in all subpages to track how many times each subpage is mapped with PMDs or PTEs combined. But this is rather expensive: mapping or unmapping of a THP page with a PMD would require HPAGE_PMD_NR atomic operations instead of the single one we have now. The idea is to store separately how many times the page was mapped as a whole -- compound_mapcount. This frees up ->_mapcount in subpages to track the PTE mapcount. We use the same approach as with the compound page destructor and compound order to store compound_mapcount: use space in the first tail page, ->mapping this time. Any time we map/unmap a whole compound page (THP or hugetlb) we increment/decrement compound_mapcount. When we map part of a compound page with a PTE we operate on ->_mapcount of the subpage. page_mapcount() counts both: PTE and PMD mappings of the page. Basically, we have the mapcount for a subpage spread over two counters, which makes it tricky to detect when the last mapcount for a page goes away. We introduced PageDoubleMap() for this. When we split a THP PMD for the first time and there is another PMD mapping left, we offset ->_mapcount in all subpages up by one and set PG_double_map on the compound page. These additional references go away with the last compound_mapcount. This approach provides a way to detect when the last mapcount goes away on a per-small-page basis without introducing new overhead for the most common cases. [akpm@linux-foundation.org: fix typo in comment] [mhocko@suse.com: ignore partial THP when moving task] Signed-off-by: Kirill A.
Shutemov Tested-by: Aneesh Kumar K.V Acked-by: Jerome Marchand Cc: Sasha Levin Cc: Aneesh Kumar K.V Cc: Jerome Marchand Cc: Vlastimil Babka Cc: Andrea Arcangeli Cc: Hugh Dickins Cc: Dave Hansen Cc: Mel Gorman Cc: Rik van Riel Cc: Naoya Horiguchi Cc: Steve Capper Cc: Johannes Weiner Cc: Christoph Lameter Cc: David Rientjes Signed-off-by: Michal Hocko Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/mm.h | 26 ++++++++++++++++++++++++-- include/linux/mm_types.h | 1 + include/linux/page-flags.h | 36 ++++++++++++++++++++++++++++++++++++ include/linux/rmap.h | 4 ++-- 4 files changed, 63 insertions(+), 4 deletions(-) (limited to 'include/linux') diff --git a/include/linux/mm.h b/include/linux/mm.h index 70f59de2e288..67e0fab225e8 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -410,6 +410,19 @@ static inline int is_vmalloc_or_module_addr(const void *x) extern void kvfree(const void *addr); +static inline atomic_t *compound_mapcount_ptr(struct page *page) +{ + return &page[1].compound_mapcount; +} + +static inline int compound_mapcount(struct page *page) +{ + if (!PageCompound(page)) + return 0; + page = compound_head(page); + return atomic_read(compound_mapcount_ptr(page)) + 1; +} + /* * The atomic page->_mapcount, starts from -1: so that transitions * both from it and to it can be tracked, using atomic_inc_and_test @@ -422,8 +435,17 @@ static inline void page_mapcount_reset(struct page *page) static inline int page_mapcount(struct page *page) { + int ret; VM_BUG_ON_PAGE(PageSlab(page), page); - return atomic_read(&page->_mapcount) + 1; + + ret = atomic_read(&page->_mapcount) + 1; + if (PageCompound(page)) { + page = compound_head(page); + ret += atomic_read(compound_mapcount_ptr(page)) + 1; + if (PageDoubleMap(page)) + ret--; + } + return ret; } static inline int page_count(struct page *page) @@ -934,7 +956,7 @@ static inline pgoff_t page_file_index(struct page *page) */ static inline int page_mapped(struct page *page) { - return atomic_read(&(page)->_mapcount) >= 0; + return atomic_read(&(page)->_mapcount) + compound_mapcount(page) >= 0; } /* diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h index faf6fe88d6b3..809defe0597d 100644 --- a/include/linux/mm_types.h +++ b/include/linux/mm_types.h @@ -54,6 +54,7 @@ struct page { * see PAGE_MAPPING_ANON below. */ void *s_mem; /* slab first object */ + atomic_t compound_mapcount; /* first tail page */ }; /* Second double word */ diff --git a/include/linux/page-flags.h b/include/linux/page-flags.h index 0c42acca0338..19724e6ebd26 100644 --- a/include/linux/page-flags.h +++ b/include/linux/page-flags.h @@ -126,6 +126,9 @@ enum pageflags { /* SLOB */ PG_slob_free = PG_private, + + /* Compound pages. Stored in first tail page's flags */ + PG_double_map = PG_private_2, }; #ifndef __GENERATING_BOUNDS_H @@ -523,10 +526,43 @@ static inline int PageTransTail(struct page *page) return PageTail(page); } +/* + * PageDoubleMap indicates that the compound page is mapped with PTEs as well + * as PMDs. + * + * This is required for optimization of rmap operations for THP: we can postpone + * per small page mapcount accounting (and its overhead from atomic operations) + * until the first PMD split. + * + * For the page PageDoubleMap means ->_mapcount in all sub-pages is offset up + * by one. This reference will go away with last compound_mapcount. + * + * See also __split_huge_pmd_locked() and page_remove_anon_compound_rmap(). 
+ */ +static inline int PageDoubleMap(struct page *page) +{ + return PageHead(page) && test_bit(PG_double_map, &page[1].flags); +} + +static inline int TestSetPageDoubleMap(struct page *page) +{ + VM_BUG_ON_PAGE(!PageHead(page), page); + return test_and_set_bit(PG_double_map, &page[1].flags); +} + +static inline int TestClearPageDoubleMap(struct page *page) +{ + VM_BUG_ON_PAGE(!PageHead(page), page); + return test_and_clear_bit(PG_double_map, &page[1].flags); +} + #else TESTPAGEFLAG_FALSE(TransHuge) TESTPAGEFLAG_FALSE(TransCompound) TESTPAGEFLAG_FALSE(TransTail) +TESTPAGEFLAG_FALSE(DoubleMap) + TESTSETFLAG_FALSE(DoubleMap) + TESTCLEARFLAG_FALSE(DoubleMap) #endif /* diff --git a/include/linux/rmap.h b/include/linux/rmap.h index 038b6e704d9b..ebf3750e42b2 100644 --- a/include/linux/rmap.h +++ b/include/linux/rmap.h @@ -183,9 +183,9 @@ void hugepage_add_anon_rmap(struct page *, struct vm_area_struct *, void hugepage_add_new_anon_rmap(struct page *, struct vm_area_struct *, unsigned long); -static inline void page_dup_rmap(struct page *page) +static inline void page_dup_rmap(struct page *page, bool compound) { - atomic_inc(&page->_mapcount); + atomic_inc(compound ? compound_mapcount_ptr(page) : &page->_mapcount); } /* -- cgit v1.2.3 From e1534ae95004d6a307839a44eed40389d608c935 Mon Sep 17 00:00:00 2001 From: "Kirill A. Shutemov" Date: Fri, 15 Jan 2016 16:53:46 -0800 Subject: mm: differentiate page_mapped() from page_mapcount() for compound pages Let's define page_mapped() to be true for compound pages if any sub-pages of the compound page is mapped (with PMD or PTE). On other hand page_mapcount() return mapcount for this particular small page. This will make cases like page_get_anon_vma() behave correctly once we allow huge pages to be mapped with PTE. Most users outside core-mm should use page_mapcount() instead of page_mapped(). Signed-off-by: Kirill A. Shutemov Tested-by: Sasha Levin Tested-by: Aneesh Kumar K.V Acked-by: Jerome Marchand Cc: Vlastimil Babka Cc: Andrea Arcangeli Cc: Hugh Dickins Cc: Dave Hansen Cc: Mel Gorman Cc: Rik van Riel Cc: Naoya Horiguchi Cc: Steve Capper Cc: Johannes Weiner Cc: Michal Hocko Cc: Christoph Lameter Cc: David Rientjes Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/mm.h | 15 +++++++++++++-- 1 file changed, 13 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/mm.h b/include/linux/mm.h index 67e0fab225e8..6b56cfd9fd09 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -953,10 +953,21 @@ static inline pgoff_t page_file_index(struct page *page) /* * Return true if this page is mapped into pagetables. + * For compound page it returns true if any subpage of compound page is mapped. */ -static inline int page_mapped(struct page *page) +static inline bool page_mapped(struct page *page) { - return atomic_read(&(page)->_mapcount) + compound_mapcount(page) >= 0; + int i; + if (likely(!PageCompound(page))) + return atomic_read(&page->_mapcount) >= 0; + page = compound_head(page); + if (atomic_read(compound_mapcount_ptr(page)) >= 0) + return true; + for (i = 0; i < hpage_nr_pages(page); i++) { + if (atomic_read(&page[i]._mapcount) >= 0) + return true; + } + return false; } /* -- cgit v1.2.3 From eef1b3ba053aa68967d294c80a50c4a26db30f52 Mon Sep 17 00:00:00 2001 From: "Kirill A. Shutemov" Date: Fri, 15 Jan 2016 16:53:53 -0800 Subject: thp: implement split_huge_pmd() Original split_huge_page() combined two operations: splitting PMDs into tables of PTEs and splitting underlying compound page. 
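For illustration, a hypothetical caller (not taken from this patch) that only needs PTE granularity can now do the first operation without the second:

/* Sketch: split only the PMD; the underlying compound page stays intact. */
static void pte_granularity_sketch(struct vm_area_struct *vma,
				   pmd_t *pmd, unsigned long addr)
{
	split_huge_pmd(vma, pmd, addr);	/* PMD -> table of PTEs mapping the THP */
	/* split_huge_page() would still be needed (and may fail) if the
	 * caller really wants independent 4k pages. */
}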
This patch implements split_huge_pmd() which split given PMD without splitting other PMDs this page mapped with or underlying compound page. Without tail page refcounting, implementation of split_huge_pmd() is pretty straight-forward. Signed-off-by: Kirill A. Shutemov Tested-by: Sasha Levin Tested-by: Aneesh Kumar K.V Acked-by: Jerome Marchand Cc: Vlastimil Babka Cc: Andrea Arcangeli Cc: Hugh Dickins Cc: Dave Hansen Cc: Mel Gorman Cc: Rik van Riel Cc: Naoya Horiguchi Cc: Steve Capper Cc: Johannes Weiner Cc: Michal Hocko Cc: Christoph Lameter Cc: David Rientjes Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/huge_mm.h | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/huge_mm.h b/include/linux/huge_mm.h index 333b058b1e3d..f1fa1c283be1 100644 --- a/include/linux/huge_mm.h +++ b/include/linux/huge_mm.h @@ -92,7 +92,16 @@ extern unsigned long transparent_hugepage_flags; #define split_huge_page_to_list(page, list) BUILD_BUG() #define split_huge_page(page) BUILD_BUG() -#define split_huge_pmd(__vma, __pmd, __address) BUILD_BUG() + +void __split_huge_pmd(struct vm_area_struct *vma, pmd_t *pmd, + unsigned long address); + +#define split_huge_pmd(__vma, __pmd, __address) \ + do { \ + pmd_t *____pmd = (__pmd); \ + if (pmd_trans_huge(*____pmd)) \ + __split_huge_pmd(__vma, __pmd, __address); \ + } while (0) #if HPAGE_PMD_ORDER >= MAX_ORDER #error "hugepages can't be allocated by the buddy allocator" -- cgit v1.2.3 From 4e41a30c6d506c884d3da9aeb316352e70679d4b Mon Sep 17 00:00:00 2001 From: Naoya Horiguchi Date: Fri, 15 Jan 2016 16:54:07 -0800 Subject: mm: hwpoison: adjust for new thp refcounting Some mm-related BUG_ON()s could trigger from hwpoison code due to recent changes in thp refcounting rule. This patch fixes them up. In the new refcounting, we no longer use tail->_mapcount to keep tail's refcount, and thereby we can simplify get/put_hwpoison_page(). And another change is that tail's refcount is not transferred to the raw page during thp split (more precisely, in new rule we don't take refcount on tail page any more.) So when we need thp split, we have to transfer the refcount properly to the 4kB soft-offlined page before migration. thp split code goes into core code only when precheck (total_mapcount(head) == page_count(head) - 1) passes to avoid useless split, where we assume that one refcount is held by the caller of thp split and the others are taken via mapping. To meet this assumption, this patch moves thp split part in soft_offline_page() after get_any_page(). [akpm@linux-foundation.org: remove unneeded #define, per Kirill] Signed-off-by: Naoya Horiguchi Acked-by: Kirill A. 
Shutemov Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/mm.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/mm.h b/include/linux/mm.h index 6b56cfd9fd09..e4397f640e86 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -2217,7 +2217,7 @@ extern int memory_failure(unsigned long pfn, int trapno, int flags); extern void memory_failure_queue(unsigned long pfn, int trapno, int flags); extern int unpoison_memory(unsigned long pfn); extern int get_hwpoison_page(struct page *page); -extern void put_hwpoison_page(struct page *page); +#define put_hwpoison_page(page) put_page(page) extern int sysctl_memory_failure_early_kill; extern int sysctl_memory_failure_recovery; extern void shake_page(struct page *p, int access); -- cgit v1.2.3 From e9b61f19858a5d6c42ce2298cf138279375d0d9b Mon Sep 17 00:00:00 2001 From: "Kirill A. Shutemov" Date: Fri, 15 Jan 2016 16:54:10 -0800 Subject: thp: reintroduce split_huge_page() This patch adds an implementation of split_huge_page() for the new refcounting. Unlike the previous implementation, the new split_huge_page() can fail if somebody holds a GUP pin on the page. It also means that a pin on a page prevents it from being split under you. It makes the situation in many places much cleaner. The basic scheme of split_huge_page(): - Check that the sum of mapcounts of all subpages is equal to page_count() plus one (the caller's pin). Fail with -EBUSY otherwise. This way we can avoid useless PMD-splits. - Freeze the page counters by splitting all PMDs and setting up migration PTEs. - Re-check the sum of mapcounts against page_count(). The page's counts are stable now. Return -EBUSY if the page is pinned. - Split the compound page. - Unfreeze the page by removing migration entries. Signed-off-by: Kirill A.
Shutemov Tested-by: Sasha Levin Tested-by: Aneesh Kumar K.V Acked-by: Jerome Marchand Cc: Vlastimil Babka Cc: Andrea Arcangeli Cc: Hugh Dickins Cc: Dave Hansen Cc: Mel Gorman Cc: Rik van Riel Cc: Naoya Horiguchi Cc: Steve Capper Cc: Johannes Weiner Cc: Michal Hocko Cc: Christoph Lameter Cc: David Rientjes Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/huge_mm.h | 7 +++++-- include/linux/pagemap.h | 13 ++++++++++++- 2 files changed, 17 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/huge_mm.h b/include/linux/huge_mm.h index f1fa1c283be1..90e11e6a37ab 100644 --- a/include/linux/huge_mm.h +++ b/include/linux/huge_mm.h @@ -90,8 +90,11 @@ extern bool is_vma_temporary_stack(struct vm_area_struct *vma); extern unsigned long transparent_hugepage_flags; -#define split_huge_page_to_list(page, list) BUILD_BUG() -#define split_huge_page(page) BUILD_BUG() +int split_huge_page_to_list(struct page *page, struct list_head *list); +static inline int split_huge_page(struct page *page) +{ + return split_huge_page_to_list(page, NULL); +} void __split_huge_pmd(struct vm_area_struct *vma, pmd_t *pmd, unsigned long address); diff --git a/include/linux/pagemap.h b/include/linux/pagemap.h index df214a4b886d..4d08b6c33557 100644 --- a/include/linux/pagemap.h +++ b/include/linux/pagemap.h @@ -394,10 +394,21 @@ static inline struct page *read_mapping_page(struct address_space *mapping, */ static inline pgoff_t page_to_pgoff(struct page *page) { + pgoff_t pgoff; + if (unlikely(PageHeadHuge(page))) return page->index << compound_order(page); - else + + if (likely(!PageTransTail(page))) return page->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT); + + /* + * We don't initialize ->index for tail pages: calculate based on + * head page + */ + pgoff = compound_head(page)->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT); + pgoff += page - compound_head(page); + return pgoff; } /* -- cgit v1.2.3 From 9a982250f773cc8c76f1eee68a770b7cbf2faf78 Mon Sep 17 00:00:00 2001 From: "Kirill A. Shutemov" Date: Fri, 15 Jan 2016 16:54:17 -0800 Subject: thp: introduce deferred_split_huge_page() Currently we don't split a huge page on partial unmap. That's not ideal: it can lead to memory overhead. Fortunately, we can detect partial unmap in page_remove_rmap(). But we cannot call split_huge_page() from there due to locking context. It's also counterproductive to do it directly from the munmap() codepath: in many cases we will hit this from exit(2) and splitting the huge page just to free it up in small pages is not what we really want. The patch introduces deferred_split_huge_page(), which puts the huge page into a queue for splitting. The splitting itself will happen when we get memory pressure via the shrinker interface. The page will be dropped from the list on freeing, through the compound page destructor. Signed-off-by: Kirill A.
Shutemov Tested-by: Sasha Levin Tested-by: Aneesh Kumar K.V Acked-by: Vlastimil Babka Acked-by: Jerome Marchand Cc: Andrea Arcangeli Cc: Hugh Dickins Cc: Dave Hansen Cc: Mel Gorman Cc: Rik van Riel Cc: Naoya Horiguchi Cc: Steve Capper Cc: Johannes Weiner Cc: Michal Hocko Cc: Christoph Lameter Cc: David Rientjes Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/huge_mm.h | 5 +++++ include/linux/mm.h | 5 +++++ include/linux/mm_types.h | 2 ++ 3 files changed, 12 insertions(+) (limited to 'include/linux') diff --git a/include/linux/huge_mm.h b/include/linux/huge_mm.h index 90e11e6a37ab..7aec5ee9cfdf 100644 --- a/include/linux/huge_mm.h +++ b/include/linux/huge_mm.h @@ -90,11 +90,15 @@ extern bool is_vma_temporary_stack(struct vm_area_struct *vma); extern unsigned long transparent_hugepage_flags; +extern void prep_transhuge_page(struct page *page); +extern void free_transhuge_page(struct page *page); + int split_huge_page_to_list(struct page *page, struct list_head *list); static inline int split_huge_page(struct page *page) { return split_huge_page_to_list(page, NULL); } +void deferred_split_huge_page(struct page *page); void __split_huge_pmd(struct vm_area_struct *vma, pmd_t *pmd, unsigned long address); @@ -170,6 +174,7 @@ static inline int split_huge_page(struct page *page) { return 0; } +static inline void deferred_split_huge_page(struct page *page) {} #define split_huge_pmd(__vma, __pmd, __address) \ do { } while (0) static inline int hugepage_madvise(struct vm_area_struct *vma, diff --git a/include/linux/mm.h b/include/linux/mm.h index e4397f640e86..aa8ae8330a75 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -507,6 +507,9 @@ enum compound_dtor_id { COMPOUND_PAGE_DTOR, #ifdef CONFIG_HUGETLB_PAGE HUGETLB_PAGE_DTOR, +#endif +#ifdef CONFIG_TRANSPARENT_HUGEPAGE + TRANSHUGE_PAGE_DTOR, #endif NR_COMPOUND_DTORS, }; @@ -537,6 +540,8 @@ static inline void set_compound_order(struct page *page, unsigned int order) page[1].compound_order = order; } +void free_compound_page(struct page *page); + #ifdef CONFIG_MMU /* * Do pte_mkwrite, but only if the vma says VM_WRITE. We do this when diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h index 809defe0597d..2dd9c313a8c0 100644 --- a/include/linux/mm_types.h +++ b/include/linux/mm_types.h @@ -55,6 +55,7 @@ struct page { */ void *s_mem; /* slab first object */ atomic_t compound_mapcount; /* first tail page */ + /* page_deferred_list().next -- second tail page */ }; /* Second double word */ @@ -62,6 +63,7 @@ struct page { union { pgoff_t index; /* Our offset within mapping. */ void *freelist; /* sl[aou]b first free object */ + /* page_deferred_list().prev -- second tail page */ }; union { -- cgit v1.2.3 From b20ce5e03b936be077463015661dcf52be274e5b Mon Sep 17 00:00:00 2001 From: "Kirill A. Shutemov" Date: Fri, 15 Jan 2016 16:54:37 -0800 Subject: mm: prepare page_referenced() and page_idle to new THP refcounting Both page_referenced() and page_idle_clear_pte_refs_one() assume that a THP can only be mapped with a PMD, so there's no reason to look at PTEs for PageTransHuge() pages. That's not true anymore: a THP can be mapped with PTEs too. The patch removes the PageTransHuge() test from the functions and open-codes the page table check. [akpm@linux-foundation.org: coding-style fixes] Signed-off-by: Kirill A.
Shutemov Cc: Vladimir Davydov Cc: Andrea Arcangeli Cc: Hugh Dickins Cc: Naoya Horiguchi Cc: Sasha Levin Cc: Minchan Kim Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/huge_mm.h | 5 ----- include/linux/mm.h | 23 ++++++++++++++--------- 2 files changed, 14 insertions(+), 14 deletions(-) (limited to 'include/linux') diff --git a/include/linux/huge_mm.h b/include/linux/huge_mm.h index 7aec5ee9cfdf..72cd942edb22 100644 --- a/include/linux/huge_mm.h +++ b/include/linux/huge_mm.h @@ -48,11 +48,6 @@ enum transparent_hugepage_flag { #endif }; -extern pmd_t *page_check_address_pmd(struct page *page, - struct mm_struct *mm, - unsigned long address, - spinlock_t **ptl); - #define HPAGE_PMD_ORDER (HPAGE_PMD_SHIFT-PAGE_SHIFT) #define HPAGE_PMD_NR (1<_mapcount, -1); } +int __page_mapcount(struct page *page); + static inline int page_mapcount(struct page *page) { - int ret; VM_BUG_ON_PAGE(PageSlab(page), page); - ret = atomic_read(&page->_mapcount) + 1; - if (PageCompound(page)) { - page = compound_head(page); - ret += atomic_read(compound_mapcount_ptr(page)) + 1; - if (PageDoubleMap(page)) - ret--; - } - return ret; + if (unlikely(PageCompound(page))) + return __page_mapcount(page); + return atomic_read(&page->_mapcount) + 1; +} + +#ifdef CONFIG_TRANSPARENT_HUGEPAGE +int total_mapcount(struct page *page); +#else +static inline int total_mapcount(struct page *page) +{ + return page_mapcount(page); } +#endif static inline int page_count(struct page *page) { -- cgit v1.2.3 From 8749cfea11f3fffe8f7cad891470a77b36e0185f Mon Sep 17 00:00:00 2001 From: Vladimir Davydov Date: Fri, 15 Jan 2016 16:54:45 -0800 Subject: mm: add page_check_address_transhuge() helper page_referenced_one() and page_idle_clear_pte_refs_one() duplicate the code for looking up pte of a (possibly transhuge) page. Move this code to a new helper function, page_check_address_transhuge(), and make the above mentioned functions use it. This is just a cleanup, no functional changes are intended. Signed-off-by: Vladimir Davydov Reviewed-by: Kirill A. Shutemov Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/rmap.h | 19 +++++++++++++++++++ 1 file changed, 19 insertions(+) (limited to 'include/linux') diff --git a/include/linux/rmap.h b/include/linux/rmap.h index ebf3750e42b2..77d1ba57d495 100644 --- a/include/linux/rmap.h +++ b/include/linux/rmap.h @@ -215,6 +215,25 @@ static inline pte_t *page_check_address(struct page *page, struct mm_struct *mm, return ptep; } +/* + * Used by idle page tracking to check if a page was referenced via page + * tables. + */ +#ifdef CONFIG_TRANSPARENT_HUGEPAGE +bool page_check_address_transhuge(struct page *page, struct mm_struct *mm, + unsigned long address, pmd_t **pmdp, + pte_t **ptep, spinlock_t **ptlp); +#else +static inline bool page_check_address_transhuge(struct page *page, + struct mm_struct *mm, unsigned long address, + pmd_t **pmdp, pte_t **ptep, spinlock_t **ptlp) +{ + *ptep = page_check_address(page, mm, address, ptlp, 0); + *pmdp = NULL; + return !!*ptep; +} +#endif + /* * Used by swapoff to help locate where page is expected in vma. */ -- cgit v1.2.3 From 854e9ed09dedf0c19ac8640e91bcc74bc3f9e5c9 Mon Sep 17 00:00:00 2001 From: Minchan Kim Date: Fri, 15 Jan 2016 16:54:53 -0800 Subject: mm: support madvise(MADV_FREE) Linux doesn't have an ability to free pages lazy while other OS already have been supported that named by madvise(MADV_FREE). 
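For context, a minimal user-space sketch of the interface (hypothetical allocator code, not part of this patch):

#include <sys/mman.h>

/* Sketch: an allocator retires a free run of pages lazily instead of
 * unmapping it.  Until reclaim actually needs the memory, reusing the
 * range sees the old contents and pays no fault; under pressure the
 * kernel may discard any page of the range that is still clean. */
static void arena_retire(void *addr, size_t len)
{
	madvise(addr, len, MADV_FREE);
}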
The gain is clear that kernel can discard freed pages rather than swapping out or OOM if memory pressure happens. Without memory pressure, freed pages would be reused by userspace without another additional overhead(ex, page fault + allocation + zeroing). Jason Evans said: : Facebook has been using MAP_UNINITIALIZED : (https://lkml.org/lkml/2012/1/18/308) in some of its applications for : several years, but there are operational costs to maintaining this : out-of-tree in our kernel and in jemalloc, and we are anxious to retire it : in favor of MADV_FREE. When we first enabled MAP_UNINITIALIZED it : increased throughput for much of our workload by ~5%, and although the : benefit has decreased using newer hardware and kernels, there is still : enough benefit that we cannot reasonably retire it without a replacement. : : Aside from Facebook operations, there are numerous broadly used : applications that would benefit from MADV_FREE. The ones that immediately : come to mind are redis, varnish, and MariaDB. I don't have much insight : into Android internals and development process, but I would hope to see : MADV_FREE support eventually end up there as well to benefit applications : linked with the integrated jemalloc. : : jemalloc will use MADV_FREE once it becomes available in the Linux kernel. : In fact, jemalloc already uses MADV_FREE or equivalent everywhere it's : available: *BSD, OS X, Windows, and Solaris -- every platform except Linux : (and AIX, but I'm not sure it even compiles on AIX). The lack of : MADV_FREE on Linux forced me down a long series of increasingly : sophisticated heuristics for madvise() volume reduction, and even so this : remains a common performance issue for people using jemalloc on Linux. : Please integrate MADV_FREE; many people will benefit substantially. How it works: When madvise syscall is called, VM clears dirty bit of ptes of the range. If memory pressure happens, VM checks dirty bit of page table and if it found still "clean", it means it's a "lazyfree pages" so VM could discard the page instead of swapping out. Once there was store operation for the page before VM peek a page to reclaim, dirty bit is set so VM can swap out the page instead of discarding. One thing we should notice is that basically, MADV_FREE relies on dirty bit in page table entry to decide whether VM allows to discard the page or not. IOW, if page table entry includes marked dirty bit, VM shouldn't discard the page. However, as a example, if swap-in by read fault happens, page table entry doesn't have dirty bit so MADV_FREE could discard the page wrongly. For avoiding the problem, MADV_FREE did more checks with PageDirty and PageSwapCache. It worked out because swapped-in page lives on swap cache and since it is evicted from the swap cache, the page has PG_dirty flag. So both page flags check effectively prevent wrong discarding by MADV_FREE. However, a problem in above logic is that swapped-in page has PG_dirty still after they are removed from swap cache so VM cannot consider the page as freeable any more even if madvise_free is called in future. Look at below example for detail. ptr = malloc(); memset(ptr); .. .. .. heavy memory pressure so all of pages are swapped out .. .. var = *ptr; -> a page swapped-in and could be removed from swapcache. Then, page table doesn't mark dirty bit and page descriptor includes PG_dirty .. .. madvise_free(ptr); -> It doesn't clear PG_dirty of the page. .. .. .. .. heavy memory pressure again. .. 
In this time, VM cannot discard the page because the page .. has *PG_dirty* To solve the problem, this patch clears PG_dirty if only the page is owned exclusively by current process when madvise is called because PG_dirty represents ptes's dirtiness in several processes so we could clear it only if we own it exclusively. Firstly, heavy users would be general allocators(ex, jemalloc, tcmalloc and hope glibc supports it) and jemalloc/tcmalloc already have supported the feature for other OS(ex, FreeBSD) barrios@blaptop:~/benchmark/ebizzy$ lscpu Architecture: x86_64 CPU op-mode(s): 32-bit, 64-bit Byte Order: Little Endian CPU(s): 12 On-line CPU(s) list: 0-11 Thread(s) per core: 1 Core(s) per socket: 1 Socket(s): 12 NUMA node(s): 1 Vendor ID: GenuineIntel CPU family: 6 Model: 2 Stepping: 3 CPU MHz: 3200.185 BogoMIPS: 6400.53 Virtualization: VT-x Hypervisor vendor: KVM Virtualization type: full L1d cache: 32K L1i cache: 32K L2 cache: 4096K NUMA node0 CPU(s): 0-11 ebizzy benchmark(./ebizzy -S 10 -n 512) Higher avg is better. vanilla-jemalloc MADV_free-jemalloc 1 thread records: 10 records: 10 avg: 2961.90 avg: 12069.70 std: 71.96(2.43%) std: 186.68(1.55%) max: 3070.00 max: 12385.00 min: 2796.00 min: 11746.00 2 thread records: 10 records: 10 avg: 5020.00 avg: 17827.00 std: 264.87(5.28%) std: 358.52(2.01%) max: 5244.00 max: 18760.00 min: 4251.00 min: 17382.00 4 thread records: 10 records: 10 avg: 8988.80 avg: 27930.80 std: 1175.33(13.08%) std: 3317.33(11.88%) max: 9508.00 max: 30879.00 min: 5477.00 min: 21024.00 8 thread records: 10 records: 10 avg: 13036.50 avg: 33739.40 std: 170.67(1.31%) std: 5146.22(15.25%) max: 13371.00 max: 40572.00 min: 12785.00 min: 24088.00 16 thread records: 10 records: 10 avg: 11092.40 avg: 31424.20 std: 710.60(6.41%) std: 3763.89(11.98%) max: 12446.00 max: 36635.00 min: 9949.00 min: 25669.00 32 thread records: 10 records: 10 avg: 11067.00 avg: 34495.80 std: 971.06(8.77%) std: 2721.36(7.89%) max: 12010.00 max: 38598.00 min: 9002.00 min: 30636.00 In summary, MADV_FREE is about much faster than MADV_DONTNEED. This patch (of 12): Add core MADV_FREE implementation. [akpm@linux-foundation.org: small cleanups] Signed-off-by: Minchan Kim Acked-by: Michal Hocko Acked-by: Hugh Dickins Cc: Mika Penttil Cc: Michael Kerrisk Cc: Johannes Weiner Cc: Rik van Riel Cc: Mel Gorman Cc: KOSAKI Motohiro Cc: Jason Evans Cc: Daniel Micay Cc: "Kirill A. Shutemov" Cc: Shaohua Li Cc: Cc: Andy Lutomirski Cc: "James E.J. Bottomley" Cc: "Kirill A. Shutemov" Cc: "Shaohua Li" Cc: Andrea Arcangeli Cc: Arnd Bergmann Cc: Benjamin Herrenschmidt Cc: Catalin Marinas Cc: Chen Gang Cc: Chris Zankel Cc: Darrick J. Wong Cc: David S. 
Miller Cc: Helge Deller Cc: Ivan Kokshaysky Cc: Matt Turner Cc: Max Filippov Cc: Ralf Baechle Cc: Richard Henderson Cc: Roland Dreier Cc: Russell King Cc: Shaohua Li Cc: Will Deacon Cc: Wu Fengguang Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/rmap.h | 2 ++ include/linux/vm_event_item.h | 1 + 2 files changed, 3 insertions(+) (limited to 'include/linux') diff --git a/include/linux/rmap.h b/include/linux/rmap.h index 77d1ba57d495..bdf597c4f0be 100644 --- a/include/linux/rmap.h +++ b/include/linux/rmap.h @@ -85,6 +85,7 @@ enum ttu_flags { TTU_UNMAP = 1, /* unmap mode */ TTU_MIGRATION = 2, /* migration mode */ TTU_MUNLOCK = 4, /* munlock mode */ + TTU_LZFREE = 8, /* lazy free mode */ TTU_IGNORE_MLOCK = (1 << 8), /* ignore mlock */ TTU_IGNORE_ACCESS = (1 << 9), /* don't age */ @@ -311,5 +312,6 @@ static inline int page_mkclean(struct page *page) #define SWAP_AGAIN 1 #define SWAP_FAIL 2 #define SWAP_MLOCK 3 +#define SWAP_LZFREE 4 #endif /* _LINUX_RMAP_H */ diff --git a/include/linux/vm_event_item.h b/include/linux/vm_event_item.h index e1f8c993e73b..67c1dbd19c6d 100644 --- a/include/linux/vm_event_item.h +++ b/include/linux/vm_event_item.h @@ -25,6 +25,7 @@ enum vm_event_item { PGPGIN, PGPGOUT, PSWPIN, PSWPOUT, FOR_ALL_ZONES(PGALLOC), PGFREE, PGACTIVATE, PGDEACTIVATE, PGFAULT, PGMAJFAULT, + PGLAZYFREED, FOR_ALL_ZONES(PGREFILL), FOR_ALL_ZONES(PGSTEAL_KSWAPD), FOR_ALL_ZONES(PGSTEAL_DIRECT), -- cgit v1.2.3 From 10853a039208c4afaa322a7d802456c8dca222f4 Mon Sep 17 00:00:00 2001 From: Minchan Kim Date: Fri, 15 Jan 2016 16:55:11 -0800 Subject: mm: move lazily freed pages to inactive list MADV_FREE is a hint that it's okay to discard pages if there is memory pressure and we use reclaimers(ie, kswapd and direct reclaim) to free them so there is no value keeping them in the active anonymous LRU so this patch moves them to inactive LRU list's head. This means that MADV_FREE-ed pages which were living on the inactive list are reclaimed first because they are more likely to be cold rather than recently active pages. An arguable issue for the approach would be whether we should put the page to the head or tail of the inactive list. I chose head because the kernel cannot make sure it's really cold or warm for every MADV_FREE usecase but at least we know it's not *hot*, so landing of inactive head would be a comprimise for various usecases. This fixes suboptimal behavior of MADV_FREE when pages living on the active list will sit there for a long time even under memory pressure while the inactive list is reclaimed heavily. This basically breaks the whole purpose of using MADV_FREE to help the system to free memory which is might not be used. Signed-off-by: Minchan Kim Acked-by: Hugh Dickins Acked-by: Michal Hocko Cc: Johannes Weiner Cc: Mel Gorman Cc: Rik van Riel Cc: Shaohua Li Cc: "James E.J. Bottomley" Cc: "Kirill A. Shutemov" Cc: Cc: Andrea Arcangeli Cc: Andy Lutomirski Cc: Arnd Bergmann Cc: Benjamin Herrenschmidt Cc: Catalin Marinas Cc: Chen Gang Cc: Chris Zankel Cc: Daniel Micay Cc: Darrick J. Wong Cc: David S. Miller Cc: Helge Deller Cc: Ivan Kokshaysky Cc: Jason Evans Cc: KOSAKI Motohiro Cc: Kirill A. 
Shutemov Cc: Matt Turner Cc: Max Filippov Cc: Michael Kerrisk Cc: Mika Penttil Cc: Ralf Baechle Cc: Richard Henderson Cc: Roland Dreier Cc: Russell King Cc: Will Deacon Cc: Wu Fengguang Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/swap.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/swap.h b/include/linux/swap.h index a282933c5bc6..414e101cd061 100644 --- a/include/linux/swap.h +++ b/include/linux/swap.h @@ -307,6 +307,7 @@ extern void lru_add_drain_cpu(int cpu); extern void lru_add_drain_all(void); extern void rotate_reclaimable_page(struct page *page); extern void deactivate_file_page(struct page *page); +extern void deactivate_page(struct page *page); extern void swap_setup(void); extern void add_page_to_unevictable_list(struct page *page); -- cgit v1.2.3 From b8d3c4c3009d42869dc03a1da0efc2aa687d0ab4 Mon Sep 17 00:00:00 2001 From: Minchan Kim Date: Fri, 15 Jan 2016 16:55:42 -0800 Subject: mm/huge_memory.c: don't split THP page when MADV_FREE syscall is called We don't need to split THP page when MADV_FREE syscall is called if [start, len] is aligned with THP size. The split could be done when VM decide to free it in reclaim path if memory pressure is heavy. With that, we could avoid unnecessary THP split. For the feature, this patch changes pte dirtness marking logic of THP. Now, it marks every ptes of pages dirty unconditionally in splitting, which makes MADV_FREE void. So, instead, this patch propagates pmd dirtiness to all pages via PG_dirty and restores pte dirtiness from PG_dirty. With this, if pmd is clean(ie, MADV_FREEed) when split happens(e,g, shrink_page_list), all of pages are clean too so we could discard them. Signed-off-by: Minchan Kim Cc: Kirill A. Shutemov Cc: Hugh Dickins Cc: Andrea Arcangeli Cc: "James E.J. Bottomley" Cc: "Kirill A. Shutemov" Cc: Shaohua Li Cc: Cc: Andy Lutomirski Cc: Arnd Bergmann Cc: Benjamin Herrenschmidt Cc: Catalin Marinas Cc: Chen Gang Cc: Chris Zankel Cc: Daniel Micay Cc: Darrick J. Wong Cc: David S. Miller Cc: Helge Deller Cc: Ivan Kokshaysky Cc: Jason Evans Cc: Johannes Weiner Cc: KOSAKI Motohiro Cc: Matt Turner Cc: Max Filippov Cc: Mel Gorman Cc: Michael Kerrisk Cc: Michal Hocko Cc: Mika Penttil Cc: Ralf Baechle Cc: Richard Henderson Cc: Rik van Riel Cc: Roland Dreier Cc: Russell King Cc: Shaohua Li Cc: Will Deacon Cc: Wu Fengguang Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/huge_mm.h | 3 +++ 1 file changed, 3 insertions(+) (limited to 'include/linux') diff --git a/include/linux/huge_mm.h b/include/linux/huge_mm.h index 72cd942edb22..0160201993d4 100644 --- a/include/linux/huge_mm.h +++ b/include/linux/huge_mm.h @@ -19,6 +19,9 @@ extern struct page *follow_trans_huge_pmd(struct vm_area_struct *vma, unsigned long addr, pmd_t *pmd, unsigned int flags); +extern int madvise_free_huge_pmd(struct mmu_gather *tlb, + struct vm_area_struct *vma, + pmd_t *pmd, unsigned long addr, unsigned long next); extern int zap_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma, pmd_t *pmd, unsigned long addr); -- cgit v1.2.3 From b2e0d1625e193b40cbbd45b799f82d54d34e015c Mon Sep 17 00:00:00 2001 From: Dan Williams Date: Fri, 15 Jan 2016 16:55:59 -0800 Subject: dax: fix lifetime of in-kernel dax mappings with dax_map_atomic() The DAX implementation needs to protect new calls to ->direct_access() and usage of its return value against the driver for the underlying block device being disabled. 
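A rough sketch of that protection pattern, using the blk_dax_ctl structure this patch introduces (the helper below is illustrative; bdev->bd_queue and the blk_queue_enter() signature are assumed from kernels of this era, not taken from this patch):

/* Sketch: pin the request_queue across ->direct_access() and use of its result. */
static long dax_map_sketch(struct block_device *bdev, struct blk_dax_ctl *dax)
{
	struct request_queue *q = bdev->bd_queue;
	long rc;

	if (blk_queue_enter(q, GFP_NOWAIT) != 0)
		return -ENXIO;			/* queue is being torn down */
	rc = bdev_direct_access(bdev, dax);	/* fills dax->addr and dax->pfn */
	if (rc < 0)
		blk_queue_exit(q);		/* nothing mapped, drop the reference */
	return rc;				/* caller pairs success with blk_queue_exit(q) */
}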
Use blk_queue_enter()/blk_queue_exit() to hold off blk_cleanup_queue() from proceeding, or otherwise fail new mapping requests if the request_queue is being torn down. This also introduces blk_dax_ctl to simplify the interface from fs/dax.c through dax_map_atomic() to bdev_direct_access(). [willy@linux.intel.com: fix read() of a hole] Signed-off-by: Dan Williams Reviewed-by: Jeff Moyer Cc: Jan Kara Cc: Jens Axboe Cc: Dave Chinner Cc: Ross Zwisler Cc: Matthew Wilcox Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/blkdev.h | 17 +++++++++++++++-- 1 file changed, 15 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index c70e3588a48c..88821fa26f19 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -1617,6 +1617,20 @@ static inline bool integrity_req_gap_front_merge(struct request *req, #endif /* CONFIG_BLK_DEV_INTEGRITY */ +/** + * struct blk_dax_ctl - control and output parameters for ->direct_access + * @sector: (input) offset relative to a block_device + * @addr: (output) kernel virtual address for @sector populated by driver + * @pfn: (output) page frame number for @addr populated by driver + * @size: (input) number of bytes requested + */ +struct blk_dax_ctl { + sector_t sector; + void __pmem *addr; + long size; + unsigned long pfn; +}; + struct block_device_operations { int (*open) (struct block_device *, fmode_t); void (*release) (struct gendisk *, fmode_t); @@ -1643,8 +1657,7 @@ extern int __blkdev_driver_ioctl(struct block_device *, fmode_t, unsigned int, extern int bdev_read_page(struct block_device *, sector_t, struct page *); extern int bdev_write_page(struct block_device *, sector_t, struct page *, struct writeback_control *); -extern long bdev_direct_access(struct block_device *, sector_t, - void __pmem **addr, unsigned long *pfn, long size); +extern long bdev_direct_access(struct block_device *, struct blk_dax_ctl *); #else /* CONFIG_BLOCK */ struct block_device; -- cgit v1.2.3 From ba049e93aef7e8c571567088b1b73f4f5b99272a Mon Sep 17 00:00:00 2001 From: Dan Williams Date: Fri, 15 Jan 2016 16:56:11 -0800 Subject: kvm: rename pfn_t to kvm_pfn_t To date, we have implemented two I/O usage models for persistent memory, PMEM (a persistent "ram disk") and DAX (mmap persistent memory into userspace). This series adds a third, DAX-GUP, that allows DAX mappings to be the target of direct-i/o. It allows userspace to coordinate DMA/RDMA from/to persistent memory. The implementation leverages the ZONE_DEVICE mm-zone that went into 4.3-rc1 (also discussed at kernel summit) to flag pages that are owned and dynamically mapped by a device driver. The pmem driver, after mapping a persistent memory range into the system memmap via devm_memremap_pages(), arranges for DAX to distinguish pfn-only versus page-backed pmem-pfns via flags in the new pfn_t type. The DAX code, upon seeing a PFN_DEV+PFN_MAP flagged pfn, flags the resulting pte(s) inserted into the process page tables with a new _PAGE_DEVMAP flag. Later, when get_user_pages() is walking ptes it keys off _PAGE_DEVMAP to pin the device hosting the page range active. Finally, get_page() and put_page() are modified to take references against the device driver established page mapping. Finally, this need for "struct page" for persistent memory requires memory capacity to store the memmap array. 
Given the memmap array for a large pool of persistent may exhaust available DRAM introduce a mechanism to allocate the memmap from persistent memory. The new "struct vmem_altmap *" parameter to devm_memremap_pages() enables arch_add_memory() to use reserved pmem capacity rather than the page allocator. This patch (of 18): The core has developed a need for a "pfn_t" type [1]. Move the existing pfn_t in KVM to kvm_pfn_t [2]. [1]: https://lists.01.org/pipermail/linux-nvdimm/2015-September/002199.html [2]: https://lists.01.org/pipermail/linux-nvdimm/2015-September/002218.html Signed-off-by: Dan Williams Acked-by: Christoffer Dall Cc: Paolo Bonzini Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/kvm_host.h | 37 +++++++++++++++++++------------------ include/linux/kvm_types.h | 2 +- 2 files changed, 20 insertions(+), 19 deletions(-) (limited to 'include/linux') diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h index f707f74055c3..861f690aa791 100644 --- a/include/linux/kvm_host.h +++ b/include/linux/kvm_host.h @@ -66,7 +66,7 @@ * error pfns indicate that the gfn is in slot but faild to * translate it to pfn on host. */ -static inline bool is_error_pfn(pfn_t pfn) +static inline bool is_error_pfn(kvm_pfn_t pfn) { return !!(pfn & KVM_PFN_ERR_MASK); } @@ -76,13 +76,13 @@ static inline bool is_error_pfn(pfn_t pfn) * translated to pfn - it is not in slot or failed to * translate it to pfn. */ -static inline bool is_error_noslot_pfn(pfn_t pfn) +static inline bool is_error_noslot_pfn(kvm_pfn_t pfn) { return !!(pfn & KVM_PFN_ERR_NOSLOT_MASK); } /* noslot pfn indicates that the gfn is not in slot. */ -static inline bool is_noslot_pfn(pfn_t pfn) +static inline bool is_noslot_pfn(kvm_pfn_t pfn) { return pfn == KVM_PFN_NOSLOT; } @@ -591,19 +591,20 @@ void kvm_release_page_clean(struct page *page); void kvm_release_page_dirty(struct page *page); void kvm_set_page_accessed(struct page *page); -pfn_t gfn_to_pfn_atomic(struct kvm *kvm, gfn_t gfn); -pfn_t gfn_to_pfn(struct kvm *kvm, gfn_t gfn); -pfn_t gfn_to_pfn_prot(struct kvm *kvm, gfn_t gfn, bool write_fault, +kvm_pfn_t gfn_to_pfn_atomic(struct kvm *kvm, gfn_t gfn); +kvm_pfn_t gfn_to_pfn(struct kvm *kvm, gfn_t gfn); +kvm_pfn_t gfn_to_pfn_prot(struct kvm *kvm, gfn_t gfn, bool write_fault, bool *writable); -pfn_t gfn_to_pfn_memslot(struct kvm_memory_slot *slot, gfn_t gfn); -pfn_t gfn_to_pfn_memslot_atomic(struct kvm_memory_slot *slot, gfn_t gfn); -pfn_t __gfn_to_pfn_memslot(struct kvm_memory_slot *slot, gfn_t gfn, bool atomic, - bool *async, bool write_fault, bool *writable); +kvm_pfn_t gfn_to_pfn_memslot(struct kvm_memory_slot *slot, gfn_t gfn); +kvm_pfn_t gfn_to_pfn_memslot_atomic(struct kvm_memory_slot *slot, gfn_t gfn); +kvm_pfn_t __gfn_to_pfn_memslot(struct kvm_memory_slot *slot, gfn_t gfn, + bool atomic, bool *async, bool write_fault, + bool *writable); -void kvm_release_pfn_clean(pfn_t pfn); -void kvm_set_pfn_dirty(pfn_t pfn); -void kvm_set_pfn_accessed(pfn_t pfn); -void kvm_get_pfn(pfn_t pfn); +void kvm_release_pfn_clean(kvm_pfn_t pfn); +void kvm_set_pfn_dirty(kvm_pfn_t pfn); +void kvm_set_pfn_accessed(kvm_pfn_t pfn); +void kvm_get_pfn(kvm_pfn_t pfn); int kvm_read_guest_page(struct kvm *kvm, gfn_t gfn, void *data, int offset, int len); @@ -629,8 +630,8 @@ void mark_page_dirty(struct kvm *kvm, gfn_t gfn); struct kvm_memslots *kvm_vcpu_memslots(struct kvm_vcpu *vcpu); struct kvm_memory_slot *kvm_vcpu_gfn_to_memslot(struct kvm_vcpu *vcpu, gfn_t gfn); -pfn_t kvm_vcpu_gfn_to_pfn_atomic(struct kvm_vcpu *vcpu, gfn_t 
gfn); -pfn_t kvm_vcpu_gfn_to_pfn(struct kvm_vcpu *vcpu, gfn_t gfn); +kvm_pfn_t kvm_vcpu_gfn_to_pfn_atomic(struct kvm_vcpu *vcpu, gfn_t gfn); +kvm_pfn_t kvm_vcpu_gfn_to_pfn(struct kvm_vcpu *vcpu, gfn_t gfn); struct page *kvm_vcpu_gfn_to_page(struct kvm_vcpu *vcpu, gfn_t gfn); unsigned long kvm_vcpu_gfn_to_hva(struct kvm_vcpu *vcpu, gfn_t gfn); unsigned long kvm_vcpu_gfn_to_hva_prot(struct kvm_vcpu *vcpu, gfn_t gfn, bool *writable); @@ -811,7 +812,7 @@ void kvm_arch_sync_events(struct kvm *kvm); int kvm_cpu_has_pending_timer(struct kvm_vcpu *vcpu); void kvm_vcpu_kick(struct kvm_vcpu *vcpu); -bool kvm_is_reserved_pfn(pfn_t pfn); +bool kvm_is_reserved_pfn(kvm_pfn_t pfn); struct kvm_irq_ack_notifier { struct hlist_node link; @@ -965,7 +966,7 @@ static inline gfn_t gpa_to_gfn(gpa_t gpa) return (gfn_t)(gpa >> PAGE_SHIFT); } -static inline hpa_t pfn_to_hpa(pfn_t pfn) +static inline hpa_t pfn_to_hpa(kvm_pfn_t pfn) { return (hpa_t)pfn << PAGE_SHIFT; } diff --git a/include/linux/kvm_types.h b/include/linux/kvm_types.h index 1b47a185c2f0..8bf259dae9f6 100644 --- a/include/linux/kvm_types.h +++ b/include/linux/kvm_types.h @@ -53,7 +53,7 @@ typedef unsigned long hva_t; typedef u64 hpa_t; typedef u64 hfn_t; -typedef hfn_t pfn_t; +typedef hfn_t kvm_pfn_t; struct gfn_to_hva_cache { u64 generation; -- cgit v1.2.3 From 34c0fd540e79fb49ef9ce864dae1058cca265780 Mon Sep 17 00:00:00 2001 From: Dan Williams Date: Fri, 15 Jan 2016 16:56:14 -0800 Subject: mm, dax, pmem: introduce pfn_t For the purpose of communicating the optional presence of a 'struct page' for the pfn returned from ->direct_access(), introduce a type that encapsulates a page-frame-number plus flags. These flags contain the historical "page_link" encoding for a scatterlist entry, but can also denote "device memory". Where "device memory" is a set of pfns that are not part of the kernel's linear mapping by default, but are accessed via the same memory controller as ram. The motivation for this new type is large capacity persistent memory that needs struct page entries in the 'memmap' to support 3rd party DMA (i.e. O_DIRECT I/O with a persistent memory source/target). However, we also need it in support of maintaining a list of mapped inodes which need to be unmapped at driver teardown or freeze_bdev() time. 
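As a hedged usage sketch (the producer and consumer functions are made up; the helpers and flags are the ones this patch adds in pfn_t.h below):

/* Producer side: a driver reports device memory that does have a memmap. */
static pfn_t direct_access_result_sketch(unsigned long pfn)
{
	return __pfn_to_pfn_t(pfn, PFN_DEV | PFN_MAP);
}

/* Consumer side: DAX only touches struct page when the flags allow it. */
static struct page *consumer_sketch(pfn_t pfn)
{
	if (pfn_t_has_page(pfn))		/* PFN_MAP set, or PFN_DEV clear */
		return pfn_t_to_page(pfn);	/* safe: memmap exists for this pfn */
	return NULL;				/* pfn-only device memory */
}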
Signed-off-by: Dan Williams Cc: Christoph Hellwig Cc: Dave Hansen Cc: Ross Zwisler Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/blkdev.h | 5 ++-- include/linux/pfn.h | 9 +++++++ include/linux/pfn_t.h | 67 ++++++++++++++++++++++++++++++++++++++++++++++++++ 3 files changed, 79 insertions(+), 2 deletions(-) create mode 100644 include/linux/pfn_t.h (limited to 'include/linux') diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index 88821fa26f19..bfb64d672e19 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -15,6 +15,7 @@ #include #include #include +#include #include #include #include @@ -1628,7 +1629,7 @@ struct blk_dax_ctl { sector_t sector; void __pmem *addr; long size; - unsigned long pfn; + pfn_t pfn; }; struct block_device_operations { @@ -1638,7 +1639,7 @@ struct block_device_operations { int (*ioctl) (struct block_device *, fmode_t, unsigned, unsigned long); int (*compat_ioctl) (struct block_device *, fmode_t, unsigned, unsigned long); long (*direct_access)(struct block_device *, sector_t, void __pmem **, - unsigned long *pfn); + pfn_t *); unsigned int (*check_events) (struct gendisk *disk, unsigned int clearing); /* ->media_changed() is DEPRECATED, use ->check_events() instead */ diff --git a/include/linux/pfn.h b/include/linux/pfn.h index 97f3e88aead4..2d8e49711b63 100644 --- a/include/linux/pfn.h +++ b/include/linux/pfn.h @@ -3,6 +3,15 @@ #ifndef __ASSEMBLY__ #include + +/* + * pfn_t: encapsulates a page-frame number that is optionally backed + * by memmap (struct page). Whether a pfn_t has a 'struct page' + * backing is indicated by flags in the high bits of the value. + */ +typedef struct { + unsigned long val; +} pfn_t; #endif #define PFN_ALIGN(x) (((unsigned long)(x) + (PAGE_SIZE - 1)) & PAGE_MASK) diff --git a/include/linux/pfn_t.h b/include/linux/pfn_t.h new file mode 100644 index 000000000000..c557a0e0b20c --- /dev/null +++ b/include/linux/pfn_t.h @@ -0,0 +1,67 @@ +#ifndef _LINUX_PFN_T_H_ +#define _LINUX_PFN_T_H_ +#include + +/* + * PFN_FLAGS_MASK - mask of all the possible valid pfn_t flags + * PFN_SG_CHAIN - pfn is a pointer to the next scatterlist entry + * PFN_SG_LAST - pfn references a page and is the last scatterlist entry + * PFN_DEV - pfn is not covered by system memmap by default + * PFN_MAP - pfn has a dynamic page mapping established by a device driver + */ +#define PFN_FLAGS_MASK (((unsigned long) ~PAGE_MASK) \ + << (BITS_PER_LONG - PAGE_SHIFT)) +#define PFN_SG_CHAIN (1UL << (BITS_PER_LONG - 1)) +#define PFN_SG_LAST (1UL << (BITS_PER_LONG - 2)) +#define PFN_DEV (1UL << (BITS_PER_LONG - 3)) +#define PFN_MAP (1UL << (BITS_PER_LONG - 4)) + +static inline pfn_t __pfn_to_pfn_t(unsigned long pfn, unsigned long flags) +{ + pfn_t pfn_t = { .val = pfn | (flags & PFN_FLAGS_MASK), }; + + return pfn_t; +} + +/* a default pfn to pfn_t conversion assumes that @pfn is pfn_valid() */ +static inline pfn_t pfn_to_pfn_t(unsigned long pfn) +{ + return __pfn_to_pfn_t(pfn, 0); +} + +extern pfn_t phys_to_pfn_t(dma_addr_t addr, unsigned long flags); + +static inline bool pfn_t_has_page(pfn_t pfn) +{ + return (pfn.val & PFN_MAP) == PFN_MAP || (pfn.val & PFN_DEV) == 0; +} + +static inline unsigned long pfn_t_to_pfn(pfn_t pfn) +{ + return pfn.val & ~PFN_FLAGS_MASK; +} + +static inline struct page *pfn_t_to_page(pfn_t pfn) +{ + if (pfn_t_has_page(pfn)) + return pfn_to_page(pfn_t_to_pfn(pfn)); + return NULL; +} + +static inline dma_addr_t pfn_t_to_phys(pfn_t pfn) +{ + return PFN_PHYS(pfn_t_to_pfn(pfn)); +} + +static inline void 
*pfn_t_to_virt(pfn_t pfn) +{ + if (pfn_t_has_page(pfn)) + return __va(pfn_t_to_phys(pfn)); + return NULL; +} + +static inline pfn_t page_to_pfn_t(struct page *page) +{ + return pfn_to_pfn_t(page_to_pfn(page)); +} +#endif /* _LINUX_PFN_T_H_ */ -- cgit v1.2.3 From 260ae3f7db614a5c4aa4b773599f99adc1d9859e Mon Sep 17 00:00:00 2001 From: Dan Williams Date: Fri, 15 Jan 2016 16:56:17 -0800 Subject: mm: skip memory block registration for ZONE_DEVICE Prevent userspace from trying and failing to online ZONE_DEVICE pages which are meant to never be onlined. For example on platforms with a udev rule like the following: SUBSYSTEM=="memory", ACTION=="add", ATTR{state}=="offline", ATTR{state}="online" ...will generate futile attempts to online the ZONE_DEVICE sections. Example kernel messages: Built 1 zonelists in Node order, mobility grouping on. Total pages: 1004747 Policy zone: Normal online_pages [mem 0x248000000-0x24fffffff] failed Signed-off-by: Dan Williams Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/mm.h | 12 ++++++++++++ 1 file changed, 12 insertions(+) (limited to 'include/linux') diff --git a/include/linux/mm.h b/include/linux/mm.h index 0ef5f21735af..d9fe12d45c21 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -674,6 +674,18 @@ static inline enum zone_type page_zonenum(const struct page *page) return (page->flags >> ZONES_PGSHIFT) & ZONES_MASK; } +#ifdef CONFIG_ZONE_DEVICE +static inline bool is_zone_device_page(const struct page *page) +{ + return page_zonenum(page) == ZONE_DEVICE; +} +#else +static inline bool is_zone_device_page(const struct page *page) +{ + return false; +} +#endif + #if defined(CONFIG_SPARSEMEM) && !defined(CONFIG_SPARSEMEM_VMEMMAP) #define SECTION_IN_PAGE_FLAGS #endif -- cgit v1.2.3 From 9476df7d80dfc425b37bfecf1d89edf8ec81fcb6 Mon Sep 17 00:00:00 2001 From: Dan Williams Date: Fri, 15 Jan 2016 16:56:19 -0800 Subject: mm: introduce find_dev_pagemap() There are several scenarios where we need to retrieve and update metadata associated with a given devm_memremap_pages() mapping, and the only lookup key available is a pfn in the range: 1/ We want to augment vmemmap_populate() (called via arch_add_memory()) to allocate memmap storage from pre-allocated pages reserved by the device driver. At vmemmap_alloc_block_buf() time it grabs device pages rather than page allocator pages. This is in support of devm_memremap_pages() mappings where the memmap is too large to fit in main memory (i.e. large persistent memory devices). 2/ Taking a reference against the mapping when inserting device pages into the address_space radix of a given inode. This facilitates unmap_mapping_range() and truncate_inode_pages() operations when the driver is tearing down the mapping. 3/ get_user_pages() operations on ZONE_DEVICE memory require taking a reference against the mapping so that the driver teardown path can revoke and drain usage of device pages. 
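A minimal sketch (not part of the patch) of the lookup described in scenarios 2/ and 3/ above, assuming the caller otherwise keeps the mapping from being torn down; reference counting only arrives with the later {get|put}_dev_pagemap() patch in this series, and example_pfn_to_pagemap() is an invented name.

/*
 * Resolve the dev_pagemap hosting a pfn.  find_dev_pagemap() is the
 * interface added below; PFN_PHYS() comes from linux/pfn.h.  Returns
 * NULL when the pfn is not covered by a devm_memremap_pages() range.
 */
static struct dev_pagemap *example_pfn_to_pagemap(unsigned long pfn)
{
	struct dev_pagemap *pgmap;

	rcu_read_lock();
	pgmap = find_dev_pagemap(PFN_PHYS(pfn));
	rcu_read_unlock();

	return pgmap;
}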
Signed-off-by: Dan Williams Tested-by: Logan Gunthorpe Cc: Christoph Hellwig Cc: Dave Chinner Cc: Ross Zwisler Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/io.h | 15 --------------- include/linux/memremap.h | 38 ++++++++++++++++++++++++++++++++++++++ 2 files changed, 38 insertions(+), 15 deletions(-) create mode 100644 include/linux/memremap.h (limited to 'include/linux') diff --git a/include/linux/io.h b/include/linux/io.h index de64c1e53612..fffd88d7f426 100644 --- a/include/linux/io.h +++ b/include/linux/io.h @@ -89,21 +89,6 @@ void devm_memunmap(struct device *dev, void *addr); void *__devm_memremap_pages(struct device *dev, struct resource *res); -#ifdef CONFIG_ZONE_DEVICE -void *devm_memremap_pages(struct device *dev, struct resource *res); -#else -static inline void *devm_memremap_pages(struct device *dev, struct resource *res) -{ - /* - * Fail attempts to call devm_memremap_pages() without - * ZONE_DEVICE support enabled, this requires callers to fall - * back to plain devm_memremap() based on config - */ - WARN_ON_ONCE(1); - return ERR_PTR(-ENXIO); -} -#endif - /* * Some systems do not have legacy ISA devices. * /dev/port is not a valid interface on these systems. diff --git a/include/linux/memremap.h b/include/linux/memremap.h new file mode 100644 index 000000000000..d90721c178bb --- /dev/null +++ b/include/linux/memremap.h @@ -0,0 +1,38 @@ +#ifndef _LINUX_MEMREMAP_H_ +#define _LINUX_MEMREMAP_H_ +#include + +struct resource; +struct device; +/** + * struct dev_pagemap - metadata for ZONE_DEVICE mappings + * @dev: host device of the mapping for debug + */ +struct dev_pagemap { + /* TODO: vmem_altmap and percpu_ref count */ + struct device *dev; +}; + +#ifdef CONFIG_ZONE_DEVICE +void *devm_memremap_pages(struct device *dev, struct resource *res); +struct dev_pagemap *find_dev_pagemap(resource_size_t phys); +#else +static inline void *devm_memremap_pages(struct device *dev, + struct resource *res) +{ + /* + * Fail attempts to call devm_memremap_pages() without + * ZONE_DEVICE support enabled, this requires callers to fall + * back to plain devm_memremap() based on config + */ + WARN_ON_ONCE(1); + return ERR_PTR(-ENXIO); +} + +static inline struct dev_pagemap *find_dev_pagemap(resource_size_t phys) +{ + return NULL; +} +#endif + +#endif /* _LINUX_MEMREMAP_H_ */ -- cgit v1.2.3 From 4b94ffdc4163bae1ec73b6e977ffb7a7da3d06d3 Mon Sep 17 00:00:00 2001 From: Dan Williams Date: Fri, 15 Jan 2016 16:56:22 -0800 Subject: x86, mm: introduce vmem_altmap to augment vmemmap_populate() In support of providing struct page for large persistent memory capacities, use struct vmem_altmap to change the default policy for allocating memory for the memmap array. The default vmemmap_populate() allocates page table storage area from the page allocator. Given persistent memory capacities relative to DRAM it may not be feasible to store the memmap in 'System Memory'. Instead vmem_altmap represents pre-allocated "device pages" to satisfy vmemmap_alloc_block_buf() requests. Signed-off-by: Dan Williams Reported-by: kbuild test robot Cc: Thomas Gleixner Cc: Ingo Molnar Cc: "H. 
Peter Anvin" Cc: Dave Hansen Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/memory_hotplug.h | 3 ++- include/linux/memremap.h | 39 +++++++++++++++++++++++++++++++++++---- include/linux/mm.h | 9 ++++++++- 3 files changed, 45 insertions(+), 6 deletions(-) (limited to 'include/linux') diff --git a/include/linux/memory_hotplug.h b/include/linux/memory_hotplug.h index 2ea574ff9714..43405992d027 100644 --- a/include/linux/memory_hotplug.h +++ b/include/linux/memory_hotplug.h @@ -275,7 +275,8 @@ extern int offline_pages(unsigned long start_pfn, unsigned long nr_pages); extern bool is_memblock_offlined(struct memory_block *mem); extern void remove_memory(int nid, u64 start, u64 size); extern int sparse_add_one_section(struct zone *zone, unsigned long start_pfn); -extern void sparse_remove_one_section(struct zone *zone, struct mem_section *ms); +extern void sparse_remove_one_section(struct zone *zone, struct mem_section *ms, + unsigned long map_offset); extern struct page *sparse_decode_mem_map(unsigned long coded_mem_map, unsigned long pnum); diff --git a/include/linux/memremap.h b/include/linux/memremap.h index d90721c178bb..aa3e82a80d7b 100644 --- a/include/linux/memremap.h +++ b/include/linux/memremap.h @@ -4,21 +4,53 @@ struct resource; struct device; + +/** + * struct vmem_altmap - pre-allocated storage for vmemmap_populate + * @base_pfn: base of the entire dev_pagemap mapping + * @reserve: pages mapped, but reserved for driver use (relative to @base) + * @free: free pages set aside in the mapping for memmap storage + * @align: pages reserved to meet allocation alignments + * @alloc: track pages consumed, private to vmemmap_populate() + */ +struct vmem_altmap { + const unsigned long base_pfn; + const unsigned long reserve; + unsigned long free; + unsigned long align; + unsigned long alloc; +}; + +unsigned long vmem_altmap_offset(struct vmem_altmap *altmap); +void vmem_altmap_free(struct vmem_altmap *altmap, unsigned long nr_pfns); + +#if defined(CONFIG_SPARSEMEM_VMEMMAP) && defined(CONFIG_ZONE_DEVICE) +struct vmem_altmap *to_vmem_altmap(unsigned long memmap_start); +#else +static inline struct vmem_altmap *to_vmem_altmap(unsigned long memmap_start) +{ + return NULL; +} +#endif + /** * struct dev_pagemap - metadata for ZONE_DEVICE mappings + * @altmap: pre-allocated/reserved memory for vmemmap allocations * @dev: host device of the mapping for debug */ struct dev_pagemap { - /* TODO: vmem_altmap and percpu_ref count */ + struct vmem_altmap *altmap; + const struct resource *res; struct device *dev; }; #ifdef CONFIG_ZONE_DEVICE -void *devm_memremap_pages(struct device *dev, struct resource *res); +void *devm_memremap_pages(struct device *dev, struct resource *res, + struct vmem_altmap *altmap); struct dev_pagemap *find_dev_pagemap(resource_size_t phys); #else static inline void *devm_memremap_pages(struct device *dev, - struct resource *res) + struct resource *res, struct vmem_altmap *altmap) { /* * Fail attempts to call devm_memremap_pages() without @@ -34,5 +66,4 @@ static inline struct dev_pagemap *find_dev_pagemap(resource_size_t phys) return NULL; } #endif - #endif /* _LINUX_MEMREMAP_H_ */ diff --git a/include/linux/mm.h b/include/linux/mm.h index d9fe12d45c21..8bb0907a3603 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -2217,7 +2217,14 @@ pud_t *vmemmap_pud_populate(pgd_t *pgd, unsigned long addr, int node); pmd_t *vmemmap_pmd_populate(pud_t *pud, unsigned long addr, int node); pte_t *vmemmap_pte_populate(pmd_t *pmd, unsigned long addr, int node); 
void *vmemmap_alloc_block(unsigned long size, int node); -void *vmemmap_alloc_block_buf(unsigned long size, int node); +struct vmem_altmap; +void *__vmemmap_alloc_block_buf(unsigned long size, int node, + struct vmem_altmap *altmap); +static inline void *vmemmap_alloc_block_buf(unsigned long size, int node) +{ + return __vmemmap_alloc_block_buf(size, node, NULL); +} + void vmemmap_verify(pte_t *, int, unsigned long, unsigned long); int vmemmap_populate_basepages(unsigned long start, unsigned long end, int node); -- cgit v1.2.3 From 888cdbc2c9a76a0e450f533b1957cdbfe7d483d5 Mon Sep 17 00:00:00 2001 From: Dan Williams Date: Fri, 15 Jan 2016 16:56:32 -0800 Subject: hugetlb: fix compile error on tile Inlude asm/pgtable.h to get the definition for pud_t to fix: include/linux/hugetlb.h:203:29: error: unknown type name 'pud_t' Signed-off-by: Dan Williams Cc: Liviu Dudau Cc: Sudeep Holla Cc: Lorenzo Pieralisi Cc: David Howells Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/hugetlb.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h index e76574d8f9b5..7d953c2542a8 100644 --- a/include/linux/hugetlb.h +++ b/include/linux/hugetlb.h @@ -8,6 +8,7 @@ #include #include #include +#include struct ctl_table; struct user_struct; -- cgit v1.2.3 From 01c8f1c44b83a0825b573e7c723b033cece37b86 Mon Sep 17 00:00:00 2001 From: Dan Williams Date: Fri, 15 Jan 2016 16:56:40 -0800 Subject: mm, dax, gpu: convert vm_insert_mixed to pfn_t Convert the raw unsigned long 'pfn' argument to pfn_t for the purpose of evaluating the PFN_MAP and PFN_DEV flags. When both are set it triggers _PAGE_DEVMAP to be set in the resulting pte. There are no functional changes to the gpu drivers as a result of this conversion. 
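Illustration (not part of the patch): the call-site side of this conversion, sketched from the new prototype rather than copied from any driver; the fault handler, its arguments and the PFN_DEV choice are invented, only vm_insert_mixed() and __pfn_to_pfn_t() come from the series.

/*
 * A converted caller.  PFN_DEV alone means "no struct page", so
 * vm_insert_mixed() behaves as before; only pfns flagged PFN_DEV|PFN_MAP
 * get the new _PAGE_DEVMAP treatment.
 */
static int example_mixed_fault(struct vm_area_struct *vma,
		struct vm_fault *vmf, unsigned long pfn)
{
	int ret;

	ret = vm_insert_mixed(vma, (unsigned long)vmf->virtual_address,
			      __pfn_to_pfn_t(pfn, PFN_DEV));
	if (ret == -ENOMEM)
		return VM_FAULT_OOM;
	if (ret < 0 && ret != -EBUSY)
		return VM_FAULT_SIGBUS;

	return VM_FAULT_NOPAGE;
}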
Signed-off-by: Dan Williams Cc: Dave Hansen Cc: David Airlie Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/mm.h | 2 +- include/linux/pfn_t.h | 27 +++++++++++++++++++++++++++ 2 files changed, 28 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/mm.h b/include/linux/mm.h index 8bb0907a3603..a9902152449f 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -2107,7 +2107,7 @@ int vm_insert_page(struct vm_area_struct *, unsigned long addr, struct page *); int vm_insert_pfn(struct vm_area_struct *vma, unsigned long addr, unsigned long pfn); int vm_insert_mixed(struct vm_area_struct *vma, unsigned long addr, - unsigned long pfn); + pfn_t pfn); int vm_iomap_memory(struct vm_area_struct *vma, phys_addr_t start, unsigned long len); diff --git a/include/linux/pfn_t.h b/include/linux/pfn_t.h index c557a0e0b20c..bdaa275d7623 100644 --- a/include/linux/pfn_t.h +++ b/include/linux/pfn_t.h @@ -64,4 +64,31 @@ static inline pfn_t page_to_pfn_t(struct page *page) { return pfn_to_pfn_t(page_to_pfn(page)); } + +static inline int pfn_t_valid(pfn_t pfn) +{ + return pfn_valid(pfn_t_to_pfn(pfn)); +} + +#ifdef CONFIG_MMU +static inline pte_t pfn_t_pte(pfn_t pfn, pgprot_t pgprot) +{ + return pfn_pte(pfn_t_to_pfn(pfn), pgprot); +} +#endif + +#ifdef __HAVE_ARCH_PTE_DEVMAP +static inline bool pfn_t_devmap(pfn_t pfn) +{ + const unsigned long flags = PFN_DEV|PFN_MAP; + + return (pfn.val & flags) == flags; +} +#else +static inline bool pfn_t_devmap(pfn_t pfn) +{ + return false; +} +pte_t pte_mkdevmap(pte_t pte); +#endif #endif /* _LINUX_PFN_T_H_ */ -- cgit v1.2.3 From f25748e3c34eb8bb54853e9adba2d3dcf030503c Mon Sep 17 00:00:00 2001 From: Dan Williams Date: Fri, 15 Jan 2016 16:56:43 -0800 Subject: mm, dax: convert vmf_insert_pfn_pmd() to pfn_t Similar to the conversion of vm_insert_mixed() use pfn_t in the vmf_insert_pfn_pmd() to tag the resulting pte with _PAGE_DEVICE when the pfn is backed by a devm_memremap_pages() mapping. 
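A simplified, non-verbatim sketch (not part of the patch) of the pmd construction this conversion enables, built from pfn_t_pmd() and pfn_t_devmap() in the hunk below; the helper name is invented and the real logic lives in mm/huge_memory.c.

/*
 * Build a huge pmd from a pfn_t and mark it devmap when the pfn comes
 * from a devm_memremap_pages() mapping (PFN_DEV|PFN_MAP both set).
 */
static pmd_t example_mk_huge_pmd(pfn_t pfn, pgprot_t prot, bool write)
{
	pmd_t entry = pmd_mkhuge(pfn_t_pmd(pfn, prot));

	if (pfn_t_devmap(pfn))
		entry = pmd_mkdevmap(entry);
	if (write)
		entry = pmd_mkwrite(pmd_mkdirty(entry));

	return entry;
}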
Signed-off-by: Dan Williams Cc: Dave Hansen Cc: Matthew Wilcox Cc: Alexander Viro Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/huge_mm.h | 2 +- include/linux/pfn_t.h | 8 ++++++++ 2 files changed, 9 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/huge_mm.h b/include/linux/huge_mm.h index 0160201993d4..8ca35a131904 100644 --- a/include/linux/huge_mm.h +++ b/include/linux/huge_mm.h @@ -37,7 +37,7 @@ extern int change_huge_pmd(struct vm_area_struct *vma, pmd_t *pmd, unsigned long addr, pgprot_t newprot, int prot_numa); int vmf_insert_pfn_pmd(struct vm_area_struct *, unsigned long addr, pmd_t *, - unsigned long pfn, bool write); + pfn_t pfn, bool write); enum transparent_hugepage_flag { TRANSPARENT_HUGEPAGE_FLAG, diff --git a/include/linux/pfn_t.h b/include/linux/pfn_t.h index bdaa275d7623..0703b5360d31 100644 --- a/include/linux/pfn_t.h +++ b/include/linux/pfn_t.h @@ -77,6 +77,13 @@ static inline pte_t pfn_t_pte(pfn_t pfn, pgprot_t pgprot) } #endif +#ifdef CONFIG_TRANSPARENT_HUGEPAGE +static inline pmd_t pfn_t_pmd(pfn_t pfn, pgprot_t pgprot) +{ + return pfn_pmd(pfn_t_to_pfn(pfn), pgprot); +} +#endif + #ifdef __HAVE_ARCH_PTE_DEVMAP static inline bool pfn_t_devmap(pfn_t pfn) { @@ -90,5 +97,6 @@ static inline bool pfn_t_devmap(pfn_t pfn) return false; } pte_t pte_mkdevmap(pte_t pte); +pmd_t pmd_mkdevmap(pmd_t pmd); #endif #endif /* _LINUX_PFN_T_H_ */ -- cgit v1.2.3 From 5c2c2587b13235bf8b5c9027589f22eff68bdf49 Mon Sep 17 00:00:00 2001 From: Dan Williams Date: Fri, 15 Jan 2016 16:56:49 -0800 Subject: mm, dax, pmem: introduce {get|put}_dev_pagemap() for dax-gup get_dev_page() enables paths like get_user_pages() to pin a dynamically mapped pfn-range (devm_memremap_pages()) while the resulting struct page objects are in use. Unlike get_page() it may fail if the device is, or is in the process of being, disabled. While the initial lookup of the range may be an expensive list walk, the result is cached to speed up subsequent lookups which are likely to be in the same mapped range. devm_memremap_pages() now requires a reference counter to be specified at init time. For pmem this means moving request_queue allocation into pmem_alloc() so the existing queue usage counter can track "device pages". ZONE_DEVICE pages always have an elevated count and will never be on an lru reclaim list. That space in 'struct page' can be redirected for other uses, but for safety introduce a poison value that will always trip __list_add() to assert. This allows half of the struct list_head storage to be reclaimed with some assurance to back up the assumption that the page count never goes to zero and a list_add() is never attempted. 
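Illustration (not part of the patch): the caller pattern this pairing is designed for, modelled on, not copied from, the get_user_pages() changes later in the series; example_pin_devmap_pfn() is an invented name, and the assumption that get_page() keeps the hosting pgmap alive relies on the ZONE_DEVICE-aware get_page() introduced further down.

/*
 * Pin the device mapping long enough to resolve and take a reference on
 * the page, then drop the lookup reference; the page itself now keeps
 * the pgmap alive until put_page().
 */
static struct page *example_pin_devmap_pfn(unsigned long pfn)
{
	struct dev_pagemap *pgmap;
	struct page *page;

	pgmap = get_dev_pagemap(pfn, NULL);
	if (!pgmap)
		return NULL;	/* mapping absent or being torn down */

	page = pfn_to_page(pfn);
	get_page(page);
	put_dev_pagemap(pgmap);

	return page;
}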
Signed-off-by: Dan Williams Tested-by: Logan Gunthorpe Cc: Dave Hansen Cc: Matthew Wilcox Cc: Ross Zwisler Cc: Alexander Viro Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/list.h | 11 +++++++++++ include/linux/memremap.h | 49 ++++++++++++++++++++++++++++++++++++++++++++++-- include/linux/mm_types.h | 5 +++++ 3 files changed, 63 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/list.h b/include/linux/list.h index 5356f4d661a7..30cf4200ab40 100644 --- a/include/linux/list.h +++ b/include/linux/list.h @@ -113,6 +113,17 @@ extern void __list_del_entry(struct list_head *entry); extern void list_del(struct list_head *entry); #endif +#ifdef CONFIG_DEBUG_LIST +/* + * See devm_memremap_pages() which wants DEBUG_LIST=y to assert if one + * of the pages it allocates is ever passed to list_add() + */ +extern void list_force_poison(struct list_head *entry); +#else +/* fallback to the less strict LIST_POISON* definitions */ +#define list_force_poison list_del +#endif + /** * list_replace - replace old entry by new one * @old : the element to be replaced diff --git a/include/linux/memremap.h b/include/linux/memremap.h index aa3e82a80d7b..bcaa634139a9 100644 --- a/include/linux/memremap.h +++ b/include/linux/memremap.h @@ -1,6 +1,8 @@ #ifndef _LINUX_MEMREMAP_H_ #define _LINUX_MEMREMAP_H_ #include +#include +#include struct resource; struct device; @@ -36,21 +38,25 @@ static inline struct vmem_altmap *to_vmem_altmap(unsigned long memmap_start) /** * struct dev_pagemap - metadata for ZONE_DEVICE mappings * @altmap: pre-allocated/reserved memory for vmemmap allocations + * @res: physical address range covered by @ref + * @ref: reference count that pins the devm_memremap_pages() mapping * @dev: host device of the mapping for debug */ struct dev_pagemap { struct vmem_altmap *altmap; const struct resource *res; + struct percpu_ref *ref; struct device *dev; }; #ifdef CONFIG_ZONE_DEVICE void *devm_memremap_pages(struct device *dev, struct resource *res, - struct vmem_altmap *altmap); + struct percpu_ref *ref, struct vmem_altmap *altmap); struct dev_pagemap *find_dev_pagemap(resource_size_t phys); #else static inline void *devm_memremap_pages(struct device *dev, - struct resource *res, struct vmem_altmap *altmap) + struct resource *res, struct percpu_ref *ref, + struct vmem_altmap *altmap) { /* * Fail attempts to call devm_memremap_pages() without @@ -66,4 +72,43 @@ static inline struct dev_pagemap *find_dev_pagemap(resource_size_t phys) return NULL; } #endif + +/** + * get_dev_pagemap() - take a new live reference on the dev_pagemap for @pfn + * @pfn: page frame number to lookup page_map + * @pgmap: optional known pgmap that already has a reference + * + * @pgmap allows the overhead of a lookup to be bypassed when @pfn lands in the + * same mapping. + */ +static inline struct dev_pagemap *get_dev_pagemap(unsigned long pfn, + struct dev_pagemap *pgmap) +{ + const struct resource *res = pgmap ? 
pgmap->res : NULL; + resource_size_t phys = PFN_PHYS(pfn); + + /* + * In the cached case we're already holding a live reference so + * we can simply do a blind increment + */ + if (res && phys >= res->start && phys <= res->end) { + percpu_ref_get(pgmap->ref); + return pgmap; + } + + /* fall back to slow path lookup */ + rcu_read_lock(); + pgmap = find_dev_pagemap(phys); + if (pgmap && !percpu_ref_tryget_live(pgmap->ref)) + pgmap = NULL; + rcu_read_unlock(); + + return pgmap; +} + +static inline void put_dev_pagemap(struct dev_pagemap *pgmap) +{ + if (pgmap) + percpu_ref_put(pgmap->ref); +} #endif /* _LINUX_MEMREMAP_H_ */ diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h index 2dd9c313a8c0..d3ebb9d21a53 100644 --- a/include/linux/mm_types.h +++ b/include/linux/mm_types.h @@ -116,6 +116,11 @@ struct page { * Can be used as a generic list * by the page owner. */ + struct dev_pagemap *pgmap; /* ZONE_DEVICE pages are never on an + * lru or handled by a slab + * allocator, this points to the + * hosting device page map. + */ struct { /* slub per cpu partial pages */ struct page *next; /* Next partial slab */ #ifdef CONFIG_64BIT -- cgit v1.2.3 From 5c7fb56e5e3f7035dd798a8e1adee639f87043e5 Mon Sep 17 00:00:00 2001 From: Dan Williams Date: Fri, 15 Jan 2016 16:56:52 -0800 Subject: mm, dax: dax-pmd vs thp-pmd vs hugetlbfs-pmd A dax-huge-page mapping while it uses some thp helpers is ultimately not a transparent huge page. The distinction is especially important in the get_user_pages() path. pmd_devmap() is used to distinguish dax-pmds from pmd_huge() and pmd_trans_huge() which have slightly different semantics. Explicitly mark the pmd_trans_huge() helpers that dax needs by adding pmd_devmap() checks. [kirill.shutemov@linux.intel.com: fix regression in handling mlocked pages in __split_huge_pmd()] Signed-off-by: Dan Williams Cc: Dave Hansen Cc: Mel Gorman Cc: Peter Zijlstra Cc: Andrea Arcangeli Cc: Matthew Wilcox Signed-off-by: Kirill A. 
Shutemov Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/huge_mm.h | 5 +++-- include/linux/mm.h | 7 +++++++ 2 files changed, 10 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/huge_mm.h b/include/linux/huge_mm.h index 8ca35a131904..d39fa60bd6bf 100644 --- a/include/linux/huge_mm.h +++ b/include/linux/huge_mm.h @@ -104,7 +104,8 @@ void __split_huge_pmd(struct vm_area_struct *vma, pmd_t *pmd, #define split_huge_pmd(__vma, __pmd, __address) \ do { \ pmd_t *____pmd = (__pmd); \ - if (pmd_trans_huge(*____pmd)) \ + if (pmd_trans_huge(*____pmd) \ + || pmd_devmap(*____pmd)) \ __split_huge_pmd(__vma, __pmd, __address); \ } while (0) @@ -124,7 +125,7 @@ static inline bool pmd_trans_huge_lock(pmd_t *pmd, struct vm_area_struct *vma, spinlock_t **ptl) { VM_BUG_ON_VMA(!rwsem_is_locked(&vma->vm_mm->mmap_sem), vma); - if (pmd_trans_huge(*pmd)) + if (pmd_trans_huge(*pmd) || pmd_devmap(*pmd)) return __pmd_trans_huge_lock(pmd, vma, ptl); else return false; diff --git a/include/linux/mm.h b/include/linux/mm.h index a9902152449f..cd123272d28d 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -329,6 +329,13 @@ struct inode; #define page_private(page) ((page)->private) #define set_page_private(page, v) ((page)->private = (v)) +#if !defined(__HAVE_ARCH_PTE_DEVMAP) || !defined(CONFIG_TRANSPARENT_HUGEPAGE) +static inline int pmd_devmap(pmd_t pmd) +{ + return 0; +} +#endif + /* * FIXME: take this include out, include page-flags.h in * files which need it (119 of them) -- cgit v1.2.3 From 3565fce3a6597e91b8dee3e8e36ebf70f8b7ef9b Mon Sep 17 00:00:00 2001 From: Dan Williams Date: Fri, 15 Jan 2016 16:56:55 -0800 Subject: mm, x86: get_user_pages() for dax mappings A dax mapping establishes a pte with _PAGE_DEVMAP set when the driver has established a devm_memremap_pages() mapping, i.e. when the pfn_t return from ->direct_access() has PFN_DEV and PFN_MAP set. Later, when encountering _PAGE_DEVMAP during a page table walk we lookup and pin a struct dev_pagemap instance to keep the result of pfn_to_page() valid until put_page(). Signed-off-by: Dan Williams Tested-by: Logan Gunthorpe Cc: Dave Hansen Cc: Mel Gorman Cc: Peter Zijlstra Cc: Andrea Arcangeli Cc: Thomas Gleixner Cc: Ingo Molnar Cc: "H. Peter Anvin" Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/huge_mm.h | 10 ++++++++- include/linux/mm.h | 59 ++++++++++++++++++++++++++++++++++--------------- 2 files changed, 50 insertions(+), 19 deletions(-) (limited to 'include/linux') diff --git a/include/linux/huge_mm.h b/include/linux/huge_mm.h index d39fa60bd6bf..cfe81e10bd54 100644 --- a/include/linux/huge_mm.h +++ b/include/linux/huge_mm.h @@ -38,7 +38,6 @@ extern int change_huge_pmd(struct vm_area_struct *vma, pmd_t *pmd, int prot_numa); int vmf_insert_pfn_pmd(struct vm_area_struct *, unsigned long addr, pmd_t *, pfn_t pfn, bool write); - enum transparent_hugepage_flag { TRANSPARENT_HUGEPAGE_FLAG, TRANSPARENT_HUGEPAGE_REQ_MADV_FLAG, @@ -55,6 +54,9 @@ enum transparent_hugepage_flag { #define HPAGE_PMD_NR (1< #include #include +#include #include #include #include @@ -465,17 +466,6 @@ static inline int page_count(struct page *page) return atomic_read(&compound_head(page)->_count); } -static inline void get_page(struct page *page) -{ - page = compound_head(page); - /* - * Getting a normal page or the head of a compound page - * requires to already have an elevated page->_count. 
- */ - VM_BUG_ON_PAGE(atomic_read(&page->_count) <= 0, page); - atomic_inc(&page->_count); -} - static inline struct page *virt_to_head_page(const void *x) { struct page *page = virt_to_page(x); @@ -494,13 +484,6 @@ static inline void init_page_count(struct page *page) void __put_page(struct page *page); -static inline void put_page(struct page *page) -{ - page = compound_head(page); - if (put_page_testzero(page)) - __put_page(page); -} - void put_pages_list(struct list_head *pages); void split_page(struct page *page, unsigned int order); @@ -682,17 +665,50 @@ static inline enum zone_type page_zonenum(const struct page *page) } #ifdef CONFIG_ZONE_DEVICE +void get_zone_device_page(struct page *page); +void put_zone_device_page(struct page *page); static inline bool is_zone_device_page(const struct page *page) { return page_zonenum(page) == ZONE_DEVICE; } #else +static inline void get_zone_device_page(struct page *page) +{ +} +static inline void put_zone_device_page(struct page *page) +{ +} static inline bool is_zone_device_page(const struct page *page) { return false; } #endif +static inline void get_page(struct page *page) +{ + page = compound_head(page); + /* + * Getting a normal page or the head of a compound page + * requires to already have an elevated page->_count. + */ + VM_BUG_ON_PAGE(atomic_read(&page->_count) <= 0, page); + atomic_inc(&page->_count); + + if (unlikely(is_zone_device_page(page))) + get_zone_device_page(page); +} + +static inline void put_page(struct page *page) +{ + page = compound_head(page); + + if (put_page_testzero(page)) + __put_page(page); + + if (unlikely(is_zone_device_page(page))) + put_zone_device_page(page); +} + #if defined(CONFIG_SPARSEMEM) && !defined(CONFIG_SPARSEMEM_VMEMMAP) #define SECTION_IN_PAGE_FLAGS #endif @@ -1444,6 +1460,13 @@ static inline void sync_mm_rss(struct mm_struct *mm) } #endif +#ifndef __HAVE_ARCH_PTE_DEVMAP +static inline int pte_devmap(pte_t pte) +{ + return 0; +} +#endif + int vma_wants_writenotify(struct vm_area_struct *vma); extern pte_t *__get_locked_pte(struct mm_struct *mm, unsigned long addr, -- cgit v1.2.3 From 4a9e1cda274893eca7d178d7dc265503ccb9d87a Mon Sep 17 00:00:00 2001 From: Dominik Dingel Date: Fri, 15 Jan 2016 16:57:04 -0800 Subject: mm: bring in additional flag for fixup_user_fault to signal unlock During Jason's work with postcopy migration support for s390 a problem regarding gmap faults was discovered. The gmap code will call fixup_user_fault which will end up always in handle_mm_fault. Till now we never cared about retries, but as the userfaultfd code kind of relies on it. this needs some fix. This patchset does not take care of the futex code. I will now look closer at this. This patch (of 2): With the introduction of userfaultfd, kvm on s390 needs fixup_user_fault to pass in FAULT_FLAG_ALLOW_RETRY and give feedback if during the faulting we ever unlocked mmap_sem. This patch brings in the logic to handle retries as well as it cleans up the current documentation. fixup_user_fault was not having the same semantics as filemap_fault. It never indicated if a retry happened and so a caller wasn't able to handle that case. So we now changed the behaviour to always retry a locked mmap_sem. Signed-off-by: Dominik Dingel Reviewed-by: Andrea Arcangeli Cc: "Kirill A. Shutemov" Cc: Martin Schwidefsky Cc: Christian Borntraeger Cc: "Jason J. 
Herne" Cc: David Rientjes Cc: Eric B Munson Cc: Naoya Horiguchi Cc: Mel Gorman Cc: Heiko Carstens Cc: Dominik Dingel Cc: Paolo Bonzini Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/mm.h | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/mm.h b/include/linux/mm.h index 792f2469c142..1d6ec55d8b25 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -1194,7 +1194,8 @@ int invalidate_inode_page(struct page *page); extern int handle_mm_fault(struct mm_struct *mm, struct vm_area_struct *vma, unsigned long address, unsigned int flags); extern int fixup_user_fault(struct task_struct *tsk, struct mm_struct *mm, - unsigned long address, unsigned int fault_flags); + unsigned long address, unsigned int fault_flags, + bool *unlocked); #else static inline int handle_mm_fault(struct mm_struct *mm, struct vm_area_struct *vma, unsigned long address, @@ -1206,7 +1207,7 @@ static inline int handle_mm_fault(struct mm_struct *mm, } static inline int fixup_user_fault(struct task_struct *tsk, struct mm_struct *mm, unsigned long address, - unsigned int fault_flags) + unsigned int fault_flags, bool *unlocked) { /* should never happen if there's no MMU */ BUG(); -- cgit v1.2.3 From 036fbb21de7c74d5637bf41110c47005363f3000 Mon Sep 17 00:00:00 2001 From: "Kirill A. Shutemov" Date: Fri, 15 Jan 2016 16:57:11 -0800 Subject: memblock: fix section mismatch allmodconfig produces following warning for me: WARNING: vmlinux.o(.text.unlikely+0x10314): Section mismatch in reference from the function movable_node_is_enabled() to the variable .meminit.data:movable_node_enabled The function movable_node_is_enabled() references the variable __meminitdata movable_node_enabled. This is often because movable_node_is_enabled lacks a __meminitdata annotation or the annotation of movable_node_enabled is wrong. Let's mark the function with __meminit. It fixes the warning. Signed-off-by: Kirill A. Shutemov Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/memblock.h | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) (limited to 'include/linux') diff --git a/include/linux/memblock.h b/include/linux/memblock.h index 173fb44e22f1..3106ac1c895e 100644 --- a/include/linux/memblock.h +++ b/include/linux/memblock.h @@ -61,6 +61,14 @@ extern int memblock_debug; extern bool movable_node_enabled; #endif /* CONFIG_MOVABLE_NODE */ +#ifdef CONFIG_ARCH_DISCARD_MEMBLOCK +#define __init_memblock __meminit +#define __initdata_memblock __meminitdata +#else +#define __init_memblock +#define __initdata_memblock +#endif + #define memblock_dbg(fmt, ...) 
\ if (memblock_debug) printk(KERN_INFO pr_fmt(fmt), ##__VA_ARGS__) @@ -166,7 +174,7 @@ static inline bool memblock_is_hotpluggable(struct memblock_region *m) return m->flags & MEMBLOCK_HOTPLUG; } -static inline bool movable_node_is_enabled(void) +static inline bool __init_memblock movable_node_is_enabled(void) { return movable_node_enabled; } @@ -405,14 +413,6 @@ static inline unsigned long memblock_region_reserved_end_pfn(const struct memblo for (idx = 0; idx < memblock_type->cnt; \ idx++,rgn = &memblock_type->regions[idx]) -#ifdef CONFIG_ARCH_DISCARD_MEMBLOCK -#define __init_memblock __meminit -#define __initdata_memblock __meminitdata -#else -#define __init_memblock -#define __initdata_memblock -#endif - #ifdef CONFIG_MEMTEST extern void early_memtest(phys_addr_t start, phys_addr_t end); #else -- cgit v1.2.3 From 7f43add451d2a0d235074b72d254ae266a6a023f Mon Sep 17 00:00:00 2001 From: Wang Xiaoqiang Date: Fri, 15 Jan 2016 16:57:22 -0800 Subject: mm/mlock.c: change can_do_mlock return value type to boolean Since can_do_mlock only return 1 or 0, so make it boolean. No functional change. [akpm@linux-foundation.org: update declaration in mm.h] Signed-off-by: Wang Xiaoqiang Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/mm.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/mm.h b/include/linux/mm.h index 1d6ec55d8b25..f1cd22f2df1a 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -1100,7 +1100,7 @@ static inline bool shmem_mapping(struct address_space *mapping) } #endif -extern int can_do_mlock(void); +extern bool can_do_mlock(void); extern int user_shm_lock(size_t, struct user_struct *); extern void user_shm_unlock(size_t, struct user_struct *); -- cgit v1.2.3 From b8a0255db958b8f70c5267dda152e93b6fda1778 Mon Sep 17 00:00:00 2001 From: Vasily Kulikov Date: Fri, 15 Jan 2016 16:57:55 -0800 Subject: include/linux/poison.h: use POISON_POINTER_DELTA for poison pointers TIMER_ENTRY_STATIC and TAIL_MAPPING are defined as poison pointers which should point to nowhere. Redefine them using POISON_POINTER_DELTA arithmetics to make sure they really point to non-mappable area declared by the target architecture. Signed-off-by: Vasily Kulikov Acked-by: Thomas Gleixner Cc: Solar Designer Cc: "Kirill A. Shutemov" Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/poison.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/poison.h b/include/linux/poison.h index 76c3b6c38c16..4a27153574e2 100644 --- a/include/linux/poison.h +++ b/include/linux/poison.h @@ -27,14 +27,14 @@ * Magic number "tsta" to indicate a static timer initializer * for the object debugging code. */ -#define TIMER_ENTRY_STATIC ((void *) 0x74737461) +#define TIMER_ENTRY_STATIC ((void *) 0x300 + POISON_POINTER_DELTA) /********** mm/debug-pagealloc.c **********/ #define PAGE_POISON 0xaa /********** mm/page_alloc.c ************/ -#define TAIL_MAPPING ((void *) 0x01014A11 + POISON_POINTER_DELTA) +#define TAIL_MAPPING ((void *) 0x400 + POISON_POINTER_DELTA) /********** mm/slab.c **********/ /* -- cgit v1.2.3 From 8f57e4d930d48217268315898212518d4d3e0773 Mon Sep 17 00:00:00 2001 From: Michal Nazarewicz Date: Fri, 15 Jan 2016 16:57:58 -0800 Subject: include/linux/kernel.h: change abs() macro so it uses consistent return type Rewrite abs() so that its return type does not depend on the architecture and no unexpected type conversion happen inside of it. 
The only conversion is from unsigned to signed type. char is left as a return type but treated as a signed type regradless of it's actual signedness. With the old version, int arguments were promoted to long and depending on architecture a long argument might result in s64 or long return type (which may or may not be the same). This came after some back and forth with Nicolas. The current macro has different return type (for the same input type) depending on architecture which might be midly iritating. An alternative version would promote to int like so: #define abs(x) __abs_choose_expr(x, long long, \ __abs_choose_expr(x, long, \ __builtin_choose_expr( \ sizeof(x) <= sizeof(int), \ ({ int __x = (x); __x<0?-__x:__x; }), \ ((void)0)))) I have no preference but imagine Linus might. :] Nicolas argument against is that promoting to int causes iconsistent behaviour: int main(void) { unsigned short a = 0, b = 1, c = a - b; unsigned short d = abs(a - b); unsigned short e = abs(c); printf("%u %u\n", d, e); // prints: 1 65535 } Then again, no sane person expects consistent behaviour from C integer arithmetic. ;) Note: __builtin_types_compatible_p(unsigned char, char) is always false, and __builtin_types_compatible_p(signed char, char) is also always false. Signed-off-by: Michal Nazarewicz Reviewed-by: Nicolas Pitre Cc: Srinivas Pandruvada Cc: Wey-Yi Guy Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/kernel.h | 36 ++++++++++++++++++------------------ 1 file changed, 18 insertions(+), 18 deletions(-) (limited to 'include/linux') diff --git a/include/linux/kernel.h b/include/linux/kernel.h index 7311c3294e25..f31638c6e873 100644 --- a/include/linux/kernel.h +++ b/include/linux/kernel.h @@ -202,26 +202,26 @@ extern int _cond_resched(void); /** * abs - return absolute value of an argument - * @x: the value. If it is unsigned type, it is converted to signed type first - * (s64, long or int depending on its size). + * @x: the value. If it is unsigned type, it is converted to signed type first. + * char is treated as if it was signed (regardless of whether it really is) + * but the macro's return type is preserved as char. * - * Return: an absolute value of x. If x is 64-bit, macro's return type is s64, - * otherwise it is signed long. + * Return: an absolute value of x. */ -#define abs(x) __builtin_choose_expr(sizeof(x) == sizeof(s64), ({ \ - s64 __x = (x); \ - (__x < 0) ? -__x : __x; \ - }), ({ \ - long ret; \ - if (sizeof(x) == sizeof(long)) { \ - long __x = (x); \ - ret = (__x < 0) ? -__x : __x; \ - } else { \ - int __x = (x); \ - ret = (__x < 0) ? -__x : __x; \ - } \ - ret; \ - })) +#define abs(x) __abs_choose_expr(x, long long, \ + __abs_choose_expr(x, long, \ + __abs_choose_expr(x, int, \ + __abs_choose_expr(x, short, \ + __abs_choose_expr(x, char, \ + __builtin_choose_expr( \ + __builtin_types_compatible_p(typeof(x), char), \ + (char)({ signed char __x = (x); __x<0?-__x:__x; }), \ + ((void)0))))))) + +#define __abs_choose_expr(x, type, other) __builtin_choose_expr( \ + __builtin_types_compatible_p(typeof(x), signed type) || \ + __builtin_types_compatible_p(typeof(x), unsigned type), \ + ({ signed type __x = (x); __x < 0 ? 
-__x : __x; }), other) /** * reciprocal_scale - "scale" a value into range [0, ep_ro) -- cgit v1.2.3 From 3f1bfd94136ecb85889e6e22893c08e8a9c697c2 Mon Sep 17 00:00:00 2001 From: Yaowei Bai Date: Fri, 15 Jan 2016 16:58:04 -0800 Subject: include/linux/kdev_t.h: remove new_valid_dev() As all new_valid_dev() checks have been removed it's time to drop new_valid_dev() itself. No functional change. Signed-off-by: Yaowei Bai Cc: Al Viro Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/kdev_t.h | 5 ----- 1 file changed, 5 deletions(-) (limited to 'include/linux') diff --git a/include/linux/kdev_t.h b/include/linux/kdev_t.h index 052c7b32cc91..8e9e288b08c1 100644 --- a/include/linux/kdev_t.h +++ b/include/linux/kdev_t.h @@ -35,11 +35,6 @@ static inline dev_t old_decode_dev(u16 val) return MKDEV((val >> 8) & 255, val & 255); } -static inline bool new_valid_dev(dev_t dev) -{ - return 1; -} - static inline u32 new_encode_dev(dev_t dev) { unsigned major = MAJOR(dev); -- cgit v1.2.3 From dfffa587a6bcd84f2087f88e11600b0e8b0aa1ee Mon Sep 17 00:00:00 2001 From: Viresh Kumar Date: Fri, 15 Jan 2016 16:58:10 -0800 Subject: err.h: add (missing) unlikely() to IS_ERR_OR_NULL() IS_ERR_VALUE() already contains it and so we need to add this only to the !ptr check. That will allow users of IS_ERR_OR_NULL(), to not add this compiler flag. Signed-off-by: Viresh Kumar Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/err.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/err.h b/include/linux/err.h index a729120644d5..56762ab41713 100644 --- a/include/linux/err.h +++ b/include/linux/err.h @@ -37,7 +37,7 @@ static inline bool __must_check IS_ERR(__force const void *ptr) static inline bool __must_check IS_ERR_OR_NULL(__force const void *ptr) { - return !ptr || IS_ERR_VALUE((unsigned long)ptr); + return unlikely(!ptr) || IS_ERR_VALUE((unsigned long)ptr); } /** -- cgit v1.2.3 From 8d91f8b15361dfb438ab6eb3b319e2ded43458ff Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Fri, 15 Jan 2016 16:58:24 -0800 Subject: printk: do cond_resched() between lines while outputting to consoles @console_may_schedule tracks whether console_sem was acquired through lock or trylock. If the former, we're inside a sleepable context and console_conditional_schedule() performs cond_resched(). This allows console drivers which use console_lock for synchronization to yield while performing time-consuming operations such as scrolling. However, the actual console outputting is performed while holding irq-safe logbuf_lock, so console_unlock() clears @console_may_schedule before starting outputting lines. Also, only a few drivers call console_conditional_schedule() to begin with. This means that when a lot of lines need to be output by console_unlock(), for example on a console registration, the task doing console_unlock() may not yield for a long time on a non-preemptible kernel. If this happens with a slow console devices, for example a serial console, the outputting task may occupy the cpu for a very long time. Long enough to trigger softlockup and/or RCU stall warnings, which in turn pile more messages, sometimes enough to trigger the next cycle of warnings incapacitating the system. Fix it by making console_unlock() insert cond_resched() between lines if @console_may_schedule. 
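The shape of the fix, paraphrased (not part of the patch); the actual change lives in kernel/printk/printk.c and is not visible in this include/linux view. example_emit_next_line() is an invented stand-in for picking a record under logbuf_lock, dropping the lock and calling the console drivers.

/* stand-in: emit one pending line, return false when nothing is left */
static bool example_emit_next_line(void);

/*
 * Yield between lines, outside logbuf_lock, and only when console_sem
 * was taken via console_lock() (a sleepable context) rather than
 * console_trylock().
 */
static void example_console_flush(bool lock_taken_sleepably)
{
	while (example_emit_next_line()) {
		if (lock_taken_sleepably)
			cond_resched();
	}
}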
Signed-off-by: Tejun Heo Reported-by: Calvin Owens Acked-by: Jan Kara Cc: Dave Jones Cc: Kyle McMartin Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/console.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/console.h b/include/linux/console.h index bd194343c346..ea731af2451e 100644 --- a/include/linux/console.h +++ b/include/linux/console.h @@ -150,6 +150,7 @@ extern int console_trylock(void); extern void console_unlock(void); extern void console_conditional_schedule(void); extern void console_unblank(void); +extern void console_flush_on_panic(void); extern struct tty_driver *console_device(int *); extern void console_stop(struct console *); extern void console_start(struct console *); -- cgit v1.2.3 From fe22cd9b7c980b8b948ec85f034a8668c57ec867 Mon Sep 17 00:00:00 2001 From: Aaron Conole Date: Fri, 15 Jan 2016 16:59:12 -0800 Subject: printk: help pr_debug and pr_devel to optimize out arguments Currently, pr_debug and pr_devel will not elide function call arguments appearing in calls to the no_printk for these macros. This is because all side effects must be honored before proceeding to the 0-value assignment in no_printk. The behavior is contrary to documentation found in the CodingStyle and the header file where these functions are declared. This patch corrects that behavior by shunting out the call to no_printk completely. The format string is still checked by gcc for correctness, but no code seems to be emitted in common cases. [akpm@linux-foundation.org: remove braces, per Joe] Fixes: 5264f2f75d86 ("include/linux/printk.h: use and neaten no_printk") Signed-off-by: Aaron Conole Reported-by: Dmitry Vyukov Cc: Joe Perches Cc: Jason Baron Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/printk.h | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) (limited to 'include/linux') diff --git a/include/linux/printk.h b/include/linux/printk.h index 9729565c25ff..9ccbdf2c1453 100644 --- a/include/linux/printk.h +++ b/include/linux/printk.h @@ -106,13 +106,13 @@ struct va_format { /* * Dummy printk for disabled debugging statements to use whilst maintaining - * gcc's format and side-effect checking. + * gcc's format checking. */ -static inline __printf(1, 2) -int no_printk(const char *fmt, ...) -{ - return 0; -} +#define no_printk(fmt, ...) \ +do { \ + if (0) \ + printk(fmt, ##__VA_ARGS__); \ +} while (0) #ifdef CONFIG_EARLY_PRINTK extern asmlinkage __printf(1, 2) -- cgit v1.2.3
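Illustration (not part of the patch): the elision this enables, assuming neither DEBUG nor CONFIG_DYNAMIC_DEBUG is set so that pr_debug() falls back to no_printk(); expensive_state() and example_debug() are invented names.

/*
 * With the macro form, the printk() sits under "if (0)": gcc still
 * checks the format string against the arguments, but the dead branch,
 * including the call to expensive_state(), can normally be dropped
 * entirely, which the old inline no_printk() did not guarantee.
 */
static int expensive_state(void)
{
	int sum = 0, i;

	for (i = 0; i < 1000; i++)	/* stands in for a costly walk */
		sum += i;
	return sum;
}

static void example_debug(void)
{
	pr_debug("state: %d\n", expensive_state());
}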