From 710ec38b0f633ab3e2581f07a73442d809e28ab0 Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Mon, 23 Sep 2019 15:32:59 -0700 Subject: mm: add dummy can_do_mlock() helper On kernels without CONFIG_MMU, we get a link error for the siw driver: drivers/infiniband/sw/siw/siw_mem.o: In function `siw_umem_get': siw_mem.c:(.text+0x4c8): undefined reference to `can_do_mlock' This is probably not the only driver that needs the function and could otherwise build correctly without CONFIG_MMU, so add a dummy variant that always returns false. Link: http://lkml.kernel.org/r/20190909204201.931830-1-arnd@arndb.de Fixes: 2251334dcac9 ("rdma/siw: application buffer management") Signed-off-by: Arnd Bergmann Suggested-by: Jason Gunthorpe Acked-by: Michal Hocko Cc: Bernard Metzler Cc: "Matthew Wilcox (Oracle)" Cc: "Kirill A. Shutemov" Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/mm.h | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'include/linux') diff --git a/include/linux/mm.h b/include/linux/mm.h index 7cf955feb823..6e79b3df1582 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -1405,7 +1405,11 @@ extern void pagefault_out_of_memory(void); extern void show_free_areas(unsigned int flags, nodemask_t *nodemask); +#ifdef CONFIG_MMU extern bool can_do_mlock(void); +#else +static inline bool can_do_mlock(void) { return false; } +#endif extern int user_shm_lock(size_t, struct user_struct *); extern void user_shm_unlock(size_t, struct user_struct *); -- cgit v1.2.3 From 963abb9aebcdacf5c709a36da9ac0727a33671eb Mon Sep 17 00:00:00 2001 From: Joseph Qi Date: Mon, 23 Sep 2019 15:33:11 -0700 Subject: jbd2: remove jbd2_journal_inode_add_[write|wait] Since ext4/ocfs2 are using jbd2_inode dirty range scoping APIs now, jbd2_journal_inode_add_[write|wait] are not used any more, remove them. Link: http://lkml.kernel.org/r/1562977611-8412-2-git-send-email-joseph.qi@linux.alibaba.com Signed-off-by: Joseph Qi Reviewed-by: Ross Zwisler Acked-by: Changwei Ge Cc: Gang He Cc: Joel Becker Cc: Joseph Qi Cc: Jun Piao Cc: Junxiao Bi Cc: Mark Fasheh Cc: "Theodore Ts'o" Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/jbd2.h | 2 -- 1 file changed, 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/jbd2.h b/include/linux/jbd2.h index df03825ad1a1..603fbc4e2f70 100644 --- a/include/linux/jbd2.h +++ b/include/linux/jbd2.h @@ -1410,8 +1410,6 @@ extern int jbd2_journal_clear_err (journal_t *); extern int jbd2_journal_bmap(journal_t *, unsigned long, unsigned long long *); extern int jbd2_journal_force_commit(journal_t *); extern int jbd2_journal_force_commit_nested(journal_t *); -extern int jbd2_journal_inode_add_write(handle_t *handle, struct jbd2_inode *inode); -extern int jbd2_journal_inode_add_wait(handle_t *handle, struct jbd2_inode *inode); extern int jbd2_journal_inode_ranged_write(handle_t *handle, struct jbd2_inode *inode, loff_t start_byte, loff_t length); -- cgit v1.2.3 From 9adeaa226988b97bc15928e12f40a9863134467c Mon Sep 17 00:00:00 2001 From: Waiman Long Date: Mon, 23 Sep 2019 15:33:49 -0700 Subject: mm, slab: move memcg_cache_params structure to mm/slab.h The memcg_cache_params structure is only embedded into the kmem_cache of slab and slub allocators as defined in slab_def.h and slub_def.h and used internally by mm code. There is no needed to expose it in a public header. So move it from include/linux/slab.h to mm/slab.h. It is just a refactoring patch with no code change. In fact both the slub_def.h and slab_def.h should be moved into the mm directory as well, but that will probably cause many merge conflicts. Link: http://lkml.kernel.org/r/20190718180827.18758-1-longman@redhat.com Signed-off-by: Waiman Long Acked-by: David Rientjes Cc: Christoph Lameter Cc: Pekka Enberg Cc: Joonsoo Kim Cc: Roman Gushchin Cc: Johannes Weiner Cc: Shakeel Butt Cc: Vladimir Davydov Cc: Michal Hocko Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/slab.h | 62 ---------------------------------------------------- 1 file changed, 62 deletions(-) (limited to 'include/linux') diff --git a/include/linux/slab.h b/include/linux/slab.h index 56c9c7eed34e..ab2b98ad76e1 100644 --- a/include/linux/slab.h +++ b/include/linux/slab.h @@ -595,68 +595,6 @@ static __always_inline void *kmalloc_node(size_t size, gfp_t flags, int node) return __kmalloc_node(size, flags, node); } -struct memcg_cache_array { - struct rcu_head rcu; - struct kmem_cache *entries[0]; -}; - -/* - * This is the main placeholder for memcg-related information in kmem caches. - * Both the root cache and the child caches will have it. For the root cache, - * this will hold a dynamically allocated array large enough to hold - * information about the currently limited memcgs in the system. To allow the - * array to be accessed without taking any locks, on relocation we free the old - * version only after a grace period. - * - * Root and child caches hold different metadata. - * - * @root_cache: Common to root and child caches. NULL for root, pointer to - * the root cache for children. - * - * The following fields are specific to root caches. - * - * @memcg_caches: kmemcg ID indexed table of child caches. This table is - * used to index child cachces during allocation and cleared - * early during shutdown. - * - * @root_caches_node: List node for slab_root_caches list. - * - * @children: List of all child caches. While the child caches are also - * reachable through @memcg_caches, a child cache remains on - * this list until it is actually destroyed. - * - * The following fields are specific to child caches. - * - * @memcg: Pointer to the memcg this cache belongs to. - * - * @children_node: List node for @root_cache->children list. - * - * @kmem_caches_node: List node for @memcg->kmem_caches list. - */ -struct memcg_cache_params { - struct kmem_cache *root_cache; - union { - struct { - struct memcg_cache_array __rcu *memcg_caches; - struct list_head __root_caches_node; - struct list_head children; - bool dying; - }; - struct { - struct mem_cgroup *memcg; - struct list_head children_node; - struct list_head kmem_caches_node; - struct percpu_ref refcnt; - - void (*work_fn)(struct kmem_cache *); - union { - struct rcu_head rcu_head; - struct work_struct work; - }; - }; - }; -}; - int memcg_update_all_caches(int num_memcgs); /** -- cgit v1.2.3 From a50b854e073cd3335bbbada8dcff83a857297dd7 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Mon, 23 Sep 2019 15:34:25 -0700 Subject: mm: introduce page_size() Patch series "Make working with compound pages easier", v2. These three patches add three helpers and convert the appropriate places to use them. This patch (of 3): It's unnecessarily hard to find out the size of a potentially huge page. Replace 'PAGE_SIZE << compound_order(page)' with page_size(page). Link: http://lkml.kernel.org/r/20190721104612.19120-2-willy@infradead.org Signed-off-by: Matthew Wilcox (Oracle) Acked-by: Michal Hocko Reviewed-by: Andrew Morton Reviewed-by: Ira Weiny Acked-by: Kirill A. Shutemov Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/hugetlb.h | 2 +- include/linux/mm.h | 6 ++++++ 2 files changed, 7 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h index edfca4278319..53fc34f930d0 100644 --- a/include/linux/hugetlb.h +++ b/include/linux/hugetlb.h @@ -454,7 +454,7 @@ static inline pte_t arch_make_huge_pte(pte_t entry, struct vm_area_struct *vma, static inline struct hstate *page_hstate(struct page *page) { VM_BUG_ON_PAGE(!PageHuge(page), page); - return size_to_hstate(PAGE_SIZE << compound_order(page)); + return size_to_hstate(page_size(page)); } static inline unsigned hstate_index_to_shift(unsigned index) diff --git a/include/linux/mm.h b/include/linux/mm.h index 6e79b3df1582..d46d5585e2a2 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -805,6 +805,12 @@ static inline void set_compound_order(struct page *page, unsigned int order) page[1].compound_order = order; } +/* Returns the number of bytes in this potentially compound page. */ +static inline unsigned long page_size(struct page *page) +{ + return PAGE_SIZE << compound_order(page); +} + void free_compound_page(struct page *page); #ifdef CONFIG_MMU -- cgit v1.2.3 From 94ad9338109fe9d0b8a4a16828719dd6dcaee4c2 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Mon, 23 Sep 2019 15:34:28 -0700 Subject: mm: introduce page_shift() Replace PAGE_SHIFT + compound_order(page) with the new page_shift() function. Minor improvements in readability. [akpm@linux-foundation.org: fix build in tce_page_is_contained()] Link: http://lkml.kernel.org/r/201907241853.yNQTrJWd%25lkp@intel.com Link: http://lkml.kernel.org/r/20190721104612.19120-3-willy@infradead.org Signed-off-by: Matthew Wilcox (Oracle) Reviewed-by: Andrew Morton Reviewed-by: Ira Weiny Acked-by: Kirill A. Shutemov Cc: Michal Hocko Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/mm.h | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'include/linux') diff --git a/include/linux/mm.h b/include/linux/mm.h index d46d5585e2a2..9238548bdec5 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -811,6 +811,12 @@ static inline unsigned long page_size(struct page *page) return PAGE_SIZE << compound_order(page); } +/* Returns the number of bits needed for the number of bytes in a page */ +static inline unsigned int page_shift(struct page *page) +{ + return PAGE_SHIFT + compound_order(page); +} + void free_compound_page(struct page *page); #ifdef CONFIG_MMU -- cgit v1.2.3 From d8c6546b1aea843fbeb4d54a1202f1adda6504be Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Mon, 23 Sep 2019 15:34:30 -0700 Subject: mm: introduce compound_nr() Replace 1 << compound_order(page) with compound_nr(page). Minor improvements in readability. Link: http://lkml.kernel.org/r/20190721104612.19120-4-willy@infradead.org Signed-off-by: Matthew Wilcox (Oracle) Reviewed-by: Andrew Morton Reviewed-by: Ira Weiny Acked-by: Kirill A. Shutemov Cc: Michal Hocko Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/mm.h | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'include/linux') diff --git a/include/linux/mm.h b/include/linux/mm.h index 9238548bdec5..69b7314c8d24 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -805,6 +805,12 @@ static inline void set_compound_order(struct page *page, unsigned int order) page[1].compound_order = order; } +/* Returns the number of pages in this potentially compound page. */ +static inline unsigned long compound_nr(struct page *page) +{ + return 1UL << compound_order(page); +} + /* Returns the number of bytes in this potentially compound page. */ static inline unsigned long page_size(struct page *page) { -- cgit v1.2.3 From 37389167a281f3ccb6bc958c32b2e088c7269fe0 Mon Sep 17 00:00:00 2001 From: Vlastimil Babka Date: Mon, 23 Sep 2019 15:34:39 -0700 Subject: mm, page_owner: keep owner info when freeing the page For debugging purposes it might be useful to keep the owner info even after page has been freed, and include it in e.g. dump_page() when detecting a bad page state. For that, change the PAGE_EXT_OWNER flag meaning to "page owner info has been set at least once" and add new PAGE_EXT_OWNER_ACTIVE for tracking whether page is supposed to be currently tracked allocated or free. Adjust dump_page() accordingly, distinguishing free and allocated pages. In the page_owner debugfs file, keep printing only allocated pages so that existing scripts are not confused, and also because free pages are irrelevant for the memory statistics or leak detection that's the typical use case of the file, anyway. Link: http://lkml.kernel.org/r/20190820131828.22684-4-vbabka@suse.cz Signed-off-by: Vlastimil Babka Cc: Kirill A. Shutemov Cc: Matthew Wilcox Cc: Mel Gorman Cc: Michal Hocko Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/page_ext.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/page_ext.h b/include/linux/page_ext.h index 09592951725c..682fd465df06 100644 --- a/include/linux/page_ext.h +++ b/include/linux/page_ext.h @@ -18,6 +18,7 @@ struct page_ext_operations { enum page_ext_flags { PAGE_EXT_OWNER, + PAGE_EXT_OWNER_ACTIVE, #if defined(CONFIG_IDLE_PAGE_TRACKING) && !defined(CONFIG_64BIT) PAGE_EXT_YOUNG, PAGE_EXT_IDLE, -- cgit v1.2.3 From 4101196b19d7f905dca5dcf46cd35eb758cf06c0 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Mon, 23 Sep 2019 15:34:52 -0700 Subject: mm: page cache: store only head pages in i_pages Transparent Huge Pages are currently stored in i_pages as pointers to consecutive subpages. This patch changes that to storing consecutive pointers to the head page in preparation for storing huge pages more efficiently in i_pages. Large parts of this are "inspired" by Kirill's patch https://lore.kernel.org/lkml/20170126115819.58875-2-kirill.shutemov@linux.intel.com/ Kirill and Huang Ying contributed several fixes. [willy@infradead.org: use compound_nr, squish uninit-var warning] Link: http://lkml.kernel.org/r/20190731210400.7419-1-willy@infradead.org Signed-off-by: Matthew Wilcox Acked-by: Jan Kara Reviewed-by: Kirill Shutemov Reviewed-by: Song Liu Tested-by: Song Liu Tested-by: William Kucharski Reviewed-by: William Kucharski Tested-by: Qian Cai Tested-by: Mikhail Gavrilov Cc: Hugh Dickins Cc: Chris Wilson Cc: Song Liu Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/pagemap.h | 10 ++++++++++ 1 file changed, 10 insertions(+) (limited to 'include/linux') diff --git a/include/linux/pagemap.h b/include/linux/pagemap.h index c7552459a15f..37a4d9e32cd3 100644 --- a/include/linux/pagemap.h +++ b/include/linux/pagemap.h @@ -333,6 +333,16 @@ static inline struct page *grab_cache_page_nowait(struct address_space *mapping, mapping_gfp_mask(mapping)); } +static inline struct page *find_subpage(struct page *page, pgoff_t offset) +{ + if (PageHuge(page)) + return page; + + VM_BUG_ON_PAGE(PageTail(page), page); + + return page + (offset & (compound_nr(page) - 1)); +} + struct page *find_get_entry(struct address_space *mapping, pgoff_t offset); struct page *find_lock_entry(struct address_space *mapping, pgoff_t offset); unsigned find_get_entries(struct address_space *mapping, pgoff_t start, -- cgit v1.2.3 From 2d15eb31b50a1b93927bdb7cc5d9960b90f7c1e4 Mon Sep 17 00:00:00 2001 From: "akpm@linux-foundation.org" Date: Mon, 23 Sep 2019 15:35:04 -0700 Subject: mm/gup: add make_dirty arg to put_user_pages_dirty_lock() [11~From: John Hubbard Subject: mm/gup: add make_dirty arg to put_user_pages_dirty_lock() Patch series "mm/gup: add make_dirty arg to put_user_pages_dirty_lock()", v3. There are about 50+ patches in my tree [2], and I'll be sending out the remaining ones in a few more groups: * The block/bio related changes (Jerome mostly wrote those, but I've had to move stuff around extensively, and add a little code) * mm/ changes * other subsystem patches * an RFC that shows the current state of the tracking patch set. That can only be applied after all call sites are converted, but it's good to get an early look at it. This is part a tree-wide conversion, as described in fc1d8e7cca2d ("mm: introduce put_user_page*(), placeholder versions"). This patch (of 3): Provide more capable variation of put_user_pages_dirty_lock(), and delete put_user_pages_dirty(). This is based on the following: 1. Lots of call sites become simpler if a bool is passed into put_user_page*(), instead of making the call site choose which put_user_page*() variant to call. 2. Christoph Hellwig's observation that set_page_dirty_lock() is usually correct, and set_page_dirty() is usually a bug, or at least questionable, within a put_user_page*() calling chain. This leads to the following API choices: * put_user_pages_dirty_lock(page, npages, make_dirty) * There is no put_user_pages_dirty(). You have to hand code that, in the rare case that it's required. [jhubbard@nvidia.com: remove unused variable in siw_free_plist()] Link: http://lkml.kernel.org/r/20190729074306.10368-1-jhubbard@nvidia.com Link: http://lkml.kernel.org/r/20190724044537.10458-2-jhubbard@nvidia.com Signed-off-by: John Hubbard Cc: Matthew Wilcox Cc: Jan Kara Cc: Christoph Hellwig Cc: Ira Weiny Cc: Jason Gunthorpe Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/mm.h | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/mm.h b/include/linux/mm.h index 69b7314c8d24..57a9fa34f159 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -1075,8 +1075,9 @@ static inline void put_user_page(struct page *page) put_page(page); } -void put_user_pages_dirty(struct page **pages, unsigned long npages); -void put_user_pages_dirty_lock(struct page **pages, unsigned long npages); +void put_user_pages_dirty_lock(struct page **pages, unsigned long npages, + bool make_dirty); + void put_user_pages(struct page **pages, unsigned long npages); #if defined(CONFIG_SPARSEMEM) && !defined(CONFIG_SPARSEMEM_VMEMMAP) -- cgit v1.2.3 From 13224794cb0832caa403ad583d8605202cabc6bc Mon Sep 17 00:00:00 2001 From: Nicholas Piggin Date: Mon, 23 Sep 2019 15:35:19 -0700 Subject: mm: remove quicklist page table caches Patch series "mm: remove quicklist page table caches". A while ago Nicholas proposed to remove quicklist page table caches [1]. I've rebased his patch on the curren upstream and switched ia64 and sh to use generic versions of PTE allocation. [1] https://lore.kernel.org/linux-mm/20190711030339.20892-1-npiggin@gmail.com This patch (of 3): Remove page table allocator "quicklists". These have been around for a long time, but have not got much traction in the last decade and are only used on ia64 and sh architectures. The numbers in the initial commit look interesting but probably don't apply anymore. If anybody wants to resurrect this it's in the git history, but it's unhelpful to have this code and divergent allocator behaviour for minor archs. Also it might be better to instead make more general improvements to page allocator if this is still so slow. Link: http://lkml.kernel.org/r/1565250728-21721-2-git-send-email-rppt@linux.ibm.com Signed-off-by: Nicholas Piggin Signed-off-by: Mike Rapoport Cc: Tony Luck Cc: Yoshinori Sato Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/quicklist.h | 94 ----------------------------------------------- 1 file changed, 94 deletions(-) delete mode 100644 include/linux/quicklist.h (limited to 'include/linux') diff --git a/include/linux/quicklist.h b/include/linux/quicklist.h deleted file mode 100644 index 034982c98c8b..000000000000 --- a/include/linux/quicklist.h +++ /dev/null @@ -1,94 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -#ifndef LINUX_QUICKLIST_H -#define LINUX_QUICKLIST_H -/* - * Fast allocations and disposal of pages. Pages must be in the condition - * as needed after allocation when they are freed. Per cpu lists of pages - * are kept that only contain node local pages. - * - * (C) 2007, SGI. Christoph Lameter - */ -#include -#include -#include - -#ifdef CONFIG_QUICKLIST - -struct quicklist { - void *page; - int nr_pages; -}; - -DECLARE_PER_CPU(struct quicklist, quicklist)[CONFIG_NR_QUICK]; - -/* - * The two key functions quicklist_alloc and quicklist_free are inline so - * that they may be custom compiled for the platform. - * Specifying a NULL ctor can remove constructor support. Specifying - * a constant quicklist allows the determination of the exact address - * in the per cpu area. - * - * The fast patch in quicklist_alloc touched only a per cpu cacheline and - * the first cacheline of the page itself. There is minmal overhead involved. - */ -static inline void *quicklist_alloc(int nr, gfp_t flags, void (*ctor)(void *)) -{ - struct quicklist *q; - void **p = NULL; - - q =&get_cpu_var(quicklist)[nr]; - p = q->page; - if (likely(p)) { - q->page = p[0]; - p[0] = NULL; - q->nr_pages--; - } - put_cpu_var(quicklist); - if (likely(p)) - return p; - - p = (void *)__get_free_page(flags | __GFP_ZERO); - if (ctor && p) - ctor(p); - return p; -} - -static inline void __quicklist_free(int nr, void (*dtor)(void *), void *p, - struct page *page) -{ - struct quicklist *q; - - q = &get_cpu_var(quicklist)[nr]; - *(void **)p = q->page; - q->page = p; - q->nr_pages++; - put_cpu_var(quicklist); -} - -static inline void quicklist_free(int nr, void (*dtor)(void *), void *pp) -{ - __quicklist_free(nr, dtor, pp, virt_to_page(pp)); -} - -static inline void quicklist_free_page(int nr, void (*dtor)(void *), - struct page *page) -{ - __quicklist_free(nr, dtor, page_address(page), page); -} - -void quicklist_trim(int nr, void (*dtor)(void *), - unsigned long min_pages, unsigned long max_free); - -unsigned long quicklist_total_size(void); - -#else - -static inline unsigned long quicklist_total_size(void) -{ - return 0; -} - -#endif - -#endif /* LINUX_QUICKLIST_H */ - -- cgit v1.2.3 From 902ce63b337381092ff865f542e854ff3d0ebe2b Mon Sep 17 00:00:00 2001 From: David Hildenbrand Date: Mon, 23 Sep 2019 15:35:46 -0700 Subject: driver/base/memory.c: validate memory block size early Let's validate the memory block size early, when initializing the memory device infrastructure. Fail hard in case the value is not suitable. As nobody checks the return value of memory_dev_init(), turn it into a void function and fail with a panic in all scenarios instead. Otherwise, we'll crash later during boot when core/drivers expect that the memory device infrastructure (including memory_block_size_bytes()) works as expected. I think long term, we should move the whole memory block size configuration (set_memory_block_size_order() and memory_block_size_bytes()) into drivers/base/memory.c. Link: http://lkml.kernel.org/r/20190806090142.22709-1-david@redhat.com Signed-off-by: David Hildenbrand Cc: Greg Kroah-Hartman Cc: "Rafael J. Wysocki" Cc: Pavel Tatashin Cc: Michal Hocko Cc: Dan Williams Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/memory.h | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/memory.h b/include/linux/memory.h index 02e633f3ede0..b3d9df060626 100644 --- a/include/linux/memory.h +++ b/include/linux/memory.h @@ -80,9 +80,9 @@ struct mem_section; #define IPC_CALLBACK_PRI 10 #ifndef CONFIG_MEMORY_HOTPLUG_SPARSE -static inline int memory_dev_init(void) +static inline void memory_dev_init(void) { - return 0; + return; } static inline int register_memory_notifier(struct notifier_block *nb) { @@ -113,7 +113,7 @@ extern int register_memory_isolate_notifier(struct notifier_block *nb); extern void unregister_memory_isolate_notifier(struct notifier_block *nb); int create_memory_block_devices(unsigned long start, unsigned long size); void remove_memory_block_devices(unsigned long start, unsigned long size); -extern int memory_dev_init(void); +extern void memory_dev_init(void); extern int memory_notify(unsigned long val, void *v); extern int memory_isolate_notify(unsigned long val, void *v); extern struct memory_block *find_memory_block(struct mem_section *); -- cgit v1.2.3 From b6c88d3b9d38f9448e0fcf44847a075ea81d5ca2 Mon Sep 17 00:00:00 2001 From: David Hildenbrand Date: Mon, 23 Sep 2019 15:35:49 -0700 Subject: drivers/base/memory.c: don't store end_section_nr in memory blocks Each memory block spans the same amount of sections/pages/bytes. The size is determined before the first memory block is created. No need to store what we can easily calculate - and the calculations even look simpler now. Michal brought up the idea of variable-sized memory blocks. However, if we ever implement something like this, we will need an API compatibility switch and reworks at various places (most code assumes a fixed memory block size). So let's cleanup what we have right now. While at it, fix the variable naming in register_mem_sect_under_node() - we no longer talk about a single section. Link: http://lkml.kernel.org/r/20190809110200.2746-1-david@redhat.com Signed-off-by: David Hildenbrand Cc: Greg Kroah-Hartman Cc: "Rafael J. Wysocki" Cc: Pavel Tatashin Cc: Michal Hocko Cc: Dan Williams Cc: Oscar Salvador Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/memory.h | 1 - 1 file changed, 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/memory.h b/include/linux/memory.h index b3d9df060626..0ebb105eb261 100644 --- a/include/linux/memory.h +++ b/include/linux/memory.h @@ -25,7 +25,6 @@ struct memory_block { unsigned long start_section_nr; - unsigned long end_section_nr; unsigned long state; /* serialized by the dev->lock */ int section_count; /* serialized by mem_sysfs_mutex */ int online_type; /* for passing data to online routine */ -- cgit v1.2.3 From 688fcbfc06e4fdfbb7e1d5a942a1460fe6379d2d Mon Sep 17 00:00:00 2001 From: Pengfei Li Date: Mon, 23 Sep 2019 15:36:39 -0700 Subject: mm/vmalloc: modify struct vmap_area to reduce its size Objective --------- The current implementation of struct vmap_area wasted space. After applying this commit, sizeof(struct vmap_area) has been reduced from 11 words to 8 words. Description ----------- 1) Pack "subtree_max_size", "vm" and "purge_list". This is no problem because A) "subtree_max_size" is only used when vmap_area is in "free" tree B) "vm" is only used when vmap_area is in "busy" tree C) "purge_list" is only used when vmap_area is in vmap_purge_list 2) Eliminate "flags". ;Since only one flag VM_VM_AREA is being used, and the same thing can be done by judging whether "vm" is NULL, then the "flags" can be eliminated. Link: http://lkml.kernel.org/r/20190716152656.12255-3-lpf.vector@gmail.com Signed-off-by: Pengfei Li Suggested-by: Uladzislau Rezki (Sony) Reviewed-by: Uladzislau Rezki (Sony) Cc: Hillf Danton Cc: Matthew Wilcox Cc: Michal Hocko Cc: Oleksiy Avramchenko Cc: Roman Gushchin Cc: Steven Rostedt Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/vmalloc.h | 20 +++++++++++++------- 1 file changed, 13 insertions(+), 7 deletions(-) (limited to 'include/linux') diff --git a/include/linux/vmalloc.h b/include/linux/vmalloc.h index dfa718ffdd4f..4e7809408073 100644 --- a/include/linux/vmalloc.h +++ b/include/linux/vmalloc.h @@ -53,15 +53,21 @@ struct vmap_area { unsigned long va_start; unsigned long va_end; - /* - * Largest available free size in subtree. - */ - unsigned long subtree_max_size; - unsigned long flags; struct rb_node rb_node; /* address sorted rbtree */ struct list_head list; /* address sorted list */ - struct llist_node purge_list; /* "lazy purge" list */ - struct vm_struct *vm; + + /* + * The following three variables can be packed, because + * a vmap_area object is always one of the three states: + * 1) in "free" tree (root is vmap_area_root) + * 2) in "busy" tree (root is free_vmap_area_root) + * 3) in purge list (head is vmap_purge_list) + */ + union { + unsigned long subtree_max_size; /* in "free" tree */ + struct vm_struct *vm; /* in "busy" tree */ + struct llist_node purge_list; /* in purge list */ + }; }; /* -- cgit v1.2.3 From 494330855641269c8a49f1580f0d4e2ead693245 Mon Sep 17 00:00:00 2001 From: Vlastimil Babka Date: Mon, 23 Sep 2019 15:37:32 -0700 Subject: mm, compaction: raise compaction priority after it withdrawns Mike Kravetz reports that "hugetlb allocations could stall for minutes or hours when should_compact_retry() would return true more often then it should. Specifically, this was in the case where compact_result was COMPACT_DEFERRED and COMPACT_PARTIAL_SKIPPED and no progress was being made." The problem is that the compaction_withdrawn() test in should_compact_retry() includes compaction outcomes that are only possible on low compaction priority, and results in a retry without increasing the priority. This may result in furter reclaim, and more incomplete compaction attempts. With this patch, compaction priority is raised when possible, or should_compact_retry() returns false. The COMPACT_SKIPPED result doesn't really fit together with the other outcomes in compaction_withdrawn(), as that's a result caused by insufficient order-0 pages, not due to low compaction priority. With this patch, it is moved to a new compaction_needs_reclaim() function, and for that outcome we keep the current logic of retrying if it looks like reclaim will be able to help. Link: http://lkml.kernel.org/r/20190806014744.15446-4-mike.kravetz@oracle.com Reported-by: Mike Kravetz Signed-off-by: Vlastimil Babka Signed-off-by: Mike Kravetz Tested-by: Mike Kravetz Cc: Hillf Danton Cc: Johannes Weiner Cc: Mel Gorman Cc: Michal Hocko Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/compaction.h | 22 +++++++++++++++++----- 1 file changed, 17 insertions(+), 5 deletions(-) (limited to 'include/linux') diff --git a/include/linux/compaction.h b/include/linux/compaction.h index 9569e7c786d3..4b898cdbdf05 100644 --- a/include/linux/compaction.h +++ b/include/linux/compaction.h @@ -129,11 +129,8 @@ static inline bool compaction_failed(enum compact_result result) return false; } -/* - * Compaction has backed off for some reason. It might be throttling or - * lock contention. Retrying is still worthwhile. - */ -static inline bool compaction_withdrawn(enum compact_result result) +/* Compaction needs reclaim to be performed first, so it can continue. */ +static inline bool compaction_needs_reclaim(enum compact_result result) { /* * Compaction backed off due to watermark checks for order-0 @@ -142,6 +139,16 @@ static inline bool compaction_withdrawn(enum compact_result result) if (result == COMPACT_SKIPPED) return true; + return false; +} + +/* + * Compaction has backed off for some reason after doing some work or none + * at all. It might be throttling or lock contention. Retrying might be still + * worthwhile, but with a higher priority if allowed. + */ +static inline bool compaction_withdrawn(enum compact_result result) +{ /* * If compaction is deferred for high-order allocations, it is * because sync compaction recently failed. If this is the case @@ -207,6 +214,11 @@ static inline bool compaction_failed(enum compact_result result) return false; } +static inline bool compaction_needs_reclaim(enum compact_result result) +{ + return false; +} + static inline bool compaction_withdrawn(enum compact_result result) { return true; -- cgit v1.2.3 From 60fbf0ab5da1c360e02b7f7d882bf1c0d8f7e32a Mon Sep 17 00:00:00 2001 From: Song Liu Date: Mon, 23 Sep 2019 15:37:54 -0700 Subject: mm,thp: stats for file backed THP In preparation for non-shmem THP, this patch adds a few stats and exposes them in /proc/meminfo, /sys/bus/node/devices//meminfo, and /proc//task//smaps. This patch is mostly a rewrite of Kirill A. Shutemov's earlier version: https://lkml.kernel.org/r/20170126115819.58875-5-kirill.shutemov@linux.intel.com/ Link: http://lkml.kernel.org/r/20190801184244.3169074-5-songliubraving@fb.com Signed-off-by: Song Liu Acked-by: Rik van Riel Acked-by: Kirill A. Shutemov Acked-by: Johannes Weiner Cc: Hillf Danton Cc: Hugh Dickins Cc: William Kucharski Cc: Oleg Nesterov Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/mmzone.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/linux') diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h index 3f38c30d2f13..aafb7c38c627 100644 --- a/include/linux/mmzone.h +++ b/include/linux/mmzone.h @@ -235,6 +235,8 @@ enum node_stat_item { NR_SHMEM, /* shmem pages (included tmpfs/GEM pages) */ NR_SHMEM_THPS, NR_SHMEM_PMDMAPPED, + NR_FILE_THPS, + NR_FILE_PMDMAPPED, NR_ANON_THPS, NR_UNSTABLE_NFS, /* NFS unstable pages */ NR_VMSCAN_WRITE, -- cgit v1.2.3 From 09d91cda0e8207c1f14ee0d572f61a53dbcdaf85 Mon Sep 17 00:00:00 2001 From: Song Liu Date: Mon, 23 Sep 2019 15:38:03 -0700 Subject: mm,thp: avoid writes to file with THP in pagecache In previous patch, an application could put part of its text section in THP via madvise(). These THPs will be protected from writes when the application is still running (TXTBSY). However, after the application exits, the file is available for writes. This patch avoids writes to file THP by dropping page cache for the file when the file is open for write. A new counter nr_thps is added to struct address_space. In do_dentry_open(), if the file is open for write and nr_thps is non-zero, we drop page cache for the whole file. Link: http://lkml.kernel.org/r/20190801184244.3169074-8-songliubraving@fb.com Signed-off-by: Song Liu Reported-by: kbuild test robot Acked-by: Rik van Riel Acked-by: Kirill A. Shutemov Acked-by: Johannes Weiner Cc: Hillf Danton Cc: Hugh Dickins Cc: William Kucharski Cc: Oleg Nesterov Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/fs.h | 32 ++++++++++++++++++++++++++++++++ 1 file changed, 32 insertions(+) (limited to 'include/linux') diff --git a/include/linux/fs.h b/include/linux/fs.h index 866268c2c6e3..b0c6b0d34d02 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -429,6 +429,7 @@ int pagecache_write_end(struct file *, struct address_space *mapping, * @i_pages: Cached pages. * @gfp_mask: Memory allocation flags to use for allocating pages. * @i_mmap_writable: Number of VM_SHARED mappings. + * @nr_thps: Number of THPs in the pagecache (non-shmem only). * @i_mmap: Tree of private and shared mappings. * @i_mmap_rwsem: Protects @i_mmap and @i_mmap_writable. * @nrpages: Number of page entries, protected by the i_pages lock. @@ -446,6 +447,10 @@ struct address_space { struct xarray i_pages; gfp_t gfp_mask; atomic_t i_mmap_writable; +#ifdef CONFIG_READ_ONLY_THP_FOR_FS + /* number of thp, only for non-shmem files */ + atomic_t nr_thps; +#endif struct rb_root_cached i_mmap; struct rw_semaphore i_mmap_rwsem; unsigned long nrpages; @@ -2798,6 +2803,33 @@ static inline errseq_t filemap_sample_wb_err(struct address_space *mapping) return errseq_sample(&mapping->wb_err); } +static inline int filemap_nr_thps(struct address_space *mapping) +{ +#ifdef CONFIG_READ_ONLY_THP_FOR_FS + return atomic_read(&mapping->nr_thps); +#else + return 0; +#endif +} + +static inline void filemap_nr_thps_inc(struct address_space *mapping) +{ +#ifdef CONFIG_READ_ONLY_THP_FOR_FS + atomic_inc(&mapping->nr_thps); +#else + WARN_ON_ONCE(1); +#endif +} + +static inline void filemap_nr_thps_dec(struct address_space *mapping) +{ +#ifdef CONFIG_READ_ONLY_THP_FOR_FS + atomic_dec(&mapping->nr_thps); +#else + WARN_ON_ONCE(1); +#endif +} + extern int vfs_fsync_range(struct file *file, loff_t start, loff_t end, int datasync); extern int vfs_fsync(struct file *file, int datasync); -- cgit v1.2.3 From 364c1eebe453f06f0c1e837eb155a5725c9cd272 Mon Sep 17 00:00:00 2001 From: Yang Shi Date: Mon, 23 Sep 2019 15:38:06 -0700 Subject: mm: thp: extract split_queue_* into a struct Patch series "Make deferred split shrinker memcg aware", v6. Currently THP deferred split shrinker is not memcg aware, this may cause premature OOM with some configuration. For example the below test would run into premature OOM easily: $ cgcreate -g memory:thp $ echo 4G > /sys/fs/cgroup/memory/thp/memory/limit_in_bytes $ cgexec -g memory:thp transhuge-stress 4000 transhuge-stress comes from kernel selftest. It is easy to hit OOM, but there are still a lot THP on the deferred split queue, memcg direct reclaim can't touch them since the deferred split shrinker is not memcg aware. Convert deferred split shrinker memcg aware by introducing per memcg deferred split queue. The THP should be on either per node or per memcg deferred split queue if it belongs to a memcg. When the page is immigrated to the other memcg, it will be immigrated to the target memcg's deferred split queue too. Reuse the second tail page's deferred_list for per memcg list since the same THP can't be on multiple deferred split queues. Make deferred split shrinker not depend on memcg kmem since it is not slab. It doesn't make sense to not shrink THP even though memcg kmem is disabled. With the above change the test demonstrated above doesn't trigger OOM even though with cgroup.memory=nokmem. This patch (of 4): Put split_queue, split_queue_lock and split_queue_len into a struct in order to reduce code duplication when we convert deferred_split to memcg aware in the later patches. Link: http://lkml.kernel.org/r/1565144277-36240-2-git-send-email-yang.shi@linux.alibaba.com Signed-off-by: Yang Shi Suggested-by: "Kirill A . Shutemov" Acked-by: Kirill A. Shutemov Reviewed-by: Kirill Tkhai Cc: Johannes Weiner Cc: Michal Hocko Cc: Hugh Dickins Cc: Shakeel Butt Cc: David Rientjes Cc: Qian Cai Cc: Vladimir Davydov Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/mmzone.h | 12 +++++++++--- 1 file changed, 9 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h index aafb7c38c627..bda20282746b 100644 --- a/include/linux/mmzone.h +++ b/include/linux/mmzone.h @@ -679,6 +679,14 @@ struct zonelist { extern struct page *mem_map; #endif +#ifdef CONFIG_TRANSPARENT_HUGEPAGE +struct deferred_split { + spinlock_t split_queue_lock; + struct list_head split_queue; + unsigned long split_queue_len; +}; +#endif + /* * On NUMA machines, each NUMA node would have a pg_data_t to describe * it's memory layout. On UMA machines there is a single pglist_data which @@ -758,9 +766,7 @@ typedef struct pglist_data { #endif /* CONFIG_DEFERRED_STRUCT_PAGE_INIT */ #ifdef CONFIG_TRANSPARENT_HUGEPAGE - spinlock_t split_queue_lock; - struct list_head split_queue; - unsigned long split_queue_len; + struct deferred_split deferred_split_queue; #endif /* Fields commonly accessed by the page reclaim scanner */ -- cgit v1.2.3 From 0a432dcbeb32edcd211a5d8f7847d0da7642a8b4 Mon Sep 17 00:00:00 2001 From: Yang Shi Date: Mon, 23 Sep 2019 15:38:12 -0700 Subject: mm: shrinker: make shrinker not depend on memcg kmem Currently shrinker is just allocated and can work when memcg kmem is enabled. But, THP deferred split shrinker is not slab shrinker, it doesn't make too much sense to have such shrinker depend on memcg kmem. It should be able to reclaim THP even though memcg kmem is disabled. Introduce a new shrinker flag, SHRINKER_NONSLAB, for non-slab shrinker. When memcg kmem is disabled, just such shrinkers can be called in shrinking memcg slab. [yang.shi@linux.alibaba.com: add comment] Link: http://lkml.kernel.org/r/1566496227-84952-4-git-send-email-yang.shi@linux.alibaba.com Link: http://lkml.kernel.org/r/1565144277-36240-4-git-send-email-yang.shi@linux.alibaba.com Signed-off-by: Yang Shi Acked-by: Kirill A. Shutemov Reviewed-by: Kirill Tkhai Cc: Johannes Weiner Cc: Michal Hocko Cc: "Kirill A . Shutemov" Cc: Hugh Dickins Cc: Shakeel Butt Cc: David Rientjes Cc: Qian Cai Cc: Vladimir Davydov Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/memcontrol.h | 19 +++++++++++-------- include/linux/shrinker.h | 7 ++++++- 2 files changed, 17 insertions(+), 9 deletions(-) (limited to 'include/linux') diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h index ad8f1a397ae4..a3c0a639c824 100644 --- a/include/linux/memcontrol.h +++ b/include/linux/memcontrol.h @@ -128,9 +128,8 @@ struct mem_cgroup_per_node { struct mem_cgroup_reclaim_iter iter[DEF_PRIORITY + 1]; -#ifdef CONFIG_MEMCG_KMEM struct memcg_shrinker_map __rcu *shrinker_map; -#endif + struct rb_node tree_node; /* RB tree node */ unsigned long usage_in_excess;/* Set to the value by which */ /* the soft limit is exceeded*/ @@ -1311,6 +1310,11 @@ static inline bool mem_cgroup_under_socket_pressure(struct mem_cgroup *memcg) } while ((memcg = parent_mem_cgroup(memcg))); return false; } + +extern int memcg_expand_shrinker_maps(int new_id); + +extern void memcg_set_shrinker_bit(struct mem_cgroup *memcg, + int nid, int shrinker_id); #else #define mem_cgroup_sockets_enabled 0 static inline void mem_cgroup_sk_alloc(struct sock *sk) { }; @@ -1319,6 +1323,11 @@ static inline bool mem_cgroup_under_socket_pressure(struct mem_cgroup *memcg) { return false; } + +static inline void memcg_set_shrinker_bit(struct mem_cgroup *memcg, + int nid, int shrinker_id) +{ +} #endif struct kmem_cache *memcg_kmem_get_cache(struct kmem_cache *cachep); @@ -1390,10 +1399,6 @@ static inline int memcg_cache_id(struct mem_cgroup *memcg) return memcg ? memcg->kmemcg_id : -1; } -extern int memcg_expand_shrinker_maps(int new_id); - -extern void memcg_set_shrinker_bit(struct mem_cgroup *memcg, - int nid, int shrinker_id); #else static inline int memcg_kmem_charge(struct page *page, gfp_t gfp, int order) @@ -1435,8 +1440,6 @@ static inline void memcg_put_cache_ids(void) { } -static inline void memcg_set_shrinker_bit(struct mem_cgroup *memcg, - int nid, int shrinker_id) { } #endif /* CONFIG_MEMCG_KMEM */ #endif /* _LINUX_MEMCONTROL_H */ diff --git a/include/linux/shrinker.h b/include/linux/shrinker.h index 9443cafd1969..0f80123650e2 100644 --- a/include/linux/shrinker.h +++ b/include/linux/shrinker.h @@ -69,7 +69,7 @@ struct shrinker { /* These are for internal use */ struct list_head list; -#ifdef CONFIG_MEMCG_KMEM +#ifdef CONFIG_MEMCG /* ID in shrinker_idr */ int id; #endif @@ -81,6 +81,11 @@ struct shrinker { /* Flags */ #define SHRINKER_NUMA_AWARE (1 << 0) #define SHRINKER_MEMCG_AWARE (1 << 1) +/* + * It just makes sense when the shrinker is also MEMCG_AWARE for now, + * non-MEMCG_AWARE shrinker should not have this flag set. + */ +#define SHRINKER_NONSLAB (1 << 2) extern int prealloc_shrinker(struct shrinker *shrinker); extern void register_shrinker_prepared(struct shrinker *shrinker); -- cgit v1.2.3 From 87eaceb3faa59b9b4d940ec9554ce251325d83fe Mon Sep 17 00:00:00 2001 From: Yang Shi Date: Mon, 23 Sep 2019 15:38:15 -0700 Subject: mm: thp: make deferred split shrinker memcg aware Currently THP deferred split shrinker is not memcg aware, this may cause premature OOM with some configuration. For example the below test would run into premature OOM easily: $ cgcreate -g memory:thp $ echo 4G > /sys/fs/cgroup/memory/thp/memory/limit_in_bytes $ cgexec -g memory:thp transhuge-stress 4000 transhuge-stress comes from kernel selftest. It is easy to hit OOM, but there are still a lot THP on the deferred split queue, memcg direct reclaim can't touch them since the deferred split shrinker is not memcg aware. Convert deferred split shrinker memcg aware by introducing per memcg deferred split queue. The THP should be on either per node or per memcg deferred split queue if it belongs to a memcg. When the page is immigrated to the other memcg, it will be immigrated to the target memcg's deferred split queue too. Reuse the second tail page's deferred_list for per memcg list since the same THP can't be on multiple deferred split queues. [yang.shi@linux.alibaba.com: simplify deferred split queue dereference per Kirill Tkhai] Link: http://lkml.kernel.org/r/1566496227-84952-5-git-send-email-yang.shi@linux.alibaba.com Link: http://lkml.kernel.org/r/1565144277-36240-5-git-send-email-yang.shi@linux.alibaba.com Signed-off-by: Yang Shi Acked-by: Kirill A. Shutemov Reviewed-by: Kirill Tkhai Cc: Johannes Weiner Cc: Michal Hocko Cc: "Kirill A . Shutemov" Cc: Hugh Dickins Cc: Shakeel Butt Cc: David Rientjes Cc: Qian Cai Cc: Vladimir Davydov Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/huge_mm.h | 9 +++++++++ include/linux/memcontrol.h | 4 ++++ include/linux/mm_types.h | 1 + 3 files changed, 14 insertions(+) (limited to 'include/linux') diff --git a/include/linux/huge_mm.h b/include/linux/huge_mm.h index 45ede62aa85b..61c9ffd89b05 100644 --- a/include/linux/huge_mm.h +++ b/include/linux/huge_mm.h @@ -267,6 +267,15 @@ static inline bool thp_migration_supported(void) return IS_ENABLED(CONFIG_ARCH_ENABLE_THP_MIGRATION); } +static inline struct list_head *page_deferred_list(struct page *page) +{ + /* + * Global or memcg deferred list in the second tail pages is + * occupied by compound_head. + */ + return &page[2].deferred_list; +} + #else /* CONFIG_TRANSPARENT_HUGEPAGE */ #define HPAGE_PMD_SHIFT ({ BUILD_BUG(); 0; }) #define HPAGE_PMD_MASK ({ BUILD_BUG(); 0; }) diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h index a3c0a639c824..9b60863429cc 100644 --- a/include/linux/memcontrol.h +++ b/include/linux/memcontrol.h @@ -330,6 +330,10 @@ struct mem_cgroup { struct list_head event_list; spinlock_t event_list_lock; +#ifdef CONFIG_TRANSPARENT_HUGEPAGE + struct deferred_split deferred_split_queue; +#endif + struct mem_cgroup_per_node *nodeinfo[0]; /* WARNING: nodeinfo must be the last member here */ }; diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h index 0b739f360cec..5183e0d77dfa 100644 --- a/include/linux/mm_types.h +++ b/include/linux/mm_types.h @@ -138,6 +138,7 @@ struct page { struct { /* Second tail page of compound page */ unsigned long _compound_pad_1; /* compound_head */ unsigned long _compound_pad_2; + /* For both global and memcg */ struct list_head deferred_list; }; struct { /* Page table pages */ -- cgit v1.2.3 From 010c164a5fa7e169deab0a4d8211611f1930c1cd Mon Sep 17 00:00:00 2001 From: Song Liu Date: Mon, 23 Sep 2019 15:38:19 -0700 Subject: mm: move memcmp_pages() and pages_identical() Patch series "THP aware uprobe", v13. This patchset makes uprobe aware of THPs. Currently, when uprobe is attached to text on THP, the page is split by FOLL_SPLIT. As a result, uprobe eliminates the performance benefit of THP. This set makes uprobe THP-aware. Instead of FOLL_SPLIT, we introduces FOLL_SPLIT_PMD, which only split PMD for uprobe. After all uprobes within the THP are removed, the PTE-mapped pages are regrouped as huge PMD. This set (plus a few THP patches) is also available at https://github.com/liu-song-6/linux/tree/uprobe-thp This patch (of 6): Move memcmp_pages() to mm/util.c and pages_identical() to mm.h, so that we can use them in other files. Link: http://lkml.kernel.org/r/20190815164525.1848545-2-songliubraving@fb.com Signed-off-by: Song Liu Acked-by: Kirill A. Shutemov Reviewed-by: Oleg Nesterov Cc: Johannes Weiner Cc: Matthew Wilcox Cc: William Kucharski Cc: Srikar Dronamraju Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/mm.h | 7 +++++++ 1 file changed, 7 insertions(+) (limited to 'include/linux') diff --git a/include/linux/mm.h b/include/linux/mm.h index 57a9fa34f159..0ac87d9ee9d8 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -2868,5 +2868,12 @@ void __init setup_nr_node_ids(void); static inline void setup_nr_node_ids(void) {} #endif +extern int memcmp_pages(struct page *page1, struct page *page2); + +static inline int pages_identical(struct page *page1, struct page *page2) +{ + return !memcmp_pages(page1, page2); +} + #endif /* __KERNEL__ */ #endif /* _LINUX_MM_H */ -- cgit v1.2.3 From bfe7b00de6d1e25fee08484c4fbf1c1ed175be78 Mon Sep 17 00:00:00 2001 From: Song Liu Date: Mon, 23 Sep 2019 15:38:25 -0700 Subject: mm, thp: introduce FOLL_SPLIT_PMD Introduce a new foll_flag: FOLL_SPLIT_PMD. As the name says FOLL_SPLIT_PMD splits huge pmd for given mm_struct, the underlining huge page stays as-is. FOLL_SPLIT_PMD is useful for cases where we need to use regular pages, but would switch back to huge page and huge pmd on. One of such example is uprobe. The following patches use FOLL_SPLIT_PMD in uprobe. Link: http://lkml.kernel.org/r/20190815164525.1848545-4-songliubraving@fb.com Signed-off-by: Song Liu Reviewed-by: Oleg Nesterov Acked-by: Kirill A. Shutemov Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/mm.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/mm.h b/include/linux/mm.h index 0ac87d9ee9d8..233ad11938db 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -2591,6 +2591,7 @@ struct page *follow_page(struct vm_area_struct *vma, unsigned long address, #define FOLL_COW 0x4000 /* internal GUP flag */ #define FOLL_ANON 0x8000 /* don't do file mappings */ #define FOLL_LONGTERM 0x10000 /* mapping lifetime is indefinite: see below */ +#define FOLL_SPLIT_PMD 0x20000 /* split huge pmd before returning */ /* * NOTE on FOLL_LONGTERM: -- cgit v1.2.3 From 27e1f8273113adec0e98bf513e4091636b27cc2a Mon Sep 17 00:00:00 2001 From: Song Liu Date: Mon, 23 Sep 2019 15:38:30 -0700 Subject: khugepaged: enable collapse pmd for pte-mapped THP khugepaged needs exclusive mmap_sem to access page table. When it fails to lock mmap_sem, the page will fault in as pte-mapped THP. As the page is already a THP, khugepaged will not handle this pmd again. This patch enables the khugepaged to retry collapse the page table. struct mm_slot (in khugepaged.c) is extended with an array, containing addresses of pte-mapped THPs. We use array here for simplicity. We can easily replace it with more advanced data structures when needed. In khugepaged_scan_mm_slot(), if the mm contains pte-mapped THP, we try to collapse the page table. Since collapse may happen at an later time, some pages may already fault in. collapse_pte_mapped_thp() is added to properly handle these pages. collapse_pte_mapped_thp() also double checks whether all ptes in this pmd are mapping to the same THP. This is necessary because some subpage of the THP may be replaced, for example by uprobe. In such cases, it is not possible to collapse the pmd. [kirill.shutemov@linux.intel.com: add comments for retract_page_tables()] Link: http://lkml.kernel.org/r/20190816145443.6ard3iilytc6jlgv@box Link: http://lkml.kernel.org/r/20190815164525.1848545-6-songliubraving@fb.com Signed-off-by: Song Liu Signed-off-by: Kirill A. Shutemov Acked-by: Kirill A. Shutemov Suggested-by: Johannes Weiner Reviewed-by: Oleg Nesterov Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/khugepaged.h | 12 ++++++++++++ 1 file changed, 12 insertions(+) (limited to 'include/linux') diff --git a/include/linux/khugepaged.h b/include/linux/khugepaged.h index 082d1d2a5216..bc45ea1efbf7 100644 --- a/include/linux/khugepaged.h +++ b/include/linux/khugepaged.h @@ -15,6 +15,14 @@ extern int __khugepaged_enter(struct mm_struct *mm); extern void __khugepaged_exit(struct mm_struct *mm); extern int khugepaged_enter_vma_merge(struct vm_area_struct *vma, unsigned long vm_flags); +#ifdef CONFIG_SHMEM +extern void collapse_pte_mapped_thp(struct mm_struct *mm, unsigned long addr); +#else +static inline void collapse_pte_mapped_thp(struct mm_struct *mm, + unsigned long addr) +{ +} +#endif #define khugepaged_enabled() \ (transparent_hugepage_flags & \ @@ -73,6 +81,10 @@ static inline int khugepaged_enter_vma_merge(struct vm_area_struct *vma, { return 0; } +static inline void collapse_pte_mapped_thp(struct mm_struct *mm, + unsigned long addr) +{ +} #endif /* CONFIG_TRANSPARENT_HUGEPAGE */ #endif /* _LINUX_KHUGEPAGED_H */ -- cgit v1.2.3 From 649775be63c8b2e0b56ecc5bbc96d38205ec5259 Mon Sep 17 00:00:00 2001 From: Alexandre Ghiti Date: Mon, 23 Sep 2019 15:38:37 -0700 Subject: mm, fs: move randomize_stack_top from fs to mm Patch series "Provide generic top-down mmap layout functions", v6. This series introduces generic functions to make top-down mmap layout easily accessible to architectures, in particular riscv which was the initial goal of this series. The generic implementation was taken from arm64 and used successively by arm, mips and finally riscv. Note that in addition the series fixes 2 issues: - stack randomization was taken into account even if not necessary. - [1] fixed an issue with mmap base which did not take into account randomization but did not report it to arm and mips, so by moving arm64 into a generic library, this problem is now fixed for both architectures. This work is an effort to factorize architecture functions to avoid code duplication and oversights as in [1]. [1]: https://www.mail-archive.com/linux-kernel@vger.kernel.org/msg1429066.html This patch (of 14): This preparatory commit moves this function so that further introduction of generic topdown mmap layout is contained only in mm/util.c. Link: http://lkml.kernel.org/r/20190730055113.23635-2-alex@ghiti.fr Signed-off-by: Alexandre Ghiti Acked-by: Kees Cook Reviewed-by: Christoph Hellwig Reviewed-by: Luis Chamberlain Cc: Russell King Cc: Catalin Marinas Cc: Will Deacon Cc: Ralf Baechle Cc: Paul Burton Cc: James Hogan Cc: Palmer Dabbelt Cc: Albert Ou Cc: Alexander Viro Cc: Christoph Hellwig Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/mm.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/linux') diff --git a/include/linux/mm.h b/include/linux/mm.h index 233ad11938db..294a67b94147 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -2328,6 +2328,8 @@ extern int install_special_mapping(struct mm_struct *mm, unsigned long addr, unsigned long len, unsigned long flags, struct page **pages); +unsigned long randomize_stack_top(unsigned long stack_top); + extern unsigned long get_unmapped_area(struct file *, unsigned long, unsigned long, unsigned long, unsigned long); extern unsigned long mmap_region(struct file *file, unsigned long addr, -- cgit v1.2.3 From c165f25d23ecb2f9f121ced20435415b931219e2 Mon Sep 17 00:00:00 2001 From: Hui Zhu Date: Mon, 23 Sep 2019 15:39:37 -0700 Subject: zpool: add malloc_support_movable to zpool_driver As a zpool_driver, zsmalloc can allocate movable memory because it support migate pages. But zbud and z3fold cannot allocate movable memory. Add malloc_support_movable to zpool_driver. If a zpool_driver support allocate movable memory, set it to true. And add zpool_malloc_support_movable check malloc_support_movable to make sure if a zpool support allocate movable memory. Link: http://lkml.kernel.org/r/20190605100630.13293-1-teawaterz@linux.alibaba.com Signed-off-by: Hui Zhu Reviewed-by: Shakeel Butt Cc: Dan Streetman Cc: Minchan Kim Cc: Nitin Gupta Cc: Sergey Senozhatsky Cc: Seth Jennings Cc: Vitaly Wool Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/zpool.h | 3 +++ 1 file changed, 3 insertions(+) (limited to 'include/linux') diff --git a/include/linux/zpool.h b/include/linux/zpool.h index 7238865e75b0..51bf43076165 100644 --- a/include/linux/zpool.h +++ b/include/linux/zpool.h @@ -46,6 +46,8 @@ const char *zpool_get_type(struct zpool *pool); void zpool_destroy_pool(struct zpool *pool); +bool zpool_malloc_support_movable(struct zpool *pool); + int zpool_malloc(struct zpool *pool, size_t size, gfp_t gfp, unsigned long *handle); @@ -90,6 +92,7 @@ struct zpool_driver { struct zpool *zpool); void (*destroy)(void *pool); + bool malloc_support_movable; int (*malloc)(void *pool, size_t size, gfp_t gfp, unsigned long *handle); void (*free)(void *pool, unsigned long handle); -- cgit v1.2.3