From 7716506adac4664793a9d6d3dfa31ffddfa98714 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Tue, 4 May 2021 18:32:45 -0700 Subject: mm: introduce and use mapping_empty() Patch series "Remove nrexceptional tracking", v2. We actually use nrexceptional for very little these days. It's a minor pain to keep in sync with nrpages, but the pain becomes much bigger with the THP patches because we don't know how many indices a shadow entry occupies. It's easier to just remove it than keep it accurate. Also, we save 8 bytes per inode which is nothing to sneeze at; on my laptop, it would improve shmem_inode_cache from 22 to 23 objects per 16kB, and inode_cache from 26 to 27 objects. Combined, that saves a megabyte of memory from a combined usage of 25MB for both caches. Unfortunately, ext4 doesn't cross a magic boundary, so it doesn't save any memory for ext4. This patch (of 4): Instead of checking the two counters (nrpages and nrexceptional), we can just check whether i_pages is empty. Link: https://lkml.kernel.org/r/20201026151849.24232-1-willy@infradead.org Link: https://lkml.kernel.org/r/20201026151849.24232-2-willy@infradead.org Signed-off-by: Matthew Wilcox (Oracle) Tested-by: Vishal Verma Acked-by: Johannes Weiner Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/pagemap.h | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'include') diff --git a/include/linux/pagemap.h b/include/linux/pagemap.h index 469fa7ffcf96..a4bd41128bf3 100644 --- a/include/linux/pagemap.h +++ b/include/linux/pagemap.h @@ -18,6 +18,11 @@ struct pagevec; +static inline bool mapping_empty(struct address_space *mapping) +{ + return xa_empty(&mapping->i_pages); +} + /* * Bits in mapping->flags. */ -- cgit v1.2.3 From 8bc3c481b3d0dcef2cf8e1b7c6b780af6725f7e3 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Tue, 4 May 2021 18:32:54 -0700 Subject: mm: remove nrexceptional from inode We no longer track anything in nrexceptional, so remove it, saving 8 bytes per inode. Link: https://lkml.kernel.org/r/20201026151849.24232-5-willy@infradead.org Signed-off-by: Matthew Wilcox (Oracle) Tested-by: Vishal Verma Acked-by: Johannes Weiner Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/fs.h | 2 -- 1 file changed, 2 deletions(-) (limited to 'include') diff --git a/include/linux/fs.h b/include/linux/fs.h index 12766edee81f..acef282b97c6 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -442,7 +442,6 @@ int pagecache_write_end(struct file *, struct address_space *mapping, * @i_mmap: Tree of private and shared mappings. * @i_mmap_rwsem: Protects @i_mmap and @i_mmap_writable. * @nrpages: Number of page entries, protected by the i_pages lock. - * @nrexceptional: Shadow or DAX entries, protected by the i_pages lock. * @writeback_index: Writeback starts here. * @a_ops: Methods. * @flags: Error bits and flags (AS_*). @@ -463,7 +462,6 @@ struct address_space { struct rb_root_cached i_mmap; struct rw_semaphore i_mmap_rwsem; unsigned long nrpages; - unsigned long nrexceptional; pgoff_t writeback_index; const struct address_space_operations *a_ops; unsigned long flags; -- cgit v1.2.3 From aec44e0f0213e36d4f0868a80cdc5097a510f79d Mon Sep 17 00:00:00 2001 From: Peter Xu Date: Tue, 4 May 2021 18:33:00 -0700 Subject: hugetlb: pass vma into huge_pte_alloc() and huge_pmd_share() Patch series "hugetlb: Disable huge pmd unshare for uffd-wp", v4. This series tries to disable huge pmd unshare of hugetlbfs backed memory for uffd-wp. Although uffd-wp of hugetlbfs is still during rfc stage, the idea of this series may be needed for multiple tasks (Axel's uffd minor fault series, and Mike's soft dirty series), so I picked it out from the larger series. This patch (of 4): It is a preparation work to be able to behave differently in the per architecture huge_pte_alloc() according to different VMA attributes. Pass it deeper into huge_pmd_share() so that we can avoid the find_vma() call. [peterx@redhat.com: build fix] Link: https://lkml.kernel.org/r/20210304164653.GB397383@xz-x1Link: https://lkml.kernel.org/r/20210218230633.15028-1-peterx@redhat.com Link: https://lkml.kernel.org/r/20210218230633.15028-2-peterx@redhat.com Signed-off-by: Peter Xu Suggested-by: Mike Kravetz Cc: Adam Ruprecht Cc: Alexander Viro Cc: Alexey Dobriyan Cc: Andrea Arcangeli Cc: Anshuman Khandual Cc: Axel Rasmussen Cc: Cannon Matthews Cc: Catalin Marinas Cc: Chinwen Chang Cc: David Rientjes Cc: "Dr . David Alan Gilbert" Cc: Huang Ying Cc: Ingo Molnar Cc: Jann Horn Cc: Jerome Glisse Cc: Kirill A. Shutemov Cc: Lokesh Gidra Cc: "Matthew Wilcox (Oracle)" Cc: Michael Ellerman Cc: "Michal Koutn" Cc: Michel Lespinasse Cc: Mike Rapoport Cc: Mina Almasry Cc: Nicholas Piggin Cc: Oliver Upton Cc: Shaohua Li Cc: Shawn Anastasio Cc: Steven Price Cc: Steven Rostedt Cc: Vlastimil Babka Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/hugetlb.h | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h index cccd1aab69dd..653ef322fac9 100644 --- a/include/linux/hugetlb.h +++ b/include/linux/hugetlb.h @@ -152,7 +152,8 @@ void hugetlb_fix_reserve_counts(struct inode *inode); extern struct mutex *hugetlb_fault_mutex_table; u32 hugetlb_fault_mutex_hash(struct address_space *mapping, pgoff_t idx); -pte_t *huge_pmd_share(struct mm_struct *mm, unsigned long addr, pud_t *pud); +pte_t *huge_pmd_share(struct mm_struct *mm, struct vm_area_struct *vma, + unsigned long addr, pud_t *pud); struct address_space *hugetlb_page_mapping_lock_write(struct page *hpage); @@ -161,7 +162,7 @@ extern struct list_head huge_boot_pages; /* arch callbacks */ -pte_t *huge_pte_alloc(struct mm_struct *mm, +pte_t *huge_pte_alloc(struct mm_struct *mm, struct vm_area_struct *vma, unsigned long addr, unsigned long sz); pte_t *huge_pte_offset(struct mm_struct *mm, unsigned long addr, unsigned long sz); -- cgit v1.2.3 From c1991e0705d143be773c984b006f2078aa9f2853 Mon Sep 17 00:00:00 2001 From: Peter Xu Date: Tue, 4 May 2021 18:33:04 -0700 Subject: hugetlb/userfaultfd: forbid huge pmd sharing when uffd enabled Huge pmd sharing could bring problem to userfaultfd. The thing is that userfaultfd is running its logic based on the special bits on page table entries, however the huge pmd sharing could potentially share page table entries for different address ranges. That could cause issues on either: - When sharing huge pmd page tables for an uffd write protected range, the newly mapped huge pmd range will also be write protected unexpectedly, or, - When we try to write protect a range of huge pmd shared range, we'll first do huge_pmd_unshare() in hugetlb_change_protection(), however that also means the UFFDIO_WRITEPROTECT could be silently skipped for the shared region, which could lead to data loss. While at it, a few other things are done altogether: - Move want_pmd_share() from mm/hugetlb.c into linux/hugetlb.h, because that's definitely something that arch code would like to use too - ARM64 currently directly check against CONFIG_ARCH_WANT_HUGE_PMD_SHARE when trying to share huge pmd. Switch to the want_pmd_share() helper. - Move vma_shareable() from huge_pmd_share() into want_pmd_share(). [peterx@redhat.com: fix build with !ARCH_WANT_HUGE_PMD_SHARE] Link: https://lkml.kernel.org/r/20210310185359.88297-1-peterx@redhat.com Link: https://lkml.kernel.org/r/20210218231202.15426-1-peterx@redhat.com Signed-off-by: Peter Xu Reviewed-by: Mike Kravetz Reviewed-by: Axel Rasmussen Tested-by: Naresh Kamboju Cc: Adam Ruprecht Cc: Alexander Viro Cc: Alexey Dobriyan Cc: Andrea Arcangeli Cc: Anshuman Khandual Cc: Cannon Matthews Cc: Catalin Marinas Cc: Chinwen Chang Cc: David Rientjes Cc: "Dr . David Alan Gilbert" Cc: Huang Ying Cc: Ingo Molnar Cc: Jann Horn Cc: Jerome Glisse Cc: Kirill A. Shutemov Cc: Lokesh Gidra Cc: "Matthew Wilcox (Oracle)" Cc: Michael Ellerman Cc: "Michal Koutn" Cc: Michel Lespinasse Cc: Mike Rapoport Cc: Mina Almasry Cc: Nicholas Piggin Cc: Oliver Upton Cc: Shaohua Li Cc: Shawn Anastasio Cc: Steven Price Cc: Steven Rostedt Cc: Vlastimil Babka Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/hugetlb.h | 2 ++ include/linux/userfaultfd_k.h | 9 +++++++++ 2 files changed, 11 insertions(+) (limited to 'include') diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h index 653ef322fac9..88e93809a455 100644 --- a/include/linux/hugetlb.h +++ b/include/linux/hugetlb.h @@ -1040,4 +1040,6 @@ static inline __init void hugetlb_cma_check(void) } #endif +bool want_pmd_share(struct vm_area_struct *vma, unsigned long addr); + #endif /* _LINUX_HUGETLB_H */ diff --git a/include/linux/userfaultfd_k.h b/include/linux/userfaultfd_k.h index a8e5f3ea9bb2..c63ccdae3eab 100644 --- a/include/linux/userfaultfd_k.h +++ b/include/linux/userfaultfd_k.h @@ -52,6 +52,15 @@ static inline bool is_mergeable_vm_userfaultfd_ctx(struct vm_area_struct *vma, return vma->vm_userfaultfd_ctx.ctx == vm_ctx.ctx; } +/* + * Never enable huge pmd sharing on uffd-wp registered vmas, because uffd-wp + * protect information is per pgtable entry. + */ +static inline bool uffd_disable_huge_pmd_share(struct vm_area_struct *vma) +{ + return vma->vm_flags & VM_UFFD_WP; +} + static inline bool userfaultfd_missing(struct vm_area_struct *vma) { return vma->vm_flags & VM_UFFD_MISSING; -- cgit v1.2.3 From 537cf30bba241ae88d5f4b0b6a5e66271b394852 Mon Sep 17 00:00:00 2001 From: Peter Xu Date: Tue, 4 May 2021 18:33:08 -0700 Subject: mm/hugetlb: move flush_hugetlb_tlb_range() into hugetlb.h Prepare for it to be called outside of mm/hugetlb.c. Link: https://lkml.kernel.org/r/20210218231204.15474-1-peterx@redhat.com Signed-off-by: Peter Xu Reviewed-by: Mike Kravetz Reviewed-by: Axel Rasmussen Cc: Adam Ruprecht Cc: Alexander Viro Cc: Alexey Dobriyan Cc: Andrea Arcangeli Cc: Anshuman Khandual Cc: Cannon Matthews Cc: Catalin Marinas Cc: Chinwen Chang Cc: David Rientjes Cc: "Dr . David Alan Gilbert" Cc: Huang Ying Cc: Ingo Molnar Cc: Jann Horn Cc: Jerome Glisse Cc: Kirill A. Shutemov Cc: Lokesh Gidra Cc: "Matthew Wilcox (Oracle)" Cc: Michael Ellerman Cc: "Michal Koutn" Cc: Michel Lespinasse Cc: Mike Rapoport Cc: Mina Almasry Cc: Nicholas Piggin Cc: Oliver Upton Cc: Shaohua Li Cc: Shawn Anastasio Cc: Steven Price Cc: Steven Rostedt Cc: Vlastimil Babka Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/hugetlb.h | 8 ++++++++ 1 file changed, 8 insertions(+) (limited to 'include') diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h index 88e93809a455..e43668144664 100644 --- a/include/linux/hugetlb.h +++ b/include/linux/hugetlb.h @@ -1042,4 +1042,12 @@ static inline __init void hugetlb_cma_check(void) bool want_pmd_share(struct vm_area_struct *vma, unsigned long addr); +#ifndef __HAVE_ARCH_FLUSH_HUGETLB_TLB_RANGE +/* + * ARCHes with special requirements for evicting HUGETLB backing TLB entries can + * implement this. + */ +#define flush_hugetlb_tlb_range(vma, addr, end) flush_tlb_range(vma, addr, end) +#endif + #endif /* _LINUX_HUGETLB_H */ -- cgit v1.2.3 From 6dfeaff93be1a4cab4fb48dad7df326d05059a99 Mon Sep 17 00:00:00 2001 From: Peter Xu Date: Tue, 4 May 2021 18:33:13 -0700 Subject: hugetlb/userfaultfd: unshare all pmds for hugetlbfs when register wp Huge pmd sharing for hugetlbfs is racy with userfaultfd-wp because userfaultfd-wp is always based on pgtable entries, so they cannot be shared. Walk the hugetlb range and unshare all such mappings if there is, right before UFFDIO_REGISTER will succeed and return to userspace. This will pair with want_pmd_share() in hugetlb code so that huge pmd sharing is completely disabled for userfaultfd-wp registered range. Link: https://lkml.kernel.org/r/20210218231206.15524-1-peterx@redhat.com Signed-off-by: Peter Xu Reviewed-by: Mike Kravetz Cc: Peter Xu Cc: Andrea Arcangeli Cc: Axel Rasmussen Cc: Mike Rapoport Cc: Kirill A. Shutemov Cc: Matthew Wilcox (Oracle) Cc: Adam Ruprecht Cc: Alexander Viro Cc: Alexey Dobriyan Cc: Anshuman Khandual Cc: Cannon Matthews Cc: Catalin Marinas Cc: Chinwen Chang Cc: David Rientjes Cc: "Dr . David Alan Gilbert" Cc: Huang Ying Cc: Ingo Molnar Cc: Jann Horn Cc: Jerome Glisse Cc: Lokesh Gidra Cc: Michael Ellerman Cc: "Michal Koutn" Cc: Michel Lespinasse Cc: Mina Almasry Cc: Nicholas Piggin Cc: Oliver Upton Cc: Shaohua Li Cc: Shawn Anastasio Cc: Steven Price Cc: Steven Rostedt Cc: Vlastimil Babka Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/hugetlb.h | 3 +++ 1 file changed, 3 insertions(+) (limited to 'include') diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h index e43668144664..0f5813522224 100644 --- a/include/linux/hugetlb.h +++ b/include/linux/hugetlb.h @@ -188,6 +188,7 @@ unsigned long hugetlb_change_protection(struct vm_area_struct *vma, unsigned long address, unsigned long end, pgprot_t newprot); bool is_hugetlb_entry_migration(pte_t pte); +void hugetlb_unshare_all_pmds(struct vm_area_struct *vma); #else /* !CONFIG_HUGETLB_PAGE */ @@ -369,6 +370,8 @@ static inline vm_fault_t hugetlb_fault(struct mm_struct *mm, return 0; } +static inline void hugetlb_unshare_all_pmds(struct vm_area_struct *vma) { } + #endif /* !CONFIG_HUGETLB_PAGE */ /* * hugepages at page global directory. If arch support -- cgit v1.2.3 From d4afd60c24f87b6275b12ec3d67d8c2ad78cb075 Mon Sep 17 00:00:00 2001 From: Miaohe Lin Date: Tue, 4 May 2021 18:34:05 -0700 Subject: mm/huge_memory.c: remove unused macro TRANSPARENT_HUGEPAGE_DEBUG_COW_FLAG Commit 4958e4d86ecb ("mm: thp: remove debug_cow switch") forgot to remove TRANSPARENT_HUGEPAGE_DEBUG_COW_FLAG macro. Remove it here. Link: https://lkml.kernel.org/r/20210318122722.13135-6-linmiaohe@huawei.com Signed-off-by: Miaohe Lin Reviewed-by: Zi Yan Reviewed-by: Peter Xu Cc: Aneesh Kumar K.V Cc: Matthew Wilcox Cc: Michel Lespinasse Cc: Ralph Campbell Cc: Thomas Hellstrm (Intel) Cc: Vlastimil Babka Cc: Wei Yang Cc: William Kucharski Cc: Yang Shi Cc: yuleixzhang Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/huge_mm.h | 3 --- 1 file changed, 3 deletions(-) (limited to 'include') diff --git a/include/linux/huge_mm.h b/include/linux/huge_mm.h index ba973efcd369..9626fda5efce 100644 --- a/include/linux/huge_mm.h +++ b/include/linux/huge_mm.h @@ -87,9 +87,6 @@ enum transparent_hugepage_flag { TRANSPARENT_HUGEPAGE_DEFRAG_REQ_MADV_FLAG, TRANSPARENT_HUGEPAGE_DEFRAG_KHUGEPAGED_FLAG, TRANSPARENT_HUGEPAGE_USE_ZERO_PAGE_FLAG, -#ifdef CONFIG_DEBUG_VM - TRANSPARENT_HUGEPAGE_DEBUG_COW_FLAG, -#endif }; struct kobject; -- cgit v1.2.3 From 2938396771c8fd0870b5284319f9e78b4b552a79 Mon Sep 17 00:00:00 2001 From: Mike Kravetz Date: Tue, 4 May 2021 18:34:52 -0700 Subject: hugetlb: add per-hstate mutex to synchronize user adjustments The helper routine hstate_next_node_to_alloc accesses and modifies the hstate variable next_nid_to_alloc. The helper is used by the routines alloc_pool_huge_page and adjust_pool_surplus. adjust_pool_surplus is called with hugetlb_lock held. However, alloc_pool_huge_page can not be called with the hugetlb lock held as it will call the page allocator. Two instances of alloc_pool_huge_page could be run in parallel or alloc_pool_huge_page could run in parallel with adjust_pool_surplus which may result in the variable next_nid_to_alloc becoming invalid for the caller and pages being allocated on the wrong node. Both alloc_pool_huge_page and adjust_pool_surplus are only called from the routine set_max_huge_pages after boot. set_max_huge_pages is only called as the reusult of a user writing to the proc/sysfs nr_hugepages, or nr_hugepages_mempolicy file to adjust the number of hugetlb pages. It makes little sense to allow multiple adjustment to the number of hugetlb pages in parallel. Add a mutex to the hstate and use it to only allow one hugetlb page adjustment at a time. This will synchronize modifications to the next_nid_to_alloc variable. Link: https://lkml.kernel.org/r/20210409205254.242291-4-mike.kravetz@oracle.com Signed-off-by: Mike Kravetz Acked-by: Michal Hocko Reviewed-by: Oscar Salvador Reviewed-by: Miaohe Lin Reviewed-by: Muchun Song Reviewed-by: David Hildenbrand Cc: "Aneesh Kumar K . V" Cc: Barry Song Cc: David Rientjes Cc: Hillf Danton Cc: HORIGUCHI NAOYA Cc: Joonsoo Kim Cc: Matthew Wilcox Cc: Mina Almasry Cc: Peter Xu Cc: Peter Zijlstra Cc: Roman Gushchin Cc: Shakeel Butt Cc: Waiman Long Cc: Will Deacon Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/hugetlb.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include') diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h index 0f5813522224..628639422c5d 100644 --- a/include/linux/hugetlb.h +++ b/include/linux/hugetlb.h @@ -559,6 +559,7 @@ HPAGEFLAG(Freed, freed) #define HSTATE_NAME_LEN 32 /* Defines one hugetlb page size */ struct hstate { + struct mutex resize_lock; int next_nid_to_alloc; int next_nid_to_free; unsigned int order; -- cgit v1.2.3 From 369fa227c21949b22fd7374506c4992a0d7bb580 Mon Sep 17 00:00:00 2001 From: Oscar Salvador Date: Tue, 4 May 2021 18:35:26 -0700 Subject: mm: make alloc_contig_range handle free hugetlb pages alloc_contig_range will fail if it ever sees a HugeTLB page within the range we are trying to allocate, even when that page is free and can be easily reallocated. This has proved to be problematic for some users of alloc_contic_range, e.g: CMA and virtio-mem, where those would fail the call even when those pages lay in ZONE_MOVABLE and are free. We can do better by trying to replace such page. Free hugepages are tricky to handle so as to no userspace application notices disruption, we need to replace the current free hugepage with a new one. In order to do that, a new function called alloc_and_dissolve_huge_page is introduced. This function will first try to get a new fresh hugepage, and if it succeeds, it will replace the old one in the free hugepage pool. The free page replacement is done under hugetlb_lock, so no external users of hugetlb will notice the change. To allocate the new huge page, we use alloc_buddy_huge_page(), so we do not have to deal with any counters, and prep_new_huge_page() is not called. This is valulable because in case we need to free the new page, we only need to call __free_pages(). Once we know that the page to be replaced is a genuine 0-refcounted huge page, we remove the old page from the freelist by remove_hugetlb_page(). Then, we can call __prep_new_huge_page() and __prep_account_new_huge_page() for the new huge page to properly initialize it and increment the hstate->nr_huge_pages counter (previously decremented by remove_hugetlb_page()). Once done, the page is enqueued by enqueue_huge_page() and it is ready to be used. There is one tricky case when page's refcount is 0 because it is in the process of being released. A missing PageHugeFreed bit will tell us that freeing is in flight so we retry after dropping the hugetlb_lock. The race window should be small and the next retry should make a forward progress. E.g: CPU0 CPU1 free_huge_page() isolate_or_dissolve_huge_page PageHuge() == T alloc_and_dissolve_huge_page alloc_buddy_huge_page() spin_lock_irq(hugetlb_lock) // PageHuge() && !PageHugeFreed && // !PageCount() spin_unlock_irq(hugetlb_lock) spin_lock_irq(hugetlb_lock) 1) update_and_free_page PageHuge() == F __free_pages() 2) enqueue_huge_page SetPageHugeFreed() spin_unlock_irq(&hugetlb_lock) spin_lock_irq(hugetlb_lock) 1) PageHuge() == F (freed by case#1 from CPU0) 2) PageHuge() == T PageHugeFreed() == T - proceed with replacing the page In the case above we retry as the window race is quite small and we have high chances to succeed next time. With regard to the allocation, we restrict it to the node the page belongs to with __GFP_THISNODE, meaning we do not fallback on other node's zones. Note that gigantic hugetlb pages are fenced off since there is a cyclic dependency between them and alloc_contig_range. Link: https://lkml.kernel.org/r/20210419075413.1064-6-osalvador@suse.de Signed-off-by: Oscar Salvador Acked-by: Michal Hocko Acked-by: David Hildenbrand Reviewed-by: Mike Kravetz Cc: Muchun Song Cc: Vlastimil Babka Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/hugetlb.h | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'include') diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h index 628639422c5d..ec6a10b8860a 100644 --- a/include/linux/hugetlb.h +++ b/include/linux/hugetlb.h @@ -588,6 +588,7 @@ struct huge_bootmem_page { struct hstate *hstate; }; +int isolate_or_dissolve_huge_page(struct page *page); struct page *alloc_huge_page(struct vm_area_struct *vma, unsigned long addr, int avoid_reserve); struct page *alloc_huge_page_nodemask(struct hstate *h, int preferred_nid, @@ -870,6 +871,11 @@ static inline void huge_ptep_modify_prot_commit(struct vm_area_struct *vma, #else /* CONFIG_HUGETLB_PAGE */ struct hstate {}; +static inline int isolate_or_dissolve_huge_page(struct page *page) +{ + return -ENOMEM; +} + static inline struct page *alloc_huge_page(struct vm_area_struct *vma, unsigned long addr, int avoid_reserve) -- cgit v1.2.3 From ae37c7ff79f1f030e28ec76c46ee032f8fd07607 Mon Sep 17 00:00:00 2001 From: Oscar Salvador Date: Tue, 4 May 2021 18:35:29 -0700 Subject: mm: make alloc_contig_range handle in-use hugetlb pages alloc_contig_range() will fail if it finds a HugeTLB page within the range, without a chance to handle them. Since HugeTLB pages can be migrated as any LRU or Movable page, it does not make sense to bail out without trying. Enable the interface to recognize in-use HugeTLB pages so we can migrate them, and have much better chances to succeed the call. Link: https://lkml.kernel.org/r/20210419075413.1064-7-osalvador@suse.de Signed-off-by: Oscar Salvador Reviewed-by: Mike Kravetz Acked-by: Michal Hocko Acked-by: David Hildenbrand Cc: Muchun Song Cc: Vlastimil Babka Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/hugetlb.h | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h index ec6a10b8860a..d0f310ae3f82 100644 --- a/include/linux/hugetlb.h +++ b/include/linux/hugetlb.h @@ -588,7 +588,7 @@ struct huge_bootmem_page { struct hstate *hstate; }; -int isolate_or_dissolve_huge_page(struct page *page); +int isolate_or_dissolve_huge_page(struct page *page, struct list_head *list); struct page *alloc_huge_page(struct vm_area_struct *vma, unsigned long addr, int avoid_reserve); struct page *alloc_huge_page_nodemask(struct hstate *h, int preferred_nid, @@ -871,7 +871,8 @@ static inline void huge_ptep_modify_prot_commit(struct vm_area_struct *vma, #else /* CONFIG_HUGETLB_PAGE */ struct hstate {}; -static inline int isolate_or_dissolve_huge_page(struct page *page) +static inline int isolate_or_dissolve_huge_page(struct page *page, + struct list_head *list) { return -ENOMEM; } -- cgit v1.2.3 From 7677f7fd8be76659cd2d0db8ff4093bbb51c20e5 Mon Sep 17 00:00:00 2001 From: Axel Rasmussen Date: Tue, 4 May 2021 18:35:36 -0700 Subject: userfaultfd: add minor fault registration mode Patch series "userfaultfd: add minor fault handling", v9. Overview ======== This series adds a new userfaultfd feature, UFFD_FEATURE_MINOR_HUGETLBFS. When enabled (via the UFFDIO_API ioctl), this feature means that any hugetlbfs VMAs registered with UFFDIO_REGISTER_MODE_MISSING will *also* get events for "minor" faults. By "minor" fault, I mean the following situation: Let there exist two mappings (i.e., VMAs) to the same page(s) (shared memory). One of the mappings is registered with userfaultfd (in minor mode), and the other is not. Via the non-UFFD mapping, the underlying pages have already been allocated & filled with some contents. The UFFD mapping has not yet been faulted in; when it is touched for the first time, this results in what I'm calling a "minor" fault. As a concrete example, when working with hugetlbfs, we have huge_pte_none(), but find_lock_page() finds an existing page. We also add a new ioctl to resolve such faults: UFFDIO_CONTINUE. The idea is, userspace resolves the fault by either a) doing nothing if the contents are already correct, or b) updating the underlying contents using the second, non-UFFD mapping (via memcpy/memset or similar, or something fancier like RDMA, or etc...). In either case, userspace issues UFFDIO_CONTINUE to tell the kernel "I have ensured the page contents are correct, carry on setting up the mapping". Use Case ======== Consider the use case of VM live migration (e.g. under QEMU/KVM): 1. While a VM is still running, we copy the contents of its memory to a target machine. The pages are populated on the target by writing to the non-UFFD mapping, using the setup described above. The VM is still running (and therefore its memory is likely changing), so this may be repeated several times, until we decide the target is "up to date enough". 2. We pause the VM on the source, and start executing on the target machine. During this gap, the VM's user(s) will *see* a pause, so it is desirable to minimize this window. 3. Between the last time any page was copied from the source to the target, and when the VM was paused, the contents of that page may have changed - and therefore the copy we have on the target machine is out of date. Although we can keep track of which pages are out of date, for VMs with large amounts of memory, it is "slow" to transfer this information to the target machine. We want to resume execution before such a transfer would complete. 4. So, the guest begins executing on the target machine. The first time it touches its memory (via the UFFD-registered mapping), userspace wants to intercept this fault. Userspace checks whether or not the page is up to date, and if not, copies the updated page from the source machine, via the non-UFFD mapping. Finally, whether a copy was performed or not, userspace issues a UFFDIO_CONTINUE ioctl to tell the kernel "I have ensured the page contents are correct, carry on setting up the mapping". We don't have to do all of the final updates on-demand. The userfaultfd manager can, in the background, also copy over updated pages once it receives the map of which pages are up-to-date or not. Interaction with Existing APIs ============================== Because this is a feature, a registered VMA could potentially receive both missing and minor faults. I spent some time thinking through how the existing API interacts with the new feature: UFFDIO_CONTINUE cannot be used to resolve non-minor faults, as it does not allocate a new page. If UFFDIO_CONTINUE is used on a non-minor fault: - For non-shared memory or shmem, -EINVAL is returned. - For hugetlb, -EFAULT is returned. UFFDIO_COPY and UFFDIO_ZEROPAGE cannot be used to resolve minor faults. Without modifications, the existing codepath assumes a new page needs to be allocated. This is okay, since userspace must have a second non-UFFD-registered mapping anyway, thus there isn't much reason to want to use these in any case (just memcpy or memset or similar). - If UFFDIO_COPY is used on a minor fault, -EEXIST is returned. - If UFFDIO_ZEROPAGE is used on a minor fault, -EEXIST is returned (or -EINVAL in the case of hugetlb, as UFFDIO_ZEROPAGE is unsupported in any case). - UFFDIO_WRITEPROTECT simply doesn't work with shared memory, and returns -ENOENT in that case (regardless of the kind of fault). Future Work =========== This series only supports hugetlbfs. I have a second series in flight to support shmem as well, extending the functionality. This series is more mature than the shmem support at this point, and the functionality works fully on hugetlbfs, so this series can be merged first and then shmem support will follow. This patch (of 6): This feature allows userspace to intercept "minor" faults. By "minor" faults, I mean the following situation: Let there exist two mappings (i.e., VMAs) to the same page(s). One of the mappings is registered with userfaultfd (in minor mode), and the other is not. Via the non-UFFD mapping, the underlying pages have already been allocated & filled with some contents. The UFFD mapping has not yet been faulted in; when it is touched for the first time, this results in what I'm calling a "minor" fault. As a concrete example, when working with hugetlbfs, we have huge_pte_none(), but find_lock_page() finds an existing page. This commit adds the new registration mode, and sets the relevant flag on the VMAs being registered. In the hugetlb fault path, if we find that we have huge_pte_none(), but find_lock_page() does indeed find an existing page, then we have a "minor" fault, and if the VMA has the userfaultfd registration flag, we call into userfaultfd to handle it. This is implemented as a new registration mode, instead of an API feature. This is because the alternative implementation has significant drawbacks [1]. However, doing it this was requires we allocate a VM_* flag for the new registration mode. On 32-bit systems, there are no unused bits, so this feature is only supported on architectures with CONFIG_ARCH_USES_HIGH_VMA_FLAGS. When attempting to register a VMA in MINOR mode on 32-bit architectures, we return -EINVAL. [1] https://lore.kernel.org/patchwork/patch/1380226/ [peterx@redhat.com: fix minor fault page leak] Link: https://lkml.kernel.org/r/20210322175132.36659-1-peterx@redhat.com Link: https://lkml.kernel.org/r/20210301222728.176417-1-axelrasmussen@google.com Link: https://lkml.kernel.org/r/20210301222728.176417-2-axelrasmussen@google.com Signed-off-by: Axel Rasmussen Reviewed-by: Peter Xu Reviewed-by: Mike Kravetz Cc: Alexander Viro Cc: Alexey Dobriyan Cc: Andrea Arcangeli Cc: Anshuman Khandual Cc: Catalin Marinas Cc: Chinwen Chang Cc: Huang Ying Cc: Ingo Molnar Cc: Jann Horn Cc: Jerome Glisse Cc: Lokesh Gidra Cc: "Matthew Wilcox (Oracle)" Cc: Michael Ellerman Cc: "Michal Koutn" Cc: Michel Lespinasse Cc: Mike Rapoport Cc: Nicholas Piggin Cc: Peter Xu Cc: Shaohua Li Cc: Shawn Anastasio Cc: Steven Rostedt Cc: Steven Price Cc: Vlastimil Babka Cc: Adam Ruprecht Cc: Axel Rasmussen Cc: Cannon Matthews Cc: "Dr . David Alan Gilbert" Cc: David Rientjes Cc: Mina Almasry Cc: Oliver Upton Cc: Kirill A. Shutemov Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/mm.h | 7 +++++++ include/linux/userfaultfd_k.h | 15 ++++++++++++++- include/trace/events/mmflags.h | 7 +++++++ include/uapi/linux/userfaultfd.h | 15 +++++++++++++-- 4 files changed, 41 insertions(+), 3 deletions(-) (limited to 'include') diff --git a/include/linux/mm.h b/include/linux/mm.h index 011f43605807..1dbb53c44243 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -372,6 +372,13 @@ extern unsigned int kobjsize(const void *objp); # define VM_GROWSUP VM_NONE #endif +#ifdef CONFIG_HAVE_ARCH_USERFAULTFD_MINOR +# define VM_UFFD_MINOR_BIT 37 +# define VM_UFFD_MINOR BIT(VM_UFFD_MINOR_BIT) /* UFFD minor faults */ +#else /* !CONFIG_HAVE_ARCH_USERFAULTFD_MINOR */ +# define VM_UFFD_MINOR VM_NONE +#endif /* CONFIG_HAVE_ARCH_USERFAULTFD_MINOR */ + /* Bits set in the VMA until the stack is in its final location */ #define VM_STACK_INCOMPLETE_SETUP (VM_RAND_READ | VM_SEQ_READ) diff --git a/include/linux/userfaultfd_k.h b/include/linux/userfaultfd_k.h index c63ccdae3eab..0390e5ac63b3 100644 --- a/include/linux/userfaultfd_k.h +++ b/include/linux/userfaultfd_k.h @@ -17,6 +17,9 @@ #include #include +/* The set of all possible UFFD-related VM flags. */ +#define __VM_UFFD_FLAGS (VM_UFFD_MISSING | VM_UFFD_WP | VM_UFFD_MINOR) + /* * CAREFUL: Check include/uapi/asm-generic/fcntl.h when defining * new flags, since they might collide with O_* ones. We want @@ -71,6 +74,11 @@ static inline bool userfaultfd_wp(struct vm_area_struct *vma) return vma->vm_flags & VM_UFFD_WP; } +static inline bool userfaultfd_minor(struct vm_area_struct *vma) +{ + return vma->vm_flags & VM_UFFD_MINOR; +} + static inline bool userfaultfd_pte_wp(struct vm_area_struct *vma, pte_t pte) { @@ -85,7 +93,7 @@ static inline bool userfaultfd_huge_pmd_wp(struct vm_area_struct *vma, static inline bool userfaultfd_armed(struct vm_area_struct *vma) { - return vma->vm_flags & (VM_UFFD_MISSING | VM_UFFD_WP); + return vma->vm_flags & __VM_UFFD_FLAGS; } extern int dup_userfaultfd(struct vm_area_struct *, struct list_head *); @@ -132,6 +140,11 @@ static inline bool userfaultfd_wp(struct vm_area_struct *vma) return false; } +static inline bool userfaultfd_minor(struct vm_area_struct *vma) +{ + return false; +} + static inline bool userfaultfd_pte_wp(struct vm_area_struct *vma, pte_t pte) { diff --git a/include/trace/events/mmflags.h b/include/trace/events/mmflags.h index 67018d367b9f..629c7a0eaff2 100644 --- a/include/trace/events/mmflags.h +++ b/include/trace/events/mmflags.h @@ -137,6 +137,12 @@ IF_HAVE_PG_ARCH_2(PG_arch_2, "arch_2" ) #define IF_HAVE_VM_SOFTDIRTY(flag,name) #endif +#ifdef CONFIG_HAVE_ARCH_USERFAULTFD_MINOR +# define IF_HAVE_UFFD_MINOR(flag, name) {flag, name}, +#else +# define IF_HAVE_UFFD_MINOR(flag, name) +#endif + #define __def_vmaflag_names \ {VM_READ, "read" }, \ {VM_WRITE, "write" }, \ @@ -148,6 +154,7 @@ IF_HAVE_PG_ARCH_2(PG_arch_2, "arch_2" ) {VM_MAYSHARE, "mayshare" }, \ {VM_GROWSDOWN, "growsdown" }, \ {VM_UFFD_MISSING, "uffd_missing" }, \ +IF_HAVE_UFFD_MINOR(VM_UFFD_MINOR, "uffd_minor" ) \ {VM_PFNMAP, "pfnmap" }, \ {VM_DENYWRITE, "denywrite" }, \ {VM_UFFD_WP, "uffd_wp" }, \ diff --git a/include/uapi/linux/userfaultfd.h b/include/uapi/linux/userfaultfd.h index 5f2d88212f7c..f24dd4fcbad9 100644 --- a/include/uapi/linux/userfaultfd.h +++ b/include/uapi/linux/userfaultfd.h @@ -19,15 +19,19 @@ * means the userland is reading). */ #define UFFD_API ((__u64)0xAA) +#define UFFD_API_REGISTER_MODES (UFFDIO_REGISTER_MODE_MISSING | \ + UFFDIO_REGISTER_MODE_WP | \ + UFFDIO_REGISTER_MODE_MINOR) #define UFFD_API_FEATURES (UFFD_FEATURE_PAGEFAULT_FLAG_WP | \ UFFD_FEATURE_EVENT_FORK | \ UFFD_FEATURE_EVENT_REMAP | \ - UFFD_FEATURE_EVENT_REMOVE | \ + UFFD_FEATURE_EVENT_REMOVE | \ UFFD_FEATURE_EVENT_UNMAP | \ UFFD_FEATURE_MISSING_HUGETLBFS | \ UFFD_FEATURE_MISSING_SHMEM | \ UFFD_FEATURE_SIGBUS | \ - UFFD_FEATURE_THREAD_ID) + UFFD_FEATURE_THREAD_ID | \ + UFFD_FEATURE_MINOR_HUGETLBFS) #define UFFD_API_IOCTLS \ ((__u64)1 << _UFFDIO_REGISTER | \ (__u64)1 << _UFFDIO_UNREGISTER | \ @@ -127,6 +131,7 @@ struct uffd_msg { /* flags for UFFD_EVENT_PAGEFAULT */ #define UFFD_PAGEFAULT_FLAG_WRITE (1<<0) /* If this was a write fault */ #define UFFD_PAGEFAULT_FLAG_WP (1<<1) /* If reason is VM_UFFD_WP */ +#define UFFD_PAGEFAULT_FLAG_MINOR (1<<2) /* If reason is VM_UFFD_MINOR */ struct uffdio_api { /* userland asks for an API number and the features to enable */ @@ -171,6 +176,10 @@ struct uffdio_api { * * UFFD_FEATURE_THREAD_ID pid of the page faulted task_struct will * be returned, if feature is not requested 0 will be returned. + * + * UFFD_FEATURE_MINOR_HUGETLBFS indicates that minor faults + * can be intercepted (via REGISTER_MODE_MINOR) for + * hugetlbfs-backed pages. */ #define UFFD_FEATURE_PAGEFAULT_FLAG_WP (1<<0) #define UFFD_FEATURE_EVENT_FORK (1<<1) @@ -181,6 +190,7 @@ struct uffdio_api { #define UFFD_FEATURE_EVENT_UNMAP (1<<6) #define UFFD_FEATURE_SIGBUS (1<<7) #define UFFD_FEATURE_THREAD_ID (1<<8) +#define UFFD_FEATURE_MINOR_HUGETLBFS (1<<9) __u64 features; __u64 ioctls; @@ -195,6 +205,7 @@ struct uffdio_register { struct uffdio_range range; #define UFFDIO_REGISTER_MODE_MISSING ((__u64)1<<0) #define UFFDIO_REGISTER_MODE_WP ((__u64)1<<1) +#define UFFDIO_REGISTER_MODE_MINOR ((__u64)1<<2) __u64 mode; /* -- cgit v1.2.3 From 0d9cadabd193c6008d256533f544de8206fd3a80 Mon Sep 17 00:00:00 2001 From: Axel Rasmussen Date: Tue, 4 May 2021 18:35:40 -0700 Subject: userfaultfd: disable huge PMD sharing for MINOR registered VMAs As the comment says: for the MINOR fault use case, although the page might be present and populated in the other (non-UFFD-registered) half of the mapping, it may be out of date, and we explicitly want userspace to get a minor fault so it can check and potentially update the page's contents. Huge PMD sharing would prevent these faults from occurring for suitably aligned areas, so disable it upon UFFD registration. Link: https://lkml.kernel.org/r/20210301222728.176417-3-axelrasmussen@google.com Signed-off-by: Axel Rasmussen Reviewed-by: Peter Xu Reviewed-by: Mike Kravetz Cc: Adam Ruprecht Cc: Alexander Viro Cc: Alexey Dobriyan Cc: Andrea Arcangeli Cc: Anshuman Khandual Cc: Cannon Matthews Cc: Catalin Marinas Cc: Chinwen Chang Cc: David Rientjes Cc: "Dr . David Alan Gilbert" Cc: Huang Ying Cc: Ingo Molnar Cc: Jann Horn Cc: Jerome Glisse Cc: Kirill A. Shutemov Cc: Lokesh Gidra Cc: "Matthew Wilcox (Oracle)" Cc: Michael Ellerman Cc: "Michal Koutn" Cc: Michel Lespinasse Cc: Mike Rapoport Cc: Mina Almasry Cc: Nicholas Piggin Cc: Oliver Upton Cc: Shaohua Li Cc: Shawn Anastasio Cc: Steven Price Cc: Steven Rostedt Cc: Vlastimil Babka Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/userfaultfd_k.h | 13 ++++++++++--- 1 file changed, 10 insertions(+), 3 deletions(-) (limited to 'include') diff --git a/include/linux/userfaultfd_k.h b/include/linux/userfaultfd_k.h index 0390e5ac63b3..e060d5f77cc5 100644 --- a/include/linux/userfaultfd_k.h +++ b/include/linux/userfaultfd_k.h @@ -56,12 +56,19 @@ static inline bool is_mergeable_vm_userfaultfd_ctx(struct vm_area_struct *vma, } /* - * Never enable huge pmd sharing on uffd-wp registered vmas, because uffd-wp - * protect information is per pgtable entry. + * Never enable huge pmd sharing on some uffd registered vmas: + * + * - VM_UFFD_WP VMAs, because write protect information is per pgtable entry. + * + * - VM_UFFD_MINOR VMAs, because otherwise we would never get minor faults for + * VMAs which share huge pmds. (If you have two mappings to the same + * underlying pages, and fault in the non-UFFD-registered one with a write, + * with huge pmd sharing this would *also* setup the second UFFD-registered + * mapping, and we'd not get minor faults.) */ static inline bool uffd_disable_huge_pmd_share(struct vm_area_struct *vma) { - return vma->vm_flags & VM_UFFD_WP; + return vma->vm_flags & (VM_UFFD_WP | VM_UFFD_MINOR); } static inline bool userfaultfd_missing(struct vm_area_struct *vma) -- cgit v1.2.3 From 714c189108244f1df579689061db1d785d92e7e2 Mon Sep 17 00:00:00 2001 From: Axel Rasmussen Date: Tue, 4 May 2021 18:35:45 -0700 Subject: userfaultfd: hugetlbfs: only compile UFFD helpers if config enabled For background, mm/userfaultfd.c provides a general mcopy_atomic implementation. But some types of memory (i.e., hugetlb and shmem) need a slightly different implementation, so they provide their own helpers for this. In other words, userfaultfd is the only caller of these functions. This patch achieves two things: 1. Don't spend time compiling code which will end up never being referenced anyway (a small build time optimization). 2. In patches later in this series, we extend the signature of these helpers with UFFD-specific state (a mode enumeration). Once this happens, we *have to* either not compile the helpers, or unconditionally define the UFFD-only state (which seems messier to me). This includes the declarations in the headers, as otherwise they'd yield warnings about implicitly defining the type of those arguments. Link: https://lkml.kernel.org/r/20210301222728.176417-4-axelrasmussen@google.com Signed-off-by: Axel Rasmussen Reviewed-by: Mike Kravetz Reviewed-by: Peter Xu Cc: Adam Ruprecht Cc: Alexander Viro Cc: Alexey Dobriyan Cc: Andrea Arcangeli Cc: Anshuman Khandual Cc: Cannon Matthews Cc: Catalin Marinas Cc: Chinwen Chang Cc: David Rientjes Cc: "Dr . David Alan Gilbert" Cc: Huang Ying Cc: Ingo Molnar Cc: Jann Horn Cc: Jerome Glisse Cc: Kirill A. Shutemov Cc: Lokesh Gidra Cc: "Matthew Wilcox (Oracle)" Cc: Michael Ellerman Cc: "Michal Koutn" Cc: Michel Lespinasse Cc: Mike Rapoport Cc: Mina Almasry Cc: Nicholas Piggin Cc: Oliver Upton Cc: Shaohua Li Cc: Shawn Anastasio Cc: Steven Price Cc: Steven Rostedt Cc: Vlastimil Babka Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/hugetlb.h | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'include') diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h index d0f310ae3f82..a1dbe4568707 100644 --- a/include/linux/hugetlb.h +++ b/include/linux/hugetlb.h @@ -134,11 +134,13 @@ void hugetlb_show_meminfo(void); unsigned long hugetlb_total_pages(void); vm_fault_t hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma, unsigned long address, unsigned int flags); +#ifdef CONFIG_USERFAULTFD int hugetlb_mcopy_atomic_pte(struct mm_struct *dst_mm, pte_t *dst_pte, struct vm_area_struct *dst_vma, unsigned long dst_addr, unsigned long src_addr, struct page **pagep); +#endif /* CONFIG_USERFAULTFD */ bool hugetlb_reserve_pages(struct inode *inode, long from, long to, struct vm_area_struct *vma, vm_flags_t vm_flags); @@ -310,6 +312,7 @@ static inline void hugetlb_free_pgd_range(struct mmu_gather *tlb, BUG(); } +#ifdef CONFIG_USERFAULTFD static inline int hugetlb_mcopy_atomic_pte(struct mm_struct *dst_mm, pte_t *dst_pte, struct vm_area_struct *dst_vma, @@ -320,6 +323,7 @@ static inline int hugetlb_mcopy_atomic_pte(struct mm_struct *dst_mm, BUG(); return 0; } +#endif /* CONFIG_USERFAULTFD */ static inline pte_t *huge_pte_offset(struct mm_struct *mm, unsigned long addr, unsigned long sz) -- cgit v1.2.3 From f619147104c8ea71e120e4936d2b68ec11a1e527 Mon Sep 17 00:00:00 2001 From: Axel Rasmussen Date: Tue, 4 May 2021 18:35:49 -0700 Subject: userfaultfd: add UFFDIO_CONTINUE ioctl This ioctl is how userspace ought to resolve "minor" userfaults. The idea is, userspace is notified that a minor fault has occurred. It might change the contents of the page using its second non-UFFD mapping, or not. Then, it calls UFFDIO_CONTINUE to tell the kernel "I have ensured the page contents are correct, carry on setting up the mapping". Note that it doesn't make much sense to use UFFDIO_{COPY,ZEROPAGE} for MINOR registered VMAs. ZEROPAGE maps the VMA to the zero page; but in the minor fault case, we already have some pre-existing underlying page. Likewise, UFFDIO_COPY isn't useful if we have a second non-UFFD mapping. We'd just use memcpy() or similar instead. It turns out hugetlb_mcopy_atomic_pte() already does very close to what we want, if an existing page is provided via `struct page **pagep`. We already special-case the behavior a bit for the UFFDIO_ZEROPAGE case, so just extend that design: add an enum for the three modes of operation, and make the small adjustments needed for the MCOPY_ATOMIC_CONTINUE case. (Basically, look up the existing page, and avoid adding the existing page to the page cache or calling set_page_huge_active() on it.) Link: https://lkml.kernel.org/r/20210301222728.176417-5-axelrasmussen@google.com Signed-off-by: Axel Rasmussen Reviewed-by: Peter Xu Cc: Adam Ruprecht Cc: Alexander Viro Cc: Alexey Dobriyan Cc: Andrea Arcangeli Cc: Anshuman Khandual Cc: Cannon Matthews Cc: Catalin Marinas Cc: Chinwen Chang Cc: David Rientjes Cc: "Dr . David Alan Gilbert" Cc: Huang Ying Cc: Ingo Molnar Cc: Jann Horn Cc: Jerome Glisse Cc: Kirill A. Shutemov Cc: Lokesh Gidra Cc: "Matthew Wilcox (Oracle)" Cc: Michael Ellerman Cc: "Michal Koutn" Cc: Michel Lespinasse Cc: Mike Kravetz Cc: Mike Rapoport Cc: Mina Almasry Cc: Nicholas Piggin Cc: Oliver Upton Cc: Shaohua Li Cc: Shawn Anastasio Cc: Steven Price Cc: Steven Rostedt Cc: Vlastimil Babka Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/hugetlb.h | 3 +++ include/linux/userfaultfd_k.h | 18 ++++++++++++++++++ include/uapi/linux/userfaultfd.h | 21 +++++++++++++++++++-- 3 files changed, 40 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h index a1dbe4568707..b92f25ccef58 100644 --- a/include/linux/hugetlb.h +++ b/include/linux/hugetlb.h @@ -11,6 +11,7 @@ #include #include #include +#include struct ctl_table; struct user_struct; @@ -139,6 +140,7 @@ int hugetlb_mcopy_atomic_pte(struct mm_struct *dst_mm, pte_t *dst_pte, struct vm_area_struct *dst_vma, unsigned long dst_addr, unsigned long src_addr, + enum mcopy_atomic_mode mode, struct page **pagep); #endif /* CONFIG_USERFAULTFD */ bool hugetlb_reserve_pages(struct inode *inode, long from, long to, @@ -318,6 +320,7 @@ static inline int hugetlb_mcopy_atomic_pte(struct mm_struct *dst_mm, struct vm_area_struct *dst_vma, unsigned long dst_addr, unsigned long src_addr, + enum mcopy_atomic_mode mode, struct page **pagep) { BUG(); diff --git a/include/linux/userfaultfd_k.h b/include/linux/userfaultfd_k.h index e060d5f77cc5..794d1538b8ba 100644 --- a/include/linux/userfaultfd_k.h +++ b/include/linux/userfaultfd_k.h @@ -37,6 +37,22 @@ extern int sysctl_unprivileged_userfaultfd; extern vm_fault_t handle_userfault(struct vm_fault *vmf, unsigned long reason); +/* + * The mode of operation for __mcopy_atomic and its helpers. + * + * This is almost an implementation detail (mcopy_atomic below doesn't take this + * as a parameter), but it's exposed here because memory-kind-specific + * implementations (e.g. hugetlbfs) need to know the mode of operation. + */ +enum mcopy_atomic_mode { + /* A normal copy_from_user into the destination range. */ + MCOPY_ATOMIC_NORMAL, + /* Don't copy; map the destination range to the zero page. */ + MCOPY_ATOMIC_ZEROPAGE, + /* Just install pte(s) with the existing page(s) in the page cache. */ + MCOPY_ATOMIC_CONTINUE, +}; + extern ssize_t mcopy_atomic(struct mm_struct *dst_mm, unsigned long dst_start, unsigned long src_start, unsigned long len, bool *mmap_changing, __u64 mode); @@ -44,6 +60,8 @@ extern ssize_t mfill_zeropage(struct mm_struct *dst_mm, unsigned long dst_start, unsigned long len, bool *mmap_changing); +extern ssize_t mcopy_continue(struct mm_struct *dst_mm, unsigned long dst_start, + unsigned long len, bool *mmap_changing); extern int mwriteprotect_range(struct mm_struct *dst_mm, unsigned long start, unsigned long len, bool enable_wp, bool *mmap_changing); diff --git a/include/uapi/linux/userfaultfd.h b/include/uapi/linux/userfaultfd.h index f24dd4fcbad9..bafbeb1a2624 100644 --- a/include/uapi/linux/userfaultfd.h +++ b/include/uapi/linux/userfaultfd.h @@ -40,10 +40,12 @@ ((__u64)1 << _UFFDIO_WAKE | \ (__u64)1 << _UFFDIO_COPY | \ (__u64)1 << _UFFDIO_ZEROPAGE | \ - (__u64)1 << _UFFDIO_WRITEPROTECT) + (__u64)1 << _UFFDIO_WRITEPROTECT | \ + (__u64)1 << _UFFDIO_CONTINUE) #define UFFD_API_RANGE_IOCTLS_BASIC \ ((__u64)1 << _UFFDIO_WAKE | \ - (__u64)1 << _UFFDIO_COPY) + (__u64)1 << _UFFDIO_COPY | \ + (__u64)1 << _UFFDIO_CONTINUE) /* * Valid ioctl command number range with this API is from 0x00 to @@ -59,6 +61,7 @@ #define _UFFDIO_COPY (0x03) #define _UFFDIO_ZEROPAGE (0x04) #define _UFFDIO_WRITEPROTECT (0x06) +#define _UFFDIO_CONTINUE (0x07) #define _UFFDIO_API (0x3F) /* userfaultfd ioctl ids */ @@ -77,6 +80,8 @@ struct uffdio_zeropage) #define UFFDIO_WRITEPROTECT _IOWR(UFFDIO, _UFFDIO_WRITEPROTECT, \ struct uffdio_writeprotect) +#define UFFDIO_CONTINUE _IOR(UFFDIO, _UFFDIO_CONTINUE, \ + struct uffdio_continue) /* read() structure */ struct uffd_msg { @@ -268,6 +273,18 @@ struct uffdio_writeprotect { __u64 mode; }; +struct uffdio_continue { + struct uffdio_range range; +#define UFFDIO_CONTINUE_MODE_DONTWAKE ((__u64)1<<0) + __u64 mode; + + /* + * Fields below here are written by the ioctl and must be at the end: + * the copy_from_user will not read past here. + */ + __s64 mapped; +}; + /* * Flags for the userfaultfd(2) system call itself. */ -- cgit v1.2.3 From b6676de8d7b48724d4cd3a3742c62fa525baa904 Mon Sep 17 00:00:00 2001 From: Dave Hansen Date: Tue, 4 May 2021 18:36:01 -0700 Subject: mm/vmscan: move RECLAIM* bits to uapi header It is currently not obvious that the RECLAIM_* bits are part of the uapi since they are defined in vmscan.c. Move them to a uapi header to make it obvious. This should have no functional impact. Link: https://lkml.kernel.org/r/20210219172557.08074910@viggo.jf.intel.com Signed-off-by: Dave Hansen Reviewed-by: Ben Widawsky Reviewed-by: Oscar Salvador Acked-by: David Rientjes Acked-by: Christoph Lameter Cc: Alex Shi Cc: Daniel Wagner Cc: "Tobin C. Harding" Cc: Christoph Lameter Cc: Huang Ying Cc: Dan Williams Cc: Qian Cai Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/uapi/linux/mempolicy.h | 7 +++++++ 1 file changed, 7 insertions(+) (limited to 'include') diff --git a/include/uapi/linux/mempolicy.h b/include/uapi/linux/mempolicy.h index 8948467b3992..4832fd0b5642 100644 --- a/include/uapi/linux/mempolicy.h +++ b/include/uapi/linux/mempolicy.h @@ -64,5 +64,12 @@ enum { #define MPOL_F_MOF (1 << 3) /* this policy wants migrate on fault */ #define MPOL_F_MORON (1 << 4) /* Migrate On protnone Reference On Node */ +/* + * These bit locations are exposed in the vm.zone_reclaim_mode sysctl + * ABI. New bits are OK, but existing bits can never change. + */ +#define RECLAIM_ZONE (1<<0) /* Run shrink_inactive_list on the zone */ +#define RECLAIM_WRITE (1<<1) /* Writeout pages during reclaim */ +#define RECLAIM_UNMAP (1<<2) /* Unmap pages during reclaim */ #endif /* _UAPI_LINUX_MEMPOLICY_H */ -- cgit v1.2.3 From 202e35db5e719ee8af6028183403f475e243f82d Mon Sep 17 00:00:00 2001 From: Dave Hansen Date: Tue, 4 May 2021 18:36:04 -0700 Subject: mm/vmscan: replace implicit RECLAIM_ZONE checks with explicit checks RECLAIM_ZONE was assumed to be unused because it was never explicitly used in the kernel. However, there were a number of places where it was checked implicitly by checking 'node_reclaim_mode' for a zero value. These zero checks are not great because it is not obvious what a zero mode *means* in the code. Replace them with a helper which makes it more obvious: node_reclaim_enabled(). This helper also provides a handy place to explicitly check the RECLAIM_ZONE bit itself. Check it explicitly there to make it more obvious where the bit can affect behavior. This should have no functional impact. Link: https://lkml.kernel.org/r/20210219172559.BF589C44@viggo.jf.intel.com Signed-off-by: Dave Hansen Reviewed-by: Ben Widawsky Reviewed-by: Oscar Salvador Acked-by: Christoph Lameter Acked-by: David Rientjes Cc: Alex Shi Cc: "Tobin C. Harding" Cc: Huang Ying Cc: Dan Williams Cc: Qian Cai Cc: Daniel Wagner Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/swap.h | 7 +++++++ 1 file changed, 7 insertions(+) (limited to 'include') diff --git a/include/linux/swap.h b/include/linux/swap.h index 4cc6ec3bf0ab..42191da1bdc9 100644 --- a/include/linux/swap.h +++ b/include/linux/swap.h @@ -12,6 +12,7 @@ #include #include #include +#include #include struct notifier_block; @@ -378,6 +379,12 @@ extern int sysctl_min_slab_ratio; #define node_reclaim_mode 0 #endif +static inline bool node_reclaim_enabled(void) +{ + /* Is any node_reclaim_mode bit set? */ + return node_reclaim_mode & (RECLAIM_ZONE|RECLAIM_WRITE|RECLAIM_UNMAP); +} + extern void check_move_unevictable_pages(struct pagevec *pvec); extern int kswapd_run(int nid); -- cgit v1.2.3 From 2bfd36374edd9ed7f2ebf66cacebedf7273901cb Mon Sep 17 00:00:00 2001 From: Yang Shi Date: Tue, 4 May 2021 18:36:11 -0700 Subject: mm: vmscan: consolidate shrinker_maps handling code The shrinker map management is not purely memcg specific, it is at the intersection between memory cgroup and shrinkers. It's allocation and assignment of a structure, and the only memcg bit is the map is being stored in a memcg structure. So move the shrinker_maps handling code into vmscan.c for tighter integration with shrinker code, and remove the "memcg_" prefix. There is no functional change. Link: https://lkml.kernel.org/r/20210311190845.9708-3-shy828301@gmail.com Signed-off-by: Yang Shi Acked-by: Vlastimil Babka Acked-by: Kirill Tkhai Acked-by: Roman Gushchin Reviewed-by: Shakeel Butt Cc: Dave Chinner Cc: Johannes Weiner Cc: Michal Hocko Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/memcontrol.h | 11 +++++------ 1 file changed, 5 insertions(+), 6 deletions(-) (limited to 'include') diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h index 5904716f29ba..7dbd2c9bad32 100644 --- a/include/linux/memcontrol.h +++ b/include/linux/memcontrol.h @@ -1610,10 +1610,9 @@ static inline bool mem_cgroup_under_socket_pressure(struct mem_cgroup *memcg) return false; } -extern int memcg_expand_shrinker_maps(int new_id); - -extern void memcg_set_shrinker_bit(struct mem_cgroup *memcg, - int nid, int shrinker_id); +int alloc_shrinker_maps(struct mem_cgroup *memcg); +void free_shrinker_maps(struct mem_cgroup *memcg); +void set_shrinker_bit(struct mem_cgroup *memcg, int nid, int shrinker_id); #else #define mem_cgroup_sockets_enabled 0 static inline void mem_cgroup_sk_alloc(struct sock *sk) { }; @@ -1623,8 +1622,8 @@ static inline bool mem_cgroup_under_socket_pressure(struct mem_cgroup *memcg) return false; } -static inline void memcg_set_shrinker_bit(struct mem_cgroup *memcg, - int nid, int shrinker_id) +static inline void set_shrinker_bit(struct mem_cgroup *memcg, + int nid, int shrinker_id) { } #endif -- cgit v1.2.3 From e4262c4f51d6373447c9d89093f49ff6b1e607be Mon Sep 17 00:00:00 2001 From: Yang Shi Date: Tue, 4 May 2021 18:36:23 -0700 Subject: mm: memcontrol: rename shrinker_map to shrinker_info The following patch is going to add nr_deferred into shrinker_map, the change will make shrinker_map not only include map anymore, so rename it to "memcg_shrinker_info". And this should make the patch adding nr_deferred cleaner and readable and make review easier. Also remove the "memcg_" prefix. Link: https://lkml.kernel.org/r/20210311190845.9708-7-shy828301@gmail.com Signed-off-by: Yang Shi Acked-by: Vlastimil Babka Acked-by: Kirill Tkhai Acked-by: Roman Gushchin Reviewed-by: Shakeel Butt Cc: Dave Chinner Cc: Johannes Weiner Cc: Michal Hocko Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/memcontrol.h | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'include') diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h index 7dbd2c9bad32..6cd800fe9a67 100644 --- a/include/linux/memcontrol.h +++ b/include/linux/memcontrol.h @@ -117,7 +117,7 @@ struct batched_lruvec_stat { * Bitmap of shrinker::id corresponding to memcg-aware shrinkers, * which have elements charged to this memcg. */ -struct memcg_shrinker_map { +struct shrinker_info { struct rcu_head rcu; unsigned long map[]; }; @@ -145,7 +145,7 @@ struct mem_cgroup_per_node { struct mem_cgroup_reclaim_iter iter; - struct memcg_shrinker_map __rcu *shrinker_map; + struct shrinker_info __rcu *shrinker_info; struct rb_node tree_node; /* RB tree node */ unsigned long usage_in_excess;/* Set to the value by which */ @@ -1610,8 +1610,8 @@ static inline bool mem_cgroup_under_socket_pressure(struct mem_cgroup *memcg) return false; } -int alloc_shrinker_maps(struct mem_cgroup *memcg); -void free_shrinker_maps(struct mem_cgroup *memcg); +int alloc_shrinker_info(struct mem_cgroup *memcg); +void free_shrinker_info(struct mem_cgroup *memcg); void set_shrinker_bit(struct mem_cgroup *memcg, int nid, int shrinker_id); #else #define mem_cgroup_sockets_enabled 0 -- cgit v1.2.3 From 41ca668a71e7b03743369a2c6d8b8edc1e943dc8 Mon Sep 17 00:00:00 2001 From: Yang Shi Date: Tue, 4 May 2021 18:36:29 -0700 Subject: mm: vmscan: use a new flag to indicate shrinker is registered Currently registered shrinker is indicated by non-NULL shrinker->nr_deferred. This approach is fine with nr_deferred at the shrinker level, but the following patches will move MEMCG_AWARE shrinkers' nr_deferred to memcg level, so their shrinker->nr_deferred would always be NULL. This would prevent the shrinkers from unregistering correctly. Remove SHRINKER_REGISTERING since we could check if shrinker is registered successfully by the new flag. Link: https://lkml.kernel.org/r/20210311190845.9708-9-shy828301@gmail.com Signed-off-by: Yang Shi Acked-by: Kirill Tkhai Acked-by: Vlastimil Babka Acked-by: Roman Gushchin Reviewed-by: Shakeel Butt Cc: Dave Chinner Cc: Johannes Weiner Cc: Michal Hocko Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/shrinker.h | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) (limited to 'include') diff --git a/include/linux/shrinker.h b/include/linux/shrinker.h index 0f80123650e2..1eac79ce57d4 100644 --- a/include/linux/shrinker.h +++ b/include/linux/shrinker.h @@ -79,13 +79,14 @@ struct shrinker { #define DEFAULT_SEEKS 2 /* A good number if you don't know better. */ /* Flags */ -#define SHRINKER_NUMA_AWARE (1 << 0) -#define SHRINKER_MEMCG_AWARE (1 << 1) +#define SHRINKER_REGISTERED (1 << 0) +#define SHRINKER_NUMA_AWARE (1 << 1) +#define SHRINKER_MEMCG_AWARE (1 << 2) /* * It just makes sense when the shrinker is also MEMCG_AWARE for now, * non-MEMCG_AWARE shrinker should not have this flag set. */ -#define SHRINKER_NONSLAB (1 << 2) +#define SHRINKER_NONSLAB (1 << 3) extern int prealloc_shrinker(struct shrinker *shrinker); extern void register_shrinker_prepared(struct shrinker *shrinker); -- cgit v1.2.3 From 3c6f17e6c5d048c8029578c475dd037dd5db58af Mon Sep 17 00:00:00 2001 From: Yang Shi Date: Tue, 4 May 2021 18:36:33 -0700 Subject: mm: vmscan: add per memcg shrinker nr_deferred Currently the number of deferred objects are per shrinker, but some slabs, for example, vfs inode/dentry cache are per memcg, this would result in poor isolation among memcgs. The deferred objects typically are generated by __GFP_NOFS allocations, one memcg with excessive __GFP_NOFS allocations may blow up deferred objects, then other innocent memcgs may suffer from over shrink, excessive reclaim latency, etc. For example, two workloads run in memcgA and memcgB respectively, workload in B is vfs heavy workload. Workload in A generates excessive deferred objects, then B's vfs cache might be hit heavily (drop half of caches) by B's limit reclaim or global reclaim. We observed this hit in our production environment which was running vfs heavy workload shown as the below tracing log: <...>-409454 [016] .... 28286961.747146: mm_shrink_slab_start: super_cache_scan+0x0/0x1a0 ffff9a83046f3458: nid: 1 objects to shrink 3641681686040 gfp_flags GFP_HIGHUSER_MOVABLE|__GFP_ZERO pgs_scanned 1 lru_pgs 15721 cache items 246404277 delta 31345 total_scan 123202138 <...>-409454 [022] .... 28287105.928018: mm_shrink_slab_end: super_cache_scan+0x0/0x1a0 ffff9a83046f3458: nid: 1 unused scan count 3641681686040 new scan count 3641798379189 total_scan 602 last shrinker return val 123186855 The vfs cache and page cache ratio was 10:1 on this machine, and half of caches were dropped. This also resulted in significant amount of page caches were dropped due to inodes eviction. Make nr_deferred per memcg for memcg aware shrinkers would solve the unfairness and bring better isolation. The following patch will add nr_deferred to parent memcg when memcg offline. To preserve nr_deferred when reparenting memcgs to root, root memcg needs shrinker_info allocated too. When memcg is not enabled (!CONFIG_MEMCG or memcg disabled), the shrinker's nr_deferred would be used. And non memcg aware shrinkers use shrinker's nr_deferred all the time. Link: https://lkml.kernel.org/r/20210311190845.9708-10-shy828301@gmail.com Signed-off-by: Yang Shi Acked-by: Roman Gushchin Acked-by: Kirill Tkhai Reviewed-by: Shakeel Butt Cc: Dave Chinner Cc: Johannes Weiner Cc: Michal Hocko Cc: Vlastimil Babka Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/memcontrol.h | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) (limited to 'include') diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h index 6cd800fe9a67..32bd62047238 100644 --- a/include/linux/memcontrol.h +++ b/include/linux/memcontrol.h @@ -114,12 +114,13 @@ struct batched_lruvec_stat { }; /* - * Bitmap of shrinker::id corresponding to memcg-aware shrinkers, - * which have elements charged to this memcg. + * Bitmap and deferred work of shrinker::id corresponding to memcg-aware + * shrinkers, which have elements charged to this memcg. */ struct shrinker_info { struct rcu_head rcu; - unsigned long map[]; + atomic_long_t *nr_deferred; + unsigned long *map; }; /* -- cgit v1.2.3 From a178015cde69981cdcd8f109c5abc98703fead62 Mon Sep 17 00:00:00 2001 From: Yang Shi Date: Tue, 4 May 2021 18:36:42 -0700 Subject: mm: memcontrol: reparent nr_deferred when memcg offline Now shrinker's nr_deferred is per memcg for memcg aware shrinkers, add to parent's corresponding nr_deferred when memcg offline. Link: https://lkml.kernel.org/r/20210311190845.9708-13-shy828301@gmail.com Signed-off-by: Yang Shi Acked-by: Vlastimil Babka Acked-by: Kirill Tkhai Acked-by: Roman Gushchin Reviewed-by: Shakeel Butt Cc: Dave Chinner Cc: Johannes Weiner Cc: Michal Hocko Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/memcontrol.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include') diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h index 32bd62047238..c193be760709 100644 --- a/include/linux/memcontrol.h +++ b/include/linux/memcontrol.h @@ -1614,6 +1614,7 @@ static inline bool mem_cgroup_under_socket_pressure(struct mem_cgroup *memcg) int alloc_shrinker_info(struct mem_cgroup *memcg); void free_shrinker_info(struct mem_cgroup *memcg); void set_shrinker_bit(struct mem_cgroup *memcg, int nid, int shrinker_id); +void reparent_shrinker_deferred(struct mem_cgroup *memcg); #else #define mem_cgroup_sockets_enabled 0 static inline void mem_cgroup_sk_alloc(struct sock *sk) { }; -- cgit v1.2.3 From ef4984384172e93cc95e0e8cd102536d67e8a787 Mon Sep 17 00:00:00 2001 From: Pintu Kumar Date: Tue, 4 May 2021 18:36:48 -0700 Subject: mm/compaction: remove unused variable sysctl_compact_memory The sysctl_compact_memory is mostly unused in mm/compaction.c It just acts as a place holder for sysctl to store .data. But the .data itself is not needed here. So we can get ride of this variable completely and make .data as NULL. This will also eliminate the extern declaration from header file. No functionality is broken or changed this way. Link: https://lkml.kernel.org/r/1614852224-14671-1-git-send-email-pintu@codeaurora.org Signed-off-by: Pintu Kumar Signed-off-by: Pintu Agarwal Reviewed-by: Vlastimil Babka Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/compaction.h | 1 - 1 file changed, 1 deletion(-) (limited to 'include') diff --git a/include/linux/compaction.h b/include/linux/compaction.h index ed4070ed41ef..4221888bdcd6 100644 --- a/include/linux/compaction.h +++ b/include/linux/compaction.h @@ -81,7 +81,6 @@ static inline unsigned long compact_gap(unsigned int order) } #ifdef CONFIG_COMPACTION -extern int sysctl_compact_memory; extern unsigned int sysctl_compaction_proactiveness; extern int sysctl_compaction_handler(struct ctl_table *table, int write, void *buffer, size_t *length, loff_t *ppos); -- cgit v1.2.3 From d479960e44f27e0e52ba31b21740b703c538027c Mon Sep 17 00:00:00 2001 From: Minchan Kim Date: Tue, 4 May 2021 18:36:54 -0700 Subject: mm: disable LRU pagevec during the migration temporarily LRU pagevec holds refcount of pages until the pagevec are drained. It could prevent migration since the refcount of the page is greater than the expection in migration logic. To mitigate the issue, callers of migrate_pages drains LRU pagevec via migrate_prep or lru_add_drain_all before migrate_pages call. However, it's not enough because pages coming into pagevec after the draining call still could stay at the pagevec so it could keep preventing page migration. Since some callers of migrate_pages have retrial logic with LRU draining, the page would migrate at next trail but it is still fragile in that it doesn't close the fundamental race between upcoming LRU pages into pagvec and migration so the migration failure could cause contiguous memory allocation failure in the end. To close the race, this patch disables lru caches(i.e, pagevec) during ongoing migration until migrate is done. Since it's really hard to reproduce, I measured how many times migrate_pages retried with force mode(it is about a fallback to a sync migration) with below debug code. int migrate_pages(struct list_head *from, new_page_t get_new_page, .. .. if (rc && reason == MR_CONTIG_RANGE && pass > 2) { printk(KERN_ERR, "pfn 0x%lx reason %d", page_to_pfn(page), rc); dump_page(page, "fail to migrate"); } The test was repeating android apps launching with cma allocation in background every five seconds. Total cma allocation count was about 500 during the testing. With this patch, the dump_page count was reduced from 400 to 30. The new interface is also useful for memory hotplug which currently drains lru pcp caches after each migration failure. This is rather suboptimal as it has to disrupt others running during the operation. With the new interface the operation happens only once. This is also in line with pcp allocator cache which are disabled for the offlining as well. Link: https://lkml.kernel.org/r/20210319175127.886124-1-minchan@kernel.org Signed-off-by: Minchan Kim Reviewed-by: Chris Goldsworthy Acked-by: Michal Hocko Cc: John Dias Cc: Suren Baghdasaryan Cc: Matthew Wilcox Cc: David Hildenbrand Cc: Vlastimil Babka Cc: Oliver Sang Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/migrate.h | 2 ++ include/linux/swap.h | 14 ++++++++++++++ 2 files changed, 16 insertions(+) (limited to 'include') diff --git a/include/linux/migrate.h b/include/linux/migrate.h index 3a389633b68f..9e4a2dc8622c 100644 --- a/include/linux/migrate.h +++ b/include/linux/migrate.h @@ -46,6 +46,7 @@ extern int isolate_movable_page(struct page *page, isolate_mode_t mode); extern void putback_movable_page(struct page *page); extern void migrate_prep(void); +extern void migrate_finish(void); extern void migrate_prep_local(void); extern void migrate_page_states(struct page *newpage, struct page *page); extern void migrate_page_copy(struct page *newpage, struct page *page); @@ -67,6 +68,7 @@ static inline int isolate_movable_page(struct page *page, isolate_mode_t mode) { return -EBUSY; } static inline int migrate_prep(void) { return -ENOSYS; } +static inline int migrate_finish(void) { return -ENOSYS; } static inline int migrate_prep_local(void) { return -ENOSYS; } static inline void migrate_page_states(struct page *newpage, struct page *page) diff --git a/include/linux/swap.h b/include/linux/swap.h index 42191da1bdc9..f69e0f67651d 100644 --- a/include/linux/swap.h +++ b/include/linux/swap.h @@ -340,6 +340,20 @@ extern void lru_note_cost(struct lruvec *lruvec, bool file, extern void lru_note_cost_page(struct page *); extern void lru_cache_add(struct page *); extern void mark_page_accessed(struct page *); + +extern atomic_t lru_disable_count; + +static inline bool lru_cache_disabled(void) +{ + return atomic_read(&lru_disable_count); +} + +static inline void lru_cache_enable(void) +{ + atomic_dec(&lru_disable_count); +} + +extern void lru_cache_disable(void); extern void lru_add_drain(void); extern void lru_add_drain_cpu(int cpu); extern void lru_add_drain_cpu_zone(struct zone *zone); -- cgit v1.2.3 From 361a2a229fa31ab7f2b236b5946e434964d00762 Mon Sep 17 00:00:00 2001 From: Minchan Kim Date: Tue, 4 May 2021 18:36:57 -0700 Subject: mm: replace migrate_[prep|finish] with lru_cache_[disable|enable] Currently, migrate_[prep|finish] is merely a wrapper of lru_cache_[disable|enable]. There is not much to gain from having additional abstraction. Use lru_cache_[disable|enable] instead of migrate_[prep|finish], which would be more descriptive. note: migrate_prep_local in compaction.c changed into lru_add_drain to avoid CPU schedule cost with involving many other CPUs to keep old behavior. Link: https://lkml.kernel.org/r/20210319175127.886124-2-minchan@kernel.org Signed-off-by: Minchan Kim Acked-by: Michal Hocko Reviewed-by: David Hildenbrand Cc: Chris Goldsworthy Cc: John Dias Cc: Matthew Wilcox Cc: Oliver Sang Cc: Suren Baghdasaryan Cc: Vlastimil Babka Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/migrate.h | 7 ------- 1 file changed, 7 deletions(-) (limited to 'include') diff --git a/include/linux/migrate.h b/include/linux/migrate.h index 9e4a2dc8622c..6155d97ec76c 100644 --- a/include/linux/migrate.h +++ b/include/linux/migrate.h @@ -45,9 +45,6 @@ extern struct page *alloc_migration_target(struct page *page, unsigned long priv extern int isolate_movable_page(struct page *page, isolate_mode_t mode); extern void putback_movable_page(struct page *page); -extern void migrate_prep(void); -extern void migrate_finish(void); -extern void migrate_prep_local(void); extern void migrate_page_states(struct page *newpage, struct page *page); extern void migrate_page_copy(struct page *newpage, struct page *page); extern int migrate_huge_page_move_mapping(struct address_space *mapping, @@ -67,10 +64,6 @@ static inline struct page *alloc_migration_target(struct page *page, static inline int isolate_movable_page(struct page *page, isolate_mode_t mode) { return -EBUSY; } -static inline int migrate_prep(void) { return -ENOSYS; } -static inline int migrate_finish(void) { return -ENOSYS; } -static inline int migrate_prep_local(void) { return -ENOSYS; } - static inline void migrate_page_states(struct page *newpage, struct page *page) { } -- cgit v1.2.3 From 8cc621d2f45ddd3dc664024a647ee7adf48d79a5 Mon Sep 17 00:00:00 2001 From: Minchan Kim Date: Tue, 4 May 2021 18:37:00 -0700 Subject: mm: fs: invalidate BH LRU during page migration Pages containing buffer_heads that are in one of the per-CPU buffer_head LRU caches will be pinned and thus cannot be migrated. This can prevent CMA allocations from succeeding, which are often used on platforms with co-processors (such as a DSP) that can only use physically contiguous memory. It can also prevent memory hot-unplugging from succeeding, which involves migrating at least MIN_MEMORY_BLOCK_SIZE bytes of memory, which ranges from 8 MiB to 1 GiB based on the architecture in use. Correspondingly, invalidate the BH LRU caches before a migration starts and stop any buffer_head from being cached in the LRU caches, until migration has finished. Link: https://lkml.kernel.org/r/20210319175127.886124-3-minchan@kernel.org Signed-off-by: Minchan Kim Reported-by: Chris Goldsworthy Reported-by: Laura Abbott Tested-by: Oliver Sang Cc: David Hildenbrand Cc: John Dias Cc: Matthew Wilcox Cc: Michal Hocko Cc: Suren Baghdasaryan Cc: Vlastimil Babka Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/buffer_head.h | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'include') diff --git a/include/linux/buffer_head.h b/include/linux/buffer_head.h index 6b47f94378c5..e7e99da31349 100644 --- a/include/linux/buffer_head.h +++ b/include/linux/buffer_head.h @@ -194,6 +194,8 @@ void __breadahead_gfp(struct block_device *, sector_t block, unsigned int size, struct buffer_head *__bread_gfp(struct block_device *, sector_t block, unsigned size, gfp_t gfp); void invalidate_bh_lrus(void); +void invalidate_bh_lrus_cpu(int cpu); +bool has_bh_in_lru(int cpu, void *dummy); struct buffer_head *alloc_buffer_head(gfp_t gfp_flags); void free_buffer_head(struct buffer_head * bh); void unlock_buffer(struct buffer_head *bh); @@ -406,6 +408,8 @@ static inline int inode_has_buffers(struct inode *inode) { return 0; } static inline void invalidate_inode_buffers(struct inode *inode) {} static inline int remove_inode_buffers(struct inode *inode) { return 1; } static inline int sync_mapping_buffers(struct address_space *mapping) { return 0; } +static inline void invalidate_bh_lrus_cpu(int cpu) {} +static inline bool has_bh_in_lru(int cpu, void *dummy) { return 0; } #define buffer_heads_over_limit 0 #endif /* CONFIG_BLOCK */ -- cgit v1.2.3 From 606a6f71a25accfc960a5063c23717ff07aa43a3 Mon Sep 17 00:00:00 2001 From: Miaohe Lin Date: Tue, 4 May 2021 18:37:04 -0700 Subject: mm/migrate.c: make putback_movable_page() static Patch series "Cleanup and fixup for mm/migrate.c", v3. This series contains cleanups to remove unnecessary VM_BUG_ON_PAGE and rc != MIGRATEPAGE_SUCCESS check. Also use helper function to remove some duplicated codes. What's more, this fixes potential deadlock in NUMA balancing shared exec THP case and so on. More details can be found in the respective changelogs. This patch (of 5): The putback_movable_page() is just called by putback_movable_pages() and we know the page is locked and both PageMovable() and PageIsolated() is checked right before calling putback_movable_page(). So we make it static and remove all the 3 VM_BUG_ON_PAGE(). Link: https://lkml.kernel.org/r/20210325131524.48181-1-linmiaohe@huawei.com Link: https://lkml.kernel.org/r/20210325131524.48181-2-linmiaohe@huawei.com Signed-off-by: Miaohe Lin Reviewed-by: David Hildenbrand Reviewed-by: Yang Shi Cc: Jerome Glisse Cc: Rafael Aquini Cc: Alistair Popple Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/migrate.h | 1 - 1 file changed, 1 deletion(-) (limited to 'include') diff --git a/include/linux/migrate.h b/include/linux/migrate.h index 6155d97ec76c..175ef15ae9e8 100644 --- a/include/linux/migrate.h +++ b/include/linux/migrate.h @@ -43,7 +43,6 @@ extern int migrate_pages(struct list_head *l, new_page_t new, free_page_t free, unsigned long private, enum migrate_mode mode, int reason); extern struct page *alloc_migration_target(struct page *page, unsigned long private); extern int isolate_movable_page(struct page *page, isolate_mode_t mode); -extern void putback_movable_page(struct page *page); extern void migrate_page_states(struct page *newpage, struct page *page); extern void migrate_page_copy(struct page *newpage, struct page *page); -- cgit v1.2.3 From bbb269206f3c914d4f23e023de4ec020abea6d1b Mon Sep 17 00:00:00 2001 From: Minchan Kim Date: Tue, 4 May 2021 18:37:19 -0700 Subject: mm: vmstat: add cma statistics Since CMA is used more widely, it's worth to have CMA allocation statistics into vmstat. With it, we could know how agressively system uses cma allocation and how often it fails. Link: https://lkml.kernel.org/r/20210302183346.3707237-1-minchan@kernel.org Signed-off-by: Minchan Kim Reviewed-by: John Hubbard Cc: John Dias Cc: Suren Baghdasaryan Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/vm_event_item.h | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'include') diff --git a/include/linux/vm_event_item.h b/include/linux/vm_event_item.h index 18e75974d4e3..21d7c7f72f1c 100644 --- a/include/linux/vm_event_item.h +++ b/include/linux/vm_event_item.h @@ -70,6 +70,10 @@ enum vm_event_item { PGPGIN, PGPGOUT, PSWPIN, PSWPOUT, #endif #ifdef CONFIG_HUGETLB_PAGE HTLB_BUDDY_PGALLOC, HTLB_BUDDY_PGALLOC_FAIL, +#endif +#ifdef CONFIG_CMA + CMA_ALLOC_SUCCESS, + CMA_ALLOC_FAIL, #endif UNEVICTABLE_PGCULLED, /* culled to noreclaim list */ UNEVICTABLE_PGSCANNED, /* scanned for reclaimability */ -- cgit v1.2.3 From 7bc1aec5e28765ad18742824b3b972471807a632 Mon Sep 17 00:00:00 2001 From: Liam Mark Date: Tue, 4 May 2021 18:37:25 -0700 Subject: mm: cma: add trace events for CMA alloc perf testing Add cma and migrate trace events to enable CMA allocation performance to be measured via ftrace. [georgi.djakov@linaro.org: add the CMA instance name to the cma_alloc_start trace event] Link: https://lkml.kernel.org/r/20210326155414.25006-1-georgi.djakov@linaro.org Link: https://lkml.kernel.org/r/20210324160740.15901-1-georgi.djakov@linaro.org Signed-off-by: Liam Mark Signed-off-by: Georgi Djakov Acked-by: Minchan Kim Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/trace/events/cma.h | 42 +++++++++++++++++++++++++++++++++++++++++- include/trace/events/migrate.h | 22 ++++++++++++++++++++++ 2 files changed, 63 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/include/trace/events/cma.h b/include/trace/events/cma.h index 5017a8829270..be1525a10457 100644 --- a/include/trace/events/cma.h +++ b/include/trace/events/cma.h @@ -8,7 +8,7 @@ #include #include -TRACE_EVENT(cma_alloc, +DECLARE_EVENT_CLASS(cma_alloc_class, TP_PROTO(unsigned long pfn, const struct page *page, unsigned int count, unsigned int align), @@ -61,6 +61,46 @@ TRACE_EVENT(cma_release, __entry->count) ); +TRACE_EVENT(cma_alloc_start, + + TP_PROTO(const char *name, unsigned int count, unsigned int align), + + TP_ARGS(name, count, align), + + TP_STRUCT__entry( + __string(name, name) + __field(unsigned int, count) + __field(unsigned int, align) + ), + + TP_fast_assign( + __assign_str(name, name); + __entry->count = count; + __entry->align = align; + ), + + TP_printk("name=%s count=%u align=%u", + __get_str(name), + __entry->count, + __entry->align) +); + +DEFINE_EVENT(cma_alloc_class, cma_alloc, + + TP_PROTO(unsigned long pfn, const struct page *page, + unsigned int count, unsigned int align), + + TP_ARGS(pfn, page, count, align) +); + +DEFINE_EVENT(cma_alloc_class, cma_alloc_busy_retry, + + TP_PROTO(unsigned long pfn, const struct page *page, + unsigned int count, unsigned int align), + + TP_ARGS(pfn, page, count, align) +); + #endif /* _TRACE_CMA_H */ /* This part must be outside protection */ diff --git a/include/trace/events/migrate.h b/include/trace/events/migrate.h index 4d434398d64d..f2c990603888 100644 --- a/include/trace/events/migrate.h +++ b/include/trace/events/migrate.h @@ -81,6 +81,28 @@ TRACE_EVENT(mm_migrate_pages, __print_symbolic(__entry->mode, MIGRATE_MODE), __print_symbolic(__entry->reason, MIGRATE_REASON)) ); + +TRACE_EVENT(mm_migrate_pages_start, + + TP_PROTO(enum migrate_mode mode, int reason), + + TP_ARGS(mode, reason), + + TP_STRUCT__entry( + __field(enum migrate_mode, mode) + __field(int, reason) + ), + + TP_fast_assign( + __entry->mode = mode; + __entry->reason = reason; + ), + + TP_printk("mode=%s reason=%s", + __print_symbolic(__entry->mode, MIGRATE_MODE), + __print_symbolic(__entry->reason, MIGRATE_REASON)) +); + #endif /* _TRACE_MIGRATE_H */ /* This part must be outside protection */ -- cgit v1.2.3 From 3aab8ae7aace3388da319a233edf48f0f5d26a44 Mon Sep 17 00:00:00 2001 From: Minchan Kim Date: Tue, 4 May 2021 18:37:31 -0700 Subject: mm: cma: add the CMA instance name to cma trace events There were missing places to add cma instance name. To identify each CMA instance, let's add the name for every cma trace. This patch also changes the existing cma_trace_alloc to cma_trace_finish since we have cma_alloc_start[1]. [1] https://lore.kernel.org/linux-mm/20210324160740.15901-1-georgi.djakov@linaro.org Link: https://lkml.kernel.org/r/20210330220237.748899-1-minchan@kernel.org Signed-off-by: Minchan Kim Cc: Liam Mark Cc: Georgi Djakov Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/trace/events/cma.h | 28 +++++++++++++++++----------- 1 file changed, 17 insertions(+), 11 deletions(-) (limited to 'include') diff --git a/include/trace/events/cma.h b/include/trace/events/cma.h index be1525a10457..5cf385ae7c08 100644 --- a/include/trace/events/cma.h +++ b/include/trace/events/cma.h @@ -10,12 +10,13 @@ DECLARE_EVENT_CLASS(cma_alloc_class, - TP_PROTO(unsigned long pfn, const struct page *page, + TP_PROTO(const char *name, unsigned long pfn, const struct page *page, unsigned int count, unsigned int align), - TP_ARGS(pfn, page, count, align), + TP_ARGS(name, pfn, page, count, align), TP_STRUCT__entry( + __string(name, name) __field(unsigned long, pfn) __field(const struct page *, page) __field(unsigned int, count) @@ -23,13 +24,15 @@ DECLARE_EVENT_CLASS(cma_alloc_class, ), TP_fast_assign( + __assign_str(name, name); __entry->pfn = pfn; __entry->page = page; __entry->count = count; __entry->align = align; ), - TP_printk("pfn=%lx page=%p count=%u align=%u", + TP_printk("name=%s pfn=%lx page=%p count=%u align=%u", + __get_str(name), __entry->pfn, __entry->page, __entry->count, @@ -38,24 +41,27 @@ DECLARE_EVENT_CLASS(cma_alloc_class, TRACE_EVENT(cma_release, - TP_PROTO(unsigned long pfn, const struct page *page, + TP_PROTO(const char *name, unsigned long pfn, const struct page *page, unsigned int count), - TP_ARGS(pfn, page, count), + TP_ARGS(name, pfn, page, count), TP_STRUCT__entry( + __string(name, name) __field(unsigned long, pfn) __field(const struct page *, page) __field(unsigned int, count) ), TP_fast_assign( + __assign_str(name, name); __entry->pfn = pfn; __entry->page = page; __entry->count = count; ), - TP_printk("pfn=%lx page=%p count=%u", + TP_printk("name=%s pfn=%lx page=%p count=%u", + __get_str(name), __entry->pfn, __entry->page, __entry->count) @@ -85,20 +91,20 @@ TRACE_EVENT(cma_alloc_start, __entry->align) ); -DEFINE_EVENT(cma_alloc_class, cma_alloc, +DEFINE_EVENT(cma_alloc_class, cma_alloc_finish, - TP_PROTO(unsigned long pfn, const struct page *page, + TP_PROTO(const char *name, unsigned long pfn, const struct page *page, unsigned int count, unsigned int align), - TP_ARGS(pfn, page, count, align) + TP_ARGS(name, pfn, page, count, align) ); DEFINE_EVENT(cma_alloc_class, cma_alloc_busy_retry, - TP_PROTO(unsigned long pfn, const struct page *page, + TP_PROTO(const char *name, unsigned long pfn, const struct page *page, unsigned int count, unsigned int align), - TP_ARGS(pfn, page, count, align) + TP_ARGS(name, pfn, page, count, align) ); #endif /* _TRACE_CMA_H */ -- cgit v1.2.3 From 78fa51503fdbe463c96eef4c3cf69ca54032647a Mon Sep 17 00:00:00 2001 From: Minchan Kim Date: Tue, 4 May 2021 18:37:34 -0700 Subject: mm: use proper type for cma_[alloc|release] size_t in cma_alloc is confusing since it makes people think it's byte count, not pages. Change it to unsigned long[1]. The unsigned int in cma_release is also not right so change it. Since we have unsigned long in cma_release, free_contig_range should also respect it. [1] 67a2e213e7e9, mm: cma: fix incorrect type conversion for size during dma allocation Link: https://lore.kernel.org/linux-mm/20210324043434.GP1719932@casper.infradead.org/ Link: https://lkml.kernel.org/r/20210331164018.710560-1-minchan@kernel.org Signed-off-by: Minchan Kim Reviewed-by: David Hildenbrand Cc: Matthew Wilcox Cc: David Hildenbrand Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/cma.h | 4 ++-- include/linux/gfp.h | 2 +- include/trace/events/cma.h | 22 +++++++++++----------- 3 files changed, 14 insertions(+), 14 deletions(-) (limited to 'include') diff --git a/include/linux/cma.h b/include/linux/cma.h index 217999c8a762..53fd8c3cdbd0 100644 --- a/include/linux/cma.h +++ b/include/linux/cma.h @@ -44,9 +44,9 @@ extern int cma_init_reserved_mem(phys_addr_t base, phys_addr_t size, unsigned int order_per_bit, const char *name, struct cma **res_cma); -extern struct page *cma_alloc(struct cma *cma, size_t count, unsigned int align, +extern struct page *cma_alloc(struct cma *cma, unsigned long count, unsigned int align, bool no_warn); -extern bool cma_release(struct cma *cma, const struct page *pages, unsigned int count); +extern bool cma_release(struct cma *cma, const struct page *pages, unsigned long count); extern int cma_for_each_area(int (*it)(struct cma *cma, void *data), void *data); #endif diff --git a/include/linux/gfp.h b/include/linux/gfp.h index 26f4d907254a..8a5f6c3d7dba 100644 --- a/include/linux/gfp.h +++ b/include/linux/gfp.h @@ -657,7 +657,7 @@ extern int alloc_contig_range(unsigned long start, unsigned long end, extern struct page *alloc_contig_pages(unsigned long nr_pages, gfp_t gfp_mask, int nid, nodemask_t *nodemask); #endif -void free_contig_range(unsigned long pfn, unsigned int nr_pages); +void free_contig_range(unsigned long pfn, unsigned long nr_pages); #ifdef CONFIG_CMA /* CMA stuff */ diff --git a/include/trace/events/cma.h b/include/trace/events/cma.h index 5cf385ae7c08..c3d354702cb0 100644 --- a/include/trace/events/cma.h +++ b/include/trace/events/cma.h @@ -11,7 +11,7 @@ DECLARE_EVENT_CLASS(cma_alloc_class, TP_PROTO(const char *name, unsigned long pfn, const struct page *page, - unsigned int count, unsigned int align), + unsigned long count, unsigned int align), TP_ARGS(name, pfn, page, count, align), @@ -19,7 +19,7 @@ DECLARE_EVENT_CLASS(cma_alloc_class, __string(name, name) __field(unsigned long, pfn) __field(const struct page *, page) - __field(unsigned int, count) + __field(unsigned long, count) __field(unsigned int, align) ), @@ -31,7 +31,7 @@ DECLARE_EVENT_CLASS(cma_alloc_class, __entry->align = align; ), - TP_printk("name=%s pfn=%lx page=%p count=%u align=%u", + TP_printk("name=%s pfn=%lx page=%p count=%lu align=%u", __get_str(name), __entry->pfn, __entry->page, @@ -42,7 +42,7 @@ DECLARE_EVENT_CLASS(cma_alloc_class, TRACE_EVENT(cma_release, TP_PROTO(const char *name, unsigned long pfn, const struct page *page, - unsigned int count), + unsigned long count), TP_ARGS(name, pfn, page, count), @@ -50,7 +50,7 @@ TRACE_EVENT(cma_release, __string(name, name) __field(unsigned long, pfn) __field(const struct page *, page) - __field(unsigned int, count) + __field(unsigned long, count) ), TP_fast_assign( @@ -60,7 +60,7 @@ TRACE_EVENT(cma_release, __entry->count = count; ), - TP_printk("name=%s pfn=%lx page=%p count=%u", + TP_printk("name=%s pfn=%lx page=%p count=%lu", __get_str(name), __entry->pfn, __entry->page, @@ -69,13 +69,13 @@ TRACE_EVENT(cma_release, TRACE_EVENT(cma_alloc_start, - TP_PROTO(const char *name, unsigned int count, unsigned int align), + TP_PROTO(const char *name, unsigned long count, unsigned int align), TP_ARGS(name, count, align), TP_STRUCT__entry( __string(name, name) - __field(unsigned int, count) + __field(unsigned long, count) __field(unsigned int, align) ), @@ -85,7 +85,7 @@ TRACE_EVENT(cma_alloc_start, __entry->align = align; ), - TP_printk("name=%s count=%u align=%u", + TP_printk("name=%s count=%lu align=%u", __get_str(name), __entry->count, __entry->align) @@ -94,7 +94,7 @@ TRACE_EVENT(cma_alloc_start, DEFINE_EVENT(cma_alloc_class, cma_alloc_finish, TP_PROTO(const char *name, unsigned long pfn, const struct page *page, - unsigned int count, unsigned int align), + unsigned long count, unsigned int align), TP_ARGS(name, pfn, page, count, align) ); @@ -102,7 +102,7 @@ DEFINE_EVENT(cma_alloc_class, cma_alloc_finish, DEFINE_EVENT(cma_alloc_class, cma_alloc_busy_retry, TP_PROTO(const char *name, unsigned long pfn, const struct page *page, - unsigned int count, unsigned int align), + unsigned long count, unsigned int align), TP_ARGS(name, pfn, page, count, align) ); -- cgit v1.2.3 From 575299ea18a8c0575d4c2ef6ad3fa4d41d529d1c Mon Sep 17 00:00:00 2001 From: Saravanan D Date: Tue, 4 May 2021 18:38:03 -0700 Subject: x86/mm: track linear mapping split events To help with debugging the sluggishness caused by TLB miss/reload, we introduce monotonic hugepage [direct mapped] split event counts since system state: SYSTEM_RUNNING to be displayed as part of /proc/vmstat in x86 servers The lifetime split event information will be displayed at the bottom of /proc/vmstat .... swap_ra 0 swap_ra_hit 0 direct_map_level2_splits 94 direct_map_level3_splits 4 nr_unstable 0 .... One of the many lasting sources of direct hugepage splits is kernel tracing (kprobes, tracepoints). Note that the kernel's code segment [512 MB] points to the same physical addresses that have been already mapped in the kernel's direct mapping range. Source : Documentation/x86/x86_64/mm.rst When we enable kernel tracing, the kernel has to modify attributes/permissions of the text segment hugepages that are direct mapped causing them to split. Kernel's direct mapped hugepages do not coalesce back after split and remain in place for the remainder of the lifetime. An instance of direct page splits when we turn on dynamic kernel tracing .... cat /proc/vmstat | grep -i direct_map_level direct_map_level2_splits 784 direct_map_level3_splits 12 bpftrace -e 'tracepoint:raw_syscalls:sys_enter { @ [pid, comm] = count(); }' cat /proc/vmstat | grep -i direct_map_level direct_map_level2_splits 789 direct_map_level3_splits 12 .... Link: https://lkml.kernel.org/r/20210218235744.1040634-1-saravanand@fb.com Signed-off-by: Saravanan D Acked-by: Tejun Heo Acked-by: Johannes Weiner Acked-by: Dave Hansen Cc: Ingo Molnar Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/vm_event_item.h | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'include') diff --git a/include/linux/vm_event_item.h b/include/linux/vm_event_item.h index 21d7c7f72f1c..ae0dd1948c2b 100644 --- a/include/linux/vm_event_item.h +++ b/include/linux/vm_event_item.h @@ -124,6 +124,10 @@ enum vm_event_item { PGPGIN, PGPGOUT, PSWPIN, PSWPOUT, #ifdef CONFIG_SWAP SWAP_RA, SWAP_RA_HIT, +#endif +#ifdef CONFIG_X86 + DIRECT_MAP_LEVEL2_SPLIT, + DIRECT_MAP_LEVEL3_SPLIT, #endif NR_VM_EVENT_ITEMS }; -- cgit v1.2.3 From 1a08ae36cf8b5f26d0c64ebfe46f8eb07ea0b678 Mon Sep 17 00:00:00 2001 From: Pavel Tatashin Date: Tue, 4 May 2021 18:38:53 -0700 Subject: mm cma: rename PF_MEMALLOC_NOCMA to PF_MEMALLOC_PIN PF_MEMALLOC_NOCMA is used ot guarantee that the allocator will not return pages that might belong to CMA region. This is currently used for long term gup to make sure that such pins are not going to be done on any CMA pages. When PF_MEMALLOC_NOCMA has been introduced we haven't realized that it is focusing on CMA pages too much and that there is larger class of pages that need the same treatment. MOVABLE zone cannot contain any long term pins as well so it makes sense to reuse and redefine this flag for that usecase as well. Rename the flag to PF_MEMALLOC_PIN which defines an allocation context which can only get pages suitable for long-term pins. Also rename: memalloc_nocma_save()/memalloc_nocma_restore to memalloc_pin_save()/memalloc_pin_restore() and make the new functions common. [rppt@linux.ibm.com: fix renaming of PF_MEMALLOC_NOCMA to PF_MEMALLOC_PIN] Link: https://lkml.kernel.org/r/20210331163816.11517-1-rppt@kernel.org Link: https://lkml.kernel.org/r/20210215161349.246722-6-pasha.tatashin@soleen.com Signed-off-by: Pavel Tatashin Reviewed-by: John Hubbard Acked-by: Michal Hocko Signed-off-by: Mike Rapoport Cc: Dan Williams Cc: David Hildenbrand Cc: David Rientjes Cc: Ingo Molnar Cc: Ira Weiny Cc: James Morris Cc: Jason Gunthorpe Cc: Jason Gunthorpe Cc: Joonsoo Kim Cc: Matthew Wilcox Cc: Mel Gorman Cc: Michal Hocko Cc: Mike Kravetz Cc: Oscar Salvador Cc: Peter Zijlstra Cc: Sasha Levin Cc: Steven Rostedt (VMware) Cc: Tyler Hicks Cc: Vlastimil Babka Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/sched.h | 2 +- include/linux/sched/mm.h | 21 +++++---------------- 2 files changed, 6 insertions(+), 17 deletions(-) (limited to 'include') diff --git a/include/linux/sched.h b/include/linux/sched.h index 9c25c8e67030..d2c881384517 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -1583,7 +1583,7 @@ extern struct pid *cad_pid; #define PF_SWAPWRITE 0x00800000 /* Allowed to write to swap */ #define PF_NO_SETAFFINITY 0x04000000 /* Userland is not allowed to meddle with cpus_mask */ #define PF_MCE_EARLY 0x08000000 /* Early kill for mce process policy */ -#define PF_MEMALLOC_NOCMA 0x10000000 /* All allocation request will have _GFP_MOVABLE cleared */ +#define PF_MEMALLOC_PIN 0x10000000 /* Allocation context constrained to zones which allow long term pinning. */ #define PF_FREEZER_SKIP 0x40000000 /* Freezer should not count it as freezable */ #define PF_SUSPEND_TASK 0x80000000 /* This thread called freeze_processes() and should not be frozen */ diff --git a/include/linux/sched/mm.h b/include/linux/sched/mm.h index 90b2a0bce11c..ae654819e8aa 100644 --- a/include/linux/sched/mm.h +++ b/include/linux/sched/mm.h @@ -271,29 +271,18 @@ static inline void memalloc_noreclaim_restore(unsigned int flags) current->flags = (current->flags & ~PF_MEMALLOC) | flags; } -#ifdef CONFIG_CMA -static inline unsigned int memalloc_nocma_save(void) +static inline unsigned int memalloc_pin_save(void) { - unsigned int flags = current->flags & PF_MEMALLOC_NOCMA; + unsigned int flags = current->flags & PF_MEMALLOC_PIN; - current->flags |= PF_MEMALLOC_NOCMA; + current->flags |= PF_MEMALLOC_PIN; return flags; } -static inline void memalloc_nocma_restore(unsigned int flags) +static inline void memalloc_pin_restore(unsigned int flags) { - current->flags = (current->flags & ~PF_MEMALLOC_NOCMA) | flags; + current->flags = (current->flags & ~PF_MEMALLOC_PIN) | flags; } -#else -static inline unsigned int memalloc_nocma_save(void) -{ - return 0; -} - -static inline void memalloc_nocma_restore(unsigned int flags) -{ -} -#endif #ifdef CONFIG_MEMCG DECLARE_PER_CPU(struct mem_cgroup *, int_active_memcg); -- cgit v1.2.3 From 8e3560d963d22ba41857f48e4114ce80373144ea Mon Sep 17 00:00:00 2001 From: Pavel Tatashin Date: Tue, 4 May 2021 18:39:00 -0700 Subject: mm: honor PF_MEMALLOC_PIN for all movable pages PF_MEMALLOC_PIN is only honored for CMA pages, extend this flag to work for any allocations from ZONE_MOVABLE by removing __GFP_MOVABLE from gfp_mask when this flag is passed in the current context. Add is_pinnable_page() to return true if page is in a pinnable page. A pinnable page is not in ZONE_MOVABLE and not of MIGRATE_CMA type. Link: https://lkml.kernel.org/r/20210215161349.246722-8-pasha.tatashin@soleen.com Signed-off-by: Pavel Tatashin Acked-by: Michal Hocko Cc: Dan Williams Cc: David Hildenbrand Cc: David Rientjes Cc: Ingo Molnar Cc: Ira Weiny Cc: James Morris Cc: Jason Gunthorpe Cc: Jason Gunthorpe Cc: John Hubbard Cc: Joonsoo Kim Cc: Matthew Wilcox Cc: Mel Gorman Cc: Michal Hocko Cc: Mike Kravetz Cc: Oscar Salvador Cc: Peter Zijlstra Cc: Sasha Levin Cc: Steven Rostedt (VMware) Cc: Tyler Hicks Cc: Vlastimil Babka Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/mm.h | 18 ++++++++++++++++++ include/linux/sched/mm.h | 6 +++++- 2 files changed, 23 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/mm.h b/include/linux/mm.h index 1dbb53c44243..d0e628f511e4 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -1141,6 +1141,11 @@ static inline bool is_zone_device_page(const struct page *page) } #endif +static inline bool is_zone_movable_page(const struct page *page) +{ + return page_zonenum(page) == ZONE_MOVABLE; +} + #ifdef CONFIG_DEV_PAGEMAP_OPS void free_devmap_managed_page(struct page *page); DECLARE_STATIC_KEY_FALSE(devmap_managed_key); @@ -1550,6 +1555,19 @@ static inline unsigned long page_to_section(const struct page *page) } #endif +/* MIGRATE_CMA and ZONE_MOVABLE do not allow pin pages */ +#ifdef CONFIG_MIGRATION +static inline bool is_pinnable_page(struct page *page) +{ + return !is_zone_movable_page(page) && !is_migrate_cma_page(page); +} +#else +static inline bool is_pinnable_page(struct page *page) +{ + return true; +} +#endif + static inline void set_page_zone(struct page *page, enum zone_type zone) { page->flags &= ~(ZONES_MASK << ZONES_PGSHIFT); diff --git a/include/linux/sched/mm.h b/include/linux/sched/mm.h index ae654819e8aa..e24b1fe348e3 100644 --- a/include/linux/sched/mm.h +++ b/include/linux/sched/mm.h @@ -151,12 +151,13 @@ static inline bool in_vfork(struct task_struct *tsk) * Applies per-task gfp context to the given allocation flags. * PF_MEMALLOC_NOIO implies GFP_NOIO * PF_MEMALLOC_NOFS implies GFP_NOFS + * PF_MEMALLOC_PIN implies !GFP_MOVABLE */ static inline gfp_t current_gfp_context(gfp_t flags) { unsigned int pflags = READ_ONCE(current->flags); - if (unlikely(pflags & (PF_MEMALLOC_NOIO | PF_MEMALLOC_NOFS))) { + if (unlikely(pflags & (PF_MEMALLOC_NOIO | PF_MEMALLOC_NOFS | PF_MEMALLOC_PIN))) { /* * NOIO implies both NOIO and NOFS and it is a weaker context * so always make sure it makes precedence @@ -165,6 +166,9 @@ static inline gfp_t current_gfp_context(gfp_t flags) flags &= ~(__GFP_IO | __GFP_FS); else if (pflags & PF_MEMALLOC_NOFS) flags &= ~__GFP_FS; + + if (pflags & PF_MEMALLOC_PIN) + flags &= ~__GFP_MOVABLE; } return flags; } -- cgit v1.2.3 From 9afaf30f7a1aab2022961715a66f644275b8daec Mon Sep 17 00:00:00 2001 From: Pavel Tatashin Date: Tue, 4 May 2021 18:39:04 -0700 Subject: mm/gup: do not migrate zero page On some platforms ZERO_PAGE(0) might end-up in a movable zone. Do not migrate zero page in gup during longterm pinning as migration of zero page is not allowed. For example, in x86 QEMU with 16G of memory and kernelcore=5G parameter, I see the following: Boot#1: zero_pfn 0x48a8d zero_pfn zone: ZONE_DMA32 Boot#2: zero_pfn 0x20168d zero_pfn zone: ZONE_MOVABLE On x86, empty_zero_page is declared in .bss and depending on the loader may end up in different physical locations during boots. Also, move is_zero_pfn() my_zero_pfn() functions under CONFIG_MMU, because zero_pfn that they are using is declared in memory.c which is compiled with CONFIG_MMU. Link: https://lkml.kernel.org/r/20210215161349.246722-9-pasha.tatashin@soleen.com Signed-off-by: Pavel Tatashin Cc: Dan Williams Cc: David Hildenbrand Cc: David Rientjes Cc: Ingo Molnar Cc: Ira Weiny Cc: James Morris Cc: Jason Gunthorpe Cc: Jason Gunthorpe Cc: John Hubbard Cc: Joonsoo Kim Cc: Matthew Wilcox Cc: Mel Gorman Cc: Michal Hocko Cc: Michal Hocko Cc: Mike Kravetz Cc: Oscar Salvador Cc: Peter Zijlstra Cc: Sasha Levin Cc: Steven Rostedt (VMware) Cc: Tyler Hicks Cc: Vlastimil Babka Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/mm.h | 3 ++- include/linux/mmzone.h | 4 ++++ include/linux/pgtable.h | 12 ++++++++++++ 3 files changed, 18 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/mm.h b/include/linux/mm.h index d0e628f511e4..76e27ebb28a3 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -1559,7 +1559,8 @@ static inline unsigned long page_to_section(const struct page *page) #ifdef CONFIG_MIGRATION static inline bool is_pinnable_page(struct page *page) { - return !is_zone_movable_page(page) && !is_migrate_cma_page(page); + return !(is_zone_movable_page(page) || is_migrate_cma_page(page)) || + is_zero_pfn(page_to_pfn(page)); } #else static inline bool is_pinnable_page(struct page *page) diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h index 3b2205741048..92b44149d5b9 100644 --- a/include/linux/mmzone.h +++ b/include/linux/mmzone.h @@ -427,6 +427,10 @@ enum zone_type { * techniques might use alloc_contig_range() to hide previously * exposed pages from the buddy again (e.g., to implement some sort * of memory unplug in virtio-mem). + * 6. ZERO_PAGE(0), kernelcore/movablecore setups might create + * situations where ZERO_PAGE(0) which is allocated differently + * on different platforms may end up in a movable zone. ZERO_PAGE(0) + * cannot be migrated. * * In general, no unmovable allocations that degrade memory offlining * should end up in ZONE_MOVABLE. Allocators (like alloc_contig_range()) diff --git a/include/linux/pgtable.h b/include/linux/pgtable.h index 5e772392a379..2194a9cd885c 100644 --- a/include/linux/pgtable.h +++ b/include/linux/pgtable.h @@ -1111,6 +1111,7 @@ extern void untrack_pfn(struct vm_area_struct *vma, unsigned long pfn, extern void untrack_pfn_moved(struct vm_area_struct *vma); #endif +#ifdef CONFIG_MMU #ifdef __HAVE_COLOR_ZERO_PAGE static inline int is_zero_pfn(unsigned long pfn) { @@ -1134,6 +1135,17 @@ static inline unsigned long my_zero_pfn(unsigned long addr) return zero_pfn; } #endif +#else +static inline int is_zero_pfn(unsigned long pfn) +{ + return 0; +} + +static inline unsigned long my_zero_pfn(unsigned long addr) +{ + return 0; +} +#endif /* CONFIG_MMU */ #ifdef CONFIG_MMU -- cgit v1.2.3 From d1e153fea2a8940273174fc17733c44323d35cd5 Mon Sep 17 00:00:00 2001 From: Pavel Tatashin Date: Tue, 4 May 2021 18:39:08 -0700 Subject: mm/gup: migrate pinned pages out of movable zone We should not pin pages in ZONE_MOVABLE. Currently, we do not pin only movable CMA pages. Generalize the function that migrates CMA pages to migrate all movable pages. Use is_pinnable_page() to check which pages need to be migrated Link: https://lkml.kernel.org/r/20210215161349.246722-10-pasha.tatashin@soleen.com Signed-off-by: Pavel Tatashin Reviewed-by: John Hubbard Cc: Dan Williams Cc: David Hildenbrand Cc: David Rientjes Cc: Ingo Molnar Cc: Ira Weiny Cc: James Morris Cc: Jason Gunthorpe Cc: Jason Gunthorpe Cc: Joonsoo Kim Cc: Matthew Wilcox Cc: Mel Gorman Cc: Michal Hocko Cc: Michal Hocko Cc: Mike Kravetz Cc: Oscar Salvador Cc: Peter Zijlstra Cc: Sasha Levin Cc: Steven Rostedt (VMware) Cc: Tyler Hicks Cc: Vlastimil Babka Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/migrate.h | 1 + include/linux/mmzone.h | 9 +++++++-- include/trace/events/migrate.h | 3 ++- 3 files changed, 10 insertions(+), 3 deletions(-) (limited to 'include') diff --git a/include/linux/migrate.h b/include/linux/migrate.h index 175ef15ae9e8..4bb4e519e3f5 100644 --- a/include/linux/migrate.h +++ b/include/linux/migrate.h @@ -27,6 +27,7 @@ enum migrate_reason { MR_MEMPOLICY_MBIND, MR_NUMA_MISPLACED, MR_CONTIG_RANGE, + MR_LONGTERM_PIN, MR_TYPES }; diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h index 92b44149d5b9..e8922a67d1a4 100644 --- a/include/linux/mmzone.h +++ b/include/linux/mmzone.h @@ -407,8 +407,13 @@ enum zone_type { * to increase the number of THP/huge pages. Notable special cases are: * * 1. Pinned pages: (long-term) pinning of movable pages might - * essentially turn such pages unmovable. Memory offlining might - * retry a long time. + * essentially turn such pages unmovable. Therefore, we do not allow + * pinning long-term pages in ZONE_MOVABLE. When pages are pinned and + * faulted, they come from the right zone right away. However, it is + * still possible that address space already has pages in + * ZONE_MOVABLE at the time when pages are pinned (i.e. user has + * touches that memory before pinning). In such case we migrate them + * to a different zone. When migration fails - pinning fails. * 2. memblock allocations: kernelcore/movablecore setups might create * situations where ZONE_MOVABLE contains unmovable allocations * after boot. Memory offlining and allocations fail early. diff --git a/include/trace/events/migrate.h b/include/trace/events/migrate.h index f2c990603888..9fb2a3bbcdfb 100644 --- a/include/trace/events/migrate.h +++ b/include/trace/events/migrate.h @@ -20,7 +20,8 @@ EM( MR_SYSCALL, "syscall_or_cpuset") \ EM( MR_MEMPOLICY_MBIND, "mempolicy_mbind") \ EM( MR_NUMA_MISPLACED, "numa_misplaced") \ - EMe(MR_CONTIG_RANGE, "contig_range") + EM( MR_CONTIG_RANGE, "contig_range") \ + EMe(MR_LONGTERM_PIN, "longterm_pin") /* * First define the enums in the above macros to be exported to userspace -- cgit v1.2.3 From a08a2ae3461383c2d50d0997dcc6cd1dd1fefb08 Mon Sep 17 00:00:00 2001 From: Oscar Salvador Date: Tue, 4 May 2021 18:39:42 -0700 Subject: mm,memory_hotplug: allocate memmap from the added memory range Physical memory hotadd has to allocate a memmap (struct page array) for the newly added memory section. Currently, alloc_pages_node() is used for those allocations. This has some disadvantages: a) an existing memory is consumed for that purpose (eg: ~2MB per 128MB memory section on x86_64) This can even lead to extreme cases where system goes OOM because the physically hotplugged memory depletes the available memory before it is onlined. b) if the whole node is movable then we have off-node struct pages which has performance drawbacks. c) It might be there are no PMD_ALIGNED chunks so memmap array gets populated with base pages. This can be improved when CONFIG_SPARSEMEM_VMEMMAP is enabled. Vmemap page tables can map arbitrary memory. That means that we can reserve a part of the physically hotadded memory to back vmemmap page tables. This implementation uses the beginning of the hotplugged memory for that purpose. There are some non-obviously things to consider though. Vmemmap pages are allocated/freed during the memory hotplug events (add_memory_resource(), try_remove_memory()) when the memory is added/removed. This means that the reserved physical range is not online although it is used. The most obvious side effect is that pfn_to_online_page() returns NULL for those pfns. The current design expects that this should be OK as the hotplugged memory is considered a garbage until it is onlined. For example hibernation wouldn't save the content of those vmmemmaps into the image so it wouldn't be restored on resume but this should be OK as there no real content to recover anyway while metadata is reachable from other data structures (e.g. vmemmap page tables). The reserved space is therefore (de)initialized during the {on,off}line events (mhp_{de}init_memmap_on_memory). That is done by extracting page allocator independent initialization from the regular onlining path. The primary reason to handle the reserved space outside of {on,off}line_pages is to make each initialization specific to the purpose rather than special case them in a single function. As per above, the functions that are introduced are: - mhp_init_memmap_on_memory: Initializes vmemmap pages by calling move_pfn_range_to_zone(), calls kasan_add_zero_shadow(), and onlines as many sections as vmemmap pages fully span. - mhp_deinit_memmap_on_memory: Offlines as many sections as vmemmap pages fully span, removes the range from zhe zone by remove_pfn_range_from_zone(), and calls kasan_remove_zero_shadow() for the range. The new function memory_block_online() calls mhp_init_memmap_on_memory() before doing the actual online_pages(). Should online_pages() fail, we clean up by calling mhp_deinit_memmap_on_memory(). Adjusting of present_pages is done at the end once we know that online_pages() succedeed. On offline, memory_block_offline() needs to unaccount vmemmap pages from present_pages() before calling offline_pages(). This is necessary because offline_pages() tears down some structures based on the fact whether the node or the zone become empty. If offline_pages() fails, we account back vmemmap pages. If it succeeds, we call mhp_deinit_memmap_on_memory(). Hot-remove: We need to be careful when removing memory, as adding and removing memory needs to be done with the same granularity. To check that this assumption is not violated, we check the memory range we want to remove and if a) any memory block has vmemmap pages and b) the range spans more than a single memory block, we scream out loud and refuse to proceed. If all is good and the range was using memmap on memory (aka vmemmap pages), we construct an altmap structure so free_hugepage_table does the right thing and calls vmem_altmap_free instead of free_pagetable. Link: https://lkml.kernel.org/r/20210421102701.25051-5-osalvador@suse.de Signed-off-by: Oscar Salvador Reviewed-by: David Hildenbrand Acked-by: Michal Hocko Cc: Anshuman Khandual Cc: Pavel Tatashin Cc: Vlastimil Babka Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/memory.h | 8 +++++++- include/linux/memory_hotplug.h | 15 ++++++++++++++- include/linux/memremap.h | 2 +- include/linux/mmzone.h | 7 +++++-- 4 files changed, 27 insertions(+), 5 deletions(-) (limited to 'include') diff --git a/include/linux/memory.h b/include/linux/memory.h index 4da95e684e20..97e92e8b556a 100644 --- a/include/linux/memory.h +++ b/include/linux/memory.h @@ -29,6 +29,11 @@ struct memory_block { int online_type; /* for passing data to online routine */ int nid; /* NID for this memory block */ struct device dev; + /* + * Number of vmemmap pages. These pages + * lay at the beginning of the memory block. + */ + unsigned long nr_vmemmap_pages; }; int arch_get_memory_phys_device(unsigned long start_pfn); @@ -80,7 +85,8 @@ static inline int memory_notify(unsigned long val, void *v) #else extern int register_memory_notifier(struct notifier_block *nb); extern void unregister_memory_notifier(struct notifier_block *nb); -int create_memory_block_devices(unsigned long start, unsigned long size); +int create_memory_block_devices(unsigned long start, unsigned long size, + unsigned long vmemmap_pages); void remove_memory_block_devices(unsigned long start, unsigned long size); extern void memory_dev_init(void); extern int memory_notify(unsigned long val, void *v); diff --git a/include/linux/memory_hotplug.h b/include/linux/memory_hotplug.h index 7288aa5ef73b..28f32fd00fe9 100644 --- a/include/linux/memory_hotplug.h +++ b/include/linux/memory_hotplug.h @@ -55,6 +55,14 @@ typedef int __bitwise mhp_t; */ #define MHP_MERGE_RESOURCE ((__force mhp_t)BIT(0)) +/* + * We want memmap (struct page array) to be self contained. + * To do so, we will use the beginning of the hot-added range to build + * the page tables for the memmap array that describes the entire range. + * Only selected architectures support it with SPARSE_VMEMMAP. + */ +#define MHP_MEMMAP_ON_MEMORY ((__force mhp_t)BIT(1)) + /* * Extended parameters for memory hotplug: * altmap: alternative allocator for memmap array (optional) @@ -99,9 +107,13 @@ static inline void zone_seqlock_init(struct zone *zone) extern int zone_grow_free_lists(struct zone *zone, unsigned long new_nr_pages); extern int zone_grow_waitqueues(struct zone *zone, unsigned long nr_pages); extern int add_one_highpage(struct page *page, int pfn, int bad_ppro); +extern void adjust_present_page_count(struct zone *zone, long nr_pages); /* VM interface that may be used by firmware interface */ +extern int mhp_init_memmap_on_memory(unsigned long pfn, unsigned long nr_pages, + struct zone *zone); +extern void mhp_deinit_memmap_on_memory(unsigned long pfn, unsigned long nr_pages); extern int online_pages(unsigned long pfn, unsigned long nr_pages, - int online_type, int nid); + struct zone *zone); extern struct zone *test_pages_in_a_zone(unsigned long start_pfn, unsigned long end_pfn); extern void __offline_isolated_pages(unsigned long start_pfn, @@ -359,6 +371,7 @@ extern struct zone *zone_for_pfn_range(int online_type, int nid, unsigned start_ extern int arch_create_linear_mapping(int nid, u64 start, u64 size, struct mhp_params *params); void arch_remove_linear_mapping(u64 start, u64 size); +extern bool mhp_supports_memmap_on_memory(unsigned long size); #endif /* CONFIG_MEMORY_HOTPLUG */ #endif /* __LINUX_MEMORY_HOTPLUG_H */ diff --git a/include/linux/memremap.h b/include/linux/memremap.h index f5b464daeeca..45a79da89c5f 100644 --- a/include/linux/memremap.h +++ b/include/linux/memremap.h @@ -17,7 +17,7 @@ struct device; * @alloc: track pages consumed, private to vmemmap_populate() */ struct vmem_altmap { - const unsigned long base_pfn; + unsigned long base_pfn; const unsigned long end_pfn; const unsigned long reserve; unsigned long free; diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h index e8922a67d1a4..917bd6c604d5 100644 --- a/include/linux/mmzone.h +++ b/include/linux/mmzone.h @@ -436,6 +436,11 @@ enum zone_type { * situations where ZERO_PAGE(0) which is allocated differently * on different platforms may end up in a movable zone. ZERO_PAGE(0) * cannot be migrated. + * 7. Memory-hotplug: when using memmap_on_memory and onlining the + * memory to the MOVABLE zone, the vmemmap pages are also placed in + * such zone. Such pages cannot be really moved around as they are + * self-stored in the range, but they are treated as movable when + * the range they describe is about to be offlined. * * In general, no unmovable allocations that degrade memory offlining * should end up in ZONE_MOVABLE. Allocators (like alloc_contig_range()) @@ -1392,10 +1397,8 @@ static inline int online_section_nr(unsigned long nr) #ifdef CONFIG_MEMORY_HOTPLUG void online_mem_sections(unsigned long start_pfn, unsigned long end_pfn); -#ifdef CONFIG_MEMORY_HOTREMOVE void offline_mem_sections(unsigned long start_pfn, unsigned long end_pfn); #endif -#endif static inline struct mem_section *__pfn_to_section(unsigned long pfn) { -- cgit v1.2.3 From 28961998f858114e51d2ae862065b858afcfa2b2 Mon Sep 17 00:00:00 2001 From: Ira Weiny Date: Tue, 4 May 2021 18:40:03 -0700 Subject: iov_iter: lift memzero_page() to highmem.h Patch series "btrfs: Convert kmap/memset/kunmap to memzero_user()". Lifting memzero_user(), convert it to kmap_local_page() and then use it in btrfs. This patch (of 3): memzero_page() can replace the kmap/memset/kunmap pattern in other places in the code. While zero_user() has the same interface it is not the same call and its use should be limited and some of those calls may be better converted from zero_user() to memzero_page().[1] But that is not addressed in this series. Lift memzero_page() to highmem. [1] https://lore.kernel.org/lkml/CAHk-=wijdojzo56FzYqE5TOYw2Vws7ik3LEMGj9SPQaJJ+Z73Q@mail.gmail.com/ Link: https://lkml.kernel.org/r/20210309212137.2610186-1-ira.weiny@intel.com Link: https://lkml.kernel.org/r/20210309212137.2610186-2-ira.weiny@intel.com Signed-off-by: Ira Weiny Cc: Alexander Viro Cc: David Sterba Cc: Chris Mason Cc: Josef Bacik Cc: Chaitanya Kulkarni Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/highmem.h | 7 +++++++ 1 file changed, 7 insertions(+) (limited to 'include') diff --git a/include/linux/highmem.h b/include/linux/highmem.h index 44170f312ae7..832b49b50c7b 100644 --- a/include/linux/highmem.h +++ b/include/linux/highmem.h @@ -332,4 +332,11 @@ static inline void memcpy_to_page(struct page *page, size_t offset, kunmap_local(to); } +static inline void memzero_page(struct page *page, size_t offset, size_t len) +{ + char *addr = kmap_atomic(page); + memset(addr + offset, 0, len); + kunmap_atomic(addr); +} + #endif /* _LINUX_HIGHMEM_H */ -- cgit v1.2.3