From fcb72a585aaa4caced555e98f8444e6162912cb7 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Tue, 6 Sep 2022 19:48:58 +0000 Subject: perf: use VMA iterator The VMA iterator is faster than the linked list and removing the linked list will shrink the vm_area_struct. Link: https://lkml.kernel.org/r/20220906194824.2110408-48-Liam.Howlett@oracle.com Signed-off-by: Matthew Wilcox (Oracle) Signed-off-by: Liam R. Howlett Acked-by: Vlastimil Babka Reviewed-by: Davidlohr Bueso Tested-by: Yu Zhao Cc: Catalin Marinas Cc: David Hildenbrand Cc: David Howells Cc: SeongJae Park Cc: Sven Schnelle Cc: Will Deacon Signed-off-by: Andrew Morton --- kernel/events/core.c | 3 ++- kernel/events/uprobes.c | 9 ++++++--- 2 files changed, 8 insertions(+), 4 deletions(-) (limited to 'kernel/events') diff --git a/kernel/events/core.c b/kernel/events/core.c index 2621fd24ad26..101c5912c3fc 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -10229,8 +10229,9 @@ static void perf_addr_filter_apply(struct perf_addr_filter *filter, struct perf_addr_filter_range *fr) { struct vm_area_struct *vma; + VMA_ITERATOR(vmi, mm, 0); - for (vma = mm->mmap; vma; vma = vma->vm_next) { + for_each_vma(vmi, vma) { if (!vma->vm_file) continue; diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c index 2eaa327f8158..401bc2d24ce0 100644 --- a/kernel/events/uprobes.c +++ b/kernel/events/uprobes.c @@ -349,9 +349,10 @@ static bool valid_ref_ctr_vma(struct uprobe *uprobe, static struct vm_area_struct * find_ref_ctr_vma(struct uprobe *uprobe, struct mm_struct *mm) { + VMA_ITERATOR(vmi, mm, 0); struct vm_area_struct *tmp; - for (tmp = mm->mmap; tmp; tmp = tmp->vm_next) + for_each_vma(vmi, tmp) if (valid_ref_ctr_vma(uprobe, tmp)) return tmp; @@ -1231,11 +1232,12 @@ int uprobe_apply(struct inode *inode, loff_t offset, static int unapply_uprobe(struct uprobe *uprobe, struct mm_struct *mm) { + VMA_ITERATOR(vmi, mm, 0); struct vm_area_struct *vma; int err = 0; mmap_read_lock(mm); - for (vma = mm->mmap; vma; vma = vma->vm_next) { + for_each_vma(vmi, vma) { unsigned long vaddr; loff_t offset; @@ -1983,9 +1985,10 @@ bool uprobe_deny_signal(void) static void mmf_recalc_uprobes(struct mm_struct *mm) { + VMA_ITERATOR(vmi, mm, 0); struct vm_area_struct *vma; - for (vma = mm->mmap; vma; vma = vma->vm_next) { + for_each_vma(vmi, vma) { if (!valid_vma(vma, false)) continue; /* -- cgit v1.2.3 From 5fcd079af9ed4e69cca0a2f77c6255d0eb8a8cca Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Fri, 2 Sep 2022 20:46:40 +0100 Subject: uprobes: use folios more widely in __replace_page() Remove a few hidden calls to compound_head(). Link: https://lkml.kernel.org/r/20220902194653.1739778-45-willy@infradead.org Signed-off-by: Matthew Wilcox (Oracle) Signed-off-by: Andrew Morton --- kernel/events/uprobes.c | 19 ++++++++++--------- 1 file changed, 10 insertions(+), 9 deletions(-) (limited to 'kernel/events') diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c index 401bc2d24ce0..70375c7c0c4b 100644 --- a/kernel/events/uprobes.c +++ b/kernel/events/uprobes.c @@ -19,7 +19,7 @@ #include #include /* anon_vma_prepare */ #include /* set_pte_at_notify */ -#include /* try_to_free_swap */ +#include /* folio_free_swap */ #include /* user_enable_single_step */ #include /* notifier mechanism */ #include "../../mm/internal.h" /* munlock_vma_page */ @@ -154,8 +154,9 @@ static loff_t vaddr_to_offset(struct vm_area_struct *vma, unsigned long vaddr) static int __replace_page(struct vm_area_struct *vma, unsigned long addr, struct page *old_page, struct page *new_page) { + struct folio *old_folio = page_folio(old_page); struct mm_struct *mm = vma->vm_mm; - DEFINE_FOLIO_VMA_WALK(pvmw, page_folio(old_page), vma, addr, 0); + DEFINE_FOLIO_VMA_WALK(pvmw, old_folio, vma, addr, 0); int err; struct mmu_notifier_range range; @@ -169,8 +170,8 @@ static int __replace_page(struct vm_area_struct *vma, unsigned long addr, return err; } - /* For try_to_free_swap() below */ - lock_page(old_page); + /* For folio_free_swap() below */ + folio_lock(old_folio); mmu_notifier_invalidate_range_start(&range); err = -EAGAIN; @@ -186,7 +187,7 @@ static int __replace_page(struct vm_area_struct *vma, unsigned long addr, /* no new page, just dec_mm_counter for old_page */ dec_mm_counter(mm, MM_ANONPAGES); - if (!PageAnon(old_page)) { + if (!folio_test_anon(old_folio)) { dec_mm_counter(mm, mm_counter_file(old_page)); inc_mm_counter(mm, MM_ANONPAGES); } @@ -198,15 +199,15 @@ static int __replace_page(struct vm_area_struct *vma, unsigned long addr, mk_pte(new_page, vma->vm_page_prot)); page_remove_rmap(old_page, vma, false); - if (!page_mapped(old_page)) - try_to_free_swap(old_page); + if (!folio_mapped(old_folio)) + folio_free_swap(old_folio); page_vma_mapped_walk_done(&pvmw); - put_page(old_page); + folio_put(old_folio); err = 0; unlock: mmu_notifier_invalidate_range_end(&range); - unlock_page(old_page); + folio_unlock(old_folio); return err; } -- cgit v1.2.3 From 82e66bf76173a1525db9866455a7fdbc07b57297 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Fri, 2 Sep 2022 20:46:52 +0100 Subject: uprobes: use new_folio in __replace_page() Saves several calls to compound_head(). Link: https://lkml.kernel.org/r/20220902194653.1739778-57-willy@infradead.org Signed-off-by: Matthew Wilcox (Oracle) Signed-off-by: Andrew Morton --- kernel/events/uprobes.c | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) (limited to 'kernel/events') diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c index 70375c7c0c4b..e0a9b945e7bc 100644 --- a/kernel/events/uprobes.c +++ b/kernel/events/uprobes.c @@ -155,6 +155,7 @@ static int __replace_page(struct vm_area_struct *vma, unsigned long addr, struct page *old_page, struct page *new_page) { struct folio *old_folio = page_folio(old_page); + struct folio *new_folio; struct mm_struct *mm = vma->vm_mm; DEFINE_FOLIO_VMA_WALK(pvmw, old_folio, vma, addr, 0); int err; @@ -164,8 +165,8 @@ static int __replace_page(struct vm_area_struct *vma, unsigned long addr, addr + PAGE_SIZE); if (new_page) { - err = mem_cgroup_charge(page_folio(new_page), vma->vm_mm, - GFP_KERNEL); + new_folio = page_folio(new_page); + err = mem_cgroup_charge(new_folio, vma->vm_mm, GFP_KERNEL); if (err) return err; } @@ -180,9 +181,9 @@ static int __replace_page(struct vm_area_struct *vma, unsigned long addr, VM_BUG_ON_PAGE(addr != pvmw.address, old_page); if (new_page) { - get_page(new_page); + folio_get(new_folio); page_add_new_anon_rmap(new_page, vma, addr); - lru_cache_add_inactive_or_unevictable(new_page, vma); + folio_add_lru_vma(new_folio, vma); } else /* no new page, just dec_mm_counter for old_page */ dec_mm_counter(mm, MM_ANONPAGES); -- cgit v1.2.3 From 34488399fa08faaf664743fa54b271eb6f9e1321 Mon Sep 17 00:00:00 2001 From: Zach O'Keefe Date: Thu, 22 Sep 2022 15:40:39 -0700 Subject: mm/madvise: add file and shmem support to MADV_COLLAPSE Add support for MADV_COLLAPSE to collapse shmem-backed and file-backed memory into THPs (requires CONFIG_READ_ONLY_THP_FOR_FS=y). On success, the backing memory will be a hugepage. For the memory range and process provided, the page tables will synchronously have a huge pmd installed, mapping the THP. Other mappings of the file extent mapped by the memory range may be added to a set of entries that khugepaged will later process and attempt update their page tables to map the THP by a pmd. This functionality unlocks two important uses: (1) Immediately back executable text by THPs. Current support provided by CONFIG_READ_ONLY_THP_FOR_FS may take a long time on a large system which might impair services from serving at their full rated load after (re)starting. Tricks like mremap(2)'ing text onto anonymous memory to immediately realize iTLB performance prevents page sharing and demand paging, both of which increase steady state memory footprint. Now, we can have the best of both worlds: Peak upfront performance and lower RAM footprints. (2) userfaultfd-based live migration of virtual machines satisfy UFFD faults by fetching native-sized pages over the network (to avoid latency of transferring an entire hugepage). However, after guest memory has been fully copied to the new host, MADV_COLLAPSE can be used to immediately increase guest performance. Since khugepaged is single threaded, this change now introduces possibility of collapse contexts racing in file collapse path. There a important few places to consider: (1) hpage_collapse_scan_file(), when we xas_pause() and drop RCU. We could have the memory collapsed out from under us, but the next xas_for_each() iteration will correctly pick up the hugepage. The hugepage might not be up to date (insofar as copying of small page contents might not have completed - the page still may be locked), but regardless what small page index we were iterating over, we'll find the hugepage and identify it as a suitably aligned compound page of order HPAGE_PMD_ORDER. In khugepaged path, we locklessly check the value of the pmd, and only add it to deferred collapse array if we find pmd mapping pte table. This is fine, since other values that could have raced in right afterwards denote failure, or that the memory was successfully collapsed, so we don't need further processing. In madvise path, we'll take mmap_lock() in write to serialize against page table updates and will know what to do based on the true value of the pmd: recheck all ptes if we point to a pte table, directly install the pmd, if the pmd has been cleared, but memory not yet faulted, or nothing at all if we find a huge pmd. It's worth putting emphasis here on how we treat the none pmd here. If khugepaged has processed this mm's page tables already, it will have left the pmd cleared (ready for refault by the process). Depending on the VMA flags and sysfs settings, amount of RAM on the machine, and the current load, could be a relatively common occurrence - and as such is one we'd like to handle successfully in MADV_COLLAPSE. When we see the none pmd in collapse_pte_mapped_thp(), we've locked mmap_lock in write and checked (a) huepaged_vma_check() to see if the backing memory is appropriate still, along with VMA sizing and appropriate hugepage alignment within the file, and (b) we've found a hugepage head of order HPAGE_PMD_ORDER at the offset in the file mapped by our hugepage-aligned virtual address. Even though the common-case is likely race with khugepaged, given these checks (regardless how we got here - we could be operating on a completely different file than originally checked in hpage_collapse_scan_file() for all we know) it should be safe to directly make the pmd a huge pmd pointing to this hugepage. (2) collapse_file() is mostly serialized on the same file extent by lock sequence: | lock hupepage | lock mapping->i_pages | lock 1st page | unlock mapping->i_pages | | lock mapping->i_pages | page_ref_freeze(3) | xas_store(hugepage) | unlock mapping->i_pages | page_ref_unfreeze(1) | unlock 1st page V unlock hugepage Once a context (who already has their fresh hugepage locked) locks mapping->i_pages exclusively, it will hold said lock until it locks the first page, and it will hold that lock until the after the hugepage has been added to the page cache (and will unlock the hugepage after page table update, though that isn't important here). A racing context that loses the race for mapping->i_pages will then lose the race to locking the first page. Here - depending on how far the other racing context has gotten - we might find the new hugepage (in which case we'll exit cleanly when we check PageTransCompound()), or we'll find the "old" 1st small page (in which we'll exit cleanly when we discover unexpected refcount of 2 after isolate_lru_page()). This is assuming we are able to successfully lock the page we find - in shmem path, we could just fail the trylock and exit cleanly anyways. Failure path in collapse_file() is similar: once we hold lock on 1st small page, we are serialized against other collapse contexts. Before the 1st small page is unlocked, we add it back to the pagecache and unfreeze the refcount appropriately. Contexts who lost the race to the 1st small page will then find the same 1st small page with the correct refcount and will be able to proceed. [zokeefe@google.com: don't check pmd value twice in collapse_pte_mapped_thp()] Link: https://lkml.kernel.org/r/20220927033854.477018-1-zokeefe@google.com [shy828301@gmail.com: Delete hugepage_vma_revalidate_anon(), remove check for multi-add in khugepaged_add_pte_mapped_thp()] Link: https://lore.kernel.org/linux-mm/CAHbLzkrtpM=ic7cYAHcqkubah5VTR8N5=k5RT8MTvv5rN1Y91w@mail.gmail.com/ Link: https://lkml.kernel.org/r/20220907144521.3115321-4-zokeefe@google.com Link: https://lkml.kernel.org/r/20220922224046.1143204-4-zokeefe@google.com Signed-off-by: Zach O'Keefe Cc: Axel Rasmussen Cc: Chris Kennelly Cc: David Hildenbrand Cc: David Rientjes Cc: Hugh Dickins Cc: James Houghton Cc: "Kirill A. Shutemov" Cc: Matthew Wilcox Cc: Miaohe Lin Cc: Minchan Kim Cc: Pasha Tatashin Cc: Peter Xu Cc: Rongwei Wang Cc: SeongJae Park Cc: Song Liu Cc: Vlastimil Babka Cc: Yang Shi Signed-off-by: Andrew Morton --- kernel/events/uprobes.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel/events') diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c index e0a9b945e7bc..d9e357b7e17c 100644 --- a/kernel/events/uprobes.c +++ b/kernel/events/uprobes.c @@ -555,7 +555,7 @@ put_old: /* try collapse pmd for compound page */ if (!ret && orig_page_huge) - collapse_pte_mapped_thp(mm, vaddr); + collapse_pte_mapped_thp(mm, vaddr, false); return ret; } -- cgit v1.2.3