From bc0c3357601e3ff1b006600530079bd246ef0d82 Mon Sep 17 00:00:00 2001 From: Mateusz Guzik Date: Wed, 23 Aug 2023 19:05:56 +0200 Subject: mm: remove remnants of SPLIT_RSS_COUNTING The feature got retired in f1a7941243c1 ("mm: convert mm's rss stats into percpu_counter"), but that patch failed to fully clean it up. Link: https://lkml.kernel.org/r/20230823170556.2281747-1-mjguzik@gmail.com Signed-off-by: Mateusz Guzik Acked-by: Shakeel Butt Signed-off-by: Andrew Morton --- mm/memory.c | 2 -- 1 file changed, 2 deletions(-) (limited to 'mm/memory.c') diff --git a/mm/memory.c b/mm/memory.c index 6c264d2f969c..0739ccb00e61 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -471,8 +471,6 @@ static inline void add_mm_rss_vec(struct mm_struct *mm, int *rss) { int i; - if (current->mm == mm) - sync_mm_rss(mm); for (i = 0; i < NR_MM_COUNTERS; i++) if (rss[i]) add_mm_counter(mm, i, rss[i]); -- cgit v1.2.3 From 73eab3ca481e5be0f1fd8140365d604482f84ee1 Mon Sep 17 00:00:00 2001 From: Kefeng Wang Date: Wed, 13 Sep 2023 17:51:27 +0800 Subject: mm: migrate: convert migrate_misplaced_page() to migrate_misplaced_folio() At present, NUMA balancing only supports base pages and PMD-mapped THP, but we will expand it to migrate large folios/PTE-mapped THP in the future. As preparation, make migrate_misplaced_page() take a folio instead of a page and rename it to migrate_misplaced_folio(); this also removes several compound_head() calls. Link: https://lkml.kernel.org/r/20230913095131.2426871-5-wangkefeng.wang@huawei.com Signed-off-by: Kefeng Wang Reviewed-by: Zi Yan Cc: David Hildenbrand Cc: "Huang, Ying" Cc: Hugh Dickins Cc: Matthew Wilcox (Oracle) Cc: Mike Kravetz Signed-off-by: Andrew Morton --- mm/memory.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'mm/memory.c') diff --git a/mm/memory.c b/mm/memory.c index 0739ccb00e61..d956b231e835 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -4812,7 +4812,7 @@ static vm_fault_t do_numa_page(struct vm_fault *vmf) writable = false; /* Migrate to the requested node */ - if (migrate_misplaced_page(page, vma, target_nid)) { + if (migrate_misplaced_folio(page_folio(page), vma, target_nid)) { page_nid = target_nid; flags |= TNF_MIGRATED; } else { -- cgit v1.2.3 From 65610453459f9048678a0daef89d592e412ec00a Mon Sep 17 00:00:00 2001 From: Kefeng Wang Date: Thu, 21 Sep 2023 15:44:12 +0800 Subject: mm: memory: add vm_normal_folio_pmd() Patch series "mm: convert numa balancing functions to use a folio", v2. do_numa_page() only handles non-compound pages, and only PMD-mapped THPs are handled in do_huge_pmd_numa_page(). But large, PTE-mapped folios will be supported, so let's convert more NUMA balancing functions to use/take a folio in preparation for that; no functional change intended for now. This patch (of 6): The new vm_normal_folio_pmd() wrapper is similar to vm_normal_folio(); together they allow callers to completely replace their struct page variables with struct folio variables.
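Editor's illustration (not part of the patch): a PMD-level caller such as do_huge_pmd_numa_page() could consume the new wrapper along these lines, with locking and the surrounding logic elided:

	struct folio *folio;

	folio = vm_normal_folio_pmd(vma, haddr, pmd);
	if (!folio)
		goto out;
	/* operate on the folio, e.g. compare folio_nid(folio) with the target node */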
Link: https://lkml.kernel.org/r/20230921074417.24004-1-wangkefeng.wang@huawei.com Link: https://lkml.kernel.org/r/20230921074417.24004-2-wangkefeng.wang@huawei.com Signed-off-by: Kefeng Wang Cc: David Hildenbrand Cc: "Huang, Ying" Cc: Hugh Dickins Cc: Matthew Wilcox (Oracle) Cc: Mike Kravetz Cc: Zi Yan Signed-off-by: Andrew Morton --- mm/memory.c | 10 ++++++++++ 1 file changed, 10 insertions(+) (limited to 'mm/memory.c') diff --git a/mm/memory.c b/mm/memory.c index d956b231e835..311e862c6404 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -689,6 +689,16 @@ struct page *vm_normal_page_pmd(struct vm_area_struct *vma, unsigned long addr, out: return pfn_to_page(pfn); } + +struct folio *vm_normal_folio_pmd(struct vm_area_struct *vma, + unsigned long addr, pmd_t pmd) +{ + struct page *page = vm_normal_page_pmd(vma, addr, pmd); + + if (page) + return page_folio(page); + return NULL; +} #endif static void restore_exclusive_pte(struct vm_area_struct *vma, -- cgit v1.2.3 From 6695cf68b15c215d33b8add64c33e01e3cbe236c Mon Sep 17 00:00:00 2001 From: Kefeng Wang Date: Thu, 21 Sep 2023 15:44:14 +0800 Subject: mm: memory: use a folio in do_numa_page() NUMA balancing only tries to migrate non-compound pages in do_numa_page(), so use a folio there to save several compound_head() calls. Note that we use folio_estimated_sharers(): it is enough to check the folio's sharers here since only normal pages are handled; if large folio NUMA balancing is supported later, a precise sharers check will be used instead. No functional change intended. Link: https://lkml.kernel.org/r/20230921074417.24004-4-wangkefeng.wang@huawei.com Signed-off-by: Kefeng Wang Cc: David Hildenbrand Cc: "Huang, Ying" Cc: Hugh Dickins Cc: Matthew Wilcox (Oracle) Cc: Mike Kravetz Cc: Zi Yan Signed-off-by: Andrew Morton --- mm/memory.c | 34 +++++++++++++++++----------------- 1 file changed, 17 insertions(+), 17 deletions(-) (limited to 'mm/memory.c') diff --git a/mm/memory.c b/mm/memory.c index 311e862c6404..865741d9b6b9 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -4744,8 +4744,8 @@ int numa_migrate_prep(struct page *page, struct vm_area_struct *vma, static vm_fault_t do_numa_page(struct vm_fault *vmf) { struct vm_area_struct *vma = vmf->vma; - struct page *page = NULL; - int page_nid = NUMA_NO_NODE; + struct folio *folio = NULL; + int nid = NUMA_NO_NODE; bool writable = false; int last_cpupid; int target_nid; @@ -4776,12 +4776,12 @@ static vm_fault_t do_numa_page(struct vm_fault *vmf) can_change_pte_writable(vma, vmf->address, pte)) writable = true; - page = vm_normal_page(vma, vmf->address, pte); - if (!page || is_zone_device_page(page)) + folio = vm_normal_folio(vma, vmf->address, pte); + if (!folio || folio_is_zone_device(folio)) goto out_map; /* TODO: handle PTE-mapped THP */ - if (PageCompound(page)) + if (folio_test_large(folio)) goto out_map; /* @@ -4796,34 +4796,34 @@ static vm_fault_t do_numa_page(struct vm_fault *vmf) flags |= TNF_NO_GROUP; /* - * Flag if the page is shared between multiple address spaces. This + * Flag if the folio is shared between multiple address spaces. This * is later used when determining whether to group tasks together */ - if (page_mapcount(page) > 1 && (vma->vm_flags & VM_SHARED)) + if (folio_estimated_sharers(folio) > 1 && (vma->vm_flags & VM_SHARED)) flags |= TNF_SHARED; - page_nid = page_to_nid(page); + nid = folio_nid(folio); /* * For memory tiering mode, cpupid of slow memory page is used * to record page access time. So use default value.
*/ if ((sysctl_numa_balancing_mode & NUMA_BALANCING_MEMORY_TIERING) && - !node_is_toptier(page_nid)) + !node_is_toptier(nid)) last_cpupid = (-1 & LAST_CPUPID_MASK); else - last_cpupid = page_cpupid_last(page); - target_nid = numa_migrate_prep(page, vma, vmf->address, page_nid, - &flags); + last_cpupid = page_cpupid_last(&folio->page); + target_nid = numa_migrate_prep(&folio->page, vma, vmf->address, nid, + &flags); if (target_nid == NUMA_NO_NODE) { - put_page(page); + folio_put(folio); goto out_map; } pte_unmap_unlock(vmf->pte, vmf->ptl); writable = false; /* Migrate to the requested node */ - if (migrate_misplaced_folio(page_folio(page), vma, target_nid)) { - page_nid = target_nid; + if (migrate_misplaced_folio(folio, vma, target_nid)) { + nid = target_nid; flags |= TNF_MIGRATED; } else { flags |= TNF_MIGRATE_FAIL; @@ -4839,8 +4839,8 @@ static vm_fault_t do_numa_page(struct vm_fault *vmf) } out: - if (page_nid != NUMA_NO_NODE) - task_numa_fault(last_cpupid, page_nid, 1, flags); + if (nid != NUMA_NO_NODE) + task_numa_fault(last_cpupid, nid, 1, flags); return 0; out_map: /* -- cgit v1.2.3 From cda6d93672ac5dd8af778a3f3e6082e12233b65b Mon Sep 17 00:00:00 2001 From: Kefeng Wang Date: Thu, 21 Sep 2023 15:44:15 +0800 Subject: mm: memory: make numa_migrate_prep() take a folio In preparation for large folio NUMA balancing, make numa_migrate_prep() take a folio; no functional change intended. Link: https://lkml.kernel.org/r/20230921074417.24004-5-wangkefeng.wang@huawei.com Signed-off-by: Kefeng Wang Cc: David Hildenbrand Cc: "Huang, Ying" Cc: Hugh Dickins Cc: Matthew Wilcox (Oracle) Cc: Mike Kravetz Cc: Zi Yan Signed-off-by: Andrew Morton --- mm/memory.c | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) (limited to 'mm/memory.c') diff --git a/mm/memory.c b/mm/memory.c index 865741d9b6b9..20b290c9dc87 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -4724,10 +4724,10 @@ static vm_fault_t do_fault(struct vm_fault *vmf) return ret; } -int numa_migrate_prep(struct page *page, struct vm_area_struct *vma, +int numa_migrate_prep(struct folio *folio, struct vm_area_struct *vma, unsigned long addr, int page_nid, int *flags) { - get_page(page); + folio_get(folio); /* Record the current PID acceesing VMA */ vma_set_access_pid_bit(vma); @@ -4738,7 +4738,7 @@ int numa_migrate_prep(struct page *page, struct vm_area_struct *vma, *flags |= TNF_FAULT_LOCAL; } - return mpol_misplaced(page, vma, addr); + return mpol_misplaced(&folio->page, vma, addr); } static vm_fault_t do_numa_page(struct vm_fault *vmf) @@ -4812,8 +4812,7 @@ static vm_fault_t do_numa_page(struct vm_fault *vmf) last_cpupid = (-1 & LAST_CPUPID_MASK); else last_cpupid = page_cpupid_last(&folio->page); - target_nid = numa_migrate_prep(&folio->page, vma, vmf->address, nid, - &flags); + target_nid = numa_migrate_prep(folio, vma, vmf->address, nid, &flags); if (target_nid == NUMA_NO_NODE) { folio_put(folio); goto out_map; -- cgit v1.2.3 From 75c70128a67311070115b90d826a229d4bbbb2b5 Mon Sep 17 00:00:00 2001 From: Kefeng Wang Date: Thu, 21 Sep 2023 15:44:16 +0800 Subject: mm: mempolicy: make mpol_misplaced() take a folio In preparation for large folio NUMA balancing, make mpol_misplaced() take a folio; no functional change intended.
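Editor's condensation (a sketch, simplified from the diffs in this series): after these conversions, the PTE-level NUMA-hinting path is folio-based end to end:

	folio = vm_normal_folio(vma, vmf->address, pte);
	if (!folio || folio_is_zone_device(folio))
		goto out_map;
	nid = folio_nid(folio);
	/* takes a folio reference and ends in mpol_misplaced(folio, vma, addr) */
	target_nid = numa_migrate_prep(folio, vma, vmf->address, nid, &flags);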
Link: https://lkml.kernel.org/r/20230921074417.24004-6-wangkefeng.wang@huawei.com Signed-off-by: Kefeng Wang Cc: David Hildenbrand Cc: "Huang, Ying" Cc: Hugh Dickins Cc: Matthew Wilcox (Oracle) Cc: Mike Kravetz Cc: Zi Yan Signed-off-by: Andrew Morton --- mm/memory.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'mm/memory.c') diff --git a/mm/memory.c b/mm/memory.c index 20b290c9dc87..7ca16ed67634 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -4738,7 +4738,7 @@ int numa_migrate_prep(struct folio *folio, struct vm_area_struct *vma, *flags |= TNF_FAULT_LOCAL; } - return mpol_misplaced(&folio->page, vma, addr); + return mpol_misplaced(folio, vma, addr); } static vm_fault_t do_numa_page(struct vm_fault *vmf) -- cgit v1.2.3 From d61ea1cb009532dcbd77a9d44b812704cec60146 Mon Sep 17 00:00:00 2001 From: Peter Xu Date: Mon, 21 Aug 2023 19:15:13 +0500 Subject: userfaultfd: UFFD_FEATURE_WP_ASYNC MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Patch series "Implement IOCTL to get and optionally clear info about PTEs", v33. *Motivation* The real motivation for adding the PAGEMAP_SCAN IOCTL is to emulate the Windows GetWriteWatch() and ResetWriteWatch() syscalls [1]. GetWriteWatch() retrieves the addresses of the pages that have been written to in a region of virtual memory. This syscall is used in Windows applications and games etc., and it is currently emulated in a pretty slow manner in userspace. Our purpose is to enhance the kernel so that this emulation becomes efficient. Currently some out-of-tree hack patches are used to emulate it efficiently in some kernels; we intend to replace those with these patches, so gaming on Linux as a whole can benefit. It means there would be many users of this code. The CRIU use case [2] was mentioned by Andrei and Danylo: > Use cases for migrating sparse VMAs are binaries sanitized with ASAN, > MSAN or TSAN [3]. All of these sanitizers produce sparse mappings of > shadow memory [4]. Being able to migrate such binaries allows to highly > reduce the amount of work needed to identify and fix post-migration > crashes, which happen constantly. Andrei defines the following uses of this code: * it is more granular and allows us to track changed pages more effectively. The current interface can clear dirty bits for the entire process only. In addition, reading info about pages is a separate operation. It means we must freeze the process to read information about all its pages, reset dirty bits, and only then can we start dumping pages. The information about pages becomes more and more outdated while we are processing pages. The new interface solves both these downsides. First, it allows us to read pte bits and clear the soft-dirty bit atomically. It means that CRIU will not need to freeze processes to pre-dump their memory. Second, it clears soft-dirty bits for a specified region of memory. It means CRIU will have up-to-date info about pages at the moment of dumping them. * The new interface has to be much faster because basic page filtering is happening in the kernel. With the old interface, we have to read the pagemap for each page.
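To make the intended usage concrete, here is a hedged userspace sketch of async write tracking with uffd-wp (an editorial illustration, not part of this series' diffs; "addr" and "len" are assumed to describe an existing mapping, the constants come from <linux/userfaultfd.h>, and error handling is elided):

	int uffd = syscall(__NR_userfaultfd, O_CLOEXEC);
	struct uffdio_api api = { .api = UFFD_API,
		.features = UFFD_FEATURE_WP_ASYNC | UFFD_FEATURE_WP_UNPOPULATED };
	ioctl(uffd, UFFDIO_API, &api);

	struct uffdio_register reg = {
		.range = { .start = (unsigned long)addr, .len = len },
		.mode = UFFDIO_REGISTER_MODE_WP,
	};
	ioctl(uffd, UFFDIO_REGISTER, &reg);

	/* Start tracking: write-protect the whole range. */
	struct uffdio_writeprotect wp = {
		.range = { .start = (unsigned long)addr, .len = len },
		.mode = UFFDIO_WRITEPROTECT_MODE_WP,
	};
	ioctl(uffd, UFFDIO_WRITEPROTECT, &wp);
	/*
	 * With WP_ASYNC, subsequent writes clear the uffd-wp bit in the
	 * kernel instead of faulting to userspace; written pages can later
	 * be harvested (e.g. via the PAGEMAP_SCAN ioctl added later in this
	 * series), and UFFDIO_WRITEPROTECT can be repeated to reset tracking.
	 */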
*Implementation Evolution (Short Summary)* From the definition of GetWriteWatch(), we felt the kernel's soft-dirty feature could be used under the hood with some additions, like: * reset the soft-dirty flag for only a specific region of memory instead of clearing the flag for the entire process * get and clear the soft-dirty flag for a specific region atomically So we decided to use an ioctl on the pagemap file to read and/or reset the soft-dirty flag. But using the soft-dirty flag, we sometimes got extra pages which weren't even written: they had become soft-dirty because of VMA merging and the VM_SOFTDIRTY flag. This breaks the semantics of GetWriteWatch(). We were able to bypass this shortcoming by ignoring VM_SOFTDIRTY, until David reported that mprotect etc. mess up the soft-dirty flag when VM_SOFTDIRTY is ignored [5]. This wasn't happening until [6] got introduced. We discussed whether we could revert those patches, but could not reach any conclusion. So at this point I made a couple of attempts to solve the whole VM_SOFTDIRTY issue by correcting the soft-dirty implementation: * [7] Correct the bug wrongly fixed back in 2014. It had the potential to cause a regression, so we left it behind. * [8] Keep a list of the soft-dirty parts of a VMA across splits and merges. The reply was: don't increase the size of the VMA by 8 bytes. At this point we left soft-dirty behind, considering it too delicate, and userfaultfd [9] seemed like the only way forward. From there onward, we have based the soft-dirty emulation on the userfaultfd wp feature, where the kernel resolves the faults itself when the WP_ASYNC feature is used. It was straightforward to add the WP_ASYNC feature to userfaultfd. Now we only see as dirty or written-to those pages which were really written. (PS: another userfaultfd feature, WP_UNPOPULATED, is required to avoid pre-faulting memory before write-protecting [9].) All the different masks were added at the request of the CRIU devs to make the interface more generic and better. [1] https://learn.microsoft.com/en-us/windows/win32/api/memoryapi/nf-memoryapi-getwritewatch [2] https://lore.kernel.org/all/20221014134802.1361436-1-mdanylo@google.com [3] https://github.com/google/sanitizers [4] https://github.com/google/sanitizers/wiki/AddressSanitizerAlgorithm#64-bit [5] https://lore.kernel.org/all/bfcae708-db21-04b4-0bbe-712badd03071@redhat.com [6] https://lore.kernel.org/all/20220725142048.30450-1-peterx@redhat.com/ [7] https://lore.kernel.org/all/20221122115007.2787017-1-usama.anjum@collabora.com [8] https://lore.kernel.org/all/20221220162606.1595355-1-usama.anjum@collabora.com [9] https://lore.kernel.org/all/20230306213925.617814-1-peterx@redhat.com [10] https://lore.kernel.org/all/20230125144529.1630917-1-mdanylo@google.com This patch (of 6): Add a new userfaultfd-wp feature, UFFD_FEATURE_WP_ASYNC, that allows userfaultfd wr-protect faults to be resolved by the kernel directly. It can be used as a high-accuracy version of soft-dirty, without VMA modifications during tracking, and with ranged support by default rather than whole-mm resets, thanks to the existence of ioctl(UFFDIO_WRITEPROTECT). Several goals of such a dirty tracking interface: 1. All types of memory should be supported and traceable. This is natural for soft-dirty but worth mentioning in the userfaultfd context, because it used to support only anon/shmem/hugetlb. For a dirty tracking purpose these three types may not be enough, and it's legal to track anything, e.g. any page cache writes from mmap. 2.
Protections can be applied to part of a memory range without VMA split/merge fuss. The hope is that the tracking itself will not affect the VMA layout. It also helps when a reset happens, because the reset will not need the mmap write lock, which could block the tracee. 3. Accuracy needs to be maintained. This means we need pte markers to work on any type of VMA. One could object that the whole concept of async dirty tracking is fundamentally not what userfaultfd used to be: it's not "a fault to be serviced by userspace" anymore. However, using userfaultfd-wp here as a framework is convenient for us in at least these ways: 1. The VM_UFFD_WP vma flag has a very good name to suit something like this, so we don't need VM_YET_ANOTHER_SOFT_DIRTY; just use a new feature bit to distinguish it from a sync uffd-wp registration. 2. The PTE markers logic can be leveraged across the whole kernel to maintain the uffd-wp bit as long as the arch supports it. This also applies here, where the uffd-wp bit serves as a hint to dirty information and will not be lost easily (e.g. when some page cache ptes get zapped). 3. The ioctl(UFFDIO_WRITEPROTECT) interface can be reused for either starting or resetting a range of memory; there is no counterpart in the old soft-dirty world, so a new design would otherwise need a new interface. The commonality makes sense because uffd-wp was fundamentally a similar idea of write-protecting pages, just like soft-dirty. This implementation allows WP_ASYNC to imply WP_UNPOPULATED, because so far WP_ASYNC does not seem usable without WP_UNPOPULATED. This also gives us a chance to modify the WP_ASYNC implementation in case it no longer depends on WP_UNPOPULATED in future kernels. The implication is also fine because both features rely on the PTE_MARKER_UFFD_WP config option, so they'll show up together (or both be missing) in a UFFDIO_API probe. vma_can_userfault() now allows any VMA if the userfaultfd registration is only about async uffd-wp. So we can track dirtiness for all kinds of memory, including generic file systems (like XFS, EXT4 or BTRFS). One trick worth mentioning in do_wp_page(): we need to manually update vmf->orig_pte there because it can be used later in a pte_same() check - this path always has FAULT_FLAG_ORIG_PTE_VALID set in the flags. The major defect of this approach to dirty tracking is that we need to populate the page tables when tracking starts; soft-dirty doesn't need that. This is unwanted when the range of memory to track is huge and unpopulated (e.g., tracking updates on a 10G file with mmap() on top, without having any page cache installed yet). One way to improve this is to allow pte markers to exist at levels above PTE (PMD+). That will not change the interface if implemented, so we can leave it for later. Link: https://lkml.kernel.org/r/20230821141518.870589-1-usama.anjum@collabora.com Link: https://lkml.kernel.org/r/20230821141518.870589-2-usama.anjum@collabora.com Signed-off-by: Peter Xu Co-developed-by: Muhammad Usama Anjum Signed-off-by: Muhammad Usama Anjum Cc: Alex Sierra Cc: Al Viro Cc: Andrei Vagin Cc: Axel Rasmussen Cc: Christian Brauner Cc: Cyrill Gorcunov Cc: Dan Williams Cc: David Hildenbrand Cc: Greg Kroah-Hartman Cc: Gustavo A. R. Silva Cc: "Liam R. Howlett"
Cc: Matthew Wilcox Cc: Michal Miroslaw Cc: Mike Rapoport (IBM) Cc: Nadav Amit Cc: Pasha Tatashin Cc: Paul Gofman Cc: Shuah Khan Cc: Suren Baghdasaryan Cc: Vlastimil Babka Cc: Yang Shi Cc: Yun Zhou Cc: Michał Mirosław Signed-off-by: Andrew Morton --- mm/memory.c | 27 ++++++++++++++++++++++++--- 1 file changed, 24 insertions(+), 3 deletions(-) (limited to 'mm/memory.c') diff --git a/mm/memory.c b/mm/memory.c index 2d209bc4d18c..d0f61d729fb4 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -3349,11 +3349,28 @@ static vm_fault_t do_wp_page(struct vm_fault *vmf) const bool unshare = vmf->flags & FAULT_FLAG_UNSHARE; struct vm_area_struct *vma = vmf->vma; struct folio *folio = NULL; + pte_t pte; if (likely(!unshare)) { if (userfaultfd_pte_wp(vma, ptep_get(vmf->pte))) { - pte_unmap_unlock(vmf->pte, vmf->ptl); - return handle_userfault(vmf, VM_UFFD_WP); + if (!userfaultfd_wp_async(vma)) { + pte_unmap_unlock(vmf->pte, vmf->ptl); + return handle_userfault(vmf, VM_UFFD_WP); + } + + /* + * Nothing needed (cache flush, TLB invalidations, + * etc.) because we're only removing the uffd-wp bit, + * which is completely invisible to the user. + */ + pte = pte_clear_uffd_wp(ptep_get(vmf->pte)); + + set_pte_at(vma->vm_mm, vmf->address, vmf->pte, pte); + /* + * Update this to be prepared for following up CoW + * handling + */ + vmf->orig_pte = pte; } /* @@ -4879,8 +4896,11 @@ static inline vm_fault_t wp_huge_pmd(struct vm_fault *vmf) if (vma_is_anonymous(vma)) { if (likely(!unshare) && - userfaultfd_huge_pmd_wp(vma, vmf->orig_pmd)) + userfaultfd_huge_pmd_wp(vma, vmf->orig_pmd)) { + if (userfaultfd_wp_async(vmf->vma)) + goto split; return handle_userfault(vmf, VM_UFFD_WP); + } return do_huge_pmd_wp_page(vmf); } @@ -4892,6 +4912,7 @@ static inline vm_fault_t wp_huge_pmd(struct vm_fault *vmf) } } +split: /* COW or write-notify handled on pte level: split pmd. */ __split_huge_pmd(vma, vmf->pmd, vmf->address, false, NULL); -- cgit v1.2.3 From 164b06f238b986317131e6b61b2f22aabcbc2cc0 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Fri, 6 Oct 2023 20:53:14 +0100 Subject: mm: call wp_page_copy() under the VMA lock It is usually safe to call wp_page_copy() under the VMA lock. The only unsafe situation is when no anon_vma has been allocated for this VMA, and we have to look at adjacent VMAs to determine if their anon_vma can be shared. Since this happens only for the first COW of a page in this VMA, the majority of calls to wp_page_copy() do not need to fall back to the mmap_lock. Add vmf_anon_prepare() as an alternative to anon_vma_prepare() which will return VM_FAULT_RETRY if we currently hold the VMA lock and need to allocate an anon_vma. This lets us drop the check in do_wp_page().
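The resulting caller pattern is simply (a sketch; the actual wp_page_copy() hunk follows below):

	ret = vmf_anon_prepare(vmf);
	if (unlikely(ret))
		goto out;	/* VM_FAULT_RETRY under the VMA lock, or VM_FAULT_OOM */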
Link: https://lkml.kernel.org/r/20231006195318.4087158-3-willy@infradead.org Signed-off-by: Matthew Wilcox (Oracle) Reviewed-by: Suren Baghdasaryan Signed-off-by: Andrew Morton --- mm/memory.c | 39 ++++++++++++++++++++++++++------------- 1 file changed, 26 insertions(+), 13 deletions(-) (limited to 'mm/memory.c') diff --git a/mm/memory.c b/mm/memory.c index d0f61d729fb4..c437d3562d18 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -3042,6 +3042,21 @@ static inline void wp_page_reuse(struct vm_fault *vmf) count_vm_event(PGREUSE); } +static vm_fault_t vmf_anon_prepare(struct vm_fault *vmf) +{ + struct vm_area_struct *vma = vmf->vma; + + if (likely(vma->anon_vma)) + return 0; + if (vmf->flags & FAULT_FLAG_VMA_LOCK) { + vma_end_read(vma); + return VM_FAULT_RETRY; + } + if (__anon_vma_prepare(vma)) + return VM_FAULT_OOM; + return 0; +} + /* * Handle the case of a page which we actually need to copy to a new page, * either due to COW or unsharing. @@ -3069,27 +3084,29 @@ static vm_fault_t wp_page_copy(struct vm_fault *vmf) pte_t entry; int page_copied = 0; struct mmu_notifier_range range; - int ret; + vm_fault_t ret; delayacct_wpcopy_start(); if (vmf->page) old_folio = page_folio(vmf->page); - if (unlikely(anon_vma_prepare(vma))) - goto oom; + ret = vmf_anon_prepare(vmf); + if (unlikely(ret)) + goto out; if (is_zero_pfn(pte_pfn(vmf->orig_pte))) { new_folio = vma_alloc_zeroed_movable_folio(vma, vmf->address); if (!new_folio) goto oom; } else { + int err; new_folio = vma_alloc_folio(GFP_HIGHUSER_MOVABLE, 0, vma, vmf->address, false); if (!new_folio) goto oom; - ret = __wp_page_copy_user(&new_folio->page, vmf->page, vmf); - if (ret) { + err = __wp_page_copy_user(&new_folio->page, vmf->page, vmf); + if (err) { /* * COW failed, if the fault was solved by other, * it's fine. If not, userspace would re-fault on @@ -3102,7 +3119,7 @@ static vm_fault_t wp_page_copy(struct vm_fault *vmf) folio_put(old_folio); delayacct_wpcopy_end(); - return ret == -EHWPOISON ? VM_FAULT_HWPOISON : 0; + return err == -EHWPOISON ? VM_FAULT_HWPOISON : 0; } kmsan_copy_page_meta(&new_folio->page, vmf->page); } @@ -3212,11 +3229,13 @@ static vm_fault_t wp_page_copy(struct vm_fault *vmf) oom_free_new: folio_put(new_folio); oom: + ret = VM_FAULT_OOM; +out: if (old_folio) folio_put(old_folio); delayacct_wpcopy_end(); - return VM_FAULT_OOM; + return ret; } /** @@ -3458,12 +3477,6 @@ reuse: return 0; } copy: - if ((vmf->flags & FAULT_FLAG_VMA_LOCK) && !vma->anon_vma) { - pte_unmap_unlock(vmf->pte, vmf->ptl); - vma_end_read(vmf->vma); - return VM_FAULT_RETRY; - } - /* * Ok, we need to copy. Oh, well.. */ -- cgit v1.2.3 From 4ed4379881aa62588aba6442a9f362a8cf7624e6 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Fri, 6 Oct 2023 20:53:15 +0100 Subject: mm: handle shared faults under the VMA lock There are many implementations of ->fault and some of them depend on mmap_lock being held. All vm_ops that implement ->map_pages() end up calling filemap_fault(), which I have audited to be sure it does not rely on mmap_lock. So (for now) key off ->map_pages existing as a flag to indicate that it's safe to call ->fault while only holding the vma lock. 
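Editor's illustration (not from the patch): a filemap-backed mapping already satisfies this heuristic because its vm_operations_struct provides ->map_pages; the hypothetical example_file_vm_ops below only shows the shape:

	static const struct vm_operations_struct example_file_vm_ops = {
		.fault		= filemap_fault,
		.map_pages	= filemap_map_pages,	/* signals ->fault is VMA-lock safe */
	};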
Link: https://lkml.kernel.org/r/20231006195318.4087158-4-willy@infradead.org Signed-off-by: Matthew Wilcox (Oracle) Reviewed-by: Suren Baghdasaryan Signed-off-by: Andrew Morton --- mm/memory.c | 22 ++++++++++++++++++---- 1 file changed, 18 insertions(+), 4 deletions(-) (limited to 'mm/memory.c') diff --git a/mm/memory.c b/mm/memory.c index c437d3562d18..7b240a67f207 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -3042,6 +3042,21 @@ static inline void wp_page_reuse(struct vm_fault *vmf) count_vm_event(PGREUSE); } +/* + * We could add a bitflag somewhere, but for now, we know that all + * vm_ops that have a ->map_pages have been audited and don't need + * the mmap_lock to be held. + */ +static inline vm_fault_t vmf_can_call_fault(const struct vm_fault *vmf) +{ + struct vm_area_struct *vma = vmf->vma; + + if (vma->vm_ops->map_pages || !(vmf->flags & FAULT_FLAG_VMA_LOCK)) + return 0; + vma_end_read(vma); + return VM_FAULT_RETRY; +} + static vm_fault_t vmf_anon_prepare(struct vm_fault *vmf) { struct vm_area_struct *vma = vmf->vma; @@ -4669,10 +4684,9 @@ static vm_fault_t do_shared_fault(struct vm_fault *vmf) vm_fault_t ret, tmp; struct folio *folio; - if (vmf->flags & FAULT_FLAG_VMA_LOCK) { - vma_end_read(vma); - return VM_FAULT_RETRY; - } + ret = vmf_can_call_fault(vmf); + if (ret) + return ret; ret = __do_fault(vmf); if (unlikely(ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE | VM_FAULT_RETRY))) -- cgit v1.2.3 From 4de8c93a4751e10737b6af65db42c743228c67a6 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Fri, 6 Oct 2023 20:53:16 +0100 Subject: mm: handle COW faults under the VMA lock If the page is not currently present in the page tables, we need to call the page fault handler to find out which page we're supposed to COW, so we need to both check that there is already an anon_vma and that the fault handler doesn't need the mmap_lock. Link: https://lkml.kernel.org/r/20231006195318.4087158-5-willy@infradead.org Signed-off-by: Matthew Wilcox (Oracle) Reviewed-by: Suren Baghdasaryan Signed-off-by: Andrew Morton --- mm/memory.c | 12 +++++------- 1 file changed, 5 insertions(+), 7 deletions(-) (limited to 'mm/memory.c') diff --git a/mm/memory.c b/mm/memory.c index 7b240a67f207..be9469a29413 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -4639,13 +4639,11 @@ static vm_fault_t do_cow_fault(struct vm_fault *vmf) struct vm_area_struct *vma = vmf->vma; vm_fault_t ret; - if (vmf->flags & FAULT_FLAG_VMA_LOCK) { - vma_end_read(vma); - return VM_FAULT_RETRY; - } - - if (unlikely(anon_vma_prepare(vma))) - return VM_FAULT_OOM; + ret = vmf_can_call_fault(vmf); + if (!ret) + ret = vmf_anon_prepare(vmf); + if (ret) + return ret; vmf->cow_page = alloc_page_vma(GFP_HIGHUSER_MOVABLE, vma, vmf->address); if (!vmf->cow_page) -- cgit v1.2.3 From 12214eba1992642eee5813a9cc9f626e5b2d1815 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Fri, 6 Oct 2023 20:53:17 +0100 Subject: mm: handle read faults under the VMA lock Most file-backed faults are already handled through ->map_pages(), but if we need to do I/O we'll come this way. Since filemap_fault() is now safe to be called under the VMA lock, we can handle these faults under the VMA lock now. 
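Editor's condensation: with this and the previous two patches applied, the three fault handlers share essentially the same prologue (a sketch; the COW path additionally needs an anon_vma):

	/* do_read_fault() and do_shared_fault() */
	ret = vmf_can_call_fault(vmf);
	if (ret)
		return ret;

	/* do_cow_fault() */
	ret = vmf_can_call_fault(vmf);
	if (!ret)
		ret = vmf_anon_prepare(vmf);
	if (ret)
		return ret;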
Link: https://lkml.kernel.org/r/20231006195318.4087158-6-willy@infradead.org Signed-off-by: Matthew Wilcox (Oracle) Reviewed-by: Suren Baghdasaryan Signed-off-by: Andrew Morton --- mm/memory.c | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) (limited to 'mm/memory.c') diff --git a/mm/memory.c b/mm/memory.c index be9469a29413..ada461b82a75 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -4617,10 +4617,9 @@ static vm_fault_t do_read_fault(struct vm_fault *vmf) return ret; } - if (vmf->flags & FAULT_FLAG_VMA_LOCK) { - vma_end_read(vmf->vma); - return VM_FAULT_RETRY; - } + ret = vmf_can_call_fault(vmf); + if (ret) + return ret; ret = __do_fault(vmf); if (unlikely(ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE | VM_FAULT_RETRY))) -- cgit v1.2.3 From 4a68fef16df9d88d528094116f8bbd2dbfa62089 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Fri, 6 Oct 2023 20:53:18 +0100 Subject: mm: handle write faults to RO pages under the VMA lock I think this is a pretty rare occurrence, but for consistency handle faults with the VMA lock held the same way that we handle other faults with the VMA lock held. Link: https://lkml.kernel.org/r/20231006195318.4087158-7-willy@infradead.org Signed-off-by: Matthew Wilcox (Oracle) Reviewed-by: Suren Baghdasaryan Signed-off-by: Andrew Morton --- mm/memory.c | 13 ++++++------- 1 file changed, 6 insertions(+), 7 deletions(-) (limited to 'mm/memory.c') diff --git a/mm/memory.c b/mm/memory.c index ada461b82a75..ceffe96f2a0e 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -3301,10 +3301,9 @@ static vm_fault_t wp_pfn_shared(struct vm_fault *vmf) vm_fault_t ret; pte_unmap_unlock(vmf->pte, vmf->ptl); - if (vmf->flags & FAULT_FLAG_VMA_LOCK) { - vma_end_read(vmf->vma); - return VM_FAULT_RETRY; - } + ret = vmf_can_call_fault(vmf); + if (ret) + return ret; vmf->flags |= FAULT_FLAG_MKWRITE; ret = vma->vm_ops->pfn_mkwrite(vmf); @@ -3328,10 +3327,10 @@ static vm_fault_t wp_page_shared(struct vm_fault *vmf, struct folio *folio) vm_fault_t tmp; pte_unmap_unlock(vmf->pte, vmf->ptl); - if (vmf->flags & FAULT_FLAG_VMA_LOCK) { + tmp = vmf_can_call_fault(vmf); + if (tmp) { folio_put(folio); - vma_end_read(vmf->vma); - return VM_FAULT_RETRY; + return tmp; } tmp = do_page_mkwrite(vmf, folio); -- cgit v1.2.3 From 5ca432896a4ce6d69fffc3298b24c0dd9bdb871f Mon Sep 17 00:00:00 2001 From: David Hildenbrand Date: Mon, 2 Oct 2023 16:29:47 +0200 Subject: mm/rmap: move SetPageAnonExclusive() out of page_move_anon_rmap() Patch series "mm/rmap: convert page_move_anon_rmap() to folio_move_anon_rmap()". Convert page_move_anon_rmap() to folio_move_anon_rmap(), letting the callers handle PageAnonExclusive. I'm including cleanup patch #3 because it fits into the picture and can be done cleaner by the conversion. This patch (of 3): Let's move it into the caller: there is a difference between whether an anon folio can only be mapped by one process (e.g., into one VMA), and whether it is truly exclusive (e.g., no references -- including GUP -- from other processes). Further, for large folios the page might not actually be pointing at the head page of the folio, so it better be handled in the caller. This is a preparation for converting page_move_anon_rmap() to consume a folio. 
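In code terms, the division of labour after this patch and the next one looks roughly like this (editor's sketch, simplified from the hunks below):

	folio_move_anon_rmap(folio, vma);	/* rmap side: retarget the anon_vma only */
	SetPageAnonExclusive(vmf->page);	/* caller side: subpage-precise exclusivity */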
Link: https://lkml.kernel.org/r/20231002142949.235104-1-david@redhat.com Link: https://lkml.kernel.org/r/20231002142949.235104-2-david@redhat.com Signed-off-by: David Hildenbrand Reviewed-by: Suren Baghdasaryan Reviewed-by: Vishal Moola (Oracle) Cc: Mike Kravetz Cc: Muchun Song Cc: Matthew Wilcox Signed-off-by: Andrew Morton --- mm/memory.c | 1 + 1 file changed, 1 insertion(+) (limited to 'mm/memory.c') diff --git a/mm/memory.c b/mm/memory.c index ceffe96f2a0e..21fba1c9a6c7 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -3481,6 +3481,7 @@ static vm_fault_t do_wp_page(struct vm_fault *vmf) * sunglasses. Hit it. */ page_move_anon_rmap(vmf->page, vma); + SetPageAnonExclusive(vmf->page); folio_unlock(folio); reuse: if (unlikely(unshare)) { -- cgit v1.2.3 From 069686255c16a75b6a796e42df47f5af27b496a4 Mon Sep 17 00:00:00 2001 From: David Hildenbrand Date: Mon, 2 Oct 2023 16:29:48 +0200 Subject: mm/rmap: convert page_move_anon_rmap() to folio_move_anon_rmap() Let's convert it to consume a folio. [akpm@linux-foundation.org: fix kerneldoc] Link: https://lkml.kernel.org/r/20231002142949.235104-3-david@redhat.com Signed-off-by: David Hildenbrand Reviewed-by: Suren Baghdasaryan Reviewed-by: Vishal Moola (Oracle) Cc: Mike Kravetz Cc: Muchun Song Cc: Matthew Wilcox Signed-off-by: Andrew Morton --- mm/memory.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'mm/memory.c') diff --git a/mm/memory.c b/mm/memory.c index 21fba1c9a6c7..6c67828f934c 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -3480,7 +3480,7 @@ static vm_fault_t do_wp_page(struct vm_fault *vmf) * and the folio is locked, it's dark out, and we're wearing * sunglasses. Hit it. */ - page_move_anon_rmap(vmf->page, vma); + folio_move_anon_rmap(folio, vma); SetPageAnonExclusive(vmf->page); folio_unlock(folio); reuse: -- cgit v1.2.3 From dec078cc2181fccf8b134406b86aaacc19f7163f Mon Sep 17 00:00:00 2001 From: David Hildenbrand Date: Mon, 2 Oct 2023 16:29:49 +0200 Subject: memory: move exclusivity detection in do_wp_page() into wp_can_reuse_anon_folio() Let's clean up do_wp_page() a bit, removing two labels and making it a bit easier to read. wp_can_reuse_anon_folio() now only operates on the whole folio. Move the SetPageAnonExclusive() out into do_wp_page(). No need to do this under page lock -- the page table lock is sufficient. Link: https://lkml.kernel.org/r/20231002142949.235104-4-david@redhat.com Signed-off-by: David Hildenbrand Cc: Mike Kravetz Cc: Muchun Song Cc: Suren Baghdasaryan Cc: Matthew Wilcox Signed-off-by: Andrew Morton --- mm/memory.c | 88 +++++++++++++++++++++++++++++++------------------------------ 1 file changed, 45 insertions(+), 43 deletions(-) (limited to 'mm/memory.c') diff --git a/mm/memory.c b/mm/memory.c index 6c67828f934c..22784d3b886d 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -3355,6 +3355,44 @@ static vm_fault_t wp_page_shared(struct vm_fault *vmf, struct folio *folio) return ret; } +static bool wp_can_reuse_anon_folio(struct folio *folio, + struct vm_area_struct *vma) +{ + /* + * We have to verify under folio lock: these early checks are + * just an optimization to avoid locking the folio and freeing + * the swapcache if there is little hope that we can reuse. + * + * KSM doesn't necessarily raise the folio refcount. + */ + if (folio_test_ksm(folio) || folio_ref_count(folio) > 3) + return false; + if (!folio_test_lru(folio)) + /* + * We cannot easily detect+handle references from + * remote LRU caches or references to LRU folios.
+ */ + lru_add_drain(); + if (folio_ref_count(folio) > 1 + folio_test_swapcache(folio)) + return false; + if (!folio_trylock(folio)) + return false; + if (folio_test_swapcache(folio)) + folio_free_swap(folio); + if (folio_test_ksm(folio) || folio_ref_count(folio) != 1) { + folio_unlock(folio); + return false; + } + /* + * Ok, we've got the only folio reference from our mapping + * and the folio is locked, it's dark out, and we're wearing + * sunglasses. Hit it. + */ + folio_move_anon_rmap(folio, vma); + folio_unlock(folio); + return true; +} + /* * This routine handles present pages, when * * users try to write to a shared page (FAULT_FLAG_WRITE) @@ -3441,49 +3479,14 @@ static vm_fault_t do_wp_page(struct vm_fault *vmf) /* * Private mapping: create an exclusive anonymous page copy if reuse * is impossible. We might miss VM_WRITE for FOLL_FORCE handling. + * + * If we encounter a page that is marked exclusive, we must reuse + * the page without further checks. */ - if (folio && folio_test_anon(folio)) { - /* - * If the page is exclusive to this process we must reuse the - * page without further checks. - */ - if (PageAnonExclusive(vmf->page)) - goto reuse; - - /* - * We have to verify under folio lock: these early checks are - * just an optimization to avoid locking the folio and freeing - * the swapcache if there is little hope that we can reuse. - * - * KSM doesn't necessarily raise the folio refcount. - */ - if (folio_test_ksm(folio) || folio_ref_count(folio) > 3) - goto copy; - if (!folio_test_lru(folio)) - /* - * We cannot easily detect+handle references from - * remote LRU caches or references to LRU folios. - */ - lru_add_drain(); - if (folio_ref_count(folio) > 1 + folio_test_swapcache(folio)) - goto copy; - if (!folio_trylock(folio)) - goto copy; - if (folio_test_swapcache(folio)) - folio_free_swap(folio); - if (folio_test_ksm(folio) || folio_ref_count(folio) != 1) { - folio_unlock(folio); - goto copy; - } - /* - * Ok, we've got the only folio reference from our mapping - * and the folio is locked, it's dark out, and we're wearing - * sunglasses. Hit it. - */ - folio_move_anon_rmap(folio, vma); - SetPageAnonExclusive(vmf->page); - folio_unlock(folio); -reuse: + if (folio && folio_test_anon(folio) && + (PageAnonExclusive(vmf->page) || wp_can_reuse_anon_folio(folio, vma))) { + if (!PageAnonExclusive(vmf->page)) + SetPageAnonExclusive(vmf->page); if (unlikely(unshare)) { pte_unmap_unlock(vmf->pte, vmf->ptl); return 0; @@ -3491,7 +3494,6 @@ reuse: wp_page_reuse(vmf); return 0; } -copy: /* * Ok, we need to copy. Oh, well.. */ -- cgit v1.2.3 From c43cfa42541c04a3a94312e39ab81c41ba431277 Mon Sep 17 00:00:00 2001 From: Lorenzo Stoakes Date: Tue, 3 Oct 2023 00:14:51 +0100 Subject: mm: make __access_remote_vm() static Patch series "various improvements to the GUP interface", v2. A series of fixes to simplify and improve the GUP interface with an eye to providing groundwork to future improvements:- * __access_remote_vm() and access_remote_vm() are functionally identical, so make the former static such that in future we can potentially change the external-facing implementation details of this function. * Extend is_valid_gup_args() to cover the missing FOLL_TOUCH case, and simplify things by defining INTERNAL_GUP_FLAGS to check against. * Adjust __get_user_pages_locked() to explicitly treat a failure to pin any pages as an error in all circumstances other than FOLL_NOWAIT being specified, bringing it in line with the nommu implementation of this function. 
* (With many thanks to Arnd who suggested this in the first instance) Update get_user_page_vma_remote() to explicitly only return a page or an error, simplifying the interface and avoiding the questionable IS_ERR_OR_NULL() pattern. This patch (of 4): access_remote_vm() passes through parameters to __access_remote_vm() directly, so remove the __access_remote_vm() function from mm.h and use access_remote_vm() in the one caller that needs it (ptrace_access_vm()). This allows future adjustments to the GUP-internal __access_remote_vm() function while keeping the access_remote_vm() function stable. Link: https://lkml.kernel.org/r/cover.1696288092.git.lstoakes@gmail.com Link: https://lkml.kernel.org/r/f7877c5039ce1c202a514a8aeeefc5cdd5e32d19.1696288092.git.lstoakes@gmail.com Signed-off-by: Lorenzo Stoakes Reviewed-by: Arnd Bergmann Reviewed-by: David Hildenbrand Reviewed-by: Jason Gunthorpe Cc: Adrian Hunter Cc: Alexander Shishkin Cc: Arnaldo Carvalho de Melo Cc: Catalin Marinas Cc: Ian Rogers Cc: Ingo Molnar Cc: Jiri Olsa Cc: John Hubbard Cc: Mark Rutland Cc: Namhyung Kim Cc: Oleg Nesterov Cc: Peter Zijlstra Cc: Richard Cochran Cc: Will Deacon Signed-off-by: Andrew Morton --- mm/memory.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'mm/memory.c') diff --git a/mm/memory.c b/mm/memory.c index 22784d3b886d..e47c36c0aef0 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -5791,8 +5791,8 @@ EXPORT_SYMBOL_GPL(generic_access_phys); /* * Access another process' address space as given in mm. */ -int __access_remote_vm(struct mm_struct *mm, unsigned long addr, void *buf, - int len, unsigned int gup_flags) +static int __access_remote_vm(struct mm_struct *mm, unsigned long addr, + void *buf, int len, unsigned int gup_flags) { void *old_buf = buf; int write = gup_flags & FOLL_WRITE; -- cgit v1.2.3 From 6a1960b8a8773324d870fa32ba68ff3106523a95 Mon Sep 17 00:00:00 2001 From: Lorenzo Stoakes Date: Tue, 3 Oct 2023 00:14:54 +0100 Subject: mm/gup: adapt get_user_page_vma_remote() to never return NULL get_user_pages_remote() will never return 0 except in the case of FOLL_NOWAIT being specified, which we explicitly disallow. This simplifies error handling for the caller and avoids the awkwardness of dealing with both errors and failing to pin. Failing to pin here is an error. 
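A hedged sketch of the simplified caller contract (the actual hunk follows below): the helper now returns either a page or an ERR_PTR(), never NULL, so IS_ERR() suffices:

	struct page *page = get_user_page_vma_remote(mm, addr, gup_flags, &vma);

	if (IS_ERR(page))
		return PTR_ERR(page);	/* no IS_ERR_OR_NULL() dance needed */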
Link: https://lkml.kernel.org/r/00319ce292d27b3aae76a0eb220ce3f528187508.1696288092.git.lstoakes@gmail.com Signed-off-by: Lorenzo Stoakes Suggested-by: Arnd Bergmann Reviewed-by: Arnd Bergmann Acked-by: Catalin Marinas Reviewed-by: David Hildenbrand Reviewed-by: Jason Gunthorpe Cc: Adrian Hunter Cc: Alexander Shishkin Cc: Arnaldo Carvalho de Melo Cc: Ian Rogers Cc: Ingo Molnar Cc: Jiri Olsa Cc: John Hubbard Cc: Mark Rutland Cc: Namhyung Kim Cc: Oleg Nesterov Cc: Peter Zijlstra Cc: Richard Cochran Cc: Will Deacon Signed-off-by: Andrew Morton --- mm/memory.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'mm/memory.c') diff --git a/mm/memory.c b/mm/memory.c index e47c36c0aef0..1f88e4f6fbf2 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -5815,7 +5815,7 @@ static int __access_remote_vm(struct mm_struct *mm, unsigned long addr, struct page *page = get_user_page_vma_remote(mm, addr, gup_flags, &vma); - if (IS_ERR_OR_NULL(page)) { + if (IS_ERR(page)) { /* We might need to expand the stack to access it */ vma = vma_lookup(mm, addr); if (!vma) { @@ -5829,7 +5829,6 @@ static int __access_remote_vm(struct mm_struct *mm, unsigned long addr, continue; } - /* * Check if this is a VM_IO | VM_PFNMAP VMA, which * we can access using slightly different code. -- cgit v1.2.3 From 67b33e3ff58374b3fca929933ccc04a1858fda6a Mon Sep 17 00:00:00 2001 From: Kefeng Wang Date: Wed, 18 Oct 2023 22:07:50 +0800 Subject: mm: memory: use folio_last_cpupid() in do_numa_page() Convert to use folio_last_cpupid() in do_numa_page(). Link: https://lkml.kernel.org/r/20231018140806.2783514-4-wangkefeng.wang@huawei.com Signed-off-by: Kefeng Wang Cc: David Hildenbrand Cc: Huang Ying Cc: Ingo Molnar Cc: Juri Lelli Cc: Matthew Wilcox (Oracle) Cc: Peter Zijlstra Cc: Vincent Guittot Cc: Zi Yan Signed-off-by: Andrew Morton --- mm/memory.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'mm/memory.c') diff --git a/mm/memory.c b/mm/memory.c index 1f88e4f6fbf2..0915eac772fe 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -4858,7 +4858,7 @@ static vm_fault_t do_numa_page(struct vm_fault *vmf) !node_is_toptier(nid)) last_cpupid = (-1 & LAST_CPUPID_MASK); else - last_cpupid = page_cpupid_last(&folio->page); + last_cpupid = folio_last_cpupid(folio); target_nid = numa_migrate_prep(folio, vma, vmf->address, nid, &flags); if (target_nid == NUMA_NO_NODE) { folio_put(folio); -- cgit v1.2.3 From c08b7e3830dbf24dd2552ddeea84f00393842f1b Mon Sep 17 00:00:00 2001 From: Kefeng Wang Date: Wed, 18 Oct 2023 22:08:03 +0800 Subject: mm: make finish_mkwrite_fault() static Make finish_mkwrite_fault static since it is not used outside of memory.c. Link: https://lkml.kernel.org/r/20231018140806.2783514-17-wangkefeng.wang@huawei.com Signed-off-by: Kefeng Wang Cc: David Hildenbrand Cc: Huang Ying Cc: Ingo Molnar Cc: Juri Lelli Cc: Matthew Wilcox (Oracle) Cc: Peter Zijlstra Cc: Vincent Guittot Cc: Zi Yan Signed-off-by: Andrew Morton --- mm/memory.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'mm/memory.c') diff --git a/mm/memory.c b/mm/memory.c index 0915eac772fe..2b6e2b239411 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -3269,7 +3269,7 @@ out: * Return: %0 on success, %VM_FAULT_NOPAGE when PTE got changed before * we acquired PTE lock. 
*/ -vm_fault_t finish_mkwrite_fault(struct vm_fault *vmf) +static vm_fault_t finish_mkwrite_fault(struct vm_fault *vmf) { WARN_ON_ONCE(!(vmf->vma->vm_flags & VM_SHARED)); vmf->pte = pte_offset_map_lock(vmf->vma->vm_mm, vmf->pmd, vmf->address, -- cgit v1.2.3 From a86bc96b77df40c27ead5ef4ac3837904b7eb53f Mon Sep 17 00:00:00 2001 From: Kefeng Wang Date: Wed, 18 Oct 2023 22:08:04 +0800 Subject: mm: convert wp_page_reuse() and finish_mkwrite_fault() to take a folio Saves one compound_head() call, also in preparation for page_cpupid_xchg_last() conversion. Link: https://lkml.kernel.org/r/20231018140806.2783514-18-wangkefeng.wang@huawei.com Signed-off-by: Kefeng Wang Cc: David Hildenbrand Cc: Huang Ying Cc: Ingo Molnar Cc: Juri Lelli Cc: Matthew Wilcox (Oracle) Cc: Peter Zijlstra Cc: Vincent Guittot Cc: Zi Yan Signed-off-by: Andrew Morton --- mm/memory.c | 19 ++++++++++--------- 1 file changed, 10 insertions(+), 9 deletions(-) (limited to 'mm/memory.c') diff --git a/mm/memory.c b/mm/memory.c index 2b6e2b239411..94791c5e2951 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -3015,7 +3015,7 @@ static vm_fault_t fault_dirty_shared_page(struct vm_fault *vmf) * case, all we need to do here is to mark the page as writable and update * any related book-keeping. */ -static inline void wp_page_reuse(struct vm_fault *vmf) +static inline void wp_page_reuse(struct vm_fault *vmf, struct folio *folio) __releases(vmf->ptl) { struct vm_area_struct *vma = vmf->vma; @@ -3023,7 +3023,7 @@ static inline void wp_page_reuse(struct vm_fault *vmf) pte_t entry; VM_BUG_ON(!(vmf->flags & FAULT_FLAG_WRITE)); - VM_BUG_ON(page && PageAnon(page) && !PageAnonExclusive(page)); + VM_BUG_ON(folio && folio_test_anon(folio) && !PageAnonExclusive(page)); /* * Clear the pages cpupid information as the existing @@ -3258,6 +3258,7 @@ out: * writeable once the page is prepared * * @vmf: structure describing the fault + * @folio: the folio of vmf->page * * This function handles all that is needed to finish a write page fault in a * shared mapping due to PTE being read-only once the mapped page is prepared. @@ -3269,7 +3270,7 @@ out: * Return: %0 on success, %VM_FAULT_NOPAGE when PTE got changed before * we acquired PTE lock. 
*/ -static vm_fault_t finish_mkwrite_fault(struct vm_fault *vmf) +static vm_fault_t finish_mkwrite_fault(struct vm_fault *vmf, struct folio *folio) { WARN_ON_ONCE(!(vmf->vma->vm_flags & VM_SHARED)); vmf->pte = pte_offset_map_lock(vmf->vma->vm_mm, vmf->pmd, vmf->address, @@ -3285,7 +3286,7 @@ static vm_fault_t finish_mkwrite_fault(struct vm_fault *vmf) pte_unmap_unlock(vmf->pte, vmf->ptl); return VM_FAULT_NOPAGE; } - wp_page_reuse(vmf); + wp_page_reuse(vmf, folio); return 0; } @@ -3309,9 +3310,9 @@ static vm_fault_t wp_pfn_shared(struct vm_fault *vmf) ret = vma->vm_ops->pfn_mkwrite(vmf); if (ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE)) return ret; - return finish_mkwrite_fault(vmf); + return finish_mkwrite_fault(vmf, NULL); } - wp_page_reuse(vmf); + wp_page_reuse(vmf, NULL); return 0; } @@ -3339,14 +3340,14 @@ static vm_fault_t wp_page_shared(struct vm_fault *vmf, struct folio *folio) folio_put(folio); return tmp; } - tmp = finish_mkwrite_fault(vmf); + tmp = finish_mkwrite_fault(vmf, folio); if (unlikely(tmp & (VM_FAULT_ERROR | VM_FAULT_NOPAGE))) { folio_unlock(folio); folio_put(folio); return tmp; } } else { - wp_page_reuse(vmf); + wp_page_reuse(vmf, folio); folio_lock(folio); } ret |= fault_dirty_shared_page(vmf); @@ -3491,7 +3492,7 @@ static vm_fault_t do_wp_page(struct vm_fault *vmf) pte_unmap_unlock(vmf->pte, vmf->ptl); return 0; } - wp_page_reuse(vmf); + wp_page_reuse(vmf, folio); return 0; } /* -- cgit v1.2.3 From c2c3b5148052cef670d359b81d338d20b96bf47f Mon Sep 17 00:00:00 2001 From: Kefeng Wang Date: Wed, 18 Oct 2023 22:08:05 +0800 Subject: mm: use folio_xchg_last_cpupid() in wp_page_reuse() Convert to use folio_xchg_last_cpupid() in wp_page_reuse(), and remove page variable. Since now only normal and PMD-mapped page is handled by numa balancing, it's enough to only update the entire folio's last cpupid. Link: https://lkml.kernel.org/r/20231018140806.2783514-19-wangkefeng.wang@huawei.com Signed-off-by: Kefeng Wang Cc: David Hildenbrand Cc: Huang Ying Cc: Ingo Molnar Cc: Juri Lelli Cc: Matthew Wilcox (Oracle) Cc: Peter Zijlstra Cc: Vincent Guittot Cc: Zi Yan Signed-off-by: Andrew Morton --- mm/memory.c | 19 ++++++++++--------- 1 file changed, 10 insertions(+), 9 deletions(-) (limited to 'mm/memory.c') diff --git a/mm/memory.c b/mm/memory.c index 94791c5e2951..1f18ed4a5497 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -3019,19 +3019,20 @@ static inline void wp_page_reuse(struct vm_fault *vmf, struct folio *folio) __releases(vmf->ptl) { struct vm_area_struct *vma = vmf->vma; - struct page *page = vmf->page; pte_t entry; VM_BUG_ON(!(vmf->flags & FAULT_FLAG_WRITE)); - VM_BUG_ON(folio && folio_test_anon(folio) && !PageAnonExclusive(page)); - /* - * Clear the pages cpupid information as the existing - * information potentially belongs to a now completely - * unrelated process. - */ - if (page) - page_cpupid_xchg_last(page, (1 << LAST_CPUPID_SHIFT) - 1); + if (folio) { + VM_BUG_ON(folio_test_anon(folio) && + !PageAnonExclusive(vmf->page)); + /* + * Clear the folio's cpupid information as the existing + * information potentially belongs to a now completely + * unrelated process. + */ + folio_xchg_last_cpupid(folio, (1 << LAST_CPUPID_SHIFT) - 1); + } flush_cache_page(vma, vmf->address, pte_pfn(vmf->orig_pte)); entry = pte_mkyoung(vmf->orig_pte); -- cgit v1.2.3
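For context (an editorial note, not part of the patch): the value written above is the "invalid cpupid" sentinel, all bits set below LAST_CPUPID_SHIFT, so the next NUMA hinting fault sees no stale task association:

	/* Mark the folio's last accessor as unknown after reuse. */
	folio_xchg_last_cpupid(folio, (1 << LAST_CPUPID_SHIFT) - 1);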