From 954652b9f33bb1892ea4448479d78779e4a8ae13 Mon Sep 17 00:00:00 2001
From: Anthony Yznaga
Date: Tue, 29 Aug 2023 17:45:49 -0700
Subject: mm/mremap: fix unaccount of memory on vma_merge() failure

Fix mremap so that only accounted memory is unaccounted if the mapping is
expandable but vma_merge() fails.

Link: https://lkml.kernel.org/r/20230830004549.16131-1-anthony.yznaga@oracle.com
Fixes: fdbef6149135 ("mm/mremap: don't account pages in vma_to_resize()")
Signed-off-by: Anthony Yznaga
Acked-by: Brian Geffon
Signed-off-by: Andrew Morton
---
 mm/mremap.c | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

(limited to 'mm/mremap.c')

diff --git a/mm/mremap.c b/mm/mremap.c
index 382e81c33fc4..fbb4861964f6 100644
--- a/mm/mremap.c
+++ b/mm/mremap.c
@@ -1037,12 +1037,14 @@ SYSCALL_DEFINE5(mremap, unsigned long, addr, unsigned long, old_len,
             pgoff_t extension_pgoff = vma->vm_pgoff +
                 ((extension_start - vma->vm_start) >> PAGE_SHIFT);
             VMA_ITERATOR(vmi, mm, extension_start);
+            long charged = 0;
 
             if (vma->vm_flags & VM_ACCOUNT) {
                 if (security_vm_enough_memory_mm(mm, pages)) {
                     ret = -ENOMEM;
                     goto out;
                 }
+                charged = pages;
             }
 
             /*
@@ -1058,7 +1060,7 @@ SYSCALL_DEFINE5(mremap, unsigned long, addr, unsigned long, old_len,
                 vma->vm_file, extension_pgoff, vma_policy(vma),
                 vma->vm_userfaultfd_ctx, anon_vma_name(vma));
             if (!vma) {
-                vm_unacct_memory(pages);
+                vm_unacct_memory(charged);
                 ret = -ENOMEM;
                 goto out;
             }
--
cgit v1.2.3

From af8ca1c149069176e6322a77b532e3ffd99ccffe Mon Sep 17 00:00:00 2001
From: "Joel Fernandes (Google)"
Date: Sun, 3 Sep 2023 15:13:22 +0000
Subject: mm/mremap: optimize the start addresses in move_page_tables()

Patch series "Optimize mremap during mutual alignment within PMD", v6.

This patchset optimizes the start addresses in move_page_tables() and
tests the changes.  It addresses a warning [1] that occurs due to a
downward, overlapping move on a mutually-aligned offset within a PMD
during exec.  By initiating the copy process at the PMD level when such
alignment is present, we can prevent this warning and speed up the
copying process at the same time.  Linus Torvalds suggested this idea.

Check the individual patches for more details.

[1] https://lore.kernel.org/all/ZB2GTBD%2FLWTrkOiO@dhcp22.suse.cz/

This patch (of 7):

Recently, we have seen reports [1] of a warning that triggers due to
move_page_tables() doing a downward and overlapping move on a
mutually-aligned offset within a PMD.  By mutual alignment, I mean the
source and destination addresses of the mremap are at the same offset
within a PMD.

This mutual alignment, along with the fact that the move is downward, is
sufficient to cause a warning related to having an allocated PMD that
does not have PTEs in it.

This warning will only trigger when there is mutual alignment in the
move operation.  A solution, as suggested by Linus Torvalds [2], is to
initiate the copy process at the PMD level whenever such alignment is
present.  Implementing this approach will not only prevent the warning
from being triggered, but it will also optimize the operation as this
method should enhance the speed of the copy process whenever there's a
possibility to start copying at the PMD level.

Some more points:

a. The optimization can be done only when both the source and
destination of the mremap do not have anything mapped below them up to a
PMD boundary.  I add support to detect that.

b. Point (a) is not a problem for the call to move_page_tables() from
exec.c as nothing is expected to be mapped below the source.  However,
for non-overlapping mutually aligned moves as triggered by mremap(2), I
added support for checking such cases.

c. I currently only optimize for PMD moves; in the future, we can build
on this work and do PUD moves as well if there is a need for this, but I
want to take it one step at a time.

d. We need to be careful about mremap of ranges within the VMA itself.
For this purpose, I added checks to determine if the address after
alignment falls within the VMA itself.

[1] https://lore.kernel.org/all/ZB2GTBD%2FLWTrkOiO@dhcp22.suse.cz/
[2] https://lore.kernel.org/all/CAHk-=whd7msp8reJPfeGNyt0LiySMT0egExx3TVZSX3Ok6X=9g@mail.gmail.com/

Link: https://lkml.kernel.org/r/20230903151328.2981432-1-joel@joelfernandes.org
Link: https://lkml.kernel.org/r/20230903151328.2981432-2-joel@joelfernandes.org
Signed-off-by: Joel Fernandes (Google)
Reviewed-by: Lorenzo Stoakes
Suggested-by: Linus Torvalds
Cc: Kalesh Singh
Cc: "Kirill A. Shutemov"
Cc: Liam R. Howlett
Cc: Lokesh Gidra
Cc: Michal Hocko
Cc: Paul E. McKenney
Cc: Shuah Khan
Cc: Suren Baghdasaryan
Cc: Vlastimil Babka
Signed-off-by: Andrew Morton
---
 mm/mremap.c | 62 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 62 insertions(+)

(limited to 'mm/mremap.c')

diff --git a/mm/mremap.c b/mm/mremap.c
index fbb4861964f6..e2b65a17148e 100644
--- a/mm/mremap.c
+++ b/mm/mremap.c
@@ -489,6 +489,53 @@ static bool move_pgt_entry(enum pgt_entry entry, struct vm_area_struct *vma,
     return moved;
 }
 
+/*
+ * A helper to check if a previous mapping exists. Required for
+ * move_page_tables() and realign_addr() to determine if a previous mapping
+ * exists before we can do realignment optimizations.
+ */
+static bool can_align_down(struct vm_area_struct *vma, unsigned long addr_to_align,
+               unsigned long mask)
+{
+    unsigned long addr_masked = addr_to_align & mask;
+
+    /*
+     * If @addr_to_align of either source or destination is not the beginning
+     * of the corresponding VMA, we can't align down or we will destroy part
+     * of the current mapping.
+     */
+    if (vma->vm_start != addr_to_align)
+        return false;
+
+    /*
+     * Make sure the realignment doesn't cause the address to fall on an
+     * existing mapping.
+     */
+    return find_vma_intersection(vma->vm_mm, addr_masked, vma->vm_start) == NULL;
+}
+
+/* Opportunistically realign to specified boundary for faster copy. */
+static void try_realign_addr(unsigned long *old_addr, struct vm_area_struct *old_vma,
+                 unsigned long *new_addr, struct vm_area_struct *new_vma,
+                 unsigned long mask)
+{
+    /* Skip if the addresses are already aligned. */
+    if ((*old_addr & ~mask) == 0)
+        return;
+
+    /* Only realign if the new and old addresses are mutually aligned. */
+    if ((*old_addr & ~mask) != (*new_addr & ~mask))
+        return;
+
+    /* Ensure realignment doesn't cause overlap with existing mappings. */
+    if (!can_align_down(old_vma, *old_addr, mask) ||
+        !can_align_down(new_vma, *new_addr, mask))
+        return;
+
+    *old_addr = *old_addr & mask;
+    *new_addr = *new_addr & mask;
+}
+
 unsigned long move_page_tables(struct vm_area_struct *vma,
         unsigned long old_addr, struct vm_area_struct *new_vma,
         unsigned long new_addr, unsigned long len,
@@ -508,6 +555,14 @@ unsigned long move_page_tables(struct vm_area_struct *vma,
         return move_hugetlb_page_tables(vma, new_vma, old_addr,
                         new_addr, len);
 
+    /*
+     * If possible, realign addresses to PMD boundary for faster copy.
+     * Only realign if the mremap copying hits a PMD boundary.
+     */
+    if ((vma != new_vma)
+        && (len >= PMD_SIZE - (old_addr & ~PMD_MASK)))
+        try_realign_addr(&old_addr, vma, &new_addr, new_vma, PMD_MASK);
+
     flush_cache_range(vma, old_addr, old_end);
     mmu_notifier_range_init(&range, MMU_NOTIFY_UNMAP, 0, vma->vm_mm,
                 old_addr, old_end);
@@ -577,6 +632,13 @@ again:
 
     mmu_notifier_invalidate_range_end(&range);
 
+    /*
+     * Prevent negative return values when {old,new}_addr was realigned
+     * but we broke out of the above loop for the first PMD itself.
+     */
+    if (len + old_addr < old_end)
+        return 0;
+
     return len + old_addr - old_end;    /* how much done */
 }
--
cgit v1.2.3
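
To make the "mutual alignment" condition above concrete, here is a minimal
userspace sketch of an mremap(2) whose source and destination sit at the
same offset within a PMD.  It assumes a 2 MiB PMD size (as on x86_64); the
addresses, sizes, and layout are purely illustrative and error handling is
kept minimal.

/* Illustrative sketch only: assumes 2 MiB PMDs; build with: gcc -O2 -o mutual mutual.c */
#define _GNU_SOURCE
#include <stdio.h>
#include <string.h>
#include <sys/mman.h>

#define MB      (1024UL * 1024)
#define PMD_SZ  (2 * MB)        /* assumed PMD size */
#define LEN     (8 * MB)

int main(void)
{
        /* Reserve a large window, then free it so we control the layout. */
        char *base = mmap(NULL, 64 * MB, PROT_NONE,
                          MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
        if (base == MAP_FAILED)
                return 1;
        munmap(base, 64 * MB);

        /* First PMD boundary inside the window. */
        char *pmd = (char *)(((unsigned long)base + PMD_SZ - 1) & ~(PMD_SZ - 1));

        /*
         * Source and destination are both 4 KiB past a PMD boundary
         * ("mutually aligned"), and nothing is mapped between either
         * address and the PMD boundary below it.
         */
        char *src = pmd + PMD_SZ + 4096;
        char *dst = pmd + 16 * MB + 4096;

        if (mmap(src, LEN, PROT_READ | PROT_WRITE,
                 MAP_PRIVATE | MAP_ANONYMOUS | MAP_FIXED, -1, 0) == MAP_FAILED)
                return 1;
        memset(src, 0xaa, LEN);

        /* Non-overlapping, mutually aligned move: eligible for the PMD-level copy. */
        if (mremap(src, LEN, LEN, MREMAP_MAYMOVE | MREMAP_FIXED, dst) == MAP_FAILED)
                return 1;

        printf("moved %lu MiB from %p to %p\n", LEN / MB, (void *)src, (void *)dst);
        return 0;
}

Both addresses are deliberately placed just past a PMD boundary with
nothing mapped between them and that boundary, which is the situation
can_align_down() accepts before move_page_tables() starts the copy at the
PMD level.
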

From b1e5a3dee255a11cbdd5a0e814829276bd33a793 Mon Sep 17 00:00:00 2001
From: "Joel Fernandes (Google)"
Date: Sun, 3 Sep 2023 15:13:23 +0000
Subject: mm/mremap: allow moves within the same VMA for stack moves

For the stack move happening in shift_arg_pages(), the move is happening
within the same VMA which spans the old and new ranges.

In case the aligned address happens to fall within that VMA, allow such
moves and don't abort the mremap alignment optimization.

In the regular non-stack mremap case, we cannot allow any such moves, as
they would end up destroying some part of the mapping (either the source
of the move, or part of the existing mapping).  So just avoid it for
stack moves.

Link: https://lkml.kernel.org/r/20230903151328.2981432-3-joel@joelfernandes.org
Signed-off-by: Joel Fernandes (Google)
Reviewed-by: Lorenzo Stoakes
Cc: Kalesh Singh
Cc: "Kirill A. Shutemov"
Cc: Liam R. Howlett
Cc: Linus Torvalds
Cc: Lokesh Gidra
Cc: Michal Hocko
Cc: Paul E. McKenney
Cc: Shuah Khan
Cc: Suren Baghdasaryan
Cc: Vlastimil Babka
Signed-off-by: Andrew Morton
---
 mm/mremap.c | 33 +++++++++++++++++++--------------
 1 file changed, 19 insertions(+), 14 deletions(-)

(limited to 'mm/mremap.c')

diff --git a/mm/mremap.c b/mm/mremap.c
index e2b65a17148e..ce8a23ef325a 100644
--- a/mm/mremap.c
+++ b/mm/mremap.c
@@ -490,12 +490,13 @@ static bool move_pgt_entry(enum pgt_entry entry, struct vm_area_struct *vma,
 }
 
 /*
- * A helper to check if a previous mapping exists. Required for
- * move_page_tables() and realign_addr() to determine if a previous mapping
- * exists before we can do realignment optimizations.
+ * A helper to check if aligning down is OK. The aligned address should fall
+ * on *no mapping*. For the stack moving down, that's a special move within
+ * the VMA that is created to span the source and destination of the move,
+ * so we make an exception for it.
  */
 static bool can_align_down(struct vm_area_struct *vma, unsigned long addr_to_align,
-               unsigned long mask)
+               unsigned long mask, bool for_stack)
 {
     unsigned long addr_masked = addr_to_align & mask;
 
@@ -504,9 +505,13 @@ static bool can_align_down(struct vm_area_struct *vma, unsigned long addr_to_ali
      * of the corresponding VMA, we can't align down or we will destroy part
      * of the current mapping.
      */
-    if (vma->vm_start != addr_to_align)
+    if (!for_stack && vma->vm_start != addr_to_align)
         return false;
 
+    /* In the stack case we explicitly permit in-VMA alignment. */
+    if (for_stack && addr_masked >= vma->vm_start)
+        return true;
+
     /*
      * Make sure the realignment doesn't cause the address to fall on an
      * existing mapping.
@@ -517,7 +522,7 @@ static bool can_align_down(struct vm_area_struct *vma, unsigned long addr_to_ali
 /* Opportunistically realign to specified boundary for faster copy. */
 static void try_realign_addr(unsigned long *old_addr, struct vm_area_struct *old_vma,
                  unsigned long *new_addr, struct vm_area_struct *new_vma,
-                 unsigned long mask)
+                 unsigned long mask, bool for_stack)
 {
     /* Skip if the addresses are already aligned. */
     if ((*old_addr & ~mask) == 0)
@@ -528,8 +533,8 @@ static void try_realign_addr(unsigned long *old_addr, struct vm_area_struct *old
         return;
 
     /* Ensure realignment doesn't cause overlap with existing mappings. */
-    if (!can_align_down(old_vma, *old_addr, mask) ||
-        !can_align_down(new_vma, *new_addr, mask))
+    if (!can_align_down(old_vma, *old_addr, mask, for_stack) ||
+        !can_align_down(new_vma, *new_addr, mask, for_stack))
         return;
 
     *old_addr = *old_addr & mask;
@@ -539,7 +544,7 @@ static void try_realign_addr(unsigned long *old_addr, struct vm_area_struct *old
 unsigned long move_page_tables(struct vm_area_struct *vma,
         unsigned long old_addr, struct vm_area_struct *new_vma,
         unsigned long new_addr, unsigned long len,
-        bool need_rmap_locks)
+        bool need_rmap_locks, bool for_stack)
 {
     unsigned long extent, old_end;
     struct mmu_notifier_range range;
@@ -559,9 +564,9 @@ unsigned long move_page_tables(struct vm_area_struct *vma,
      * If possible, realign addresses to PMD boundary for faster copy.
      * Only realign if the mremap copying hits a PMD boundary.
      */
-    if ((vma != new_vma)
-        && (len >= PMD_SIZE - (old_addr & ~PMD_MASK)))
-        try_realign_addr(&old_addr, vma, &new_addr, new_vma, PMD_MASK);
+    if (len >= PMD_SIZE - (old_addr & ~PMD_MASK))
+        try_realign_addr(&old_addr, vma, &new_addr, new_vma, PMD_MASK,
+                 for_stack);
 
     flush_cache_range(vma, old_addr, old_end);
     mmu_notifier_range_init(&range, MMU_NOTIFY_UNMAP, 0, vma->vm_mm,
@@ -708,7 +713,7 @@ static unsigned long move_vma(struct vm_area_struct *vma,
     }
 
     moved_len = move_page_tables(vma, old_addr, new_vma, new_addr, old_len,
-                     need_rmap_locks);
+                     need_rmap_locks, false);
     if (moved_len < old_len) {
         err = -ENOMEM;
     } else if (vma->vm_ops && vma->vm_ops->mremap) {
@@ -722,7 +727,7 @@ static unsigned long move_vma(struct vm_area_struct *vma,
          * and then proceed to unmap new area instead of old.
          */
         move_page_tables(new_vma, new_addr, vma, old_addr, moved_len,
-                 true);
+                 true, false);
         vma = new_vma;
         old_len = new_len;
         old_addr = new_addr;
--
cgit v1.2.3
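
Because this view is limited to mm/mremap.c, the caller that actually
passes for_stack == true is not visible in the diff above.  It is the
stack-relocation path in fs/exec.c (shift_arg_pages()), which after this
series is expected to look roughly like the following sketch (not part of
the diff shown here):

        /*
         * Move the page tables downwards; the old and new stack ranges live
         * in one VMA that was grown to span both, so the in-VMA realignment
         * exception added above is what keeps this move legal.
         */
        if (length != move_page_tables(vma, old_start,
                                       vma, new_start, length,
                                       false /* need_rmap_locks */,
                                       true  /* for_stack */))
                return -ENOMEM;
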

From 93bf5d4aa27d4ba528f71483ae51fbd70edb3ce8 Mon Sep 17 00:00:00 2001
From: Lorenzo Stoakes
Date: Wed, 11 Oct 2023 18:04:31 +0100
Subject: mm: abstract VMA merge and extend into vma_merge_extend() helper

mremap uses vma_merge() in the case where a VMA needs to be extended.  This
can be significantly simplified and abstracted.

This makes it far easier to understand what the actual function is doing,
avoids future mistakes in the use of the confusing vma_merge() function,
and importantly allows us to make future changes to how vma_merge() is
implemented by knowing explicitly which merge cases each invocation uses.

Note that in the mremap() extend case, we perform this merge only when
old_len == vma->vm_end - addr.  The extension_start, i.e. the start of the
extended portion of the VMA, is equal to addr + old_len, i.e. vma->vm_end.

With this refactoring, vma_merge() is no longer required anywhere except
mm/mmap.c, so mark it static.

Link: https://lkml.kernel.org/r/f16cbdc2e72d37a1a097c39dc7d1fee8919a1c93.1697043508.git.lstoakes@gmail.com
Signed-off-by: Lorenzo Stoakes
Reviewed-by: Vlastimil Babka
Cc: Alexander Viro
Cc: Christian Brauner
Cc: Liam R. Howlett
Signed-off-by: Andrew Morton
---
 mm/mremap.c | 30 +++++++++++++-----------------
 1 file changed, 13 insertions(+), 17 deletions(-)

(limited to 'mm/mremap.c')

diff --git a/mm/mremap.c b/mm/mremap.c
index ce8a23ef325a..38d98465f3d8 100644
--- a/mm/mremap.c
+++ b/mm/mremap.c
@@ -1096,14 +1096,12 @@ SYSCALL_DEFINE5(mremap, unsigned long, addr, unsigned long, old_len,
     /* old_len exactly to the end of the area..
      */
     if (old_len == vma->vm_end - addr) {
+        unsigned long delta = new_len - old_len;
+
         /* can we just expand the current mapping? */
-        if (vma_expandable(vma, new_len - old_len)) {
-            long pages = (new_len - old_len) >> PAGE_SHIFT;
-            unsigned long extension_start = addr + old_len;
-            unsigned long extension_end = addr + new_len;
-            pgoff_t extension_pgoff = vma->vm_pgoff +
-                ((extension_start - vma->vm_start) >> PAGE_SHIFT);
-            VMA_ITERATOR(vmi, mm, extension_start);
+        if (vma_expandable(vma, delta)) {
+            long pages = delta >> PAGE_SHIFT;
+            VMA_ITERATOR(vmi, mm, vma->vm_end);
             long charged = 0;
 
             if (vma->vm_flags & VM_ACCOUNT) {
@@ -1115,17 +1113,15 @@ SYSCALL_DEFINE5(mremap, unsigned long, addr, unsigned long, old_len,
             }
 
             /*
-             * Function vma_merge() is called on the extension we
-             * are adding to the already existing vma, vma_merge()
-             * will merge this extension with the already existing
-             * vma (expand operation itself) and possibly also with
-             * the next vma if it becomes adjacent to the expanded
-             * vma and otherwise compatible.
+             * Function vma_merge_extend() is called on the
+             * extension we are adding to the already existing vma,
+             * vma_merge_extend() will merge this extension with the
+             * already existing vma (expand operation itself) and
+             * possibly also with the next vma if it becomes
+             * adjacent to the expanded vma and otherwise
+             * compatible.
              */
-            vma = vma_merge(&vmi, mm, vma, extension_start,
-                extension_end, vma->vm_flags, vma->anon_vma,
-                vma->vm_file, extension_pgoff, vma_policy(vma),
-                vma->vm_userfaultfd_ctx, anon_vma_name(vma));
+            vma = vma_merge_extend(&vmi, vma, delta);
             if (!vma) {
                 vm_unacct_memory(charged);
                 ret = -ENOMEM;
--
cgit v1.2.3
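
The new vma_merge_extend() helper itself lives in mm/mmap.c and is
therefore not part of the diff above, which is limited to mm/mremap.c.
Based on the arguments the removed vma_merge() call passed, it presumably
looks roughly like the following sketch (names and details may differ from
the actual mm/mmap.c definition):

/*
 * Sketch: expand @vma by @delta bytes, merging with the following VMA if
 * the expansion makes them adjacent and otherwise compatible.
 */
struct vm_area_struct *vma_merge_extend(struct vma_iterator *vmi,
                                        struct vm_area_struct *vma,
                                        unsigned long delta)
{
        /* The extension starts at vma->vm_end, so compute its pgoff from there. */
        pgoff_t pgoff = vma->vm_pgoff + vma_pages(vma);

        /* @vma is passed as its own predecessor: a pure expand-and-merge. */
        return vma_merge(vmi, vma->vm_mm, vma, vma->vm_end, vma->vm_end + delta,
                         vma->vm_flags, vma->anon_vma, vma->vm_file, pgoff,
                         vma_policy(vma), vma->vm_userfaultfd_ctx,
                         anon_vma_name(vma));
}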