From 5535be3099717646781ce1540cf725965d680e7b Mon Sep 17 00:00:00 2001 From: David Hildenbrand Date: Tue, 9 Aug 2022 22:56:40 +0200 Subject: mm/gup: fix FOLL_FORCE COW security issue and remove FOLL_COW Ever since the Dirty COW (CVE-2016-5195) security issue happened, we know that FOLL_FORCE can be possibly dangerous, especially if there are races that can be exploited by user space. Right now, it would be sufficient to have some code that sets a PTE of a R/O-mapped shared page dirty, in order for it to erroneously become writable by FOLL_FORCE. The implications of setting a write-protected PTE dirty might not be immediately obvious to everyone. And in fact ever since commit 9ae0f87d009c ("mm/shmem: unconditionally set pte dirty in mfill_atomic_install_pte"), we can use UFFDIO_CONTINUE to map a shmem page R/O while marking the pte dirty. This can be used by unprivileged user space to modify tmpfs/shmem file content even if the user does not have write permissions to the file, and to bypass memfd write sealing -- Dirty COW restricted to tmpfs/shmem (CVE-2022-2590). To fix such security issues for good, the insight is that we really only need that fancy retry logic (FOLL_COW) for COW mappings that are not writable (!VM_WRITE). And in a COW mapping, we really only broke COW if we have an exclusive anonymous page mapped. If we have something else mapped, or the mapped anonymous page might be shared (!PageAnonExclusive), we have to trigger a write fault to break COW. If we don't find an exclusive anonymous page when we retry, we have to trigger COW breaking once again because something intervened. Let's move away from this mandatory-retry + dirty handling and rely on our PageAnonExclusive() flag for making a similar decision, to use the same COW logic as in other kernel parts here as well. In case we stumble over a PTE in a COW mapping that does not map an exclusive anonymous page, COW was not properly broken and we have to trigger a fake write-fault to break COW. Just like we do in can_change_pte_writable() added via commit 64fe24a3e05e ("mm/mprotect: try avoiding write faults for exclusive anonymous pages when changing protection") and commit 76aefad628aa ("mm/mprotect: fix soft-dirty check in can_change_pte_writable()"), take care of softdirty and uffd-wp manually. For example, a write() via /proc/self/mem to a uffd-wp-protected range has to fail instead of silently granting write access and bypassing the userspace fault handler. Note that FOLL_FORCE is not only used for debug access, but also triggered by applications without debug intentions, for example, when pinning pages via RDMA. This fixes CVE-2022-2590. Note that only x86_64 and aarch64 are affected, because only those support CONFIG_HAVE_ARCH_USERFAULTFD_MINOR. Fortunately, FOLL_COW is no longer required to handle FOLL_FORCE. So let's just get rid of it. Thanks to Nadav Amit for pointing out that the pte_dirty() check in FOLL_FORCE code is problematic and might be exploitable. Note 1: We don't check for the PTE being dirty because it doesn't matter for making a "was COWed" decision anymore, and whoever modifies the page has to set the page dirty either way. Note 2: Kernels before extended uffd-wp support and before PageAnonExclusive (< 5.19) can simply revert the problematic commit instead and be safe regarding UFFDIO_CONTINUE. A backport to v5.19 requires minor adjustments due to lack of vma_soft_dirty_enabled(). Link: https://lkml.kernel.org/r/20220809205640.70916-1-david@redhat.com Fixes: 9ae0f87d009c ("mm/shmem: unconditionally set pte dirty in mfill_atomic_install_pte") Signed-off-by: David Hildenbrand Cc: Greg Kroah-Hartman Cc: Axel Rasmussen Cc: Nadav Amit Cc: Peter Xu Cc: Hugh Dickins Cc: Andrea Arcangeli Cc: Matthew Wilcox Cc: Vlastimil Babka Cc: John Hubbard Cc: Jason Gunthorpe Cc: David Laight Cc: [5.16] Signed-off-by: Andrew Morton --- mm/gup.c | 68 ++++++++++++++++++++++++++++++++++++++------------------ mm/huge_memory.c | 64 +++++++++++++++++++++++++++++++++++----------------- 2 files changed, 89 insertions(+), 43 deletions(-) (limited to 'mm') diff --git a/mm/gup.c b/mm/gup.c index 732825157430..5abdaf487460 100644 --- a/mm/gup.c +++ b/mm/gup.c @@ -478,14 +478,42 @@ static int follow_pfn_pte(struct vm_area_struct *vma, unsigned long address, return -EEXIST; } -/* - * FOLL_FORCE can write to even unwritable pte's, but only - * after we've gone through a COW cycle and they are dirty. - */ -static inline bool can_follow_write_pte(pte_t pte, unsigned int flags) +/* FOLL_FORCE can write to even unwritable PTEs in COW mappings. */ +static inline bool can_follow_write_pte(pte_t pte, struct page *page, + struct vm_area_struct *vma, + unsigned int flags) { - return pte_write(pte) || - ((flags & FOLL_FORCE) && (flags & FOLL_COW) && pte_dirty(pte)); + /* If the pte is writable, we can write to the page. */ + if (pte_write(pte)) + return true; + + /* Maybe FOLL_FORCE is set to override it? */ + if (!(flags & FOLL_FORCE)) + return false; + + /* But FOLL_FORCE has no effect on shared mappings */ + if (vma->vm_flags & (VM_MAYSHARE | VM_SHARED)) + return false; + + /* ... or read-only private ones */ + if (!(vma->vm_flags & VM_MAYWRITE)) + return false; + + /* ... or already writable ones that just need to take a write fault */ + if (vma->vm_flags & VM_WRITE) + return false; + + /* + * See can_change_pte_writable(): we broke COW and could map the page + * writable if we have an exclusive anonymous page ... + */ + if (!page || !PageAnon(page) || !PageAnonExclusive(page)) + return false; + + /* ... and a write-fault isn't required for other reasons. */ + if (vma_soft_dirty_enabled(vma) && !pte_soft_dirty(pte)) + return false; + return !userfaultfd_pte_wp(vma, pte); } static struct page *follow_page_pte(struct vm_area_struct *vma, @@ -528,12 +556,19 @@ retry: } if ((flags & FOLL_NUMA) && pte_protnone(pte)) goto no_page; - if ((flags & FOLL_WRITE) && !can_follow_write_pte(pte, flags)) { - pte_unmap_unlock(ptep, ptl); - return NULL; - } page = vm_normal_page(vma, address, pte); + + /* + * We only care about anon pages in can_follow_write_pte() and don't + * have to worry about pte_devmap() because they are never anon. + */ + if ((flags & FOLL_WRITE) && + !can_follow_write_pte(pte, page, vma, flags)) { + page = NULL; + goto out; + } + if (!page && pte_devmap(pte) && (flags & (FOLL_GET | FOLL_PIN))) { /* * Only return device mapping pages in the FOLL_GET or FOLL_PIN @@ -986,17 +1021,6 @@ static int faultin_page(struct vm_area_struct *vma, return -EBUSY; } - /* - * The VM_FAULT_WRITE bit tells us that do_wp_page has broken COW when - * necessary, even if maybe_mkwrite decided not to set pte_write. We - * can thus safely do subsequent page lookups as if they were reads. - * But only do so when looping for pte_write is futile: in some cases - * userspace may also be wanting to write to the gotten user page, - * which a read fault here might prevent (a readonly page might get - * reCOWed by userspace write). - */ - if ((ret & VM_FAULT_WRITE) && !(vma->vm_flags & VM_WRITE)) - *flags |= FOLL_COW; return 0; } diff --git a/mm/huge_memory.c b/mm/huge_memory.c index 8a7c1b344abe..e9414ee57c5b 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -1040,12 +1040,6 @@ struct page *follow_devmap_pmd(struct vm_area_struct *vma, unsigned long addr, assert_spin_locked(pmd_lockptr(mm, pmd)); - /* - * When we COW a devmap PMD entry, we split it into PTEs, so we should - * not be in this function with `flags & FOLL_COW` set. - */ - WARN_ONCE(flags & FOLL_COW, "mm: In follow_devmap_pmd with FOLL_COW set"); - /* FOLL_GET and FOLL_PIN are mutually exclusive. */ if (WARN_ON_ONCE((flags & (FOLL_PIN | FOLL_GET)) == (FOLL_PIN | FOLL_GET))) @@ -1395,14 +1389,42 @@ fallback: return VM_FAULT_FALLBACK; } -/* - * FOLL_FORCE can write to even unwritable pmd's, but only - * after we've gone through a COW cycle and they are dirty. - */ -static inline bool can_follow_write_pmd(pmd_t pmd, unsigned int flags) +/* FOLL_FORCE can write to even unwritable PMDs in COW mappings. */ +static inline bool can_follow_write_pmd(pmd_t pmd, struct page *page, + struct vm_area_struct *vma, + unsigned int flags) { - return pmd_write(pmd) || - ((flags & FOLL_FORCE) && (flags & FOLL_COW) && pmd_dirty(pmd)); + /* If the pmd is writable, we can write to the page. */ + if (pmd_write(pmd)) + return true; + + /* Maybe FOLL_FORCE is set to override it? */ + if (!(flags & FOLL_FORCE)) + return false; + + /* But FOLL_FORCE has no effect on shared mappings */ + if (vma->vm_flags & (VM_MAYSHARE | VM_SHARED)) + return false; + + /* ... or read-only private ones */ + if (!(vma->vm_flags & VM_MAYWRITE)) + return false; + + /* ... or already writable ones that just need to take a write fault */ + if (vma->vm_flags & VM_WRITE) + return false; + + /* + * See can_change_pte_writable(): we broke COW and could map the page + * writable if we have an exclusive anonymous page ... + */ + if (!page || !PageAnon(page) || !PageAnonExclusive(page)) + return false; + + /* ... and a write-fault isn't required for other reasons. */ + if (vma_soft_dirty_enabled(vma) && !pmd_soft_dirty(pmd)) + return false; + return !userfaultfd_huge_pmd_wp(vma, pmd); } struct page *follow_trans_huge_pmd(struct vm_area_struct *vma, @@ -1411,12 +1433,16 @@ struct page *follow_trans_huge_pmd(struct vm_area_struct *vma, unsigned int flags) { struct mm_struct *mm = vma->vm_mm; - struct page *page = NULL; + struct page *page; assert_spin_locked(pmd_lockptr(mm, pmd)); - if (flags & FOLL_WRITE && !can_follow_write_pmd(*pmd, flags)) - goto out; + page = pmd_page(*pmd); + VM_BUG_ON_PAGE(!PageHead(page) && !is_zone_device_page(page), page); + + if ((flags & FOLL_WRITE) && + !can_follow_write_pmd(*pmd, page, vma, flags)) + return NULL; /* Avoid dumping huge zero page */ if ((flags & FOLL_DUMP) && is_huge_zero_pmd(*pmd)) @@ -1424,10 +1450,7 @@ struct page *follow_trans_huge_pmd(struct vm_area_struct *vma, /* Full NUMA hinting faults to serialise migration in fault paths */ if ((flags & FOLL_NUMA) && pmd_protnone(*pmd)) - goto out; - - page = pmd_page(*pmd); - VM_BUG_ON_PAGE(!PageHead(page) && !is_zone_device_page(page), page); + return NULL; if (!pmd_write(*pmd) && gup_must_unshare(flags, page)) return ERR_PTR(-EMLINK); @@ -1444,7 +1467,6 @@ struct page *follow_trans_huge_pmd(struct vm_area_struct *vma, page += (addr & ~HPAGE_PMD_MASK) >> PAGE_SHIFT; VM_BUG_ON_PAGE(!PageCompound(page) && !is_zone_device_page(page), page); -out: return page; } -- cgit v1.2.3 From a39c5d3ce03dd890ab6a9be44b21177cec32da55 Mon Sep 17 00:00:00 2001 From: Hao Lee Date: Sun, 7 Aug 2022 15:44:42 +0000 Subject: mm: add DEVICE_ZONE to FOR_ALL_ZONES FOR_ALL_ZONES should be consistent with enum zone_type. Otherwise, __count_zid_vm_events have the potential to add count to wrong item when zid is ZONE_DEVICE. Link: https://lkml.kernel.org/r/20220807154442.GA18167@haolee.io Signed-off-by: Hao Lee Cc: David Hildenbrand Cc: Johannes Weiner Signed-off-by: Andrew Morton --- mm/vmstat.c | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) (limited to 'mm') diff --git a/mm/vmstat.c b/mm/vmstat.c index 373d2730fcf2..90af9a8572f5 100644 --- a/mm/vmstat.c +++ b/mm/vmstat.c @@ -1168,8 +1168,15 @@ int fragmentation_index(struct zone *zone, unsigned int order) #define TEXT_FOR_HIGHMEM(xx) #endif +#ifdef CONFIG_ZONE_DEVICE +#define TEXT_FOR_DEVICE(xx) xx "_device", +#else +#define TEXT_FOR_DEVICE(xx) +#endif + #define TEXTS_FOR_ZONES(xx) TEXT_FOR_DMA(xx) TEXT_FOR_DMA32(xx) xx "_normal", \ - TEXT_FOR_HIGHMEM(xx) xx "_movable", + TEXT_FOR_HIGHMEM(xx) xx "_movable", \ + TEXT_FOR_DEVICE(xx) const char * const vmstat_text[] = { /* enum zone_stat_item counters */ -- cgit v1.2.3 From f369b07c861435bd812a9d14493f71b34132ed6f Mon Sep 17 00:00:00 2001 From: Peter Xu Date: Thu, 11 Aug 2022 16:13:40 -0400 Subject: mm/uffd: reset write protection when unregister with wp-mode The motivation of this patch comes from a recent report and patchfix from David Hildenbrand on hugetlb shared handling of wr-protected page [1]. With the reproducer provided in commit message of [1], one can leverage the uffd-wp lazy-reset of ptes to trigger a hugetlb issue which can affect not only the attacker process, but also the whole system. The lazy-reset mechanism of uffd-wp was used to make unregister faster, meanwhile it has an assumption that any leftover pgtable entries should only affect the process on its own, so not only the user should be aware of anything it does, but also it should not affect outside of the process. But it seems that this is not true, and it can also be utilized to make some exploit easier. So far there's no clue showing that the lazy-reset is important to any userfaultfd users because normally the unregister will only happen once for a specific range of memory of the lifecycle of the process. Considering all above, what this patch proposes is to do explicit pte resets when unregister an uffd region with wr-protect mode enabled. It should be the same as calling ioctl(UFFDIO_WRITEPROTECT, wp=false) right before ioctl(UFFDIO_UNREGISTER) for the user. So potentially it'll make the unregister slower. From that pov it's a very slight abi change, but hopefully nothing should break with this change either. Regarding to the change itself - core of uffd write [un]protect operation is moved into a separate function (uffd_wp_range()) and it is reused in the unregister code path. Note that the new function will not check for anything, e.g. ranges or memory types, because they should have been checked during the previous UFFDIO_REGISTER or it should have failed already. It also doesn't check mmap_changing because we're with mmap write lock held anyway. I added a Fixes upon introducing of uffd-wp shmem+hugetlbfs because that's the only issue reported so far and that's the commit David's reproducer will start working (v5.19+). But the whole idea actually applies to not only file memories but also anonymous. It's just that we don't need to fix anonymous prior to v5.19- because there's no known way to exploit. IOW, this patch can also fix the issue reported in [1] as the patch 2 does. [1] https://lore.kernel.org/all/20220811103435.188481-3-david@redhat.com/ Link: https://lkml.kernel.org/r/20220811201340.39342-1-peterx@redhat.com Fixes: b1f9e876862d ("mm/uffd: enable write protection for shmem & hugetlbfs") Signed-off-by: Peter Xu Cc: David Hildenbrand Cc: Mike Rapoport Cc: Mike Kravetz Cc: Andrea Arcangeli Cc: Nadav Amit Cc: Axel Rasmussen Cc: Signed-off-by: Andrew Morton --- mm/userfaultfd.c | 29 ++++++++++++++++++----------- 1 file changed, 18 insertions(+), 11 deletions(-) (limited to 'mm') diff --git a/mm/userfaultfd.c b/mm/userfaultfd.c index 07d3befc80e4..7327b2573f7c 100644 --- a/mm/userfaultfd.c +++ b/mm/userfaultfd.c @@ -703,14 +703,29 @@ ssize_t mcopy_continue(struct mm_struct *dst_mm, unsigned long start, mmap_changing, 0); } +void uffd_wp_range(struct mm_struct *dst_mm, struct vm_area_struct *dst_vma, + unsigned long start, unsigned long len, bool enable_wp) +{ + struct mmu_gather tlb; + pgprot_t newprot; + + if (enable_wp) + newprot = vm_get_page_prot(dst_vma->vm_flags & ~(VM_WRITE)); + else + newprot = vm_get_page_prot(dst_vma->vm_flags); + + tlb_gather_mmu(&tlb, dst_mm); + change_protection(&tlb, dst_vma, start, start + len, newprot, + enable_wp ? MM_CP_UFFD_WP : MM_CP_UFFD_WP_RESOLVE); + tlb_finish_mmu(&tlb); +} + int mwriteprotect_range(struct mm_struct *dst_mm, unsigned long start, unsigned long len, bool enable_wp, atomic_t *mmap_changing) { struct vm_area_struct *dst_vma; unsigned long page_mask; - struct mmu_gather tlb; - pgprot_t newprot; int err; /* @@ -750,15 +765,7 @@ int mwriteprotect_range(struct mm_struct *dst_mm, unsigned long start, goto out_unlock; } - if (enable_wp) - newprot = vm_get_page_prot(dst_vma->vm_flags & ~(VM_WRITE)); - else - newprot = vm_get_page_prot(dst_vma->vm_flags); - - tlb_gather_mmu(&tlb, dst_mm); - change_protection(&tlb, dst_vma, start, start + len, newprot, - enable_wp ? MM_CP_UFFD_WP : MM_CP_UFFD_WP_RESOLVE); - tlb_finish_mmu(&tlb); + uffd_wp_range(dst_mm, dst_vma, start, len, enable_wp); err = 0; out_unlock: -- cgit v1.2.3 From f96f7a40874d7c746680c0b9f57cef2262ae551f Mon Sep 17 00:00:00 2001 From: David Hildenbrand Date: Thu, 11 Aug 2022 12:34:34 +0200 Subject: mm/hugetlb: fix hugetlb not supporting softdirty tracking Patch series "mm/hugetlb: fix write-fault handling for shared mappings", v2. I observed that hugetlb does not support/expect write-faults in shared mappings that would have to map the R/O-mapped page writable -- and I found two case where we could currently get such faults and would erroneously map an anon page into a shared mapping. Reproducers part of the patches. I propose to backport both fixes to stable trees. The first fix needs a small adjustment. This patch (of 2): Staring at hugetlb_wp(), one might wonder where all the logic for shared mappings is when stumbling over a write-protected page in a shared mapping. In fact, there is none, and so far we thought we could get away with that because e.g., mprotect() should always do the right thing and map all pages directly writable. Looks like we were wrong: -------------------------------------------------------------------------- #include #include #include #include #include #include #include #define HUGETLB_SIZE (2 * 1024 * 1024u) static void clear_softdirty(void) { int fd = open("/proc/self/clear_refs", O_WRONLY); const char *ctrl = "4"; int ret; if (fd < 0) { fprintf(stderr, "open(clear_refs) failed\n"); exit(1); } ret = write(fd, ctrl, strlen(ctrl)); if (ret != strlen(ctrl)) { fprintf(stderr, "write(clear_refs) failed\n"); exit(1); } close(fd); } int main(int argc, char **argv) { char *map; int fd; fd = open("/dev/hugepages/tmp", O_RDWR | O_CREAT); if (!fd) { fprintf(stderr, "open() failed\n"); return -errno; } if (ftruncate(fd, HUGETLB_SIZE)) { fprintf(stderr, "ftruncate() failed\n"); return -errno; } map = mmap(NULL, HUGETLB_SIZE, PROT_READ|PROT_WRITE, MAP_SHARED, fd, 0); if (map == MAP_FAILED) { fprintf(stderr, "mmap() failed\n"); return -errno; } *map = 0; if (mprotect(map, HUGETLB_SIZE, PROT_READ)) { fprintf(stderr, "mmprotect() failed\n"); return -errno; } clear_softdirty(); if (mprotect(map, HUGETLB_SIZE, PROT_READ|PROT_WRITE)) { fprintf(stderr, "mmprotect() failed\n"); return -errno; } *map = 0; return 0; } -------------------------------------------------------------------------- Above test fails with SIGBUS when there is only a single free hugetlb page. # echo 1 > /sys/kernel/mm/hugepages/hugepages-2048kB/nr_hugepages # ./test Bus error (core dumped) And worse, with sufficient free hugetlb pages it will map an anonymous page into a shared mapping, for example, messing up accounting during unmap and breaking MAP_SHARED semantics: # echo 2 > /sys/kernel/mm/hugepages/hugepages-2048kB/nr_hugepages # ./test # cat /proc/meminfo | grep HugePages_ HugePages_Total: 2 HugePages_Free: 1 HugePages_Rsvd: 18446744073709551615 HugePages_Surp: 0 Reason in this particular case is that vma_wants_writenotify() will return "true", removing VM_SHARED in vma_set_page_prot() to map pages write-protected. Let's teach vma_wants_writenotify() that hugetlb does not support softdirty tracking. Link: https://lkml.kernel.org/r/20220811103435.188481-1-david@redhat.com Link: https://lkml.kernel.org/r/20220811103435.188481-2-david@redhat.com Fixes: 64e455079e1b ("mm: softdirty: enable write notifications on VMAs after VM_SOFTDIRTY cleared") Signed-off-by: David Hildenbrand Reviewed-by: Mike Kravetz Cc: Peter Feiner Cc: Kirill A. Shutemov Cc: Cyrill Gorcunov Cc: Pavel Emelyanov Cc: Jamie Liu Cc: Hugh Dickins Cc: Naoya Horiguchi Cc: Bjorn Helgaas Cc: Muchun Song Cc: Peter Xu Cc: [3.18+] Signed-off-by: Andrew Morton --- mm/mmap.c | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) (limited to 'mm') diff --git a/mm/mmap.c b/mm/mmap.c index c035020d0c89..9d780f415be3 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -1646,8 +1646,11 @@ int vma_wants_writenotify(struct vm_area_struct *vma, pgprot_t vm_page_prot) pgprot_val(vm_pgprot_modify(vm_page_prot, vm_flags))) return 0; - /* Do we need to track softdirty? */ - if (vma_soft_dirty_enabled(vma)) + /* + * Do we need to track softdirty? hugetlb does not support softdirty + * tracking yet. + */ + if (vma_soft_dirty_enabled(vma) && !is_vm_hugetlb_page(vma)) return 1; /* Specialty mapping? */ -- cgit v1.2.3 From 1d8d14641fd94a01b20a4abbf2749fd8eddcf57b Mon Sep 17 00:00:00 2001 From: David Hildenbrand Date: Thu, 11 Aug 2022 12:34:35 +0200 Subject: mm/hugetlb: support write-faults in shared mappings If we ever get a write-fault on a write-protected page in a shared mapping, we'd be in trouble (again). Instead, we can simply map the page writable. And in fact, there is even a way right now to trigger that code via uffd-wp ever since we stared to support it for shmem in 5.19: -------------------------------------------------------------------------- #include #include #include #include #include #include #include #include #include #include #define HUGETLB_SIZE (2 * 1024 * 1024u) static char *map; int uffd; static int temp_setup_uffd(void) { struct uffdio_api uffdio_api; struct uffdio_register uffdio_register; struct uffdio_writeprotect uffd_writeprotect; struct uffdio_range uffd_range; uffd = syscall(__NR_userfaultfd, O_CLOEXEC | O_NONBLOCK | UFFD_USER_MODE_ONLY); if (uffd < 0) { fprintf(stderr, "syscall() failed: %d\n", errno); return -errno; } uffdio_api.api = UFFD_API; uffdio_api.features = UFFD_FEATURE_PAGEFAULT_FLAG_WP; if (ioctl(uffd, UFFDIO_API, &uffdio_api) < 0) { fprintf(stderr, "UFFDIO_API failed: %d\n", errno); return -errno; } if (!(uffdio_api.features & UFFD_FEATURE_PAGEFAULT_FLAG_WP)) { fprintf(stderr, "UFFD_FEATURE_WRITEPROTECT missing\n"); return -ENOSYS; } /* Register UFFD-WP */ uffdio_register.range.start = (unsigned long) map; uffdio_register.range.len = HUGETLB_SIZE; uffdio_register.mode = UFFDIO_REGISTER_MODE_WP; if (ioctl(uffd, UFFDIO_REGISTER, &uffdio_register) < 0) { fprintf(stderr, "UFFDIO_REGISTER failed: %d\n", errno); return -errno; } /* Writeprotect a single page. */ uffd_writeprotect.range.start = (unsigned long) map; uffd_writeprotect.range.len = HUGETLB_SIZE; uffd_writeprotect.mode = UFFDIO_WRITEPROTECT_MODE_WP; if (ioctl(uffd, UFFDIO_WRITEPROTECT, &uffd_writeprotect)) { fprintf(stderr, "UFFDIO_WRITEPROTECT failed: %d\n", errno); return -errno; } /* Unregister UFFD-WP without prior writeunprotection. */ uffd_range.start = (unsigned long) map; uffd_range.len = HUGETLB_SIZE; if (ioctl(uffd, UFFDIO_UNREGISTER, &uffd_range)) { fprintf(stderr, "UFFDIO_UNREGISTER failed: %d\n", errno); return -errno; } return 0; } int main(int argc, char **argv) { int fd; fd = open("/dev/hugepages/tmp", O_RDWR | O_CREAT); if (!fd) { fprintf(stderr, "open() failed\n"); return -errno; } if (ftruncate(fd, HUGETLB_SIZE)) { fprintf(stderr, "ftruncate() failed\n"); return -errno; } map = mmap(NULL, HUGETLB_SIZE, PROT_READ|PROT_WRITE, MAP_SHARED, fd, 0); if (map == MAP_FAILED) { fprintf(stderr, "mmap() failed\n"); return -errno; } *map = 0; if (temp_setup_uffd()) return 1; *map = 0; return 0; } -------------------------------------------------------------------------- Above test fails with SIGBUS when there is only a single free hugetlb page. # echo 1 > /sys/kernel/mm/hugepages/hugepages-2048kB/nr_hugepages # ./test Bus error (core dumped) And worse, with sufficient free hugetlb pages it will map an anonymous page into a shared mapping, for example, messing up accounting during unmap and breaking MAP_SHARED semantics: # echo 2 > /sys/kernel/mm/hugepages/hugepages-2048kB/nr_hugepages # ./test # cat /proc/meminfo | grep HugePages_ HugePages_Total: 2 HugePages_Free: 1 HugePages_Rsvd: 18446744073709551615 HugePages_Surp: 0 Reason is that uffd-wp doesn't clear the uffd-wp PTE bit when unregistering and consequently keeps the PTE writeprotected. Reason for this is to avoid the additional overhead when unregistering. Note that this is the case also for !hugetlb and that we will end up with writable PTEs that still have the uffd-wp PTE bit set once we return from hugetlb_wp(). I'm not touching the uffd-wp PTE bit for now, because it seems to be a generic thing -- wp_page_reuse() also doesn't clear it. VM_MAYSHARE handling in hugetlb_fault() for FAULT_FLAG_WRITE indicates that MAP_SHARED handling was at least envisioned, but could never have worked as expected. While at it, make sure that we never end up in hugetlb_wp() on write faults without VM_WRITE, because we don't support maybe_mkwrite() semantics as commonly used in the !hugetlb case -- for example, in wp_page_reuse(). Note that there is no need to do any kind of reservation in hugetlb_fault() in this case ... because we already have a hugetlb page mapped R/O that we will simply map writable and we are not dealing with COW/unsharing. Link: https://lkml.kernel.org/r/20220811103435.188481-3-david@redhat.com Fixes: b1f9e876862d ("mm/uffd: enable write protection for shmem & hugetlbfs") Signed-off-by: David Hildenbrand Reviewed-by: Mike Kravetz Cc: Bjorn Helgaas Cc: Cyrill Gorcunov Cc: Hugh Dickins Cc: Jamie Liu Cc: Kirill A. Shutemov Cc: Muchun Song Cc: Naoya Horiguchi Cc: Pavel Emelyanov Cc: Peter Feiner Cc: Peter Xu Cc: [5.19] Signed-off-by: Andrew Morton --- mm/hugetlb.c | 26 +++++++++++++++++++------- 1 file changed, 19 insertions(+), 7 deletions(-) (limited to 'mm') diff --git a/mm/hugetlb.c b/mm/hugetlb.c index 0aee2f3ae15c..2480ba627aa5 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -5241,6 +5241,21 @@ static vm_fault_t hugetlb_wp(struct mm_struct *mm, struct vm_area_struct *vma, VM_BUG_ON(unshare && (flags & FOLL_WRITE)); VM_BUG_ON(!unshare && !(flags & FOLL_WRITE)); + /* + * hugetlb does not support FOLL_FORCE-style write faults that keep the + * PTE mapped R/O such as maybe_mkwrite() would do. + */ + if (WARN_ON_ONCE(!unshare && !(vma->vm_flags & VM_WRITE))) + return VM_FAULT_SIGSEGV; + + /* Let's take out MAP_SHARED mappings first. */ + if (vma->vm_flags & VM_MAYSHARE) { + if (unlikely(unshare)) + return 0; + set_huge_ptep_writable(vma, haddr, ptep); + return 0; + } + pte = huge_ptep_get(ptep); old_page = pte_page(pte); @@ -5781,12 +5796,11 @@ vm_fault_t hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma, * If we are going to COW/unshare the mapping later, we examine the * pending reservations for this page now. This will ensure that any * allocations necessary to record that reservation occur outside the - * spinlock. For private mappings, we also lookup the pagecache - * page now as it is used to determine if a reservation has been - * consumed. + * spinlock. Also lookup the pagecache page now as it is used to + * determine if a reservation has been consumed. */ if ((flags & (FAULT_FLAG_WRITE|FAULT_FLAG_UNSHARE)) && - !huge_pte_write(entry)) { + !(vma->vm_flags & VM_MAYSHARE) && !huge_pte_write(entry)) { if (vma_needs_reservation(h, vma, haddr) < 0) { ret = VM_FAULT_OOM; goto out_mutex; @@ -5794,9 +5808,7 @@ vm_fault_t hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma, /* Just decrements count, does not deallocate */ vma_end_reservation(h, vma, haddr); - if (!(vma->vm_flags & VM_MAYSHARE)) - pagecache_page = hugetlbfs_pagecache_page(h, - vma, haddr); + pagecache_page = hugetlbfs_pagecache_page(h, vma, haddr); } ptl = huge_pte_lock(h, mm, ptep); -- cgit v1.2.3 From cb241339b9d020c758a6647c69f8e42538c5cf88 Mon Sep 17 00:00:00 2001 From: Hugh Dickins Date: Wed, 10 Aug 2022 21:51:09 -0700 Subject: mm/shmem: fix chattr fsflags support in tmpfs ext[234] have always allowed unimplemented chattr flags to be set, but other filesystems have tended to be stricter. Follow the stricter approach for tmpfs: I don't want to have to explain why csu attributes don't actually work, and we won't need to update the chattr(1) manpage; and it's never wrong to start off strict, relaxing later if persuaded. Allow only a (append only) i (immutable) A (no atime) and d (no dump). Although lsattr showed 'A' inherited, the NOATIME behavior was not being inherited: because nothing sync'ed FS_NOATIME_FL to S_NOATIME. Add shmem_set_inode_flags() to sync the flags, using inode_set_flags() to avoid that instant of lost immutablility during fileattr_set(). But that change switched generic/079 from passing to failing: because FS_IMMUTABLE_FL and FS_APPEND_FL had been unconventionally included in the INHERITED fsflags: remove them and generic/079 is back to passing. Link: https://lkml.kernel.org/r/2961dcb0-ddf3-b9f0-3268-12a4ff996856@google.com Fixes: e408e695f5f1 ("mm/shmem: support FS_IOC_[SG]ETFLAGS in tmpfs") Signed-off-by: Hugh Dickins Cc: "Theodore Ts'o" Cc: Radoslaw Burny Cc: "Darrick J. Wong" Cc: Matthew Wilcox (Oracle) Signed-off-by: Andrew Morton --- mm/shmem.c | 54 +++++++++++++++++++++++++++++++----------------------- 1 file changed, 31 insertions(+), 23 deletions(-) (limited to 'mm') diff --git a/mm/shmem.c b/mm/shmem.c index 5783f11351bb..170b4078420f 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -2281,16 +2281,34 @@ static int shmem_mmap(struct file *file, struct vm_area_struct *vma) return 0; } -/* Mask out flags that are inappropriate for the given type of inode. */ -static unsigned shmem_mask_flags(umode_t mode, __u32 flags) +#ifdef CONFIG_TMPFS_XATTR +static int shmem_initxattrs(struct inode *, const struct xattr *, void *); + +/* + * chattr's fsflags are unrelated to extended attributes, + * but tmpfs has chosen to enable them under the same config option. + */ +static void shmem_set_inode_flags(struct inode *inode, unsigned int fsflags) +{ + unsigned int i_flags = 0; + + if (fsflags & FS_NOATIME_FL) + i_flags |= S_NOATIME; + if (fsflags & FS_APPEND_FL) + i_flags |= S_APPEND; + if (fsflags & FS_IMMUTABLE_FL) + i_flags |= S_IMMUTABLE; + /* + * But FS_NODUMP_FL does not require any action in i_flags. + */ + inode_set_flags(inode, i_flags, S_NOATIME | S_APPEND | S_IMMUTABLE); +} +#else +static void shmem_set_inode_flags(struct inode *inode, unsigned int fsflags) { - if (S_ISDIR(mode)) - return flags; - else if (S_ISREG(mode)) - return flags & SHMEM_REG_FLMASK; - else - return flags & SHMEM_OTHER_FLMASK; } +#define shmem_initxattrs NULL +#endif static struct inode *shmem_get_inode(struct super_block *sb, struct inode *dir, umode_t mode, dev_t dev, unsigned long flags) @@ -2319,7 +2337,8 @@ static struct inode *shmem_get_inode(struct super_block *sb, struct inode *dir, info->i_crtime = inode->i_mtime; info->fsflags = (dir == NULL) ? 0 : SHMEM_I(dir)->fsflags & SHMEM_FL_INHERITED; - info->fsflags = shmem_mask_flags(mode, info->fsflags); + if (info->fsflags) + shmem_set_inode_flags(inode, info->fsflags); INIT_LIST_HEAD(&info->shrinklist); INIT_LIST_HEAD(&info->swaplist); simple_xattrs_init(&info->xattrs); @@ -2468,12 +2487,6 @@ out_unacct_blocks: static const struct inode_operations shmem_symlink_inode_operations; static const struct inode_operations shmem_short_symlink_operations; -#ifdef CONFIG_TMPFS_XATTR -static int shmem_initxattrs(struct inode *, const struct xattr *, void *); -#else -#define shmem_initxattrs NULL -#endif - static int shmem_write_begin(struct file *file, struct address_space *mapping, loff_t pos, unsigned len, @@ -3179,18 +3192,13 @@ static int shmem_fileattr_set(struct user_namespace *mnt_userns, if (fileattr_has_fsx(fa)) return -EOPNOTSUPP; + if (fa->flags & ~SHMEM_FL_USER_MODIFIABLE) + return -EOPNOTSUPP; info->fsflags = (info->fsflags & ~SHMEM_FL_USER_MODIFIABLE) | (fa->flags & SHMEM_FL_USER_MODIFIABLE); - inode->i_flags &= ~(S_APPEND | S_IMMUTABLE | S_NOATIME); - if (info->fsflags & FS_APPEND_FL) - inode->i_flags |= S_APPEND; - if (info->fsflags & FS_IMMUTABLE_FL) - inode->i_flags |= S_IMMUTABLE; - if (info->fsflags & FS_NOATIME_FL) - inode->i_flags |= S_NOATIME; - + shmem_set_inode_flags(inode, info->fsflags); inode->i_ctime = current_time(inode); return 0; } -- cgit v1.2.3 From 15f242bb65b89d5f1ff990668a586fdf1307b2c8 Mon Sep 17 00:00:00 2001 From: Hugh Dickins Date: Wed, 10 Aug 2022 21:55:36 -0700 Subject: mm/shmem: tmpfs fallocate use file_modified() 5.18 fixed the btrfs and ext4 fallocates to use file_modified(), as xfs was already doing, to drop privileges: and fstests generic/{683,684,688} expect this. There's no need to argue over keep-size allocation (which could just update ctime): fix shmem_fallocate() to behave the same way. Link: https://lkml.kernel.org/r/39c5e62-4896-7795-c0a0-f79c50d4909@google.com Signed-off-by: Hugh Dickins Acked-by: Christian Brauner (Microsoft) Cc: "Darrick J. Wong" Cc: Matthew Wilcox (Oracle) Cc: Radoslaw Burny Cc: "Theodore Ts'o" Signed-off-by: Andrew Morton --- mm/shmem.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'mm') diff --git a/mm/shmem.c b/mm/shmem.c index 170b4078420f..ce2090744c5e 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -2839,12 +2839,13 @@ static long shmem_fallocate(struct file *file, int mode, loff_t offset, if (!(mode & FALLOC_FL_KEEP_SIZE) && offset + len > inode->i_size) i_size_write(inode, offset + len); - inode->i_ctime = current_time(inode); undone: spin_lock(&inode->i_lock); inode->i_private = NULL; spin_unlock(&inode->i_lock); out: + if (!error) + file_modified(file); inode_unlock(inode); return error; } -- cgit v1.2.3 From 76d36dea02691a8ffa8cd7368eecbf727b8a1c0c Mon Sep 17 00:00:00 2001 From: Hugh Dickins Date: Wed, 10 Aug 2022 22:06:33 -0700 Subject: mm/shmem: shmem_replace_page() remember NR_SHMEM Elsewhere, NR_SHMEM is updated at the same time as shmem NR_FILE_PAGES; but shmem_replace_page() was forgetting to do that - so NR_SHMEM stats could grow too big or too small, in those unusual cases when it's used. Link: https://lkml.kernel.org/r/cec7c09d-5874-e160-ada6-6e10ee48784@google.com Signed-off-by: Hugh Dickins Reviewed-by: Matthew Wilcox (Oracle) Cc: "Darrick J. Wong" Cc: Radoslaw Burny Cc: "Theodore Ts'o" Signed-off-by: Andrew Morton --- mm/shmem.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'mm') diff --git a/mm/shmem.c b/mm/shmem.c index ce2090744c5e..d075dd2dcc48 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -1659,7 +1659,9 @@ static int shmem_replace_page(struct page **pagep, gfp_t gfp, new = page_folio(newpage); mem_cgroup_migrate(old, new); __inc_lruvec_page_state(newpage, NR_FILE_PAGES); + __inc_lruvec_page_state(newpage, NR_SHMEM); __dec_lruvec_page_state(oldpage, NR_FILE_PAGES); + __dec_lruvec_page_state(oldpage, NR_SHMEM); } xa_unlock_irq(&swap_mapping->i_pages); -- cgit v1.2.3 From ab74ef708dc51df7cf2b8a890b9c6990fac5c0c6 Mon Sep 17 00:00:00 2001 From: Miaohe Lin Date: Tue, 12 Jul 2022 21:05:42 +0800 Subject: mm/hugetlb: avoid corrupting page->mapping in hugetlb_mcopy_atomic_pte In MCOPY_ATOMIC_CONTINUE case with a non-shared VMA, pages in the page cache are installed in the ptes. But hugepage_add_new_anon_rmap is called for them mistakenly because they're not vm_shared. This will corrupt the page->mapping used by page cache code. Link: https://lkml.kernel.org/r/20220712130542.18836-1-linmiaohe@huawei.com Fixes: f619147104c8 ("userfaultfd: add UFFDIO_CONTINUE ioctl") Signed-off-by: Miaohe Lin Reviewed-by: Mike Kravetz Cc: Axel Rasmussen Cc: Peter Xu Cc: Signed-off-by: Andrew Morton --- mm/hugetlb.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'mm') diff --git a/mm/hugetlb.c b/mm/hugetlb.c index 2480ba627aa5..e070b8593b37 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -6041,7 +6041,7 @@ int hugetlb_mcopy_atomic_pte(struct mm_struct *dst_mm, if (!huge_pte_none_mostly(huge_ptep_get(dst_pte))) goto out_release_unlock; - if (vm_shared) { + if (page_in_pagecache) { page_dup_file_rmap(page, true); } else { ClearHPageRestoreReserve(page); -- cgit v1.2.3 From 9dfb3b8d655022760ca68af11821f1c63aa547c3 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Sat, 30 Jul 2022 05:25:18 +0100 Subject: shmem: update folio if shmem_replace_page() updates the page If we allocate a new page, we need to make sure that our folio matches that new page. If we do end up in this code path, we store the wrong page in the shmem inode's page cache, and I would rather imagine that data corruption ensues. This will be solved by changing shmem_replace_page() to shmem_replace_folio(), but this is the minimal fix. Link: https://lkml.kernel.org/r/20220730042518.1264767-1-willy@infradead.org Fixes: da08e9b79323 ("mm/shmem: convert shmem_swapin_page() to shmem_swapin_folio()") Signed-off-by: Matthew Wilcox (Oracle) Reviewed-by: William Kucharski Cc: Hugh Dickins Cc: Signed-off-by: Andrew Morton --- mm/shmem.c | 1 + 1 file changed, 1 insertion(+) (limited to 'mm') diff --git a/mm/shmem.c b/mm/shmem.c index d075dd2dcc48..42e5888bf84d 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -1782,6 +1782,7 @@ static int shmem_swapin_folio(struct inode *inode, pgoff_t index, if (shmem_should_replace_folio(folio, gfp)) { error = shmem_replace_page(&page, gfp, info, index); + folio = page_folio(page); if (error) goto failed; } -- cgit v1.2.3 From f87904c075515f3e1d8f4a7115869d3b914674fd Mon Sep 17 00:00:00 2001 From: Khazhismel Kumykov Date: Mon, 1 Aug 2022 08:50:34 -0700 Subject: writeback: avoid use-after-free after removing device When a disk is removed, bdi_unregister gets called to stop further writeback and wait for associated delayed work to complete. However, wb_inode_writeback_end() may schedule bandwidth estimation dwork after this has completed, which can result in the timer attempting to access the just freed bdi_writeback. Fix this by checking if the bdi_writeback is alive, similar to when scheduling writeback work. Since this requires wb->work_lock, and wb_inode_writeback_end() may get called from interrupt, switch wb->work_lock to an irqsafe lock. Link: https://lkml.kernel.org/r/20220801155034.3772543-1-khazhy@google.com Fixes: 45a2966fd641 ("writeback: fix bandwidth estimate for spiky workload") Signed-off-by: Khazhismel Kumykov Reviewed-by: Jan Kara Cc: Michael Stapelberg Cc: Wu Fengguang Cc: Alexander Viro Cc: Signed-off-by: Andrew Morton --- mm/backing-dev.c | 10 +++++----- mm/page-writeback.c | 6 +++++- 2 files changed, 10 insertions(+), 6 deletions(-) (limited to 'mm') diff --git a/mm/backing-dev.c b/mm/backing-dev.c index 95550b8fa7fe..de65cb1e5f76 100644 --- a/mm/backing-dev.c +++ b/mm/backing-dev.c @@ -260,10 +260,10 @@ void wb_wakeup_delayed(struct bdi_writeback *wb) unsigned long timeout; timeout = msecs_to_jiffies(dirty_writeback_interval * 10); - spin_lock_bh(&wb->work_lock); + spin_lock_irq(&wb->work_lock); if (test_bit(WB_registered, &wb->state)) queue_delayed_work(bdi_wq, &wb->dwork, timeout); - spin_unlock_bh(&wb->work_lock); + spin_unlock_irq(&wb->work_lock); } static void wb_update_bandwidth_workfn(struct work_struct *work) @@ -334,12 +334,12 @@ static void cgwb_remove_from_bdi_list(struct bdi_writeback *wb); static void wb_shutdown(struct bdi_writeback *wb) { /* Make sure nobody queues further work */ - spin_lock_bh(&wb->work_lock); + spin_lock_irq(&wb->work_lock); if (!test_and_clear_bit(WB_registered, &wb->state)) { - spin_unlock_bh(&wb->work_lock); + spin_unlock_irq(&wb->work_lock); return; } - spin_unlock_bh(&wb->work_lock); + spin_unlock_irq(&wb->work_lock); cgwb_remove_from_bdi_list(wb); /* diff --git a/mm/page-writeback.c b/mm/page-writeback.c index d0d466a5c804..032a7bf8d259 100644 --- a/mm/page-writeback.c +++ b/mm/page-writeback.c @@ -2892,6 +2892,7 @@ static void wb_inode_writeback_start(struct bdi_writeback *wb) static void wb_inode_writeback_end(struct bdi_writeback *wb) { + unsigned long flags; atomic_dec(&wb->writeback_inodes); /* * Make sure estimate of writeback throughput gets updated after @@ -2900,7 +2901,10 @@ static void wb_inode_writeback_end(struct bdi_writeback *wb) * that if multiple inodes end writeback at a similar time, they get * batched into one bandwidth update. */ - queue_delayed_work(bdi_wq, &wb->bw_dwork, BANDWIDTH_INTERVAL); + spin_lock_irqsave(&wb->work_lock, flags); + if (test_bit(WB_registered, &wb->state)) + queue_delayed_work(bdi_wq, &wb->bw_dwork, BANDWIDTH_INTERVAL); + spin_unlock_irqrestore(&wb->work_lock, flags); } bool __folio_end_writeback(struct folio *folio) -- cgit v1.2.3 From a5d2172180e8f94a8cfc7a7fa0243035629bf8d0 Mon Sep 17 00:00:00 2001 From: Sergey Senozhatsky Date: Tue, 16 Aug 2022 14:09:06 +0900 Subject: mm/zsmalloc: do not attempt to free IS_ERR handle zsmalloc() now returns ERR_PTR values as handles, which zram accidentally can pass to zs_free(). Another bad scenario is when zcomp_compress() fails - handle has default -ENOMEM value, and zs_free() will try to free that "pointer value". Add the missing check and make sure that zs_free() bails out when ERR_PTR() is passed to it. Link: https://lkml.kernel.org/r/20220816050906.2583956-1-senozhatsky@chromium.org Fixes: c7e6f17b52e9 ("zsmalloc: zs_malloc: return ERR_PTR on failure") Signed-off-by: Sergey Senozhatsky Cc: Minchan Kim Cc: Nitin Gupta , Signed-off-by: Andrew Morton --- mm/zsmalloc.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'mm') diff --git a/mm/zsmalloc.c b/mm/zsmalloc.c index 34f784a1604b..907c9b1e1e61 100644 --- a/mm/zsmalloc.c +++ b/mm/zsmalloc.c @@ -1487,7 +1487,7 @@ void zs_free(struct zs_pool *pool, unsigned long handle) struct size_class *class; enum fullness_group fullness; - if (unlikely(!handle)) + if (IS_ERR_OR_NULL((void *)handle)) return; /* -- cgit v1.2.3 From dd0ff4d12dd284c334f7e9b07f8f335af856ac78 Mon Sep 17 00:00:00 2001 From: Liu Shixin Date: Fri, 19 Aug 2022 17:40:05 +0800 Subject: bootmem: remove the vmemmap pages from kmemleak in put_page_bootmem The vmemmap pages is marked by kmemleak when allocated from memblock. Remove it from kmemleak when freeing the page. Otherwise, when we reuse the page, kmemleak may report such an error and then stop working. kmemleak: Cannot insert 0xffff98fb6eab3d40 into the object search tree (overlaps existing) kmemleak: Kernel memory leak detector disabled kmemleak: Object 0xffff98fb6be00000 (size 335544320): kmemleak: comm "swapper", pid 0, jiffies 4294892296 kmemleak: min_count = 0 kmemleak: count = 0 kmemleak: flags = 0x1 kmemleak: checksum = 0 kmemleak: backtrace: Link: https://lkml.kernel.org/r/20220819094005.2928241-1-liushixin2@huawei.com Fixes: f41f2ed43ca5 (mm: hugetlb: free the vmemmap pages associated with each HugeTLB page) Signed-off-by: Liu Shixin Reviewed-by: Muchun Song Cc: Matthew Wilcox Cc: Mike Kravetz Cc: Oscar Salvador Cc: Signed-off-by: Andrew Morton --- mm/bootmem_info.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'mm') diff --git a/mm/bootmem_info.c b/mm/bootmem_info.c index f18a631e7479..b1efebfcf94b 100644 --- a/mm/bootmem_info.c +++ b/mm/bootmem_info.c @@ -12,6 +12,7 @@ #include #include #include +#include void get_page_bootmem(unsigned long info, struct page *page, unsigned long type) { @@ -33,6 +34,7 @@ void put_page_bootmem(struct page *page) ClearPagePrivate(page); set_page_private(page, 0); INIT_LIST_HEAD(&page->lru); + kmemleak_free_part(page_to_virt(page), PAGE_SIZE); free_reserved_page(page); } } -- cgit v1.2.3 From d26f60703606ab425eee9882b32a1781a8bed74d Mon Sep 17 00:00:00 2001 From: Badari Pulavarty Date: Sun, 21 Aug 2022 18:08:53 +0000 Subject: mm/damon/dbgfs: avoid duplicate context directory creation When user tries to create a DAMON context via the DAMON debugfs interface with a name of an already existing context, the context directory creation fails but a new context is created and added in the internal data structure, due to absence of the directory creation success check. As a result, memory could leak and DAMON cannot be turned on. An example test case is as below: # cd /sys/kernel/debug/damon/ # echo "off" > monitor_on # echo paddr > target_ids # echo "abc" > mk_context # echo "abc" > mk_context # echo $$ > abc/target_ids # echo "on" > monitor_on <<< fails Return value of 'debugfs_create_dir()' is expected to be ignored in general, but this is an exceptional case as DAMON feature is depending on the debugfs functionality and it has the potential duplicate name issue. This commit therefore fixes the issue by checking the directory creation failure and immediately return the error in the case. Link: https://lkml.kernel.org/r/20220821180853.2400-1-sj@kernel.org Fixes: 75c1c2b53c78 ("mm/damon/dbgfs: support multiple contexts") Signed-off-by: Badari Pulavarty Signed-off-by: SeongJae Park Cc: [ 5.15.x] Signed-off-by: Andrew Morton --- mm/damon/dbgfs.c | 3 +++ 1 file changed, 3 insertions(+) (limited to 'mm') diff --git a/mm/damon/dbgfs.c b/mm/damon/dbgfs.c index cb8a7e9926a4..cfdf63132d5a 100644 --- a/mm/damon/dbgfs.c +++ b/mm/damon/dbgfs.c @@ -818,6 +818,9 @@ static int dbgfs_mk_context(char *name) return -ENOENT; new_dir = debugfs_create_dir(name, root); + /* Below check is required for a potential duplicated name case */ + if (IS_ERR(new_dir)) + return PTR_ERR(new_dir); dbgfs_dirs[dbgfs_nr_ctxs] = new_dir; new_ctx = dbgfs_new_ctx(); -- cgit v1.2.3 From 3d2f78f08cd8388035ac375e731ec1ac1b79b09d Mon Sep 17 00:00:00 2001 From: Peter Xu Date: Tue, 23 Aug 2022 18:11:38 -0400 Subject: mm/mprotect: only reference swap pfn page if type match Yu Zhao reported a bug after the commit "mm/swap: Add swp_offset_pfn() to fetch PFN from swap entry" added a check in swp_offset_pfn() for swap type [1]: kernel BUG at include/linux/swapops.h:117! CPU: 46 PID: 5245 Comm: EventManager_De Tainted: G S O L 6.0.0-dbg-DEV #2 RIP: 0010:pfn_swap_entry_to_page+0x72/0xf0 Code: c6 48 8b 36 48 83 fe ff 74 53 48 01 d1 48 83 c1 08 48 8b 09 f6 c1 01 75 7b 66 90 48 89 c1 48 8b 09 f6 c1 01 74 74 5d c3 eb 9e <0f> 0b 48 ba ff ff ff ff 03 00 00 00 eb ae a9 ff 0f 00 00 75 13 48 RSP: 0018:ffffa59e73fabb80 EFLAGS: 00010282 RAX: 00000000ffffffe8 RBX: 0c00000000000000 RCX: ffffcd5440000000 RDX: 1ffffffffff7a80a RSI: 0000000000000000 RDI: 0c0000000000042b RBP: ffffa59e73fabb80 R08: ffff9965ca6e8bb8 R09: 0000000000000000 R10: ffffffffa5a2f62d R11: 0000030b372e9fff R12: ffff997b79db5738 R13: 000000000000042b R14: 0c0000000000042b R15: 1ffffffffff7a80a FS: 00007f549d1bb700(0000) GS:ffff99d3cf680000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 CR2: 0000440d035b3180 CR3: 0000002243176004 CR4: 00000000003706e0 DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400 Call Trace: change_pte_range+0x36e/0x880 change_p4d_range+0x2e8/0x670 change_protection_range+0x14e/0x2c0 mprotect_fixup+0x1ee/0x330 do_mprotect_pkey+0x34c/0x440 __x64_sys_mprotect+0x1d/0x30 It triggers because pfn_swap_entry_to_page() could be called upon e.g. a genuine swap entry. Fix it by only calling it when it's a write migration entry where the page* is used. [1] https://lore.kernel.org/lkml/CAOUHufaVC2Za-p8m0aiHw6YkheDcrO-C3wRGixwDS32VTS+k1w@mail.gmail.com/ Link: https://lkml.kernel.org/r/20220823221138.45602-1-peterx@redhat.com Fixes: 6c287605fd56 ("mm: remember exclusively mapped anonymous pages with PG_anon_exclusive") Signed-off-by: Peter Xu Reported-by: Yu Zhao Tested-by: Yu Zhao Reviewed-by: David Hildenbrand Cc: "Huang, Ying" Cc: Signed-off-by: Andrew Morton --- mm/mprotect.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'mm') diff --git a/mm/mprotect.c b/mm/mprotect.c index 3a23dde73723..bc6bddd156ca 100644 --- a/mm/mprotect.c +++ b/mm/mprotect.c @@ -196,10 +196,11 @@ static unsigned long change_pte_range(struct mmu_gather *tlb, pages++; } else if (is_swap_pte(oldpte)) { swp_entry_t entry = pte_to_swp_entry(oldpte); - struct page *page = pfn_swap_entry_to_page(entry); pte_t newpte; if (is_writable_migration_entry(entry)) { + struct page *page = pfn_swap_entry_to_page(entry); + /* * A protection check is difficult so * just be safe and disable write -- cgit v1.2.3 From 2555283eb40df89945557273121e9393ef9b542b Mon Sep 17 00:00:00 2001 From: Jann Horn Date: Wed, 31 Aug 2022 19:06:00 +0200 Subject: mm/rmap: Fix anon_vma->degree ambiguity leading to double-reuse anon_vma->degree tracks the combined number of child anon_vmas and VMAs that use the anon_vma as their ->anon_vma. anon_vma_clone() then assumes that for any anon_vma attached to src->anon_vma_chain other than src->anon_vma, it is impossible for it to be a leaf node of the VMA tree, meaning that for such VMAs ->degree is elevated by 1 because of a child anon_vma, meaning that if ->degree equals 1 there are no VMAs that use the anon_vma as their ->anon_vma. This assumption is wrong because the ->degree optimization leads to leaf nodes being abandoned on anon_vma_clone() - an existing anon_vma is reused and no new parent-child relationship is created. So it is possible to reuse an anon_vma for one VMA while it is still tied to another VMA. This is an issue because is_mergeable_anon_vma() and its callers assume that if two VMAs have the same ->anon_vma, the list of anon_vmas attached to the VMAs is guaranteed to be the same. When this assumption is violated, vma_merge() can merge pages into a VMA that is not attached to the corresponding anon_vma, leading to dangling page->mapping pointers that will be dereferenced during rmap walks. Fix it by separately tracking the number of child anon_vmas and the number of VMAs using the anon_vma as their ->anon_vma. Fixes: 7a3ef208e662 ("mm: prevent endless growth of anon_vma hierarchy") Cc: stable@kernel.org Acked-by: Michal Hocko Acked-by: Vlastimil Babka Signed-off-by: Jann Horn Signed-off-by: Linus Torvalds --- mm/rmap.c | 29 ++++++++++++++++------------- 1 file changed, 16 insertions(+), 13 deletions(-) (limited to 'mm') diff --git a/mm/rmap.c b/mm/rmap.c index edc06c52bc82..93d5a6f793d2 100644 --- a/mm/rmap.c +++ b/mm/rmap.c @@ -93,7 +93,8 @@ static inline struct anon_vma *anon_vma_alloc(void) anon_vma = kmem_cache_alloc(anon_vma_cachep, GFP_KERNEL); if (anon_vma) { atomic_set(&anon_vma->refcount, 1); - anon_vma->degree = 1; /* Reference for first vma */ + anon_vma->num_children = 0; + anon_vma->num_active_vmas = 0; anon_vma->parent = anon_vma; /* * Initialise the anon_vma root to point to itself. If called @@ -201,6 +202,7 @@ int __anon_vma_prepare(struct vm_area_struct *vma) anon_vma = anon_vma_alloc(); if (unlikely(!anon_vma)) goto out_enomem_free_avc; + anon_vma->num_children++; /* self-parent link for new root */ allocated = anon_vma; } @@ -210,8 +212,7 @@ int __anon_vma_prepare(struct vm_area_struct *vma) if (likely(!vma->anon_vma)) { vma->anon_vma = anon_vma; anon_vma_chain_link(vma, avc, anon_vma); - /* vma reference or self-parent link for new root */ - anon_vma->degree++; + anon_vma->num_active_vmas++; allocated = NULL; avc = NULL; } @@ -296,19 +297,19 @@ int anon_vma_clone(struct vm_area_struct *dst, struct vm_area_struct *src) anon_vma_chain_link(dst, avc, anon_vma); /* - * Reuse existing anon_vma if its degree lower than two, - * that means it has no vma and only one anon_vma child. + * Reuse existing anon_vma if it has no vma and only one + * anon_vma child. * - * Do not choose parent anon_vma, otherwise first child - * will always reuse it. Root anon_vma is never reused: + * Root anon_vma is never reused: * it has self-parent reference and at least one child. */ if (!dst->anon_vma && src->anon_vma && - anon_vma != src->anon_vma && anon_vma->degree < 2) + anon_vma->num_children < 2 && + anon_vma->num_active_vmas == 0) dst->anon_vma = anon_vma; } if (dst->anon_vma) - dst->anon_vma->degree++; + dst->anon_vma->num_active_vmas++; unlock_anon_vma_root(root); return 0; @@ -358,6 +359,7 @@ int anon_vma_fork(struct vm_area_struct *vma, struct vm_area_struct *pvma) anon_vma = anon_vma_alloc(); if (!anon_vma) goto out_error; + anon_vma->num_active_vmas++; avc = anon_vma_chain_alloc(GFP_KERNEL); if (!avc) goto out_error_free_anon_vma; @@ -378,7 +380,7 @@ int anon_vma_fork(struct vm_area_struct *vma, struct vm_area_struct *pvma) vma->anon_vma = anon_vma; anon_vma_lock_write(anon_vma); anon_vma_chain_link(vma, avc, anon_vma); - anon_vma->parent->degree++; + anon_vma->parent->num_children++; anon_vma_unlock_write(anon_vma); return 0; @@ -410,7 +412,7 @@ void unlink_anon_vmas(struct vm_area_struct *vma) * to free them outside the lock. */ if (RB_EMPTY_ROOT(&anon_vma->rb_root.rb_root)) { - anon_vma->parent->degree--; + anon_vma->parent->num_children--; continue; } @@ -418,7 +420,7 @@ void unlink_anon_vmas(struct vm_area_struct *vma) anon_vma_chain_free(avc); } if (vma->anon_vma) { - vma->anon_vma->degree--; + vma->anon_vma->num_active_vmas--; /* * vma would still be needed after unlink, and anon_vma will be prepared @@ -436,7 +438,8 @@ void unlink_anon_vmas(struct vm_area_struct *vma) list_for_each_entry_safe(avc, next, &vma->anon_vma_chain, same_vma) { struct anon_vma *anon_vma = avc->anon_vma; - VM_WARN_ON(anon_vma->degree); + VM_WARN_ON(anon_vma->num_children); + VM_WARN_ON(anon_vma->num_active_vmas); put_anon_vma(anon_vma); list_del(&avc->same_vma); -- cgit v1.2.3 From 0495e337b7039191dfce6e03f5f830454b1fae6b Mon Sep 17 00:00:00 2001 From: Waiman Long Date: Fri, 12 Aug 2022 14:30:33 -0400 Subject: mm/slab_common: Deleting kobject in kmem_cache_destroy() without holding slab_mutex/cpu_hotplug_lock A circular locking problem is reported by lockdep due to the following circular locking dependency. +--> cpu_hotplug_lock --> slab_mutex --> kn->active --+ | | +-----------------------------------------------------+ The forward cpu_hotplug_lock ==> slab_mutex ==> kn->active dependency happens in kmem_cache_destroy(): cpus_read_lock(); mutex_lock(&slab_mutex); ==> sysfs_slab_unlink() ==> kobject_del() ==> kernfs_remove() ==> __kernfs_remove() ==> kernfs_drain(): rwsem_acquire(&kn->dep_map, ...); The backward kn->active ==> cpu_hotplug_lock dependency happens in kernfs_fop_write_iter(): kernfs_get_active(); ==> slab_attr_store() ==> cpu_partial_store() ==> flush_all(): cpus_read_lock() One way to break this circular locking chain is to avoid holding cpu_hotplug_lock and slab_mutex while deleting the kobject in sysfs_slab_unlink() which should be equivalent to doing a write_lock and write_unlock pair of the kn->active virtual lock. Since the kobject structures are not protected by slab_mutex or the cpu_hotplug_lock, we can certainly release those locks before doing the delete operation. Move sysfs_slab_unlink() and sysfs_slab_release() to the newly created kmem_cache_release() and call it outside the slab_mutex & cpu_hotplug_lock critical sections. There will be a slight delay in the deletion of sysfs files if kmem_cache_release() is called indirectly from a work function. Fixes: 5a836bf6b09f ("mm: slub: move flush_cpu_slab() invocations __free_slab() invocations out of IRQ context") Signed-off-by: Waiman Long Reviewed-by: Hyeonggon Yoo <42.hyeyoo@gmail.com> Reviewed-by: Roman Gushchin Acked-by: David Rientjes Link: https://lore.kernel.org/all/YwOImVd+nRUsSAga@hyeyoo/ Signed-off-by: Vlastimil Babka --- mm/slab_common.c | 45 +++++++++++++++++++++++++++++---------------- 1 file changed, 29 insertions(+), 16 deletions(-) (limited to 'mm') diff --git a/mm/slab_common.c b/mm/slab_common.c index 17996649cfe3..07b948288f84 100644 --- a/mm/slab_common.c +++ b/mm/slab_common.c @@ -392,6 +392,28 @@ kmem_cache_create(const char *name, unsigned int size, unsigned int align, } EXPORT_SYMBOL(kmem_cache_create); +#ifdef SLAB_SUPPORTS_SYSFS +/* + * For a given kmem_cache, kmem_cache_destroy() should only be called + * once or there will be a use-after-free problem. The actual deletion + * and release of the kobject does not need slab_mutex or cpu_hotplug_lock + * protection. So they are now done without holding those locks. + * + * Note that there will be a slight delay in the deletion of sysfs files + * if kmem_cache_release() is called indrectly from a work function. + */ +static void kmem_cache_release(struct kmem_cache *s) +{ + sysfs_slab_unlink(s); + sysfs_slab_release(s); +} +#else +static void kmem_cache_release(struct kmem_cache *s) +{ + slab_kmem_cache_release(s); +} +#endif + static void slab_caches_to_rcu_destroy_workfn(struct work_struct *work) { LIST_HEAD(to_destroy); @@ -418,11 +440,7 @@ static void slab_caches_to_rcu_destroy_workfn(struct work_struct *work) list_for_each_entry_safe(s, s2, &to_destroy, list) { debugfs_slab_release(s); kfence_shutdown_cache(s); -#ifdef SLAB_SUPPORTS_SYSFS - sysfs_slab_release(s); -#else - slab_kmem_cache_release(s); -#endif + kmem_cache_release(s); } } @@ -437,20 +455,11 @@ static int shutdown_cache(struct kmem_cache *s) list_del(&s->list); if (s->flags & SLAB_TYPESAFE_BY_RCU) { -#ifdef SLAB_SUPPORTS_SYSFS - sysfs_slab_unlink(s); -#endif list_add_tail(&s->list, &slab_caches_to_rcu_destroy); schedule_work(&slab_caches_to_rcu_destroy_work); } else { kfence_shutdown_cache(s); debugfs_slab_release(s); -#ifdef SLAB_SUPPORTS_SYSFS - sysfs_slab_unlink(s); - sysfs_slab_release(s); -#else - slab_kmem_cache_release(s); -#endif } return 0; @@ -465,14 +474,16 @@ void slab_kmem_cache_release(struct kmem_cache *s) void kmem_cache_destroy(struct kmem_cache *s) { + int refcnt; + if (unlikely(!s) || !kasan_check_byte(s)) return; cpus_read_lock(); mutex_lock(&slab_mutex); - s->refcount--; - if (s->refcount) + refcnt = --s->refcount; + if (refcnt) goto out_unlock; WARN(shutdown_cache(s), @@ -481,6 +492,8 @@ void kmem_cache_destroy(struct kmem_cache *s) out_unlock: mutex_unlock(&slab_mutex); cpus_read_unlock(); + if (!refcnt && !(s->flags & SLAB_TYPESAFE_BY_RCU)) + kmem_cache_release(s); } EXPORT_SYMBOL(kmem_cache_destroy); -- cgit v1.2.3 From 8782fb61cc848364e1e1599d76d3c9dd58a1cc06 Mon Sep 17 00:00:00 2001 From: Steven Price Date: Fri, 2 Sep 2022 12:26:12 +0100 Subject: mm: pagewalk: Fix race between unmap and page walker MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The mmap lock protects the page walker from changes to the page tables during the walk. However a read lock is insufficient to protect those areas which don't have a VMA as munmap() detaches the VMAs before downgrading to a read lock and actually tearing down PTEs/page tables. For users of walk_page_range() the solution is to simply call pte_hole() immediately without checking the actual page tables when a VMA is not present. We now never call __walk_page_range() without a valid vma. For walk_page_range_novma() the locking requirements are tightened to require the mmap write lock to be taken, and then walking the pgd directly with 'no_vma' set. This in turn means that all page walkers either have a valid vma, or it's that special 'novma' case for page table debugging. As a result, all the odd '(!walk->vma && !walk->no_vma)' tests can be removed. Fixes: dd2283f2605e ("mm: mmap: zap pages with read mmap_sem in munmap") Reported-by: Jann Horn Signed-off-by: Steven Price Cc: Vlastimil Babka Cc: Thomas Hellström Cc: Konstantin Khlebnikov Cc: Andrew Morton Signed-off-by: Linus Torvalds --- mm/pagewalk.c | 21 ++++++++++++--------- mm/ptdump.c | 4 ++-- 2 files changed, 14 insertions(+), 11 deletions(-) (limited to 'mm') diff --git a/mm/pagewalk.c b/mm/pagewalk.c index 9b3db11a4d1d..fa7a3d21a751 100644 --- a/mm/pagewalk.c +++ b/mm/pagewalk.c @@ -110,7 +110,7 @@ static int walk_pmd_range(pud_t *pud, unsigned long addr, unsigned long end, do { again: next = pmd_addr_end(addr, end); - if (pmd_none(*pmd) || (!walk->vma && !walk->no_vma)) { + if (pmd_none(*pmd)) { if (ops->pte_hole) err = ops->pte_hole(addr, next, depth, walk); if (err) @@ -171,7 +171,7 @@ static int walk_pud_range(p4d_t *p4d, unsigned long addr, unsigned long end, do { again: next = pud_addr_end(addr, end); - if (pud_none(*pud) || (!walk->vma && !walk->no_vma)) { + if (pud_none(*pud)) { if (ops->pte_hole) err = ops->pte_hole(addr, next, depth, walk); if (err) @@ -366,19 +366,19 @@ static int __walk_page_range(unsigned long start, unsigned long end, struct vm_area_struct *vma = walk->vma; const struct mm_walk_ops *ops = walk->ops; - if (vma && ops->pre_vma) { + if (ops->pre_vma) { err = ops->pre_vma(start, end, walk); if (err) return err; } - if (vma && is_vm_hugetlb_page(vma)) { + if (is_vm_hugetlb_page(vma)) { if (ops->hugetlb_entry) err = walk_hugetlb_range(start, end, walk); } else err = walk_pgd_range(start, end, walk); - if (vma && ops->post_vma) + if (ops->post_vma) ops->post_vma(walk); return err; @@ -450,9 +450,13 @@ int walk_page_range(struct mm_struct *mm, unsigned long start, if (!vma) { /* after the last vma */ walk.vma = NULL; next = end; + if (ops->pte_hole) + err = ops->pte_hole(start, next, -1, &walk); } else if (start < vma->vm_start) { /* outside vma */ walk.vma = NULL; next = min(end, vma->vm_start); + if (ops->pte_hole) + err = ops->pte_hole(start, next, -1, &walk); } else { /* inside vma */ walk.vma = vma; next = min(end, vma->vm_end); @@ -470,9 +474,8 @@ int walk_page_range(struct mm_struct *mm, unsigned long start, } if (err < 0) break; - } - if (walk.vma || walk.ops->pte_hole) err = __walk_page_range(start, next, &walk); + } if (err) break; } while (start = next, start < end); @@ -501,9 +504,9 @@ int walk_page_range_novma(struct mm_struct *mm, unsigned long start, if (start >= end || !walk.mm) return -EINVAL; - mmap_assert_locked(walk.mm); + mmap_assert_write_locked(walk.mm); - return __walk_page_range(start, end, &walk); + return walk_pgd_range(start, end, &walk); } int walk_page_vma(struct vm_area_struct *vma, const struct mm_walk_ops *ops, diff --git a/mm/ptdump.c b/mm/ptdump.c index eea3d28d173c..8adab455a68b 100644 --- a/mm/ptdump.c +++ b/mm/ptdump.c @@ -152,13 +152,13 @@ void ptdump_walk_pgd(struct ptdump_state *st, struct mm_struct *mm, pgd_t *pgd) { const struct ptdump_range *range = st->range; - mmap_read_lock(mm); + mmap_write_lock(mm); while (range->start != range->end) { walk_page_range_novma(mm, range->start, range->end, &ptdump_ops, pgd, st); range++; } - mmap_read_unlock(mm); + mmap_write_unlock(mm); /* Flush out the last page */ st->note_page(st, 0, -1, 0); -- cgit v1.2.3