From f5e31a196edcd1f1bb44f26b6f9299b9a5b9b3c4 Mon Sep 17 00:00:00 2001 From: Youngjun Park Date: Sun, 2 Nov 2025 17:24:56 +0900 Subject: mm: swap: remove duplicate nr_swap_pages decrement in get_swap_page_of_type() After commit 4f78252da887, nr_swap_pages is decremented in swap_range_alloc(). Since cluster_alloc_swap_entry() calls swap_range_alloc() internally, the decrement in get_swap_page_of_type() causes double-decrementing. As a representative userspace-visible runtime example of the impact, /proc/meminfo reports increasingly inaccurate SwapFree values. The discrepancy grows with each swap allocation, and during hibernation when large amounts of memory are written to swap, the reported value can deviate significantly from actual available swap space, misleading users and monitoring tools. Remove the duplicate decrement. Link: https://lkml.kernel.org/r/20251102082456.79807-1-youngjun.park@lge.com Fixes: 4f78252da887 ("mm: swap: move nr_swap_pages counter decrement from folio_alloc_swap() to swap_range_alloc()") Signed-off-by: Youngjun Park Acked-by: Chris Li Reviewed-by: Barry Song Reviewed-by: Kairui Song Acked-by: Nhat Pham Cc: Baoquan He Cc: Kemeng Shi Cc: [6.17+] Signed-off-by: Andrew Morton --- mm/swapfile.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) (limited to 'mm') diff --git a/mm/swapfile.c b/mm/swapfile.c index 10760240a3a2..a1b4b9d80e3b 100644 --- a/mm/swapfile.c +++ b/mm/swapfile.c @@ -2005,10 +2005,8 @@ swp_entry_t get_swap_page_of_type(int type) local_lock(&percpu_swap_cluster.lock); offset = cluster_alloc_swap_entry(si, 0, 1); local_unlock(&percpu_swap_cluster.lock); - if (offset) { + if (offset) entry = swp_entry(si->type, offset); - atomic_long_dec(&nr_swap_pages); - } } put_swap_device(si); } -- cgit v1.2.3 From de8798965fd0d9a6c47fc2ac57767ec32de12b49 Mon Sep 17 00:00:00 2001 From: Deepanshu Kartikey Date: Wed, 12 Nov 2025 20:20:34 +0530 Subject: mm/memfd: fix information leak in hugetlb folios When allocating hugetlb folios for memfd, three initialization steps are missing: 1. Folios are not zeroed, leading to kernel memory disclosure to userspace 2. Folios are not marked uptodate before adding to page cache 3. hugetlb_fault_mutex is not taken before hugetlb_add_to_page_cache() The memfd allocation path bypasses the normal page fault handler (hugetlb_no_page) which would handle all of these initialization steps. This is problematic especially for udmabuf use cases where folios are pinned and directly accessed by userspace via DMA. Fix by matching the initialization pattern used in hugetlb_no_page(): - Zero the folio using folio_zero_user() which is optimized for huge pages - Mark it uptodate with folio_mark_uptodate() - Take hugetlb_fault_mutex before adding to page cache to prevent races The folio_zero_user() change also fixes a potential security issue where uninitialized kernel memory could be disclosed to userspace through read() or mmap() operations on the memfd. Link: https://lkml.kernel.org/r/20251112145034.2320452-1-kartikey406@gmail.com Fixes: 89c1905d9c14 ("mm/gup: introduce memfd_pin_folios() for pinning memfd folios") Signed-off-by: Deepanshu Kartikey Reported-by: syzbot+f64019ba229e3a5c411b@syzkaller.appspotmail.com Link: https://lore.kernel.org/all/20251112031631.2315651-1-kartikey406@gmail.com/ [v1] Closes: https://syzkaller.appspot.com/bug?extid=f64019ba229e3a5c411b Suggested-by: Oscar Salvador Suggested-by: David Hildenbrand Tested-by: syzbot+f64019ba229e3a5c411b@syzkaller.appspotmail.com Acked-by: Oscar Salvador Acked-by: David Hildenbrand (Red Hat) Acked-by: Hugh Dickins Cc: Vivek Kasireddy Cc: Jason Gunthorpe Cc: Jason Gunthorpe (v2) Cc: Christoph Hellwig (v6) Cc: Dave Airlie Cc: Gerd Hoffmann Cc: Signed-off-by: Andrew Morton --- mm/memfd.c | 27 +++++++++++++++++++++++++++ 1 file changed, 27 insertions(+) (limited to 'mm') diff --git a/mm/memfd.c b/mm/memfd.c index 1d109c1acf21..a405eaa451ee 100644 --- a/mm/memfd.c +++ b/mm/memfd.c @@ -96,9 +96,36 @@ struct folio *memfd_alloc_folio(struct file *memfd, pgoff_t idx) NULL, gfp_mask); if (folio) { + u32 hash; + + /* + * Zero the folio to prevent information leaks to userspace. + * Use folio_zero_user() which is optimized for huge/gigantic + * pages. Pass 0 as addr_hint since this is not a faulting path + * and we don't have a user virtual address yet. + */ + folio_zero_user(folio, 0); + + /* + * Mark the folio uptodate before adding to page cache, + * as required by filemap.c and other hugetlb paths. + */ + __folio_mark_uptodate(folio); + + /* + * Serialize hugepage allocation and instantiation to prevent + * races with concurrent allocations, as required by all other + * callers of hugetlb_add_to_page_cache(). + */ + hash = hugetlb_fault_mutex_hash(memfd->f_mapping, idx); + mutex_lock(&hugetlb_fault_mutex_table[hash]); + err = hugetlb_add_to_page_cache(folio, memfd->f_mapping, idx); + + mutex_unlock(&hugetlb_fault_mutex_table[hash]); + if (err) { folio_put(folio); goto err_unresv; -- cgit v1.2.3 From 270065f514c5da7b3deb4f6a96cf93e96551cf35 Mon Sep 17 00:00:00 2001 From: "Liam R. Howlett" Date: Tue, 11 Nov 2025 16:56:05 -0500 Subject: mm/mmap_lock: reset maple state on lock_vma_under_rcu() retry The retry in lock_vma_under_rcu() drops the rcu read lock before reacquiring the lock and trying again. This may cause a use-after-free if the maple node the maple state was using was freed. The maple state is protected by the rcu read lock. When the lock is dropped, the state cannot be reused as it tracks pointers to objects that may be freed during the time where the lock was not held. Any time the rcu read lock is dropped, the maple state must be invalidated. Resetting the address and state to MA_START is the safest course of action, which will result in the next operation starting from the top of the tree. Prior to commit 0b16f8bed19c ("mm: change vma_start_read() to drop RCU lock on failure"), vma_start_read() would drop rcu read lock and return NULL, so the retry would not have happened. However, now that vma_start_read() drops rcu read lock on failure followed by a retry, we may end up using a freed maple tree node cached in the maple state. [surenb@google.com: changelog alteration] Link: https://lkml.kernel.org/r/CAJuCfpEWMD-Z1j=nPYHcQW4F7E2Wka09KTXzGv7VE7oW1S8hcw@mail.gmail.com Link: https://lkml.kernel.org/r/20251111215605.1721380-1-Liam.Howlett@oracle.com Signed-off-by: Liam R. Howlett Fixes: 0b16f8bed19c ("mm: change vma_start_read() to drop RCU lock on failure") Reported-by: syzbot+131f9eb2b5807573275c@syzkaller.appspotmail.com Closes: https://syzkaller.appspot.com/bug?extid=131f9eb2b5807573275c Acked-by: Vlastimil Babka Reviewed-by: Suren Baghdasaryan Reviewed-by: Lorenzo Stoakes Cc: Jann Horn Cc: Shakeel Butt Cc: Matthew Wilcox Signed-off-by: Andrew Morton --- mm/mmap_lock.c | 1 + 1 file changed, 1 insertion(+) (limited to 'mm') diff --git a/mm/mmap_lock.c b/mm/mmap_lock.c index 0a0db5849b8e..42e3dde73e74 100644 --- a/mm/mmap_lock.c +++ b/mm/mmap_lock.c @@ -241,6 +241,7 @@ retry: if (PTR_ERR(vma) == -EAGAIN) { count_vm_vma_lock_event(VMA_LOCK_MISS); /* The area was replaced with another one */ + mas_set(&mas, address); goto retry; } -- cgit v1.2.3 From cff47b9e39a6abf03dde5f4f156f841b0c54bba0 Mon Sep 17 00:00:00 2001 From: Wei Yang Date: Wed, 19 Nov 2025 23:53:02 +0000 Subject: mm/huge_memory: fix NULL pointer deference when splitting folio Commit c010d47f107f ("mm: thp: split huge page to any lower order pages") introduced an early check on the folio's order via mapping->flags before proceeding with the split work. This check introduced a bug: for shmem folios in the swap cache and truncated folios, the mapping pointer can be NULL. Accessing mapping->flags in this state leads directly to a NULL pointer dereference. This commit fixes the issue by moving the check for mapping != NULL before any attempt to access mapping->flags. Link: https://lkml.kernel.org/r/20251119235302.24773-1-richard.weiyang@gmail.com Fixes: c010d47f107f ("mm: thp: split huge page to any lower order pages") Signed-off-by: Wei Yang Reviewed-by: Zi Yan Acked-by: David Hildenbrand (Red Hat) Reviewed-by: Baolin Wang Cc: Signed-off-by: Andrew Morton --- mm/huge_memory.c | 22 ++++++++++------------ 1 file changed, 10 insertions(+), 12 deletions(-) (limited to 'mm') diff --git a/mm/huge_memory.c b/mm/huge_memory.c index 2f2a521e5d68..6cba1cb14b23 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -3619,6 +3619,16 @@ static int __folio_split(struct folio *folio, unsigned int new_order, if (folio != page_folio(split_at) || folio != page_folio(lock_at)) return -EINVAL; + /* + * Folios that just got truncated cannot get split. Signal to the + * caller that there was a race. + * + * TODO: this will also currently refuse shmem folios that are in the + * swapcache. + */ + if (!is_anon && !folio->mapping) + return -EBUSY; + if (new_order >= folio_order(folio)) return -EINVAL; @@ -3659,18 +3669,6 @@ static int __folio_split(struct folio *folio, unsigned int new_order, gfp_t gfp; mapping = folio->mapping; - - /* Truncated ? */ - /* - * TODO: add support for large shmem folio in swap cache. - * When shmem is in swap cache, mapping is NULL and - * folio_test_swapcache() is true. - */ - if (!mapping) { - ret = -EBUSY; - goto out; - } - min_order = mapping_min_folio_order(folio->mapping); if (new_order < min_order) { ret = -EINVAL; -- cgit v1.2.3 From 7c9580f44f90f7a4c11fc7831efe323ebe446091 Mon Sep 17 00:00:00 2001 From: Kiryl Shutsemau Date: Thu, 20 Nov 2025 16:14:11 +0000 Subject: mm/filemap: fix logic around SIGBUS in filemap_map_pages() Chris noticed that filemap_map_pages() calculates can_map_large only once for the first page in the fault around range. The value is not valid for the following pages in the range and must be recalculated. Instead of recalculating can_map_large on each iteration, pass down file_end to filemap_map_folio_range() and let it make the decision on what can be mapped. Link: https://lkml.kernel.org/r/20251120161411.859078-1-kirill@shutemov.name Fixes: 74207de2ba10 ("mm/memory: do not populate page table entries beyond i_size")h Signed-off-by: Kiryl Shutsemau Reported-by: Chris Mason Reviewed-by: Matthew Wilcox (Oracle) Cc: Al Viro Cc: Baolin Wang Cc: Chris Mason Cc: Christian Brauner Cc: "Darrick J. Wong" Cc: Dave Chinner Cc: David Hildenbrand Cc: Hugh Dickins Cc: Johannes Weiner Cc: Liam Howlett Cc: Lorenzo Stoakes Cc: Michal Hocko Cc: Mike Rapoport Cc: Rik van Riel Cc: Shakeel Butt Cc: Suren Baghdasaryan Cc: Vlastimil Babka Signed-off-by: Andrew Morton --- mm/filemap.c | 27 ++++++++++++++------------- 1 file changed, 14 insertions(+), 13 deletions(-) (limited to 'mm') diff --git a/mm/filemap.c b/mm/filemap.c index 2f1e7e283a51..024b71da5224 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -3682,8 +3682,9 @@ static vm_fault_t filemap_map_folio_range(struct vm_fault *vmf, struct folio *folio, unsigned long start, unsigned long addr, unsigned int nr_pages, unsigned long *rss, unsigned short *mmap_miss, - bool can_map_large) + pgoff_t file_end) { + struct address_space *mapping = folio->mapping; unsigned int ref_from_caller = 1; vm_fault_t ret = 0; struct page *page = folio_page(folio, start); @@ -3692,12 +3693,16 @@ static vm_fault_t filemap_map_folio_range(struct vm_fault *vmf, unsigned long addr0; /* - * Map the large folio fully where possible. + * Map the large folio fully where possible: * - * The folio must not cross VMA or page table boundary. + * - The folio is fully within size of the file or belong + * to shmem/tmpfs; + * - The folio doesn't cross VMA boundary; + * - The folio doesn't cross page table boundary; */ addr0 = addr - start * PAGE_SIZE; - if (can_map_large && folio_within_vma(folio, vmf->vma) && + if ((file_end >= folio_next_index(folio) || shmem_mapping(mapping)) && + folio_within_vma(folio, vmf->vma) && (addr0 & PMD_MASK) == ((addr0 + folio_size(folio) - 1) & PMD_MASK)) { vmf->pte -= start; page -= start; @@ -3812,7 +3817,6 @@ vm_fault_t filemap_map_pages(struct vm_fault *vmf, unsigned long rss = 0; unsigned int nr_pages = 0, folio_type; unsigned short mmap_miss = 0, mmap_miss_saved; - bool can_map_large; rcu_read_lock(); folio = next_uptodate_folio(&xas, mapping, end_pgoff); @@ -3823,16 +3827,14 @@ vm_fault_t filemap_map_pages(struct vm_fault *vmf, end_pgoff = min(end_pgoff, file_end); /* - * Do not allow to map with PTEs beyond i_size and with PMD - * across i_size to preserve SIGBUS semantics. + * Do not allow to map with PMD across i_size to preserve + * SIGBUS semantics. * * Make an exception for shmem/tmpfs that for long time * intentionally mapped with PMDs across i_size. */ - can_map_large = shmem_mapping(mapping) || - file_end >= folio_next_index(folio); - - if (can_map_large && filemap_map_pmd(vmf, folio, start_pgoff)) { + if ((file_end >= folio_next_index(folio) || shmem_mapping(mapping)) && + filemap_map_pmd(vmf, folio, start_pgoff)) { ret = VM_FAULT_NOPAGE; goto out; } @@ -3861,8 +3863,7 @@ vm_fault_t filemap_map_pages(struct vm_fault *vmf, else ret |= filemap_map_folio_range(vmf, folio, xas.xa_index - folio->index, addr, - nr_pages, &rss, &mmap_miss, - can_map_large); + nr_pages, &rss, &mmap_miss, file_end); folio_unlock(folio); } while ((folio = next_uptodate_folio(&xas, mapping, end_pgoff)) != NULL); -- cgit v1.2.3