From 8a1968bd997f45a9b11aefeabdd1232e1b6c7184 Mon Sep 17 00:00:00 2001 From: Kairui Song Date: Tue, 20 Jan 2026 00:11:21 +0800 Subject: mm/shmem, swap: fix race of truncate and swap entry split The helper for shmem swap freeing does not handle the order of swap entries correctly. It uses xa_cmpxchg_irq to erase the swap entry, but it reads the entry order beforehand using xa_get_order without lock protection, so it may see an outdated order value if the entry is split or changed in other ways between the xa_get_order and the xa_cmpxchg_irq. Besides, the order could grow larger than expected and cause truncation to erase data beyond the end border. For example, if the target entry and the following entries are swapped in or freed, and then a large folio is added in their place and swapped out using the same entry, the xa_cmpxchg_irq will still succeed, although this is very unlikely to happen. To fix that, open code the Xarray cmpxchg and put the order retrieval and value checking in the same critical section. Also, ensure the order won't exceed the end border, and skip the entry if it goes across the border. Skipping large swap entries that cross the end border is safe here. Shmem truncate iterates the range twice: in the first iteration, find_lock_entries already filtered out such entries, and shmem will swap in the entries that cross the end border and partially truncate the folio (splitting the folio, or at least zeroing part of it). So in the second loop here, if we see a swap entry that crosses the end border, it must already have had its content erased. I observed random swapoff hangs and kernel panics when stress testing ZSWAP with shmem. After applying this patch, all problems are gone. Link: https://lkml.kernel.org/r/20260120-shmem-swap-fix-v3-1-3d33ebfbc057@tencent.com Fixes: 809bc86517cc ("mm: shmem: support large folio swap out") Signed-off-by: Kairui Song Reviewed-by: Nhat Pham Acked-by: Chris Li Cc: Baolin Wang Cc: Baoquan He Cc: Barry Song Cc: Hugh Dickins Cc: Kemeng Shi Cc: Signed-off-by: Andrew Morton --- mm/shmem.c | 45 ++++++++++++++++++++++++++++++++++----------- 1 file changed, 34 insertions(+), 11 deletions(-) diff --git a/mm/shmem.c b/mm/shmem.c index ec6c01378e9d..6c3485d24d66 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -962,17 +962,29 @@ static void shmem_delete_from_page_cache(struct folio *folio, void *radswap) * being freed).
*/ static long shmem_free_swap(struct address_space *mapping, - pgoff_t index, void *radswap) + pgoff_t index, pgoff_t end, void *radswap) { - int order = xa_get_order(&mapping->i_pages, index); - void *old; + XA_STATE(xas, &mapping->i_pages, index); + unsigned int nr_pages = 0; + pgoff_t base; + void *entry; - old = xa_cmpxchg_irq(&mapping->i_pages, index, radswap, NULL, 0); - if (old != radswap) - return 0; - free_swap_and_cache_nr(radix_to_swp_entry(radswap), 1 << order); + xas_lock_irq(&xas); + entry = xas_load(&xas); + if (entry == radswap) { + nr_pages = 1 << xas_get_order(&xas); + base = round_down(xas.xa_index, nr_pages); + if (base < index || base + nr_pages - 1 > end) + nr_pages = 0; + else + xas_store(&xas, NULL); + } + xas_unlock_irq(&xas); + + if (nr_pages) + free_swap_and_cache_nr(radix_to_swp_entry(radswap), nr_pages); - return 1 << order; + return nr_pages; } /* @@ -1124,8 +1136,8 @@ static void shmem_undo_range(struct inode *inode, loff_t lstart, loff_t lend, if (xa_is_value(folio)) { if (unfalloc) continue; - nr_swaps_freed += shmem_free_swap(mapping, - indices[i], folio); + nr_swaps_freed += shmem_free_swap(mapping, indices[i], + end - 1, folio); continue; } @@ -1191,12 +1203,23 @@ whole_folios: folio = fbatch.folios[i]; if (xa_is_value(folio)) { + int order; long swaps_freed; if (unfalloc) continue; - swaps_freed = shmem_free_swap(mapping, indices[i], folio); + swaps_freed = shmem_free_swap(mapping, indices[i], + end - 1, folio); if (!swaps_freed) { + /* + * If we found a large swap entry crossing the end border, + * skip it, as the truncate_inode_partial_folio above + * should have zeroed its content at least once. + */ + order = shmem_confirm_swap(mapping, indices[i], + radix_to_swp_entry(folio)); + if (order > 0 && indices[i] + (1 << order) > end) + continue; /* Swap was replaced by page: retry */ index = indices[i]; break; -- cgit v1.2.3 From 9b47d4eea3f7c1f620e95bda1d6221660bde7d7b Mon Sep 17 00:00:00 2001 From: Andrey Ryabinin Date: Tue, 13 Jan 2026 20:15:15 +0100 Subject: mm/kasan: fix KASAN poisoning in vrealloc() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit A KASAN warning can be triggered when vrealloc() changes the requested size to a value that is not aligned to KASAN_GRANULE_SIZE. ------------[ cut here ]------------ WARNING: CPU: 2 PID: 1 at mm/kasan/shadow.c:174 kasan_unpoison+0x40/0x48 ... pc : kasan_unpoison+0x40/0x48 lr : __kasan_unpoison_vmalloc+0x40/0x68 Call trace: kasan_unpoison+0x40/0x48 (P) vrealloc_node_align_noprof+0x200/0x320 bpf_patch_insn_data+0x90/0x2f0 convert_ctx_accesses+0x8c0/0x1158 bpf_check+0x1488/0x1900 bpf_prog_load+0xd20/0x1258 __sys_bpf+0x96c/0xdf0 __arm64_sys_bpf+0x50/0xa0 invoke_syscall+0x90/0x160 Introduce a dedicated kasan_vrealloc() helper that centralizes KASAN handling for vmalloc reallocations. The helper accounts for KASAN granule alignment when growing or shrinking an allocation and ensures that partial granules are handled correctly. Use this helper from vrealloc_node_align_noprof() to fix the poisoning logic.
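As a worked example of the granule arithmetic the helper performs (a minimal sketch with hypothetical numbers, not part of the patch; generic KASAN uses a KASAN_GRANULE_SIZE of 8 bytes): shrinking an allocation from 41 to 20 bytes first poisons the partial granule past byte 20, then poisons only whole granules:

	old_size = 41; new_size = 20;
	kasan_poison_last_granule(addr, new_size);		/* bytes 20..23 */
	new_size = round_up(new_size, KASAN_GRANULE_SIZE);	/* 24 */
	old_size = round_up(old_size, KASAN_GRANULE_SIZE);	/* 48 */
	if (new_size < old_size)				/* poison 24..47 */
		__kasan_poison_vmalloc(addr + new_size, old_size - new_size);

Growing is symmetric: the old size is rounded down instead, so the partial granule at the old end is unpoisoned together with the newly valid tail.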
[ryabinin.a.a@gmail.com: move kasan_enabled() check, fix build] Link: https://lkml.kernel.org/r/20260119144509.32767-1-ryabinin.a.a@gmail.com Link: https://lkml.kernel.org/r/20260113191516.31015-1-ryabinin.a.a@gmail.com Fixes: d699440f58ce ("mm: fix vrealloc()'s KASAN poisoning logic") Signed-off-by: Andrey Ryabinin Reported-by: Maciej Żenczykowski Reported-by: Closes: https://lkml.kernel.org/r/CANP3RGeuRW53vukDy7WDO3FiVgu34-xVJYkfpm08oLO3odYFrA@mail.gmail.com Reviewed-by: Andrey Konovalov Tested-by: Maciej Wieczor-Retman Cc: Alexander Potapenko Cc: Dmitriy Vyukov Cc: Dmitry Vyukov Cc: Uladzislau Rezki Cc: Vincenzo Frascino Cc: Signed-off-by: Andrew Morton --- include/linux/kasan.h | 14 ++++++++++++++ mm/kasan/common.c | 21 +++++++++++++++++++++ mm/vmalloc.c | 7 ++----- 3 files changed, 37 insertions(+), 5 deletions(-) diff --git a/include/linux/kasan.h b/include/linux/kasan.h index 9c6ac4b62eb9..338a1921a50a 100644 --- a/include/linux/kasan.h +++ b/include/linux/kasan.h @@ -641,6 +641,17 @@ kasan_unpoison_vmap_areas(struct vm_struct **vms, int nr_vms, __kasan_unpoison_vmap_areas(vms, nr_vms, flags); } +void __kasan_vrealloc(const void *start, unsigned long old_size, + unsigned long new_size); + +static __always_inline void kasan_vrealloc(const void *start, + unsigned long old_size, + unsigned long new_size) +{ + if (kasan_enabled()) + __kasan_vrealloc(start, old_size, new_size); +} + #else /* CONFIG_KASAN_VMALLOC */ static inline void kasan_populate_early_vm_area_shadow(void *start, @@ -670,6 +681,9 @@ kasan_unpoison_vmap_areas(struct vm_struct **vms, int nr_vms, kasan_vmalloc_flags_t flags) { } +static inline void kasan_vrealloc(const void *start, unsigned long old_size, + unsigned long new_size) { } + #endif /* CONFIG_KASAN_VMALLOC */ #if (defined(CONFIG_KASAN_GENERIC) || defined(CONFIG_KASAN_SW_TAGS)) && \ diff --git a/mm/kasan/common.c b/mm/kasan/common.c index ed489a14dddf..b7d05c2a6d93 100644 --- a/mm/kasan/common.c +++ b/mm/kasan/common.c @@ -606,4 +606,25 @@ void __kasan_unpoison_vmap_areas(struct vm_struct **vms, int nr_vms, __kasan_unpoison_vmalloc(addr, size, flags | KASAN_VMALLOC_KEEP_TAG); } } + +void __kasan_vrealloc(const void *addr, unsigned long old_size, + unsigned long new_size) +{ + if (new_size < old_size) { + kasan_poison_last_granule(addr, new_size); + + new_size = round_up(new_size, KASAN_GRANULE_SIZE); + old_size = round_up(old_size, KASAN_GRANULE_SIZE); + if (new_size < old_size) + __kasan_poison_vmalloc(addr + new_size, + old_size - new_size); + } else if (new_size > old_size) { + old_size = round_down(old_size, KASAN_GRANULE_SIZE); + __kasan_unpoison_vmalloc(addr + old_size, + new_size - old_size, + KASAN_VMALLOC_PROT_NORMAL | + KASAN_VMALLOC_VM_ALLOC | + KASAN_VMALLOC_KEEP_TAG); + } +} #endif diff --git a/mm/vmalloc.c b/mm/vmalloc.c index 628f96e83b11..e286c2d2068c 100644 --- a/mm/vmalloc.c +++ b/mm/vmalloc.c @@ -4322,7 +4322,7 @@ void *vrealloc_node_align_noprof(const void *p, size_t size, unsigned long align if (want_init_on_free() || want_init_on_alloc(flags)) memset((void *)p + size, 0, old_size - size); vm->requested_size = size; - kasan_poison_vmalloc(p + size, old_size - size); + kasan_vrealloc(p, old_size, size); return (void *)p; } @@ -4330,16 +4330,13 @@ void *vrealloc_node_align_noprof(const void *p, size_t size, unsigned long align * We already have the bytes available in the allocation; use them. 
*/ if (size <= alloced_size) { - kasan_unpoison_vmalloc(p + old_size, size - old_size, - KASAN_VMALLOC_PROT_NORMAL | - KASAN_VMALLOC_VM_ALLOC | - KASAN_VMALLOC_KEEP_TAG); /* * No need to zero memory here, as unused memory will have * already been zeroed at initial allocation time or during * realloc shrink time. */ vm->requested_size = size; + kasan_vrealloc(p, old_size, size); return (void *)p; } -- cgit v1.2.3 From a0f3c0845a4ff68d403c568266d17e9cc553e561 Mon Sep 17 00:00:00 2001 From: "robin.kuo" Date: Fri, 16 Jan 2026 14:25:00 +0800 Subject: mm, swap: restore swap_space attr to avoid kernel panic commit 8b47299a411a ("mm, swap: mark swap address space ro and add context debug check") made the swap address space read-only. This may lead to a kernel panic if arch_prepare_to_swap() returns a failure under heavy memory pressure, as follows: el1_abort+0x40/0x64 el1h_64_sync_handler+0x48/0xcc el1h_64_sync+0x84/0x88 errseq_set+0x4c/0xb8 (P) __filemap_set_wb_err+0x20/0xd0 shrink_folio_list+0xc20/0x11cc evict_folios+0x1520/0x1be4 try_to_shrink_lruvec+0x27c/0x3dc shrink_one+0x9c/0x228 shrink_node+0xb3c/0xeac do_try_to_free_pages+0x170/0x4f0 try_to_free_pages+0x334/0x534 __alloc_pages_direct_reclaim+0x90/0x158 __alloc_pages_slowpath+0x334/0x588 __alloc_frozen_pages_noprof+0x224/0x2fc __folio_alloc_noprof+0x14/0x64 vma_alloc_zeroed_movable_folio+0x34/0x44 do_pte_missing+0xad4/0x1040 handle_mm_fault+0x4a4/0x790 do_page_fault+0x288/0x5f8 do_translation_fault+0x38/0x54 do_mem_abort+0x54/0xa8 Restore the swap address space as non-read-only to avoid the panic. Link: https://lkml.kernel.org/r/20260116062535.306453-2-robin.kuo@mediatek.com Fixes: 8b47299a411a ("mm, swap: mark swap address space ro and add context debug check") Signed-off-by: robin.kuo Reviewed-by: Andrew Morton Cc: andrew.yang Cc: AngeloGioacchino Del Regno Cc: Baoquan He Cc: Barry Song Cc: Chinwen Chang Cc: Chris Li Cc: Kairui Song Cc: Kairui Song Cc: Kemeng Shi Cc: Matthias Brugger Cc: Nhat Pham Cc: Qun-wei Lin Cc: Signed-off-by: Andrew Morton --- mm/swap.h | 2 +- mm/swap_state.c | 3 +-- 2 files changed, 2 insertions(+), 3 deletions(-) diff --git a/mm/swap.h b/mm/swap.h index d034c13d8dd2..1bd466da3039 100644 --- a/mm/swap.h +++ b/mm/swap.h @@ -198,7 +198,7 @@ int swap_writeout(struct folio *folio, struct swap_iocb **swap_plug); void __swap_writepage(struct folio *folio, struct swap_iocb **swap_plug); /* linux/mm/swap_state.c */ -extern struct address_space swap_space __ro_after_init; +extern struct address_space swap_space __read_mostly; static inline struct address_space *swap_address_space(swp_entry_t entry) { return &swap_space; } diff --git a/mm/swap_state.c b/mm/swap_state.c index 5f97c6ae70a2..44d228982521 100644 --- a/mm/swap_state.c +++ b/mm/swap_state.c @@ -37,8 +37,7 @@ static const struct address_space_operations swap_aops = { #endif }; -/* Set swap_space as read only as swap cache is handled by swap table */ -struct address_space swap_space __ro_after_init = { +struct address_space swap_space __read_mostly = { .a_ops = &swap_aops, }; -- cgit v1.2.3 From a148a2040191b12b45b82cb29c281cb3036baf90 Mon Sep 17 00:00:00 2001 From: Jane Chu Date: Tue, 20 Jan 2026 16:22:33 -0700 Subject: mm/memory-failure: fix missing ->mf_stats count in hugetlb poison When a newly poisoned subpage ends up in an already poisoned hugetlb folio, 'num_poisoned_pages' is incremented, but the per-node ->mf_stats is not. Fix the inconsistency by designating action_result() to update them both.
While at it, define __get_huge_page_for_hwpoison() return values in terms of symbol names for better readability. Also rename folio_set_hugetlb_hwpoison() to hugetlb_update_hwpoison(), since the function does more than the conventional bit setting and because three possible return values are expected. Link: https://lkml.kernel.org/r/20260120232234.3462258-1-jane.chu@oracle.com Fixes: 18f41fa616ee ("mm: memory-failure: bump memory failure stats to pglist_data") Signed-off-by: Jane Chu Acked-by: Miaohe Lin Cc: Chris Mason Cc: David Hildenbrand Cc: David Rientjes Cc: Jiaqi Yan Cc: Liam R. Howlett Cc: Lorenzo Stoakes Cc: Matthew Wilcox (Oracle) Cc: Michal Hocko Cc: Mike Rapoport Cc: Muchun Song Cc: Naoya Horiguchi Cc: Oscar Salvador Cc: Suren Baghdasaryan Cc: William Roche Cc: Signed-off-by: Andrew Morton --- mm/memory-failure.c | 93 ++++++++++++++++++++++++++++++++--------------------- 1 file changed, 56 insertions(+), 37 deletions(-) diff --git a/mm/memory-failure.c b/mm/memory-failure.c index c80c2907da33..473204359e1f 100644 --- a/mm/memory-failure.c +++ b/mm/memory-failure.c @@ -1883,12 +1883,22 @@ static unsigned long __folio_free_raw_hwp(struct folio *folio, bool move_flag) return count; } -static int folio_set_hugetlb_hwpoison(struct folio *folio, struct page *page) +#define MF_HUGETLB_FREED 0 /* freed hugepage */ +#define MF_HUGETLB_IN_USED 1 /* in-use hugepage */ +#define MF_HUGETLB_NON_HUGEPAGE 2 /* not a hugepage */ +#define MF_HUGETLB_FOLIO_PRE_POISONED 3 /* folio already poisoned */ +#define MF_HUGETLB_PAGE_PRE_POISONED 4 /* exact page already poisoned */ +#define MF_HUGETLB_RETRY 5 /* hugepage is busy, retry */ +/* + * Set hugetlb folio as hwpoisoned, update folio private raw hwpoison list + * to keep track of the poisoned pages. + */ +static int hugetlb_update_hwpoison(struct folio *folio, struct page *page) { struct llist_head *head; struct raw_hwp_page *raw_hwp; struct raw_hwp_page *p; - int ret = folio_test_set_hwpoison(folio) ? -EHWPOISON : 0; + int ret = folio_test_set_hwpoison(folio) ? MF_HUGETLB_FOLIO_PRE_POISONED : 0; /* * Once the hwpoison hugepage has lost reliable raw error info, @@ -1896,20 +1906,17 @@ static int folio_set_hugetlb_hwpoison(struct folio *folio, struct page *page) * so skip to add additional raw error info. */ if (folio_test_hugetlb_raw_hwp_unreliable(folio)) - return -EHWPOISON; + return MF_HUGETLB_FOLIO_PRE_POISONED; head = raw_hwp_list_head(folio); llist_for_each_entry(p, head->first, node) { if (p->page == page) - return -EHWPOISON; + return MF_HUGETLB_PAGE_PRE_POISONED; } raw_hwp = kmalloc(sizeof(struct raw_hwp_page), GFP_ATOMIC); if (raw_hwp) { raw_hwp->page = page; llist_add(&raw_hwp->node, head); - /* the first error event will be counted in action_result(). */ - if (ret) - num_poisoned_pages_inc(page_to_pfn(page)); } else { /* * Failed to save raw error info. We no longer trace all @@ -1957,42 +1964,39 @@ void folio_clear_hugetlb_hwpoison(struct folio *folio) /* * Called from hugetlb code with hugetlb_lock held.
- * - * Return values: - * 0 - free hugepage - * 1 - in-use hugepage - * 2 - not a hugepage - * -EBUSY - the hugepage is busy (try to retry) - * -EHWPOISON - the hugepage is already hwpoisoned */ int __get_huge_page_for_hwpoison(unsigned long pfn, int flags, bool *migratable_cleared) { struct page *page = pfn_to_page(pfn); struct folio *folio = page_folio(page); - int ret = 2; /* fallback to normal page handling */ bool count_increased = false; + int ret, rc; - if (!folio_test_hugetlb(folio)) + if (!folio_test_hugetlb(folio)) { + ret = MF_HUGETLB_NON_HUGEPAGE; goto out; - - if (flags & MF_COUNT_INCREASED) { - ret = 1; + } else if (flags & MF_COUNT_INCREASED) { + ret = MF_HUGETLB_IN_USED; count_increased = true; } else if (folio_test_hugetlb_freed(folio)) { - ret = 0; + ret = MF_HUGETLB_FREED; } else if (folio_test_hugetlb_migratable(folio)) { - ret = folio_try_get(folio); - if (ret) + if (folio_try_get(folio)) { + ret = MF_HUGETLB_IN_USED; count_increased = true; + } else { + ret = MF_HUGETLB_FREED; + } } else { - ret = -EBUSY; + ret = MF_HUGETLB_RETRY; if (!(flags & MF_NO_RETRY)) goto out; } - if (folio_set_hugetlb_hwpoison(folio, page)) { - ret = -EHWPOISON; + rc = hugetlb_update_hwpoison(folio, page); + if (rc >= MF_HUGETLB_FOLIO_PRE_POISONED) { + ret = rc; goto out; } @@ -2017,10 +2021,16 @@ out: * with basic operations like hugepage allocation/free/demotion. * So some of prechecks for hwpoison (pinning, and testing/setting * PageHWPoison) should be done in single hugetlb_lock range. + * Returns: + * 0 - not hugetlb, or recovered + * -EBUSY - not recovered + * -EOPNOTSUPP - hwpoison_filter'ed + * -EHWPOISON - folio or exact page already poisoned + * -EFAULT - kill_accessing_process finds current->mm null */ static int try_memory_failure_hugetlb(unsigned long pfn, int flags, int *hugetlb) { - int res; + int res, rv; struct page *p = pfn_to_page(pfn); struct folio *folio; unsigned long page_flags; @@ -2029,22 +2039,31 @@ static int try_memory_failure_hugetlb(unsigned long pfn, int flags, int *hugetlb *hugetlb = 1; retry: res = get_huge_page_for_hwpoison(pfn, flags, &migratable_cleared); - if (res == 2) { /* fallback to normal page handling */ + switch (res) { + case MF_HUGETLB_NON_HUGEPAGE: /* fallback to normal page handling */ *hugetlb = 0; return 0; - } else if (res == -EHWPOISON) { - if (flags & MF_ACTION_REQUIRED) { - folio = page_folio(p); - res = kill_accessing_process(current, folio_pfn(folio), flags); - } - action_result(pfn, MF_MSG_ALREADY_POISONED, MF_FAILED); - return res; - } else if (res == -EBUSY) { + case MF_HUGETLB_RETRY: if (!(flags & MF_NO_RETRY)) { flags |= MF_NO_RETRY; goto retry; } return action_result(pfn, MF_MSG_GET_HWPOISON, MF_IGNORED); + case MF_HUGETLB_FOLIO_PRE_POISONED: + case MF_HUGETLB_PAGE_PRE_POISONED: + rv = -EHWPOISON; + if (flags & MF_ACTION_REQUIRED) { + folio = page_folio(p); + rv = kill_accessing_process(current, folio_pfn(folio), flags); + } + if (res == MF_HUGETLB_PAGE_PRE_POISONED) + action_result(pfn, MF_MSG_ALREADY_POISONED, MF_FAILED); + else + action_result(pfn, MF_MSG_HUGE, MF_FAILED); + return rv; + default: + WARN_ON((res != MF_HUGETLB_FREED) && (res != MF_HUGETLB_IN_USED)); + break; } folio = page_folio(p); @@ -2055,7 +2074,7 @@ retry: if (migratable_cleared) folio_set_hugetlb_migratable(folio); folio_unlock(folio); - if (res == 1) + if (res == MF_HUGETLB_IN_USED) folio_put(folio); return -EOPNOTSUPP; } @@ -2064,7 +2083,7 @@ retry: * Handling free hugepage. 
The possible race with hugepage allocation * or demotion can be prevented by PageHWPoison flag. */ - if (res == 0) { + if (res == MF_HUGETLB_FREED) { folio_unlock(folio); if (__page_handle_poison(p) > 0) { page_ref_inc(p); -- cgit v1.2.3 From 057a6f2632c956483e2b2628477f0fcd1cd8a844 Mon Sep 17 00:00:00 2001 From: Jane Chu Date: Tue, 20 Jan 2026 16:22:34 -0700 Subject: mm/memory-failure: teach kill_accessing_process to accept hugetlb tail page pfn When a hugetlb folio is being poisoned again, try_memory_failure_hugetlb() passed the head pfn to kill_accessing_process(), which is not right. The precise pfn of the poisoned page should be used in order to determine the precise vaddr as the SIGBUS payload. This issue has already been taken care of in the normal path, that is, hwpoison_user_mappings(), see [1][2]. Furthermore, for [3] to work correctly in the hugetlb repoisoning case, it is essential to inform the VM of the precise poisoned page, not the head page. [1] https://lkml.kernel.org/r/20231218135837.3310403-1-willy@infradead.org [2] https://lkml.kernel.org/r/20250224211445.2663312-1-jane.chu@oracle.com [3] https://lore.kernel.org/lkml/20251116013223.1557158-1-jiaqiyan@google.com/ Link: https://lkml.kernel.org/r/20260120232234.3462258-2-jane.chu@oracle.com Signed-off-by: Jane Chu Reviewed-by: Liam R. Howlett Acked-by: Miaohe Lin Cc: Chris Mason Cc: David Hildenbrand Cc: David Rientjes Cc: Jiaqi Yan Cc: Lorenzo Stoakes Cc: Matthew Wilcox (Oracle) Cc: Michal Hocko Cc: Mike Rapoport Cc: Muchun Song Cc: Naoya Horiguchi Cc: Oscar Salvador Cc: Suren Baghdasaryan Cc: William Roche Cc: Signed-off-by: Andrew Morton --- mm/memory-failure.c | 14 ++++++++------ 1 file changed, 8 insertions(+), 6 deletions(-) diff --git a/mm/memory-failure.c b/mm/memory-failure.c index 473204359e1f..cf0d526e6d41 100644 --- a/mm/memory-failure.c +++ b/mm/memory-failure.c @@ -692,6 +692,8 @@ static int check_hwpoisoned_entry(pte_t pte, unsigned long addr, short shift, unsigned long poisoned_pfn, struct to_kill *tk) { unsigned long pfn = 0; + unsigned long hwpoison_vaddr; + unsigned long mask; if (pte_present(pte)) { pfn = pte_pfn(pte); @@ -702,10 +704,12 @@ static int check_hwpoisoned_entry(pte_t pte, unsigned long addr, short shift, pfn = softleaf_to_pfn(entry); } - if (!pfn || pfn != poisoned_pfn) + mask = ~((1UL << (shift - PAGE_SHIFT)) - 1); + if (!pfn || pfn != (poisoned_pfn & mask)) return 0; - set_to_kill(tk, addr, shift); + hwpoison_vaddr = addr + ((poisoned_pfn - pfn) << PAGE_SHIFT); + set_to_kill(tk, hwpoison_vaddr, shift); return 1; } @@ -2052,10 +2056,8 @@ retry: case MF_HUGETLB_FOLIO_PRE_POISONED: case MF_HUGETLB_PAGE_PRE_POISONED: rv = -EHWPOISON; - if (flags & MF_ACTION_REQUIRED) { - folio = page_folio(p); - rv = kill_accessing_process(current, folio_pfn(folio), flags); - } + if (flags & MF_ACTION_REQUIRED) + rv = kill_accessing_process(current, pfn, flags); if (res == MF_HUGETLB_PAGE_PRE_POISONED) action_result(pfn, MF_MSG_ALREADY_POISONED, MF_FAILED); else -- cgit v1.2.3 From d54887e8e144514768df056878d27346f5b71093 Mon Sep 17 00:00:00 2001 From: Viacheslav Bocharov Date: Tue, 20 Jan 2026 11:21:59 +0300 Subject: mailmap: add entry for Viacheslav Bocharov Map my old address to my new personal address. The old domain lexina.in will no longer be accessible due to registration expiration.
Link: https://lkml.kernel.org/r/20260120082212.364268-1-adeep@lexina.in Signed-off-by: Viacheslav Bocharov Reviewed-by: Andrew Morton Signed-off-by: Andrew Morton --- .mailmap | 1 + 1 file changed, 1 insertion(+) diff --git a/.mailmap b/.mailmap index 4a8a160f28ed..a64f269dd7cb 100644 --- a/.mailmap +++ b/.mailmap @@ -850,6 +850,7 @@ Valentin Schneider Veera Sundaram Sankaran Veerabhadrarao Badiganti Venkateswara Naralasetty +Viacheslav Bocharov Vikash Garodia Vikash Garodia Vincent Mailhol -- cgit v1.2.3 From dd9e2f5b38f1fdd49b1ab6d3a85f81c14369eacc Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Wed, 21 Jan 2026 12:27:30 +0100 Subject: flex_proportions: make fprop_new_period() hardirq safe Bernd has reported a lockdep splat from flexible proportions code that is essentially complaining about the following race: run_timer_softirq - we are in softirq context call_timer_fn writeout_period fprop_new_period write_seqcount_begin(&p->sequence); ... blk_mq_end_request() blk_update_request() ext4_end_bio() folio_end_writeback() __wb_writeout_add() __fprop_add_percpu_max() if (unlikely(max_frac < FPROP_FRAC_BASE)) { fprop_fraction_percpu() seq = read_seqcount_begin(&p->sequence); - sees odd sequence so loops indefinitely Note that a deadlock like this is only possible if the bdi has a configured maximum fraction of writeout throughput, which is very rare in general but frequent for FUSE bdis, for example. To fix this problem we have to make sure the write section of the sequence counter is irq-safe. Link: https://lkml.kernel.org/r/20260121112729.24463-2-jack@suse.cz Fixes: a91befde3503 ("lib/flex_proportions.c: remove local_irq_ops in fprop_new_period()") Signed-off-by: Jan Kara Reported-by: Bernd Schubert Link: https://lore.kernel.org/all/9b845a47-9aee-43dd-99bc-1a82bea00442@bsbernd.com/ Reviewed-by: Matthew Wilcox (Oracle) Cc: Joanne Koong Cc: Miklos Szeredi Cc: Signed-off-by: Andrew Morton --- lib/flex_proportions.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/lib/flex_proportions.c b/lib/flex_proportions.c index 84ecccddc771..012d5614efb9 100644 --- a/lib/flex_proportions.c +++ b/lib/flex_proportions.c @@ -64,13 +64,14 @@ void fprop_global_destroy(struct fprop_global *p) bool fprop_new_period(struct fprop_global *p, int periods) { s64 events = percpu_counter_sum(&p->events); + unsigned long flags; /* * Don't do anything if there are no events. */ if (events <= 1) return false; - preempt_disable_nested(); + local_irq_save(flags); write_seqcount_begin(&p->sequence); if (periods < 64) events -= events >> periods; @@ -78,7 +79,7 @@ bool fprop_new_period(struct fprop_global *p, int periods) percpu_counter_add(&p->events, -events); p->period += periods; write_seqcount_end(&p->sequence); - preempt_enable_nested(); + local_irq_restore(flags); return true; } -- cgit v1.2.3 From 71e2b5eadbad43d33f0e2cf6d767395273ba5eaa Mon Sep 17 00:00:00 2001 From: "Pratyush Yadav (Google)" Date: Thu, 22 Jan 2026 16:18:39 +0100 Subject: memfd: export alloc_file() Patch series "mm: memfd_luo hotfixes". This series contains a couple of fixes for memfd preservation using LUO. This patch (of 3): The Live Update Orchestrator's (LUO) memfd preservation works by preserving all the folios of a memfd, re-creating an empty memfd on the next boot, and then inserting back the preserved folios. Currently it creates the file by directly calling shmem_file_setup(). This leaves out other work done by alloc_file() like setting up the file mode, flags, or calling the security hooks.
Export alloc_file() to let memfd_luo use it. Rename it to memfd_alloc_file() since it is no longer private and thus needs a subsystem prefix. Link: https://lkml.kernel.org/r/20260122151842.4069702-1-pratyush@kernel.org Link: https://lkml.kernel.org/r/20260122151842.4069702-2-pratyush@kernel.org Signed-off-by: Pratyush Yadav (Google) Reviewed-by: Mike Rapoport (Microsoft) Reviewed-by: Pasha Tatashin Cc: Baolin Wang Cc: Hugh Dickins Signed-off-by: Andrew Morton --- include/linux/memfd.h | 6 ++++++ mm/memfd.c | 4 ++-- 2 files changed, 8 insertions(+), 2 deletions(-) diff --git a/include/linux/memfd.h b/include/linux/memfd.h index cc74de3dbcfe..c328a7b356d0 100644 --- a/include/linux/memfd.h +++ b/include/linux/memfd.h @@ -17,6 +17,7 @@ struct folio *memfd_alloc_folio(struct file *memfd, pgoff_t idx); * to by vm_flags_ptr. */ int memfd_check_seals_mmap(struct file *file, vm_flags_t *vm_flags_ptr); +struct file *memfd_alloc_file(const char *name, unsigned int flags); #else static inline long memfd_fcntl(struct file *f, unsigned int c, unsigned int a) { @@ -31,6 +32,11 @@ static inline int memfd_check_seals_mmap(struct file *file, { return 0; } + +static inline struct file *memfd_alloc_file(const char *name, unsigned int flags) +{ + return ERR_PTR(-EINVAL); +} #endif #endif /* __LINUX_MEMFD_H */ diff --git a/mm/memfd.c b/mm/memfd.c index ab5312aff14b..f032c6052926 100644 --- a/mm/memfd.c +++ b/mm/memfd.c @@ -456,7 +456,7 @@ err_name: return ERR_PTR(error); } -static struct file *alloc_file(const char *name, unsigned int flags) +struct file *memfd_alloc_file(const char *name, unsigned int flags) { unsigned int *file_seals; struct file *file; @@ -520,5 +520,5 @@ SYSCALL_DEFINE2(memfd_create, return PTR_ERR(name); fd_flags = (flags & MFD_CLOEXEC) ? O_CLOEXEC : 0; - return FD_ADD(fd_flags, alloc_file(name, flags)); + return FD_ADD(fd_flags, memfd_alloc_file(name, flags)); } -- cgit v1.2.3 From 02e117b8ca58928f193e5b4df48d6763232f5e91 Mon Sep 17 00:00:00 2001 From: "Pratyush Yadav (Google)" Date: Thu, 22 Jan 2026 16:18:40 +0100 Subject: mm: memfd_luo: use memfd_alloc_file() instead of shmem_file_setup() When restoring a memfd, the file is created using shmem_file_setup(). While memfd creation also calls this function to get the file, it also does other things: 1. The O_LARGEFILE flag is set on the file. If this is not done, writes on the memfd exceeding 2 GiB fail. 2. FMODE_LSEEK, FMODE_PREAD, and FMODE_PWRITE are set on the file. This makes sure the file is seekable and can be used with pread() and pwrite(). 3. Initializes the security field for the inode and makes sure that inode creation is permitted by the security module. Currently, none of those things are done. This means writes above 2 GiB fail, pread() and pwrite() fail, and so on. lseek() happens to work because file_init_path() sets FMODE_LSEEK, as shmem defines fop->llseek. Fix this by using memfd_alloc_file() to get the file to make sure the initialization sequence for normal and preserved memfd is the same.
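For illustration, the extra initialization at stake looks roughly like this (a simplified sketch of what the memfd creation path does on top of shmem_file_setup(); see mm/memfd.c for the real sequence):

	file = shmem_file_setup(name, 0, VM_NORESERVE);
	file->f_mode |= FMODE_LSEEK | FMODE_PREAD | FMODE_PWRITE;
	file->f_flags |= O_LARGEFILE;	/* allow I/O beyond 2 GiB */
	/* ... plus seals setup and the security-module inode checks */

Going through memfd_alloc_file() gives a restored memfd the same treatment as a freshly created one.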
Link: https://lkml.kernel.org/r/20260122151842.4069702-3-pratyush@kernel.org Fixes: b3749f174d68 ("mm: memfd_luo: allow preserving memfd") Signed-off-by: Pratyush Yadav (Google) Reviewed-by: Mike Rapoport (Microsoft) Reviewed-by: Pasha Tatashin Cc: Baolin Wang Cc: Hugh Dickins Signed-off-by: Andrew Morton --- mm/memfd_luo.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/mm/memfd_luo.c b/mm/memfd_luo.c index 4f6ba63b4310..01a72e4d3ef6 100644 --- a/mm/memfd_luo.c +++ b/mm/memfd_luo.c @@ -78,6 +78,7 @@ #include #include #include +#include #include "internal.h" static int memfd_luo_preserve_folios(struct file *file, @@ -443,8 +444,7 @@ static int memfd_luo_retrieve(struct liveupdate_file_op_args *args) if (!ser) return -EINVAL; - file = shmem_file_setup("", 0, VM_NORESERVE); - + file = memfd_alloc_file("", 0); if (IS_ERR(file)) { pr_err("failed to setup file: %pe\n", file); return PTR_ERR(file); -- cgit v1.2.3 From c657c5dc1360fa1ed1b090aedb5883d9cf9f0a0f Mon Sep 17 00:00:00 2001 From: "Pratyush Yadav (Google)" Date: Thu, 22 Jan 2026 16:18:41 +0100 Subject: mm: memfd_luo: restore and free memfd_luo_ser on failure memfd_luo_ser has the serialization metadata. It is of no use once restoration fails. Free it on failure. Link: https://lkml.kernel.org/r/20260122151842.4069702-4-pratyush@kernel.org Fixes: b3749f174d68 ("mm: memfd_luo: allow preserving memfd") Signed-off-by: Pratyush Yadav (Google) Reviewed-by: Mike Rapoport (Microsoft) Reviewed-by: Pasha Tatashin Cc: Baolin Wang Cc: Hugh Dickins Signed-off-by: Andrew Morton --- mm/memfd_luo.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/mm/memfd_luo.c b/mm/memfd_luo.c index 01a72e4d3ef6..a34fccc23b6a 100644 --- a/mm/memfd_luo.c +++ b/mm/memfd_luo.c @@ -447,7 +447,8 @@ static int memfd_luo_retrieve(struct liveupdate_file_op_args *args) file = memfd_alloc_file("", 0); if (IS_ERR(file)) { pr_err("failed to setup file: %pe\n", file); - return PTR_ERR(file); + err = PTR_ERR(file); + goto free_ser; } vfs_setpos(file, ser->pos, MAX_LFS_FILESIZE); @@ -473,7 +474,8 @@ static int memfd_luo_retrieve(struct liveupdate_file_op_args *args) put_file: fput(file); - +free_ser: + kho_restore_free(ser); return err; } -- cgit v1.2.3 From e86436ad0ad2a9aaf88802d69b68f02cbd1f04a9 Mon Sep 17 00:00:00 2001 From: Ran Xiaokai Date: Thu, 22 Jan 2026 13:27:40 +0000 Subject: kho: init alloc tags when restoring pages from reserved memory Memblock pages (including reserved memory) should have their allocation tags initialized to CODETAG_EMPTY via clear_page_tag_ref() before being released to the page allocator. When kho restores pages through kho_restore_page(), missing this call causes mismatched allocation/deallocation tracking and below warning message: alloc_tag was not set WARNING: include/linux/alloc_tag.h:164 at ___free_pages+0xb8/0x260, CPU#1: swapper/0/1 RIP: 0010:___free_pages+0xb8/0x260 kho_restore_vmalloc+0x187/0x2e0 kho_test_init+0x3c4/0xa30 do_one_initcall+0x62/0x2b0 kernel_init_freeable+0x25b/0x480 kernel_init+0x1a/0x1c0 ret_from_fork+0x2d1/0x360 Add missing clear_page_tag_ref() annotation in kho_restore_page() to fix this. 
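The invariant at work, as a conceptual sketch (CONFIG_MEM_ALLOC_PROFILING assumed; the call site is illustrative, not taken from the patch): a page that reaches the page allocator's free path without ever having been allocated by it carries no codetag, so its tag reference must be explicitly marked empty first:

	struct page *page = kho_restore_page(phys, false);

	clear_page_tag_ref(page);	/* mark as CODETAG_EMPTY */
	__free_pages(page, 0);		/* otherwise the free path WARNs:
					   "alloc_tag was not set" */

Restored compound pages only need this on the head page, while non-compound higher-order ranges need it on every page, which is why the fix below distinguishes the two cases.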
Link: https://lkml.kernel.org/r/20260122132740.176468-1-ranxiaokai627@163.com Fixes: fc33e4b44b27 ("kexec: enable KHO support for memory preservation") Signed-off-by: Ran Xiaokai Reviewed-by: Pratyush Yadav Reviewed-by: Pasha Tatashin Reviewed-by: Mike Rapoport (Microsoft) Cc: Alexander Graf Cc: Kent Overstreet Cc: Suren Baghdasaryan Cc: Signed-off-by: Andrew Morton --- kernel/liveupdate/kexec_handover.c | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/kernel/liveupdate/kexec_handover.c b/kernel/liveupdate/kexec_handover.c index d4482b6e3cae..96767b106cac 100644 --- a/kernel/liveupdate/kexec_handover.c +++ b/kernel/liveupdate/kexec_handover.c @@ -255,6 +255,14 @@ static struct page *kho_restore_page(phys_addr_t phys, bool is_folio) if (is_folio && info.order) prep_compound_page(page, info.order); + /* Always mark headpage's codetag as empty to avoid accounting mismatch */ + clear_page_tag_ref(page); + if (!is_folio) { + /* Also do that for the non-compound tail pages */ + for (unsigned int i = 1; i < nr_pages; i++) + clear_page_tag_ref(page + i); + } + adjust_managed_page_count(page, nr_pages); return page; } -- cgit v1.2.3 From 412a32f0e53f4a50062f6f4bc18f8910aa551734 Mon Sep 17 00:00:00 2001 From: Andrew Morton Date: Wed, 21 Jan 2026 12:36:17 -0800 Subject: kho: kho_preserve_vmalloc(): don't return 0 when ENOMEM kho_preserve_vmalloc() should return -ENOMEM when new_vmalloc_chunk() fails. Reported-by: kernel test robot Reported-by: Dan Carpenter Closes: https://lore.kernel.org/r/202601211636.IRaejjdw-lkp@intel.com/ Reviewed-by: Pasha Tatashin Reviewed-by: Pratyush Yadav Reviewed-by: Mike Rapoport (Microsoft) Cc: Alexander Graf Cc: Jason Gunthorpe Cc: Pasha Tatashin Signed-off-by: Andrew Morton --- kernel/liveupdate/kexec_handover.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/kernel/liveupdate/kexec_handover.c b/kernel/liveupdate/kexec_handover.c index 96767b106cac..90d411a59f76 100644 --- a/kernel/liveupdate/kexec_handover.c +++ b/kernel/liveupdate/kexec_handover.c @@ -1014,8 +1014,10 @@ int kho_preserve_vmalloc(void *ptr, struct kho_vmalloc *preservation) chunk->phys[idx++] = phys; if (idx == ARRAY_SIZE(chunk->phys)) { chunk = new_vmalloc_chunk(chunk); - if (!chunk) + if (!chunk) { + err = -ENOMEM; goto err_free; + } idx = 0; } } -- cgit v1.2.3 From 870ff19251bf3910dda7a7245da826924045fedd Mon Sep 17 00:00:00 2001 From: Pimyn Girgis Date: Tue, 20 Jan 2026 17:15:10 +0100 Subject: mm/kfence: randomize the freelist on initialization Randomize the KFENCE freelist during pool initialization to make allocation patterns less predictable. This is achieved by shuffling the order in which metadata objects are added to the freelist using get_random_u32_below(). Additionally, ensure the error path correctly calculates the address range to be reset if initialization fails, as the address increment logic has been moved to a separate loop. 
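The shuffle in the diff below is a standard in-place Fisher-Yates pass over the metadata indices. In sketch form (a generic version, using the kernel's get_random_u32_below(), which returns a value in [0, ceil)):

	/* After this loop, every permutation of idx[] is equally likely. */
	for (i = n; i > 0; i--)
		swap(idx[i - 1], idx[get_random_u32_below(i)]);

The patch applies the same pattern to the addr fields of kfence_metadata_init[] and then builds the freelist in the shuffled order.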
Link: https://lkml.kernel.org/r/20260120161510.3289089-1-pimyn@google.com Fixes: 0ce20dd84089 ("mm: add Kernel Electric-Fence infrastructure") Signed-off-by: Pimyn Girgis Reviewed-by: Alexander Potapenko Cc: Dmitry Vyukov Cc: Marco Elver Cc: Ernesto Martínez García Cc: Greg KH Cc: Kees Cook Cc: Signed-off-by: Andrew Morton --- mm/kfence/core.c | 23 +++++++++++++++++++---- 1 file changed, 19 insertions(+), 4 deletions(-) diff --git a/mm/kfence/core.c b/mm/kfence/core.c index da0f5b6f5744..4f79ec720752 100644 --- a/mm/kfence/core.c +++ b/mm/kfence/core.c @@ -596,7 +596,7 @@ static void rcu_guarded_free(struct rcu_head *h) static unsigned long kfence_init_pool(void) { unsigned long addr, start_pfn; - int i; + int i, rand; if (!arch_kfence_init_pool()) return (unsigned long)__kfence_pool; @@ -647,13 +647,27 @@ static unsigned long kfence_init_pool(void) INIT_LIST_HEAD(&meta->list); raw_spin_lock_init(&meta->lock); meta->state = KFENCE_OBJECT_UNUSED; - meta->addr = addr; /* Initialize for validation in metadata_to_pageaddr(). */ - list_add_tail(&meta->list, &kfence_freelist); + /* Use addr to randomize the freelist. */ + meta->addr = i; /* Protect the right redzone. */ - if (unlikely(!kfence_protect(addr + PAGE_SIZE))) + if (unlikely(!kfence_protect(addr + 2 * i * PAGE_SIZE + PAGE_SIZE))) goto reset_slab; + } + + for (i = CONFIG_KFENCE_NUM_OBJECTS; i > 0; i--) { + rand = get_random_u32_below(i); + swap(kfence_metadata_init[i - 1].addr, kfence_metadata_init[rand].addr); + } + for (i = 0; i < CONFIG_KFENCE_NUM_OBJECTS; i++) { + struct kfence_metadata *meta_1 = &kfence_metadata_init[i]; + struct kfence_metadata *meta_2 = &kfence_metadata_init[meta_1->addr]; + + list_add_tail(&meta_2->list, &kfence_freelist); + } + for (i = 0; i < CONFIG_KFENCE_NUM_OBJECTS; i++) { + kfence_metadata_init[i].addr = addr; addr += 2 * PAGE_SIZE; } @@ -666,6 +680,7 @@ static unsigned long kfence_init_pool(void) return 0; reset_slab: + addr += 2 * i * PAGE_SIZE; for (i = 0; i < KFENCE_POOL_SIZE / PAGE_SIZE; i++) { struct page *page; -- cgit v1.2.3 From cbbbf7795fc35a740e1d09f3a55aba543b72a8d3 Mon Sep 17 00:00:00 2001 From: Waiman Long Date: Thu, 22 Jan 2026 13:43:43 -0500 Subject: mm/mm_init: don't cond_resched() in deferred_init_memmap_chunk() if called from deferred_grow_zone() Commit 3acb913c9d5b ("mm/mm_init: use deferred_init_memmap_chunk() in deferred_grow_zone()") made deferred_grow_zone() call deferred_init_memmap_chunk() within a pgdat_resize_lock() critical section with irqs disabled. It did check for irqs_disabled() in deferred_init_memmap_chunk() to avoid calling cond_resched(). For a PREEMPT_RT kernel build, however, spin_lock_irqsave() does not disable interrupts; rcu_read_lock() is called instead. This leads to the following bug report:
BUG: sleeping function called from invalid context at mm/mm_init.c:2091 in_atomic(): 0, irqs_disabled(): 0, non_block: 0, pid: 1, name: swapper/0 preempt_count: 0, expected: 0 RCU nest depth: 1, expected: 0 3 locks held by swapper/0/1: #0: ffff80008471b7a0 (sched_domains_mutex){+.+.}-{4:4}, at: sched_domains_mutex_lock+0x28/0x40 #1: ffff003bdfffef48 (&pgdat->node_size_lock){+.+.}-{3:3}, at: deferred_grow_zone+0x140/0x278 #2: ffff800084acf600 (rcu_read_lock){....}-{1:3}, at: rt_spin_lock+0x1b4/0x408 CPU: 0 UID: 0 PID: 1 Comm: swapper/0 Tainted: G W 6.19.0-rc6-test #1 PREEMPT_{RT,(full) } Tainted: [W]=WARN Call trace: show_stack+0x20/0x38 (C) dump_stack_lvl+0xdc/0xf8 dump_stack+0x1c/0x28 __might_resched+0x384/0x530 deferred_init_memmap_chunk+0x560/0x688 deferred_grow_zone+0x190/0x278 _deferred_grow_zone+0x18/0x30 get_page_from_freelist+0x780/0xf78 __alloc_frozen_pages_noprof+0x1dc/0x348 alloc_slab_page+0x30/0x110 allocate_slab+0x98/0x2a0 new_slab+0x4c/0x80 ___slab_alloc+0x5a4/0x770 __slab_alloc.constprop.0+0x88/0x1e0 __kmalloc_node_noprof+0x2c0/0x598 __sdt_alloc+0x3b8/0x728 build_sched_domains+0xe0/0x1260 sched_init_domains+0x14c/0x1c8 sched_init_smp+0x9c/0x1d0 kernel_init_freeable+0x218/0x358 kernel_init+0x28/0x208 ret_from_fork+0x10/0x20 Fix it by adding a new argument to deferred_init_memmap_chunk() to explicitly tell it whether cond_resched() is allowed, instead of relying on current state information which may vary depending on the exact kernel configuration options that are enabled. Link: https://lkml.kernel.org/r/20260122184343.546627-1-longman@redhat.com Fixes: 3acb913c9d5b ("mm/mm_init: use deferred_init_memmap_chunk() in deferred_grow_zone()") Signed-off-by: Waiman Long Suggested-by: Sebastian Andrzej Siewior Reviewed-by: Sebastian Andrzej Siewior Reviewed-by: Mike Rapoport (Microsoft) Cc: David Hildenbrand Cc: "Paul E.
McKenney" Cc: Steven Rostedt Cc: Wei Yang Cc: Signed-off-by: Andrew Morton --- mm/mm_init.c | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/mm/mm_init.c b/mm/mm_init.c index fc2a6f1e518f..2a809cd8e7fa 100644 --- a/mm/mm_init.c +++ b/mm/mm_init.c @@ -2059,7 +2059,7 @@ static unsigned long __init deferred_init_pages(struct zone *zone, */ static unsigned long __init deferred_init_memmap_chunk(unsigned long start_pfn, unsigned long end_pfn, - struct zone *zone) + struct zone *zone, bool can_resched) { int nid = zone_to_nid(zone); unsigned long nr_pages = 0; @@ -2085,10 +2085,10 @@ deferred_init_memmap_chunk(unsigned long start_pfn, unsigned long end_pfn, spfn = chunk_end; - if (irqs_disabled()) - touch_nmi_watchdog(); - else + if (can_resched) cond_resched(); + else + touch_nmi_watchdog(); } } @@ -2101,7 +2101,7 @@ deferred_init_memmap_job(unsigned long start_pfn, unsigned long end_pfn, { struct zone *zone = arg; - deferred_init_memmap_chunk(start_pfn, end_pfn, zone); + deferred_init_memmap_chunk(start_pfn, end_pfn, zone, true); } static unsigned int __init @@ -2216,7 +2216,7 @@ bool __init deferred_grow_zone(struct zone *zone, unsigned int order) for (spfn = first_deferred_pfn, epfn = SECTION_ALIGN_UP(spfn + 1); nr_pages < nr_pages_needed && spfn < zone_end_pfn(zone); spfn = epfn, epfn += PAGES_PER_SECTION) { - nr_pages += deferred_init_memmap_chunk(spfn, epfn, zone); + nr_pages += deferred_init_memmap_chunk(spfn, epfn, zone, false); } /* -- cgit v1.2.3 From 12b2285bf3d14372238d36215b73af02ac3bdfc1 Mon Sep 17 00:00:00 2001 From: Matthew Brost Date: Fri, 16 Jan 2026 12:10:16 +0100 Subject: mm/zone_device: reinitialize large zone device private folios MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Reinitialize metadata for large zone device private folios in zone_device_page_init prior to creating a higher-order zone device private folio. This step is necessary when the folio's order changes dynamically between zone_device_page_init calls to avoid building a corrupt folio. As part of the metadata reinitialization, the dev_pagemap must be passed in from the caller because the pgmap stored in the folio page may have been overwritten with a compound head. Without this fix, individual pages could have invalid pgmap fields and flags (with PG_locked being notably problematic) due to prior different order allocations, which can, and will, result in kernel crashes. Link: https://lkml.kernel.org/r/20260116111325.1736137-2-francois.dugast@intel.com Fixes: d245f9b4ab80 ("mm/zone_device: support large zone device private folios") Signed-off-by: Matthew Brost Signed-off-by: Francois Dugast Acked-by: Felix Kuehling Reviewed-by: Balbir Singh Acked-by: Vlastimil Babka Cc: Zi Yan Cc: Alistair Popple Cc: Madhavan Srinivasan Cc: Nicholas Piggin Cc: Michael Ellerman Cc: "Christophe Leroy (CS GROUP)" Cc: Alex Deucher Cc: "Christian König" Cc: David Airlie Cc: Simona Vetter Cc: Maarten Lankhorst Cc: Maxime Ripard Cc: Thomas Zimmermann Cc: Lyude Paul Cc: Danilo Krummrich Cc: David Hildenbrand Cc: Oscar Salvador Cc: Andrew Morton Cc: Jason Gunthorpe Cc: Leon Romanovsky Cc: Lorenzo Stoakes Cc: Liam R. 
Howlett Cc: Mike Rapoport Cc: Suren Baghdasaryan Cc: Michal Hocko Signed-off-by: Andrew Morton --- arch/powerpc/kvm/book3s_hv_uvmem.c | 2 +- drivers/gpu/drm/amd/amdkfd/kfd_migrate.c | 2 +- drivers/gpu/drm/drm_pagemap.c | 2 +- drivers/gpu/drm/nouveau/nouveau_dmem.c | 2 +- include/linux/memremap.h | 9 +++++--- lib/test_hmm.c | 4 +++- mm/memremap.c | 35 +++++++++++++++++++++++++++++++- 7 files changed, 47 insertions(+), 9 deletions(-) diff --git a/arch/powerpc/kvm/book3s_hv_uvmem.c b/arch/powerpc/kvm/book3s_hv_uvmem.c index e5000bef90f2..7cf9310de0ec 100644 --- a/arch/powerpc/kvm/book3s_hv_uvmem.c +++ b/arch/powerpc/kvm/book3s_hv_uvmem.c @@ -723,7 +723,7 @@ static struct page *kvmppc_uvmem_get_page(unsigned long gpa, struct kvm *kvm) dpage = pfn_to_page(uvmem_pfn); dpage->zone_device_data = pvt; - zone_device_page_init(dpage, 0); + zone_device_page_init(dpage, &kvmppc_uvmem_pgmap, 0); return dpage; out_clear: spin_lock(&kvmppc_uvmem_bitmap_lock); diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_migrate.c b/drivers/gpu/drm/amd/amdkfd/kfd_migrate.c index af53e796ea1b..6ada7b4af7c6 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_migrate.c +++ b/drivers/gpu/drm/amd/amdkfd/kfd_migrate.c @@ -217,7 +217,7 @@ svm_migrate_get_vram_page(struct svm_range *prange, unsigned long pfn) page = pfn_to_page(pfn); svm_range_bo_ref(prange->svm_bo); page->zone_device_data = prange->svm_bo; - zone_device_page_init(page, 0); + zone_device_page_init(page, page_pgmap(page), 0); } static void diff --git a/drivers/gpu/drm/drm_pagemap.c b/drivers/gpu/drm/drm_pagemap.c index 06c1bd8fc4d1..704f2f945019 100644 --- a/drivers/gpu/drm/drm_pagemap.c +++ b/drivers/gpu/drm/drm_pagemap.c @@ -197,7 +197,7 @@ static void drm_pagemap_get_devmem_page(struct page *page, struct drm_pagemap_zdd *zdd) { page->zone_device_data = drm_pagemap_zdd_get(zdd); - zone_device_page_init(page, 0); + zone_device_page_init(page, page_pgmap(page), 0); } /** diff --git a/drivers/gpu/drm/nouveau/nouveau_dmem.c b/drivers/gpu/drm/nouveau/nouveau_dmem.c index 58071652679d..3d8031296eed 100644 --- a/drivers/gpu/drm/nouveau/nouveau_dmem.c +++ b/drivers/gpu/drm/nouveau/nouveau_dmem.c @@ -425,7 +425,7 @@ nouveau_dmem_page_alloc_locked(struct nouveau_drm *drm, bool is_large) order = ilog2(DMEM_CHUNK_NPAGES); } - zone_device_folio_init(folio, order); + zone_device_folio_init(folio, page_pgmap(folio_page(folio, 0)), order); return page; } diff --git a/include/linux/memremap.h b/include/linux/memremap.h index 713ec0435b48..e3c2ccf872a8 100644 --- a/include/linux/memremap.h +++ b/include/linux/memremap.h @@ -224,7 +224,8 @@ static inline bool is_fsdax_page(const struct page *page) } #ifdef CONFIG_ZONE_DEVICE -void zone_device_page_init(struct page *page, unsigned int order); +void zone_device_page_init(struct page *page, struct dev_pagemap *pgmap, + unsigned int order); void *memremap_pages(struct dev_pagemap *pgmap, int nid); void memunmap_pages(struct dev_pagemap *pgmap); void *devm_memremap_pages(struct device *dev, struct dev_pagemap *pgmap); @@ -234,9 +235,11 @@ bool pgmap_pfn_valid(struct dev_pagemap *pgmap, unsigned long pfn); unsigned long memremap_compat_align(void); -static inline void zone_device_folio_init(struct folio *folio, unsigned int order) +static inline void zone_device_folio_init(struct folio *folio, + struct dev_pagemap *pgmap, + unsigned int order) { - zone_device_page_init(&folio->page, order); + zone_device_page_init(&folio->page, pgmap, order); if (order) folio_set_large_rmappable(folio); } diff --git a/lib/test_hmm.c b/lib/test_hmm.c index 
8af169d3873a..455a6862ae50 100644 --- a/lib/test_hmm.c +++ b/lib/test_hmm.c @@ -662,7 +662,9 @@ static struct page *dmirror_devmem_alloc_page(struct dmirror *dmirror, goto error; } - zone_device_folio_init(page_folio(dpage), order); + zone_device_folio_init(page_folio(dpage), + page_pgmap(folio_page(page_folio(dpage), 0)), + order); dpage->zone_device_data = rpage; return dpage; diff --git a/mm/memremap.c b/mm/memremap.c index 63c6ab4fdf08..ac7be07e3361 100644 --- a/mm/memremap.c +++ b/mm/memremap.c @@ -477,10 +477,43 @@ void free_zone_device_folio(struct folio *folio) } } -void zone_device_page_init(struct page *page, unsigned int order) +void zone_device_page_init(struct page *page, struct dev_pagemap *pgmap, + unsigned int order) { + struct page *new_page = page; + unsigned int i; + VM_WARN_ON_ONCE(order > MAX_ORDER_NR_PAGES); + for (i = 0; i < (1UL << order); ++i, ++new_page) { + struct folio *new_folio = (struct folio *)new_page; + + /* + * new_page could have been part of previous higher order folio + * which encodes the order, in page + 1, in the flags bits. We + * blindly clear bits which could have been set by the order + * field here, including page head. + */ + new_page->flags.f &= ~0xffUL; /* Clear possible order, page head */ + +#ifdef NR_PAGES_IN_LARGE_FOLIO + /* + * This pointer math looks odd, but new_page could have been + * part of a previous higher order folio, which sets _nr_pages + * in page + 1 (new_page). Therefore, we use pointer casting to + * correctly locate the _nr_pages bits within new_page which + * could have been modified by a previous higher order folio. + */ + ((struct folio *)(new_page - 1))->_nr_pages = 0; +#endif + + new_folio->mapping = NULL; + new_folio->pgmap = pgmap; /* Also clear compound head */ + new_folio->share = 0; /* fsdax only, unused for device private */ + VM_WARN_ON_FOLIO(folio_ref_count(new_folio), new_folio); + VM_WARN_ON_FOLIO(!folio_is_zone_device(new_folio), new_folio); + } + /* * Drivers shouldn't be allocating pages after calling * memunmap_pages(). -- cgit v1.2.3 From bd58782995a2e6a07fd07255f3cc319f40b131c9 Mon Sep 17 00:00:00 2001 From: Breno Leitao Date: Thu, 22 Jan 2026 02:39:36 -0800 Subject: vmcoreinfo: make hwerr_data visible for debugging If the kernel is compiled with LTO, the hwerr_data symbol might be lost, and vmcoreinfo doesn't have it dumped. This is currently seen in some production kernels with LTO enabled. Remove the static qualifier from hwerr_data so that the information is still preserved when the kernel is built with LTO. Making hwerr_data a global symbol ensures its debug info survives the LTO link process and appears in kallsyms. Also document it so it doesn't get removed in the future, as suggested by akpm.
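The linkage issue, sketched (illustrative only; VMCOREINFO_SYMBOL() is assumed here as the usual mechanism for recording a symbol in vmcoreinfo):

	static struct hwerr_info hwerr_data[HWERR_RECOV_MAX];
		/* LTO may localize or drop the symbol: no kallsyms entry,
		   nothing for vmcoreinfo consumers to resolve by name */

	struct hwerr_info hwerr_data[HWERR_RECOV_MAX];
		/* global: the name survives the LTO link and stays visible
		   to kallsyms and crash-dump tooling */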
Link: https://lkml.kernel.org/r/20260122-fix_vmcoreinfo-v2-1-2d6311f9e36c@debian.org Fixes: 3fa805c37dd4 ("vmcoreinfo: track and log recoverable hardware errors") Signed-off-by: Breno Leitao Acked-by: Baoquan He Cc: Dave Young Cc: "Luck, Tony" Cc: Omar Sandoval Cc: Shuai Xue Cc: Vivek Goyal Cc: Zhiquan Li Signed-off-by: Andrew Morton --- kernel/vmcore_info.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/kernel/vmcore_info.c b/kernel/vmcore_info.c index fe9bf8db1922..e2784038bbed 100644 --- a/kernel/vmcore_info.c +++ b/kernel/vmcore_info.c @@ -36,7 +36,11 @@ struct hwerr_info { time64_t timestamp; }; -static struct hwerr_info hwerr_data[HWERR_RECOV_MAX]; +/* + * The hwerr_data[] array is declared with global scope so that it remains + * accessible to vmcoreinfo even when Link Time Optimization (LTO) is enabled. + */ +struct hwerr_info hwerr_data[HWERR_RECOV_MAX]; Elf_Word *append_elf_note(Elf_Word *buf, char *name, unsigned int type, void *data, size_t data_len) -- cgit v1.2.3