From 2ec41967189cd65a8f79c760dd1b50c4f56e8ac6 Mon Sep 17 00:00:00 2001 From: Ankit Agrawal Date: Sun, 2 Nov 2025 18:44:33 +0000 Subject: mm: handle poisoning of pfn without struct pages MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Poison (or ECC) errors can be very common on a large size cluster. The kernel MM currently does not handle ECC errors / poison on a memory region that is not backed by struct pages. If a memory region mapped using remap_pfn_range() for example, but not added to the kernel, MM will not have associated struct pages. Add a new mechanism to handle memory failure on such memory. Make kernel MM expose a function to allow modules managing the device memory to register the device memory SPA and the address space associated it. MM maintains this information as an interval tree. On poison, MM can search for the range that the poisoned PFN belong and use the address_space to determine the mapping VMA. In this implementation, kernel MM follows the following sequence that is largely similar to the memory_failure() handler for struct page backed memory: 1. memory_failure() is triggered on reception of a poison error. An absence of struct page is detected and consequently memory_failure_pfn() is executed. 2. memory_failure_pfn() collects the processes mapped to the PFN. 3. memory_failure_pfn() sends SIGBUS to all the processes mapping the faulty PFN using kill_procs(). Note that there is one primary difference versus the handling of the poison on struct pages, which is to skip unmapping to the faulty PFN. This is done to handle the huge PFNMAP support added recently [1] that enables VM_PFNMAP vmas to map at PMD or PUD level. A poison to a PFN mapped in such as way would need breaking the PMD/PUD mapping into PTEs that will get mirrored into the S2. This can greatly increase the cost of table walks and have a major performance impact. Link: https://lore.kernel.org/all/20240826204353.2228736-1-peterx@redhat.com/ [1] Link: https://lkml.kernel.org/r/20251102184434.2406-3-ankita@nvidia.com Signed-off-by: Ankit Agrawal Cc: Aniket Agashe Cc: Borislav Betkov Cc: David Hildenbrand Cc: Hanjun Guo Cc: Ira Weiny Cc: Jason Gunthorpe Cc: Joanthan Cameron Cc: Kevin Tian Cc: Kirti Wankhede Cc: Len Brown Cc: Liam Howlett Cc: Lorenzo Stoakes Cc: "Luck, Tony" Cc: Matthew R. Ochs Cc: Mauro Carvalho Chehab Cc: Miaohe Lin Cc: Michal Hocko Cc: Mike Rapoport Cc: Naoya Horiguchi Cc: Neo Jia Cc: Peter Zijlstra Cc: Shuai Xue Cc: Smita Koralahalli Channabasappa Cc: Suren Baghdasaryan Cc: Tarun Gupta Cc: Uwe Kleine-König Cc: Vikram Sethi Cc: Vlastimil Babka Cc: Zhi Wang Signed-off-by: Andrew Morton --- include/ras/ras_event.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/ras') diff --git a/include/ras/ras_event.h b/include/ras/ras_event.h index c8cd0f00c845..fecfeb7c8be7 100644 --- a/include/ras/ras_event.h +++ b/include/ras/ras_event.h @@ -375,6 +375,7 @@ TRACE_EVENT(aer_event, EM ( MF_MSG_DAX, "dax page" ) \ EM ( MF_MSG_UNSPLIT_THP, "unsplit thp" ) \ EM ( MF_MSG_ALREADY_POISONED, "already poisoned" ) \ + EM ( MF_MSG_PFN_MAP, "non struct page pfn" ) \ EMe ( MF_MSG_UNKNOWN, "unknown page" ) /* -- cgit v1.2.3 From 31807483d3952059d395c2a73b1fa9625db9b366 Mon Sep 17 00:00:00 2001 From: Xie Yuanbin Date: Wed, 19 Nov 2025 17:59:43 +0800 Subject: mm/memory-failure: remove the selection of RAS commit 97f0b13452198290799f ("tracing: add trace event for memory-failure") introduces the selection of RAS in memory-failure. This commit is just a tracing feature; in reality, there is no dependency between memory-failure and RAS. RAS increases the size of the bzImage image by 8k, which is very valuable for embedded devices. Move the memory-failure traceing code from ras_event.h to memory-failure.h and remove the selection of RAS. Link: https://lkml.kernel.org/r/20251119095943.67125-1-xieyuanbin1@huawei.com Signed-off-by: Xie Yuanbin Acked-by: David Hildenbrand (Red Hat) Acked-by: Miaohe Lin Cc: Borislav Petkov Signed-off-by: Andrew Morton --- include/ras/ras_event.h | 87 ------------------------------------------------- 1 file changed, 87 deletions(-) (limited to 'include/ras') diff --git a/include/ras/ras_event.h b/include/ras/ras_event.h index fecfeb7c8be7..1e5e87020eef 100644 --- a/include/ras/ras_event.h +++ b/include/ras/ras_event.h @@ -12,7 +12,6 @@ #include #include #include -#include /* * MCE Extended Error Log trace event @@ -339,92 +338,6 @@ TRACE_EVENT(aer_event, "Not available") ); #endif /* CONFIG_PCIEAER */ - -/* - * memory-failure recovery action result event - * - * unsigned long pfn - Page Frame Number of the corrupted page - * int type - Page types of the corrupted page - * int result - Result of recovery action - */ - -#ifdef CONFIG_MEMORY_FAILURE -#define MF_ACTION_RESULT \ - EM ( MF_IGNORED, "Ignored" ) \ - EM ( MF_FAILED, "Failed" ) \ - EM ( MF_DELAYED, "Delayed" ) \ - EMe ( MF_RECOVERED, "Recovered" ) - -#define MF_PAGE_TYPE \ - EM ( MF_MSG_KERNEL, "reserved kernel page" ) \ - EM ( MF_MSG_KERNEL_HIGH_ORDER, "high-order kernel page" ) \ - EM ( MF_MSG_HUGE, "huge page" ) \ - EM ( MF_MSG_FREE_HUGE, "free huge page" ) \ - EM ( MF_MSG_GET_HWPOISON, "get hwpoison page" ) \ - EM ( MF_MSG_UNMAP_FAILED, "unmapping failed page" ) \ - EM ( MF_MSG_DIRTY_SWAPCACHE, "dirty swapcache page" ) \ - EM ( MF_MSG_CLEAN_SWAPCACHE, "clean swapcache page" ) \ - EM ( MF_MSG_DIRTY_MLOCKED_LRU, "dirty mlocked LRU page" ) \ - EM ( MF_MSG_CLEAN_MLOCKED_LRU, "clean mlocked LRU page" ) \ - EM ( MF_MSG_DIRTY_UNEVICTABLE_LRU, "dirty unevictable LRU page" ) \ - EM ( MF_MSG_CLEAN_UNEVICTABLE_LRU, "clean unevictable LRU page" ) \ - EM ( MF_MSG_DIRTY_LRU, "dirty LRU page" ) \ - EM ( MF_MSG_CLEAN_LRU, "clean LRU page" ) \ - EM ( MF_MSG_TRUNCATED_LRU, "already truncated LRU page" ) \ - EM ( MF_MSG_BUDDY, "free buddy page" ) \ - EM ( MF_MSG_DAX, "dax page" ) \ - EM ( MF_MSG_UNSPLIT_THP, "unsplit thp" ) \ - EM ( MF_MSG_ALREADY_POISONED, "already poisoned" ) \ - EM ( MF_MSG_PFN_MAP, "non struct page pfn" ) \ - EMe ( MF_MSG_UNKNOWN, "unknown page" ) - -/* - * First define the enums in MM_ACTION_RESULT to be exported to userspace - * via TRACE_DEFINE_ENUM(). - */ -#undef EM -#undef EMe -#define EM(a, b) TRACE_DEFINE_ENUM(a); -#define EMe(a, b) TRACE_DEFINE_ENUM(a); - -MF_ACTION_RESULT -MF_PAGE_TYPE - -/* - * Now redefine the EM() and EMe() macros to map the enums to the strings - * that will be printed in the output. - */ -#undef EM -#undef EMe -#define EM(a, b) { a, b }, -#define EMe(a, b) { a, b } - -TRACE_EVENT(memory_failure_event, - TP_PROTO(unsigned long pfn, - int type, - int result), - - TP_ARGS(pfn, type, result), - - TP_STRUCT__entry( - __field(unsigned long, pfn) - __field(int, type) - __field(int, result) - ), - - TP_fast_assign( - __entry->pfn = pfn; - __entry->type = type; - __entry->result = result; - ), - - TP_printk("pfn %#lx: recovery action for %s: %s", - __entry->pfn, - __print_symbolic(__entry->type, MF_PAGE_TYPE), - __print_symbolic(__entry->result, MF_ACTION_RESULT) - ) -); -#endif /* CONFIG_MEMORY_FAILURE */ #endif /* _TRACE_HW_EVENT_MC_H */ /* This part must be outside protection */ -- cgit v1.2.3