From 83ec6eeb74a592e6568cb0723bac99fb8b3810b4 Mon Sep 17 00:00:00 2001 From: Ian Ray Date: Wed, 6 May 2026 09:33:35 +0300 Subject: MAINTAINERS: .mailmap: update after GEHC spin-off Update my email address from @ge.com to @gehealthcare.com after GE HealthCare was spun-off from GE. Link: https://lore.kernel.org/20260506063335.3-1-ian.ray@gehealthcare.com Signed-off-by: Ian Ray Reviewed-by: Luca Ceresoli Cc: Neil Armstrong Signed-off-by: Andrew Morton --- .mailmap | 1 + MAINTAINERS | 2 +- 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/.mailmap b/.mailmap index bd6a72f29d9c..de41cfdae5d0 100644 --- a/.mailmap +++ b/.mailmap @@ -339,6 +339,7 @@ Henrik Rydberg Herbert Xu Huacai Chen Huacai Chen +Ian Ray Ignat Korchagin Igor Korotin Ike Panhc diff --git a/MAINTAINERS b/MAINTAINERS index 10e8253181d3..9eb15dacb939 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -16504,7 +16504,7 @@ F: drivers/usb/mtu3/ MEGACHIPS STDPXXXX-GE-B850V3-FW LVDS/DP++ BRIDGES M: Peter Senna Tschudin -M: Ian Ray +M: Ian Ray M: Martyn Welch S: Maintained F: Documentation/devicetree/bindings/display/bridge/megachips-stdpxxxx-ge-b850v3-fw.txt -- cgit v1.2.3 From 83f9efcce93f8574be2279090ee2aec58b86cda7 Mon Sep 17 00:00:00 2001 From: Lorenzo Stoakes Date: Tue, 12 May 2026 17:06:43 +0100 Subject: Revert "mm/hugetlbfs: update hugetlbfs to use mmap_prepare" This reverts commit ea52cb24cd3f ("mm/hugetlbfs: update hugetlbfs to use mmap_prepare") with conflict resolution to account for changes in commit ea52cb24cd3f ("mm/hugetlbfs: update hugetlbfs to use mmap_prepare"). The patch incorrectly handled hugetlb VMA lock allocation at the mmap_prepare stage, where a failed allocation occurring after mmap_prepare is called might result in the lock leaking. There is no risk of a merge causing a similar issues, as VMA_DONTEXPAND_BIT is set for hugetlb mappings. As a first step in addressing this issue, simply revert the change so we can rework how we do this having corrected the underlying issues. We maintain the VMA flags changes as best we can, accounting for the fact that we were working with a VMA descriptor previously and propagating like-for-like changes for this. Note that we invoke vma_set_flags() and do not call vma_start_write() as vm_flags_set() does. This is OK as it's being done in an .mmap hook where the VMA is not yet linked into the tree so nobody else can be accessing it. Link: https://lore.kernel.org/20260512160643.266960-1-ljs@kernel.org Fixes: ea52cb24cd3f ("mm/hugetlbfs: update hugetlbfs to use mmap_prepare") Signed-off-by: Lorenzo Stoakes Reported-by: Mingyu Wang <25181214217@stu.xidian.edu.cn> Closes: https://lore.kernel.org/linux-mm/20260425070700.562229-1-25181214217@stu.xidian.edu.cn/ Acked-by: Muchun Song Acked-by: Oscar Salvador Cc: David Hildenbrand Cc: Liam R. Howlett Cc: Pedro Falcato Cc: Signed-off-by: Andrew Morton --- fs/hugetlbfs/inode.c | 46 ++++++++------------------- include/linux/hugetlb.h | 8 +---- include/linux/hugetlb_inline.h | 14 ++------- mm/hugetlb.c | 71 +++++++++++++++++------------------------- 4 files changed, 45 insertions(+), 94 deletions(-) diff --git a/fs/hugetlbfs/inode.c b/fs/hugetlbfs/inode.c index 8b05bec08e04..78d61bf2bd9b 100644 --- a/fs/hugetlbfs/inode.c +++ b/fs/hugetlbfs/inode.c @@ -96,15 +96,8 @@ static const struct fs_parameter_spec hugetlb_fs_parameters[] = { #define PGOFF_LOFFT_MAX \ (((1UL << (PAGE_SHIFT + 1)) - 1) << (BITS_PER_LONG - (PAGE_SHIFT + 1))) -static int hugetlb_file_mmap_prepare_success(const struct vm_area_struct *vma) +static int hugetlbfs_file_mmap(struct file *file, struct vm_area_struct *vma) { - /* Unfortunate we have to reassign vma->vm_private_data. */ - return hugetlb_vma_lock_alloc((struct vm_area_struct *)vma); -} - -static int hugetlbfs_file_mmap_prepare(struct vm_area_desc *desc) -{ - struct file *file = desc->file; struct inode *inode = file_inode(file); loff_t len, vma_len; int ret; @@ -119,8 +112,8 @@ static int hugetlbfs_file_mmap_prepare(struct vm_area_desc *desc) * way when do_mmap unwinds (may be important on powerpc * and ia64). */ - vma_desc_set_flags(desc, VMA_HUGETLB_BIT, VMA_DONTEXPAND_BIT); - desc->vm_ops = &hugetlb_vm_ops; + vma_set_flags(vma, VMA_HUGETLB_BIT, VMA_DONTEXPAND_BIT); + vma->vm_ops = &hugetlb_vm_ops; /* * page based offset in vm_pgoff could be sufficiently large to @@ -129,16 +122,16 @@ static int hugetlbfs_file_mmap_prepare(struct vm_area_desc *desc) * sizeof(unsigned long). So, only check in those instances. */ if (sizeof(unsigned long) == sizeof(loff_t)) { - if (desc->pgoff & PGOFF_LOFFT_MAX) + if (vma->vm_pgoff & PGOFF_LOFFT_MAX) return -EINVAL; } /* must be huge page aligned */ - if (desc->pgoff & (~huge_page_mask(h) >> PAGE_SHIFT)) + if (vma->vm_pgoff & (~huge_page_mask(h) >> PAGE_SHIFT)) return -EINVAL; - vma_len = (loff_t)vma_desc_size(desc); - len = vma_len + ((loff_t)desc->pgoff << PAGE_SHIFT); + vma_len = (loff_t)(vma->vm_end - vma->vm_start); + len = vma_len + ((loff_t)vma->vm_pgoff << PAGE_SHIFT); /* check for overflow */ if (len < vma_len) return -EINVAL; @@ -148,7 +141,7 @@ static int hugetlbfs_file_mmap_prepare(struct vm_area_desc *desc) ret = -ENOMEM; - vma_flags = desc->vma_flags; + vma_flags = vma->flags; /* * for SHM_HUGETLB, the pages are reserved in the shmget() call so skip * reserving here. Note: only for SHM hugetlbfs file, the inode @@ -158,30 +151,17 @@ static int hugetlbfs_file_mmap_prepare(struct vm_area_desc *desc) vma_flags_set(&vma_flags, VMA_NORESERVE_BIT); if (hugetlb_reserve_pages(inode, - desc->pgoff >> huge_page_order(h), - len >> huge_page_shift(h), desc, - vma_flags) < 0) + vma->vm_pgoff >> huge_page_order(h), + len >> huge_page_shift(h), vma, + vma_flags) < 0) goto out; ret = 0; - if (vma_desc_test(desc, VMA_WRITE_BIT) && inode->i_size < len) + if (vma_test(vma, VMA_WRITE_BIT) && inode->i_size < len) i_size_write(inode, len); out: inode_unlock(inode); - if (!ret) { - /* Allocate the VMA lock after we set it up. */ - desc->action.success_hook = hugetlb_file_mmap_prepare_success; - /* - * We cannot permit the rmap finding this VMA in the time - * between the VMA being inserted into the VMA tree and the - * completion/success hook being invoked. - * - * This is because we establish a per-VMA hugetlb lock which can - * be raced by rmap. - */ - desc->action.hide_from_rmap_until_complete = true; - } return ret; } @@ -1227,7 +1207,7 @@ static void init_once(void *foo) static const struct file_operations hugetlbfs_file_operations = { .read_iter = hugetlbfs_read_iter, - .mmap_prepare = hugetlbfs_file_mmap_prepare, + .mmap = hugetlbfs_file_mmap, .fsync = noop_fsync, .get_unmapped_area = hugetlb_get_unmapped_area, .llseek = default_llseek, diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h index 93418625d3c5..5957bc25efa8 100644 --- a/include/linux/hugetlb.h +++ b/include/linux/hugetlb.h @@ -148,7 +148,7 @@ int hugetlb_mfill_atomic_pte(pte_t *dst_pte, struct folio **foliop); #endif /* CONFIG_USERFAULTFD */ long hugetlb_reserve_pages(struct inode *inode, long from, long to, - struct vm_area_desc *desc, vma_flags_t vma_flags); + struct vm_area_struct *vma, vma_flags_t vma_flags); long hugetlb_unreserve_pages(struct inode *inode, long start, long end, long freed); bool folio_isolate_hugetlb(struct folio *folio, struct list_head *list); @@ -276,7 +276,6 @@ long hugetlb_change_protection(struct vm_area_struct *vma, void hugetlb_unshare_all_pmds(struct vm_area_struct *vma); void fixup_hugetlb_reservations(struct vm_area_struct *vma); void hugetlb_split(struct vm_area_struct *vma, unsigned long addr); -int hugetlb_vma_lock_alloc(struct vm_area_struct *vma); unsigned int arch_hugetlb_cma_order(void); @@ -469,11 +468,6 @@ static inline void fixup_hugetlb_reservations(struct vm_area_struct *vma) static inline void hugetlb_split(struct vm_area_struct *vma, unsigned long addr) {} -static inline int hugetlb_vma_lock_alloc(struct vm_area_struct *vma) -{ - return 0; -} - #endif /* !CONFIG_HUGETLB_PAGE */ #ifndef pgd_write diff --git a/include/linux/hugetlb_inline.h b/include/linux/hugetlb_inline.h index 565b473fd135..5c29cd3223a1 100644 --- a/include/linux/hugetlb_inline.h +++ b/include/linux/hugetlb_inline.h @@ -6,23 +6,13 @@ #ifdef CONFIG_HUGETLB_PAGE -static inline bool is_vm_hugetlb_flags(vm_flags_t vm_flags) -{ - return !!(vm_flags & VM_HUGETLB); -} - static inline bool is_vma_hugetlb_flags(const vma_flags_t *flags) { - return vma_flags_test_any(flags, VMA_HUGETLB_BIT); + return vma_flags_test(flags, VMA_HUGETLB_BIT); } #else -static inline bool is_vm_hugetlb_flags(vm_flags_t vm_flags) -{ - return false; -} - static inline bool is_vma_hugetlb_flags(const vma_flags_t *flags) { return false; @@ -32,7 +22,7 @@ static inline bool is_vma_hugetlb_flags(const vma_flags_t *flags) static inline bool is_vm_hugetlb_page(const struct vm_area_struct *vma) { - return is_vm_hugetlb_flags(vma->vm_flags); + return is_vma_hugetlb_flags(&vma->flags); } #endif diff --git a/mm/hugetlb.c b/mm/hugetlb.c index f24bf49be047..4b80b167cc9c 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -116,6 +116,7 @@ struct mutex *hugetlb_fault_mutex_table __ro_after_init; /* Forward declaration */ static int hugetlb_acct_memory(struct hstate *h, long delta); static void hugetlb_vma_lock_free(struct vm_area_struct *vma); +static void hugetlb_vma_lock_alloc(struct vm_area_struct *vma); static void __hugetlb_vma_unlock_write_free(struct vm_area_struct *vma); static void hugetlb_unshare_pmds(struct vm_area_struct *vma, unsigned long start, unsigned long end, bool take_locks); @@ -413,21 +414,17 @@ static void hugetlb_vma_lock_free(struct vm_area_struct *vma) } } -/* - * vma specific semaphore used for pmd sharing and fault/truncation - * synchronization - */ -int hugetlb_vma_lock_alloc(struct vm_area_struct *vma) +static void hugetlb_vma_lock_alloc(struct vm_area_struct *vma) { struct hugetlb_vma_lock *vma_lock; /* Only establish in (flags) sharable vmas */ if (!vma || !(vma->vm_flags & VM_MAYSHARE)) - return 0; + return; /* Should never get here with non-NULL vm_private_data */ if (vma->vm_private_data) - return -EINVAL; + return; vma_lock = kmalloc_obj(*vma_lock); if (!vma_lock) { @@ -442,15 +439,13 @@ int hugetlb_vma_lock_alloc(struct vm_area_struct *vma) * allocation failure. */ pr_warn_once("HugeTLB: unable to allocate vma specific lock\n"); - return -EINVAL; + return; } kref_init(&vma_lock->refs); init_rwsem(&vma_lock->rw_sema); vma_lock->vma = vma; vma->vm_private_data = vma_lock; - - return 0; } /* Helper that removes a struct file_region from the resv_map cache and returns @@ -1147,28 +1142,20 @@ static struct resv_map *vma_resv_map(struct vm_area_struct *vma) } } -static void set_vma_resv_flags(struct vm_area_struct *vma, unsigned long flags) +static void set_vma_resv_map(struct vm_area_struct *vma, struct resv_map *map) { VM_WARN_ON_ONCE_VMA(!is_vm_hugetlb_page(vma), vma); - VM_WARN_ON_ONCE_VMA(vma->vm_flags & VM_MAYSHARE, vma); + VM_WARN_ON_ONCE_VMA(vma_test(vma, VMA_MAYSHARE_BIT), vma); - set_vma_private_data(vma, get_vma_private_data(vma) | flags); + set_vma_private_data(vma, (unsigned long)map); } -static void set_vma_desc_resv_map(struct vm_area_desc *desc, struct resv_map *map) -{ - VM_WARN_ON_ONCE(!is_vma_hugetlb_flags(&desc->vma_flags)); - VM_WARN_ON_ONCE(vma_desc_test(desc, VMA_MAYSHARE_BIT)); - - desc->private_data = map; -} - -static void set_vma_desc_resv_flags(struct vm_area_desc *desc, unsigned long flags) +static void set_vma_resv_flags(struct vm_area_struct *vma, unsigned long flags) { - VM_WARN_ON_ONCE(!is_vma_hugetlb_flags(&desc->vma_flags)); - VM_WARN_ON_ONCE(vma_desc_test(desc, VMA_MAYSHARE_BIT)); + VM_WARN_ON_ONCE_VMA(!is_vm_hugetlb_page(vma), vma); + VM_WARN_ON_ONCE_VMA(vma_test(vma, VMA_MAYSHARE_BIT), vma); - desc->private_data = (void *)((unsigned long)desc->private_data | flags); + set_vma_private_data(vma, get_vma_private_data(vma) | flags); } static int is_vma_resv_set(struct vm_area_struct *vma, unsigned long flag) @@ -1178,13 +1165,6 @@ static int is_vma_resv_set(struct vm_area_struct *vma, unsigned long flag) return (get_vma_private_data(vma) & flag) != 0; } -static bool is_vma_desc_resv_set(struct vm_area_desc *desc, unsigned long flag) -{ - VM_WARN_ON_ONCE(!is_vma_hugetlb_flags(&desc->vma_flags)); - - return ((unsigned long)desc->private_data) & flag; -} - bool __vma_private_lock(struct vm_area_struct *vma) { return !(vma->vm_flags & VM_MAYSHARE) && @@ -6553,7 +6533,7 @@ next: long hugetlb_reserve_pages(struct inode *inode, long from, long to, - struct vm_area_desc *desc, + struct vm_area_struct *vma, vma_flags_t vma_flags) { long chg = -1, add = -1, spool_resv, gbl_resv; @@ -6570,6 +6550,12 @@ long hugetlb_reserve_pages(struct inode *inode, return -EINVAL; } + /* + * vma specific semaphore used for pmd sharing and fault/truncation + * synchronization + */ + hugetlb_vma_lock_alloc(vma); + /* * Only apply hugepage reservation if asked. At fault time, an * attempt will be made for VM_NORESERVE to allocate a page @@ -6582,9 +6568,9 @@ long hugetlb_reserve_pages(struct inode *inode, * Shared mappings base their reservation on the number of pages that * are already allocated on behalf of the file. Private mappings need * to reserve the full area even if read-only as mprotect() may be - * called to make the mapping read-write. Assume !desc is a shm mapping + * called to make the mapping read-write. Assume !vma is a shm mapping */ - if (!desc || vma_desc_test(desc, VMA_MAYSHARE_BIT)) { + if (!vma || vma_test(vma, VMA_MAYSHARE_BIT)) { /* * resv_map can not be NULL as hugetlb_reserve_pages is only * called for inodes for which resv_maps were created (see @@ -6603,8 +6589,8 @@ long hugetlb_reserve_pages(struct inode *inode, chg = to - from; - set_vma_desc_resv_map(desc, resv_map); - set_vma_desc_resv_flags(desc, HPAGE_RESV_OWNER); + set_vma_resv_map(vma, resv_map); + set_vma_resv_flags(vma, HPAGE_RESV_OWNER); } if (chg < 0) { @@ -6618,7 +6604,7 @@ long hugetlb_reserve_pages(struct inode *inode, if (err < 0) goto out_err; - if (desc && !vma_desc_test(desc, VMA_MAYSHARE_BIT) && h_cg) { + if (vma && !vma_test(vma, VMA_MAYSHARE_BIT) && h_cg) { /* For private mappings, the hugetlb_cgroup uncharge info hangs * of the resv_map. */ @@ -6655,7 +6641,7 @@ long hugetlb_reserve_pages(struct inode *inode, * consumed reservations are stored in the map. Hence, nothing * else has to be done for private mappings here */ - if (!desc || vma_desc_test(desc, VMA_MAYSHARE_BIT)) { + if (!vma || vma_test(vma, VMA_MAYSHARE_BIT)) { add = region_add(resv_map, from, to, regions_needed, h, h_cg); if (unlikely(add < 0)) { @@ -6719,15 +6705,16 @@ out_uncharge_cgroup: hugetlb_cgroup_uncharge_cgroup_rsvd(hstate_index(h), chg * pages_per_huge_page(h), h_cg); out_err: - if (!desc || vma_desc_test(desc, VMA_MAYSHARE_BIT)) + hugetlb_vma_lock_free(vma); + if (!vma || vma_test(vma, VMA_MAYSHARE_BIT)) /* Only call region_abort if the region_chg succeeded but the * region_add failed or didn't run. */ if (chg >= 0 && add < 0) region_abort(resv_map, from, to, regions_needed); - if (desc && is_vma_desc_resv_set(desc, HPAGE_RESV_OWNER)) { + if (vma && is_vma_resv_set(vma, HPAGE_RESV_OWNER)) { kref_put(&resv_map->refs, resv_map_release); - set_vma_desc_resv_map(desc, NULL); + set_vma_resv_map(vma, NULL); } return err; } -- cgit v1.2.3 From fa0b9b2b7ae3539908d69c2b9ac0d144d9bc5139 Mon Sep 17 00:00:00 2001 From: Linpu Yu Date: Sun, 10 May 2026 13:43:30 +0800 Subject: ipc: limit next_id allocation to the valid ID range The checkpoint/restore sysctl path can request the next SysV IPC id through ids->next_id. ipc_idr_alloc() currently forwards that request to idr_alloc() with an open-ended upper bound. If the valid tail of the SysV IPC id space is full, the allocation can spill beyond ipc_mni. The returned SysV IPC id still uses the normal index encoding, so later lookup and removal can target the wrong slot. This leaves the real IDR entry behind and breaks the IDR state for the object. The bug is in ipc_idr_alloc() in the checkpoint/restore path. 1. ids->next_id is passed to: idr_alloc(&ids->ipcs_idr, new, ipcid_to_idx(next_id), 0, ...) 2. The zero upper bound makes the allocation effectively open-ended. Once the valid SysV IPC tail is occupied, idr_alloc() can spill past ipc_mni and allocate an entry beyond the valid IPC id range. 3. The new object id is still encoded with the narrower SysV IPC index width: new->id = (new->seq << ipcmni_seq_shift()) + idx 4. Later removal goes through ipc_rmid(), which uses: ipcid_to_idx(ipcp->id) That truncates the real IDR index. An object actually stored at a high index can then be removed as if it lived at a low in-range index. 5. For shared memory, shm_destroy() frees the current object anyway, but the real high IDR slot is left behind as a dangling pointer. 6. A subsequent walk of /proc/sysvipc/shm reaches the stale IDR entry and dereferences freed memory. Prevent this by bounding the requested allocation to ipc_mni so the checkpoint/restore path fails once the valid range is exhausted. Link: https://lore.kernel.org/cover.1778336914.git.linpu5433@gmail.com Link: https://lore.kernel.org/2eebe949bfa7d1f6e13b5be6a92c64c850ce9d45.1778336914.git.linpu5433@gmail.com Fixes: 03f595668017 ("ipc: add sysctl to specify desired next object id") Signed-off-by: Linpu Yu Signed-off-by: Ren Wei Reported-by: Yuan Tan Reported-by: Yifan Wu Reported-by: Juefei Pu Reported-by: Xin Liu Cc: Kees Cook Cc: Stanislav Kinsbursky Cc: Davidlohr Bueso Cc: Signed-off-by: Andrew Morton --- ipc/util.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ipc/util.c b/ipc/util.c index 9eb89820594e..1737d776bc08 100644 --- a/ipc/util.c +++ b/ipc/util.c @@ -253,7 +253,7 @@ static inline int ipc_idr_alloc(struct ipc_ids *ids, struct kern_ipc_perm *new) } else { new->seq = ipcid_to_seqx(next_id); idx = idr_alloc(&ids->ipcs_idr, new, ipcid_to_idx(next_id), - 0, GFP_NOWAIT); + ipc_mni, GFP_NOWAIT); } if (idx >= 0) new->id = (new->seq << ipcmni_seq_shift()) + idx; -- cgit v1.2.3 From 3b041514cb6eae45869b020f743c14d983363222 Mon Sep 17 00:00:00 2001 From: "Pratyush Yadav (Google)" Date: Tue, 5 May 2026 15:39:20 +0200 Subject: memfd: deny writeable mappings when implying SEAL_WRITE When SEAL_EXEC is added, SEAL_WRITE is implied to make W^X. But the implied seal is set after the check that makes sure the memfd can not have any writable mappings. This means one can use SEAL_EXEC to apply SEAL_WRITE while having writeable mappings. This breaks the contract that SEAL_WRITE provides and can be used by an attacker to pass a memfd that appears to be write sealed but can still be modified arbitrarily. Fix this by adding the implied seals before the call for mapping_deny_writable() is done. Link: https://lore.kernel.org/20260505133922.797635-1-pratyush@kernel.org Fixes: c4f75bc8bd6b ("mm/memfd: add write seals when apply SEAL_EXEC to executable memfd") Signed-off-by: Pratyush Yadav (Google) Reviewed-by: Pasha Tatashin Acked-by: Jeff Xu Cc: Baolin Wang Cc: Brendan Jackman Cc: Greg Thelen Cc: Hugh Dickins Cc: Kees Cook Cc: "David Hildenbrand (Arm)" Cc: Signed-off-by: Andrew Morton --- mm/memfd.c | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/mm/memfd.c b/mm/memfd.c index fb425f4e315f..abe13b291ddc 100644 --- a/mm/memfd.c +++ b/mm/memfd.c @@ -283,6 +283,12 @@ int memfd_add_seals(struct file *file, unsigned int seals) goto unlock; } + /* + * SEAL_EXEC implies SEAL_WRITE, making W^X from the start. + */ + if (seals & F_SEAL_EXEC && inode->i_mode & 0111) + seals |= F_SEAL_SHRINK|F_SEAL_GROW|F_SEAL_WRITE|F_SEAL_FUTURE_WRITE; + if ((seals & F_SEAL_WRITE) && !(*file_seals & F_SEAL_WRITE)) { error = mapping_deny_writable(file->f_mapping); if (error) @@ -295,12 +301,6 @@ int memfd_add_seals(struct file *file, unsigned int seals) } } - /* - * SEAL_EXEC implies SEAL_WRITE, making W^X from the start. - */ - if (seals & F_SEAL_EXEC && inode->i_mode & 0111) - seals |= F_SEAL_SHRINK|F_SEAL_GROW|F_SEAL_WRITE|F_SEAL_FUTURE_WRITE; - *file_seals |= seals; error = 0; -- cgit v1.2.3 From bf62f69574b19720ae5fbbbcdf24a0c4e3e05e43 Mon Sep 17 00:00:00 2001 From: Richard Chang Date: Tue, 12 May 2026 07:49:18 +0000 Subject: zram: fix use-after-free in zram_writeback_endio A crash was observed in zram_writeback_endio due to a NULL pointer dereference in wake_up. The root cause is a race condition between the bio completion handler (zram_writeback_endio) and the writeback task. In zram_writeback_endio, wake_up() is called on &wb_ctl->done_wait after releasing wb_ctl->done_lock. This creates a race window where the writeback task can see num_inflight become 0, return, and free wb_ctl before zram_writeback_endio calls wake_up(). CPU 0 (zram_writeback_endio) CPU 1 (writeback_store) ============================ ============================ zram_writeback_slots zram_submit_wb_request zram_submit_wb_request wait_event(wb_ctl->done_wait) spin_lock(&wb_ctl->done_lock); list_add(&req->entry, &wb_ctl->done_reqs); spin_unlock(&wb_ctl->done_lock); wake_up(&wb_ctl->done_wait); zram_complete_done_reqs spin_lock(&wb_ctl->done_lock); list_add(&req->entry, &wb_ctl->done_reqs); spin_unlock(&wb_ctl->done_lock); while (num_inflight) > 0) spin_lock(&wb_ctl->done_lock); list_del(&req->entry); spin_unlock(&wb_ctl->done_lock); // num_inflight becomes 0 atomic_dec(num_inflight); // Leave zram_writeback_slots // Free wb_ctl release_wb_ctl(wb_ctl); // UAF crash! wake_up(&wb_ctl->done_wait); This patch fixes this race by using RCU. By protecting wb_ctl with rcu_read_lock() in zram_writeback_endio and using kfree_rcu() to free it, we ensure that wb_ctl remains valid during the execution of zram_writeback_endio. Link: https://lore.kernel.org/20260512074918.2606208-1-richardycc@google.com Fixes: f405066a1f0d ("zram: introduce writeback bio batching") Signed-off-by: Richard Chang Suggested-by: Sergey Senozhatsky Suggested-by: Minchan Kim Acked-by: Sergey Senozhatsky Acked-by: Minchan Kim Cc: Brian Geffon Cc: Jens Axboe Cc: Martin Liu Cc: wang wei Cc: Signed-off-by: Andrew Morton --- drivers/block/zram/zram_drv.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/drivers/block/zram/zram_drv.c b/drivers/block/zram/zram_drv.c index aebc710f0d6a..07111455eecf 100644 --- a/drivers/block/zram/zram_drv.c +++ b/drivers/block/zram/zram_drv.c @@ -33,6 +33,7 @@ #include #include #include +#include #include "zram_drv.h" @@ -504,6 +505,7 @@ struct zram_wb_ctl { wait_queue_head_t done_wait; spinlock_t done_lock; atomic_t num_inflight; + struct rcu_head rcu; }; struct zram_wb_req { @@ -847,7 +849,7 @@ static void release_wb_ctl(struct zram_wb_ctl *wb_ctl) release_wb_req(req); } - kfree(wb_ctl); + kfree_rcu(wb_ctl, rcu); } static struct zram_wb_ctl *init_wb_ctl(struct zram *zram) @@ -964,11 +966,13 @@ static void zram_writeback_endio(struct bio *bio) struct zram_wb_ctl *wb_ctl = bio->bi_private; unsigned long flags; + rcu_read_lock(); spin_lock_irqsave(&wb_ctl->done_lock, flags); list_add(&req->entry, &wb_ctl->done_reqs); spin_unlock_irqrestore(&wb_ctl->done_lock, flags); wake_up(&wb_ctl->done_wait); + rcu_read_unlock(); } static void zram_submit_wb_request(struct zram *zram, -- cgit v1.2.3 From 3f8968e9cbf95d5d87d32218906cab0b9b9eddbe Mon Sep 17 00:00:00 2001 From: Dev Jain Date: Mon, 18 May 2026 12:06:56 +0530 Subject: mm/rmap: initialize nr_pages to 1 at loop start in try_to_unmap_one Initialize nr_pages to 1 at the start of each loop iteration, like folio_referenced_one() does. Without this, nr_pages computed by a previous folio_unmap_pte_batch() call can be reused on a later iteration that does not run folio_unmap_pte_batch() again. mmap a 64K large folio with MAP_ANONYMOUS | MAP_DROPPABLE, then call madvise(MADV_FREE), then make the last page device-exclusive via HMM_DMIRROR_EXCLUSIVE. Trigger node reclaim through sysfs. Now, in try_to_unmap_one(), we will first clear the first 15 out of 16 entries mapping the lazyfree folio. This will set nr_pages to 15. In the next pvmw walk, this nr_pages gets reused on a device-exclusive pte, thus potentially corrupting folio refcount/mapcount. At the moment, I have a userspace program which can make the kernel spit out a trace, but the blow up is in folio_referenced_one(), because there are existing bugs in the interaction between device-private and rmap (which too I am investigating). I did a one liner kernel change to avoid going into folio_referenced_one(), and the kernel blows up at folio_remove_rmap_ptes in try_to_unmap_one which is what I wanted. Note that the bug is there not since file folio batching but lazyfree folio batching, since device-exclusive only works for anonymous folios. Userspace visible effect is simply kernel crashing somewhere due to refcount/mapcount corruption. Link: https://lore.kernel.org/20260518063656.3721056-1-dev.jain@arm.com Fixes: 354dffd29575 ("mm: support batched unmap for lazyfree large folios during reclamation") Signed-off-by: Dev Jain Acked-by: Barry Song Acked-by: David Hildenbrand (Arm) Reviewed-by: Lorenzo Stoakes Cc: Anshuman Khandual Cc: Barry Song Cc: Dev Jain Cc: Harry Yoo Cc: Jann Horn Cc: Liam R. Howlett Cc: Rik van Riel Cc: Ryan Roberts Cc: Vlastimil Babka Cc: Signed-off-by: Andrew Morton --- mm/rmap.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/mm/rmap.c b/mm/rmap.c index 78b7fb5f367c..99e1b3dc390b 100644 --- a/mm/rmap.c +++ b/mm/rmap.c @@ -2030,6 +2030,8 @@ static bool try_to_unmap_one(struct folio *folio, struct vm_area_struct *vma, mmu_notifier_invalidate_range_start(&range); while (page_vma_mapped_walk(&pvmw)) { + nr_pages = 1; + /* * If the folio is in an mlock()d vma, we must not swap it out. */ -- cgit v1.2.3 From 441f92f7d386b85bad16de49db95a307cba048a2 Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Mon, 18 May 2026 08:25:58 -0700 Subject: mm/damon/sysfs-schemes: delete tried region in regions_rmdirs() DAMON sysfs maintains the DAMOS tried region directory objects via a linked list. When the user requests refresh of the directories, DAMON sysfs removes all the region directories first, and then generate updated regions directory on the empty space. The removal function (damon_sysfs_scheme_regions_rm_dirs()) only puts the kobj objects. Deletion of the container region object from the linked list is done inside the kobj release callback function. If somehow the callback invocation is delayed, the list will contain regions list that gonna be freed. If the updated region directories creation is started in this situation, the list can be corrupted and use-after-free can happen. Because the kobj objects are managed by only DAMON sysfs, the issue cannot happen in normal situation. But, such delays can be made on kernels that built with CONFIG_DEBUG_KOBJECT_RELEASE. On the kernel, the issue can indeed be reproduced like below. # damo start --damos_action stat # cd /sys/kernel/mm/damon/admin/kdamonds/0/ # for i in {1..10}; do echo update_schemes_tried_regions > state; done # dmesg | grep underflow [ 89.296152] refcount_t: underflow; use-after-free. Fix the issue by removing the region object from the list when decrementing the reference count. Also update damos_sysfs_populate_region_dir() to add the region object to the list only after the kobject_init_and_add() is success, so that fail of kobject_init_and_add() is not leaving the deallocated object on the list. The issue was discovered [1] by Sashiko. Link: https://lore.kernel.org/20260518152559.93038-1-sj@kernel.org Link: https://lore.kernel.org/20260513011920.119183-1-sj@kernel.org [1] Fixes: 9277d0367ba1 ("mm/damon/sysfs-schemes: implement scheme region directory") Signed-off-by: SeongJae Park Cc: # 6.2.x Signed-off-by: Andrew Morton --- mm/damon/sysfs-schemes.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/mm/damon/sysfs-schemes.c b/mm/damon/sysfs-schemes.c index 04746cbb3327..a8014780edae 100644 --- a/mm/damon/sysfs-schemes.c +++ b/mm/damon/sysfs-schemes.c @@ -88,7 +88,6 @@ static void damon_sysfs_scheme_region_release(struct kobject *kobj) struct damon_sysfs_scheme_region *region = container_of(kobj, struct damon_sysfs_scheme_region, kobj); - list_del(®ion->list); kfree(region); } @@ -164,7 +163,7 @@ static void damon_sysfs_scheme_regions_rm_dirs( struct damon_sysfs_scheme_region *r, *next; list_for_each_entry_safe(r, next, ®ions->regions_list, list) { - /* release function deletes it from the list */ + list_del(&r->list); kobject_put(&r->kobj); regions->nr_regions--; } @@ -2928,14 +2927,15 @@ void damos_sysfs_populate_region_dir(struct damon_sysfs_schemes *sysfs_schemes, if (!region) return; region->sz_filter_passed = sz_filter_passed; - list_add_tail(®ion->list, &sysfs_regions->regions_list); - sysfs_regions->nr_regions++; if (kobject_init_and_add(®ion->kobj, &damon_sysfs_scheme_region_ktype, &sysfs_regions->kobj, "%d", sysfs_regions->nr_regions++)) { kobject_put(®ion->kobj); + return; } + list_add_tail(®ion->list, &sysfs_regions->regions_list); + sysfs_regions->nr_regions++; } int damon_sysfs_schemes_clear_regions( -- cgit v1.2.3 From e16f17a9c5af50221184d1ef4be4056bf3c4209e Mon Sep 17 00:00:00 2001 From: Alexandre Ghiti Date: Mon, 18 May 2026 10:28:19 +0200 Subject: mm: memcontrol: propagate NMI slab stats to memcg vmstats flush_nmi_stats() drains per-node NMI slab atomics into the per-node lruvec_stats, but does not propagate them to the memcg-level vmstats. For non NMI case, account_slab_nmi_safe() calls mod_memcg_lruvec_state() which updates both per-node lruvec_stats and memcg-level vmstats, so flush_nmi_stats() needs to flush to per-node lruvec_stats as well as memcg-level vmstats. So fix this by flushing to the memcg-level vmstats for NMI too. Link: https://lore.kernel.org/20260518082830.599102-1-alex@ghiti.fr Fixes: 940b01fc8dc1 ("memcg: nmi safe memcg stats for specific archs") Signed-off-by: Alexandre Ghiti Acked-by: Shakeel Butt Acked-by: Johannes Weiner Reviewed-by: Harry Yoo (Oracle) Cc: Michal Hocko Cc: Muchun Song Cc: Roman Gushchin Cc: Signed-off-by: Andrew Morton --- mm/memcontrol.c | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/mm/memcontrol.c b/mm/memcontrol.c index c03d4787d466..507be1cca392 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -4352,6 +4352,9 @@ static void flush_nmi_stats(struct mem_cgroup *memcg, struct mem_cgroup *parent, lstats->state[index] += slab; if (plstats) plstats->state_pending[index] += slab; + memcg->vmstats->state[index] += slab; + if (parent) + parent->vmstats->state_pending[index] += slab; } if (atomic_read(&pn->slab_unreclaimable)) { int slab = atomic_xchg(&pn->slab_unreclaimable, 0); @@ -4360,6 +4363,9 @@ static void flush_nmi_stats(struct mem_cgroup *memcg, struct mem_cgroup *parent, lstats->state[index] += slab; if (plstats) plstats->state_pending[index] += slab; + memcg->vmstats->state[index] += slab; + if (parent) + parent->vmstats->state_pending[index] += slab; } } } -- cgit v1.2.3 From 09e7827e785729f391c8d46dc71becce70d296ab Mon Sep 17 00:00:00 2001 From: Deepanshu Kartikey Date: Mon, 16 Mar 2026 20:49:56 +0530 Subject: kernel/fork: validate exit_signal in kernel_clone() When a child process exits, it sends exit_signal to its parent via do_notify_parent(). The clone() syscall constructs exit_signal as: (lower_32_bits(clone_flags) & CSIGNAL) CSIGNAL is 0xff, so values in the range 65-255 are possible. However, valid_signal() only accepts signals up to _NSIG (64 on x86_64). A non-zero non-valid exit_signal acts the same as exit_signal == 0: the parent process is not signaled when the child terminates. The syzkaller reproducer triggers this by calling clone() with flags=0x80, resulting in exit_signal = (0x80 & CSIGNAL) = 128, which exceeds _NSIG and is not a valid signal. The v1 of this patch added the check only in the clone() syscall handler, which is incomplete. kernel_clone() has other callers such as sys_ia32_clone() which would remain unprotected. Move the check to kernel_clone() to cover all callers. Since the valid_signal() check is now in kernel_clone() and covers all callers including clone3(), the same check in copy_clone_args_from_user() becomes redundant and is removed. The higher 32bits check for clone3() is kept as it is clone3() specific. Note that this is a user-visible change: previously, passing an invalid exit_signal to clone() was silently accepted. The man page for clone() does not document any defined behavior for invalid exit_signal values, so rejecting them with -EINVAL is the correct behavior. It is unlikely that any sane application relies on passing an invalid exit_signal. [oleg@redhat.com: the comment above kernel_clone() should be updated] Link: https://lore.kernel.org/abwvgU17W8wuW2-J@redhat.com Link: https://lore.kernel.org/20260316151956.563558-1-kartikey406@gmail.com Fixes: 3f2c788a1314 ("fork: prevent accidental access to clone3 features") Signed-off-by: Deepanshu Kartikey Signed-off-by: Oleg Nesterov Reported-by: syzbot+bbe6b99feefc3a0842de@syzkaller.appspotmail.com Closes: https://syzkaller.appspot.com/bug?extid=bbe6b99feefc3a0842de Tested-by: syzbot+bbe6b99feefc3a0842de@syzkaller.appspotmail.com Link: https://lore.kernel.org/all/20260307064202.353405-1-kartikey406@gmail.com/T/ [v1] Link: https://lore.kernel.org/all/20260316104536.558108-1-kartikey406@gmail.com/T/ [v2] Acked-by: Oleg Nesterov Acked-by: Michal Hocko Cc: Ben Segall Cc: Christian Brauner Cc: David Hildenbrand Cc: Dietmar Eggemann Cc: Ingo Molnar Cc: Juri Lelli Cc: Kees Cook Cc: Liam Howlett Cc: Lorenzo Stoakes (Oracle) Cc: Mel Gorman Cc: Mike Rapoport Cc: Peter Zijlstra Cc: Steven Rostedt Cc: Suren Baghdasaryan Cc: Valentin Schneider Cc: Vincent Guittot Cc: Vlastimil Babka Cc: Tetsuo Handa Signed-off-by: Andrew Morton --- kernel/fork.c | 11 +++++------ 1 file changed, 5 insertions(+), 6 deletions(-) diff --git a/kernel/fork.c b/kernel/fork.c index 5f3fdfdb14c7..8ac38beae360 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -2664,8 +2664,6 @@ struct task_struct *create_io_thread(int (*fn)(void *), void *arg, int node) * * It copies the process, and if successful kick-starts * it and waits for it to finish using the VM if required. - * - * args->exit_signal is expected to be checked for sanity by the caller. */ pid_t kernel_clone(struct kernel_clone_args *args) { @@ -2700,6 +2698,9 @@ pid_t kernel_clone(struct kernel_clone_args *args) (args->pidfd == args->parent_tid)) return -EINVAL; + if (!valid_signal(args->exit_signal)) + return -EINVAL; + /* * Determine whether and which event to report to ptracer. When * called from kernel_thread or CLONE_UNTRACED is explicitly @@ -2898,11 +2899,9 @@ static noinline int copy_clone_args_from_user(struct kernel_clone_args *kargs, return -EINVAL; /* - * Verify that higher 32bits of exit_signal are unset and that - * it is a valid signal + * Verify that higher 32bits of exit_signal are unset */ - if (unlikely((args.exit_signal & ~((u64)CSIGNAL)) || - !valid_signal(args.exit_signal))) + if (unlikely(args.exit_signal & ~((u64)CSIGNAL))) return -EINVAL; if ((args.flags & CLONE_INTO_CGROUP) && -- cgit v1.2.3 From 2c6f81d58741349298f51ff697d988cb42881453 Mon Sep 17 00:00:00 2001 From: Sunny Patel Date: Fri, 1 May 2026 17:21:16 +0530 Subject: mm/migrate_device: fix pgtable leak in migrate_vma_insert_huge_pmd_page When migrate_vma_insert_huge_pmd_page() jumps to unlock_abort due to a PMD check failure, the pgtable allocated earlier via pte_alloc_one() is never freed, causing a memory leak. Added free_abort label to release the pgtable in error path. Link: https://lore.kernel.org/20260501115122.23288-1-nueralspacetech@gmail.com Fixes: a30b48bf1b24 ("mm/migrate_device: implement THP migration of zone device pages") Signed-off-by: Sunny Patel Acked-by: David Hildenbrand (Arm) Reviewed-by: Huang Ying Cc: Alistair Popple Cc: Balbir Singh Cc: Byungchul Park Cc: Gregory Price Cc: Joshua Hahn Cc: Matthew Brost Cc: Rakie Kim Cc: Zi Yan Cc: Signed-off-by: Andrew Morton --- mm/migrate_device.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/mm/migrate_device.c b/mm/migrate_device.c index ab49d4dcdb60..19cd14b34114 100644 --- a/mm/migrate_device.c +++ b/mm/migrate_device.c @@ -840,7 +840,7 @@ static int migrate_vma_insert_huge_pmd_page(struct migrate_vma *migrate, } else { if (folio_is_zone_device(folio) && !folio_is_device_coherent(folio)) { - goto abort; + goto free_abort; } entry = folio_mk_pmd(folio, vma->vm_page_prot); if (vma->vm_flags & VM_WRITE) @@ -893,6 +893,8 @@ static int migrate_vma_insert_huge_pmd_page(struct migrate_vma *migrate, unlock_abort: spin_unlock(ptl); +free_abort: + pte_free(vma->vm_mm, pgtable); abort: for (i = 0; i < HPAGE_PMD_NR; i++) src[i] &= ~MIGRATE_PFN_MIGRATE; -- cgit v1.2.3 From f0af98ff6b3077278974a460becbd05bbc710e60 Mon Sep 17 00:00:00 2001 From: Eugen Hristev Date: Sat, 25 Apr 2026 16:06:48 +0300 Subject: MAINTAINERS, mailmap: change email for Eugen Hristev Replace old bouncing emails with ehristev@kernel.org Link: https://lore.kernel.org/20260425-eh-mailmap-v1-1-58788d401eef@kernel.org Signed-off-by: Eugen Hristev Signed-off-by: Andrew Morton --- .mailmap | 5 +++-- MAINTAINERS | 12 ++++++------ 2 files changed, 9 insertions(+), 8 deletions(-) diff --git a/.mailmap b/.mailmap index de41cfdae5d0..0b51bce3c05a 100644 --- a/.mailmap +++ b/.mailmap @@ -263,8 +263,9 @@ Enric Balletbo i Serra Enric Balletbo i Serra Erik Kaneda Ethan Carter Edwards Ethan Edwards -Eugen Hristev -Eugen Hristev +Eugen Hristev +Eugen Hristev +Eugen Hristev Evgeniy Polyakov Ezequiel Garcia Faith Ekstrand diff --git a/MAINTAINERS b/MAINTAINERS index 9eb15dacb939..8cf9ba51d981 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -10832,7 +10832,7 @@ F: include/linux/generic-radix-tree.h F: lib/generic-radix-tree.c GENERIC RESISTIVE TOUCHSCREEN ADC DRIVER -M: Eugen Hristev +M: Eugen Hristev L: linux-input@vger.kernel.org S: Maintained F: drivers/input/touchscreen/resistive-adc-touch.c @@ -17343,7 +17343,7 @@ F: Documentation/devicetree/bindings/sound/mikroe,mikroe-proto.txt F: sound/soc/atmel MICROCHIP CSI2DC DRIVER -M: Eugen Hristev +M: Eugen Hristev L: linux-media@vger.kernel.org S: Supported F: Documentation/devicetree/bindings/media/microchip,csi2dc.yaml @@ -17370,7 +17370,7 @@ F: drivers/i2c/busses/i2c-at91-*.c F: drivers/i2c/busses/i2c-at91.h MICROCHIP ISC DRIVER -M: Eugen Hristev +M: Eugen Hristev L: linux-media@vger.kernel.org S: Supported F: Documentation/devicetree/bindings/media/atmel,isc.yaml @@ -17382,7 +17382,7 @@ F: drivers/staging/media/deprecated/atmel/atmel-sama*-isc* F: include/linux/atmel-isc-media.h MICROCHIP ISI DRIVER -M: Eugen Hristev +M: Eugen Hristev L: linux-media@vger.kernel.org S: Supported F: drivers/media/platform/atmel/atmel-isi.c @@ -17572,7 +17572,7 @@ F: Documentation/devicetree/bindings/display/bridge/microchip,sam9x75-lvds.yaml F: drivers/gpu/drm/bridge/microchip-lvds.c MICROCHIP SAMA5D2-COMPATIBLE ADC DRIVER -M: Eugen Hristev +M: Eugen Hristev L: linux-iio@vger.kernel.org S: Supported F: Documentation/devicetree/bindings/iio/adc/atmel,sama5d2-adc.yaml @@ -24123,7 +24123,7 @@ F: drivers/mmc/host/sdhci* SECURE DIGITAL HOST CONTROLLER INTERFACE (SDHCI) MICROCHIP DRIVER M: Aubin Constans -R: Eugen Hristev +R: Eugen Hristev L: linux-mmc@vger.kernel.org S: Supported F: drivers/mmc/host/sdhci-of-at91.c -- cgit v1.2.3 From 04aa71da5f35aacdc9ae9cb5150947daa624f641 Mon Sep 17 00:00:00 2001 From: "Uladzislau Rezki (Sony)" Date: Fri, 15 May 2026 17:30:09 +0200 Subject: mm/vmalloc: do not trigger BUG() on BH disabled context __get_vm_area_node() currently triggers a BUG() if in_interrupt() returns true. However, in_interrupt() also reports true when BH are disabled. The bridge code can call rhashtable_lookup_insert_fast() with bottom halves disabled: __vlan_add() -> br_fdb_add_local() spin_lock_bh(&br->hash_lock); <-- Disable BH -> fdb_add_local() -> fdb_create() -> rhashtable_lookup_insert_fast() -> kvmalloc() -> vmalloc() -> __get_vm_area_node() -> BUG_ON(in_interrupt()) spin_unlock_bh(&br->hash_lock) this triggers the BUG() despite the caller not being in NMI or hard IRQ context. Replace the in_interrupt() check with in_nmi() || in_hardirq(). Link: https://lore.kernel.org/20260515153009.2296191-1-urezki@gmail.com Fixes: c6307674ed82 ("mm: kvmalloc: add non-blocking support for vmalloc") Signed-off-by: Uladzislau Rezki (Sony) Cc: Ido Schimmel Reported-by: syzbot+8b12fc6e0fb139765b58@syzkaller.appspotmail.com Closes: https://lore.kernel.org/all/69ff8c7c.050a0220.1036b8.000b.GAE@google.com/ Reviewed-by: Baoquan He Cc: Signed-off-by: Andrew Morton --- mm/vmalloc.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mm/vmalloc.c b/mm/vmalloc.c index c31a8615a832..bb6ae08d18f5 100644 --- a/mm/vmalloc.c +++ b/mm/vmalloc.c @@ -3203,7 +3203,7 @@ struct vm_struct *__get_vm_area_node(unsigned long size, struct vm_struct *area; unsigned long requested_size = size; - BUG_ON(in_interrupt()); + BUG_ON(in_nmi() || in_hardirq()); size = ALIGN(size, 1ul << shift); if (unlikely(!size)) return NULL; -- cgit v1.2.3 From 54cf41c969da6637cce790b7400da1451609db9b Mon Sep 17 00:00:00 2001 From: Byungchul Park Date: Fri, 15 May 2026 12:47:01 +0900 Subject: Revert "mm: introduce a new page type for page pool in page type" This reverts commit db359fccf212 ("mm: introduce a new page type for page pool in page type") and a part of 735a309b4bfb9e ("net: add net_iov_init() and use it to initialize ->page_type"). Netpp page_type'ed pages might be used in mapping so as to use @_mapcount. However, since @page_type and @_mapcount are union'ed in struct page, these two can't be used at the same time. Revert the commit introducing page_type for Netpp for now. The patch will be retried once @page_type and @_mapcount get allowed to be used at the same time. The revert also includes removal of @page_type initialization part introduced by commit 735a309b4bfb9e ("net: add net_iov_init() and use it to initialize ->page_type"), which will be restored on the retry. Link: https://lore.kernel.org/20260515034701.17027-1-byungchul@sk.com Fixes: db359fccf212 ("mm: introduce a new page type for page pool in page type") Signed-off-by: Byungchul Park Reported-by: Dragos Tatulea Closes: https://lore.kernel.org/all/982b9bc1-0a0a-4fc5-8e3a-3672db2b29a1@nvidia.com Acked-by: Jakub Kicinski Acked-by: David Hildenbrand (Arm) Acked-by: Harry Yoo (Oracle) Reviewed-by: Lorenzo Stoakes Cc: Alexei Starovoitov Cc: Baolin Wang Cc: Brendan Jackman Cc: David S. Miller Cc: Eric Dumazet Cc: Ilias Apalodimas Cc: Jesper Dangaard Brouer Cc: Johannes Weiner Cc: John Fastabend Cc: Leon Romanovsky Cc: Liam R. Howlett Cc: Mark Bloch Cc: Matthew Wilcox (Oracle) Cc: Michal Hocko Cc: Mike Rapoport Cc: Paolo Abeni Cc: Pavel Begunkov Cc: Saeed Mahameed Cc: Simon Horman Cc: Stanislav Fomichev Cc: Suren Baghdasaryan Cc: Tariq Toukan Cc: Toke Hoiland-Jorgensen Cc: Vlastimil Babka Cc: Zi Yan Signed-off-by: Andrew Morton --- drivers/net/ethernet/mellanox/mlx5/core/en/xdp.c | 2 +- include/linux/mm.h | 27 +++++++++++++++++++++--- include/linux/page-flags.h | 6 ------ include/net/netmem.h | 19 ++--------------- mm/page_alloc.c | 13 ++++-------- net/core/netmem_priv.h | 23 +++++++++++--------- net/core/page_pool.c | 24 ++------------------- 7 files changed, 46 insertions(+), 68 deletions(-) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en/xdp.c b/drivers/net/ethernet/mellanox/mlx5/core/en/xdp.c index 190b8b66b3ce..d3bab198c99c 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en/xdp.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/en/xdp.c @@ -708,7 +708,7 @@ static void mlx5e_free_xdpsq_desc(struct mlx5e_xdpsq *sq, xdpi = mlx5e_xdpi_fifo_pop(xdpi_fifo); page = xdpi.page.page; - /* No need to check PageNetpp() as we + /* No need to check page_pool_page_is_pp() as we * know this is a page_pool page. */ page_pool_recycle_direct(pp_page_to_nmdesc(page)->pp, diff --git a/include/linux/mm.h b/include/linux/mm.h index af23453e9dbd..06bbe9eba636 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -5174,9 +5174,10 @@ int arch_lock_shadow_stack_status(struct task_struct *t, unsigned long status); * DMA mapping IDs for page_pool * * When DMA-mapping a page, page_pool allocates an ID (from an xarray) and - * stashes it in the upper bits of page->pp_magic. Non-PP pages can have - * arbitrary kernel pointers stored in the same field as pp_magic (since - * it overlaps with page->lru.next), so we must ensure that we cannot + * stashes it in the upper bits of page->pp_magic. We always want to be able to + * unambiguously identify page pool pages (using page_pool_page_is_pp()). Non-PP + * pages can have arbitrary kernel pointers stored in the same field as pp_magic + * (since it overlaps with page->lru.next), so we must ensure that we cannot * mistake a valid kernel pointer with any of the values we write into this * field. * @@ -5211,6 +5212,26 @@ int arch_lock_shadow_stack_status(struct task_struct *t, unsigned long status); #define PP_DMA_INDEX_MASK GENMASK(PP_DMA_INDEX_BITS + PP_DMA_INDEX_SHIFT - 1, \ PP_DMA_INDEX_SHIFT) +/* Mask used for checking in page_pool_page_is_pp() below. page->pp_magic is + * OR'ed with PP_SIGNATURE after the allocation in order to preserve bit 0 for + * the head page of compound page and bit 1 for pfmemalloc page, as well as the + * bits used for the DMA index. page_is_pfmemalloc() is checked in + * __page_pool_put_page() to avoid recycling the pfmemalloc page. + */ +#define PP_MAGIC_MASK ~(PP_DMA_INDEX_MASK | 0x3UL) + +#ifdef CONFIG_PAGE_POOL +static inline bool page_pool_page_is_pp(const struct page *page) +{ + return (page->pp_magic & PP_MAGIC_MASK) == PP_SIGNATURE; +} +#else +static inline bool page_pool_page_is_pp(const struct page *page) +{ + return false; +} +#endif + #define PAGE_SNAPSHOT_FAITHFUL (1 << 0) #define PAGE_SNAPSHOT_PG_BUDDY (1 << 1) #define PAGE_SNAPSHOT_PG_IDLE (1 << 2) diff --git a/include/linux/page-flags.h b/include/linux/page-flags.h index 0e03d816e8b9..7223f6f4e2b4 100644 --- a/include/linux/page-flags.h +++ b/include/linux/page-flags.h @@ -923,7 +923,6 @@ enum pagetype { PGTY_zsmalloc = 0xf6, PGTY_unaccepted = 0xf7, PGTY_large_kmalloc = 0xf8, - PGTY_netpp = 0xf9, PGTY_mapcount_underflow = 0xff }; @@ -1056,11 +1055,6 @@ PAGE_TYPE_OPS(Zsmalloc, zsmalloc, zsmalloc) PAGE_TYPE_OPS(Unaccepted, unaccepted, unaccepted) PAGE_TYPE_OPS(LargeKmalloc, large_kmalloc, large_kmalloc) -/* - * Marks page_pool allocated pages. - */ -PAGE_TYPE_OPS(Netpp, netpp, netpp) - /** * PageHuge - Determine if the page belongs to hugetlbfs * @page: The page to test. diff --git a/include/net/netmem.h b/include/net/netmem.h index 78fe51e5756b..bccacd21b6c3 100644 --- a/include/net/netmem.h +++ b/include/net/netmem.h @@ -94,20 +94,10 @@ enum net_iov_type { */ struct net_iov { struct netmem_desc desc; - unsigned int page_type; enum net_iov_type type; struct net_iov_area *owner; }; -/* Make sure 'the offset of page_type in struct page == the offset of - * type in struct net_iov'. - */ -#define NET_IOV_ASSERT_OFFSET(pg, iov) \ - static_assert(offsetof(struct page, pg) == \ - offsetof(struct net_iov, iov)) -NET_IOV_ASSERT_OFFSET(page_type, page_type); -#undef NET_IOV_ASSERT_OFFSET - struct net_iov_area { /* Array of net_iovs for this area. */ struct net_iov *niovs; @@ -127,11 +117,7 @@ static inline unsigned int net_iov_idx(const struct net_iov *niov) return niov - net_iov_owner(niov)->niovs; } -/* Initialize a niov: stamp the owning area, the memory provider type, - * and the page_type "no type" sentinel expected by the page-type API - * (see PAGE_TYPE_OPS in ) so that - * page_pool_set_pp_info() can later call __SetPageNetpp() on a niov - * cast to struct page. +/* Initialize a niov: stamp the owning area, the memory provider type. */ static inline void net_iov_init(struct net_iov *niov, struct net_iov_area *owner, @@ -139,7 +125,6 @@ static inline void net_iov_init(struct net_iov *niov, { niov->owner = owner; niov->type = type; - niov->page_type = UINT_MAX; } /* netmem */ @@ -245,7 +230,7 @@ static inline unsigned long netmem_pfn_trace(netmem_ref netmem) */ #define pp_page_to_nmdesc(p) \ ({ \ - DEBUG_NET_WARN_ON_ONCE(!PageNetpp(p)); \ + DEBUG_NET_WARN_ON_ONCE(!page_pool_page_is_pp(p)); \ __pp_page_to_nmdesc(p); \ }) diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 23c7298d3be2..d49c254174da 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -1035,6 +1035,7 @@ static inline bool page_expected_state(struct page *page, #ifdef CONFIG_MEMCG page->memcg_data | #endif + page_pool_page_is_pp(page) | (page->flags.f & check_flags))) return false; @@ -1061,6 +1062,8 @@ static const char *page_bad_reason(struct page *page, unsigned long flags) if (unlikely(page->memcg_data)) bad_reason = "page still charged to cgroup"; #endif + if (unlikely(page_pool_page_is_pp(page))) + bad_reason = "page_pool leak"; return bad_reason; } @@ -1377,17 +1380,9 @@ __always_inline bool __free_pages_prepare(struct page *page, mod_mthp_stat(order, MTHP_STAT_NR_ANON, -1); folio->mapping = NULL; } - if (unlikely(page_has_type(page))) { - /* networking expects to clear its page type before releasing */ - if (is_check_pages_enabled()) { - if (unlikely(PageNetpp(page))) { - bad_page(page, "page_pool leak"); - return false; - } - } + if (unlikely(page_has_type(page))) /* Reset the page_type (which overlays _mapcount) */ page->page_type = UINT_MAX; - } if (is_check_pages_enabled()) { if (free_page_is_bad(page)) diff --git a/net/core/netmem_priv.h b/net/core/netmem_priv.h index 3e6fde8f1726..23175cb2bd86 100644 --- a/net/core/netmem_priv.h +++ b/net/core/netmem_priv.h @@ -8,18 +8,21 @@ static inline unsigned long netmem_get_pp_magic(netmem_ref netmem) return netmem_to_nmdesc(netmem)->pp_magic & ~PP_DMA_INDEX_MASK; } -static inline bool netmem_is_pp(netmem_ref netmem) +static inline void netmem_or_pp_magic(netmem_ref netmem, unsigned long pp_magic) +{ + netmem_to_nmdesc(netmem)->pp_magic |= pp_magic; +} + +static inline void netmem_clear_pp_magic(netmem_ref netmem) { - struct page *page; + WARN_ON_ONCE(netmem_to_nmdesc(netmem)->pp_magic & PP_DMA_INDEX_MASK); - /* XXX: Now that the offset of page_type is shared between - * struct page and net_iov, just cast the netmem to struct page - * unconditionally by clearing NET_IOV if any, no matter whether - * it comes from struct net_iov or struct page. This should be - * adjusted once the offset is no longer shared. - */ - page = (struct page *)((__force unsigned long)netmem & ~NET_IOV); - return PageNetpp(page); + netmem_to_nmdesc(netmem)->pp_magic = 0; +} + +static inline bool netmem_is_pp(netmem_ref netmem) +{ + return (netmem_get_pp_magic(netmem) & PP_MAGIC_MASK) == PP_SIGNATURE; } static inline void netmem_set_pp(netmem_ref netmem, struct page_pool *pool) diff --git a/net/core/page_pool.c b/net/core/page_pool.c index 6e576dec80db..8171d1173221 100644 --- a/net/core/page_pool.c +++ b/net/core/page_pool.c @@ -707,18 +707,8 @@ s32 page_pool_inflight(const struct page_pool *pool, bool strict) void page_pool_set_pp_info(struct page_pool *pool, netmem_ref netmem) { - struct page *page; - netmem_set_pp(netmem, pool); - - /* XXX: Now that the offset of page_type is shared between - * struct page and net_iov, just cast the netmem to struct page - * unconditionally by clearing NET_IOV if any, no matter whether - * it comes from struct net_iov or struct page. This should be - * adjusted once the offset is no longer shared. - */ - page = (struct page *)((__force unsigned long)netmem & ~NET_IOV); - __SetPageNetpp(page); + netmem_or_pp_magic(netmem, PP_SIGNATURE); /* Ensuring all pages have been split into one fragment initially: * page_pool_set_pp_info() is only called once for every page when it @@ -733,17 +723,7 @@ void page_pool_set_pp_info(struct page_pool *pool, netmem_ref netmem) void page_pool_clear_pp_info(netmem_ref netmem) { - struct page *page; - - /* XXX: Now that the offset of page_type is shared between - * struct page and net_iov, just cast the netmem to struct page - * unconditionally by clearing NET_IOV if any, no matter whether - * it comes from struct net_iov or struct page. This should be - * adjusted once the offset is no longer shared. - */ - page = (struct page *)((__force unsigned long)netmem & ~NET_IOV); - __ClearPageNetpp(page); - + netmem_clear_pp_magic(netmem); netmem_set_pp(netmem, NULL); } -- cgit v1.2.3