From a766cfbbeb3a74397965a8fa2e9a402026d3e1d8 Mon Sep 17 00:00:00 2001 From: Song Liu Date: Thu, 12 Jun 2025 22:28:56 -0700 Subject: bpf: Mark dentry->d_inode as trusted_or_null LSM hooks such as security_path_mknod() and security_inode_rename() have access to a newly allocated negative dentry, which has a NULL d_inode. Therefore, it is necessary to do a NULL pointer check on d_inode. Also add selftests that check that the verifier enforces the NULL pointer check. Signed-off-by: Song Liu Reviewed-by: Matt Bobrowski Link: https://lore.kernel.org/r/20250613052857.1992233-1-song@kernel.org Signed-off-by: Alexei Starovoitov --- kernel/bpf/verifier.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index a7d6e0c5928b..169845710c7e 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -7027,8 +7027,7 @@ BTF_TYPE_SAFE_TRUSTED(struct file) { struct inode *f_inode; }; -BTF_TYPE_SAFE_TRUSTED(struct dentry) { - /* no negative dentry-s in places where bpf can see it */ +BTF_TYPE_SAFE_TRUSTED_OR_NULL(struct dentry) { struct inode *d_inode; }; @@ -7066,7 +7065,6 @@ static bool type_is_trusted(struct bpf_verifier_env *env, BTF_TYPE_EMIT(BTF_TYPE_SAFE_TRUSTED(struct bpf_iter__task)); BTF_TYPE_EMIT(BTF_TYPE_SAFE_TRUSTED(struct linux_binprm)); BTF_TYPE_EMIT(BTF_TYPE_SAFE_TRUSTED(struct file)); - BTF_TYPE_EMIT(BTF_TYPE_SAFE_TRUSTED(struct dentry)); return btf_nested_type_is_trusted(&env->log, reg, field_name, btf_id, "__safe_trusted"); } @@ -7076,6 +7074,7 @@ static bool type_is_trusted_or_null(struct bpf_verifier_env *env, const char *field_name, u32 btf_id) { BTF_TYPE_EMIT(BTF_TYPE_SAFE_TRUSTED_OR_NULL(struct socket)); + BTF_TYPE_EMIT(BTF_TYPE_SAFE_TRUSTED_OR_NULL(struct dentry)); return btf_nested_type_is_trusted(&env->log, reg, field_name, btf_id, "__safe_trusted_or_null"); -- cgit v1.2.3 From d4adf1c9ee7722545450608bcb095fb31512f0c6 Mon Sep 17 00:00:00 2001 From: Willem de Bruijn Date: Wed, 18 Jun 2025 17:57:40 -0400 Subject: bpf: Adjust free target to avoid global starvation of LRU map BPF_MAP_TYPE_LRU_HASH can recycle the most recent elements well before the map is full, due to percpu reservations and force shrink before neighbor stealing. Once a CPU is unable to borrow from the global map, it steals one element from a neighbor once, and from then on each time flushes that one element to the global list and immediately recycles it. Batch value LOCAL_FREE_TARGET (128) will exhaust a 10K element map with 79 CPUs. CPU 79 will observe this behavior even while its neighbors hold 78 * 127 + 1 * 15 == 9921 free elements (99%). CPUs need not be active concurrently. The issue can appear with affinity migration, e.g., irqbalance. Each CPU can reserve and then hold onto its 128 elements indefinitely. Avoid global list exhaustion by limiting aggregate percpu caches to half of the map size, by adjusting LOCAL_FREE_TARGET based on CPU count. This change has no effect on sufficiently large tables. Similar to LOCAL_NR_SCANS and lru->nr_scans, introduce a map variable lru->target_free. The extra field fits in a hole in struct bpf_lru. The cache line is already warm where it is read in the hot path. The field is only accessed with the lru lock held.
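To make the new sizing rule concrete, here is a standalone userspace sketch (illustrative only, not kernel code; LOCAL_FREE_TARGET mirrors the kernel's value of 128) that reproduces the 10K-element example above:

#include <stdio.h>

#define LOCAL_FREE_TARGET 128

static unsigned int clamp_uint(unsigned int v, unsigned int lo, unsigned int hi)
{
	return v < lo ? lo : (v > hi ? hi : v);
}

int main(void)
{
	unsigned int nr_elems = 10000;
	unsigned int cpus[] = { 4, 16, 79, 256 };

	for (unsigned int i = 0; i < 4; i++) {
		/* mirrors lru->target_free = clamp(...) in the patch */
		unsigned int t = clamp_uint((nr_elems / cpus[i]) / 2,
					    1, LOCAL_FREE_TARGET);
		printf("%3u CPUs -> target_free = %u\n", cpus[i], t);
	}
	return 0;
}

With 79 CPUs the per-CPU batch drops from 128 to 63, so all CPUs together can pin at most roughly half the map; with only a few CPUs the clamp leaves the historical value of 128 untouched.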
Tested-by: Anton Protopopov Signed-off-by: Willem de Bruijn Acked-by: Stanislav Fomichev Link: https://lore.kernel.org/r/20250618215803.3587312-1-willemdebruijn.kernel@gmail.com Signed-off-by: Alexei Starovoitov --- kernel/bpf/bpf_lru_list.c | 9 ++++++--- kernel/bpf/bpf_lru_list.h | 1 + 2 files changed, 7 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/bpf_lru_list.c b/kernel/bpf/bpf_lru_list.c index 3dabdd137d10..2d6e1c98d8ad 100644 --- a/kernel/bpf/bpf_lru_list.c +++ b/kernel/bpf/bpf_lru_list.c @@ -337,12 +337,12 @@ static void bpf_lru_list_pop_free_to_local(struct bpf_lru *lru, list) { __bpf_lru_node_move_to_free(l, node, local_free_list(loc_l), BPF_LRU_LOCAL_LIST_T_FREE); - if (++nfree == LOCAL_FREE_TARGET) + if (++nfree == lru->target_free) break; } - if (nfree < LOCAL_FREE_TARGET) - __bpf_lru_list_shrink(lru, l, LOCAL_FREE_TARGET - nfree, + if (nfree < lru->target_free) + __bpf_lru_list_shrink(lru, l, lru->target_free - nfree, local_free_list(loc_l), BPF_LRU_LOCAL_LIST_T_FREE); @@ -577,6 +577,9 @@ static void bpf_common_lru_populate(struct bpf_lru *lru, void *buf, list_add(&node->list, &l->lists[BPF_LRU_LIST_T_FREE]); buf += elem_size; } + + lru->target_free = clamp((nr_elems / num_possible_cpus()) / 2, + 1, LOCAL_FREE_TARGET); } static void bpf_percpu_lru_populate(struct bpf_lru *lru, void *buf, diff --git a/kernel/bpf/bpf_lru_list.h b/kernel/bpf/bpf_lru_list.h index cbd8d3720c2b..fe2661a58ea9 100644 --- a/kernel/bpf/bpf_lru_list.h +++ b/kernel/bpf/bpf_lru_list.h @@ -58,6 +58,7 @@ struct bpf_lru { del_from_htab_func del_from_htab; void *del_arg; unsigned int hash_offset; + unsigned int target_free; unsigned int nr_scans; bool percpu; }; -- cgit v1.2.3 From 12b9a2c05d1b474518b0f5fac4a50b7f93b16930 Mon Sep 17 00:00:00 2001 From: Pratyush Yadav Date: Thu, 5 Jun 2025 19:11:41 +0200 Subject: kho: initialize tail pages for higher order folios properly Currently, when restoring higher order folios, kho_restore_folio() only calls prep_compound_page() on all the pages. That is not enough to properly initialize the folios. The managed page count does not get updated, the reserved flag does not get dropped, and page count does not get initialized properly. Restoring a higher order folio with it results in the following BUG with CONFIG_DEBUG_VM when attempting to free the folio: BUG: Bad page state in process test pfn:104e2b page: refcount:1 mapcount:0 mapping:0000000000000000 index:0xffffffffffffffff pfn:0x104e2b flags: 0x2fffff80000000(node=0|zone=2|lastcpupid=0x1fffff) raw: 002fffff80000000 0000000000000000 00000000ffffffff 0000000000000000 raw: ffffffffffffffff 0000000000000000 00000001ffffffff 0000000000000000 page dumped because: nonzero _refcount [...] Call Trace: dump_stack_lvl+0x4b/0x70 bad_page.cold+0x97/0xb2 __free_frozen_pages+0x616/0x850 [...] Combine the path for 0-order and higher order folios, initialize the tail pages with a count of zero, and call adjust_managed_page_count() to account for all the pages instead of just missing them. In addition, since all the KHO-preserved pages get marked with MEMBLOCK_RSRV_NOINIT by deserialize_bitmap(), the reserved flag is not actually set (as can also be seen from the flags of the dumped page in the logs above). So drop the ClearPageReserved() calls. 
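As a sketch of the state the fixed helper is meant to produce, a hypothetical debug check (not part of the patch; it assumes page_ref_count() for reading raw refcounts) could look like:

/* after kho_restore_page(page, order): the head page holds the one
 * reference, tails sit at zero, and 1 << order pages were added to
 * the managed page count */
static void kho_check_restored(struct page *head, unsigned int order)
{
	WARN_ON(page_ref_count(head) != 1);
	WARN_ON(order && !PageHead(head));

	for (unsigned int i = 1; i < (1u << order); i++)
		WARN_ON(page_ref_count(head + i) != 0);
}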
[ptyadav@amazon.de: declare i in the loop instead of at the top] Link: https://lkml.kernel.org/r/20250613125916.39272-1-pratyush@kernel.org Link: https://lkml.kernel.org/r/20250605171143.76963-1-pratyush@kernel.org Fixes: fc33e4b44b27 ("kexec: enable KHO support for memory preservation") Signed-off-by: Pratyush Yadav Reviewed-by: Pasha Tatashin Reviewed-by: Mike Rapoport (Microsoft) Cc: Alexander Graf Cc: Baoquan He Cc: Changyuan Lyu Signed-off-by: Andrew Morton --- kernel/kexec_handover.c | 29 +++++++++++++++++------------ 1 file changed, 17 insertions(+), 12 deletions(-) (limited to 'kernel') diff --git a/kernel/kexec_handover.c b/kernel/kexec_handover.c index 69b953551677..5a21dbe17950 100644 --- a/kernel/kexec_handover.c +++ b/kernel/kexec_handover.c @@ -164,11 +164,21 @@ static int __kho_preserve_order(struct kho_mem_track *track, unsigned long pfn, } /* almost as free_reserved_page(), just don't free the page */ -static void kho_restore_page(struct page *page) +static void kho_restore_page(struct page *page, unsigned int order) { - ClearPageReserved(page); - init_page_count(page); - adjust_managed_page_count(page, 1); + unsigned int nr_pages = (1 << order); + + /* Head page gets refcount of 1. */ + set_page_count(page, 1); + + /* For higher order folios, tail pages get a page count of zero. */ + for (unsigned int i = 1; i < nr_pages; i++) + set_page_count(page + i, 0); + + if (order > 0) + prep_compound_page(page, order); + + adjust_managed_page_count(page, nr_pages); } /** @@ -186,15 +196,10 @@ struct folio *kho_restore_folio(phys_addr_t phys) return NULL; order = page->private; - if (order) { - if (order > MAX_PAGE_ORDER) - return NULL; - - prep_compound_page(page, order); - } else { - kho_restore_page(page); - } + if (order > MAX_PAGE_ORDER) + return NULL; + kho_restore_page(page, order); return page_folio(page); } EXPORT_SYMBOL_GPL(kho_restore_folio); -- cgit v1.2.3 From 7484e15dbb016d9d40f8c6e0475810212ae181db Mon Sep 17 00:00:00 2001 From: Al Viro Date: Tue, 17 Jun 2025 00:09:51 -0400 Subject: replace collect_mounts()/drop_collected_mounts() with a safer variant collect_mounts() has several problems - one can't iterate over the results directly, so it has to be done with callback passed to iterate_mounts(); it has an oopsable race with d_invalidate(); it creates temporary clones of mounts invisibly for sync umount (IOW, you can have non-lazy umount succeed leaving filesystem not mounted anywhere and yet still busy). A saner approach is to give caller an array of struct path that would pin every mount in a subtree, without cloning any mounts. * collect_mounts()/drop_collected_mounts()/iterate_mounts() is gone * collect_paths(where, preallocated, size) gives either ERR_PTR(-E...) or a pointer to array of struct path, one for each chunk of tree visible under 'where' (i.e. the first element is a copy of where, followed by (mount,root) for everything mounted under it - the same set collect_mounts() would give). Unlike collect_mounts(), the mounts are *not* cloned - we just get pinning references to the roots of subtrees in the caller's namespace. Array is terminated by {NULL, NULL} struct path. If it fits into preallocated array (on-stack, normally), that's where it goes; otherwise it's allocated by kmalloc_array(). Passing 0 as size means that 'preallocated' is ignored (and expected to be NULL). * drop_collected_paths(paths, preallocated) is given the array returned by an earlier call of collect_paths() and the preallocated array passed to that call. 
All mount/dentry references are dropped and array is kfree'd if it's not equal to 'preallocated'. * instead of iterate_mounts(), users should just iterate over array of struct path - nothing exotic is needed for that. Existing users (all in audit_tree.c) are converted. [folded a fix for braino reported by Venkat Rao Bagalkote ] Fixes: 80b5dce8c59b0 ("vfs: Add a function to lazily unmount all mounts from any dentry") Tested-by: Venkat Rao Bagalkote Signed-off-by: Al Viro --- kernel/audit_tree.c | 63 +++++++++++++++++++++++++++++------------------------ 1 file changed, 34 insertions(+), 29 deletions(-) (limited to 'kernel') diff --git a/kernel/audit_tree.c b/kernel/audit_tree.c index f2f38903b2fe..b0eae2a3c895 100644 --- a/kernel/audit_tree.c +++ b/kernel/audit_tree.c @@ -668,12 +668,6 @@ int audit_remove_tree_rule(struct audit_krule *rule) return 0; } -static int compare_root(struct vfsmount *mnt, void *arg) -{ - return inode_to_key(d_backing_inode(mnt->mnt_root)) == - (unsigned long)arg; -} - void audit_trim_trees(void) { struct list_head cursor; @@ -683,8 +677,9 @@ void audit_trim_trees(void) while (cursor.next != &tree_list) { struct audit_tree *tree; struct path path; - struct vfsmount *root_mnt; struct audit_node *node; + struct path *paths; + struct path array[16]; int err; tree = container_of(cursor.next, struct audit_tree, list); @@ -696,9 +691,9 @@ void audit_trim_trees(void) if (err) goto skip_it; - root_mnt = collect_mounts(&path); + paths = collect_paths(&path, array, 16); path_put(&path); - if (IS_ERR(root_mnt)) + if (IS_ERR(paths)) goto skip_it; spin_lock(&hash_lock); @@ -706,14 +701,17 @@ void audit_trim_trees(void) struct audit_chunk *chunk = find_chunk(node); /* this could be NULL if the watch is dying else where... */ node->index |= 1U<<31; - if (iterate_mounts(compare_root, - (void *)(chunk->key), - root_mnt)) - node->index &= ~(1U<<31); + for (struct path *p = paths; p->dentry; p++) { + struct inode *inode = p->dentry->d_inode; + if (inode_to_key(inode) == chunk->key) { + node->index &= ~(1U<<31); + break; + } + } } spin_unlock(&hash_lock); trim_marked(tree); - drop_collected_mounts(root_mnt); + drop_collected_paths(paths, array); skip_it: put_tree(tree); mutex_lock(&audit_filter_mutex); @@ -742,9 +740,14 @@ void audit_put_tree(struct audit_tree *tree) put_tree(tree); } -static int tag_mount(struct vfsmount *mnt, void *arg) +static int tag_mounts(struct path *paths, struct audit_tree *tree) { - return tag_chunk(d_backing_inode(mnt->mnt_root), arg); + for (struct path *p = paths; p->dentry; p++) { + int err = tag_chunk(p->dentry->d_inode, tree); + if (err) + return err; + } + return 0; } /* @@ -801,7 +804,8 @@ int audit_add_tree_rule(struct audit_krule *rule) { struct audit_tree *seed = rule->tree, *tree; struct path path; - struct vfsmount *mnt; + struct path array[16]; + struct path *paths; int err; rule->tree = NULL; @@ -828,16 +832,16 @@ int audit_add_tree_rule(struct audit_krule *rule) err = kern_path(tree->pathname, 0, &path); if (err) goto Err; - mnt = collect_mounts(&path); + paths = collect_paths(&path, array, 16); path_put(&path); - if (IS_ERR(mnt)) { - err = PTR_ERR(mnt); + if (IS_ERR(paths)) { + err = PTR_ERR(paths); goto Err; } get_tree(tree); - err = iterate_mounts(tag_mount, tree, mnt); - drop_collected_mounts(mnt); + err = tag_mounts(paths, tree); + drop_collected_paths(paths, array); if (!err) { struct audit_node *node; @@ -872,20 +876,21 @@ int audit_tag_tree(char *old, char *new) struct list_head cursor, barrier; int failed = 0; struct path path1, 
path2; - struct vfsmount *tagged; + struct path array[16]; + struct path *paths; int err; err = kern_path(new, 0, &path2); if (err) return err; - tagged = collect_mounts(&path2); + paths = collect_paths(&path2, array, 16); path_put(&path2); - if (IS_ERR(tagged)) - return PTR_ERR(tagged); + if (IS_ERR(paths)) + return PTR_ERR(paths); err = kern_path(old, 0, &path1); if (err) { - drop_collected_mounts(tagged); + drop_collected_paths(paths, array); return err; } @@ -914,7 +919,7 @@ int audit_tag_tree(char *old, char *new) continue; } - failed = iterate_mounts(tag_mount, tree, tagged); + failed = tag_mounts(paths, tree); if (failed) { put_tree(tree); mutex_lock(&audit_filter_mutex); @@ -955,7 +960,7 @@ int audit_tag_tree(char *old, char *new) list_del(&cursor); mutex_unlock(&audit_filter_mutex); path_put(&path1); - drop_collected_mounts(tagged); + drop_collected_paths(paths, array); return failed; } -- cgit v1.2.3 From 2eb7648558a7911f2208e8940cd22ca40e93cc76 Mon Sep 17 00:00:00 2001 From: Jerome Marchand Date: Thu, 19 Jun 2025 16:06:02 +0200 Subject: bpf: Specify access type of bpf_sysctl_get_name args The second argument of the bpf_sysctl_get_name() helper is a pointer to a buffer that is being written to. However, that isn't specified in the prototype. Until commit 37cce22dbd51a ("bpf: verifier: Refactor helper access type tracking"), all helper accesses were considered a possible write access by the verifier, so no big harm was done. However, since then, the verifier might make wrong assumptions about the content of that address, which might lead it to make faulty optimizations (such as removing code that was wrongly labeled dead). This is what happens in the test_sysctl selftest for the tests related to sysctl_get_name. Add the MEM_WRITE flag to the second argument of bpf_sysctl_get_name(). Signed-off-by: Jerome Marchand Acked-by: Yonghong Song Link: https://lore.kernel.org/r/20250619140603.148942-2-jmarchan@redhat.com Signed-off-by: Alexei Starovoitov --- kernel/bpf/cgroup.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/bpf/cgroup.c b/kernel/bpf/cgroup.c index 9122c39870bf..f4885514f007 100644 --- a/kernel/bpf/cgroup.c +++ b/kernel/bpf/cgroup.c @@ -2134,7 +2134,7 @@ static const struct bpf_func_proto bpf_sysctl_get_name_proto = { .gpl_only = false, .ret_type = RET_INTEGER, .arg1_type = ARG_PTR_TO_CTX, - .arg2_type = ARG_PTR_TO_MEM, + .arg2_type = ARG_PTR_TO_MEM | MEM_WRITE, .arg3_type = ARG_CONST_SIZE, .arg4_type = ARG_ANYTHING, }; -- cgit v1.2.3 From b6f5e748587063d4ad3294d07ca29886851d9cd7 Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Fri, 20 Jun 2025 13:21:22 +0200 Subject: crashdump: add CONFIG_KEYS dependency The dm_crypt code fails to build without CONFIG_KEYS: kernel/crash_dump_dm_crypt.c: In function 'restore_dm_crypt_keys_to_thread_keyring': kernel/crash_dump_dm_crypt.c:105:9: error: unknown type name 'key_ref_t'; did you mean 'key_ref_put'? There is a mix of 'select KEYS' and 'depends on KEYS' in Kconfig, so there is no single obvious solution here, but generally using 'depends on' makes more sense and is less likely to cause dependency loops.
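For context, the failure comes from key-management types that exist only under CONFIG_KEYS; a condensed sketch of the kind of code involved (simplified and partly assumed, not the literal contents of crash_dump_dm_crypt.c):

#include <linux/key.h>	/* key_ref_t, lookup_user_key(): CONFIG_KEYS only */

static int restore_one_dm_crypt_key(key_serial_t id)
{
	/* key_ref_t is undeclared without CONFIG_KEYS, hence the
	 * "unknown type name 'key_ref_t'" error quoted above */
	key_ref_t kref = lookup_user_key(id, 0, KEY_NEED_READ);

	if (IS_ERR(kref))
		return PTR_ERR(kref);
	key_ref_put(kref);
	return 0;
}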
Link: https://lkml.kernel.org/r/20250620112140.3396316-1-arnd@kernel.org Fixes: 62f17d9df692 ("crash_dump: retrieve dm crypt keys in kdump kernel") Signed-off-by: Arnd Bergmann Cc: Alexander Graf Cc: Baoquan He Cc: Coiby Xu Cc: Dave Vasilevsky Cc: Eric Biggers Cc: Michael Ellerman Signed-off-by: Andrew Morton --- kernel/Kconfig.kexec | 1 + 1 file changed, 1 insertion(+) (limited to 'kernel') diff --git a/kernel/Kconfig.kexec b/kernel/Kconfig.kexec index e64ce21f9a80..2ee603a98813 100644 --- a/kernel/Kconfig.kexec +++ b/kernel/Kconfig.kexec @@ -134,6 +134,7 @@ config CRASH_DM_CRYPT depends on KEXEC_FILE depends on CRASH_DUMP depends on DM_CRYPT + depends on KEYS help With this option enabled, user space can intereact with /sys/kernel/config/crash_dm_crypt_keys to make the dm crypt keys -- cgit v1.2.3 From 1476b218327b89bbb64c14619a2d34f0c320f2c3 Mon Sep 17 00:00:00 2001 From: Leo Yan Date: Wed, 25 Jun 2025 18:07:37 +0100 Subject: perf/aux: Fix pending disable flow when the AUX ring buffer overruns If an AUX event overruns, the event core layer intends to disable the event by setting the 'pending_disable' flag. Unfortunately, the event is not actually disabled afterwards. In commit: ca6c21327c6a ("perf: Fix missing SIGTRAPs") the 'pending_disable' flag was changed to a boolean. However, the AUX event code was not updated accordingly. The flag ends up holding a CPU number. If this number is zero, the flag is taken as false and the IRQ work is never triggered. Later, with commit: 2b84def990d3 ("perf: Split __perf_pending_irq() out of perf_pending_irq()") a new IRQ work 'pending_disable_irq' was introduced to handle event disabling. The AUX event path was not updated to kick off the work queue. To fix this bug, when an AUX ring buffer overrun is detected, call perf_event_disable_inatomic() to initiate the pending disable flow. Also update the outdated comment for setting the flag, to reflect the boolean values (0 or 1). 
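In code terms, the stale pattern and its replacement look roughly like this (condensed; the helper body is sketched from its post-ca6c21327c6a form):

/* old AUX overflow path: stores a CPU number in what is now a bool --
 * on CPU 0 it reads back as "false" and no IRQ work is ever queued */
event->pending_disable = smp_processor_id();

/* fixed path: the common helper sets the flag to 1 and queues the
 * dedicated IRQ work, so the disable actually happens */
void perf_event_disable_inatomic(struct perf_event *event)
{
	event->pending_disable = 1;
	irq_work_queue(&event->pending_disable_irq);
}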
Fixes: 2b84def990d3 ("perf: Split __perf_pending_irq() out of perf_pending_irq()") Fixes: ca6c21327c6a ("perf: Fix missing SIGTRAPs") Signed-off-by: Leo Yan Signed-off-by: Ingo Molnar Reviewed-by: James Clark Reviewed-by: Yeoreum Yun Cc: Adrian Hunter Cc: Alexander Shishkin Cc: Arnaldo Carvalho de Melo Cc: Ian Rogers Cc: Jiri Olsa Cc: Liang Kan Cc: Marco Elver Cc: Mark Rutland Cc: Namhyung Kim Cc: Peter Zijlstra Cc: Sebastian Andrzej Siewior Cc: linux-perf-users@vger.kernel.org Link: https://lore.kernel.org/r/20250625170737.2918295-1-leo.yan@arm.com --- kernel/events/core.c | 6 +++--- kernel/events/ring_buffer.c | 4 ++-- 2 files changed, 5 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/kernel/events/core.c b/kernel/events/core.c index 1f746469fda5..7281230044d0 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -7251,15 +7251,15 @@ static void __perf_pending_disable(struct perf_event *event) * CPU-A CPU-B * * perf_event_disable_inatomic() - * @pending_disable = CPU-A; + * @pending_disable = 1; * irq_work_queue(); * * sched-out - * @pending_disable = -1; + * @pending_disable = 0; * * sched-in * perf_event_disable_inatomic() - * @pending_disable = CPU-B; + * @pending_disable = 1; * irq_work_queue(); // FAILS * * irq_work_run() diff --git a/kernel/events/ring_buffer.c b/kernel/events/ring_buffer.c index d2aef87c7e9f..aa9a759e824f 100644 --- a/kernel/events/ring_buffer.c +++ b/kernel/events/ring_buffer.c @@ -441,7 +441,7 @@ void *perf_aux_output_begin(struct perf_output_handle *handle, * store that will be enabled on successful return */ if (!handle->size) { /* A, matches D */ - event->pending_disable = smp_processor_id(); + perf_event_disable_inatomic(handle->event); perf_output_wakeup(handle); WRITE_ONCE(rb->aux_nest, 0); goto err_put; @@ -526,7 +526,7 @@ void perf_aux_output_end(struct perf_output_handle *handle, unsigned long size) if (wakeup) { if (handle->aux_flags & PERF_AUX_FLAG_TRUNCATED) - handle->event->pending_disable = smp_processor_id(); + perf_event_disable_inatomic(handle->event); perf_output_wakeup(handle); } -- cgit v1.2.3 From 12ffc3b1513ebc1f11ae77d053948504a94a68a6 Mon Sep 17 00:00:00 2001 From: Mario Limonciello Date: Fri, 13 Jun 2025 16:43:44 -0500 Subject: PM: Restrict swap use to later in the suspend sequence Currently swap is restricted before drivers have had a chance to do their prepare() PM callbacks. Restricting swap this early means that if a driver needs to evict some content from memory into swap in its prepare() callback, it won't be able to. On AMD dGPUs this can lead to failed suspends under memory pressure situations as all VRAM must be evicted to system memory or swap. Move the swap restriction to right after all devices have had a chance to do the prepare() callback. If there is any problem with the sequence, restore swap in the appropriate dpm resume callbacks or error handling paths. Closes: https://github.com/ROCm/ROCK-Kernel-Driver/issues/174 Closes: https://gitlab.freedesktop.org/drm/amd/-/issues/2362 Signed-off-by: Mario Limonciello Tested-by: Nat Wittstock Tested-by: Lucian Langa Link: https://patch.msgid.link/20250613214413.4127087-1-superm1@kernel.org Signed-off-by: Rafael J.
Wysocki --- kernel/kexec_core.c | 1 + kernel/power/hibernate.c | 3 --- kernel/power/power.h | 5 ----- kernel/power/suspend.c | 3 +-- 4 files changed, 2 insertions(+), 10 deletions(-) (limited to 'kernel') diff --git a/kernel/kexec_core.c b/kernel/kexec_core.c index 9c59fa480b0b..3a9a9f240dbc 100644 --- a/kernel/kexec_core.c +++ b/kernel/kexec_core.c @@ -1136,6 +1136,7 @@ int kernel_kexec(void) Resume_devices: dpm_resume_end(PMSG_RESTORE); Resume_console: + pm_restore_gfp_mask(); console_resume_all(); thaw_processes(); Restore_console: diff --git a/kernel/power/hibernate.c b/kernel/power/hibernate.c index 519fb09de5e0..9216e3b91d3b 100644 --- a/kernel/power/hibernate.c +++ b/kernel/power/hibernate.c @@ -423,7 +423,6 @@ int hibernation_snapshot(int platform_mode) } console_suspend_all(); - pm_restrict_gfp_mask(); error = dpm_suspend(PMSG_FREEZE); @@ -559,7 +558,6 @@ int hibernation_restore(int platform_mode) pm_prepare_console(); console_suspend_all(); - pm_restrict_gfp_mask(); error = dpm_suspend_start(PMSG_QUIESCE); if (!error) { error = resume_target_kernel(platform_mode); @@ -571,7 +569,6 @@ int hibernation_restore(int platform_mode) BUG_ON(!error); } dpm_resume_end(PMSG_RECOVER); - pm_restore_gfp_mask(); console_resume_all(); pm_restore_console(); return error; diff --git a/kernel/power/power.h b/kernel/power/power.h index cb1d71562002..7ccd709af93f 100644 --- a/kernel/power/power.h +++ b/kernel/power/power.h @@ -239,11 +239,6 @@ static inline void suspend_test_finish(const char *label) {} /* kernel/power/main.c */ extern int pm_notifier_call_chain_robust(unsigned long val_up, unsigned long val_down); extern int pm_notifier_call_chain(unsigned long val); -void pm_restrict_gfp_mask(void); -void pm_restore_gfp_mask(void); -#else -static inline void pm_restrict_gfp_mask(void) {} -static inline void pm_restore_gfp_mask(void) {} #endif #ifdef CONFIG_HIGHMEM diff --git a/kernel/power/suspend.c b/kernel/power/suspend.c index 76b141b9aac0..bb608b68fb30 100644 --- a/kernel/power/suspend.c +++ b/kernel/power/suspend.c @@ -540,6 +540,7 @@ int suspend_devices_and_enter(suspend_state_t state) return error; Recover_platform: + pm_restore_gfp_mask(); platform_recover(state); goto Resume_devices; } @@ -606,9 +607,7 @@ static int enter_state(suspend_state_t state) trace_suspend_resume(TPS("suspend_enter"), state, false); pm_pr_dbg("Suspending system (%s)\n", mem_sleep_labels[state]); - pm_restrict_gfp_mask(); error = suspend_devices_and_enter(state); - pm_restore_gfp_mask(); Finish: events_check_enabled = false; -- cgit v1.2.3 From 6921d1e07cb5eddec830801087b419194fde0803 Mon Sep 17 00:00:00 2001 From: Edward Adam Davis Date: Tue, 24 Jun 2025 14:38:46 +0800 Subject: tracing: Fix filter logic error If the processing of the tr->events loop fails, the filter that has already been added to filter_head will be released twice, in free_filter_list(&head->rcu) and in __free_filter(filter). Add the filter to filter_head only after the filters of tr->events have been processed, to avoid triggering a use-after-free.
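Condensed, the tail of filter_free_subsystem_filters() now reads as below (abridged from the patch that follows); the point is that the filter is published on the delayed-free list only once nothing after it can fail:

list_for_each_entry(file, &tr->events, list) {
	/* ... detach each event's filter; this loop used to run with
	 * the filter already on filter_head ... */
}

item = kmalloc(sizeof(*item), GFP_KERNEL);
if (!item)
	goto free_now;		/* frees the filter exactly once */

item->filter = filter;
list_add_tail(&item->list, &head->list);
delay_free_filter(head);	/* sole owner from here on */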
Link: https://lore.kernel.org/tencent_4EF87A626D702F816CD0951CE956EC32CD0A@qq.com Fixes: a9d0aab5eb33 ("tracing: Fix regression of filter waiting a long time on RCU synchronization") Reported-by: syzbot+daba72c4af9915e9c894@syzkaller.appspotmail.com Closes: https://syzkaller.appspot.com/bug?extid=daba72c4af9915e9c894 Tested-by: syzbot+daba72c4af9915e9c894@syzkaller.appspotmail.com Acked-by: Masami Hiramatsu (Google) Signed-off-by: Edward Adam Davis Signed-off-by: Steven Rostedt (Google) --- kernel/trace/trace_events_filter.c | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace_events_filter.c b/kernel/trace/trace_events_filter.c index 08141f105c95..3885aadc434d 100644 --- a/kernel/trace/trace_events_filter.c +++ b/kernel/trace/trace_events_filter.c @@ -1436,13 +1436,6 @@ static void filter_free_subsystem_filters(struct trace_subsystem_dir *dir, INIT_LIST_HEAD(&head->list); - item = kmalloc(sizeof(*item), GFP_KERNEL); - if (!item) - goto free_now; - - item->filter = filter; - list_add_tail(&item->list, &head->list); - list_for_each_entry(file, &tr->events, list) { if (file->system != dir) continue; @@ -1454,6 +1447,13 @@ static void filter_free_subsystem_filters(struct trace_subsystem_dir *dir, event_clear_filter(file); } + item = kmalloc(sizeof(*item), GFP_KERNEL); + if (!item) + goto free_now; + + item->filter = filter; + list_add_tail(&item->list, &head->list); + delay_free_filter(head); return; free_now: -- cgit v1.2.3 From 7b4c5a37544ba22c6ebe72c0d4ea56c953459fa5 Mon Sep 17 00:00:00 2001 From: Luo Gengkun Date: Thu, 26 Jun 2025 13:54:03 +0000 Subject: perf/core: Fix the WARN_ON_ONCE is out of lock protected region Commit 3172fb986666 ("perf/core: Fix WARN in perf_cgroup_switch()") tried to fix a concurrency problem between perf_cgroup_switch() and perf_cgroup_event_disable(). But it did not move the WARN_ON_ONCE into the lock-protected region, so the warning can still be triggered. Fixes: 3172fb986666 ("perf/core: Fix WARN in perf_cgroup_switch()") Signed-off-by: Luo Gengkun Signed-off-by: Peter Zijlstra (Intel) Link: https://lkml.kernel.org/r/20250626135403.2454105-1-luogengkun@huaweicloud.com --- kernel/events/core.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/events/core.c b/kernel/events/core.c index 7281230044d0..bf2118c22126 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -951,8 +951,6 @@ static void perf_cgroup_switch(struct task_struct *task) if (READ_ONCE(cpuctx->cgrp) == NULL) return; - WARN_ON_ONCE(cpuctx->ctx.nr_cgroups == 0); - cgrp = perf_cgroup_from_task(task, NULL); if (READ_ONCE(cpuctx->cgrp) == cgrp) return; @@ -964,6 +962,8 @@ static void perf_cgroup_switch(struct task_struct *task) if (READ_ONCE(cpuctx->cgrp) == NULL) return; + WARN_ON_ONCE(cpuctx->ctx.nr_cgroups == 0); + perf_ctx_disable(&cpuctx->ctx, true); ctx_sched_out(&cpuctx->ctx, NULL, EVENT_ALL|EVENT_CGROUP); -- cgit v1.2.3 From 3ebb1b6522392f64902b4e96954e35927354aa27 Mon Sep 17 00:00:00 2001 From: Thomas Weißschuh Date: Thu, 26 Jun 2025 11:23:44 +0200 Subject: sched: Fix preemption string of preempt_dynamic_none Zero is a valid value for "preempt_dynamic_mode", namely "preempt_dynamic_none". Fix the off-by-one in preempt_model_str(), so that "preempt_dynamic_none" is correctly formatted as PREEMPT(none) instead of PREEMPT(undef).
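The root cause is visible from the mode enumeration (values as declared in kernel/sched/core.c, abridged):

enum {
	preempt_dynamic_undefined = -1,
	preempt_dynamic_none,		/* 0: valid, but "> 0" treated it as undef */
	preempt_dynamic_voluntary,	/* 1 */
	preempt_dynamic_full,		/* 2 */
};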
Fixes: 8bdc5daaa01e ("sched: Add a generic function to return the preemption string") Signed-off-by: Thomas Weißschuh Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Sebastian Andrzej Siewior Tested-by: Shrikanth Hegde Cc: stable@vger.kernel.org Link: https://lkml.kernel.org/r/20250626-preempt-str-none-v2-1-526213b70a89@linutronix.de --- kernel/sched/core.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 8988d38d46a3..cd80b66a960a 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -7663,7 +7663,7 @@ const char *preempt_model_str(void) if (IS_ENABLED(CONFIG_PREEMPT_DYNAMIC)) { seq_buf_printf(&s, "(%s)%s", - preempt_dynamic_mode > 0 ? + preempt_dynamic_mode >= 0 ? preempt_modes[preempt_dynamic_mode] : "undef", brace ? "}" : ""); return seq_buf_str(&s); -- cgit v1.2.3 From 009836b4fa52f92cba33618e773b1094affa8cd2 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Thu, 5 Jun 2025 12:00:09 +0200 Subject: sched/core: Fix migrate_swap() vs. hotplug On Mon, Jun 02, 2025 at 03:22:13PM +0800, Kuyo Chang wrote: > So, the potential race scenario is: > > CPU0 CPU1 > // doing migrate_swap(cpu0/cpu1) > stop_two_cpus() > ... > // doing _cpu_down() > sched_cpu_deactivate() > set_cpu_active(cpu, false); > balance_push_set(cpu, true); > cpu_stop_queue_two_works > __cpu_stop_queue_work(stopper1,...); > __cpu_stop_queue_work(stopper2,..); > stop_cpus_in_progress -> true > preempt_enable(); > ... > 1st balance_push > stop_one_cpu_nowait > cpu_stop_queue_work > __cpu_stop_queue_work > list_add_tail -> 1st add push_work > wake_up_q(&wakeq); -> "wakeq is empty. > This implies that the stopper is at wakeq@migrate_swap." > preempt_disable > wake_up_q(&wakeq); > wake_up_process // wakeup migrate/0 > try_to_wake_up > ttwu_queue > ttwu_queue_cond ->meet below case > if (cpu == smp_processor_id()) > return false; > ttwu_do_activate > //migrate/0 wakeup done > wake_up_process // wakeup migrate/1 > try_to_wake_up > ttwu_queue > ttwu_queue_cond > ttwu_queue_wakelist > __ttwu_queue_wakelist > __smp_call_single_queue > preempt_enable(); > > 2nd balance_push > stop_one_cpu_nowait > cpu_stop_queue_work > __cpu_stop_queue_work > list_add_tail -> 2nd add push_work, so the double list add is detected > ... > ... > cpu1 get ipi, do sched_ttwu_pending, wakeup migrate/1 > So this balance_push() is part of schedule(), and schedule() is supposed to switch to stopper task, but because of this race condition, stopper task is stuck in WAKING state and not actually visible to be picked. Therefore CPU1 can do another schedule() and end up doing another balance_push() even though the last one hasn't been done yet. This is a confluence of fail, where both wake_q and ttwu_wakelist can cause crucial wakeups to be delayed, resulting in the malfunction of balance_push. Since there is only a single stopper thread to be woken, the wake_q doesn't really add anything here, and can be removed in favour of direct wakeups of the stopper thread. Then add a clause to ttwu_queue_cond() to ensure the stopper threads are never queued / delayed. Of all 3 moving parts, the last addition was the balance_push() machinery, so pick that as the point the bug was introduced. 
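The subtlety being removed is that wake_q_add() deduplicates: a task already sitting on one wake_q cannot be added to another, so the second wakeup is silently dropped until the first queue is flushed. Sketched (not the literal kernel code):

/* migrate_swap() path, CPU-A */
wake_q_add(&swap_wakeq, stopper);	/* stopper marked as queued */

/* balance_push() path, CPU-B, racing */
wake_q_add(&push_wakeq, stopper);	/* silent no-op: already queued */
wake_up_q(&push_wakeq);			/* wakes nobody */

/* fix: a single stopper thread needs no queueing layer at all */
wake_up_process(stopper);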
Fixes: 2558aacff858 ("sched/hotplug: Ensure only per-cpu kthreads run during hotplug") Reported-by: Kuyo Chang Signed-off-by: Peter Zijlstra (Intel) Tested-by: Kuyo Chang Link: https://lkml.kernel.org/r/20250605100009.GO39944@noisy.programming.kicks-ass.net --- kernel/sched/core.c | 5 +++++ kernel/stop_machine.c | 20 ++++++++++---------- 2 files changed, 15 insertions(+), 10 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/core.c b/kernel/sched/core.c index cd80b66a960a..ec68fc686bd7 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -3943,6 +3943,11 @@ static inline bool ttwu_queue_cond(struct task_struct *p, int cpu) if (!scx_allow_ttwu_queue(p)) return false; +#ifdef CONFIG_SMP + if (p->sched_class == &stop_sched_class) + return false; +#endif + /* * Do not complicate things with the async wake_list while the CPU is * in hotplug state. diff --git a/kernel/stop_machine.c b/kernel/stop_machine.c index 5d2d0562115b..3fe6b0c99f3d 100644 --- a/kernel/stop_machine.c +++ b/kernel/stop_machine.c @@ -82,18 +82,15 @@ static void cpu_stop_signal_done(struct cpu_stop_done *done) } static void __cpu_stop_queue_work(struct cpu_stopper *stopper, - struct cpu_stop_work *work, - struct wake_q_head *wakeq) + struct cpu_stop_work *work) { list_add_tail(&work->list, &stopper->works); - wake_q_add(wakeq, stopper->thread); } /* queue @work to @stopper. if offline, @work is completed immediately */ static bool cpu_stop_queue_work(unsigned int cpu, struct cpu_stop_work *work) { struct cpu_stopper *stopper = &per_cpu(cpu_stopper, cpu); - DEFINE_WAKE_Q(wakeq); unsigned long flags; bool enabled; @@ -101,12 +98,13 @@ static bool cpu_stop_queue_work(unsigned int cpu, struct cpu_stop_work *work) raw_spin_lock_irqsave(&stopper->lock, flags); enabled = stopper->enabled; if (enabled) - __cpu_stop_queue_work(stopper, work, &wakeq); + __cpu_stop_queue_work(stopper, work); else if (work->done) cpu_stop_signal_done(work->done); raw_spin_unlock_irqrestore(&stopper->lock, flags); - wake_up_q(&wakeq); + if (enabled) + wake_up_process(stopper->thread); preempt_enable(); return enabled; @@ -264,7 +262,6 @@ static int cpu_stop_queue_two_works(int cpu1, struct cpu_stop_work *work1, { struct cpu_stopper *stopper1 = per_cpu_ptr(&cpu_stopper, cpu1); struct cpu_stopper *stopper2 = per_cpu_ptr(&cpu_stopper, cpu2); - DEFINE_WAKE_Q(wakeq); int err; retry: @@ -300,8 +297,8 @@ retry: } err = 0; - __cpu_stop_queue_work(stopper1, work1, &wakeq); - __cpu_stop_queue_work(stopper2, work2, &wakeq); + __cpu_stop_queue_work(stopper1, work1); + __cpu_stop_queue_work(stopper2, work2); unlock: raw_spin_unlock(&stopper2->lock); @@ -316,7 +313,10 @@ unlock: goto retry; } - wake_up_q(&wakeq); + if (!err) { + wake_up_process(stopper1->thread); + wake_up_process(stopper2->thread); + } preempt_enable(); return err; -- cgit v1.2.3 From ba677dbe77af5ffe6204e0f3f547f3ba059c6302 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Wed, 2 Jul 2025 18:21:44 +0200 Subject: perf: Revert to requiring CAP_SYS_ADMIN for uprobes Jann reports that uprobes can be used destructively when used in the middle of an instruction. The kernel only verifies there is a valid instruction at the requested offset, but due to variable instruction length cannot determine if this is an instruction as seen by the intended execution stream. Additionally, Mark Rutland notes that on architectures that mix data in the text segment (like arm64), a similar thing can be done if the data word is 'mistaken' for an instruction. As such, require CAP_SYS_ADMIN for uprobes.
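As an illustration of the variable-length-instruction problem on x86 (a constructed example, not taken from the report):

unsigned char text[] = { 0xb8, 0x01, 0x00, 0x00, 0x00 };

/* decoded from offset 0:  b8 01 00 00 00   mov  eax, 0x1   */
/* decoded from offset 1:  01 00            add  [rax], eax */
/*                         00 00            add  [rax], al  */

A uprobe requested at offset 1 still decodes as a valid instruction, so the kernel's check passes, yet the patched byte lands in the middle of the mov that the program actually executes.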
Fixes: c9e0924e5c2b ("perf/core: open access to probes for CAP_PERFMON privileged process") Reported-by: Jann Horn Signed-off-by: Peter Zijlstra (Intel) Link: https://lkml.kernel.org/r/CAG48ez1n4520sq0XrWYDHKiKxE_+WCfAK+qt9qkY4ZiBGmL-5g@mail.gmail.com --- kernel/events/core.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/events/core.c b/kernel/events/core.c index bf2118c22126..0db36b2b2448 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -11116,7 +11116,7 @@ static int perf_uprobe_event_init(struct perf_event *event) if (event->attr.type != perf_uprobe.type) return -ENOENT; - if (!perfmon_capable()) + if (!capable(CAP_SYS_ADMIN)) return -EACCES; /* -- cgit v1.2.3 From fc975cfb36393db1db517fbbe366e550bcdcff14 Mon Sep 17 00:00:00 2001 From: kuyo chang Date: Wed, 2 Jul 2025 10:12:25 +0800 Subject: sched/deadline: Fix dl_server runtime calculation formula In our testing with a 6.12 based kernel on a big.LITTLE system, we were seeing instances of RT tasks being blocked from running on the LITTLE CPUs for multiple seconds, apparently by the dl_server. This far exceeds the default configured 50ms per second runtime. This is due to the fair dl_server runtime calculation being scaled for frequency & capacity of the CPU. Consider the following case under a big.LITTLE architecture: Assume the runtime is 50,000,000 ns, and the frequency/capacity scale-invariance factors are defined as below: Frequency scale-invariance: 100 Capacity scale-invariance: 50 First, by frequency scale-invariance, the runtime is scaled to 50,000,000 * 100 >> 10 = 4,882,812. Then, by capacity scale-invariance, it is further scaled to 4,882,812 * 50 >> 10 = 238,418. So it ends up scaled to 238,418 ns. This smaller "accounted runtime" value is what ends up being subtracted against the fair server's runtime for the current period. Thus after 50ms of real time, we've only accounted ~238us against the fair server's runtime. This 209:1 ratio in this example means that on the smaller CPU the fair server is allowed to continue running, blocking RT tasks, for over 10 seconds before it exhausts its supposed 50ms of runtime. And on other hardware configurations it can be even worse. For the fair deadline_server, to prevent realtime tasks from being unexpectedly delayed, we really do want to use fixed time, and not scaled time, for smaller capacity/frequency CPUs. So remove the scaling from the fair server's accounting to fix this.
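The changelog arithmetic is easy to reproduce standalone (the two scale factors are the example's, not from real hardware; SCHED_CAPACITY_SHIFT is the kernel's value of 10):

#include <stdio.h>
#include <stdint.h>

#define SCHED_CAPACITY_SHIFT 10

int main(void)
{
	int64_t runtime = 50000000;		/* 50 ms in ns */
	int64_t freq_scale = 100, cpu_scale = 50;

	int64_t after_freq = (runtime * freq_scale) >> SCHED_CAPACITY_SHIFT;
	int64_t after_cap  = (after_freq * cpu_scale) >> SCHED_CAPACITY_SHIFT;

	/* prints 4882812 then 238418: only ~238 us is charged against
	 * the fair server's 50 ms budget per period, a ~209:1 ratio */
	printf("after freq:     %lld\n", (long long)after_freq);
	printf("after capacity: %lld\n", (long long)after_cap);
	return 0;
}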
Fixes: a110a81c52a9 ("sched/deadline: Deferrable dl server") Suggested-by: Peter Zijlstra Suggested-by: John Stultz Signed-off-by: kuyo chang Signed-off-by: Peter Zijlstra (Intel) Acked-by: Juri Lelli Acked-by: John Stultz Tested-by: John Stultz Link: https://lore.kernel.org/r/20250702021440.2594736-1-kuyo.chang@mediatek.com --- kernel/sched/deadline.c | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c index ad45a8fea245..89019a140826 100644 --- a/kernel/sched/deadline.c +++ b/kernel/sched/deadline.c @@ -1504,7 +1504,9 @@ static void update_curr_dl_se(struct rq *rq, struct sched_dl_entity *dl_se, s64 if (dl_entity_is_special(dl_se)) return; - scaled_delta_exec = dl_scaled_delta_exec(rq, dl_se, delta_exec); + scaled_delta_exec = delta_exec; + if (!dl_server(dl_se)) + scaled_delta_exec = dl_scaled_delta_exec(rq, dl_se, delta_exec); dl_se->runtime -= scaled_delta_exec; @@ -1611,7 +1613,7 @@ throttle: */ void dl_server_update_idle_time(struct rq *rq, struct task_struct *p) { - s64 delta_exec, scaled_delta_exec; + s64 delta_exec; if (!rq->fair_server.dl_defer) return; @@ -1624,9 +1626,7 @@ void dl_server_update_idle_time(struct rq *rq, struct task_struct *p) if (delta_exec < 0) return; - scaled_delta_exec = dl_scaled_delta_exec(rq, &rq->fair_server, delta_exec); - - rq->fair_server.runtime -= scaled_delta_exec; + rq->fair_server.runtime -= delta_exec; if (rq->fair_server.runtime < 0) { rq->fair_server.dl_defer_running = 0; -- cgit v1.2.3