summaryrefslogtreecommitdiff
path: root/kernel
diff options
context:
space:
mode:
authorTejun Heo <tj@kernel.org>2025-11-03 11:57:11 -1000
committerTejun Heo <tj@kernel.org>2025-11-03 11:57:26 -1000
commit587eb08a5fef911856c16c8bb444b2ab7f4f207f (patch)
treea5e73c037b5e6f5a6b56f613004cb82009eb5d48 /kernel
parent34423456443c5cf5ad5180155f7d40f96b836194 (diff)
parentd245698d727ab8f5420b3e28d1243f96a5234851 (diff)
sched_ext: Merge branch 'for-6.19' of git://git.kernel.org/pub/scm/linux/kernel/git/tj/cgroup into for-6.19
Pull cgroup/for-6.19 to receive: 16dad7801aad ("cgroup: Rename cgroup lifecycle hooks to cgroup_task_*()") 260fbcb92bbe ("cgroup: Move dying_tasks cleanup from cgroup_task_release() to cgroup_task_free()") d245698d727a ("cgroup: Defer task cgroup unlink until after the task is done switching out") These are needed for the sched_ext cgroup exit ordering fix. Signed-off-by: Tejun Heo <tj@kernel.org>
Diffstat (limited to 'kernel')
-rw-r--r--kernel/bpf/helpers.c25
-rw-r--r--kernel/bpf/liveness.c4
-rw-r--r--kernel/bpf/syscall.c15
-rw-r--r--kernel/cgroup/cgroup.c39
-rw-r--r--kernel/cgroup/cpuset-internal.h3
-rw-r--r--kernel/cgroup/cpuset.c41
-rw-r--r--kernel/events/core.c8
-rw-r--r--kernel/events/uprobes.c6
-rw-r--r--kernel/exit.c4
-rw-r--r--kernel/fork.c2
-rw-r--r--kernel/sched/autogroup.c4
-rw-r--r--kernel/sched/core.c4
-rw-r--r--kernel/sched/deadline.c3
-rw-r--r--kernel/sched/fair.c26
14 files changed, 100 insertions, 84 deletions
diff --git a/kernel/bpf/helpers.c b/kernel/bpf/helpers.c
index c9fab9a356df..8eb117c52817 100644
--- a/kernel/bpf/helpers.c
+++ b/kernel/bpf/helpers.c
@@ -1215,13 +1215,20 @@ static void bpf_wq_work(struct work_struct *work)
rcu_read_unlock_trace();
}
+static void bpf_async_cb_rcu_free(struct rcu_head *rcu)
+{
+ struct bpf_async_cb *cb = container_of(rcu, struct bpf_async_cb, rcu);
+
+ kfree_nolock(cb);
+}
+
static void bpf_wq_delete_work(struct work_struct *work)
{
struct bpf_work *w = container_of(work, struct bpf_work, delete_work);
cancel_work_sync(&w->work);
- kfree_rcu(w, cb.rcu);
+ call_rcu(&w->cb.rcu, bpf_async_cb_rcu_free);
}
static void bpf_timer_delete_work(struct work_struct *work)
@@ -1230,13 +1237,13 @@ static void bpf_timer_delete_work(struct work_struct *work)
/* Cancel the timer and wait for callback to complete if it was running.
* If hrtimer_cancel() can be safely called it's safe to call
- * kfree_rcu(t) right after for both preallocated and non-preallocated
+ * call_rcu() right after for both preallocated and non-preallocated
* maps. The async->cb = NULL was already done and no code path can see
* address 't' anymore. Timer if armed for existing bpf_hrtimer before
* bpf_timer_cancel_and_free will have been cancelled.
*/
hrtimer_cancel(&t->timer);
- kfree_rcu(t, cb.rcu);
+ call_rcu(&t->cb.rcu, bpf_async_cb_rcu_free);
}
static int __bpf_async_init(struct bpf_async_kern *async, struct bpf_map *map, u64 flags,
@@ -1270,11 +1277,7 @@ static int __bpf_async_init(struct bpf_async_kern *async, struct bpf_map *map, u
goto out;
}
- /* Allocate via bpf_map_kmalloc_node() for memcg accounting. Until
- * kmalloc_nolock() is available, avoid locking issues by using
- * __GFP_HIGH (GFP_ATOMIC & ~__GFP_RECLAIM).
- */
- cb = bpf_map_kmalloc_node(map, size, __GFP_HIGH, map->numa_node);
+ cb = bpf_map_kmalloc_nolock(map, size, 0, map->numa_node);
if (!cb) {
ret = -ENOMEM;
goto out;
@@ -1315,7 +1318,7 @@ static int __bpf_async_init(struct bpf_async_kern *async, struct bpf_map *map, u
* or pinned in bpffs.
*/
WRITE_ONCE(async->cb, NULL);
- kfree(cb);
+ kfree_nolock(cb);
ret = -EPERM;
}
out:
@@ -1580,7 +1583,7 @@ void bpf_timer_cancel_and_free(void *val)
* timer _before_ calling us, such that failing to cancel it here will
* cause it to possibly use struct hrtimer after freeing bpf_hrtimer.
* Therefore, we _need_ to cancel any outstanding timers before we do
- * kfree_rcu, even though no more timers can be armed.
+ * call_rcu, even though no more timers can be armed.
*
* Moreover, we need to schedule work even if timer does not belong to
* the calling callback_fn, as on two different CPUs, we can end up in a
@@ -1607,7 +1610,7 @@ void bpf_timer_cancel_and_free(void *val)
* completion.
*/
if (hrtimer_try_to_cancel(&t->timer) >= 0)
- kfree_rcu(t, cb.rcu);
+ call_rcu(&t->cb.rcu, bpf_async_cb_rcu_free);
else
queue_work(system_dfl_wq, &t->cb.delete_work);
} else {
diff --git a/kernel/bpf/liveness.c b/kernel/bpf/liveness.c
index 3c611aba7f52..1e6538f59a78 100644
--- a/kernel/bpf/liveness.c
+++ b/kernel/bpf/liveness.c
@@ -195,8 +195,10 @@ static struct func_instance *__lookup_instance(struct bpf_verifier_env *env,
return ERR_PTR(-ENOMEM);
result->must_write_set = kvcalloc(subprog_sz, sizeof(*result->must_write_set),
GFP_KERNEL_ACCOUNT);
- if (!result->must_write_set)
+ if (!result->must_write_set) {
+ kvfree(result);
return ERR_PTR(-ENOMEM);
+ }
memcpy(&result->callchain, callchain, sizeof(*callchain));
result->insn_cnt = subprog_sz;
hash_add(liveness->func_instances, &result->hl_node, key);
diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
index 2a9456a3e730..8a129746bd6c 100644
--- a/kernel/bpf/syscall.c
+++ b/kernel/bpf/syscall.c
@@ -520,6 +520,21 @@ void *bpf_map_kmalloc_node(const struct bpf_map *map, size_t size, gfp_t flags,
return ptr;
}
+void *bpf_map_kmalloc_nolock(const struct bpf_map *map, size_t size, gfp_t flags,
+ int node)
+{
+ struct mem_cgroup *memcg, *old_memcg;
+ void *ptr;
+
+ memcg = bpf_map_get_memcg(map);
+ old_memcg = set_active_memcg(memcg);
+ ptr = kmalloc_nolock(size, flags | __GFP_ACCOUNT, node);
+ set_active_memcg(old_memcg);
+ mem_cgroup_put(memcg);
+
+ return ptr;
+}
+
void *bpf_map_kzalloc(const struct bpf_map *map, size_t size, gfp_t flags)
{
struct mem_cgroup *memcg, *old_memcg;
diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c
index 6ae5f48cf64e..aae180d56c8c 100644
--- a/kernel/cgroup/cgroup.c
+++ b/kernel/cgroup/cgroup.c
@@ -944,7 +944,8 @@ static void css_set_move_task(struct task_struct *task,
/*
* We are synchronized through cgroup_threadgroup_rwsem
* against PF_EXITING setting such that we can't race
- * against cgroup_exit()/cgroup_free() dropping the css_set.
+ * against cgroup_task_dead()/cgroup_task_free() dropping
+ * the css_set.
*/
WARN_ON_ONCE(task->flags & PF_EXITING);
@@ -6972,19 +6973,29 @@ void cgroup_post_fork(struct task_struct *child,
}
/**
- * cgroup_exit - detach cgroup from exiting task
+ * cgroup_task_exit - detach cgroup from exiting task
* @tsk: pointer to task_struct of exiting process
*
* Description: Detach cgroup from @tsk.
*
*/
-void cgroup_exit(struct task_struct *tsk)
+void cgroup_task_exit(struct task_struct *tsk)
{
struct cgroup_subsys *ss;
- struct css_set *cset;
int i;
- spin_lock_irq(&css_set_lock);
+ /* see cgroup_post_fork() for details */
+ do_each_subsys_mask(ss, i, have_exit_callback) {
+ ss->exit(tsk);
+ } while_each_subsys_mask();
+}
+
+void cgroup_task_dead(struct task_struct *tsk)
+{
+ struct css_set *cset;
+ unsigned long flags;
+
+ spin_lock_irqsave(&css_set_lock, flags);
WARN_ON_ONCE(list_empty(&tsk->cg_list));
cset = task_css_set(tsk);
@@ -7002,15 +7013,10 @@ void cgroup_exit(struct task_struct *tsk)
test_bit(CGRP_FREEZE, &task_dfl_cgroup(tsk)->flags)))
cgroup_update_frozen(task_dfl_cgroup(tsk));
- spin_unlock_irq(&css_set_lock);
-
- /* see cgroup_post_fork() for details */
- do_each_subsys_mask(ss, i, have_exit_callback) {
- ss->exit(tsk);
- } while_each_subsys_mask();
+ spin_unlock_irqrestore(&css_set_lock, flags);
}
-void cgroup_release(struct task_struct *task)
+void cgroup_task_release(struct task_struct *task)
{
struct cgroup_subsys *ss;
int ssid;
@@ -7018,6 +7024,11 @@ void cgroup_release(struct task_struct *task)
do_each_subsys_mask(ss, ssid, have_release_callback) {
ss->release(task);
} while_each_subsys_mask();
+}
+
+void cgroup_task_free(struct task_struct *task)
+{
+ struct css_set *cset = task_css_set(task);
if (!list_empty(&task->cg_list)) {
spin_lock_irq(&css_set_lock);
@@ -7025,11 +7036,7 @@ void cgroup_release(struct task_struct *task)
list_del_init(&task->cg_list);
spin_unlock_irq(&css_set_lock);
}
-}
-void cgroup_free(struct task_struct *task)
-{
- struct css_set *cset = task_css_set(task);
put_css_set(cset);
}
diff --git a/kernel/cgroup/cpuset-internal.h b/kernel/cgroup/cpuset-internal.h
index 337608f408ce..5cac42c5fd97 100644
--- a/kernel/cgroup/cpuset-internal.h
+++ b/kernel/cgroup/cpuset-internal.h
@@ -155,9 +155,6 @@ struct cpuset {
/* for custom sched domain */
int relax_domain_level;
- /* number of valid local child partitions */
- int nr_subparts;
-
/* partition root state */
int partition_root_state;
diff --git a/kernel/cgroup/cpuset.c b/kernel/cgroup/cpuset.c
index 185e820cd1df..96104710a649 100644
--- a/kernel/cgroup/cpuset.c
+++ b/kernel/cgroup/cpuset.c
@@ -358,8 +358,13 @@ static inline bool is_in_v2_mode(void)
* @excluded_child: a child cpuset to be excluded in task checking
* Return: true if there are tasks, false otherwise
*
- * It is assumed that @cs is a valid partition root. @excluded_child should
- * be non-NULL when this cpuset is going to become a partition itself.
+ * @cs should be a valid partition root or going to become a partition root.
+ * @excluded_child should be non-NULL when this cpuset is going to become a
+ * partition itself.
+ *
+ * Note that a remote partition is not allowed underneath a valid local
+ * or remote partition. So if a non-partition root child is populated,
+ * the whole partition is considered populated.
*/
static inline bool partition_is_populated(struct cpuset *cs,
struct cpuset *excluded_child)
@@ -369,8 +374,6 @@ static inline bool partition_is_populated(struct cpuset *cs,
if (cs->css.cgroup->nr_populated_csets)
return true;
- if (!excluded_child && !cs->nr_subparts)
- return cgroup_is_populated(cs->css.cgroup);
rcu_read_lock();
cpuset_for_each_child(child, css, cs) {
@@ -1302,7 +1305,6 @@ static void reset_partition_data(struct cpuset *cs)
lockdep_assert_held(&callback_lock);
- cs->nr_subparts = 0;
if (cpumask_empty(cs->exclusive_cpus)) {
cpumask_clear(cs->effective_xcpus);
if (is_cpu_exclusive(cs))
@@ -1746,7 +1748,6 @@ static int update_parent_effective_cpumask(struct cpuset *cs, int cmd,
int deleting; /* Deleting cpus from parent's effective_cpus */
int old_prs, new_prs;
int part_error = PERR_NONE; /* Partition error? */
- int subparts_delta = 0;
int isolcpus_updated = 0;
struct cpumask *xcpus = user_xcpus(cs);
bool nocpu;
@@ -1771,10 +1772,9 @@ static int update_parent_effective_cpumask(struct cpuset *cs, int cmd,
if (is_partition_valid(parent))
adding = cpumask_and(tmp->addmask,
xcpus, parent->effective_xcpus);
- if (old_prs > 0) {
+ if (old_prs > 0)
new_prs = -old_prs;
- subparts_delta--;
- }
+
goto write_error;
}
@@ -1829,7 +1829,6 @@ static int update_parent_effective_cpumask(struct cpuset *cs, int cmd,
WARN_ON_ONCE(!cpumask_subset(tmp->new_cpus, parent->effective_cpus));
deleting = true;
- subparts_delta++;
} else if (cmd == partcmd_disable) {
/*
* May need to add cpus back to parent's effective_cpus
@@ -1840,7 +1839,6 @@ static int update_parent_effective_cpumask(struct cpuset *cs, int cmd,
if (is_partition_valid(cs)) {
cpumask_copy(tmp->addmask, cs->effective_xcpus);
adding = true;
- subparts_delta--;
}
new_prs = PRS_MEMBER;
} else if (newmask) {
@@ -1963,17 +1961,13 @@ write_error:
switch (cs->partition_root_state) {
case PRS_ROOT:
case PRS_ISOLATED:
- if (part_error) {
+ if (part_error)
new_prs = -old_prs;
- subparts_delta--;
- }
break;
case PRS_INVALID_ROOT:
case PRS_INVALID_ISOLATED:
- if (!part_error) {
+ if (!part_error)
new_prs = -old_prs;
- subparts_delta++;
- }
break;
}
}
@@ -2002,11 +1996,9 @@ write_error:
* newly deleted ones will be added back to effective_cpus.
*/
spin_lock_irq(&callback_lock);
- if (old_prs != new_prs) {
+ if (old_prs != new_prs)
cs->partition_root_state = new_prs;
- if (new_prs <= 0)
- cs->nr_subparts = 0;
- }
+
/*
* Adding to parent's effective_cpus means deletion CPUs from cs
* and vice versa.
@@ -2018,10 +2010,6 @@ write_error:
isolcpus_updated += partition_xcpus_add(new_prs, parent,
tmp->delmask);
- if (is_partition_valid(parent)) {
- parent->nr_subparts += subparts_delta;
- WARN_ON_ONCE(parent->nr_subparts < 0);
- }
spin_unlock_irq(&callback_lock);
update_unbound_workqueue_cpumask(isolcpus_updated);
@@ -2105,8 +2093,6 @@ static void compute_partition_effective_cpumask(struct cpuset *cs,
*/
spin_lock_irq(&callback_lock);
make_partition_invalid(child);
- cs->nr_subparts--;
- child->nr_subparts = 0;
spin_unlock_irq(&callback_lock);
notify_partition_change(child, old_prs);
continue;
@@ -4021,7 +4007,6 @@ static void cpuset_handle_hotplug(void)
*/
if (!cpumask_empty(subpartitions_cpus)) {
if (cpumask_subset(&new_cpus, subpartitions_cpus)) {
- top_cpuset.nr_subparts = 0;
cpumask_clear(subpartitions_cpus);
} else {
cpumask_andnot(&new_cpus, &new_cpus,
diff --git a/kernel/events/core.c b/kernel/events/core.c
index 7541f6f85fcb..177e57c1a362 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -9403,7 +9403,7 @@ static void perf_event_mmap_event(struct perf_mmap_event *mmap_event)
flags |= MAP_HUGETLB;
if (file) {
- struct inode *inode;
+ const struct inode *inode;
dev_t dev;
buf = kmalloc(PATH_MAX, GFP_KERNEL);
@@ -9416,12 +9416,12 @@ static void perf_event_mmap_event(struct perf_mmap_event *mmap_event)
* need to add enough zero bytes after the string to handle
* the 64bit alignment we do later.
*/
- name = file_path(file, buf, PATH_MAX - sizeof(u64));
+ name = d_path(file_user_path(file), buf, PATH_MAX - sizeof(u64));
if (IS_ERR(name)) {
name = "//toolong";
goto cpy_name;
}
- inode = file_inode(vma->vm_file);
+ inode = file_user_inode(vma->vm_file);
dev = inode->i_sb->s_dev;
ino = inode->i_ino;
gen = inode->i_generation;
@@ -9492,7 +9492,7 @@ static bool perf_addr_filter_match(struct perf_addr_filter *filter,
if (!filter->path.dentry)
return false;
- if (d_inode(filter->path.dentry) != file_inode(file))
+ if (d_inode(filter->path.dentry) != file_user_inode(file))
return false;
if (filter->offset > offset + size)
diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c
index 8709c69118b5..f11ceb8be8c4 100644
--- a/kernel/events/uprobes.c
+++ b/kernel/events/uprobes.c
@@ -2765,6 +2765,9 @@ static void handle_swbp(struct pt_regs *regs)
handler_chain(uprobe, regs);
+ /* Try to optimize after first hit. */
+ arch_uprobe_optimize(&uprobe->arch, bp_vaddr);
+
/*
* If user decided to take execution elsewhere, it makes little sense
* to execute the original instruction, so let's skip it.
@@ -2772,9 +2775,6 @@ static void handle_swbp(struct pt_regs *regs)
if (instruction_pointer(regs) != bp_vaddr)
goto out;
- /* Try to optimize after first hit. */
- arch_uprobe_optimize(&uprobe->arch, bp_vaddr);
-
if (arch_uprobe_skip_sstep(&uprobe->arch, regs))
goto out;
diff --git a/kernel/exit.c b/kernel/exit.c
index 9f74e8f1c431..46173461e8de 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -257,7 +257,7 @@ repeat:
rcu_read_unlock();
pidfs_exit(p);
- cgroup_release(p);
+ cgroup_task_release(p);
/* Retrieve @thread_pid before __unhash_process() may set it to NULL. */
thread_pid = task_pid(p);
@@ -967,7 +967,7 @@ void __noreturn do_exit(long code)
exit_thread(tsk);
sched_autogroup_exit_task(tsk);
- cgroup_exit(tsk);
+ cgroup_task_exit(tsk);
/*
* FIXME: do that only when needed, using sched_exit tracepoint
diff --git a/kernel/fork.c b/kernel/fork.c
index 3da0f08615a9..960c39c9c264 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -738,7 +738,7 @@ void __put_task_struct(struct task_struct *tsk)
unwind_task_free(tsk);
sched_ext_free(tsk);
io_uring_free(tsk);
- cgroup_free(tsk);
+ cgroup_task_free(tsk);
task_numa_free(tsk, true);
security_task_free(tsk);
exit_creds(tsk);
diff --git a/kernel/sched/autogroup.c b/kernel/sched/autogroup.c
index cdea931aae30..954137775f38 100644
--- a/kernel/sched/autogroup.c
+++ b/kernel/sched/autogroup.c
@@ -178,8 +178,8 @@ autogroup_move_group(struct task_struct *p, struct autogroup *ag)
* this process can already run with task_group() == prev->tg or we can
* race with cgroup code which can read autogroup = prev under rq->lock.
* In the latter case for_each_thread() can not miss a migrating thread,
- * cpu_cgroup_attach() must not be possible after cgroup_exit() and it
- * can't be removed from thread list, we hold ->siglock.
+ * cpu_cgroup_attach() must not be possible after cgroup_task_exit()
+ * and it can't be removed from thread list, we hold ->siglock.
*
* If an exiting thread was already removed from thread list we rely on
* sched_autogroup_exit_task().
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 096e8d03d85e..0324457622d7 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -5151,6 +5151,8 @@ static struct rq *finish_task_switch(struct task_struct *prev)
if (prev->sched_class->task_dead)
prev->sched_class->task_dead(prev);
+ cgroup_task_dead(prev);
+
/* Task is done with its stack. */
put_task_stack(prev);
@@ -8474,10 +8476,12 @@ int sched_cpu_dying(unsigned int cpu)
sched_tick_stop(cpu);
rq_lock_irqsave(rq, &rf);
+ update_rq_clock(rq);
if (rq->nr_running != 1 || rq_has_pinned_tasks(rq)) {
WARN(true, "Dying CPU not properly vacated!");
dump_rq_tasks(rq, KERN_WARNING);
}
+ dl_server_stop(&rq->fair_server);
rq_unlock_irqrestore(rq, &rf);
calc_load_migrate(rq);
diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c
index 48357d4609bf..6b8a9286e2fc 100644
--- a/kernel/sched/deadline.c
+++ b/kernel/sched/deadline.c
@@ -1582,6 +1582,9 @@ void dl_server_start(struct sched_dl_entity *dl_se)
if (!dl_server(dl_se) || dl_se->dl_server_active)
return;
+ if (WARN_ON_ONCE(!cpu_online(cpu_of(rq))))
+ return;
+
dl_se->dl_server_active = 1;
enqueue_dl_entity(dl_se, ENQUEUE_WAKEUP);
if (!dl_task(dl_se->rq->curr) || dl_entity_preempt(dl_se, &rq->curr->dl))
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 2554055c1ba1..b58e696d7ccc 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -8911,21 +8911,21 @@ simple:
return p;
idle:
- if (!rf)
- return NULL;
-
- new_tasks = sched_balance_newidle(rq, rf);
+ if (rf) {
+ new_tasks = sched_balance_newidle(rq, rf);
- /*
- * Because sched_balance_newidle() releases (and re-acquires) rq->lock, it is
- * possible for any higher priority task to appear. In that case we
- * must re-start the pick_next_entity() loop.
- */
- if (new_tasks < 0)
- return RETRY_TASK;
+ /*
+ * Because sched_balance_newidle() releases (and re-acquires)
+ * rq->lock, it is possible for any higher priority task to
+ * appear. In that case we must re-start the pick_next_entity()
+ * loop.
+ */
+ if (new_tasks < 0)
+ return RETRY_TASK;
- if (new_tasks > 0)
- goto again;
+ if (new_tasks > 0)
+ goto again;
+ }
/*
* rq is about to be idle, check if we need to update the