author     Takashi Iwai <tiwai@suse.de>  2026-05-06 16:10:00 +0200
committer  Takashi Iwai <tiwai@suse.de>  2026-05-06 16:10:00 +0200
commit     06bc7ff0a1e0f2b0102e1314e3527a7ec0997851 (patch)
tree       fc0fe1da457a988a463e8c883940e445873a28c4 /kernel
parent     2bcbb163162789d3488562073dbb99d9bd71a762 (diff)
parent     5776bcdf4dccac8edc1160482792b512da5c08b4 (diff)
Merge tag 'asoc-fix-v7.1-rc2' of https://git.kernel.org/pub/scm/linux/kernel/git/broonie/sound into for-linus
ASoC: Fixes for v7.1

Another batch of fixes, plus a couple of quirks (mostly AMD ones, as has been the case recently). These are all driver changes, including fixes for the KUnit tests for the Cirrus drivers that could cause memory corruption.
Diffstat (limited to 'kernel')
-rw-r--r--  kernel/cgroup/cgroup.c               46
-rw-r--r--  kernel/cgroup/cpuset-internal.h       5
-rw-r--r--  kernel/cgroup/cpuset.c               13
-rw-r--r--  kernel/cgroup/rdma.c                  2
-rw-r--r--  kernel/fork.c                        12
-rw-r--r--  kernel/futex/requeue.c               13
-rw-r--r--  kernel/liveupdate/kexec_handover.c   21
-rw-r--r--  kernel/liveupdate/luo_session.c      15
-rw-r--r--  kernel/sched/core.c                   1
-rw-r--r--  kernel/sched/ext.c                  398
-rw-r--r--  kernel/sched/ext_idle.c              20
-rw-r--r--  kernel/sched/ext_idle.h               1
-rw-r--r--  kernel/sched/ext_internal.h           2
-rw-r--r--  kernel/sched/fair.c                  42
-rw-r--r--  kernel/trace/trace_branch.c           8
-rw-r--r--  kernel/trace/trace_probe.c            6
-rw-r--r--  kernel/trace/trace_probe.h            4
17 files changed, 415 insertions(+), 194 deletions(-)
diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c
index 43adc96c7f1a..45c0b1ed687a 100644
--- a/kernel/cgroup/cgroup.c
+++ b/kernel/cgroup/cgroup.c
@@ -3934,33 +3934,41 @@ static int cgroup_cpu_pressure_show(struct seq_file *seq, void *v)
static ssize_t pressure_write(struct kernfs_open_file *of, char *buf,
size_t nbytes, enum psi_res res)
{
- struct cgroup_file_ctx *ctx = of->priv;
+ struct cgroup_file_ctx *ctx;
struct psi_trigger *new;
struct cgroup *cgrp;
struct psi_group *psi;
+ ssize_t ret = 0;
cgrp = cgroup_kn_lock_live(of->kn, false);
if (!cgrp)
return -ENODEV;
- cgroup_get(cgrp);
- cgroup_kn_unlock(of->kn);
+ ctx = of->priv;
+ if (!ctx) {
+ ret = -ENODEV;
+ goto out_unlock;
+ }
/* Allow only one trigger per file descriptor */
if (ctx->psi.trigger) {
- cgroup_put(cgrp);
- return -EBUSY;
+ ret = -EBUSY;
+ goto out_unlock;
}
psi = cgroup_psi(cgrp);
new = psi_trigger_create(psi, buf, res, of->file, of);
if (IS_ERR(new)) {
- cgroup_put(cgrp);
- return PTR_ERR(new);
+ ret = PTR_ERR(new);
+ goto out_unlock;
}
smp_store_release(&ctx->psi.trigger, new);
- cgroup_put(cgrp);
+
+out_unlock:
+ cgroup_kn_unlock(of->kn);
+ if (ret)
+ return ret;
return nbytes;
}
@@ -5716,16 +5724,6 @@ static void offline_css(struct cgroup_subsys_state *css)
RCU_INIT_POINTER(css->cgroup->subsys[ss->id], NULL);
wake_up_all(&css->cgroup->offline_waitq);
-
- css->cgroup->nr_dying_subsys[ss->id]++;
- /*
- * Parent css and cgroup cannot be freed until after the freeing
- * of child css, see css_free_rwork_fn().
- */
- while ((css = css->parent)) {
- css->nr_descendants--;
- css->cgroup->nr_dying_subsys[ss->id]++;
- }
}
/**
@@ -6038,6 +6036,8 @@ static void css_killed_ref_fn(struct percpu_ref *ref)
*/
static void kill_css(struct cgroup_subsys_state *css)
{
+ struct cgroup_subsys *ss = css->ss;
+
lockdep_assert_held(&cgroup_mutex);
if (css->flags & CSS_DYING)
@@ -6074,6 +6074,16 @@ static void kill_css(struct cgroup_subsys_state *css)
* css is confirmed to be seen as killed on all CPUs.
*/
percpu_ref_kill_and_confirm(&css->refcnt, css_killed_ref_fn);
+
+ css->cgroup->nr_dying_subsys[ss->id]++;
+ /*
+ * Parent css and cgroup cannot be freed until after the freeing
+ * of child css, see css_free_rwork_fn().
+ */
+ while ((css = css->parent)) {
+ css->nr_descendants--;
+ css->cgroup->nr_dying_subsys[ss->id]++;
+ }
}
/**
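
The pressure_write() hunk above keeps the kernfs node locked for the whole operation and funnels every failure through a single unlock label instead of the old get/put-and-early-return flow. Below is a minimal userspace sketch of that single-exit shape; lock(), unlock() and arm() are hypothetical stand-ins for the cgroup/psi calls, only the goto-unwind structure mirrors the patch.

#include <errno.h>
#include <stdbool.h>
#include <stddef.h>

static bool armed;                      /* plays the role of ctx->psi.trigger */

static int lock(void)    { return 0; }  /* cgroup_kn_lock_live() stand-in */
static void unlock(void) { }            /* cgroup_kn_unlock() stand-in */
static int arm(void)     { armed = true; return 0; }

static long write_op(size_t nbytes)
{
        long ret = 0;

        if (lock())
                return -ENODEV;         /* nothing held yet, plain return is fine */

        if (armed) {                    /* allow only one trigger per fd */
                ret = -EBUSY;
                goto out_unlock;
        }

        ret = arm();                    /* any later failure also lands on the label */

out_unlock:
        unlock();                       /* the one place the lock is dropped */
        return ret ? ret : (long)nbytes;
}
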
diff --git a/kernel/cgroup/cpuset-internal.h b/kernel/cgroup/cpuset-internal.h
index fd7d19842ded..bb4e692bea30 100644
--- a/kernel/cgroup/cpuset-internal.h
+++ b/kernel/cgroup/cpuset-internal.h
@@ -168,6 +168,11 @@ struct cpuset {
int nr_deadline_tasks;
int nr_migrate_dl_tasks;
u64 sum_migrate_dl_bw;
+ /*
+ * CPU used for temporary DL bandwidth allocation during attach;
+ * -1 if no DL bandwidth was allocated in the current attach.
+ */
+ int dl_bw_cpu;
/* Invalid partition error code, not lock protected */
enum prs_errcode prs_err;
diff --git a/kernel/cgroup/cpuset.c b/kernel/cgroup/cpuset.c
index 1335e437098e..e3a081a07c6d 100644
--- a/kernel/cgroup/cpuset.c
+++ b/kernel/cgroup/cpuset.c
@@ -288,6 +288,7 @@ struct cpuset top_cpuset = {
.flags = BIT(CS_CPU_EXCLUSIVE) |
BIT(CS_MEM_EXCLUSIVE) | BIT(CS_SCHED_LOAD_BALANCE),
.partition_root_state = PRS_ROOT,
+ .dl_bw_cpu = -1,
};
/**
@@ -579,6 +580,8 @@ static struct cpuset *dup_or_alloc_cpuset(struct cpuset *cs)
if (!trial)
return NULL;
+ trial->dl_bw_cpu = -1;
+
/* Setup cpumask pointer array */
cpumask_var_t *pmask[4] = {
&trial->cpus_allowed,
@@ -2980,6 +2983,7 @@ static void reset_migrate_dl_data(struct cpuset *cs)
{
cs->nr_migrate_dl_tasks = 0;
cs->sum_migrate_dl_bw = 0;
+ cs->dl_bw_cpu = -1;
}
/* Called by cgroups to determine if a cpuset is usable; cpuset_mutex held */
@@ -3056,6 +3060,8 @@ static int cpuset_can_attach(struct cgroup_taskset *tset)
reset_migrate_dl_data(cs);
goto out_unlock;
}
+
+ cs->dl_bw_cpu = cpu;
}
out_success:
@@ -3080,12 +3086,11 @@ static void cpuset_cancel_attach(struct cgroup_taskset *tset)
mutex_lock(&cpuset_mutex);
dec_attach_in_progress_locked(cs);
- if (cs->nr_migrate_dl_tasks) {
- int cpu = cpumask_any(cs->effective_cpus);
+ if (cs->dl_bw_cpu >= 0)
+ dl_bw_free(cs->dl_bw_cpu, cs->sum_migrate_dl_bw);
- dl_bw_free(cpu, cs->sum_migrate_dl_bw);
+ if (cs->nr_migrate_dl_tasks)
reset_migrate_dl_data(cs);
- }
mutex_unlock(&cpuset_mutex);
}
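
The cpuset hunks record, in dl_bw_cpu, which CPU the deadline bandwidth was charged to during cpuset_can_attach(), so cpuset_cancel_attach() can return it to that same CPU instead of recomputing one from effective_cpus (which may have changed in the meantime). A small sketch of the record-at-allocation / release-from-the-record pattern, with hypothetical bw_alloc()/bw_free() helpers:

#include <stdbool.h>

struct attach_state {
        int  dl_bw_cpu;     /* -1: nothing charged in the current attach */
        long dl_bw;
};

static bool bw_alloc(int cpu, long bw) { (void)cpu; (void)bw; return true; }
static void bw_free(int cpu, long bw)  { (void)cpu; (void)bw; }

static bool can_attach(struct attach_state *st, int cpu, long bw)
{
        if (!bw_alloc(cpu, bw))
                return false;
        st->dl_bw_cpu = cpu;            /* remember where the charge landed */
        st->dl_bw = bw;
        return true;
}

static void cancel_attach(struct attach_state *st)
{
        if (st->dl_bw_cpu >= 0)         /* free from the recorded CPU only */
                bw_free(st->dl_bw_cpu, st->dl_bw);
        st->dl_bw_cpu = -1;
        st->dl_bw = 0;
}
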
diff --git a/kernel/cgroup/rdma.c b/kernel/cgroup/rdma.c
index 9967fb25c563..4fdab4cf49e0 100644
--- a/kernel/cgroup/rdma.c
+++ b/kernel/cgroup/rdma.c
@@ -283,7 +283,7 @@ int rdmacg_try_charge(struct rdma_cgroup **rdmacg,
ret = PTR_ERR(rpool);
goto err;
} else {
- new = rpool->resources[index].usage + 1;
+ new = (s64)rpool->resources[index].usage + 1;
if (new > rpool->resources[index].max) {
ret = -EAGAIN;
goto err;
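
The rdma.c one-liner widens usage to s64 before adding one, so the comparison against the configured max cannot be defeated by wraparound of a narrower counter. A tiny self-contained demonstration of why widening the left operand first matters (the types here are illustrative, not the rdma_cgroup ones):

#include <stdint.h>
#include <stdio.h>

int main(void)
{
        uint32_t usage = UINT32_MAX;    /* counter already saturated */
        uint32_t max   = 100;

        /* Narrow arithmetic wraps to 0, so the limit check passes wrongly. */
        uint32_t wrapped = usage + 1;
        printf("wrapped: %u  over max? %s\n", wrapped, wrapped > max ? "yes" : "no");

        /* Widening first keeps the true value and the check catches it. */
        int64_t widened = (int64_t)usage + 1;
        printf("widened: %lld over max? %s\n", (long long)widened,
               widened > (int64_t)max ? "yes" : "no");
        return 0;
}
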
diff --git a/kernel/fork.c b/kernel/fork.c
index f1ad69c6dc2d..5f3fdfdb14c7 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -1951,9 +1951,11 @@ static void rv_task_fork(struct task_struct *p)
static bool need_futex_hash_allocate_default(u64 clone_flags)
{
- if ((clone_flags & (CLONE_THREAD | CLONE_VM)) != (CLONE_THREAD | CLONE_VM))
- return false;
- return true;
+ /*
+ * Allocate a default futex hash for any sibling that will
+ * share the parent's mm, except vfork.
+ */
+ return (clone_flags & (CLONE_VM | CLONE_VFORK)) == CLONE_VM;
}
/*
@@ -2380,10 +2382,6 @@ __latent_entropy struct task_struct *copy_process(
if (retval)
goto bad_fork_cancel_cgroup;
- /*
- * Allocate a default futex hash for the user process once the first
- * thread spawns.
- */
if (need_futex_hash_allocate_default(clone_flags)) {
retval = futex_hash_allocate_default();
if (retval)
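
need_futex_hash_allocate_default() now keys purely off the clone flags: allocate when the child shares the parent's mm (CLONE_VM) unless it is a vfork child, rather than requiring CLONE_THREAD. The mask-and-compare idiom is easy to misread, so here is a small check of what it accepts; the flag values are illustrative, not the uapi ones.

#include <assert.h>

#define CLONE_VM     0x1u   /* illustrative values, not the uapi constants */
#define CLONE_VFORK  0x2u
#define CLONE_THREAD 0x4u

static int need_default_hash(unsigned long clone_flags)
{
        /* true iff CLONE_VM is set and CLONE_VFORK is not */
        return (clone_flags & (CLONE_VM | CLONE_VFORK)) == CLONE_VM;
}

int main(void)
{
        assert( need_default_hash(CLONE_VM));                  /* plain mm sharer */
        assert( need_default_hash(CLONE_VM | CLONE_THREAD));   /* thread */
        assert(!need_default_hash(CLONE_VM | CLONE_VFORK));    /* vfork child */
        assert(!need_default_hash(0));                         /* full fork */
        return 0;
}
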
diff --git a/kernel/futex/requeue.c b/kernel/futex/requeue.c
index d818b4d47f1b..b597cb3d17fc 100644
--- a/kernel/futex/requeue.c
+++ b/kernel/futex/requeue.c
@@ -319,8 +319,11 @@ futex_proxy_trylock_atomic(u32 __user *pifutex, struct futex_hash_bucket *hb1,
return -EINVAL;
/* Ensure that this does not race against an early wakeup */
- if (!futex_requeue_pi_prepare(top_waiter, NULL))
+ if (!futex_requeue_pi_prepare(top_waiter, NULL)) {
+ plist_del(&top_waiter->list, &hb1->chain);
+ futex_hb_waiters_dec(hb1);
return -EAGAIN;
+ }
/*
* Try to take the lock for top_waiter and set the FUTEX_WAITERS bit
@@ -722,10 +725,12 @@ int handle_early_requeue_pi_wakeup(struct futex_hash_bucket *hb,
/*
* We were woken prior to requeue by a timeout or a signal.
- * Unqueue the futex_q and determine which it was.
+ * Conditionally unqueue the futex_q and determine which it was.
*/
- plist_del(&q->list, &hb->chain);
- futex_hb_waiters_dec(hb);
+ if (!plist_node_empty(&q->list)) {
+ plist_del(&q->list, &hb->chain);
+ futex_hb_waiters_dec(hb);
+ }
/* Handle spurious wakeups gracefully */
ret = -EWOULDBLOCK;
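
Both requeue.c hunks deal with a waiter that may or may not still be on the hash bucket's plist when the error/wakeup path runs, so the unlink is made conditional on the node still being queued. A compact sketch of making an unlink idempotent by checking membership first; the list type and helpers below are plain C stand-ins, not the kernel plist API.

#include <assert.h>
#include <stdbool.h>

struct node { struct node *prev, *next; };      /* self-linked <=> not queued */

static void node_init(struct node *n) { n->prev = n->next = n; }
static bool node_empty(const struct node *n) { return n->next == n; }

static void unlink_node(struct node *n)
{
        n->prev->next = n->next;
        n->next->prev = n->prev;
        node_init(n);                           /* mark as dequeued */
}

/* Safe to call whether or not the waiter is still queued. */
static void unqueue_if_queued(struct node *n, int *nr_waiters)
{
        if (!node_empty(n)) {
                unlink_node(n);
                (*nr_waiters)--;
        }
}

int main(void)
{
        struct node head, waiter;
        int nr = 1;

        node_init(&head);
        node_init(&waiter);
        waiter.next = head.next; waiter.prev = &head;   /* enqueue after head */
        head.next->prev = &waiter; head.next = &waiter;

        unqueue_if_queued(&waiter, &nr);        /* first call removes it */
        unqueue_if_queued(&waiter, &nr);        /* second call is a no-op */
        assert(nr == 0 && node_empty(&waiter));
        return 0;
}
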
diff --git a/kernel/liveupdate/kexec_handover.c b/kernel/liveupdate/kexec_handover.c
index 94762de1fe5f..18509d8082ea 100644
--- a/kernel/liveupdate/kexec_handover.c
+++ b/kernel/liveupdate/kexec_handover.c
@@ -762,19 +762,24 @@ int kho_add_subtree(const char *name, void *blob, size_t size)
goto out_pack;
}
- err = fdt_setprop(root_fdt, off, KHO_SUB_TREE_PROP_NAME,
- &phys, sizeof(phys));
- if (err < 0)
- goto out_pack;
+ fdt_err = fdt_setprop(root_fdt, off, KHO_SUB_TREE_PROP_NAME,
+ &phys, sizeof(phys));
+ if (fdt_err < 0)
+ goto out_del_node;
- err = fdt_setprop(root_fdt, off, KHO_SUB_TREE_SIZE_PROP_NAME,
- &size_u64, sizeof(size_u64));
- if (err < 0)
- goto out_pack;
+ fdt_err = fdt_setprop(root_fdt, off, KHO_SUB_TREE_SIZE_PROP_NAME,
+ &size_u64, sizeof(size_u64));
+ if (fdt_err < 0)
+ goto out_del_node;
WARN_ON_ONCE(kho_debugfs_blob_add(&kho_out.dbg, name, blob,
size, false));
+ err = 0;
+ goto out_pack;
+
+out_del_node:
+ fdt_del_node(root_fdt, off);
out_pack:
fdt_pack(root_fdt);
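
The kexec_handover change adds a dedicated unwind label that deletes the partially built FDT node when a later property write fails, instead of leaving a node with only some of its properties behind. The staged-construction unwind looks roughly like this sketch; the fdt-style helpers are stubbed, only the label ordering mirrors the patch.

#include <errno.h>

/* Hypothetical stand-ins for fdt_add_subnode()/fdt_setprop()/fdt_del_node(). */
static int node_add(void)            { return 1; }     /* returns a node handle */
static int prop_set(int node, int i) { (void)node; (void)i; return 0; }
static void node_del(int node)       { (void)node; }
static void tree_pack(void)          { }

static int add_subtree(void)
{
        int err = 0;
        int node = node_add();

        if (node < 0) {
                err = -EINVAL;
                goto out_pack;                  /* nothing to unwind yet */
        }

        if (prop_set(node, 0) < 0 || prop_set(node, 1) < 0) {
                err = -EINVAL;
                goto out_del_node;              /* drop the half-built node */
        }

        goto out_pack;                          /* success path skips the unwind */

out_del_node:
        node_del(node);
out_pack:
        tree_pack();                            /* always re-pack, as in the patch */
        return err;
}
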
diff --git a/kernel/liveupdate/luo_session.c b/kernel/liveupdate/luo_session.c
index a3327a28fc1f..7a42385dabe2 100644
--- a/kernel/liveupdate/luo_session.c
+++ b/kernel/liveupdate/luo_session.c
@@ -514,11 +514,12 @@ int luo_session_deserialize(void)
{
struct luo_session_header *sh = &luo_session_global.incoming;
static bool is_deserialized;
- static int err;
+ static int saved_err;
+ int err;
/* If has been deserialized, always return the same error code */
if (is_deserialized)
- return err;
+ return saved_err;
is_deserialized = true;
if (!sh->active)
@@ -547,7 +548,8 @@ int luo_session_deserialize(void)
pr_warn("Failed to allocate session [%.*s] during deserialization %pe\n",
(int)sizeof(sh->ser[i].name),
sh->ser[i].name, session);
- return PTR_ERR(session);
+ err = PTR_ERR(session);
+ goto save_err;
}
err = luo_session_insert(sh, session);
@@ -555,7 +557,7 @@ int luo_session_deserialize(void)
pr_warn("Failed to insert session [%s] %pe\n",
session->name, ERR_PTR(err));
luo_session_free(session);
- return err;
+ goto save_err;
}
scoped_guard(mutex, &session->mutex) {
@@ -565,7 +567,7 @@ int luo_session_deserialize(void)
if (err) {
pr_warn("Failed to deserialize files for session [%s] %pe\n",
session->name, ERR_PTR(err));
- return err;
+ goto save_err;
}
}
@@ -574,6 +576,9 @@ int luo_session_deserialize(void)
sh->ser = NULL;
return 0;
+save_err:
+ saved_err = err;
+ return err;
}
int luo_session_serialize(void)
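
luo_session_deserialize() is only meant to run its body once; the rework separates the working err from a static saved_err so that every later caller gets the outcome of the first attempt rather than whatever the static happened to hold mid-way. A small sketch of that run-once / remember-the-result pattern, with a hypothetical do_deserialize() standing in for the real work:

#include <stdbool.h>

static int do_deserialize(void) { return 0; }   /* hypothetical one-shot work */

static int deserialize_once(void)
{
        static bool done;
        static int saved_err;                   /* outcome of the first attempt */
        int err;

        if (done)
                return saved_err;               /* later callers see the same code */
        done = true;

        err = do_deserialize();
        if (err)
                goto save_err;

        return 0;

save_err:
        saved_err = err;                        /* success leaves saved_err at 0 */
        return err;
}
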
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index da20fb6ea25a..b8871449d3c6 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -4458,6 +4458,7 @@ static void __sched_fork(u64 clone_flags, struct task_struct *p)
p->se.nr_migrations = 0;
p->se.vruntime = 0;
p->se.vlag = 0;
+ p->se.rel_deadline = 0;
INIT_LIST_HEAD(&p->se.group_node);
/* A delayed task cannot be in clone(). */
diff --git a/kernel/sched/ext.c b/kernel/sched/ext.c
index e426e27b6794..345aa11b84b2 100644
--- a/kernel/sched/ext.c
+++ b/kernel/sched/ext.c
@@ -32,6 +32,7 @@ static const struct rhashtable_params scx_sched_hash_params = {
.key_len = sizeof_field(struct scx_sched, ops.sub_cgroup_id),
.key_offset = offsetof(struct scx_sched, ops.sub_cgroup_id),
.head_offset = offsetof(struct scx_sched, hash_node),
+ .insecure_elasticity = true, /* inserted under scx_sched_lock */
};
static struct rhashtable scx_sched_hash;
@@ -52,8 +53,6 @@ DEFINE_STATIC_KEY_FALSE(__scx_enabled);
DEFINE_STATIC_PERCPU_RWSEM(scx_fork_rwsem);
static atomic_t scx_enable_state_var = ATOMIC_INIT(SCX_DISABLED);
static DEFINE_RAW_SPINLOCK(scx_bypass_lock);
-static cpumask_var_t scx_bypass_lb_donee_cpumask;
-static cpumask_var_t scx_bypass_lb_resched_cpumask;
static bool scx_init_task_enabled;
static bool scx_switching_all;
DEFINE_STATIC_KEY_FALSE(__scx_switched_all);
@@ -469,24 +468,35 @@ static inline void update_locked_rq(struct rq *rq)
__this_cpu_write(scx_locked_rq_state, rq);
}
-#define SCX_CALL_OP(sch, op, rq, args...) \
+/*
+ * SCX ops can recurse via scx_bpf_sub_dispatch() - the inner call must not
+ * clobber the outer's scx_locked_rq_state. Save it on entry, restore on exit.
+ */
+#define SCX_CALL_OP(sch, op, locked_rq, args...) \
do { \
- if (rq) \
- update_locked_rq(rq); \
+ struct rq *__prev_locked_rq; \
+ \
+ if (locked_rq) { \
+ __prev_locked_rq = scx_locked_rq(); \
+ update_locked_rq(locked_rq); \
+ } \
(sch)->ops.op(args); \
- if (rq) \
- update_locked_rq(NULL); \
+ if (locked_rq) \
+ update_locked_rq(__prev_locked_rq); \
} while (0)
-#define SCX_CALL_OP_RET(sch, op, rq, args...) \
+#define SCX_CALL_OP_RET(sch, op, locked_rq, args...) \
({ \
+ struct rq *__prev_locked_rq; \
__typeof__((sch)->ops.op(args)) __ret; \
\
- if (rq) \
- update_locked_rq(rq); \
+ if (locked_rq) { \
+ __prev_locked_rq = scx_locked_rq(); \
+ update_locked_rq(locked_rq); \
+ } \
__ret = (sch)->ops.op(args); \
- if (rq) \
- update_locked_rq(NULL); \
+ if (locked_rq) \
+ update_locked_rq(__prev_locked_rq); \
__ret; \
})
@@ -498,39 +508,39 @@ do { \
* those subject tasks.
*
* Every SCX_CALL_OP_TASK*() call site invokes its op with @p's rq lock held -
- * either via the @rq argument here, or (for ops.select_cpu()) via @p's pi_lock
- * held by try_to_wake_up() with rq tracking via scx_rq.in_select_cpu. So if
- * kf_tasks[] is set, @p's scheduler-protected fields are stable.
+ * either via the @locked_rq argument here, or (for ops.select_cpu()) via @p's
+ * pi_lock held by try_to_wake_up() with rq tracking via scx_rq.in_select_cpu.
+ * So if kf_tasks[] is set, @p's scheduler-protected fields are stable.
*
* kf_tasks[] can not stack, so task-based SCX ops must not nest. The
* WARN_ON_ONCE() in each macro catches a re-entry of any of the three variants
* while a previous one is still in progress.
*/
-#define SCX_CALL_OP_TASK(sch, op, rq, task, args...) \
+#define SCX_CALL_OP_TASK(sch, op, locked_rq, task, args...) \
do { \
WARN_ON_ONCE(current->scx.kf_tasks[0]); \
current->scx.kf_tasks[0] = task; \
- SCX_CALL_OP((sch), op, rq, task, ##args); \
+ SCX_CALL_OP((sch), op, locked_rq, task, ##args); \
current->scx.kf_tasks[0] = NULL; \
} while (0)
-#define SCX_CALL_OP_TASK_RET(sch, op, rq, task, args...) \
+#define SCX_CALL_OP_TASK_RET(sch, op, locked_rq, task, args...) \
({ \
__typeof__((sch)->ops.op(task, ##args)) __ret; \
WARN_ON_ONCE(current->scx.kf_tasks[0]); \
current->scx.kf_tasks[0] = task; \
- __ret = SCX_CALL_OP_RET((sch), op, rq, task, ##args); \
+ __ret = SCX_CALL_OP_RET((sch), op, locked_rq, task, ##args); \
current->scx.kf_tasks[0] = NULL; \
__ret; \
})
-#define SCX_CALL_OP_2TASKS_RET(sch, op, rq, task0, task1, args...) \
+#define SCX_CALL_OP_2TASKS_RET(sch, op, locked_rq, task0, task1, args...) \
({ \
__typeof__((sch)->ops.op(task0, task1, ##args)) __ret; \
WARN_ON_ONCE(current->scx.kf_tasks[0]); \
current->scx.kf_tasks[0] = task0; \
current->scx.kf_tasks[1] = task1; \
- __ret = SCX_CALL_OP_RET((sch), op, rq, task0, task1, ##args); \
+ __ret = SCX_CALL_OP_RET((sch), op, locked_rq, task0, task1, ##args); \
current->scx.kf_tasks[0] = NULL; \
current->scx.kf_tasks[1] = NULL; \
__ret; \
@@ -1388,18 +1398,55 @@ static void call_task_dequeue(struct scx_sched *sch, struct rq *rq,
p->scx.flags &= ~SCX_TASK_IN_CUSTODY;
}
-static void local_dsq_post_enq(struct scx_dispatch_q *dsq, struct task_struct *p,
- u64 enq_flags)
+static void local_dsq_post_enq(struct scx_sched *sch, struct scx_dispatch_q *dsq,
+ struct task_struct *p, u64 enq_flags)
{
struct rq *rq = container_of(dsq, struct rq, scx.local_dsq);
- bool preempt = false;
- call_task_dequeue(scx_root, rq, p, 0);
+ call_task_dequeue(sch, rq, p, 0);
+
+ /*
+ * Note that @rq's lock may be dropped between this enqueue and @p
+ * actually getting on CPU. This gives higher-class tasks (e.g. RT)
+ * an opportunity to wake up on @rq and prevent @p from running.
+ * Here are some concrete examples:
+ *
+ * Example 1:
+ *
+ * We dispatch two tasks from a single ops.dispatch():
+ * - First, a local task to this CPU's local DSQ;
+ * - Second, a local/remote task to a remote CPU's local DSQ.
+ * We must drop the local rq lock in order to finish the second
+ * dispatch. In that time, an RT task can wake up on the local rq.
+ *
+ * Example 2:
+ *
+ * We dispatch a local/remote task to a remote CPU's local DSQ.
+ * We must drop the remote rq lock before the dispatched task can run,
+ * which gives an RT task an opportunity to wake up on the remote rq.
+ *
+ * Both examples work the same if we replace dispatching with moving
+ * the tasks from a user-created DSQ.
+ *
+ * We must detect these wakeups so that we can re-enqueue IMMED tasks
+ * from @rq's local DSQ. scx_wakeup_preempt() serves exactly this
+ * purpose, but for it to be invoked, we must ensure that we bump
+ * @rq->next_class to &ext_sched_class if it's currently idle.
+ *
+ * wakeup_preempt() does the bumping, and since we only invoke it if
+ * @rq->next_class is below &ext_sched_class, it will also
+ * resched_curr(rq).
+ */
+ if (sched_class_above(p->sched_class, rq->next_class))
+ wakeup_preempt(rq, p, 0);
/*
* If @rq is in balance, the CPU is already vacant and looking for the
* next task to run. No need to preempt or trigger resched after moving
* @p into its local DSQ.
+ * Note that the wakeup_preempt() above may have already triggered
+ * a resched if @rq->next_class was idle. It's harmless, since
+ * need_resched is cleared immediately after task pick.
*/
if (rq->scx.flags & SCX_RQ_IN_BALANCE)
return;
@@ -1407,11 +1454,8 @@ static void local_dsq_post_enq(struct scx_dispatch_q *dsq, struct task_struct *p
if ((enq_flags & SCX_ENQ_PREEMPT) && p != rq->curr &&
rq->curr->sched_class == &ext_sched_class) {
rq->curr->scx.slice = 0;
- preempt = true;
- }
-
- if (preempt || sched_class_above(&ext_sched_class, rq->curr->sched_class))
resched_curr(rq);
+ }
}
static void dispatch_enqueue(struct scx_sched *sch, struct rq *rq,
@@ -1494,11 +1538,13 @@ static void dispatch_enqueue(struct scx_sched *sch, struct rq *rq,
if (!(dsq->id & SCX_DSQ_FLAG_BUILTIN))
rcu_assign_pointer(dsq->first_task, p);
} else {
- bool was_empty;
-
- was_empty = list_empty(&dsq->list);
+ /*
+ * dsq->list can contain parked BPF iterator cursors, so
+ * list_empty() here isn't a reliable proxy for "no real
+ * task in the DSQ". Test dsq->first_task directly.
+ */
list_add_tail(&p->scx.dsq_list.node, &dsq->list);
- if (was_empty && !(dsq->id & SCX_DSQ_FLAG_BUILTIN))
+ if (!dsq->first_task && !(dsq->id & SCX_DSQ_FLAG_BUILTIN))
rcu_assign_pointer(dsq->first_task, p);
}
}
@@ -1518,7 +1564,7 @@ static void dispatch_enqueue(struct scx_sched *sch, struct rq *rq,
* concurrently in a non-atomic way.
*/
if (is_local) {
- local_dsq_post_enq(dsq, p, enq_flags);
+ local_dsq_post_enq(sch, dsq, p, enq_flags);
} else {
/*
* Task on global/bypass DSQ: leave custody, task on
@@ -2129,7 +2175,8 @@ static void wakeup_preempt_scx(struct rq *rq, struct task_struct *p, int wake_fl
schedule_reenq_local(rq, 0);
}
-static void move_local_task_to_local_dsq(struct task_struct *p, u64 enq_flags,
+static void move_local_task_to_local_dsq(struct scx_sched *sch,
+ struct task_struct *p, u64 enq_flags,
struct scx_dispatch_q *src_dsq,
struct rq *dst_rq)
{
@@ -2149,7 +2196,7 @@ static void move_local_task_to_local_dsq(struct task_struct *p, u64 enq_flags,
dsq_inc_nr(dst_dsq, p, enq_flags);
p->scx.dsq = dst_dsq;
- local_dsq_post_enq(dst_dsq, p, enq_flags);
+ local_dsq_post_enq(sch, dst_dsq, p, enq_flags);
}
/**
@@ -2370,7 +2417,7 @@ static struct rq *move_task_between_dsqs(struct scx_sched *sch,
/* @p is going from a non-local DSQ to a local DSQ */
if (src_rq == dst_rq) {
task_unlink_from_dsq(p, src_dsq);
- move_local_task_to_local_dsq(p, enq_flags,
+ move_local_task_to_local_dsq(sch, p, enq_flags,
src_dsq, dst_rq);
raw_spin_unlock(&src_dsq->lock);
} else {
@@ -2423,7 +2470,7 @@ retry:
if (rq == task_rq) {
task_unlink_from_dsq(p, dsq);
- move_local_task_to_local_dsq(p, enq_flags, dsq, rq);
+ move_local_task_to_local_dsq(sch, p, enq_flags, dsq, rq);
raw_spin_unlock(&dsq->lock);
return true;
}
@@ -3183,7 +3230,7 @@ bool scx_prio_less(const struct task_struct *a, const struct task_struct *b,
if (sch_a == sch_b && SCX_HAS_OP(sch_a, core_sched_before) &&
!scx_bypassing(sch_a, task_cpu(a)))
return SCX_CALL_OP_2TASKS_RET(sch_a, core_sched_before,
- NULL,
+ task_rq(a),
(struct task_struct *)a,
(struct task_struct *)b);
else
@@ -3631,6 +3678,22 @@ static void __scx_disable_and_exit_task(struct scx_sched *sch,
SCX_CALL_OP_TASK(sch, exit_task, task_rq(p), p, &args);
}
+/*
+ * Undo a completed __scx_init_task(sch, p, false) when scx_enable_task() never
+ * ran. The task state has not been transitioned, so this mirrors the
+ * SCX_TASK_INIT branch in __scx_disable_and_exit_task().
+ */
+static void scx_sub_init_cancel_task(struct scx_sched *sch, struct task_struct *p)
+{
+ struct scx_exit_task_args args = { .cancelled = true };
+
+ lockdep_assert_held(&p->pi_lock);
+ lockdep_assert_rq_held(task_rq(p));
+
+ if (SCX_HAS_OP(sch, exit_task))
+ SCX_CALL_OP_TASK(sch, exit_task, task_rq(p), p, &args);
+}
+
static void scx_disable_and_exit_task(struct scx_sched *sch,
struct task_struct *p)
{
@@ -3639,11 +3702,12 @@ static void scx_disable_and_exit_task(struct scx_sched *sch,
/*
* If set, @p exited between __scx_init_task() and scx_enable_task() in
* scx_sub_enable() and is initialized for both the associated sched and
- * its parent. Disable and exit for the child too.
+ * its parent. Exit for the child too - scx_enable_task() never ran for
+ * it, so undo only init_task.
*/
- if ((p->scx.flags & SCX_TASK_SUB_INIT) &&
- !WARN_ON_ONCE(!scx_enabling_sub_sched)) {
- __scx_disable_and_exit_task(scx_enabling_sub_sched, p);
+ if (p->scx.flags & SCX_TASK_SUB_INIT) {
+ if (!WARN_ON_ONCE(!scx_enabling_sub_sched))
+ scx_sub_init_cancel_task(scx_enabling_sub_sched, p);
p->scx.flags &= ~SCX_TASK_SUB_INIT;
}
@@ -4324,9 +4388,10 @@ void scx_cgroup_cancel_attach(struct cgroup_taskset *tset)
void scx_group_set_weight(struct task_group *tg, unsigned long weight)
{
- struct scx_sched *sch = scx_root;
+ struct scx_sched *sch;
percpu_down_read(&scx_cgroup_ops_rwsem);
+ sch = scx_root;
if (scx_cgroup_enabled && SCX_HAS_OP(sch, cgroup_set_weight) &&
tg->scx.weight != weight)
@@ -4339,9 +4404,10 @@ void scx_group_set_weight(struct task_group *tg, unsigned long weight)
void scx_group_set_idle(struct task_group *tg, bool idle)
{
- struct scx_sched *sch = scx_root;
+ struct scx_sched *sch;
percpu_down_read(&scx_cgroup_ops_rwsem);
+ sch = scx_root;
if (scx_cgroup_enabled && SCX_HAS_OP(sch, cgroup_set_idle))
SCX_CALL_OP(sch, cgroup_set_idle, NULL, tg_cgrp(tg), idle);
@@ -4355,9 +4421,10 @@ void scx_group_set_idle(struct task_group *tg, bool idle)
void scx_group_set_bandwidth(struct task_group *tg,
u64 period_us, u64 quota_us, u64 burst_us)
{
- struct scx_sched *sch = scx_root;
+ struct scx_sched *sch;
percpu_down_read(&scx_cgroup_ops_rwsem);
+ sch = scx_root;
if (scx_cgroup_enabled && SCX_HAS_OP(sch, cgroup_set_bandwidth) &&
(tg->scx.bw_period_us != period_us ||
@@ -4380,21 +4447,6 @@ static struct cgroup *root_cgroup(void)
return &cgrp_dfl_root.cgrp;
}
-static struct cgroup *sch_cgroup(struct scx_sched *sch)
-{
- return sch->cgrp;
-}
-
-/* for each descendant of @cgrp including self, set ->scx_sched to @sch */
-static void set_cgroup_sched(struct cgroup *cgrp, struct scx_sched *sch)
-{
- struct cgroup *pos;
- struct cgroup_subsys_state *css;
-
- cgroup_for_each_live_descendant_pre(pos, css, cgrp)
- rcu_assign_pointer(pos->scx_sched, sch);
-}
-
static void scx_cgroup_lock(void)
{
#ifdef CONFIG_EXT_GROUP_SCHED
@@ -4412,12 +4464,30 @@ static void scx_cgroup_unlock(void)
}
#else /* CONFIG_EXT_GROUP_SCHED || CONFIG_EXT_SUB_SCHED */
static struct cgroup *root_cgroup(void) { return NULL; }
-static struct cgroup *sch_cgroup(struct scx_sched *sch) { return NULL; }
-static void set_cgroup_sched(struct cgroup *cgrp, struct scx_sched *sch) {}
static void scx_cgroup_lock(void) {}
static void scx_cgroup_unlock(void) {}
#endif /* CONFIG_EXT_GROUP_SCHED || CONFIG_EXT_SUB_SCHED */
+#ifdef CONFIG_EXT_SUB_SCHED
+static struct cgroup *sch_cgroup(struct scx_sched *sch)
+{
+ return sch->cgrp;
+}
+
+/* for each descendant of @cgrp including self, set ->scx_sched to @sch */
+static void set_cgroup_sched(struct cgroup *cgrp, struct scx_sched *sch)
+{
+ struct cgroup *pos;
+ struct cgroup_subsys_state *css;
+
+ cgroup_for_each_live_descendant_pre(pos, css, cgrp)
+ rcu_assign_pointer(pos->scx_sched, sch);
+}
+#else /* CONFIG_EXT_SUB_SCHED */
+static struct cgroup *sch_cgroup(struct scx_sched *sch) { return NULL; }
+static void set_cgroup_sched(struct cgroup *cgrp, struct scx_sched *sch) {}
+#endif /* CONFIG_EXT_SUB_SCHED */
+
/*
* Omitted operations:
*
@@ -4712,6 +4782,8 @@ static void scx_sched_free_rcu_work(struct work_struct *work)
irq_work_sync(&sch->disable_irq_work);
kthread_destroy_worker(sch->helper);
timer_shutdown_sync(&sch->bypass_lb_timer);
+ free_cpumask_var(sch->bypass_lb_donee_cpumask);
+ free_cpumask_var(sch->bypass_lb_resched_cpumask);
#ifdef CONFIG_EXT_SUB_SCHED
kfree(sch->cgrp_path);
@@ -4938,6 +5010,25 @@ void scx_softlockup(u32 dur_s)
smp_processor_id(), dur_s);
}
+/*
+ * scx_hardlockup() runs from NMI and eventually calls scx_claim_exit(),
+ * which takes scx_sched_lock. scx_sched_lock isn't NMI-safe and grabbing
+ * it from NMI context can lead to deadlocks. Defer via irq_work; the
+ * disable path runs off irq_work anyway.
+ */
+static atomic_t scx_hardlockup_cpu = ATOMIC_INIT(-1);
+
+static void scx_hardlockup_irq_workfn(struct irq_work *work)
+{
+ int cpu = atomic_xchg(&scx_hardlockup_cpu, -1);
+
+ if (cpu >= 0 && handle_lockup("hard lockup - CPU %d", cpu))
+ printk_deferred(KERN_ERR "sched_ext: Hard lockup - CPU %d, disabling BPF scheduler\n",
+ cpu);
+}
+
+static DEFINE_IRQ_WORK(scx_hardlockup_irq_work, scx_hardlockup_irq_workfn);
+
/**
* scx_hardlockup - sched_ext hardlockup handler
*
@@ -4946,17 +5037,19 @@ void scx_softlockup(u32 dur_s)
* Try kicking out the current scheduler in an attempt to recover the system to
* a good state before taking more drastic actions.
*
- * Returns %true if sched_ext is enabled and abort was initiated, which may
- * resolve the reported hardlockup. %false if sched_ext is not enabled or
- * someone else already initiated abort.
+ * Queues an irq_work; the handle_lockup() call happens in IRQ context (see
+ * scx_hardlockup_irq_workfn).
+ *
+ * Returns %true if sched_ext is enabled and the work was queued, %false
+ * otherwise.
*/
bool scx_hardlockup(int cpu)
{
- if (!handle_lockup("hard lockup - CPU %d", cpu))
+ if (!rcu_access_pointer(scx_root))
return false;
- printk_deferred(KERN_ERR "sched_ext: Hard lockup - CPU %d, disabling BPF scheduler\n",
- cpu);
+ atomic_cmpxchg(&scx_hardlockup_cpu, -1, cpu);
+ irq_work_queue(&scx_hardlockup_irq_work);
return true;
}
@@ -5000,6 +5093,15 @@ resume:
if (cpumask_empty(donee_mask))
break;
+ /*
+ * If an earlier pass placed @p on @donor_dsq from a different
+ * CPU and the donee hasn't consumed it yet, @p is still on the
+ * previous CPU and task_rq(@p) != @donor_rq. @p can't be moved
+ * without its rq locked. Skip.
+ */
+ if (task_rq(p) != donor_rq)
+ continue;
+
donee = cpumask_any_and_distribute(donee_mask, p->cpus_ptr);
if (donee >= nr_cpu_ids)
continue;
@@ -5058,8 +5160,8 @@ resume:
static void bypass_lb_node(struct scx_sched *sch, int node)
{
const struct cpumask *node_mask = cpumask_of_node(node);
- struct cpumask *donee_mask = scx_bypass_lb_donee_cpumask;
- struct cpumask *resched_mask = scx_bypass_lb_resched_cpumask;
+ struct cpumask *donee_mask = sch->bypass_lb_donee_cpumask;
+ struct cpumask *resched_mask = sch->bypass_lb_resched_cpumask;
u32 nr_tasks = 0, nr_cpus = 0, nr_balanced = 0;
u32 nr_target, nr_donor_target;
u32 before_min = U32_MAX, before_max = 0;
@@ -5698,6 +5800,8 @@ static void scx_sub_disable(struct scx_sched *sch)
if (sch->ops.exit)
SCX_CALL_OP(sch, exit, NULL, sch->exit_info);
+ if (sch->sub_kset)
+ kset_unregister(sch->sub_kset);
kobject_del(&sch->kobj);
}
#else /* CONFIG_EXT_SUB_SCHED */
@@ -5829,6 +5933,10 @@ static void scx_root_disable(struct scx_sched *sch)
* could observe an object of the same name still in the hierarchy when
* the next scheduler is loaded.
*/
+#ifdef CONFIG_EXT_SUB_SCHED
+ if (sch->sub_kset)
+ kset_unregister(sch->sub_kset);
+#endif
kobject_del(&sch->kobj);
free_kick_syncs();
@@ -5921,6 +6029,25 @@ static void scx_disable(struct scx_sched *sch, enum scx_exit_kind kind)
irq_work_queue(&sch->disable_irq_work);
}
+/**
+ * scx_flush_disable_work - flush the disable work and wait for it to finish
+ * @sch: the scheduler
+ *
+ * sch->disable_work might not have been queued yet, which would make
+ * kthread_flush_work() a no-op. Sync the irq_work first to guarantee that
+ * the kthread work has been queued before waiting for it.
+ */
+static void scx_flush_disable_work(struct scx_sched *sch)
+{
+ int kind;
+
+ do {
+ irq_work_sync(&sch->disable_irq_work);
+ kthread_flush_work(&sch->disable_work);
+ kind = atomic_read(&sch->exit_kind);
+ } while (kind != SCX_EXIT_NONE && kind != SCX_EXIT_DONE);
+}
+
static void dump_newline(struct seq_buf *s)
{
trace_sched_ext_dump("");
@@ -6032,9 +6159,8 @@ static void ops_dump_exit(void)
scx_dump_data.cpu = -1;
}
-static void scx_dump_task(struct scx_sched *sch,
- struct seq_buf *s, struct scx_dump_ctx *dctx,
- struct task_struct *p, char marker)
+static void scx_dump_task(struct scx_sched *sch, struct seq_buf *s, struct scx_dump_ctx *dctx,
+ struct rq *rq, struct task_struct *p, char marker)
{
static unsigned long bt[SCX_EXIT_BT_LEN];
struct scx_sched *task_sch = scx_task_sched(p);
@@ -6075,7 +6201,7 @@ static void scx_dump_task(struct scx_sched *sch,
if (SCX_HAS_OP(sch, dump_task)) {
ops_dump_init(s, " ");
- SCX_CALL_OP(sch, dump_task, NULL, dctx, p);
+ SCX_CALL_OP(sch, dump_task, rq, dctx, p);
ops_dump_exit();
}
@@ -6199,8 +6325,7 @@ static void scx_dump_state(struct scx_sched *sch, struct scx_exit_info *ei,
used = seq_buf_used(&ns);
if (SCX_HAS_OP(sch, dump_cpu)) {
ops_dump_init(&ns, " ");
- SCX_CALL_OP(sch, dump_cpu, NULL,
- &dctx, cpu, idle);
+ SCX_CALL_OP(sch, dump_cpu, rq, &dctx, cpu, idle);
ops_dump_exit();
}
@@ -6223,11 +6348,11 @@ static void scx_dump_state(struct scx_sched *sch, struct scx_exit_info *ei,
if (rq->curr->sched_class == &ext_sched_class &&
(dump_all_tasks || scx_task_on_sched(sch, rq->curr)))
- scx_dump_task(sch, &s, &dctx, rq->curr, '*');
+ scx_dump_task(sch, &s, &dctx, rq, rq->curr, '*');
list_for_each_entry(p, &rq->scx.runnable_list, scx.runnable_node)
if (dump_all_tasks || scx_task_on_sched(sch, p))
- scx_dump_task(sch, &s, &dctx, p, ' ');
+ scx_dump_task(sch, &s, &dctx, rq, p, ' ');
next:
rq_unlock_irqrestore(rq, &rf);
}
@@ -6437,6 +6562,15 @@ static struct scx_sched *scx_alloc_and_add_sched(struct sched_ext_ops *ops,
init_irq_work(&sch->disable_irq_work, scx_disable_irq_workfn);
kthread_init_work(&sch->disable_work, scx_disable_workfn);
timer_setup(&sch->bypass_lb_timer, scx_bypass_lb_timerfn, 0);
+
+ if (!alloc_cpumask_var(&sch->bypass_lb_donee_cpumask, GFP_KERNEL)) {
+ ret = -ENOMEM;
+ goto err_stop_helper;
+ }
+ if (!alloc_cpumask_var(&sch->bypass_lb_resched_cpumask, GFP_KERNEL)) {
+ ret = -ENOMEM;
+ goto err_free_lb_cpumask;
+ }
sch->ops = *ops;
rcu_assign_pointer(ops->priv, sch);
@@ -6446,14 +6580,14 @@ static struct scx_sched *scx_alloc_and_add_sched(struct sched_ext_ops *ops,
char *buf = kzalloc(PATH_MAX, GFP_KERNEL);
if (!buf) {
ret = -ENOMEM;
- goto err_stop_helper;
+ goto err_free_lb_resched;
}
cgroup_path(cgrp, buf, PATH_MAX);
sch->cgrp_path = kstrdup(buf, GFP_KERNEL);
kfree(buf);
if (!sch->cgrp_path) {
ret = -ENOMEM;
- goto err_stop_helper;
+ goto err_free_lb_resched;
}
sch->cgrp = cgrp;
@@ -6488,10 +6622,12 @@ static struct scx_sched *scx_alloc_and_add_sched(struct sched_ext_ops *ops,
#endif /* CONFIG_EXT_SUB_SCHED */
return sch;
-#ifdef CONFIG_EXT_SUB_SCHED
+err_free_lb_resched:
+ free_cpumask_var(sch->bypass_lb_resched_cpumask);
+err_free_lb_cpumask:
+ free_cpumask_var(sch->bypass_lb_donee_cpumask);
err_stop_helper:
kthread_destroy_worker(sch->helper);
-#endif
err_free_pcpu:
for_each_possible_cpu(cpu) {
if (cpu == bypass_fail_cpu)
@@ -6510,7 +6646,7 @@ err_free_ei:
err_free_sch:
kfree(sch);
err_put_cgrp:
-#if defined(CONFIG_EXT_GROUP_SCHED) || defined(CONFIG_EXT_SUB_SCHED)
+#ifdef CONFIG_EXT_SUB_SCHED
cgroup_put(cgrp);
#endif
return ERR_PTR(ret);
@@ -6601,7 +6737,7 @@ static void scx_root_enable_workfn(struct kthread_work *work)
if (ret)
goto err_unlock;
-#if defined(CONFIG_EXT_GROUP_SCHED) || defined(CONFIG_EXT_SUB_SCHED)
+#ifdef CONFIG_EXT_SUB_SCHED
cgroup_get(cgrp);
#endif
sch = scx_alloc_and_add_sched(ops, cgrp, NULL);
@@ -6639,8 +6775,10 @@ static void scx_root_enable_workfn(struct kthread_work *work)
rcu_assign_pointer(scx_root, sch);
ret = scx_link_sched(sch);
- if (ret)
+ if (ret) {
+ cpus_read_unlock();
goto err_disable;
+ }
scx_idle_enable(ops);
@@ -6821,7 +6959,7 @@ err_disable:
* completion. sch's base reference will be put by bpf_scx_unreg().
*/
scx_error(sch, "scx_root_enable() failed (%d)", ret);
- kthread_flush_work(&sch->disable_work);
+ scx_flush_disable_work(sch);
cmd->ret = 0;
}
@@ -7072,23 +7210,30 @@ out_unlock:
abort:
put_task_struct(p);
scx_task_iter_stop(&sti);
- scx_enabling_sub_sched = NULL;
+ /*
+ * Undo __scx_init_task() for tasks we marked. scx_enable_task() never
+ * ran for @sch on them, so calling scx_disable_task() here would invoke
+ * ops.disable() without a matching ops.enable(). scx_enabling_sub_sched
+ * must stay set until SUB_INIT is cleared from every marked task -
+ * scx_disable_and_exit_task() reads it when a task exits concurrently.
+ */
scx_task_iter_start(&sti, sch->cgrp);
while ((p = scx_task_iter_next_locked(&sti))) {
if (p->scx.flags & SCX_TASK_SUB_INIT) {
- __scx_disable_and_exit_task(sch, p);
+ scx_sub_init_cancel_task(sch, p);
p->scx.flags &= ~SCX_TASK_SUB_INIT;
}
}
scx_task_iter_stop(&sti);
+ scx_enabling_sub_sched = NULL;
err_unlock_and_disable:
/* we'll soon enter disable path, keep bypass on */
scx_cgroup_unlock();
percpu_up_write(&scx_fork_rwsem);
err_disable:
mutex_unlock(&scx_enable_mutex);
- kthread_flush_work(&sch->disable_work);
+ scx_flush_disable_work(sch);
cmd->ret = 0;
}
@@ -7349,7 +7494,7 @@ static void bpf_scx_unreg(void *kdata, struct bpf_link *link)
struct scx_sched *sch = rcu_dereference_protected(ops->priv, true);
scx_disable(sch, SCX_EXIT_UNREG);
- kthread_flush_work(&sch->disable_work);
+ scx_flush_disable_work(sch);
RCU_INIT_POINTER(ops->priv, NULL);
kobject_put(&sch->kobj);
}
@@ -8033,12 +8178,22 @@ static bool scx_dsq_move(struct bpf_iter_scx_dsq_kern *kit,
struct task_struct *p, u64 dsq_id, u64 enq_flags)
{
struct scx_dispatch_q *src_dsq = kit->dsq, *dst_dsq;
- struct scx_sched *sch = src_dsq->sched;
+ struct scx_sched *sch;
struct rq *this_rq, *src_rq, *locked_rq;
bool dispatched = false;
bool in_balance;
unsigned long flags;
+ /*
+ * The verifier considers an iterator slot initialized on any
+ * KF_ITER_NEW return, so a BPF program may legally reach here after
+ * bpf_iter_scx_dsq_new() failed and left @kit->dsq NULL.
+ */
+ if (unlikely(!src_dsq))
+ return false;
+
+ sch = src_dsq->sched;
+
if (!scx_vet_enq_flags(sch, dsq_id, &enq_flags))
return false;
@@ -8526,7 +8681,7 @@ __bpf_kfunc bool scx_bpf_task_set_slice(struct task_struct *p, u64 slice,
guard(rcu)();
sch = scx_prog_sched(aux);
- if (unlikely(!scx_task_on_sched(sch, p)))
+ if (unlikely(!sch || !scx_task_on_sched(sch, p)))
return false;
p->scx.slice = slice;
@@ -8549,7 +8704,7 @@ __bpf_kfunc bool scx_bpf_task_set_dsq_vtime(struct task_struct *p, u64 vtime,
guard(rcu)();
sch = scx_prog_sched(aux);
- if (unlikely(!scx_task_on_sched(sch, p)))
+ if (unlikely(!sch || !scx_task_on_sched(sch, p)))
return false;
p->scx.dsq_vtime = vtime;
@@ -8633,11 +8788,12 @@ __bpf_kfunc void scx_bpf_kick_cpu(s32 cpu, u64 flags, const struct bpf_prog_aux
/**
* scx_bpf_dsq_nr_queued - Return the number of queued tasks
* @dsq_id: id of the DSQ
+ * @aux: implicit BPF argument to access bpf_prog_aux hidden from BPF progs
*
* Return the number of tasks in the DSQ matching @dsq_id. If not found,
* -%ENOENT is returned.
*/
-__bpf_kfunc s32 scx_bpf_dsq_nr_queued(u64 dsq_id)
+__bpf_kfunc s32 scx_bpf_dsq_nr_queued(u64 dsq_id, const struct bpf_prog_aux *aux)
{
struct scx_sched *sch;
struct scx_dispatch_q *dsq;
@@ -8645,7 +8801,7 @@ __bpf_kfunc s32 scx_bpf_dsq_nr_queued(u64 dsq_id)
preempt_disable();
- sch = rcu_dereference_sched(scx_root);
+ sch = scx_prog_sched(aux);
if (unlikely(!sch)) {
ret = -ENODEV;
goto out;
@@ -8677,21 +8833,21 @@ out:
/**
* scx_bpf_destroy_dsq - Destroy a custom DSQ
* @dsq_id: DSQ to destroy
+ * @aux: implicit BPF argument to access bpf_prog_aux hidden from BPF progs
*
* Destroy the custom DSQ identified by @dsq_id. Only DSQs created with
* scx_bpf_create_dsq() can be destroyed. The caller must ensure that the DSQ is
* empty and no further tasks are dispatched to it. Ignored if called on a DSQ
* which doesn't exist. Can be called from any online scx_ops operations.
*/
-__bpf_kfunc void scx_bpf_destroy_dsq(u64 dsq_id)
+__bpf_kfunc void scx_bpf_destroy_dsq(u64 dsq_id, const struct bpf_prog_aux *aux)
{
struct scx_sched *sch;
- rcu_read_lock();
- sch = rcu_dereference(scx_root);
+ guard(rcu)();
+ sch = scx_prog_sched(aux);
if (sch)
destroy_dsq(sch, dsq_id);
- rcu_read_unlock();
}
/**
@@ -9445,8 +9601,8 @@ BTF_KFUNCS_START(scx_kfunc_ids_any)
BTF_ID_FLAGS(func, scx_bpf_task_set_slice, KF_IMPLICIT_ARGS | KF_RCU);
BTF_ID_FLAGS(func, scx_bpf_task_set_dsq_vtime, KF_IMPLICIT_ARGS | KF_RCU);
BTF_ID_FLAGS(func, scx_bpf_kick_cpu, KF_IMPLICIT_ARGS)
-BTF_ID_FLAGS(func, scx_bpf_dsq_nr_queued)
-BTF_ID_FLAGS(func, scx_bpf_destroy_dsq)
+BTF_ID_FLAGS(func, scx_bpf_dsq_nr_queued, KF_IMPLICIT_ARGS)
+BTF_ID_FLAGS(func, scx_bpf_destroy_dsq, KF_IMPLICIT_ARGS)
BTF_ID_FLAGS(func, scx_bpf_dsq_peek, KF_IMPLICIT_ARGS | KF_RCU_PROTECTED | KF_RET_NULL)
BTF_ID_FLAGS(func, scx_bpf_dsq_reenq, KF_IMPLICIT_ARGS)
BTF_ID_FLAGS(func, scx_bpf_reenqueue_local___v2, KF_IMPLICIT_ARGS)
@@ -9479,6 +9635,7 @@ BTF_KFUNCS_END(scx_kfunc_ids_any)
static const struct btf_kfunc_id_set scx_kfunc_set_any = {
.owner = THIS_MODULE,
.set = &scx_kfunc_ids_any,
+ .filter = scx_kfunc_context_filter,
};
/*
@@ -9526,13 +9683,12 @@ static const u32 scx_kf_allow_flags[] = {
};
/*
- * Verifier-time filter for context-sensitive SCX kfuncs. Registered via the
- * .filter field on each per-group btf_kfunc_id_set. The BPF core invokes this
- * for every kfunc call in the registered hook (BPF_PROG_TYPE_STRUCT_OPS or
+ * Verifier-time filter for SCX kfuncs. Registered via the .filter field on
+ * each per-group btf_kfunc_id_set. The BPF core invokes this for every kfunc
+ * call in the registered hook (BPF_PROG_TYPE_STRUCT_OPS or
* BPF_PROG_TYPE_SYSCALL), regardless of which set originally introduced the
- * kfunc - so the filter must short-circuit on kfuncs it doesn't govern (e.g.
- * scx_kfunc_ids_any) by falling through to "allow" when none of the
- * context-sensitive sets contain the kfunc.
+ * kfunc - so the filter must short-circuit on kfuncs it doesn't govern by
+ * falling through to "allow" when none of the SCX sets contain the kfunc.
*/
int scx_kfunc_context_filter(const struct bpf_prog *prog, u32 kfunc_id)
{
@@ -9541,18 +9697,21 @@ int scx_kfunc_context_filter(const struct bpf_prog *prog, u32 kfunc_id)
bool in_enqueue = btf_id_set8_contains(&scx_kfunc_ids_enqueue_dispatch, kfunc_id);
bool in_dispatch = btf_id_set8_contains(&scx_kfunc_ids_dispatch, kfunc_id);
bool in_cpu_release = btf_id_set8_contains(&scx_kfunc_ids_cpu_release, kfunc_id);
+ bool in_idle = btf_id_set8_contains(&scx_kfunc_ids_idle, kfunc_id);
+ bool in_any = btf_id_set8_contains(&scx_kfunc_ids_any, kfunc_id);
u32 moff, flags;
- /* Not a context-sensitive kfunc (e.g. from scx_kfunc_ids_any) - allow. */
- if (!(in_unlocked || in_select_cpu || in_enqueue || in_dispatch || in_cpu_release))
+ /* Not an SCX kfunc - allow. */
+ if (!(in_unlocked || in_select_cpu || in_enqueue || in_dispatch ||
+ in_cpu_release || in_idle || in_any))
return 0;
/* SYSCALL progs (e.g. BPF test_run()) may call unlocked and select_cpu kfuncs. */
if (prog->type == BPF_PROG_TYPE_SYSCALL)
- return (in_unlocked || in_select_cpu) ? 0 : -EACCES;
+ return (in_unlocked || in_select_cpu || in_idle || in_any) ? 0 : -EACCES;
if (prog->type != BPF_PROG_TYPE_STRUCT_OPS)
- return -EACCES;
+ return (in_any || in_idle) ? 0 : -EACCES;
/*
* add_subprog_and_kfunc() collects all kfunc calls, including dead code
@@ -9565,14 +9724,15 @@ int scx_kfunc_context_filter(const struct bpf_prog *prog, u32 kfunc_id)
return 0;
/*
- * Non-SCX struct_ops: only unlocked kfuncs are safe. The other
- * context-sensitive kfuncs assume the rq lock is held by the SCX
- * dispatch path, which doesn't apply to other struct_ops users.
+ * Non-SCX struct_ops: SCX kfuncs are not permitted.
*/
if (prog->aux->st_ops != &bpf_sched_ext_ops)
- return in_unlocked ? 0 : -EACCES;
+ return -EACCES;
/* SCX struct_ops: check the per-op allow list. */
+ if (in_any || in_idle)
+ return 0;
+
moff = prog->aux->attach_st_ops_member_off;
flags = scx_kf_allow_flags[SCX_MOFF_IDX(moff)];
@@ -9656,12 +9816,6 @@ static int __init scx_init(void)
return ret;
}
- if (!alloc_cpumask_var(&scx_bypass_lb_donee_cpumask, GFP_KERNEL) ||
- !alloc_cpumask_var(&scx_bypass_lb_resched_cpumask, GFP_KERNEL)) {
- pr_err("sched_ext: Failed to allocate cpumasks\n");
- return -ENOMEM;
- }
-
return 0;
}
__initcall(scx_init);
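
The most structural change in ext.c is that SCX_CALL_OP()/SCX_CALL_OP_RET() now save the current per-CPU locked-rq pointer and restore it after the op, so an op that re-enters the dispatch path (the comment cites scx_bpf_sub_dispatch()) cannot clobber its caller's tracking state. A compact sketch of that save-around-nested-call discipline, with a thread-local pointer standing in for the per-CPU scx_locked_rq_state:

#include <assert.h>
#include <stddef.h>

static _Thread_local void *locked_ctx;          /* stand-in for scx_locked_rq_state */

/* Run op(arg) with @ctx recorded as locked; restore the outer value afterwards. */
static void call_op(void *ctx, void (*op)(void *), void *arg)
{
        void *prev = locked_ctx;                /* may belong to an outer call_op() */

        locked_ctx = ctx;
        op(arg);
        locked_ctx = prev;                      /* nested call no longer clobbers it */
}

static int inner_ran;
static void inner_op(void *arg) { (void)arg; inner_ran = 1; }

static void outer_op(void *arg)
{
        void *outer_ctx = locked_ctx;

        call_op(arg, inner_op, NULL);           /* recursion through the helper */
        assert(locked_ctx == outer_ctx);        /* outer tracking survives */
}

int main(void)
{
        int outer_rq, inner_rq;

        call_op(&outer_rq, outer_op, &inner_rq);
        assert(inner_ran && locked_ctx == NULL);
        return 0;
}
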
diff --git a/kernel/sched/ext_idle.c b/kernel/sched/ext_idle.c
index 443d12a3df67..7468560a6d80 100644
--- a/kernel/sched/ext_idle.c
+++ b/kernel/sched/ext_idle.c
@@ -927,14 +927,24 @@ static s32 select_cpu_from_kfunc(struct scx_sched *sch, struct task_struct *p,
* Accessing p->cpus_ptr / p->nr_cpus_allowed needs either @p's rq
* lock or @p's pi_lock. Three cases:
*
- * - inside ops.select_cpu(): try_to_wake_up() holds @p's pi_lock.
+ * - inside ops.select_cpu(): try_to_wake_up() holds the wake-up
+ * task's pi_lock; the wake-up task is recorded in kf_tasks[0]
+ * by SCX_CALL_OP_TASK_RET().
* - other rq-locked SCX op: scx_locked_rq() points at the held rq.
* - truly unlocked (UNLOCKED ops, SYSCALL, non-SCX struct_ops):
* nothing held, take pi_lock ourselves.
+ *
+ * In the first two cases, BPF schedulers may pass an arbitrary task
+ * that the held lock doesn't cover. Refuse those.
*/
if (this_rq()->scx.in_select_cpu) {
+ if (!scx_kf_arg_task_ok(sch, p))
+ return -EINVAL;
lockdep_assert_held(&p->pi_lock);
- } else if (!scx_locked_rq()) {
+ } else if (scx_locked_rq()) {
+ if (task_rq(p) != scx_locked_rq())
+ goto cross_task;
+ } else {
raw_spin_lock_irqsave(&p->pi_lock, irq_flags);
we_locked = true;
}
@@ -960,6 +970,11 @@ static s32 select_cpu_from_kfunc(struct scx_sched *sch, struct task_struct *p,
raw_spin_unlock_irqrestore(&p->pi_lock, irq_flags);
return cpu;
+
+cross_task:
+ scx_error(sch, "select_cpu kfunc called cross-task on %s[%d]",
+ p->comm, p->pid);
+ return -EINVAL;
}
/**
@@ -1467,6 +1482,7 @@ BTF_KFUNCS_END(scx_kfunc_ids_idle)
static const struct btf_kfunc_id_set scx_kfunc_set_idle = {
.owner = THIS_MODULE,
.set = &scx_kfunc_ids_idle,
+ .filter = scx_kfunc_context_filter,
};
/*
diff --git a/kernel/sched/ext_idle.h b/kernel/sched/ext_idle.h
index dc35f850481e..8d169d3bbdf9 100644
--- a/kernel/sched/ext_idle.h
+++ b/kernel/sched/ext_idle.h
@@ -12,6 +12,7 @@
struct sched_ext_ops;
+extern struct btf_id_set8 scx_kfunc_ids_idle;
extern struct btf_id_set8 scx_kfunc_ids_select_cpu;
void scx_idle_update_selcpu_topology(struct sched_ext_ops *ops);
diff --git a/kernel/sched/ext_internal.h b/kernel/sched/ext_internal.h
index 62ce4eaf6a3f..a075732d4430 100644
--- a/kernel/sched/ext_internal.h
+++ b/kernel/sched/ext_internal.h
@@ -1075,6 +1075,8 @@ struct scx_sched {
struct irq_work disable_irq_work;
struct kthread_work disable_work;
struct timer_list bypass_lb_timer;
+ cpumask_var_t bypass_lb_donee_cpumask;
+ cpumask_var_t bypass_lb_resched_cpumask;
struct rcu_work rcu_work;
/* all ancestors including self */
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 69361c63353a..728965851842 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -847,13 +847,19 @@ static s64 entity_lag(struct cfs_rq *cfs_rq, struct sched_entity *se, u64 avrunt
* Similarly, check that the entity didn't gain positive lag when DELAY_ZERO
* is set.
*
- * Return true if the lag has been adjusted.
+ * Return true if the vlag has been modified. Specifically:
+ *
+ * se->vlag != avg_vruntime() - se->vruntime
+ *
+ * This can be due to clamping in entity_lag() or clamping due to
+ * sched_delayed. Either way, when vlag is modified and the entity is
+ * retained, the tree needs to be adjusted.
*/
static __always_inline
bool update_entity_lag(struct cfs_rq *cfs_rq, struct sched_entity *se)
{
- s64 vlag = entity_lag(cfs_rq, se, avg_vruntime(cfs_rq));
- bool ret;
+ u64 avruntime = avg_vruntime(cfs_rq);
+ s64 vlag = entity_lag(cfs_rq, se, avruntime);
WARN_ON_ONCE(!se->on_rq);
@@ -863,10 +869,9 @@ bool update_entity_lag(struct cfs_rq *cfs_rq, struct sched_entity *se)
if (sched_feat(DELAY_ZERO))
vlag = min(vlag, 0);
}
- ret = (vlag == se->vlag);
se->vlag = vlag;
- return ret;
+ return avruntime - vlag != se->vruntime;
}
/*
@@ -1099,7 +1104,7 @@ static inline void cancel_protect_slice(struct sched_entity *se)
*
* Which allows tree pruning through eligibility.
*/
-static struct sched_entity *__pick_eevdf(struct cfs_rq *cfs_rq, bool protect)
+static struct sched_entity *pick_eevdf(struct cfs_rq *cfs_rq, bool protect)
{
struct rb_node *node = cfs_rq->tasks_timeline.rb_root.rb_node;
struct sched_entity *se = __pick_first_entity(cfs_rq);
@@ -1170,11 +1175,6 @@ found:
return best;
}
-static struct sched_entity *pick_eevdf(struct cfs_rq *cfs_rq)
-{
- return __pick_eevdf(cfs_rq, true);
-}
-
struct sched_entity *__pick_last_entity(struct cfs_rq *cfs_rq)
{
struct rb_node *last = rb_last(&cfs_rq->tasks_timeline.rb_root);
@@ -5749,11 +5749,11 @@ static int dequeue_entities(struct rq *rq, struct sched_entity *se, int flags);
* 4) do not run the "skip" process, if something else is available
*/
static struct sched_entity *
-pick_next_entity(struct rq *rq, struct cfs_rq *cfs_rq)
+pick_next_entity(struct rq *rq, struct cfs_rq *cfs_rq, bool protect)
{
struct sched_entity *se;
- se = pick_eevdf(cfs_rq);
+ se = pick_eevdf(cfs_rq, protect);
if (se->sched_delayed) {
dequeue_entities(rq, se, DEQUEUE_SLEEP | DEQUEUE_DELAYED);
/*
@@ -9027,7 +9027,7 @@ static void wakeup_preempt_fair(struct rq *rq, struct task_struct *p, int wake_f
{
enum preempt_wakeup_action preempt_action = PREEMPT_WAKEUP_PICK;
struct task_struct *donor = rq->donor;
- struct sched_entity *se = &donor->se, *pse = &p->se;
+ struct sched_entity *nse, *se = &donor->se, *pse = &p->se;
struct cfs_rq *cfs_rq = task_cfs_rq(donor);
int cse_is_idle, pse_is_idle;
@@ -9138,11 +9138,17 @@ static void wakeup_preempt_fair(struct rq *rq, struct task_struct *p, int wake_f
}
pick:
+ nse = pick_next_entity(rq, cfs_rq, preempt_action != PREEMPT_WAKEUP_SHORT);
+ /* If @p has become the most eligible task, force preemption */
+ if (nse == pse)
+ goto preempt;
+
/*
- * If @p has become the most eligible task, force preemption.
+ * Because p is enqueued, nse being null can only mean that we
+ * dequeued a delayed task.
*/
- if (__pick_eevdf(cfs_rq, preempt_action != PREEMPT_WAKEUP_SHORT) == pse)
- goto preempt;
+ if (!nse)
+ goto pick;
if (sched_feat(RUN_TO_PARITY))
update_protect_slice(cfs_rq, se);
@@ -9179,7 +9185,7 @@ again:
throttled |= check_cfs_rq_runtime(cfs_rq);
- se = pick_next_entity(rq, cfs_rq);
+ se = pick_next_entity(rq, cfs_rq, true);
if (!se)
goto again;
cfs_rq = group_cfs_rq(se);
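
The fair.c change makes update_entity_lag() report whether clamping actually altered the stored lag: after se->vlag is set, it returns true iff avruntime - vlag != se->vruntime, i.e. iff vlag no longer equals avg_vruntime() - se->vruntime. A few lines of arithmetic showing both cases; the numbers and the clamp limit are made up for illustration.

#include <assert.h>

static long long clamp_ll(long long v, long long lo, long long hi)
{
        return v < lo ? lo : (v > hi ? hi : v);
}

int main(void)
{
        long long avruntime = 1000, limit = 50;

        /* Unclamped: vlag == avruntime - vruntime, nothing was modified. */
        long long vruntime = 980;
        long long vlag = clamp_ll(avruntime - vruntime, -limit, limit);   /* 20 */
        assert((avruntime - vlag != vruntime) == 0);

        /* Clamped: stored vlag differs from the raw lag, report a change. */
        vruntime = 800;
        vlag = clamp_ll(avruntime - vruntime, -limit, limit);             /* 50, raw 200 */
        assert((avruntime - vlag != vruntime) == 1);
        return 0;
}
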
diff --git a/kernel/trace/trace_branch.c b/kernel/trace/trace_branch.c
index 6809b370e991..d1564db95a8f 100644
--- a/kernel/trace/trace_branch.c
+++ b/kernel/trace/trace_branch.c
@@ -373,10 +373,10 @@ __init static int init_annotated_branch_stats(void)
int ret;
ret = register_stat_tracer(&annotated_branch_stats);
- if (!ret) {
+ if (ret) {
printk(KERN_WARNING "Warning: could not register "
"annotated branches stats\n");
- return 1;
+ return ret;
}
return 0;
}
@@ -438,10 +438,10 @@ __init static int all_annotated_branch_stats(void)
int ret;
ret = register_stat_tracer(&all_branch_stats);
- if (!ret) {
+ if (ret) {
printk(KERN_WARNING "Warning: could not register "
"all branches stats\n");
- return 1;
+ return ret;
}
return 0;
}
diff --git a/kernel/trace/trace_probe.c b/kernel/trace/trace_probe.c
index e1c73065dae5..e0d3a0da26af 100644
--- a/kernel/trace/trace_probe.c
+++ b/kernel/trace/trace_probe.c
@@ -1523,6 +1523,12 @@ static int traceprobe_parse_probe_arg_body(const char *argv, ssize_t *size,
parg->offset = *size;
*size += parg->type->size * (parg->count ?: 1);
+ if (*size > MAX_PROBE_EVENT_SIZE) {
+ ret = -E2BIG;
+ trace_probe_log_err(ctx->offset, EVENT_TOO_BIG);
+ goto fail;
+ }
+
if (parg->count) {
len = strlen(parg->type->fmttype) + 6;
parg->fmt = kmalloc(len, GFP_KERNEL);
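
The trace_probe change keeps a running total of the argument sizes and rejects the probe once the accumulated event size would exceed the new MAX_PROBE_EVENT_SIZE cap (3072 bytes). A minimal sketch of that accumulate-then-bound check over a list of field sizes, with the parsing context stubbed away:

#include <errno.h>
#include <stddef.h>

#define MAX_EVENT_SIZE 3072     /* mirrors MAX_PROBE_EVENT_SIZE from the patch */

/* Lay out fields one after another; fail with -E2BIG once the total record
 * would grow past the cap. Returns 0 and writes offsets on success. */
static int layout_fields(const size_t *sizes, size_t n, size_t *offsets)
{
        size_t total = 0;

        for (size_t i = 0; i < n; i++) {
                offsets[i] = total;             /* field starts at the current size */
                total += sizes[i];
                if (total > MAX_EVENT_SIZE)
                        return -E2BIG;          /* same error the parser now reports */
        }
        return 0;
}
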
diff --git a/kernel/trace/trace_probe.h b/kernel/trace/trace_probe.h
index 9fc56c937130..262d8707a3df 100644
--- a/kernel/trace/trace_probe.h
+++ b/kernel/trace/trace_probe.h
@@ -38,6 +38,7 @@
#define MAX_BTF_ARGS_LEN 128
#define MAX_DENTRY_ARGS_LEN 256
#define MAX_STRING_SIZE PATH_MAX
+#define MAX_PROBE_EVENT_SIZE 3072
/* Reserved field names */
#define FIELD_STRING_IP "__probe_ip"
@@ -561,7 +562,8 @@ extern int traceprobe_define_arg_fields(struct trace_event_call *event_call,
C(BAD_TYPE4STR, "This type does not fit for string."),\
C(NEED_STRING_TYPE, "$comm and immediate-string only accepts string type"),\
C(TOO_MANY_ARGS, "Too many arguments are specified"), \
- C(TOO_MANY_EARGS, "Too many entry arguments specified"),
+ C(TOO_MANY_EARGS, "Too many entry arguments specified"), \
+ C(EVENT_TOO_BIG, "Event too big (too many fields?)"),
#undef C
#define C(a, b) TP_ERR_##a