Diffstat (limited to 'kernel')
-rw-r--r--  kernel/cgroup/cgroup.c               | 286
-rw-r--r--  kernel/cgroup/cpuset-internal.h      |   5
-rw-r--r--  kernel/cgroup/cpuset.c               |  13
-rw-r--r--  kernel/cgroup/rdma.c                 |   2
-rw-r--r--  kernel/events/core.c                 |  70
-rw-r--r--  kernel/events/internal.h             |   1
-rw-r--r--  kernel/events/ring_buffer.c          |   2
-rw-r--r--  kernel/fork.c                        |  12
-rw-r--r--  kernel/futex/requeue.c               |  13
-rw-r--r--  kernel/liveupdate/kexec_handover.c   |  21
-rw-r--r--  kernel/liveupdate/luo_session.c      |  15
-rw-r--r--  kernel/rseq.c                        | 214
-rw-r--r--  kernel/sched/core.c                  |   1
-rw-r--r--  kernel/sched/ext.c                   | 437
-rw-r--r--  kernel/sched/ext_idle.c              |  32
-rw-r--r--  kernel/sched/ext_idle.h              |   1
-rw-r--r--  kernel/sched/ext_internal.h          |   2
-rw-r--r--  kernel/sched/fair.c                  |  82
-rw-r--r--  kernel/sched/membarrier.c            |  11
-rw-r--r--  kernel/time/timer_migration.c        |  40
-rw-r--r--  kernel/trace/trace_branch.c          |   8
-rw-r--r--  kernel/trace/trace_probe.c           |   6
-rw-r--r--  kernel/trace/trace_probe.h           |   4
-rw-r--r--  kernel/workqueue.c                   |  29
24 files changed, 847 insertions, 460 deletions
diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c
index 43adc96c7f1a..6152add0c5eb 100644
--- a/kernel/cgroup/cgroup.c
+++ b/kernel/cgroup/cgroup.c
@@ -264,10 +264,12 @@ static void cgroup_finalize_control(struct cgroup *cgrp, int ret);
static void css_task_iter_skip(struct css_task_iter *it,
struct task_struct *task);
static int cgroup_destroy_locked(struct cgroup *cgrp);
+static void cgroup_finish_destroy(struct cgroup *cgrp);
+static void kill_css_sync(struct cgroup_subsys_state *css);
+static void kill_css_finish(struct cgroup_subsys_state *css);
static struct cgroup_subsys_state *css_create(struct cgroup *cgrp,
struct cgroup_subsys *ss);
static void css_release(struct percpu_ref *ref);
-static void kill_css(struct cgroup_subsys_state *css);
static int cgroup_addrm_files(struct cgroup_subsys_state *css,
struct cgroup *cgrp, struct cftype cfts[],
bool is_add);
@@ -797,6 +799,16 @@ static void cgroup_update_populated(struct cgroup *cgrp, bool populated)
if (was_populated == cgroup_is_populated(cgrp))
break;
+ /*
+ * Subtree just emptied below an offlined cgrp. Fire deferred
+ * destroy. The transition is one-shot.
+ */
+ if (was_populated && !css_is_online(&cgrp->self)) {
+ cgroup_get(cgrp);
+ WARN_ON_ONCE(!queue_work(cgroup_offline_wq,
+ &cgrp->finish_destroy_work));
+ }
+
cgroup1_check_for_release(cgrp);
TRACE_CGROUP_PATH(notify_populated, cgrp,
cgroup_is_populated(cgrp));
@@ -2039,6 +2051,16 @@ static int cgroup_reconfigure(struct fs_context *fc)
return 0;
}
+static void cgroup_finish_destroy_work_fn(struct work_struct *work)
+{
+ struct cgroup *cgrp = container_of(work, struct cgroup, finish_destroy_work);
+
+ cgroup_lock();
+ cgroup_finish_destroy(cgrp);
+ cgroup_unlock();
+ cgroup_put(cgrp);
+}
+
static void init_cgroup_housekeeping(struct cgroup *cgrp)
{
struct cgroup_subsys *ss;
@@ -2065,7 +2087,7 @@ static void init_cgroup_housekeeping(struct cgroup *cgrp)
#endif
init_waitqueue_head(&cgrp->offline_waitq);
- init_waitqueue_head(&cgrp->dying_populated_waitq);
+ INIT_WORK(&cgrp->finish_destroy_work, cgroup_finish_destroy_work_fn);
INIT_WORK(&cgrp->release_agent_work, cgroup1_release_agent);
}
@@ -3375,7 +3397,8 @@ static void cgroup_apply_control_disable(struct cgroup *cgrp)
if (css->parent &&
!(cgroup_ss_mask(dsct) & (1 << ss->id))) {
- kill_css(css);
+ kill_css_sync(css);
+ kill_css_finish(css);
} else if (!css_visible(css)) {
css_clear_dir(css);
if (ss->css_reset)
@@ -3934,33 +3957,41 @@ static int cgroup_cpu_pressure_show(struct seq_file *seq, void *v)
static ssize_t pressure_write(struct kernfs_open_file *of, char *buf,
size_t nbytes, enum psi_res res)
{
- struct cgroup_file_ctx *ctx = of->priv;
+ struct cgroup_file_ctx *ctx;
struct psi_trigger *new;
struct cgroup *cgrp;
struct psi_group *psi;
+ ssize_t ret = 0;
cgrp = cgroup_kn_lock_live(of->kn, false);
if (!cgrp)
return -ENODEV;
- cgroup_get(cgrp);
- cgroup_kn_unlock(of->kn);
+ ctx = of->priv;
+ if (!ctx) {
+ ret = -ENODEV;
+ goto out_unlock;
+ }
/* Allow only one trigger per file descriptor */
if (ctx->psi.trigger) {
- cgroup_put(cgrp);
- return -EBUSY;
+ ret = -EBUSY;
+ goto out_unlock;
}
psi = cgroup_psi(cgrp);
new = psi_trigger_create(psi, buf, res, of->file, of);
if (IS_ERR(new)) {
- cgroup_put(cgrp);
- return PTR_ERR(new);
+ ret = PTR_ERR(new);
+ goto out_unlock;
}
smp_store_release(&ctx->psi.trigger, new);
- cgroup_put(cgrp);
+
+out_unlock:
+ cgroup_kn_unlock(of->kn);
+ if (ret)
+ return ret;
return nbytes;
}
@@ -5059,10 +5090,12 @@ repeat:
task = list_entry(it->task_pos, struct task_struct, cg_list);
/*
- * Hide tasks that are exiting but not yet removed. Keep zombie
- * leaders with live threads visible.
+ * Hide tasks that are exiting but not yet removed by default. Keep
+ * zombie leaders with live threads visible. Users that need to walk
+ * every existing task can opt out via CSS_TASK_ITER_WITH_DEAD.
*/
- if ((task->flags & PF_EXITING) && !atomic_read(&task->signal->live))
+ if (!(it->flags & CSS_TASK_ITER_WITH_DEAD) &&
+ (task->flags & PF_EXITING) && !atomic_read(&task->signal->live))
goto repeat;
if (it->flags & CSS_TASK_ITER_PROCS) {
@@ -5506,7 +5539,7 @@ static struct cftype cgroup_psi_files[] = {
* css destruction is four-stage process.
*
* 1. Destruction starts. Killing of the percpu_ref is initiated.
- * Implemented in kill_css().
+ * Implemented in kill_css_finish().
*
* 2. When the percpu_ref is confirmed to be visible as killed on all CPUs
* and thus css_tryget_online() is guaranteed to fail, the css can be
@@ -5716,16 +5749,6 @@ static void offline_css(struct cgroup_subsys_state *css)
RCU_INIT_POINTER(css->cgroup->subsys[ss->id], NULL);
wake_up_all(&css->cgroup->offline_waitq);
-
- css->cgroup->nr_dying_subsys[ss->id]++;
- /*
- * Parent css and cgroup cannot be freed until after the freeing
- * of child css, see css_free_rwork_fn().
- */
- while ((css = css->parent)) {
- css->nr_descendants--;
- css->cgroup->nr_dying_subsys[ss->id]++;
- }
}
/**
@@ -5995,7 +6018,7 @@ out_unlock:
/*
* This is called when the refcnt of a css is confirmed to be killed.
* css_tryget_online() is now guaranteed to fail. Tell the subsystem to
- * initiate destruction and put the css ref from kill_css().
+ * initiate destruction and put the css ref from kill_css_finish().
*/
static void css_killed_work_fn(struct work_struct *work)
{
@@ -6028,16 +6051,15 @@ static void css_killed_ref_fn(struct percpu_ref *ref)
}
/**
- * kill_css - destroy a css
- * @css: css to destroy
+ * kill_css_sync - synchronous half of css teardown
+ * @css: css being killed
*
- * This function initiates destruction of @css by removing cgroup interface
- * files and putting its base reference. ->css_offline() will be invoked
- * asynchronously once css_tryget_online() is guaranteed to fail and when
- * the reference count reaches zero, @css will be released.
+ * See cgroup_destroy_locked().
*/
-static void kill_css(struct cgroup_subsys_state *css)
+static void kill_css_sync(struct cgroup_subsys_state *css)
{
+ struct cgroup_subsys *ss = css->ss;
+
lockdep_assert_held(&cgroup_mutex);
if (css->flags & CSS_DYING)
@@ -6057,64 +6079,100 @@ static void kill_css(struct cgroup_subsys_state *css)
*/
css_clear_dir(css);
+ css->cgroup->nr_dying_subsys[ss->id]++;
+ /*
+ * Parent css and cgroup cannot be freed until after the freeing
+ * of child css, see css_free_rwork_fn().
+ */
+ while ((css = css->parent)) {
+ css->nr_descendants--;
+ css->cgroup->nr_dying_subsys[ss->id]++;
+ }
+}
+
+/**
+ * kill_css_finish - deferred half of css teardown
+ * @css: css being killed
+ *
+ * See cgroup_destroy_locked().
+ */
+static void kill_css_finish(struct cgroup_subsys_state *css)
+{
+ lockdep_assert_held(&cgroup_mutex);
+
+ /*
+ * Skip on re-entry: cgroup_apply_control_disable() may have killed @css
+ * earlier. cgroup_destroy_locked() can still walk it because
+ * offline_css() (which NULLs cgrp->subsys[ssid]) runs async.
+ */
+ if (percpu_ref_is_dying(&css->refcnt))
+ return;
+
/*
- * Killing would put the base ref, but we need to keep it alive
- * until after ->css_offline().
+ * Killing would put the base ref, but we need to keep it alive until
+ * after ->css_offline().
*/
css_get(css);
/*
- * cgroup core guarantees that, by the time ->css_offline() is
- * invoked, no new css reference will be given out via
- * css_tryget_online(). We can't simply call percpu_ref_kill() and
- * proceed to offlining css's because percpu_ref_kill() doesn't
- * guarantee that the ref is seen as killed on all CPUs on return.
+ * cgroup core guarantees that, by the time ->css_offline() is invoked,
+ * no new css reference will be given out via css_tryget_online(). We
+ * can't simply call percpu_ref_kill() and proceed to offlining css's
+ * because percpu_ref_kill() doesn't guarantee that the ref is seen as
+ * killed on all CPUs on return.
*
- * Use percpu_ref_kill_and_confirm() to get notifications as each
- * css is confirmed to be seen as killed on all CPUs.
+ * Use percpu_ref_kill_and_confirm() to get notifications as each css is
+ * confirmed to be seen as killed on all CPUs.
*/
percpu_ref_kill_and_confirm(&css->refcnt, css_killed_ref_fn);
}
/**
- * cgroup_destroy_locked - the first stage of cgroup destruction
+ * cgroup_destroy_locked - destroy @cgrp (called on rmdir)
* @cgrp: cgroup to be destroyed
*
- * css's make use of percpu refcnts whose killing latency shouldn't be
- * exposed to userland and are RCU protected. Also, cgroup core needs to
- * guarantee that css_tryget_online() won't succeed by the time
- * ->css_offline() is invoked. To satisfy all the requirements,
- * destruction is implemented in the following two steps.
- *
- * s1. Verify @cgrp can be destroyed and mark it dying. Remove all
- * userland visible parts and start killing the percpu refcnts of
- * css's. Set up so that the next stage will be kicked off once all
- * the percpu refcnts are confirmed to be killed.
- *
- * s2. Invoke ->css_offline(), mark the cgroup dead and proceed with the
- * rest of destruction. Once all cgroup references are gone, the
- * cgroup is RCU-freed.
- *
- * This function implements s1. After this step, @cgrp is gone as far as
- * the userland is concerned and a new cgroup with the same name may be
- * created. As cgroup doesn't care about the names internally, this
- * doesn't cause any problem.
+ * Tear down @cgrp on behalf of rmdir. Constraints:
+ *
+ * - Userspace: rmdir must succeed when cgroup.procs and friends are empty.
+ *
+ * - Kernel: subsystem ->css_offline() must not run while any task in @cgrp's
+ * subtree is still doing kernel work. A task hidden from cgroup.procs (past
+ * exit_signals() with signal->live cleared) can still schedule, allocate, and
+ * consume resources until its final context switch. Dying descendants in the
+ * subtree can host such tasks too.
+ *
+ * - Kernel: css_tryget_online() must fail by the time ->css_offline() runs.
+ *
+ * The destruction runs in three parts:
+ *
+ * - This function: synchronous user-visible state teardown plus kill_css_sync()
+ * on each subsystem css.
+ *
+ * - cgroup_finish_destroy(): kicks the percpu_ref kill via kill_css_finish() on
+ * each subsystem css. Fires once @cgrp's subtree is fully drained, either
+ * inline here or from cgroup_update_populated().
+ *
+ * - The percpu_ref kill chain: css_killed_ref_fn -> css_killed_work_fn ->
+ * ->css_offline() -> release/free.
+ *
+ * Return 0 on success, -EBUSY if a userspace-visible task or an online child
+ * remains.
*/
static int cgroup_destroy_locked(struct cgroup *cgrp)
- __releases(&cgroup_mutex) __acquires(&cgroup_mutex)
{
struct cgroup *tcgrp, *parent = cgroup_parent(cgrp);
struct cgroup_subsys_state *css;
struct cgrp_cset_link *link;
+ struct css_task_iter it;
+ struct task_struct *task;
int ssid, ret;
lockdep_assert_held(&cgroup_mutex);
- /*
- * Only migration can raise populated from zero and we're already
- * holding cgroup_mutex.
- */
- if (cgroup_is_populated(cgrp))
+ css_task_iter_start(&cgrp->self, 0, &it);
+ task = css_task_iter_next(&it);
+ css_task_iter_end(&it);
+ if (task)
return -EBUSY;
/*
@@ -6138,9 +6196,8 @@ static int cgroup_destroy_locked(struct cgroup *cgrp)
link->cset->dead = true;
spin_unlock_irq(&css_set_lock);
- /* initiate massacre of all css's */
for_each_css(css, ssid, cgrp)
- kill_css(css);
+ kill_css_sync(css);
/* clear and remove @cgrp dir, @cgrp has an extra ref on its kn */
css_clear_dir(&cgrp->self);
@@ -6171,79 +6228,27 @@ static int cgroup_destroy_locked(struct cgroup *cgrp)
/* put the base reference */
percpu_ref_kill(&cgrp->self.refcnt);
+ if (!cgroup_is_populated(cgrp))
+ cgroup_finish_destroy(cgrp);
+
return 0;
};
/**
- * cgroup_drain_dying - wait for dying tasks to leave before rmdir
- * @cgrp: the cgroup being removed
- *
- * cgroup.procs and cgroup.threads use css_task_iter which filters out
- * PF_EXITING tasks so that userspace doesn't see tasks that have already been
- * reaped via waitpid(). However, cgroup_has_tasks() - which tests whether the
- * cgroup has non-empty css_sets - is only updated when dying tasks pass through
- * cgroup_task_dead() in finish_task_switch(). This creates a window where
- * cgroup.procs reads empty but cgroup_has_tasks() is still true, making rmdir
- * fail with -EBUSY from cgroup_destroy_locked() even though userspace sees no
- * tasks.
- *
- * This function aligns cgroup_has_tasks() with what userspace can observe. If
- * cgroup_has_tasks() but the task iterator sees nothing (all remaining tasks are
- * PF_EXITING), we wait for cgroup_task_dead() to finish processing them. As the
- * window between PF_EXITING and cgroup_task_dead() is short, the wait is brief.
- *
- * This function only concerns itself with this cgroup's own dying tasks.
- * Whether the cgroup has children is cgroup_destroy_locked()'s problem.
- *
- * Each cgroup_task_dead() kicks the waitqueue via cset->cgrp_links, and we
- * retry the full check from scratch.
+ * cgroup_finish_destroy - deferred half of @cgrp destruction
+ * @cgrp: cgroup whose subtree just became empty
*
- * Must be called with cgroup_mutex held.
+ * See cgroup_destroy_locked() for the rationale.
*/
-static int cgroup_drain_dying(struct cgroup *cgrp)
- __releases(&cgroup_mutex) __acquires(&cgroup_mutex)
+static void cgroup_finish_destroy(struct cgroup *cgrp)
{
- struct css_task_iter it;
- struct task_struct *task;
- DEFINE_WAIT(wait);
+ struct cgroup_subsys_state *css;
+ int ssid;
lockdep_assert_held(&cgroup_mutex);
-retry:
- if (!cgroup_has_tasks(cgrp))
- return 0;
-
- /* Same iterator as cgroup.threads - if any task is visible, it's busy */
- css_task_iter_start(&cgrp->self, 0, &it);
- task = css_task_iter_next(&it);
- css_task_iter_end(&it);
- if (task)
- return -EBUSY;
-
- /*
- * All remaining tasks are PF_EXITING and will pass through
- * cgroup_task_dead() shortly. Wait for a kick and retry.
- *
- * cgroup_has_tasks() can't transition from false to true while we're
- * holding cgroup_mutex, but the true to false transition happens
- * under css_set_lock (via cgroup_task_dead()). We must retest and
- * prepare_to_wait() under css_set_lock. Otherwise, the transition
- * can happen between our first test and prepare_to_wait(), and we
- * sleep with no one to wake us.
- */
- spin_lock_irq(&css_set_lock);
- if (!cgroup_has_tasks(cgrp)) {
- spin_unlock_irq(&css_set_lock);
- return 0;
- }
- prepare_to_wait(&cgrp->dying_populated_waitq, &wait,
- TASK_UNINTERRUPTIBLE);
- spin_unlock_irq(&css_set_lock);
- mutex_unlock(&cgroup_mutex);
- schedule();
- finish_wait(&cgrp->dying_populated_waitq, &wait);
- mutex_lock(&cgroup_mutex);
- goto retry;
+ for_each_css(css, ssid, cgrp)
+ kill_css_finish(css);
}
int cgroup_rmdir(struct kernfs_node *kn)
@@ -6255,12 +6260,9 @@ int cgroup_rmdir(struct kernfs_node *kn)
if (!cgrp)
return 0;
- ret = cgroup_drain_dying(cgrp);
- if (!ret) {
- ret = cgroup_destroy_locked(cgrp);
- if (!ret)
- TRACE_CGROUP_PATH(rmdir, cgrp);
- }
+ ret = cgroup_destroy_locked(cgrp);
+ if (!ret)
+ TRACE_CGROUP_PATH(rmdir, cgrp);
cgroup_kn_unlock(kn);
return ret;
@@ -7020,7 +7022,6 @@ void cgroup_task_exit(struct task_struct *tsk)
static void do_cgroup_task_dead(struct task_struct *tsk)
{
- struct cgrp_cset_link *link;
struct css_set *cset;
unsigned long flags;
@@ -7034,11 +7035,6 @@ static void do_cgroup_task_dead(struct task_struct *tsk)
if (thread_group_leader(tsk) && atomic_read(&tsk->signal->live))
list_add_tail(&tsk->cg_list, &cset->dying_tasks);
- /* kick cgroup_drain_dying() waiters, see cgroup_rmdir() */
- list_for_each_entry(link, &cset->cgrp_links, cgrp_link)
- if (waitqueue_active(&link->cgrp->dying_populated_waitq))
- wake_up(&link->cgrp->dying_populated_waitq);
-
if (dl_task(tsk))
dec_dl_tasks_cs(tsk);
diff --git a/kernel/cgroup/cpuset-internal.h b/kernel/cgroup/cpuset-internal.h
index fd7d19842ded..bb4e692bea30 100644
--- a/kernel/cgroup/cpuset-internal.h
+++ b/kernel/cgroup/cpuset-internal.h
@@ -168,6 +168,11 @@ struct cpuset {
int nr_deadline_tasks;
int nr_migrate_dl_tasks;
u64 sum_migrate_dl_bw;
+ /*
+ * CPU used for temporary DL bandwidth allocation during attach;
+ * -1 if no DL bandwidth was allocated in the current attach.
+ */
+ int dl_bw_cpu;
/* Invalid partition error code, not lock protected */
enum prs_errcode prs_err;
diff --git a/kernel/cgroup/cpuset.c b/kernel/cgroup/cpuset.c
index 1335e437098e..e3a081a07c6d 100644
--- a/kernel/cgroup/cpuset.c
+++ b/kernel/cgroup/cpuset.c
@@ -288,6 +288,7 @@ struct cpuset top_cpuset = {
.flags = BIT(CS_CPU_EXCLUSIVE) |
BIT(CS_MEM_EXCLUSIVE) | BIT(CS_SCHED_LOAD_BALANCE),
.partition_root_state = PRS_ROOT,
+ .dl_bw_cpu = -1,
};
/**
@@ -579,6 +580,8 @@ static struct cpuset *dup_or_alloc_cpuset(struct cpuset *cs)
if (!trial)
return NULL;
+ trial->dl_bw_cpu = -1;
+
/* Setup cpumask pointer array */
cpumask_var_t *pmask[4] = {
&trial->cpus_allowed,
@@ -2980,6 +2983,7 @@ static void reset_migrate_dl_data(struct cpuset *cs)
{
cs->nr_migrate_dl_tasks = 0;
cs->sum_migrate_dl_bw = 0;
+ cs->dl_bw_cpu = -1;
}
/* Called by cgroups to determine if a cpuset is usable; cpuset_mutex held */
@@ -3056,6 +3060,8 @@ static int cpuset_can_attach(struct cgroup_taskset *tset)
reset_migrate_dl_data(cs);
goto out_unlock;
}
+
+ cs->dl_bw_cpu = cpu;
}
out_success:
@@ -3080,12 +3086,11 @@ static void cpuset_cancel_attach(struct cgroup_taskset *tset)
mutex_lock(&cpuset_mutex);
dec_attach_in_progress_locked(cs);
- if (cs->nr_migrate_dl_tasks) {
- int cpu = cpumask_any(cs->effective_cpus);
+ if (cs->dl_bw_cpu >= 0)
+ dl_bw_free(cs->dl_bw_cpu, cs->sum_migrate_dl_bw);
- dl_bw_free(cpu, cs->sum_migrate_dl_bw);
+ if (cs->nr_migrate_dl_tasks)
reset_migrate_dl_data(cs);
- }
mutex_unlock(&cpuset_mutex);
}
diff --git a/kernel/cgroup/rdma.c b/kernel/cgroup/rdma.c
index 9967fb25c563..4fdab4cf49e0 100644
--- a/kernel/cgroup/rdma.c
+++ b/kernel/cgroup/rdma.c
@@ -283,7 +283,7 @@ int rdmacg_try_charge(struct rdma_cgroup **rdmacg,
ret = PTR_ERR(rpool);
goto err;
} else {
- new = rpool->resources[index].usage + 1;
+ new = (s64)rpool->resources[index].usage + 1;
if (new > rpool->resources[index].max) {
ret = -EAGAIN;
goto err;
diff --git a/kernel/events/core.c b/kernel/events/core.c
index 6d1f8bad7e1c..7935d5663944 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -7006,6 +7006,7 @@ static void perf_mmap_open(struct vm_area_struct *vma)
}
static void perf_pmu_output_stop(struct perf_event *event);
+static void perf_mmap_unaccount(struct vm_area_struct *vma, struct perf_buffer *rb);
/*
* A buffer can be mmap()ed multiple times; either directly through the same
@@ -7021,8 +7022,6 @@ static void perf_mmap_close(struct vm_area_struct *vma)
mapped_f unmapped = get_mapped(event, event_unmapped);
struct perf_buffer *rb = ring_buffer_get(event);
struct user_struct *mmap_user = rb->mmap_user;
- int mmap_locked = rb->mmap_locked;
- unsigned long size = perf_data_size(rb);
bool detach_rest = false;
/* FIXIES vs perf_pmu_unregister() */
@@ -7117,11 +7116,7 @@ again:
* Aside from that, this buffer is 'fully' detached and unmapped,
* undo the VM accounting.
*/
-
- atomic_long_sub((size >> PAGE_SHIFT) + 1 - mmap_locked,
- &mmap_user->locked_vm);
- atomic64_sub(mmap_locked, &vma->vm_mm->pinned_vm);
- free_uid(mmap_user);
+ perf_mmap_unaccount(vma, rb);
out_put:
ring_buffer_put(rb); /* could be last */
@@ -7261,6 +7256,15 @@ static void perf_mmap_account(struct vm_area_struct *vma, long user_extra, long
atomic64_add(extra, &vma->vm_mm->pinned_vm);
}
+static void perf_mmap_unaccount(struct vm_area_struct *vma, struct perf_buffer *rb)
+{
+ struct user_struct *user = rb->mmap_user;
+
+ atomic_long_sub((perf_data_size(rb) >> PAGE_SHIFT) + 1 - rb->mmap_locked,
+ &user->locked_vm);
+ atomic64_sub(rb->mmap_locked, &vma->vm_mm->pinned_vm);
+}
+
static int perf_mmap_rb(struct vm_area_struct *vma, struct perf_event *event,
unsigned long nr_pages)
{
@@ -7323,8 +7327,6 @@ static int perf_mmap_rb(struct vm_area_struct *vma, struct perf_event *event,
if (!rb)
return -ENOMEM;
- refcount_set(&rb->mmap_count, 1);
- rb->mmap_user = get_current_user();
rb->mmap_locked = extra;
ring_buffer_attach(event, rb);
@@ -7474,16 +7476,54 @@ static int perf_mmap(struct file *file, struct vm_area_struct *vma)
mapped(event, vma->vm_mm);
/*
- * Try to map it into the page table. On fail, invoke
- * perf_mmap_close() to undo the above, as the callsite expects
- * full cleanup in this case and therefore does not invoke
- * vmops::close().
+ * Try to map it into the page table. On failure, undo the above,
+ * as the callsite expects full cleanup in this case and
+ * therefore does not invoke vmops::close().
*/
ret = map_range(event->rb, vma);
- if (ret)
- perf_mmap_close(vma);
+ if (likely(!ret))
+ return 0;
+
+ /* Error path */
+
+ /*
+ * If this is the first mmap(), then event->mmap_count should
+ * be stable at 1. It is only modified by:
+ * perf_mmap_{open,close}() and perf_mmap().
+ *
+ * The former are not possible because this mmap() hasn't been
+ * successful yet, and the latter is serialized by
+ * event->mmap_mutex which we still hold (note that mmap_lock
+ * is not strictly sufficient here, because the event fd can
+ * be passed to another process through trivial means like
+ * fork(), leading to concurrent mmap() from different mm).
+ *
+ * Make sure to remove event->rb before releasing
+ * event->mmap_mutex, such that any concurrent mmap() will not
+ * attempt to use this failed buffer.
+ */
+ if (refcount_read(&event->mmap_count) == 1) {
+ /*
+ * Minimal perf_mmap_close(); there can't be AUX or
+ * other events on account of this being the first.
+ */
+ mapped = get_mapped(event, event_unmapped);
+ if (mapped)
+ mapped(event, vma->vm_mm);
+ perf_mmap_unaccount(vma, event->rb);
+ ring_buffer_attach(event, NULL); /* drops last rb->refcount */
+ refcount_set(&event->mmap_count, 0);
+ return ret;
+ }
+
+ /*
+ * Otherwise this is an already existing buffer, and there is
+ * no race vs first exposure, so fall-through and call
+ * perf_mmap_close().
+ */
}
+ perf_mmap_close(vma);
return ret;
}
diff --git a/kernel/events/internal.h b/kernel/events/internal.h
index d9cc57083091..c03c4f2eea57 100644
--- a/kernel/events/internal.h
+++ b/kernel/events/internal.h
@@ -67,6 +67,7 @@ static inline void rb_free_rcu(struct rcu_head *rcu_head)
struct perf_buffer *rb;
rb = container_of(rcu_head, struct perf_buffer, rcu_head);
+ free_uid(rb->mmap_user);
rb_free(rb);
}
diff --git a/kernel/events/ring_buffer.c b/kernel/events/ring_buffer.c
index 3e7de2661417..9fe92161715e 100644
--- a/kernel/events/ring_buffer.c
+++ b/kernel/events/ring_buffer.c
@@ -340,6 +340,8 @@ ring_buffer_init(struct perf_buffer *rb, long watermark, int flags)
rb->paused = 1;
mutex_init(&rb->aux_mutex);
+ rb->mmap_user = get_current_user();
+ refcount_set(&rb->mmap_count, 1);
}
void perf_aux_output_flag(struct perf_output_handle *handle, u64 flags)
diff --git a/kernel/fork.c b/kernel/fork.c
index f1ad69c6dc2d..5f3fdfdb14c7 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -1951,9 +1951,11 @@ static void rv_task_fork(struct task_struct *p)
static bool need_futex_hash_allocate_default(u64 clone_flags)
{
- if ((clone_flags & (CLONE_THREAD | CLONE_VM)) != (CLONE_THREAD | CLONE_VM))
- return false;
- return true;
+ /*
+ * Allocate a default futex hash for any sibling that will
+ * share the parent's mm, except vfork.
+ */
+ return (clone_flags & (CLONE_VM | CLONE_VFORK)) == CLONE_VM;
}
/*
@@ -2380,10 +2382,6 @@ __latent_entropy struct task_struct *copy_process(
if (retval)
goto bad_fork_cancel_cgroup;
- /*
- * Allocate a default futex hash for the user process once the first
- * thread spawns.
- */
if (need_futex_hash_allocate_default(clone_flags)) {
retval = futex_hash_allocate_default();
if (retval)
diff --git a/kernel/futex/requeue.c b/kernel/futex/requeue.c
index d818b4d47f1b..b597cb3d17fc 100644
--- a/kernel/futex/requeue.c
+++ b/kernel/futex/requeue.c
@@ -319,8 +319,11 @@ futex_proxy_trylock_atomic(u32 __user *pifutex, struct futex_hash_bucket *hb1,
return -EINVAL;
/* Ensure that this does not race against an early wakeup */
- if (!futex_requeue_pi_prepare(top_waiter, NULL))
+ if (!futex_requeue_pi_prepare(top_waiter, NULL)) {
+ plist_del(&top_waiter->list, &hb1->chain);
+ futex_hb_waiters_dec(hb1);
return -EAGAIN;
+ }
/*
* Try to take the lock for top_waiter and set the FUTEX_WAITERS bit
@@ -722,10 +725,12 @@ int handle_early_requeue_pi_wakeup(struct futex_hash_bucket *hb,
/*
* We were woken prior to requeue by a timeout or a signal.
- * Unqueue the futex_q and determine which it was.
+ * Conditionally unqueue the futex_q and determine which it was.
*/
- plist_del(&q->list, &hb->chain);
- futex_hb_waiters_dec(hb);
+ if (!plist_node_empty(&q->list)) {
+ plist_del(&q->list, &hb->chain);
+ futex_hb_waiters_dec(hb);
+ }
/* Handle spurious wakeups gracefully */
ret = -EWOULDBLOCK;
diff --git a/kernel/liveupdate/kexec_handover.c b/kernel/liveupdate/kexec_handover.c
index 94762de1fe5f..18509d8082ea 100644
--- a/kernel/liveupdate/kexec_handover.c
+++ b/kernel/liveupdate/kexec_handover.c
@@ -762,19 +762,24 @@ int kho_add_subtree(const char *name, void *blob, size_t size)
goto out_pack;
}
- err = fdt_setprop(root_fdt, off, KHO_SUB_TREE_PROP_NAME,
- &phys, sizeof(phys));
- if (err < 0)
- goto out_pack;
+ fdt_err = fdt_setprop(root_fdt, off, KHO_SUB_TREE_PROP_NAME,
+ &phys, sizeof(phys));
+ if (fdt_err < 0)
+ goto out_del_node;
- err = fdt_setprop(root_fdt, off, KHO_SUB_TREE_SIZE_PROP_NAME,
- &size_u64, sizeof(size_u64));
- if (err < 0)
- goto out_pack;
+ fdt_err = fdt_setprop(root_fdt, off, KHO_SUB_TREE_SIZE_PROP_NAME,
+ &size_u64, sizeof(size_u64));
+ if (fdt_err < 0)
+ goto out_del_node;
WARN_ON_ONCE(kho_debugfs_blob_add(&kho_out.dbg, name, blob,
size, false));
+ err = 0;
+ goto out_pack;
+
+out_del_node:
+ fdt_del_node(root_fdt, off);
out_pack:
fdt_pack(root_fdt);
diff --git a/kernel/liveupdate/luo_session.c b/kernel/liveupdate/luo_session.c
index a3327a28fc1f..7a42385dabe2 100644
--- a/kernel/liveupdate/luo_session.c
+++ b/kernel/liveupdate/luo_session.c
@@ -514,11 +514,12 @@ int luo_session_deserialize(void)
{
struct luo_session_header *sh = &luo_session_global.incoming;
static bool is_deserialized;
- static int err;
+ static int saved_err;
+ int err;
/* If has been deserialized, always return the same error code */
if (is_deserialized)
- return err;
+ return saved_err;
is_deserialized = true;
if (!sh->active)
@@ -547,7 +548,8 @@ int luo_session_deserialize(void)
pr_warn("Failed to allocate session [%.*s] during deserialization %pe\n",
(int)sizeof(sh->ser[i].name),
sh->ser[i].name, session);
- return PTR_ERR(session);
+ err = PTR_ERR(session);
+ goto save_err;
}
err = luo_session_insert(sh, session);
@@ -555,7 +557,7 @@ int luo_session_deserialize(void)
pr_warn("Failed to insert session [%s] %pe\n",
session->name, ERR_PTR(err));
luo_session_free(session);
- return err;
+ goto save_err;
}
scoped_guard(mutex, &session->mutex) {
@@ -565,7 +567,7 @@ int luo_session_deserialize(void)
if (err) {
pr_warn("Failed to deserialize files for session [%s] %pe\n",
session->name, ERR_PTR(err));
- return err;
+ goto save_err;
}
}
@@ -574,6 +576,9 @@ int luo_session_deserialize(void)
sh->ser = NULL;
return 0;
+save_err:
+ saved_err = err;
+ return err;
}
int luo_session_serialize(void)
diff --git a/kernel/rseq.c b/kernel/rseq.c
index 38d3ef540760..e75e3a5e312c 100644
--- a/kernel/rseq.c
+++ b/kernel/rseq.c
@@ -236,11 +236,6 @@ static int __init rseq_debugfs_init(void)
}
__initcall(rseq_debugfs_init);
-static bool rseq_set_ids(struct task_struct *t, struct rseq_ids *ids, u32 node_id)
-{
- return rseq_set_ids_get_csaddr(t, ids, node_id, NULL);
-}
-
static bool rseq_handle_cs(struct task_struct *t, struct pt_regs *regs)
{
struct rseq __user *urseq = t->rseq.usrptr;
@@ -258,14 +253,16 @@ efault:
static void rseq_slowpath_update_usr(struct pt_regs *regs)
{
/*
- * Preserve rseq state and user_irq state. The generic entry code
- * clears user_irq on the way out, the non-generic entry
- * architectures are not having user_irq.
+ * Preserve has_rseq and user_irq state. The generic entry code clears
+ * user_irq on the way out; the non-generic entry architectures do not
+ * set user_irq.
*/
- const struct rseq_event evt_mask = { .has_rseq = true, .user_irq = true, };
+ const struct rseq_event evt_mask = {
+ .has_rseq = RSEQ_HAS_RSEQ_VERSION_MASK,
+ .user_irq = true,
+ };
struct task_struct *t = current;
struct rseq_ids ids;
- u32 node_id;
bool event;
if (unlikely(t->flags & PF_EXITING))
@@ -301,9 +298,9 @@ static void rseq_slowpath_update_usr(struct pt_regs *regs)
if (!event)
return;
- node_id = cpu_to_node(ids.cpu_id);
+ ids.node_id = cpu_to_node(ids.cpu_id);
- if (unlikely(!rseq_update_usr(t, regs, &ids, node_id))) {
+ if (unlikely(!rseq_update_usr(t, regs, &ids))) {
/*
* Clear the errors just in case this might survive magically, but
* leave the rest intact.
@@ -335,8 +332,9 @@ void __rseq_handle_slowpath(struct pt_regs *regs)
void __rseq_signal_deliver(int sig, struct pt_regs *regs)
{
rseq_stat_inc(rseq_stats.signal);
+
/*
- * Don't update IDs, they are handled on exit to user if
+ * Don't update IDs yet, they are handled on exit to user if
* necessary. The important thing is to abort a critical section of
* the interrupted context as after this point the instruction
* pointer in @regs points to the signal handler.
@@ -349,6 +347,13 @@ void __rseq_signal_deliver(int sig, struct pt_regs *regs)
current->rseq.event.error = 0;
force_sigsegv(sig);
}
+
+ /*
+ * In legacy mode, force the update of IDs before returning to user
+ * space to stay compatible.
+ */
+ if (!rseq_v2(current))
+ rseq_force_update();
}
/*
@@ -384,19 +389,22 @@ void rseq_syscall(struct pt_regs *regs)
static bool rseq_reset_ids(void)
{
- struct rseq_ids ids = {
- .cpu_id = RSEQ_CPU_ID_UNINITIALIZED,
- .mm_cid = 0,
- };
+ struct rseq __user *rseq = current->rseq.usrptr;
/*
* If this fails, terminate it because this leaves the kernel in
* stupid state as exit to user space will try to fixup the ids
* again.
*/
- if (rseq_set_ids(current, &ids, 0))
- return true;
+ scoped_user_rw_access(rseq, efault) {
+ unsafe_put_user(0, &rseq->cpu_id_start, efault);
+ unsafe_put_user(RSEQ_CPU_ID_UNINITIALIZED, &rseq->cpu_id, efault);
+ unsafe_put_user(0, &rseq->node_id, efault);
+ unsafe_put_user(0, &rseq->mm_cid, efault);
+ }
+ return true;
+efault:
force_sig(SIGSEGV);
return false;
}
@@ -404,70 +412,29 @@ static bool rseq_reset_ids(void)
/* The original rseq structure size (including padding) is 32 bytes. */
#define ORIG_RSEQ_SIZE 32
-/*
- * sys_rseq - setup restartable sequences for caller thread.
- */
-SYSCALL_DEFINE4(rseq, struct rseq __user *, rseq, u32, rseq_len, int, flags, u32, sig)
+static long rseq_register(struct rseq __user * rseq, u32 rseq_len, int flags, u32 sig)
{
u32 rseqfl = 0;
+ u8 version = 1;
- if (flags & RSEQ_FLAG_UNREGISTER) {
- if (flags & ~RSEQ_FLAG_UNREGISTER)
- return -EINVAL;
- /* Unregister rseq for current thread. */
- if (current->rseq.usrptr != rseq || !current->rseq.usrptr)
- return -EINVAL;
- if (rseq_len != current->rseq.len)
- return -EINVAL;
- if (current->rseq.sig != sig)
- return -EPERM;
- if (!rseq_reset_ids())
- return -EFAULT;
- rseq_reset(current);
- return 0;
- }
-
- if (unlikely(flags & ~(RSEQ_FLAG_SLICE_EXT_DEFAULT_ON)))
- return -EINVAL;
-
- if (current->rseq.usrptr) {
- /*
- * If rseq is already registered, check whether
- * the provided address differs from the prior
- * one.
- */
- if (current->rseq.usrptr != rseq || rseq_len != current->rseq.len)
- return -EINVAL;
- if (current->rseq.sig != sig)
- return -EPERM;
- /* Already registered. */
- return -EBUSY;
- }
-
- /*
- * If there was no rseq previously registered, ensure the provided rseq
- * is properly aligned, as communcated to user-space through the ELF
- * auxiliary vector AT_RSEQ_ALIGN. If rseq_len is the original rseq
- * size, the required alignment is the original struct rseq alignment.
- *
- * The rseq_len is required to be greater or equal to the original rseq
- * size. In order to be valid, rseq_len is either the original rseq size,
- * or large enough to contain all supported fields, as communicated to
- * user-space through the ELF auxiliary vector AT_RSEQ_FEATURE_SIZE.
- */
- if (rseq_len < ORIG_RSEQ_SIZE ||
- (rseq_len == ORIG_RSEQ_SIZE && !IS_ALIGNED((unsigned long)rseq, ORIG_RSEQ_SIZE)) ||
- (rseq_len != ORIG_RSEQ_SIZE && (!IS_ALIGNED((unsigned long)rseq, rseq_alloc_align()) ||
- rseq_len < offsetof(struct rseq, end))))
- return -EINVAL;
if (!access_ok(rseq, rseq_len))
return -EFAULT;
- if (IS_ENABLED(CONFIG_RSEQ_SLICE_EXTENSION)) {
- rseqfl |= RSEQ_CS_FLAG_SLICE_EXT_AVAILABLE;
- if (rseq_slice_extension_enabled() &&
- (flags & RSEQ_FLAG_SLICE_EXT_DEFAULT_ON))
- rseqfl |= RSEQ_CS_FLAG_SLICE_EXT_ENABLED;
+ /*
+ * At least the architectures which use the generic IRQ entry code enable
+ * registrations with a size greater than the original fixed-size v1
+ * @rseq_len (already validated at this point) to utilize the optimized
+ * v2 ABI mode, which also enables extended RSEQ features beyond MMCID.
+ */
+ if (IS_ENABLED(CONFIG_GENERIC_IRQ_ENTRY) && rseq_len > ORIG_RSEQ_SIZE)
+ version = 2;
+
+ if (IS_ENABLED(CONFIG_RSEQ_SLICE_EXTENSION) && version > 1) {
+ if (rseq_slice_extension_enabled()) {
+ rseqfl |= RSEQ_CS_FLAG_SLICE_EXT_AVAILABLE;
+ if (flags & RSEQ_FLAG_SLICE_EXT_DEFAULT_ON)
+ rseqfl |= RSEQ_CS_FLAG_SLICE_EXT_ENABLED;
+ }
}
scoped_user_write_access(rseq, efault) {
@@ -485,7 +452,15 @@ SYSCALL_DEFINE4(rseq, struct rseq __user *, rseq, u32, rseq_len, int, flags, u32
unsafe_put_user(RSEQ_CPU_ID_UNINITIALIZED, &rseq->cpu_id, efault);
unsafe_put_user(0U, &rseq->node_id, efault);
unsafe_put_user(0U, &rseq->mm_cid, efault);
- unsafe_put_user(0U, &rseq->slice_ctrl.all, efault);
+
+ /*
+ * All fields past mm_cid are only valid for non-legacy v2
+ * registrations.
+ */
+ if (version > 1) {
+ if (IS_ENABLED(CONFIG_RSEQ_SLICE_EXTENSION))
+ unsafe_put_user(0U, &rseq->slice_ctrl.all, efault);
+ }
}
/*
@@ -501,11 +476,10 @@ SYSCALL_DEFINE4(rseq, struct rseq __user *, rseq, u32, rseq_len, int, flags, u32
#endif
/*
- * If rseq was previously inactive, and has just been
- * registered, ensure the cpu_id_start and cpu_id fields
- * are updated before returning to user-space.
+ * Ensure the cpu_id_start and cpu_id fields are updated before
+ * returning to user-space.
*/
- current->rseq.event.has_rseq = true;
+ current->rseq.event.has_rseq = version;
rseq_force_update();
return 0;
@@ -513,6 +487,80 @@ efault:
return -EFAULT;
}
+static long rseq_unregister(struct rseq __user * rseq, u32 rseq_len, int flags, u32 sig)
+{
+ if (flags & ~RSEQ_FLAG_UNREGISTER)
+ return -EINVAL;
+ if (current->rseq.usrptr != rseq || !current->rseq.usrptr)
+ return -EINVAL;
+ if (rseq_len != current->rseq.len)
+ return -EINVAL;
+ if (current->rseq.sig != sig)
+ return -EPERM;
+ if (!rseq_reset_ids())
+ return -EFAULT;
+ rseq_reset(current);
+ return 0;
+}
+
+static long rseq_reregister(struct rseq __user * rseq, u32 rseq_len, u32 sig)
+{
+ /*
+ * If rseq is already registered, check whether the provided address
+ * differs from the prior one.
+ */
+ if (current->rseq.usrptr != rseq || rseq_len != current->rseq.len)
+ return -EINVAL;
+ if (current->rseq.sig != sig)
+ return -EPERM;
+ /* Already registered. */
+ return -EBUSY;
+}
+
+static bool rseq_length_valid(struct rseq __user *rseq, unsigned int rseq_len)
+{
+ /*
+ * Ensure the provided rseq is properly aligned, as communicated to
+ * user-space through the ELF auxiliary vector AT_RSEQ_ALIGN. If
+ * rseq_len is the original rseq size, the required alignment is the
+ * original struct rseq alignment.
+ *
+ * In order to be valid, rseq_len is either the original rseq size, or
+ * large enough to contain all supported fields, as communicated to
+ * user-space through the ELF auxiliary vector AT_RSEQ_FEATURE_SIZE.
+ */
+ if (rseq_len < ORIG_RSEQ_SIZE)
+ return false;
+
+ if (rseq_len == ORIG_RSEQ_SIZE)
+ return IS_ALIGNED((unsigned long)rseq, ORIG_RSEQ_SIZE);
+
+ return IS_ALIGNED((unsigned long)rseq, rseq_alloc_align()) &&
+ rseq_len >= offsetof(struct rseq, end);
+}
+
+#define RSEQ_FLAGS_SUPPORTED (RSEQ_FLAG_SLICE_EXT_DEFAULT_ON)
+
+/*
+ * sys_rseq - Register or unregister restartable sequences for the caller thread.
+ */
+SYSCALL_DEFINE4(rseq, struct rseq __user *, rseq, u32, rseq_len, int, flags, u32, sig)
+{
+ if (flags & RSEQ_FLAG_UNREGISTER)
+ return rseq_unregister(rseq, rseq_len, flags, sig);
+
+ if (unlikely(flags & ~RSEQ_FLAGS_SUPPORTED))
+ return -EINVAL;
+
+ if (current->rseq.usrptr)
+ return rseq_reregister(rseq, rseq_len, sig);
+
+ if (!rseq_length_valid(rseq, rseq_len))
+ return -EINVAL;
+
+ return rseq_register(rseq, rseq_len, flags, sig);
+}
+
#ifdef CONFIG_RSEQ_SLICE_EXTENSION
struct slice_timer {
struct hrtimer timer;
@@ -713,6 +761,8 @@ int rseq_slice_extension_prctl(unsigned long arg2, unsigned long arg3)
return -ENOTSUPP;
if (!current->rseq.usrptr)
return -ENXIO;
+ if (!rseq_v2(current))
+ return -ENOTSUPP;
/* No change? */
if (enable == !!current->rseq.slice.state.enabled)
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index da20fb6ea25a..b8871449d3c6 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -4458,6 +4458,7 @@ static void __sched_fork(u64 clone_flags, struct task_struct *p)
p->se.nr_migrations = 0;
p->se.vruntime = 0;
p->se.vlag = 0;
+ p->se.rel_deadline = 0;
INIT_LIST_HEAD(&p->se.group_node);
/* A delayed task cannot be in clone(). */
diff --git a/kernel/sched/ext.c b/kernel/sched/ext.c
index e426e27b6794..38d90baf78cf 100644
--- a/kernel/sched/ext.c
+++ b/kernel/sched/ext.c
@@ -32,6 +32,7 @@ static const struct rhashtable_params scx_sched_hash_params = {
.key_len = sizeof_field(struct scx_sched, ops.sub_cgroup_id),
.key_offset = offsetof(struct scx_sched, ops.sub_cgroup_id),
.head_offset = offsetof(struct scx_sched, hash_node),
+ .insecure_elasticity = true, /* inserted under scx_sched_lock */
};
static struct rhashtable scx_sched_hash;
@@ -52,8 +53,6 @@ DEFINE_STATIC_KEY_FALSE(__scx_enabled);
DEFINE_STATIC_PERCPU_RWSEM(scx_fork_rwsem);
static atomic_t scx_enable_state_var = ATOMIC_INIT(SCX_DISABLED);
static DEFINE_RAW_SPINLOCK(scx_bypass_lock);
-static cpumask_var_t scx_bypass_lb_donee_cpumask;
-static cpumask_var_t scx_bypass_lb_resched_cpumask;
static bool scx_init_task_enabled;
static bool scx_switching_all;
DEFINE_STATIC_KEY_FALSE(__scx_switched_all);
@@ -469,24 +468,35 @@ static inline void update_locked_rq(struct rq *rq)
__this_cpu_write(scx_locked_rq_state, rq);
}
-#define SCX_CALL_OP(sch, op, rq, args...) \
+/*
+ * SCX ops can recurse via scx_bpf_sub_dispatch() - the inner call must not
+ * clobber the outer's scx_locked_rq_state. Save it on entry, restore on exit.
+ */
+#define SCX_CALL_OP(sch, op, locked_rq, args...) \
do { \
- if (rq) \
- update_locked_rq(rq); \
+ struct rq *__prev_locked_rq; \
+ \
+ if (locked_rq) { \
+ __prev_locked_rq = scx_locked_rq(); \
+ update_locked_rq(locked_rq); \
+ } \
(sch)->ops.op(args); \
- if (rq) \
- update_locked_rq(NULL); \
+ if (locked_rq) \
+ update_locked_rq(__prev_locked_rq); \
} while (0)
-#define SCX_CALL_OP_RET(sch, op, rq, args...) \
+#define SCX_CALL_OP_RET(sch, op, locked_rq, args...) \
({ \
+ struct rq *__prev_locked_rq; \
__typeof__((sch)->ops.op(args)) __ret; \
\
- if (rq) \
- update_locked_rq(rq); \
+ if (locked_rq) { \
+ __prev_locked_rq = scx_locked_rq(); \
+ update_locked_rq(locked_rq); \
+ } \
__ret = (sch)->ops.op(args); \
- if (rq) \
- update_locked_rq(NULL); \
+ if (locked_rq) \
+ update_locked_rq(__prev_locked_rq); \
__ret; \
})
@@ -498,39 +508,39 @@ do { \
* those subject tasks.
*
* Every SCX_CALL_OP_TASK*() call site invokes its op with @p's rq lock held -
- * either via the @rq argument here, or (for ops.select_cpu()) via @p's pi_lock
- * held by try_to_wake_up() with rq tracking via scx_rq.in_select_cpu. So if
- * kf_tasks[] is set, @p's scheduler-protected fields are stable.
+ * either via the @locked_rq argument here, or (for ops.select_cpu()) via @p's
+ * pi_lock held by try_to_wake_up() with rq tracking via scx_rq.in_select_cpu.
+ * So if kf_tasks[] is set, @p's scheduler-protected fields are stable.
*
* kf_tasks[] can not stack, so task-based SCX ops must not nest. The
* WARN_ON_ONCE() in each macro catches a re-entry of any of the three variants
* while a previous one is still in progress.
*/
-#define SCX_CALL_OP_TASK(sch, op, rq, task, args...) \
+#define SCX_CALL_OP_TASK(sch, op, locked_rq, task, args...) \
do { \
WARN_ON_ONCE(current->scx.kf_tasks[0]); \
current->scx.kf_tasks[0] = task; \
- SCX_CALL_OP((sch), op, rq, task, ##args); \
+ SCX_CALL_OP((sch), op, locked_rq, task, ##args); \
current->scx.kf_tasks[0] = NULL; \
} while (0)
-#define SCX_CALL_OP_TASK_RET(sch, op, rq, task, args...) \
+#define SCX_CALL_OP_TASK_RET(sch, op, locked_rq, task, args...) \
({ \
__typeof__((sch)->ops.op(task, ##args)) __ret; \
WARN_ON_ONCE(current->scx.kf_tasks[0]); \
current->scx.kf_tasks[0] = task; \
- __ret = SCX_CALL_OP_RET((sch), op, rq, task, ##args); \
+ __ret = SCX_CALL_OP_RET((sch), op, locked_rq, task, ##args); \
current->scx.kf_tasks[0] = NULL; \
__ret; \
})
-#define SCX_CALL_OP_2TASKS_RET(sch, op, rq, task0, task1, args...) \
+#define SCX_CALL_OP_2TASKS_RET(sch, op, locked_rq, task0, task1, args...) \
({ \
__typeof__((sch)->ops.op(task0, task1, ##args)) __ret; \
WARN_ON_ONCE(current->scx.kf_tasks[0]); \
current->scx.kf_tasks[0] = task0; \
current->scx.kf_tasks[1] = task1; \
- __ret = SCX_CALL_OP_RET((sch), op, rq, task0, task1, ##args); \
+ __ret = SCX_CALL_OP_RET((sch), op, locked_rq, task0, task1, ##args); \
current->scx.kf_tasks[0] = NULL; \
current->scx.kf_tasks[1] = NULL; \
__ret; \
@@ -756,7 +766,8 @@ static void scx_task_iter_start(struct scx_task_iter *iter, struct cgroup *cgrp)
lockdep_assert_held(&cgroup_mutex);
iter->cgrp = cgrp;
iter->css_pos = css_next_descendant_pre(NULL, &iter->cgrp->self);
- css_task_iter_start(iter->css_pos, 0, &iter->css_iter);
+ css_task_iter_start(iter->css_pos, CSS_TASK_ITER_WITH_DEAD,
+ &iter->css_iter);
return;
}
#endif
@@ -856,7 +867,8 @@ static struct task_struct *scx_task_iter_next(struct scx_task_iter *iter)
iter->css_pos = css_next_descendant_pre(iter->css_pos,
&iter->cgrp->self);
if (iter->css_pos)
- css_task_iter_start(iter->css_pos, 0, &iter->css_iter);
+ css_task_iter_start(iter->css_pos, CSS_TASK_ITER_WITH_DEAD,
+ &iter->css_iter);
}
return NULL;
}
@@ -916,16 +928,27 @@ static struct task_struct *scx_task_iter_next_locked(struct scx_task_iter *iter)
*
* Test for idle_sched_class as only init_tasks are on it.
*/
- if (p->sched_class != &idle_sched_class)
- break;
- }
- if (!p)
- return NULL;
+ if (p->sched_class == &idle_sched_class)
+ continue;
- iter->rq = task_rq_lock(p, &iter->rf);
- iter->locked_task = p;
+ iter->rq = task_rq_lock(p, &iter->rf);
+ iter->locked_task = p;
- return p;
+ /*
+ * cgroup_task_dead() removes the dead tasks from cset->tasks
+ * after sched_ext_dead() and cgroup iteration may see tasks
+ * which already finished sched_ext_dead(). %SCX_TASK_OFF_TASKS
+ * is set by sched_ext_dead() under @p's rq lock. Test it to
+ * avoid visiting tasks which are already dead from SCX POV.
+ */
+ if (p->scx.flags & SCX_TASK_OFF_TASKS) {
+ __scx_task_iter_rq_unlock(iter);
+ continue;
+ }
+
+ return p;
+ }
+ return NULL;
}
/**
@@ -1388,18 +1411,55 @@ static void call_task_dequeue(struct scx_sched *sch, struct rq *rq,
p->scx.flags &= ~SCX_TASK_IN_CUSTODY;
}
-static void local_dsq_post_enq(struct scx_dispatch_q *dsq, struct task_struct *p,
- u64 enq_flags)
+static void local_dsq_post_enq(struct scx_sched *sch, struct scx_dispatch_q *dsq,
+ struct task_struct *p, u64 enq_flags)
{
struct rq *rq = container_of(dsq, struct rq, scx.local_dsq);
- bool preempt = false;
- call_task_dequeue(scx_root, rq, p, 0);
+ call_task_dequeue(sch, rq, p, 0);
+
+ /*
+ * Note that @rq's lock may be dropped between this enqueue and @p
+ * actually getting on CPU. This gives higher-class tasks (e.g. RT)
+ * an opportunity to wake up on @rq and prevent @p from running.
+ * Here are some concrete examples:
+ *
+ * Example 1:
+ *
+ * We dispatch two tasks from a single ops.dispatch():
+ * - First, a local task to this CPU's local DSQ;
+ * - Second, a local/remote task to a remote CPU's local DSQ.
+ * We must drop the local rq lock in order to finish the second
+ * dispatch. In that time, an RT task can wake up on the local rq.
+ *
+ * Example 2:
+ *
+ * We dispatch a local/remote task to a remote CPU's local DSQ.
+ * We must drop the remote rq lock before the dispatched task can run,
+ * which gives an RT task an opportunity to wake up on the remote rq.
+ *
+ * Both examples work the same if we replace dispatching with moving
+ * the tasks from a user-created DSQ.
+ *
+ * We must detect these wakeups so that we can re-enqueue IMMED tasks
+ * from @rq's local DSQ. scx_wakeup_preempt() serves exactly this
+ * purpose, but for it to be invoked, we must ensure that we bump
+ * @rq->next_class to &ext_sched_class if it's currently idle.
+ *
+ * wakeup_preempt() does the bumping, and since we only invoke it if
+ * @rq->next_class is below &ext_sched_class, it will also
+ * resched_curr(rq).
+ */
+ if (sched_class_above(p->sched_class, rq->next_class))
+ wakeup_preempt(rq, p, 0);
/*
* If @rq is in balance, the CPU is already vacant and looking for the
* next task to run. No need to preempt or trigger resched after moving
* @p into its local DSQ.
+ * Note that the wakeup_preempt() above may have already triggered
+ * a resched if @rq->next_class was idle. It's harmless, since
+ * need_resched is cleared immediately after task pick.
*/
if (rq->scx.flags & SCX_RQ_IN_BALANCE)
return;
@@ -1407,11 +1467,8 @@ static void local_dsq_post_enq(struct scx_dispatch_q *dsq, struct task_struct *p
if ((enq_flags & SCX_ENQ_PREEMPT) && p != rq->curr &&
rq->curr->sched_class == &ext_sched_class) {
rq->curr->scx.slice = 0;
- preempt = true;
- }
-
- if (preempt || sched_class_above(&ext_sched_class, rq->curr->sched_class))
resched_curr(rq);
+ }
}
static void dispatch_enqueue(struct scx_sched *sch, struct rq *rq,
@@ -1494,11 +1551,13 @@ static void dispatch_enqueue(struct scx_sched *sch, struct rq *rq,
if (!(dsq->id & SCX_DSQ_FLAG_BUILTIN))
rcu_assign_pointer(dsq->first_task, p);
} else {
- bool was_empty;
-
- was_empty = list_empty(&dsq->list);
+ /*
+ * dsq->list can contain parked BPF iterator cursors, so
+ * list_empty() here isn't a reliable proxy for "no real
+ * task in the DSQ". Test dsq->first_task directly.
+ */
list_add_tail(&p->scx.dsq_list.node, &dsq->list);
- if (was_empty && !(dsq->id & SCX_DSQ_FLAG_BUILTIN))
+ if (!dsq->first_task && !(dsq->id & SCX_DSQ_FLAG_BUILTIN))
rcu_assign_pointer(dsq->first_task, p);
}
}
@@ -1518,7 +1577,7 @@ static void dispatch_enqueue(struct scx_sched *sch, struct rq *rq,
* concurrently in a non-atomic way.
*/
if (is_local) {
- local_dsq_post_enq(dsq, p, enq_flags);
+ local_dsq_post_enq(sch, dsq, p, enq_flags);
} else {
/*
* Task on global/bypass DSQ: leave custody, task on
@@ -2129,7 +2188,8 @@ static void wakeup_preempt_scx(struct rq *rq, struct task_struct *p, int wake_fl
schedule_reenq_local(rq, 0);
}
-static void move_local_task_to_local_dsq(struct task_struct *p, u64 enq_flags,
+static void move_local_task_to_local_dsq(struct scx_sched *sch,
+ struct task_struct *p, u64 enq_flags,
struct scx_dispatch_q *src_dsq,
struct rq *dst_rq)
{
@@ -2149,7 +2209,7 @@ static void move_local_task_to_local_dsq(struct task_struct *p, u64 enq_flags,
dsq_inc_nr(dst_dsq, p, enq_flags);
p->scx.dsq = dst_dsq;
- local_dsq_post_enq(dst_dsq, p, enq_flags);
+ local_dsq_post_enq(sch, dst_dsq, p, enq_flags);
}
/**
@@ -2370,7 +2430,7 @@ static struct rq *move_task_between_dsqs(struct scx_sched *sch,
/* @p is going from a non-local DSQ to a local DSQ */
if (src_rq == dst_rq) {
task_unlink_from_dsq(p, src_dsq);
- move_local_task_to_local_dsq(p, enq_flags,
+ move_local_task_to_local_dsq(sch, p, enq_flags,
src_dsq, dst_rq);
raw_spin_unlock(&src_dsq->lock);
} else {
@@ -2423,7 +2483,7 @@ retry:
if (rq == task_rq) {
task_unlink_from_dsq(p, dsq);
- move_local_task_to_local_dsq(p, enq_flags, dsq, rq);
+ move_local_task_to_local_dsq(sch, p, enq_flags, dsq, rq);
raw_spin_unlock(&dsq->lock);
return true;
}
@@ -3183,7 +3243,7 @@ bool scx_prio_less(const struct task_struct *a, const struct task_struct *b,
if (sch_a == sch_b && SCX_HAS_OP(sch_a, core_sched_before) &&
!scx_bypassing(sch_a, task_cpu(a)))
return SCX_CALL_OP_2TASKS_RET(sch_a, core_sched_before,
- NULL,
+ task_rq(a),
(struct task_struct *)a,
(struct task_struct *)b);
else
@@ -3631,6 +3691,22 @@ static void __scx_disable_and_exit_task(struct scx_sched *sch,
SCX_CALL_OP_TASK(sch, exit_task, task_rq(p), p, &args);
}
+/*
+ * Undo a completed __scx_init_task(sch, p, false) when scx_enable_task() never
+ * ran. The task state has not been transitioned, so this mirrors the
+ * SCX_TASK_INIT branch in __scx_disable_and_exit_task().
+ */
+static void scx_sub_init_cancel_task(struct scx_sched *sch, struct task_struct *p)
+{
+ struct scx_exit_task_args args = { .cancelled = true };
+
+ lockdep_assert_held(&p->pi_lock);
+ lockdep_assert_rq_held(task_rq(p));
+
+ if (SCX_HAS_OP(sch, exit_task))
+ SCX_CALL_OP_TASK(sch, exit_task, task_rq(p), p, &args);
+}
+
static void scx_disable_and_exit_task(struct scx_sched *sch,
struct task_struct *p)
{
@@ -3639,11 +3715,12 @@ static void scx_disable_and_exit_task(struct scx_sched *sch,
/*
* If set, @p exited between __scx_init_task() and scx_enable_task() in
* scx_sub_enable() and is initialized for both the associated sched and
- * its parent. Disable and exit for the child too.
+ * its parent. Exit for the child too - scx_enable_task() never ran for
+ * it, so undo only init_task.
*/
- if ((p->scx.flags & SCX_TASK_SUB_INIT) &&
- !WARN_ON_ONCE(!scx_enabling_sub_sched)) {
- __scx_disable_and_exit_task(scx_enabling_sub_sched, p);
+ if (p->scx.flags & SCX_TASK_SUB_INIT) {
+ if (!WARN_ON_ONCE(!scx_enabling_sub_sched))
+ scx_sub_init_cancel_task(scx_enabling_sub_sched, p);
p->scx.flags &= ~SCX_TASK_SUB_INIT;
}
@@ -3784,6 +3861,11 @@ void sched_ext_dead(struct task_struct *p)
/*
* @p is off scx_tasks and wholly ours. scx_root_enable()'s READY ->
* ENABLED transitions can't race us. Disable ops for @p.
+ *
+ * %SCX_TASK_OFF_TASKS synchronizes against cgroup task iteration - see
+ * scx_task_iter_next_locked(). NONE tasks need no marking: cgroup
+ * iteration is only used from sub-sched paths, which require root
+ * enabled. Root enable transitions every live task to at least READY.
*/
if (scx_get_task_state(p) != SCX_TASK_NONE) {
struct rq_flags rf;
@@ -3791,6 +3873,7 @@ void sched_ext_dead(struct task_struct *p)
rq = task_rq_lock(p, &rf);
scx_disable_and_exit_task(scx_task_sched(p), p);
+ p->scx.flags |= SCX_TASK_OFF_TASKS;
task_rq_unlock(rq, p, &rf);
}
}
@@ -4324,9 +4407,10 @@ void scx_cgroup_cancel_attach(struct cgroup_taskset *tset)
void scx_group_set_weight(struct task_group *tg, unsigned long weight)
{
- struct scx_sched *sch = scx_root;
+ struct scx_sched *sch;
percpu_down_read(&scx_cgroup_ops_rwsem);
+ sch = scx_root;
if (scx_cgroup_enabled && SCX_HAS_OP(sch, cgroup_set_weight) &&
tg->scx.weight != weight)
@@ -4339,9 +4423,10 @@ void scx_group_set_weight(struct task_group *tg, unsigned long weight)
void scx_group_set_idle(struct task_group *tg, bool idle)
{
- struct scx_sched *sch = scx_root;
+ struct scx_sched *sch;
percpu_down_read(&scx_cgroup_ops_rwsem);
+ sch = scx_root;
if (scx_cgroup_enabled && SCX_HAS_OP(sch, cgroup_set_idle))
SCX_CALL_OP(sch, cgroup_set_idle, NULL, tg_cgrp(tg), idle);
@@ -4355,9 +4440,10 @@ void scx_group_set_idle(struct task_group *tg, bool idle)
void scx_group_set_bandwidth(struct task_group *tg,
u64 period_us, u64 quota_us, u64 burst_us)
{
- struct scx_sched *sch = scx_root;
+ struct scx_sched *sch;
percpu_down_read(&scx_cgroup_ops_rwsem);
+ sch = scx_root;
if (scx_cgroup_enabled && SCX_HAS_OP(sch, cgroup_set_bandwidth) &&
(tg->scx.bw_period_us != period_us ||
@@ -4380,21 +4466,6 @@ static struct cgroup *root_cgroup(void)
return &cgrp_dfl_root.cgrp;
}
-static struct cgroup *sch_cgroup(struct scx_sched *sch)
-{
- return sch->cgrp;
-}
-
-/* for each descendant of @cgrp including self, set ->scx_sched to @sch */
-static void set_cgroup_sched(struct cgroup *cgrp, struct scx_sched *sch)
-{
- struct cgroup *pos;
- struct cgroup_subsys_state *css;
-
- cgroup_for_each_live_descendant_pre(pos, css, cgrp)
- rcu_assign_pointer(pos->scx_sched, sch);
-}
-
static void scx_cgroup_lock(void)
{
#ifdef CONFIG_EXT_GROUP_SCHED
@@ -4412,12 +4483,30 @@ static void scx_cgroup_unlock(void)
}
#else /* CONFIG_EXT_GROUP_SCHED || CONFIG_EXT_SUB_SCHED */
static struct cgroup *root_cgroup(void) { return NULL; }
-static struct cgroup *sch_cgroup(struct scx_sched *sch) { return NULL; }
-static void set_cgroup_sched(struct cgroup *cgrp, struct scx_sched *sch) {}
static void scx_cgroup_lock(void) {}
static void scx_cgroup_unlock(void) {}
#endif /* CONFIG_EXT_GROUP_SCHED || CONFIG_EXT_SUB_SCHED */
+#ifdef CONFIG_EXT_SUB_SCHED
+static struct cgroup *sch_cgroup(struct scx_sched *sch)
+{
+ return sch->cgrp;
+}
+
+/* for each descendant of @cgrp including self, set ->scx_sched to @sch */
+static void set_cgroup_sched(struct cgroup *cgrp, struct scx_sched *sch)
+{
+ struct cgroup *pos;
+ struct cgroup_subsys_state *css;
+
+ cgroup_for_each_live_descendant_pre(pos, css, cgrp)
+ rcu_assign_pointer(pos->scx_sched, sch);
+}
+#else /* CONFIG_EXT_SUB_SCHED */
+static struct cgroup *sch_cgroup(struct scx_sched *sch) { return NULL; }
+static void set_cgroup_sched(struct cgroup *cgrp, struct scx_sched *sch) {}
+#endif /* CONFIG_EXT_SUB_SCHED */
+
/*
* Omitted operations:
*
@@ -4712,6 +4801,8 @@ static void scx_sched_free_rcu_work(struct work_struct *work)
irq_work_sync(&sch->disable_irq_work);
kthread_destroy_worker(sch->helper);
timer_shutdown_sync(&sch->bypass_lb_timer);
+ free_cpumask_var(sch->bypass_lb_donee_cpumask);
+ free_cpumask_var(sch->bypass_lb_resched_cpumask);
#ifdef CONFIG_EXT_SUB_SCHED
kfree(sch->cgrp_path);
@@ -4938,6 +5029,25 @@ void scx_softlockup(u32 dur_s)
smp_processor_id(), dur_s);
}
+/*
+ * scx_hardlockup() runs from NMI and eventually calls scx_claim_exit(),
+ * which takes scx_sched_lock. scx_sched_lock isn't NMI-safe and grabbing
+ * it from NMI context can lead to deadlocks. Defer via irq_work; the
+ * disable path runs off irq_work anyway.
+ */
+static atomic_t scx_hardlockup_cpu = ATOMIC_INIT(-1);
+
+static void scx_hardlockup_irq_workfn(struct irq_work *work)
+{
+ int cpu = atomic_xchg(&scx_hardlockup_cpu, -1);
+
+ if (cpu >= 0 && handle_lockup("hard lockup - CPU %d", cpu))
+ printk_deferred(KERN_ERR "sched_ext: Hard lockup - CPU %d, disabling BPF scheduler\n",
+ cpu);
+}
+
+static DEFINE_IRQ_WORK(scx_hardlockup_irq_work, scx_hardlockup_irq_workfn);
+
/**
* scx_hardlockup - sched_ext hardlockup handler
*
@@ -4946,17 +5056,19 @@ void scx_softlockup(u32 dur_s)
* Try kicking out the current scheduler in an attempt to recover the system to
* a good state before taking more drastic actions.
*
- * Returns %true if sched_ext is enabled and abort was initiated, which may
- * resolve the reported hardlockup. %false if sched_ext is not enabled or
- * someone else already initiated abort.
+ * Queues an irq_work; the handle_lockup() call happens in IRQ context (see
+ * scx_hardlockup_irq_workfn).
+ *
+ * Returns %true if sched_ext is enabled and the work was queued, %false
+ * otherwise.
*/
bool scx_hardlockup(int cpu)
{
- if (!handle_lockup("hard lockup - CPU %d", cpu))
+ if (!rcu_access_pointer(scx_root))
return false;
- printk_deferred(KERN_ERR "sched_ext: Hard lockup - CPU %d, disabling BPF scheduler\n",
- cpu);
+ atomic_cmpxchg(&scx_hardlockup_cpu, -1, cpu);
+ irq_work_queue(&scx_hardlockup_irq_work);
return true;
}
@@ -5000,6 +5112,15 @@ resume:
if (cpumask_empty(donee_mask))
break;
+ /*
+ * If an earlier pass placed @p on @donor_dsq from a different
+ * CPU and the donee hasn't consumed it yet, @p is still on the
+ * previous CPU and task_rq(@p) != @donor_rq. @p can't be moved
+ * without its rq locked. Skip.
+ */
+ if (task_rq(p) != donor_rq)
+ continue;
+
donee = cpumask_any_and_distribute(donee_mask, p->cpus_ptr);
if (donee >= nr_cpu_ids)
continue;
@@ -5058,8 +5179,8 @@ resume:
static void bypass_lb_node(struct scx_sched *sch, int node)
{
const struct cpumask *node_mask = cpumask_of_node(node);
- struct cpumask *donee_mask = scx_bypass_lb_donee_cpumask;
- struct cpumask *resched_mask = scx_bypass_lb_resched_cpumask;
+ struct cpumask *donee_mask = sch->bypass_lb_donee_cpumask;
+ struct cpumask *resched_mask = sch->bypass_lb_resched_cpumask;
u32 nr_tasks = 0, nr_cpus = 0, nr_balanced = 0;
u32 nr_target, nr_donor_target;
u32 before_min = U32_MAX, before_max = 0;
@@ -5698,6 +5819,8 @@ static void scx_sub_disable(struct scx_sched *sch)
if (sch->ops.exit)
SCX_CALL_OP(sch, exit, NULL, sch->exit_info);
+ if (sch->sub_kset)
+ kset_unregister(sch->sub_kset);
kobject_del(&sch->kobj);
}
#else /* CONFIG_EXT_SUB_SCHED */
@@ -5829,6 +5952,10 @@ static void scx_root_disable(struct scx_sched *sch)
* could observe an object of the same name still in the hierarchy when
* the next scheduler is loaded.
*/
+#ifdef CONFIG_EXT_SUB_SCHED
+ if (sch->sub_kset)
+ kset_unregister(sch->sub_kset);
+#endif
kobject_del(&sch->kobj);
free_kick_syncs();
@@ -5921,6 +6048,25 @@ static void scx_disable(struct scx_sched *sch, enum scx_exit_kind kind)
irq_work_queue(&sch->disable_irq_work);
}
+/**
+ * scx_flush_disable_work - flush the disable work and wait for it to finish
+ * @sch: the scheduler
+ *
+ * sch->disable_work might not be queued yet, which would make
+ * kthread_flush_work() a no-op. Sync the irq_work first to guarantee
+ * that the kthread work has been queued before waiting for it.
+ */
+static void scx_flush_disable_work(struct scx_sched *sch)
+{
+ int kind;
+
+ do {
+ irq_work_sync(&sch->disable_irq_work);
+ kthread_flush_work(&sch->disable_work);
+ kind = atomic_read(&sch->exit_kind);
+ } while (kind != SCX_EXIT_NONE && kind != SCX_EXIT_DONE);
+}
+
static void dump_newline(struct seq_buf *s)
{
trace_sched_ext_dump("");
@@ -6032,9 +6178,8 @@ static void ops_dump_exit(void)
scx_dump_data.cpu = -1;
}
-static void scx_dump_task(struct scx_sched *sch,
- struct seq_buf *s, struct scx_dump_ctx *dctx,
- struct task_struct *p, char marker)
+static void scx_dump_task(struct scx_sched *sch, struct seq_buf *s, struct scx_dump_ctx *dctx,
+ struct rq *rq, struct task_struct *p, char marker)
{
static unsigned long bt[SCX_EXIT_BT_LEN];
struct scx_sched *task_sch = scx_task_sched(p);
@@ -6075,7 +6220,7 @@ static void scx_dump_task(struct scx_sched *sch,
if (SCX_HAS_OP(sch, dump_task)) {
ops_dump_init(s, " ");
- SCX_CALL_OP(sch, dump_task, NULL, dctx, p);
+ SCX_CALL_OP(sch, dump_task, rq, dctx, p);
ops_dump_exit();
}
@@ -6199,8 +6344,7 @@ static void scx_dump_state(struct scx_sched *sch, struct scx_exit_info *ei,
used = seq_buf_used(&ns);
if (SCX_HAS_OP(sch, dump_cpu)) {
ops_dump_init(&ns, " ");
- SCX_CALL_OP(sch, dump_cpu, NULL,
- &dctx, cpu, idle);
+ SCX_CALL_OP(sch, dump_cpu, rq, &dctx, cpu, idle);
ops_dump_exit();
}
@@ -6223,11 +6367,11 @@ static void scx_dump_state(struct scx_sched *sch, struct scx_exit_info *ei,
if (rq->curr->sched_class == &ext_sched_class &&
(dump_all_tasks || scx_task_on_sched(sch, rq->curr)))
- scx_dump_task(sch, &s, &dctx, rq->curr, '*');
+ scx_dump_task(sch, &s, &dctx, rq, rq->curr, '*');
list_for_each_entry(p, &rq->scx.runnable_list, scx.runnable_node)
if (dump_all_tasks || scx_task_on_sched(sch, p))
- scx_dump_task(sch, &s, &dctx, p, ' ');
+ scx_dump_task(sch, &s, &dctx, rq, p, ' ');
next:
rq_unlock_irqrestore(rq, &rf);
}
@@ -6437,6 +6581,15 @@ static struct scx_sched *scx_alloc_and_add_sched(struct sched_ext_ops *ops,
init_irq_work(&sch->disable_irq_work, scx_disable_irq_workfn);
kthread_init_work(&sch->disable_work, scx_disable_workfn);
timer_setup(&sch->bypass_lb_timer, scx_bypass_lb_timerfn, 0);
+
+ if (!alloc_cpumask_var(&sch->bypass_lb_donee_cpumask, GFP_KERNEL)) {
+ ret = -ENOMEM;
+ goto err_stop_helper;
+ }
+ if (!alloc_cpumask_var(&sch->bypass_lb_resched_cpumask, GFP_KERNEL)) {
+ ret = -ENOMEM;
+ goto err_free_lb_cpumask;
+ }
sch->ops = *ops;
rcu_assign_pointer(ops->priv, sch);
@@ -6446,14 +6599,14 @@ static struct scx_sched *scx_alloc_and_add_sched(struct sched_ext_ops *ops,
char *buf = kzalloc(PATH_MAX, GFP_KERNEL);
if (!buf) {
ret = -ENOMEM;
- goto err_stop_helper;
+ goto err_free_lb_resched;
}
cgroup_path(cgrp, buf, PATH_MAX);
sch->cgrp_path = kstrdup(buf, GFP_KERNEL);
kfree(buf);
if (!sch->cgrp_path) {
ret = -ENOMEM;
- goto err_stop_helper;
+ goto err_free_lb_resched;
}
sch->cgrp = cgrp;
@@ -6488,10 +6641,12 @@ static struct scx_sched *scx_alloc_and_add_sched(struct sched_ext_ops *ops,
#endif /* CONFIG_EXT_SUB_SCHED */
return sch;
-#ifdef CONFIG_EXT_SUB_SCHED
+err_free_lb_resched:
+ free_cpumask_var(sch->bypass_lb_resched_cpumask);
+err_free_lb_cpumask:
+ free_cpumask_var(sch->bypass_lb_donee_cpumask);
err_stop_helper:
kthread_destroy_worker(sch->helper);
-#endif
err_free_pcpu:
for_each_possible_cpu(cpu) {
if (cpu == bypass_fail_cpu)
@@ -6510,7 +6665,7 @@ err_free_ei:
err_free_sch:
kfree(sch);
err_put_cgrp:
-#if defined(CONFIG_EXT_GROUP_SCHED) || defined(CONFIG_EXT_SUB_SCHED)
+#ifdef CONFIG_EXT_SUB_SCHED
cgroup_put(cgrp);
#endif
return ERR_PTR(ret);
@@ -6601,7 +6756,7 @@ static void scx_root_enable_workfn(struct kthread_work *work)
if (ret)
goto err_unlock;
-#if defined(CONFIG_EXT_GROUP_SCHED) || defined(CONFIG_EXT_SUB_SCHED)
+#ifdef CONFIG_EXT_SUB_SCHED
cgroup_get(cgrp);
#endif
sch = scx_alloc_and_add_sched(ops, cgrp, NULL);
@@ -6639,8 +6794,10 @@ static void scx_root_enable_workfn(struct kthread_work *work)
rcu_assign_pointer(scx_root, sch);
ret = scx_link_sched(sch);
- if (ret)
+ if (ret) {
+ cpus_read_unlock();
goto err_disable;
+ }
scx_idle_enable(ops);
@@ -6821,7 +6978,7 @@ err_disable:
* completion. sch's base reference will be put by bpf_scx_unreg().
*/
scx_error(sch, "scx_root_enable() failed (%d)", ret);
- kthread_flush_work(&sch->disable_work);
+ scx_flush_disable_work(sch);
cmd->ret = 0;
}
@@ -7072,23 +7229,30 @@ out_unlock:
abort:
put_task_struct(p);
scx_task_iter_stop(&sti);
- scx_enabling_sub_sched = NULL;
+ /*
+ * Undo __scx_init_task() for tasks we marked. scx_enable_task() never
+ * ran for @sch on them, so calling scx_disable_task() here would invoke
+ * ops.disable() without a matching ops.enable(). scx_enabling_sub_sched
+ * must stay set until SUB_INIT is cleared from every marked task -
+ * scx_disable_and_exit_task() reads it when a task exits concurrently.
+ */
scx_task_iter_start(&sti, sch->cgrp);
while ((p = scx_task_iter_next_locked(&sti))) {
if (p->scx.flags & SCX_TASK_SUB_INIT) {
- __scx_disable_and_exit_task(sch, p);
+ scx_sub_init_cancel_task(sch, p);
p->scx.flags &= ~SCX_TASK_SUB_INIT;
}
}
scx_task_iter_stop(&sti);
+ scx_enabling_sub_sched = NULL;
err_unlock_and_disable:
/* we'll soon enter disable path, keep bypass on */
scx_cgroup_unlock();
percpu_up_write(&scx_fork_rwsem);
err_disable:
mutex_unlock(&scx_enable_mutex);
- kthread_flush_work(&sch->disable_work);
+ scx_flush_disable_work(sch);
cmd->ret = 0;
}
@@ -7349,7 +7513,7 @@ static void bpf_scx_unreg(void *kdata, struct bpf_link *link)
struct scx_sched *sch = rcu_dereference_protected(ops->priv, true);
scx_disable(sch, SCX_EXIT_UNREG);
- kthread_flush_work(&sch->disable_work);
+ scx_flush_disable_work(sch);
RCU_INIT_POINTER(ops->priv, NULL);
kobject_put(&sch->kobj);
}
@@ -8033,12 +8197,22 @@ static bool scx_dsq_move(struct bpf_iter_scx_dsq_kern *kit,
struct task_struct *p, u64 dsq_id, u64 enq_flags)
{
struct scx_dispatch_q *src_dsq = kit->dsq, *dst_dsq;
- struct scx_sched *sch = src_dsq->sched;
+ struct scx_sched *sch;
struct rq *this_rq, *src_rq, *locked_rq;
bool dispatched = false;
bool in_balance;
unsigned long flags;
+ /*
+ * The verifier considers an iterator slot initialized on any
+ * KF_ITER_NEW return, so a BPF program may legally reach here after
+ * bpf_iter_scx_dsq_new() failed and left @kit->dsq NULL.
+ */
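+ /*
+ * Hypothetical BPF-side sketch of how this can happen (illustrative only,
+ * not part of this change):
+ *
+ *	struct bpf_iter_scx_dsq it;
+ *
+ *	bpf_iter_scx_dsq_new(&it, dsq_id, 0);		(may fail, slot still
+ *							 counts as initialized)
+ *	scx_bpf_dsq_move(&it, p, SCX_DSQ_LOCAL, 0);	(reaches here, dsq NULL)
+ */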
+ if (unlikely(!src_dsq))
+ return false;
+
+ sch = src_dsq->sched;
+
if (!scx_vet_enq_flags(sch, dsq_id, &enq_flags))
return false;
@@ -8526,7 +8700,7 @@ __bpf_kfunc bool scx_bpf_task_set_slice(struct task_struct *p, u64 slice,
guard(rcu)();
sch = scx_prog_sched(aux);
- if (unlikely(!scx_task_on_sched(sch, p)))
+ if (unlikely(!sch || !scx_task_on_sched(sch, p)))
return false;
p->scx.slice = slice;
@@ -8549,7 +8723,7 @@ __bpf_kfunc bool scx_bpf_task_set_dsq_vtime(struct task_struct *p, u64 vtime,
guard(rcu)();
sch = scx_prog_sched(aux);
- if (unlikely(!scx_task_on_sched(sch, p)))
+ if (unlikely(!sch || !scx_task_on_sched(sch, p)))
return false;
p->scx.dsq_vtime = vtime;
@@ -8633,11 +8807,12 @@ __bpf_kfunc void scx_bpf_kick_cpu(s32 cpu, u64 flags, const struct bpf_prog_aux
/**
* scx_bpf_dsq_nr_queued - Return the number of queued tasks
* @dsq_id: id of the DSQ
+ * @aux: implicit BPF argument to access bpf_prog_aux hidden from BPF progs
*
* Return the number of tasks in the DSQ matching @dsq_id. If not found,
* -%ENOENT is returned.
*/
-__bpf_kfunc s32 scx_bpf_dsq_nr_queued(u64 dsq_id)
+__bpf_kfunc s32 scx_bpf_dsq_nr_queued(u64 dsq_id, const struct bpf_prog_aux *aux)
{
struct scx_sched *sch;
struct scx_dispatch_q *dsq;
@@ -8645,7 +8820,7 @@ __bpf_kfunc s32 scx_bpf_dsq_nr_queued(u64 dsq_id)
preempt_disable();
- sch = rcu_dereference_sched(scx_root);
+ sch = scx_prog_sched(aux);
if (unlikely(!sch)) {
ret = -ENODEV;
goto out;
@@ -8677,21 +8852,21 @@ out:
/**
* scx_bpf_destroy_dsq - Destroy a custom DSQ
* @dsq_id: DSQ to destroy
+ * @aux: implicit BPF argument to access bpf_prog_aux hidden from BPF progs
*
* Destroy the custom DSQ identified by @dsq_id. Only DSQs created with
* scx_bpf_create_dsq() can be destroyed. The caller must ensure that the DSQ is
* empty and no further tasks are dispatched to it. Ignored if called on a DSQ
* which doesn't exist. Can be called from any online scx_ops operations.
*/
-__bpf_kfunc void scx_bpf_destroy_dsq(u64 dsq_id)
+__bpf_kfunc void scx_bpf_destroy_dsq(u64 dsq_id, const struct bpf_prog_aux *aux)
{
struct scx_sched *sch;
- rcu_read_lock();
- sch = rcu_dereference(scx_root);
+ guard(rcu)();
+ sch = scx_prog_sched(aux);
if (sch)
destroy_dsq(sch, dsq_id);
- rcu_read_unlock();
}
/**
@@ -9445,8 +9620,8 @@ BTF_KFUNCS_START(scx_kfunc_ids_any)
BTF_ID_FLAGS(func, scx_bpf_task_set_slice, KF_IMPLICIT_ARGS | KF_RCU);
BTF_ID_FLAGS(func, scx_bpf_task_set_dsq_vtime, KF_IMPLICIT_ARGS | KF_RCU);
BTF_ID_FLAGS(func, scx_bpf_kick_cpu, KF_IMPLICIT_ARGS)
-BTF_ID_FLAGS(func, scx_bpf_dsq_nr_queued)
-BTF_ID_FLAGS(func, scx_bpf_destroy_dsq)
+BTF_ID_FLAGS(func, scx_bpf_dsq_nr_queued, KF_IMPLICIT_ARGS)
+BTF_ID_FLAGS(func, scx_bpf_destroy_dsq, KF_IMPLICIT_ARGS)
BTF_ID_FLAGS(func, scx_bpf_dsq_peek, KF_IMPLICIT_ARGS | KF_RCU_PROTECTED | KF_RET_NULL)
BTF_ID_FLAGS(func, scx_bpf_dsq_reenq, KF_IMPLICIT_ARGS)
BTF_ID_FLAGS(func, scx_bpf_reenqueue_local___v2, KF_IMPLICIT_ARGS)
@@ -9479,6 +9654,7 @@ BTF_KFUNCS_END(scx_kfunc_ids_any)
static const struct btf_kfunc_id_set scx_kfunc_set_any = {
.owner = THIS_MODULE,
.set = &scx_kfunc_ids_any,
+ .filter = scx_kfunc_context_filter,
};
/*
@@ -9526,13 +9702,12 @@ static const u32 scx_kf_allow_flags[] = {
};
/*
- * Verifier-time filter for context-sensitive SCX kfuncs. Registered via the
- * .filter field on each per-group btf_kfunc_id_set. The BPF core invokes this
- * for every kfunc call in the registered hook (BPF_PROG_TYPE_STRUCT_OPS or
+ * Verifier-time filter for SCX kfuncs. Registered via the .filter field on
+ * each per-group btf_kfunc_id_set. The BPF core invokes this for every kfunc
+ * call in the registered hook (BPF_PROG_TYPE_STRUCT_OPS or
* BPF_PROG_TYPE_SYSCALL), regardless of which set originally introduced the
- * kfunc - so the filter must short-circuit on kfuncs it doesn't govern (e.g.
- * scx_kfunc_ids_any) by falling through to "allow" when none of the
- * context-sensitive sets contain the kfunc.
+ * kfunc - so the filter must short-circuit on kfuncs it doesn't govern by
+ * falling through to "allow" when none of the SCX sets contain the kfunc.
*/
int scx_kfunc_context_filter(const struct bpf_prog *prog, u32 kfunc_id)
{
@@ -9541,18 +9716,21 @@ int scx_kfunc_context_filter(const struct bpf_prog *prog, u32 kfunc_id)
bool in_enqueue = btf_id_set8_contains(&scx_kfunc_ids_enqueue_dispatch, kfunc_id);
bool in_dispatch = btf_id_set8_contains(&scx_kfunc_ids_dispatch, kfunc_id);
bool in_cpu_release = btf_id_set8_contains(&scx_kfunc_ids_cpu_release, kfunc_id);
+ bool in_idle = btf_id_set8_contains(&scx_kfunc_ids_idle, kfunc_id);
+ bool in_any = btf_id_set8_contains(&scx_kfunc_ids_any, kfunc_id);
u32 moff, flags;
- /* Not a context-sensitive kfunc (e.g. from scx_kfunc_ids_any) - allow. */
- if (!(in_unlocked || in_select_cpu || in_enqueue || in_dispatch || in_cpu_release))
+ /* Not an SCX kfunc - allow. */
+ if (!(in_unlocked || in_select_cpu || in_enqueue || in_dispatch ||
+ in_cpu_release || in_idle || in_any))
return 0;
/* SYSCALL progs (e.g. BPF test_run()) may call unlocked and select_cpu kfuncs. */
if (prog->type == BPF_PROG_TYPE_SYSCALL)
- return (in_unlocked || in_select_cpu) ? 0 : -EACCES;
+ return (in_unlocked || in_select_cpu || in_idle || in_any) ? 0 : -EACCES;
if (prog->type != BPF_PROG_TYPE_STRUCT_OPS)
- return -EACCES;
+ return (in_any || in_idle) ? 0 : -EACCES;
/*
* add_subprog_and_kfunc() collects all kfunc calls, including dead code
@@ -9565,14 +9743,15 @@ int scx_kfunc_context_filter(const struct bpf_prog *prog, u32 kfunc_id)
return 0;
/*
- * Non-SCX struct_ops: only unlocked kfuncs are safe. The other
- * context-sensitive kfuncs assume the rq lock is held by the SCX
- * dispatch path, which doesn't apply to other struct_ops users.
+ * Non-SCX struct_ops: SCX kfuncs are not permitted.
*/
if (prog->aux->st_ops != &bpf_sched_ext_ops)
- return in_unlocked ? 0 : -EACCES;
+ return -EACCES;
/* SCX struct_ops: check the per-op allow list. */
+ if (in_any || in_idle)
+ return 0;
+
moff = prog->aux->attach_st_ops_member_off;
flags = scx_kf_allow_flags[SCX_MOFF_IDX(moff)];
@@ -9656,12 +9835,6 @@ static int __init scx_init(void)
return ret;
}
- if (!alloc_cpumask_var(&scx_bypass_lb_donee_cpumask, GFP_KERNEL) ||
- !alloc_cpumask_var(&scx_bypass_lb_resched_cpumask, GFP_KERNEL)) {
- pr_err("sched_ext: Failed to allocate cpumasks\n");
- return -ENOMEM;
- }
-
return 0;
}
__initcall(scx_init);
diff --git a/kernel/sched/ext_idle.c b/kernel/sched/ext_idle.c
index 443d12a3df67..6e1980763270 100644
--- a/kernel/sched/ext_idle.c
+++ b/kernel/sched/ext_idle.c
@@ -466,12 +466,6 @@ s32 scx_select_cpu_dfl(struct task_struct *p, s32 prev_cpu, u64 wake_flags,
preempt_disable();
/*
- * Check whether @prev_cpu is still within the allowed set. If not,
- * we can still try selecting a nearby CPU.
- */
- is_prev_allowed = cpumask_test_cpu(prev_cpu, allowed);
-
- /*
* Determine the subset of CPUs usable by @p within @cpus_allowed.
*/
if (allowed != p->cpus_ptr) {
@@ -488,6 +482,12 @@ s32 scx_select_cpu_dfl(struct task_struct *p, s32 prev_cpu, u64 wake_flags,
}
/*
+ * Check whether @prev_cpu is still within the allowed set. If not,
+ * we can still try selecting a nearby CPU.
+ */
+ is_prev_allowed = cpumask_test_cpu(prev_cpu, allowed);
+
+ /*
* This is necessary to protect llc_cpus.
*/
rcu_read_lock();
@@ -927,14 +927,24 @@ static s32 select_cpu_from_kfunc(struct scx_sched *sch, struct task_struct *p,
* Accessing p->cpus_ptr / p->nr_cpus_allowed needs either @p's rq
* lock or @p's pi_lock. Three cases:
*
- * - inside ops.select_cpu(): try_to_wake_up() holds @p's pi_lock.
+ * - inside ops.select_cpu(): try_to_wake_up() holds the wake-up
+ * task's pi_lock; the wake-up task is recorded in kf_tasks[0]
+ * by SCX_CALL_OP_TASK_RET().
* - other rq-locked SCX op: scx_locked_rq() points at the held rq.
* - truly unlocked (UNLOCKED ops, SYSCALL, non-SCX struct_ops):
* nothing held, take pi_lock ourselves.
+ *
+ * In the first two cases, BPF schedulers may pass an arbitrary task
+ * that the held lock doesn't cover. Refuse those.
*/
if (this_rq()->scx.in_select_cpu) {
+ if (!scx_kf_arg_task_ok(sch, p))
+ return -EINVAL;
lockdep_assert_held(&p->pi_lock);
- } else if (!scx_locked_rq()) {
+ } else if (scx_locked_rq()) {
+ if (task_rq(p) != scx_locked_rq())
+ goto cross_task;
+ } else {
raw_spin_lock_irqsave(&p->pi_lock, irq_flags);
we_locked = true;
}
@@ -960,6 +970,11 @@ static s32 select_cpu_from_kfunc(struct scx_sched *sch, struct task_struct *p,
raw_spin_unlock_irqrestore(&p->pi_lock, irq_flags);
return cpu;
+
+cross_task:
+ scx_error(sch, "select_cpu kfunc called cross-task on %s[%d]",
+ p->comm, p->pid);
+ return -EINVAL;
}
/**
@@ -1467,6 +1482,7 @@ BTF_KFUNCS_END(scx_kfunc_ids_idle)
static const struct btf_kfunc_id_set scx_kfunc_set_idle = {
.owner = THIS_MODULE,
.set = &scx_kfunc_ids_idle,
+ .filter = scx_kfunc_context_filter,
};
/*
diff --git a/kernel/sched/ext_idle.h b/kernel/sched/ext_idle.h
index dc35f850481e..8d169d3bbdf9 100644
--- a/kernel/sched/ext_idle.h
+++ b/kernel/sched/ext_idle.h
@@ -12,6 +12,7 @@
struct sched_ext_ops;
+extern struct btf_id_set8 scx_kfunc_ids_idle;
extern struct btf_id_set8 scx_kfunc_ids_select_cpu;
void scx_idle_update_selcpu_topology(struct sched_ext_ops *ops);
diff --git a/kernel/sched/ext_internal.h b/kernel/sched/ext_internal.h
index 62ce4eaf6a3f..a075732d4430 100644
--- a/kernel/sched/ext_internal.h
+++ b/kernel/sched/ext_internal.h
@@ -1075,6 +1075,8 @@ struct scx_sched {
struct irq_work disable_irq_work;
struct kthread_work disable_work;
struct timer_list bypass_lb_timer;
+ cpumask_var_t bypass_lb_donee_cpumask;
+ cpumask_var_t bypass_lb_resched_cpumask;
struct rcu_work rcu_work;
/* all ancestors including self */
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 69361c63353a..3ebec186f982 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -847,13 +847,19 @@ static s64 entity_lag(struct cfs_rq *cfs_rq, struct sched_entity *se, u64 avrunt
* Similarly, check that the entity didn't gain positive lag when DELAY_ZERO
* is set.
*
- * Return true if the lag has been adjusted.
+ * Return true if the vlag has been modified. Specifically:
+ *
+ * se->vlag != avg_vruntime() - se->vruntime
+ *
+ * This can be due to clamping in entity_lag() or clamping due to
+ * sched_delayed. Either way, when vlag is modified and the entity is
+ * retained, the tree needs to be adjusted.
*/
static __always_inline
bool update_entity_lag(struct cfs_rq *cfs_rq, struct sched_entity *se)
{
- s64 vlag = entity_lag(cfs_rq, se, avg_vruntime(cfs_rq));
- bool ret;
+ u64 avruntime = avg_vruntime(cfs_rq);
+ s64 vlag = entity_lag(cfs_rq, se, avruntime);
WARN_ON_ONCE(!se->on_rq);
@@ -863,10 +869,9 @@ bool update_entity_lag(struct cfs_rq *cfs_rq, struct sched_entity *se)
if (sched_feat(DELAY_ZERO))
vlag = min(vlag, 0);
}
- ret = (vlag == se->vlag);
se->vlag = vlag;
- return ret;
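+ /* i.e. vlag != avruntime - se->vruntime, i.e. clamping changed the lag */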
+ return avruntime - vlag != se->vruntime;
}
/*
@@ -877,11 +882,11 @@ bool update_entity_lag(struct cfs_rq *cfs_rq, struct sched_entity *se)
*
* lag_i >= 0 -> V >= v_i
*
- * \Sum (v_i - v)*w_i
- * V = ------------------ + v
+ * \Sum (v_i - v0)*w_i
+ * V = ------------------- + v0
* \Sum w_i
*
- * lag_i >= 0 -> \Sum (v_i - v)*w_i >= (v_i - v)*(\Sum w_i)
+ * lag_i >= 0 -> \Sum (v_i - v0)*w_i >= (v_i - v0)*(\Sum w_i)
*
* Note: using 'avg_vruntime() > se->vruntime' is inaccurate due
* to the loss in precision caused by the division.
@@ -889,7 +894,7 @@ bool update_entity_lag(struct cfs_rq *cfs_rq, struct sched_entity *se)
static int vruntime_eligible(struct cfs_rq *cfs_rq, u64 vruntime)
{
struct sched_entity *curr = cfs_rq->curr;
- s64 avg = cfs_rq->sum_w_vruntime;
+ s64 key, avg = cfs_rq->sum_w_vruntime;
long load = cfs_rq->sum_weight;
if (curr && curr->on_rq) {
@@ -899,7 +904,36 @@ static int vruntime_eligible(struct cfs_rq *cfs_rq, u64 vruntime)
load += weight;
}
- return avg >= vruntime_op(vruntime, "-", cfs_rq->zero_vruntime) * load;
+ key = vruntime_op(vruntime, "-", cfs_rq->zero_vruntime);
+
+ /*
+ * The worst case term for @key includes 'NSEC_TICK * NICE_0_LOAD'
+ * and @load obviously includes NICE_0_LOAD. NSEC_TICK is around 24
+ * bits, while NICE_0_LOAD is 20 on 64bit and 10 otherwise.
+ *
+ * This means that on 64bit the product can reach 64 bits, which
+ * overflows s64, while on 32bit it will only be about 44 bits and
+ * should fit comfortably.
+ */
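+ /*
+ * Rough worked numbers (sketch, assuming HZ=100): a tick is 10^7 ns
+ * (~2^24) and NICE_0_LOAD is 2^20 on 64bit, so @key can reach ~2^44;
+ * multiplied by a @load that also contains NICE_0_LOAD, the product is
+ * around 2^64, beyond what s64 can hold.
+ */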
+#ifdef CONFIG_64BIT
+#ifdef CONFIG_ARCH_SUPPORTS_INT128
+ /* This often results in simpler code than __builtin_mul_overflow(). */
+ return avg >= (__int128)key * load;
+#else
+ s64 rhs;
+ /*
+ * On overflow, the sign of key tells us the correct answer: a large
+ * positive key means vruntime >> V, so not eligible; a large negative
+ * key means vruntime << V, so eligible.
+ */
+ if (check_mul_overflow(key, load, &rhs))
+ return key <= 0;
+
+ return avg >= rhs;
+#endif
+#else /* 32bit */
+ return avg >= key * load;
+#endif
}
int entity_eligible(struct cfs_rq *cfs_rq, struct sched_entity *se)
@@ -1099,7 +1133,7 @@ static inline void cancel_protect_slice(struct sched_entity *se)
*
* Which allows tree pruning through eligibility.
*/
-static struct sched_entity *__pick_eevdf(struct cfs_rq *cfs_rq, bool protect)
+static struct sched_entity *pick_eevdf(struct cfs_rq *cfs_rq, bool protect)
{
struct rb_node *node = cfs_rq->tasks_timeline.rb_root.rb_node;
struct sched_entity *se = __pick_first_entity(cfs_rq);
@@ -1170,11 +1204,6 @@ found:
return best;
}
-static struct sched_entity *pick_eevdf(struct cfs_rq *cfs_rq)
-{
- return __pick_eevdf(cfs_rq, true);
-}
-
struct sched_entity *__pick_last_entity(struct cfs_rq *cfs_rq)
{
struct rb_node *last = rb_last(&cfs_rq->tasks_timeline.rb_root);
@@ -5749,11 +5778,11 @@ static int dequeue_entities(struct rq *rq, struct sched_entity *se, int flags);
* 4) do not run the "skip" process, if something else is available
*/
static struct sched_entity *
-pick_next_entity(struct rq *rq, struct cfs_rq *cfs_rq)
+pick_next_entity(struct rq *rq, struct cfs_rq *cfs_rq, bool protect)
{
struct sched_entity *se;
- se = pick_eevdf(cfs_rq);
+ se = pick_eevdf(cfs_rq, protect);
if (se->sched_delayed) {
dequeue_entities(rq, se, DEQUEUE_SLEEP | DEQUEUE_DELAYED);
/*
@@ -9027,7 +9056,7 @@ static void wakeup_preempt_fair(struct rq *rq, struct task_struct *p, int wake_f
{
enum preempt_wakeup_action preempt_action = PREEMPT_WAKEUP_PICK;
struct task_struct *donor = rq->donor;
- struct sched_entity *se = &donor->se, *pse = &p->se;
+ struct sched_entity *nse, *se = &donor->se, *pse = &p->se;
struct cfs_rq *cfs_rq = task_cfs_rq(donor);
int cse_is_idle, pse_is_idle;
@@ -9138,11 +9167,18 @@ static void wakeup_preempt_fair(struct rq *rq, struct task_struct *p, int wake_f
}
pick:
+ nse = pick_next_entity(rq, cfs_rq, preempt_action != PREEMPT_WAKEUP_SHORT);
+ /* If @p has become the most eligible task, force preemption */
+ if (nse == pse)
+ goto preempt;
+
/*
- * If @p has become the most eligible task, force preemption.
+ * Because p is enqueued, nse being null can only mean that we
+ * dequeued a delayed task. If there are still entities queued in
+ * cfs, check if the next one will be p.
*/
- if (__pick_eevdf(cfs_rq, preempt_action != PREEMPT_WAKEUP_SHORT) == pse)
- goto preempt;
+ if (!nse && cfs_rq->nr_queued)
+ goto pick;
if (sched_feat(RUN_TO_PARITY))
update_protect_slice(cfs_rq, se);
@@ -9179,7 +9215,7 @@ again:
throttled |= check_cfs_rq_runtime(cfs_rq);
- se = pick_next_entity(rq, cfs_rq);
+ se = pick_next_entity(rq, cfs_rq, true);
if (!se)
goto again;
cfs_rq = group_cfs_rq(se);
diff --git a/kernel/sched/membarrier.c b/kernel/sched/membarrier.c
index 623445603725..226a6329f3e9 100644
--- a/kernel/sched/membarrier.c
+++ b/kernel/sched/membarrier.c
@@ -199,7 +199,16 @@ static void ipi_rseq(void *info)
* is negligible.
*/
smp_mb();
- rseq_sched_switch_event(current);
+ /*
+ * Legacy mode requires that IDs are written and the critical section is
+ * evaluated. V2 optimized mode handles the critical section and IDs are
+ * only updated if they change as a consequence of preemption after
+ * return from this IPI.
+ */
+ if (rseq_v2(current))
+ rseq_sched_switch_event(current);
+ else
+ rseq_force_update();
}
static void ipi_sync_rq_state(void *info)
diff --git a/kernel/time/timer_migration.c b/kernel/time/timer_migration.c
index 155eeaea4113..1d0d3a4058d5 100644
--- a/kernel/time/timer_migration.c
+++ b/kernel/time/timer_migration.c
@@ -1860,19 +1860,37 @@ static int tmigr_setup_groups(unsigned int cpu, unsigned int node,
* child to the new parents. So tmigr_active_up() activates the
* new parents while walking up from the old root to the new.
*
- * * It is ensured that @start is active, as this setup path is
- * executed in hotplug prepare callback. This is executed by an
- * already connected and !idle CPU. Even if all other CPUs go idle,
- * the CPU executing the setup will be responsible up to current top
- * level group. And the next time it goes inactive, it will release
- * the new childmask and parent to subsequent walkers through this
- * @child. Therefore propagate active state unconditionally.
+ * * It is ensured that @start is active (or on the way to being activated
+ * by another CPU that woke up before the current one), as this setup path
+ * is executed in the hotplug prepare callback. This is executed by an
+ * already connected and !idle CPU in the hierarchy.
+ *
+ * * The below RmW atomic operation ensures that:
+ *
+ * 1) If the old root has been completely activated, the latest state is
+ * acquired (the below implicit acquire pairs with the implicit release
+ * from cmpxchg() in tmigr_active_up()).
+ *
+ * 2) If the old root is still on the way to be activated, the lagging behind
+ * CPU performing the activation will acquire the links up to the new root.
+ * (The below implicit release pairs with the implicit acquire from cmpxchg()
+ * in tmigr_active_up()).
+ *
+ * 3) Every subsequent CPU below the old root will acquire the new links while
+ * walking through the old root (the below implicit release pairs with the
+ * implicit acquire from cmpxchg() in either tmigr_active_up() or
+ * tmigr_inactive_up()).
*/
- state.state = atomic_read(&start->migr_state);
- WARN_ON_ONCE(!state.active);
+ state.state = atomic_fetch_or(0, &start->migr_state);
WARN_ON_ONCE(!start->parent);
- data.childmask = start->groupmask;
- __walk_groups_from(tmigr_active_up, &data, start, start->parent);
+ /*
+ * If the old root's state is inactive, another CPU is on its way to
+ * activate it and will propagate the activation up to the new root.
+ */
+ if (state.active) {
+ data.childmask = start->groupmask;
+ __walk_groups_from(tmigr_active_up, &data, start, start->parent);
+ }
}
/* Root update */
diff --git a/kernel/trace/trace_branch.c b/kernel/trace/trace_branch.c
index 6809b370e991..d1564db95a8f 100644
--- a/kernel/trace/trace_branch.c
+++ b/kernel/trace/trace_branch.c
@@ -373,10 +373,10 @@ __init static int init_annotated_branch_stats(void)
int ret;
ret = register_stat_tracer(&annotated_branch_stats);
- if (!ret) {
+ if (ret) {
printk(KERN_WARNING "Warning: could not register "
"annotated branches stats\n");
- return 1;
+ return ret;
}
return 0;
}
@@ -438,10 +438,10 @@ __init static int all_annotated_branch_stats(void)
int ret;
ret = register_stat_tracer(&all_branch_stats);
- if (!ret) {
+ if (ret) {
printk(KERN_WARNING "Warning: could not register "
"all branches stats\n");
- return 1;
+ return ret;
}
return 0;
}
diff --git a/kernel/trace/trace_probe.c b/kernel/trace/trace_probe.c
index e1c73065dae5..e0d3a0da26af 100644
--- a/kernel/trace/trace_probe.c
+++ b/kernel/trace/trace_probe.c
@@ -1523,6 +1523,12 @@ static int traceprobe_parse_probe_arg_body(const char *argv, ssize_t *size,
parg->offset = *size;
*size += parg->type->size * (parg->count ?: 1);
+ if (*size > MAX_PROBE_EVENT_SIZE) {
+ ret = -E2BIG;
+ trace_probe_log_err(ctx->offset, EVENT_TOO_BIG);
+ goto fail;
+ }
+
if (parg->count) {
len = strlen(parg->type->fmttype) + 6;
parg->fmt = kmalloc(len, GFP_KERNEL);
diff --git a/kernel/trace/trace_probe.h b/kernel/trace/trace_probe.h
index 9fc56c937130..262d8707a3df 100644
--- a/kernel/trace/trace_probe.h
+++ b/kernel/trace/trace_probe.h
@@ -38,6 +38,7 @@
#define MAX_BTF_ARGS_LEN 128
#define MAX_DENTRY_ARGS_LEN 256
#define MAX_STRING_SIZE PATH_MAX
+#define MAX_PROBE_EVENT_SIZE 3072
/* Reserved field names */
#define FIELD_STRING_IP "__probe_ip"
@@ -561,7 +562,8 @@ extern int traceprobe_define_arg_fields(struct trace_event_call *event_call,
C(BAD_TYPE4STR, "This type does not fit for string."),\
C(NEED_STRING_TYPE, "$comm and immediate-string only accepts string type"),\
C(TOO_MANY_ARGS, "Too many arguments are specified"), \
- C(TOO_MANY_EARGS, "Too many entry arguments specified"),
+ C(TOO_MANY_EARGS, "Too many entry arguments specified"), \
+ C(EVENT_TOO_BIG, "Event too big (too many fields?)"),
#undef C
#define C(a, b) TP_ERR_##a
diff --git a/kernel/workqueue.c b/kernel/workqueue.c
index 5f747f241a5f..3d2e3b2ec528 100644
--- a/kernel/workqueue.c
+++ b/kernel/workqueue.c
@@ -5906,6 +5906,21 @@ err_destroy:
return NULL;
}
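+/*
+ * Common tail shared by the variadic wrappers below: build the workqueue
+ * from a caller-provided va_list and, on success, set up its lockdep state.
+ */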
+__printf(1, 0)
+static struct workqueue_struct *alloc_workqueue_va(const char *fmt,
+ unsigned int flags,
+ int max_active,
+ va_list args)
+{
+ struct workqueue_struct *wq;
+
+ wq = __alloc_workqueue(fmt, flags, max_active, args);
+ if (wq)
+ wq_init_lockdep(wq);
+
+ return wq;
+}
+
__printf(1, 4)
struct workqueue_struct *alloc_workqueue_noprof(const char *fmt,
unsigned int flags,
@@ -5915,12 +5930,8 @@ struct workqueue_struct *alloc_workqueue_noprof(const char *fmt,
va_list args;
va_start(args, max_active);
- wq = __alloc_workqueue(fmt, flags, max_active, args);
+ wq = alloc_workqueue_va(fmt, flags, max_active, args);
va_end(args);
- if (!wq)
- return NULL;
-
- wq_init_lockdep(wq);
return wq;
}
@@ -5932,15 +5943,15 @@ static void devm_workqueue_release(void *res)
}
__printf(2, 5) struct workqueue_struct *
-devm_alloc_workqueue(struct device *dev, const char *fmt, unsigned int flags,
- int max_active, ...)
+devm_alloc_workqueue_noprof(struct device *dev, const char *fmt,
+ unsigned int flags, int max_active, ...)
{
struct workqueue_struct *wq;
va_list args;
int ret;
va_start(args, max_active);
- wq = alloc_workqueue(fmt, flags, max_active, args);
+ wq = alloc_workqueue_va(fmt, flags, max_active, args);
va_end(args);
if (!wq)
return NULL;
@@ -5951,7 +5962,7 @@ devm_alloc_workqueue(struct device *dev, const char *fmt, unsigned int flags,
return wq;
}
-EXPORT_SYMBOL_GPL(devm_alloc_workqueue);
+EXPORT_SYMBOL_GPL(devm_alloc_workqueue_noprof);
#ifdef CONFIG_LOCKDEP
__printf(1, 5)