Diffstat (limited to 'kernel')
-rw-r--r-- | kernel/cgroup/cgroup.c | 286
-rw-r--r-- | kernel/cgroup/cpuset-internal.h | 5
-rw-r--r-- | kernel/cgroup/cpuset.c | 13
-rw-r--r-- | kernel/cgroup/rdma.c | 2
-rw-r--r-- | kernel/events/core.c | 70
-rw-r--r-- | kernel/events/internal.h | 1
-rw-r--r-- | kernel/events/ring_buffer.c | 2
-rw-r--r-- | kernel/fork.c | 12
-rw-r--r-- | kernel/futex/requeue.c | 13
-rw-r--r-- | kernel/liveupdate/kexec_handover.c | 21
-rw-r--r-- | kernel/liveupdate/luo_session.c | 15
-rw-r--r-- | kernel/rseq.c | 214
-rw-r--r-- | kernel/sched/core.c | 1
-rw-r--r-- | kernel/sched/ext.c | 437
-rw-r--r-- | kernel/sched/ext_idle.c | 32
-rw-r--r-- | kernel/sched/ext_idle.h | 1
-rw-r--r-- | kernel/sched/ext_internal.h | 2
-rw-r--r-- | kernel/sched/fair.c | 82
-rw-r--r-- | kernel/sched/membarrier.c | 11
-rw-r--r-- | kernel/time/timer_migration.c | 40
-rw-r--r-- | kernel/trace/trace_branch.c | 8
-rw-r--r-- | kernel/trace/trace_probe.c | 6
-rw-r--r-- | kernel/trace/trace_probe.h | 4
-rw-r--r-- | kernel/workqueue.c | 29
24 files changed, 847 insertions, 460 deletions
diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c index 43adc96c7f1a..6152add0c5eb 100644 --- a/kernel/cgroup/cgroup.c +++ b/kernel/cgroup/cgroup.c @@ -264,10 +264,12 @@ static void cgroup_finalize_control(struct cgroup *cgrp, int ret); static void css_task_iter_skip(struct css_task_iter *it, struct task_struct *task); static int cgroup_destroy_locked(struct cgroup *cgrp); +static void cgroup_finish_destroy(struct cgroup *cgrp); +static void kill_css_sync(struct cgroup_subsys_state *css); +static void kill_css_finish(struct cgroup_subsys_state *css); static struct cgroup_subsys_state *css_create(struct cgroup *cgrp, struct cgroup_subsys *ss); static void css_release(struct percpu_ref *ref); -static void kill_css(struct cgroup_subsys_state *css); static int cgroup_addrm_files(struct cgroup_subsys_state *css, struct cgroup *cgrp, struct cftype cfts[], bool is_add); @@ -797,6 +799,16 @@ static void cgroup_update_populated(struct cgroup *cgrp, bool populated) if (was_populated == cgroup_is_populated(cgrp)) break; + /* + * Subtree just emptied below an offlined cgrp. Fire deferred + * destroy. The transition is one-shot. + */ + if (was_populated && !css_is_online(&cgrp->self)) { + cgroup_get(cgrp); + WARN_ON_ONCE(!queue_work(cgroup_offline_wq, + &cgrp->finish_destroy_work)); + } + cgroup1_check_for_release(cgrp); TRACE_CGROUP_PATH(notify_populated, cgrp, cgroup_is_populated(cgrp)); @@ -2039,6 +2051,16 @@ static int cgroup_reconfigure(struct fs_context *fc) return 0; } +static void cgroup_finish_destroy_work_fn(struct work_struct *work) +{ + struct cgroup *cgrp = container_of(work, struct cgroup, finish_destroy_work); + + cgroup_lock(); + cgroup_finish_destroy(cgrp); + cgroup_unlock(); + cgroup_put(cgrp); +} + static void init_cgroup_housekeeping(struct cgroup *cgrp) { struct cgroup_subsys *ss; @@ -2065,7 +2087,7 @@ static void init_cgroup_housekeeping(struct cgroup *cgrp) #endif init_waitqueue_head(&cgrp->offline_waitq); - init_waitqueue_head(&cgrp->dying_populated_waitq); + INIT_WORK(&cgrp->finish_destroy_work, cgroup_finish_destroy_work_fn); INIT_WORK(&cgrp->release_agent_work, cgroup1_release_agent); } @@ -3375,7 +3397,8 @@ static void cgroup_apply_control_disable(struct cgroup *cgrp) if (css->parent && !(cgroup_ss_mask(dsct) & (1 << ss->id))) { - kill_css(css); + kill_css_sync(css); + kill_css_finish(css); } else if (!css_visible(css)) { css_clear_dir(css); if (ss->css_reset) @@ -3934,33 +3957,41 @@ static int cgroup_cpu_pressure_show(struct seq_file *seq, void *v) static ssize_t pressure_write(struct kernfs_open_file *of, char *buf, size_t nbytes, enum psi_res res) { - struct cgroup_file_ctx *ctx = of->priv; + struct cgroup_file_ctx *ctx; struct psi_trigger *new; struct cgroup *cgrp; struct psi_group *psi; + ssize_t ret = 0; cgrp = cgroup_kn_lock_live(of->kn, false); if (!cgrp) return -ENODEV; - cgroup_get(cgrp); - cgroup_kn_unlock(of->kn); + ctx = of->priv; + if (!ctx) { + ret = -ENODEV; + goto out_unlock; + } /* Allow only one trigger per file descriptor */ if (ctx->psi.trigger) { - cgroup_put(cgrp); - return -EBUSY; + ret = -EBUSY; + goto out_unlock; } psi = cgroup_psi(cgrp); new = psi_trigger_create(psi, buf, res, of->file, of); if (IS_ERR(new)) { - cgroup_put(cgrp); - return PTR_ERR(new); + ret = PTR_ERR(new); + goto out_unlock; } smp_store_release(&ctx->psi.trigger, new); - cgroup_put(cgrp); + +out_unlock: + cgroup_kn_unlock(of->kn); + if (ret) + return ret; return nbytes; } @@ -5059,10 +5090,12 @@ repeat: task = list_entry(it->task_pos, struct task_struct, cg_list); /* - 
* Hide tasks that are exiting but not yet removed. Keep zombie - * leaders with live threads visible. + * Hide tasks that are exiting but not yet removed by default. Keep + * zombie leaders with live threads visible. Usages that need to walk + * every existing task can opt out via CSS_TASK_ITER_WITH_DEAD. */ - if ((task->flags & PF_EXITING) && !atomic_read(&task->signal->live)) + if (!(it->flags & CSS_TASK_ITER_WITH_DEAD) && + (task->flags & PF_EXITING) && !atomic_read(&task->signal->live)) goto repeat; if (it->flags & CSS_TASK_ITER_PROCS) { @@ -5506,7 +5539,7 @@ static struct cftype cgroup_psi_files[] = { * css destruction is four-stage process. * * 1. Destruction starts. Killing of the percpu_ref is initiated. - * Implemented in kill_css(). + * Implemented in kill_css_finish(). * * 2. When the percpu_ref is confirmed to be visible as killed on all CPUs * and thus css_tryget_online() is guaranteed to fail, the css can be @@ -5716,16 +5749,6 @@ static void offline_css(struct cgroup_subsys_state *css) RCU_INIT_POINTER(css->cgroup->subsys[ss->id], NULL); wake_up_all(&css->cgroup->offline_waitq); - - css->cgroup->nr_dying_subsys[ss->id]++; - /* - * Parent css and cgroup cannot be freed until after the freeing - * of child css, see css_free_rwork_fn(). - */ - while ((css = css->parent)) { - css->nr_descendants--; - css->cgroup->nr_dying_subsys[ss->id]++; - } } /** @@ -5995,7 +6018,7 @@ out_unlock: /* * This is called when the refcnt of a css is confirmed to be killed. * css_tryget_online() is now guaranteed to fail. Tell the subsystem to - * initiate destruction and put the css ref from kill_css(). + * initiate destruction and put the css ref from kill_css_finish(). */ static void css_killed_work_fn(struct work_struct *work) { @@ -6028,16 +6051,15 @@ static void css_killed_ref_fn(struct percpu_ref *ref) } /** - * kill_css - destroy a css - * @css: css to destroy + * kill_css_sync - synchronous half of css teardown + * @css: css being killed * - * This function initiates destruction of @css by removing cgroup interface - * files and putting its base reference. ->css_offline() will be invoked - * asynchronously once css_tryget_online() is guaranteed to fail and when - * the reference count reaches zero, @css will be released. + * See cgroup_destroy_locked(). */ -static void kill_css(struct cgroup_subsys_state *css) +static void kill_css_sync(struct cgroup_subsys_state *css) { + struct cgroup_subsys *ss = css->ss; + lockdep_assert_held(&cgroup_mutex); if (css->flags & CSS_DYING) @@ -6057,64 +6079,100 @@ static void kill_css(struct cgroup_subsys_state *css) */ css_clear_dir(css); + css->cgroup->nr_dying_subsys[ss->id]++; + /* + * Parent css and cgroup cannot be freed until after the freeing + * of child css, see css_free_rwork_fn(). + */ + while ((css = css->parent)) { + css->nr_descendants--; + css->cgroup->nr_dying_subsys[ss->id]++; + } +} + +/** + * kill_css_finish - deferred half of css teardown + * @css: css being killed + * + * See cgroup_destroy_locked(). + */ +static void kill_css_finish(struct cgroup_subsys_state *css) +{ + lockdep_assert_held(&cgroup_mutex); + + /* + * Skip on re-entry: cgroup_apply_control_disable() may have killed @css + * earlier. cgroup_destroy_locked() can still walk it because + * offline_css() (which NULLs cgrp->subsys[ssid]) runs async. + */ + if (percpu_ref_is_dying(&css->refcnt)) + return; + /* - * Killing would put the base ref, but we need to keep it alive - * until after ->css_offline(). 
+ * Killing would put the base ref, but we need to keep it alive until + * after ->css_offline(). */ css_get(css); /* - * cgroup core guarantees that, by the time ->css_offline() is - * invoked, no new css reference will be given out via - * css_tryget_online(). We can't simply call percpu_ref_kill() and - * proceed to offlining css's because percpu_ref_kill() doesn't - * guarantee that the ref is seen as killed on all CPUs on return. + * cgroup core guarantees that, by the time ->css_offline() is invoked, + * no new css reference will be given out via css_tryget_online(). We + * can't simply call percpu_ref_kill() and proceed to offlining css's + * because percpu_ref_kill() doesn't guarantee that the ref is seen as + * killed on all CPUs on return. * - * Use percpu_ref_kill_and_confirm() to get notifications as each - * css is confirmed to be seen as killed on all CPUs. + * Use percpu_ref_kill_and_confirm() to get notifications as each css is + * confirmed to be seen as killed on all CPUs. */ percpu_ref_kill_and_confirm(&css->refcnt, css_killed_ref_fn); } /** - * cgroup_destroy_locked - the first stage of cgroup destruction + * cgroup_destroy_locked - destroy @cgrp (called on rmdir) * @cgrp: cgroup to be destroyed * - * css's make use of percpu refcnts whose killing latency shouldn't be - * exposed to userland and are RCU protected. Also, cgroup core needs to - * guarantee that css_tryget_online() won't succeed by the time - * ->css_offline() is invoked. To satisfy all the requirements, - * destruction is implemented in the following two steps. - * - * s1. Verify @cgrp can be destroyed and mark it dying. Remove all - * userland visible parts and start killing the percpu refcnts of - * css's. Set up so that the next stage will be kicked off once all - * the percpu refcnts are confirmed to be killed. - * - * s2. Invoke ->css_offline(), mark the cgroup dead and proceed with the - * rest of destruction. Once all cgroup references are gone, the - * cgroup is RCU-freed. - * - * This function implements s1. After this step, @cgrp is gone as far as - * the userland is concerned and a new cgroup with the same name may be - * created. As cgroup doesn't care about the names internally, this - * doesn't cause any problem. + * Tear down @cgrp on behalf of rmdir. Constraints: + * + * - Userspace: rmdir must succeed when cgroup.procs and friends are empty. + * + * - Kernel: subsystem ->css_offline() must not run while any task in @cgrp's + * subtree is still doing kernel work. A task hidden from cgroup.procs (past + * exit_signals() with signal->live cleared) can still schedule, allocate, and + * consume resources until its final context switch. Dying descendants in the + * subtree can host such tasks too. + * + * - Kernel: css_tryget_online() must fail by the time ->css_offline() runs. + * + * The destruction runs in three parts: + * + * - This function: synchronous user-visible state teardown plus kill_css_sync() + * on each subsystem css. + * + * - cgroup_finish_destroy(): kicks the percpu_ref kill via kill_css_finish() on + * each subsystem css. Fires once @cgrp's subtree is fully drained, either + * inline here or from cgroup_update_populated(). + * + * - The percpu_ref kill chain: css_killed_ref_fn -> css_killed_work_fn -> + * ->css_offline() -> release/free. + * + * Return 0 on success, -EBUSY if a userspace-visible task or an online child + * remains. 
*/ static int cgroup_destroy_locked(struct cgroup *cgrp) - __releases(&cgroup_mutex) __acquires(&cgroup_mutex) { struct cgroup *tcgrp, *parent = cgroup_parent(cgrp); struct cgroup_subsys_state *css; struct cgrp_cset_link *link; + struct css_task_iter it; + struct task_struct *task; int ssid, ret; lockdep_assert_held(&cgroup_mutex); - /* - * Only migration can raise populated from zero and we're already - * holding cgroup_mutex. - */ - if (cgroup_is_populated(cgrp)) + css_task_iter_start(&cgrp->self, 0, &it); + task = css_task_iter_next(&it); + css_task_iter_end(&it); + if (task) return -EBUSY; /* @@ -6138,9 +6196,8 @@ static int cgroup_destroy_locked(struct cgroup *cgrp) link->cset->dead = true; spin_unlock_irq(&css_set_lock); - /* initiate massacre of all css's */ for_each_css(css, ssid, cgrp) - kill_css(css); + kill_css_sync(css); /* clear and remove @cgrp dir, @cgrp has an extra ref on its kn */ css_clear_dir(&cgrp->self); @@ -6171,79 +6228,27 @@ static int cgroup_destroy_locked(struct cgroup *cgrp) /* put the base reference */ percpu_ref_kill(&cgrp->self.refcnt); + if (!cgroup_is_populated(cgrp)) + cgroup_finish_destroy(cgrp); + return 0; }; /** - * cgroup_drain_dying - wait for dying tasks to leave before rmdir - * @cgrp: the cgroup being removed - * - * cgroup.procs and cgroup.threads use css_task_iter which filters out - * PF_EXITING tasks so that userspace doesn't see tasks that have already been - * reaped via waitpid(). However, cgroup_has_tasks() - which tests whether the - * cgroup has non-empty css_sets - is only updated when dying tasks pass through - * cgroup_task_dead() in finish_task_switch(). This creates a window where - * cgroup.procs reads empty but cgroup_has_tasks() is still true, making rmdir - * fail with -EBUSY from cgroup_destroy_locked() even though userspace sees no - * tasks. - * - * This function aligns cgroup_has_tasks() with what userspace can observe. If - * cgroup_has_tasks() but the task iterator sees nothing (all remaining tasks are - * PF_EXITING), we wait for cgroup_task_dead() to finish processing them. As the - * window between PF_EXITING and cgroup_task_dead() is short, the wait is brief. - * - * This function only concerns itself with this cgroup's own dying tasks. - * Whether the cgroup has children is cgroup_destroy_locked()'s problem. - * - * Each cgroup_task_dead() kicks the waitqueue via cset->cgrp_links, and we - * retry the full check from scratch. + * cgroup_finish_destroy - deferred half of @cgrp destruction + * @cgrp: cgroup whose subtree just became empty * - * Must be called with cgroup_mutex held. + * See cgroup_destroy_locked() for the rationale. */ -static int cgroup_drain_dying(struct cgroup *cgrp) - __releases(&cgroup_mutex) __acquires(&cgroup_mutex) +static void cgroup_finish_destroy(struct cgroup *cgrp) { - struct css_task_iter it; - struct task_struct *task; - DEFINE_WAIT(wait); + struct cgroup_subsys_state *css; + int ssid; lockdep_assert_held(&cgroup_mutex); -retry: - if (!cgroup_has_tasks(cgrp)) - return 0; - - /* Same iterator as cgroup.threads - if any task is visible, it's busy */ - css_task_iter_start(&cgrp->self, 0, &it); - task = css_task_iter_next(&it); - css_task_iter_end(&it); - if (task) - return -EBUSY; - - /* - * All remaining tasks are PF_EXITING and will pass through - * cgroup_task_dead() shortly. Wait for a kick and retry. 
- * - * cgroup_has_tasks() can't transition from false to true while we're - * holding cgroup_mutex, but the true to false transition happens - * under css_set_lock (via cgroup_task_dead()). We must retest and - * prepare_to_wait() under css_set_lock. Otherwise, the transition - * can happen between our first test and prepare_to_wait(), and we - * sleep with no one to wake us. - */ - spin_lock_irq(&css_set_lock); - if (!cgroup_has_tasks(cgrp)) { - spin_unlock_irq(&css_set_lock); - return 0; - } - prepare_to_wait(&cgrp->dying_populated_waitq, &wait, - TASK_UNINTERRUPTIBLE); - spin_unlock_irq(&css_set_lock); - mutex_unlock(&cgroup_mutex); - schedule(); - finish_wait(&cgrp->dying_populated_waitq, &wait); - mutex_lock(&cgroup_mutex); - goto retry; + for_each_css(css, ssid, cgrp) + kill_css_finish(css); } int cgroup_rmdir(struct kernfs_node *kn) @@ -6255,12 +6260,9 @@ int cgroup_rmdir(struct kernfs_node *kn) if (!cgrp) return 0; - ret = cgroup_drain_dying(cgrp); - if (!ret) { - ret = cgroup_destroy_locked(cgrp); - if (!ret) - TRACE_CGROUP_PATH(rmdir, cgrp); - } + ret = cgroup_destroy_locked(cgrp); + if (!ret) + TRACE_CGROUP_PATH(rmdir, cgrp); cgroup_kn_unlock(kn); return ret; @@ -7020,7 +7022,6 @@ void cgroup_task_exit(struct task_struct *tsk) static void do_cgroup_task_dead(struct task_struct *tsk) { - struct cgrp_cset_link *link; struct css_set *cset; unsigned long flags; @@ -7034,11 +7035,6 @@ static void do_cgroup_task_dead(struct task_struct *tsk) if (thread_group_leader(tsk) && atomic_read(&tsk->signal->live)) list_add_tail(&tsk->cg_list, &cset->dying_tasks); - /* kick cgroup_drain_dying() waiters, see cgroup_rmdir() */ - list_for_each_entry(link, &cset->cgrp_links, cgrp_link) - if (waitqueue_active(&link->cgrp->dying_populated_waitq)) - wake_up(&link->cgrp->dying_populated_waitq); - if (dl_task(tsk)) dec_dl_tasks_cs(tsk); diff --git a/kernel/cgroup/cpuset-internal.h b/kernel/cgroup/cpuset-internal.h index fd7d19842ded..bb4e692bea30 100644 --- a/kernel/cgroup/cpuset-internal.h +++ b/kernel/cgroup/cpuset-internal.h @@ -168,6 +168,11 @@ struct cpuset { int nr_deadline_tasks; int nr_migrate_dl_tasks; u64 sum_migrate_dl_bw; + /* + * CPU used for temporary DL bandwidth allocation during attach; + * -1 if no DL bandwidth was allocated in the current attach. 
+ */ + int dl_bw_cpu; /* Invalid partition error code, not lock protected */ enum prs_errcode prs_err; diff --git a/kernel/cgroup/cpuset.c b/kernel/cgroup/cpuset.c index 1335e437098e..e3a081a07c6d 100644 --- a/kernel/cgroup/cpuset.c +++ b/kernel/cgroup/cpuset.c @@ -288,6 +288,7 @@ struct cpuset top_cpuset = { .flags = BIT(CS_CPU_EXCLUSIVE) | BIT(CS_MEM_EXCLUSIVE) | BIT(CS_SCHED_LOAD_BALANCE), .partition_root_state = PRS_ROOT, + .dl_bw_cpu = -1, }; /** @@ -579,6 +580,8 @@ static struct cpuset *dup_or_alloc_cpuset(struct cpuset *cs) if (!trial) return NULL; + trial->dl_bw_cpu = -1; + /* Setup cpumask pointer array */ cpumask_var_t *pmask[4] = { &trial->cpus_allowed, @@ -2980,6 +2983,7 @@ static void reset_migrate_dl_data(struct cpuset *cs) { cs->nr_migrate_dl_tasks = 0; cs->sum_migrate_dl_bw = 0; + cs->dl_bw_cpu = -1; } /* Called by cgroups to determine if a cpuset is usable; cpuset_mutex held */ @@ -3056,6 +3060,8 @@ static int cpuset_can_attach(struct cgroup_taskset *tset) reset_migrate_dl_data(cs); goto out_unlock; } + + cs->dl_bw_cpu = cpu; } out_success: @@ -3080,12 +3086,11 @@ static void cpuset_cancel_attach(struct cgroup_taskset *tset) mutex_lock(&cpuset_mutex); dec_attach_in_progress_locked(cs); - if (cs->nr_migrate_dl_tasks) { - int cpu = cpumask_any(cs->effective_cpus); + if (cs->dl_bw_cpu >= 0) + dl_bw_free(cs->dl_bw_cpu, cs->sum_migrate_dl_bw); - dl_bw_free(cpu, cs->sum_migrate_dl_bw); + if (cs->nr_migrate_dl_tasks) reset_migrate_dl_data(cs); - } mutex_unlock(&cpuset_mutex); } diff --git a/kernel/cgroup/rdma.c b/kernel/cgroup/rdma.c index 9967fb25c563..4fdab4cf49e0 100644 --- a/kernel/cgroup/rdma.c +++ b/kernel/cgroup/rdma.c @@ -283,7 +283,7 @@ int rdmacg_try_charge(struct rdma_cgroup **rdmacg, ret = PTR_ERR(rpool); goto err; } else { - new = rpool->resources[index].usage + 1; + new = (s64)rpool->resources[index].usage + 1; if (new > rpool->resources[index].max) { ret = -EAGAIN; goto err; diff --git a/kernel/events/core.c b/kernel/events/core.c index 6d1f8bad7e1c..7935d5663944 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -7006,6 +7006,7 @@ static void perf_mmap_open(struct vm_area_struct *vma) } static void perf_pmu_output_stop(struct perf_event *event); +static void perf_mmap_unaccount(struct vm_area_struct *vma, struct perf_buffer *rb); /* * A buffer can be mmap()ed multiple times; either directly through the same @@ -7021,8 +7022,6 @@ static void perf_mmap_close(struct vm_area_struct *vma) mapped_f unmapped = get_mapped(event, event_unmapped); struct perf_buffer *rb = ring_buffer_get(event); struct user_struct *mmap_user = rb->mmap_user; - int mmap_locked = rb->mmap_locked; - unsigned long size = perf_data_size(rb); bool detach_rest = false; /* FIXIES vs perf_pmu_unregister() */ @@ -7117,11 +7116,7 @@ again: * Aside from that, this buffer is 'fully' detached and unmapped, * undo the VM accounting. 
*/ - - atomic_long_sub((size >> PAGE_SHIFT) + 1 - mmap_locked, - &mmap_user->locked_vm); - atomic64_sub(mmap_locked, &vma->vm_mm->pinned_vm); - free_uid(mmap_user); + perf_mmap_unaccount(vma, rb); out_put: ring_buffer_put(rb); /* could be last */ @@ -7261,6 +7256,15 @@ static void perf_mmap_account(struct vm_area_struct *vma, long user_extra, long atomic64_add(extra, &vma->vm_mm->pinned_vm); } +static void perf_mmap_unaccount(struct vm_area_struct *vma, struct perf_buffer *rb) +{ + struct user_struct *user = rb->mmap_user; + + atomic_long_sub((perf_data_size(rb) >> PAGE_SHIFT) + 1 - rb->mmap_locked, + &user->locked_vm); + atomic64_sub(rb->mmap_locked, &vma->vm_mm->pinned_vm); +} + static int perf_mmap_rb(struct vm_area_struct *vma, struct perf_event *event, unsigned long nr_pages) { @@ -7323,8 +7327,6 @@ static int perf_mmap_rb(struct vm_area_struct *vma, struct perf_event *event, if (!rb) return -ENOMEM; - refcount_set(&rb->mmap_count, 1); - rb->mmap_user = get_current_user(); rb->mmap_locked = extra; ring_buffer_attach(event, rb); @@ -7474,16 +7476,54 @@ static int perf_mmap(struct file *file, struct vm_area_struct *vma) mapped(event, vma->vm_mm); /* - * Try to map it into the page table. On fail, invoke - * perf_mmap_close() to undo the above, as the callsite expects - * full cleanup in this case and therefore does not invoke - * vmops::close(). + * Try to map it into the page table. On fail undo the above, + * as the callsite expects full cleanup in this case and + * therefore does not invoke vmops::close(). */ ret = map_range(event->rb, vma); - if (ret) - perf_mmap_close(vma); + if (likely(!ret)) + return 0; + + /* Error path */ + + /* + * If this is the first mmap(), then event->mmap_count should + * be stable at 1. It is only modified by: + * perf_mmap_{open,close}() and perf_mmap(). + * + * The former are not possible because this mmap() hasn't been + * successful yet, and the latter is serialized by + * event->mmap_mutex which we still hold (note that mmap_lock + * is not strictly sufficient here, because the event fd can + * be passed to another process through trivial means like + * fork(), leading to concurrent mmap() from different mm). + * + * Make sure to remove event->rb before releasing + * event->mmap_mutex, such that any concurrent mmap() will not + * attempt use this failed buffer. + */ + if (refcount_read(&event->mmap_count) == 1) { + /* + * Minimal perf_mmap_close(); there can't be AUX or + * other events on account of this being the first. + */ + mapped = get_mapped(event, event_unmapped); + if (mapped) + mapped(event, vma->vm_mm); + perf_mmap_unaccount(vma, event->rb); + ring_buffer_attach(event, NULL); /* drops last rb->refcount */ + refcount_set(&event->mmap_count, 0); + return ret; + } + + /* + * Otherwise this is an already existing buffer, and there is + * no race vs first exposure, so fall-through and call + * perf_mmap_close(). 
+ */ } + perf_mmap_close(vma); return ret; } diff --git a/kernel/events/internal.h b/kernel/events/internal.h index d9cc57083091..c03c4f2eea57 100644 --- a/kernel/events/internal.h +++ b/kernel/events/internal.h @@ -67,6 +67,7 @@ static inline void rb_free_rcu(struct rcu_head *rcu_head) struct perf_buffer *rb; rb = container_of(rcu_head, struct perf_buffer, rcu_head); + free_uid(rb->mmap_user); rb_free(rb); } diff --git a/kernel/events/ring_buffer.c b/kernel/events/ring_buffer.c index 3e7de2661417..9fe92161715e 100644 --- a/kernel/events/ring_buffer.c +++ b/kernel/events/ring_buffer.c @@ -340,6 +340,8 @@ ring_buffer_init(struct perf_buffer *rb, long watermark, int flags) rb->paused = 1; mutex_init(&rb->aux_mutex); + rb->mmap_user = get_current_user(); + refcount_set(&rb->mmap_count, 1); } void perf_aux_output_flag(struct perf_output_handle *handle, u64 flags) diff --git a/kernel/fork.c b/kernel/fork.c index f1ad69c6dc2d..5f3fdfdb14c7 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -1951,9 +1951,11 @@ static void rv_task_fork(struct task_struct *p) static bool need_futex_hash_allocate_default(u64 clone_flags) { - if ((clone_flags & (CLONE_THREAD | CLONE_VM)) != (CLONE_THREAD | CLONE_VM)) - return false; - return true; + /* + * Allocate a default futex hash for any sibling that will + * share the parent's mm, except vfork. + */ + return (clone_flags & (CLONE_VM | CLONE_VFORK)) == CLONE_VM; } /* @@ -2380,10 +2382,6 @@ __latent_entropy struct task_struct *copy_process( if (retval) goto bad_fork_cancel_cgroup; - /* - * Allocate a default futex hash for the user process once the first - * thread spawns. - */ if (need_futex_hash_allocate_default(clone_flags)) { retval = futex_hash_allocate_default(); if (retval) diff --git a/kernel/futex/requeue.c b/kernel/futex/requeue.c index d818b4d47f1b..b597cb3d17fc 100644 --- a/kernel/futex/requeue.c +++ b/kernel/futex/requeue.c @@ -319,8 +319,11 @@ futex_proxy_trylock_atomic(u32 __user *pifutex, struct futex_hash_bucket *hb1, return -EINVAL; /* Ensure that this does not race against an early wakeup */ - if (!futex_requeue_pi_prepare(top_waiter, NULL)) + if (!futex_requeue_pi_prepare(top_waiter, NULL)) { + plist_del(&top_waiter->list, &hb1->chain); + futex_hb_waiters_dec(hb1); return -EAGAIN; + } /* * Try to take the lock for top_waiter and set the FUTEX_WAITERS bit @@ -722,10 +725,12 @@ int handle_early_requeue_pi_wakeup(struct futex_hash_bucket *hb, /* * We were woken prior to requeue by a timeout or a signal. - * Unqueue the futex_q and determine which it was. + * Conditionally unqueue the futex_q and determine which it was. 
*/ - plist_del(&q->list, &hb->chain); - futex_hb_waiters_dec(hb); + if (!plist_node_empty(&q->list)) { + plist_del(&q->list, &hb->chain); + futex_hb_waiters_dec(hb); + } /* Handle spurious wakeups gracefully */ ret = -EWOULDBLOCK; diff --git a/kernel/liveupdate/kexec_handover.c b/kernel/liveupdate/kexec_handover.c index 94762de1fe5f..18509d8082ea 100644 --- a/kernel/liveupdate/kexec_handover.c +++ b/kernel/liveupdate/kexec_handover.c @@ -762,19 +762,24 @@ int kho_add_subtree(const char *name, void *blob, size_t size) goto out_pack; } - err = fdt_setprop(root_fdt, off, KHO_SUB_TREE_PROP_NAME, - &phys, sizeof(phys)); - if (err < 0) - goto out_pack; + fdt_err = fdt_setprop(root_fdt, off, KHO_SUB_TREE_PROP_NAME, + &phys, sizeof(phys)); + if (fdt_err < 0) + goto out_del_node; - err = fdt_setprop(root_fdt, off, KHO_SUB_TREE_SIZE_PROP_NAME, - &size_u64, sizeof(size_u64)); - if (err < 0) - goto out_pack; + fdt_err = fdt_setprop(root_fdt, off, KHO_SUB_TREE_SIZE_PROP_NAME, + &size_u64, sizeof(size_u64)); + if (fdt_err < 0) + goto out_del_node; WARN_ON_ONCE(kho_debugfs_blob_add(&kho_out.dbg, name, blob, size, false)); + err = 0; + goto out_pack; + +out_del_node: + fdt_del_node(root_fdt, off); out_pack: fdt_pack(root_fdt); diff --git a/kernel/liveupdate/luo_session.c b/kernel/liveupdate/luo_session.c index a3327a28fc1f..7a42385dabe2 100644 --- a/kernel/liveupdate/luo_session.c +++ b/kernel/liveupdate/luo_session.c @@ -514,11 +514,12 @@ int luo_session_deserialize(void) { struct luo_session_header *sh = &luo_session_global.incoming; static bool is_deserialized; - static int err; + static int saved_err; + int err; /* If has been deserialized, always return the same error code */ if (is_deserialized) - return err; + return saved_err; is_deserialized = true; if (!sh->active) @@ -547,7 +548,8 @@ int luo_session_deserialize(void) pr_warn("Failed to allocate session [%.*s] during deserialization %pe\n", (int)sizeof(sh->ser[i].name), sh->ser[i].name, session); - return PTR_ERR(session); + err = PTR_ERR(session); + goto save_err; } err = luo_session_insert(sh, session); @@ -555,7 +557,7 @@ int luo_session_deserialize(void) pr_warn("Failed to insert session [%s] %pe\n", session->name, ERR_PTR(err)); luo_session_free(session); - return err; + goto save_err; } scoped_guard(mutex, &session->mutex) { @@ -565,7 +567,7 @@ int luo_session_deserialize(void) if (err) { pr_warn("Failed to deserialize files for session [%s] %pe\n", session->name, ERR_PTR(err)); - return err; + goto save_err; } } @@ -574,6 +576,9 @@ int luo_session_deserialize(void) sh->ser = NULL; return 0; +save_err: + saved_err = err; + return err; } int luo_session_serialize(void) diff --git a/kernel/rseq.c b/kernel/rseq.c index 38d3ef540760..e75e3a5e312c 100644 --- a/kernel/rseq.c +++ b/kernel/rseq.c @@ -236,11 +236,6 @@ static int __init rseq_debugfs_init(void) } __initcall(rseq_debugfs_init); -static bool rseq_set_ids(struct task_struct *t, struct rseq_ids *ids, u32 node_id) -{ - return rseq_set_ids_get_csaddr(t, ids, node_id, NULL); -} - static bool rseq_handle_cs(struct task_struct *t, struct pt_regs *regs) { struct rseq __user *urseq = t->rseq.usrptr; @@ -258,14 +253,16 @@ efault: static void rseq_slowpath_update_usr(struct pt_regs *regs) { /* - * Preserve rseq state and user_irq state. The generic entry code - * clears user_irq on the way out, the non-generic entry - * architectures are not having user_irq. + * Preserve has_rseq and user_irq state. 
The generic entry code clears + * user_irq on the way out, the non-generic entry architectures are not + * setting user_irq. */ - const struct rseq_event evt_mask = { .has_rseq = true, .user_irq = true, }; + const struct rseq_event evt_mask = { + .has_rseq = RSEQ_HAS_RSEQ_VERSION_MASK, + .user_irq = true, + }; struct task_struct *t = current; struct rseq_ids ids; - u32 node_id; bool event; if (unlikely(t->flags & PF_EXITING)) @@ -301,9 +298,9 @@ static void rseq_slowpath_update_usr(struct pt_regs *regs) if (!event) return; - node_id = cpu_to_node(ids.cpu_id); + ids.node_id = cpu_to_node(ids.cpu_id); - if (unlikely(!rseq_update_usr(t, regs, &ids, node_id))) { + if (unlikely(!rseq_update_usr(t, regs, &ids))) { /* * Clear the errors just in case this might survive magically, but * leave the rest intact. @@ -335,8 +332,9 @@ void __rseq_handle_slowpath(struct pt_regs *regs) void __rseq_signal_deliver(int sig, struct pt_regs *regs) { rseq_stat_inc(rseq_stats.signal); + /* - * Don't update IDs, they are handled on exit to user if + * Don't update IDs yet, they are handled on exit to user if * necessary. The important thing is to abort a critical section of * the interrupted context as after this point the instruction * pointer in @regs points to the signal handler. @@ -349,6 +347,13 @@ void __rseq_signal_deliver(int sig, struct pt_regs *regs) current->rseq.event.error = 0; force_sigsegv(sig); } + + /* + * In legacy mode, force the update of IDs before returning to user + * space to stay compatible. + */ + if (!rseq_v2(current)) + rseq_force_update(); } /* @@ -384,19 +389,22 @@ void rseq_syscall(struct pt_regs *regs) static bool rseq_reset_ids(void) { - struct rseq_ids ids = { - .cpu_id = RSEQ_CPU_ID_UNINITIALIZED, - .mm_cid = 0, - }; + struct rseq __user *rseq = current->rseq.usrptr; /* * If this fails, terminate it because this leaves the kernel in * stupid state as exit to user space will try to fixup the ids * again. */ - if (rseq_set_ids(current, &ids, 0)) - return true; + scoped_user_rw_access(rseq, efault) { + unsafe_put_user(0, &rseq->cpu_id_start, efault); + unsafe_put_user(RSEQ_CPU_ID_UNINITIALIZED, &rseq->cpu_id, efault); + unsafe_put_user(0, &rseq->node_id, efault); + unsafe_put_user(0, &rseq->mm_cid, efault); + } + return true; +efault: force_sig(SIGSEGV); return false; } @@ -404,70 +412,29 @@ static bool rseq_reset_ids(void) /* The original rseq structure size (including padding) is 32 bytes. */ #define ORIG_RSEQ_SIZE 32 -/* - * sys_rseq - setup restartable sequences for caller thread. - */ -SYSCALL_DEFINE4(rseq, struct rseq __user *, rseq, u32, rseq_len, int, flags, u32, sig) +static long rseq_register(struct rseq __user * rseq, u32 rseq_len, int flags, u32 sig) { u32 rseqfl = 0; + u8 version = 1; - if (flags & RSEQ_FLAG_UNREGISTER) { - if (flags & ~RSEQ_FLAG_UNREGISTER) - return -EINVAL; - /* Unregister rseq for current thread. */ - if (current->rseq.usrptr != rseq || !current->rseq.usrptr) - return -EINVAL; - if (rseq_len != current->rseq.len) - return -EINVAL; - if (current->rseq.sig != sig) - return -EPERM; - if (!rseq_reset_ids()) - return -EFAULT; - rseq_reset(current); - return 0; - } - - if (unlikely(flags & ~(RSEQ_FLAG_SLICE_EXT_DEFAULT_ON))) - return -EINVAL; - - if (current->rseq.usrptr) { - /* - * If rseq is already registered, check whether - * the provided address differs from the prior - * one. - */ - if (current->rseq.usrptr != rseq || rseq_len != current->rseq.len) - return -EINVAL; - if (current->rseq.sig != sig) - return -EPERM; - /* Already registered. 
*/ - return -EBUSY; - } - - /* - * If there was no rseq previously registered, ensure the provided rseq - * is properly aligned, as communcated to user-space through the ELF - * auxiliary vector AT_RSEQ_ALIGN. If rseq_len is the original rseq - * size, the required alignment is the original struct rseq alignment. - * - * The rseq_len is required to be greater or equal to the original rseq - * size. In order to be valid, rseq_len is either the original rseq size, - * or large enough to contain all supported fields, as communicated to - * user-space through the ELF auxiliary vector AT_RSEQ_FEATURE_SIZE. - */ - if (rseq_len < ORIG_RSEQ_SIZE || - (rseq_len == ORIG_RSEQ_SIZE && !IS_ALIGNED((unsigned long)rseq, ORIG_RSEQ_SIZE)) || - (rseq_len != ORIG_RSEQ_SIZE && (!IS_ALIGNED((unsigned long)rseq, rseq_alloc_align()) || - rseq_len < offsetof(struct rseq, end)))) - return -EINVAL; if (!access_ok(rseq, rseq_len)) return -EFAULT; - if (IS_ENABLED(CONFIG_RSEQ_SLICE_EXTENSION)) { - rseqfl |= RSEQ_CS_FLAG_SLICE_EXT_AVAILABLE; - if (rseq_slice_extension_enabled() && - (flags & RSEQ_FLAG_SLICE_EXT_DEFAULT_ON)) - rseqfl |= RSEQ_CS_FLAG_SLICE_EXT_ENABLED; + /* + * Architectures, which use the generic IRQ entry code (at least) enable + * registrations with a size greater than the original v1 fixed sized + * @rseq_len, which has been validated already to utilize the optimized + * v2 ABI mode which also enables extended RSEQ features beyond MMCID. + */ + if (IS_ENABLED(CONFIG_GENERIC_IRQ_ENTRY) && rseq_len > ORIG_RSEQ_SIZE) + version = 2; + + if (IS_ENABLED(CONFIG_RSEQ_SLICE_EXTENSION) && version > 1) { + if (rseq_slice_extension_enabled()) { + rseqfl |= RSEQ_CS_FLAG_SLICE_EXT_AVAILABLE; + if (flags & RSEQ_FLAG_SLICE_EXT_DEFAULT_ON) + rseqfl |= RSEQ_CS_FLAG_SLICE_EXT_ENABLED; + } } scoped_user_write_access(rseq, efault) { @@ -485,7 +452,15 @@ SYSCALL_DEFINE4(rseq, struct rseq __user *, rseq, u32, rseq_len, int, flags, u32 unsafe_put_user(RSEQ_CPU_ID_UNINITIALIZED, &rseq->cpu_id, efault); unsafe_put_user(0U, &rseq->node_id, efault); unsafe_put_user(0U, &rseq->mm_cid, efault); - unsafe_put_user(0U, &rseq->slice_ctrl.all, efault); + + /* + * All fields past mm_cid are only valid for non-legacy v2 + * registrations. + */ + if (version > 1) { + if (IS_ENABLED(CONFIG_RSEQ_SLICE_EXTENSION)) + unsafe_put_user(0U, &rseq->slice_ctrl.all, efault); + } } /* @@ -501,11 +476,10 @@ SYSCALL_DEFINE4(rseq, struct rseq __user *, rseq, u32, rseq_len, int, flags, u32 #endif /* - * If rseq was previously inactive, and has just been - * registered, ensure the cpu_id_start and cpu_id fields - * are updated before returning to user-space. + * Ensure the cpu_id_start and cpu_id fields are updated before + * returning to user-space. */ - current->rseq.event.has_rseq = true; + current->rseq.event.has_rseq = version; rseq_force_update(); return 0; @@ -513,6 +487,80 @@ efault: return -EFAULT; } +static long rseq_unregister(struct rseq __user * rseq, u32 rseq_len, int flags, u32 sig) +{ + if (flags & ~RSEQ_FLAG_UNREGISTER) + return -EINVAL; + if (current->rseq.usrptr != rseq || !current->rseq.usrptr) + return -EINVAL; + if (rseq_len != current->rseq.len) + return -EINVAL; + if (current->rseq.sig != sig) + return -EPERM; + if (!rseq_reset_ids()) + return -EFAULT; + rseq_reset(current); + return 0; +} + +static long rseq_reregister(struct rseq __user * rseq, u32 rseq_len, u32 sig) +{ + /* + * If rseq is already registered, check whether the provided address + * differs from the prior one. 
+ */ + if (current->rseq.usrptr != rseq || rseq_len != current->rseq.len) + return -EINVAL; + if (current->rseq.sig != sig) + return -EPERM; + /* Already registered. */ + return -EBUSY; +} + +static bool rseq_length_valid(struct rseq __user *rseq, unsigned int rseq_len) +{ + /* + * Ensure the provided rseq is properly aligned, as communicated to + * user-space through the ELF auxiliary vector AT_RSEQ_ALIGN. If + * rseq_len is the original rseq size, the required alignment is the + * original struct rseq alignment. + * + * In order to be valid, rseq_len is either the original rseq size, or + * large enough to contain all supported fields, as communicated to + * user-space through the ELF auxiliary vector AT_RSEQ_FEATURE_SIZE. + */ + if (rseq_len < ORIG_RSEQ_SIZE) + return false; + + if (rseq_len == ORIG_RSEQ_SIZE) + return IS_ALIGNED((unsigned long)rseq, ORIG_RSEQ_SIZE); + + return IS_ALIGNED((unsigned long)rseq, rseq_alloc_align()) && + rseq_len >= offsetof(struct rseq, end); +} + +#define RSEQ_FLAGS_SUPPORTED (RSEQ_FLAG_SLICE_EXT_DEFAULT_ON) + +/* + * sys_rseq - Register or unregister restartable sequences for the caller thread. + */ +SYSCALL_DEFINE4(rseq, struct rseq __user *, rseq, u32, rseq_len, int, flags, u32, sig) +{ + if (flags & RSEQ_FLAG_UNREGISTER) + return rseq_unregister(rseq, rseq_len, flags, sig); + + if (unlikely(flags & ~RSEQ_FLAGS_SUPPORTED)) + return -EINVAL; + + if (current->rseq.usrptr) + return rseq_reregister(rseq, rseq_len, sig); + + if (!rseq_length_valid(rseq, rseq_len)) + return -EINVAL; + + return rseq_register(rseq, rseq_len, flags, sig); +} + #ifdef CONFIG_RSEQ_SLICE_EXTENSION struct slice_timer { struct hrtimer timer; @@ -713,6 +761,8 @@ int rseq_slice_extension_prctl(unsigned long arg2, unsigned long arg3) return -ENOTSUPP; if (!current->rseq.usrptr) return -ENXIO; + if (!rseq_v2(current)) + return -ENOTSUPP; /* No change? */ if (enable == !!current->rseq.slice.state.enabled) diff --git a/kernel/sched/core.c b/kernel/sched/core.c index da20fb6ea25a..b8871449d3c6 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -4458,6 +4458,7 @@ static void __sched_fork(u64 clone_flags, struct task_struct *p) p->se.nr_migrations = 0; p->se.vruntime = 0; p->se.vlag = 0; + p->se.rel_deadline = 0; INIT_LIST_HEAD(&p->se.group_node); /* A delayed task cannot be in clone(). */ diff --git a/kernel/sched/ext.c b/kernel/sched/ext.c index e426e27b6794..38d90baf78cf 100644 --- a/kernel/sched/ext.c +++ b/kernel/sched/ext.c @@ -32,6 +32,7 @@ static const struct rhashtable_params scx_sched_hash_params = { .key_len = sizeof_field(struct scx_sched, ops.sub_cgroup_id), .key_offset = offsetof(struct scx_sched, ops.sub_cgroup_id), .head_offset = offsetof(struct scx_sched, hash_node), + .insecure_elasticity = true, /* inserted under scx_sched_lock */ }; static struct rhashtable scx_sched_hash; @@ -52,8 +53,6 @@ DEFINE_STATIC_KEY_FALSE(__scx_enabled); DEFINE_STATIC_PERCPU_RWSEM(scx_fork_rwsem); static atomic_t scx_enable_state_var = ATOMIC_INIT(SCX_DISABLED); static DEFINE_RAW_SPINLOCK(scx_bypass_lock); -static cpumask_var_t scx_bypass_lb_donee_cpumask; -static cpumask_var_t scx_bypass_lb_resched_cpumask; static bool scx_init_task_enabled; static bool scx_switching_all; DEFINE_STATIC_KEY_FALSE(__scx_switched_all); @@ -469,24 +468,35 @@ static inline void update_locked_rq(struct rq *rq) __this_cpu_write(scx_locked_rq_state, rq); } -#define SCX_CALL_OP(sch, op, rq, args...) 
\ +/* + * SCX ops can recurse via scx_bpf_sub_dispatch() - the inner call must not + * clobber the outer's scx_locked_rq_state. Save it on entry, restore on exit. + */ +#define SCX_CALL_OP(sch, op, locked_rq, args...) \ do { \ - if (rq) \ - update_locked_rq(rq); \ + struct rq *__prev_locked_rq; \ + \ + if (locked_rq) { \ + __prev_locked_rq = scx_locked_rq(); \ + update_locked_rq(locked_rq); \ + } \ (sch)->ops.op(args); \ - if (rq) \ - update_locked_rq(NULL); \ + if (locked_rq) \ + update_locked_rq(__prev_locked_rq); \ } while (0) -#define SCX_CALL_OP_RET(sch, op, rq, args...) \ +#define SCX_CALL_OP_RET(sch, op, locked_rq, args...) \ ({ \ + struct rq *__prev_locked_rq; \ __typeof__((sch)->ops.op(args)) __ret; \ \ - if (rq) \ - update_locked_rq(rq); \ + if (locked_rq) { \ + __prev_locked_rq = scx_locked_rq(); \ + update_locked_rq(locked_rq); \ + } \ __ret = (sch)->ops.op(args); \ - if (rq) \ - update_locked_rq(NULL); \ + if (locked_rq) \ + update_locked_rq(__prev_locked_rq); \ __ret; \ }) @@ -498,39 +508,39 @@ do { \ * those subject tasks. * * Every SCX_CALL_OP_TASK*() call site invokes its op with @p's rq lock held - - * either via the @rq argument here, or (for ops.select_cpu()) via @p's pi_lock - * held by try_to_wake_up() with rq tracking via scx_rq.in_select_cpu. So if - * kf_tasks[] is set, @p's scheduler-protected fields are stable. + * either via the @locked_rq argument here, or (for ops.select_cpu()) via @p's + * pi_lock held by try_to_wake_up() with rq tracking via scx_rq.in_select_cpu. + * So if kf_tasks[] is set, @p's scheduler-protected fields are stable. * * kf_tasks[] can not stack, so task-based SCX ops must not nest. The * WARN_ON_ONCE() in each macro catches a re-entry of any of the three variants * while a previous one is still in progress. */ -#define SCX_CALL_OP_TASK(sch, op, rq, task, args...) \ +#define SCX_CALL_OP_TASK(sch, op, locked_rq, task, args...) \ do { \ WARN_ON_ONCE(current->scx.kf_tasks[0]); \ current->scx.kf_tasks[0] = task; \ - SCX_CALL_OP((sch), op, rq, task, ##args); \ + SCX_CALL_OP((sch), op, locked_rq, task, ##args); \ current->scx.kf_tasks[0] = NULL; \ } while (0) -#define SCX_CALL_OP_TASK_RET(sch, op, rq, task, args...) \ +#define SCX_CALL_OP_TASK_RET(sch, op, locked_rq, task, args...) \ ({ \ __typeof__((sch)->ops.op(task, ##args)) __ret; \ WARN_ON_ONCE(current->scx.kf_tasks[0]); \ current->scx.kf_tasks[0] = task; \ - __ret = SCX_CALL_OP_RET((sch), op, rq, task, ##args); \ + __ret = SCX_CALL_OP_RET((sch), op, locked_rq, task, ##args); \ current->scx.kf_tasks[0] = NULL; \ __ret; \ }) -#define SCX_CALL_OP_2TASKS_RET(sch, op, rq, task0, task1, args...) \ +#define SCX_CALL_OP_2TASKS_RET(sch, op, locked_rq, task0, task1, args...) 
\ ({ \ __typeof__((sch)->ops.op(task0, task1, ##args)) __ret; \ WARN_ON_ONCE(current->scx.kf_tasks[0]); \ current->scx.kf_tasks[0] = task0; \ current->scx.kf_tasks[1] = task1; \ - __ret = SCX_CALL_OP_RET((sch), op, rq, task0, task1, ##args); \ + __ret = SCX_CALL_OP_RET((sch), op, locked_rq, task0, task1, ##args); \ current->scx.kf_tasks[0] = NULL; \ current->scx.kf_tasks[1] = NULL; \ __ret; \ @@ -756,7 +766,8 @@ static void scx_task_iter_start(struct scx_task_iter *iter, struct cgroup *cgrp) lockdep_assert_held(&cgroup_mutex); iter->cgrp = cgrp; iter->css_pos = css_next_descendant_pre(NULL, &iter->cgrp->self); - css_task_iter_start(iter->css_pos, 0, &iter->css_iter); + css_task_iter_start(iter->css_pos, CSS_TASK_ITER_WITH_DEAD, + &iter->css_iter); return; } #endif @@ -856,7 +867,8 @@ static struct task_struct *scx_task_iter_next(struct scx_task_iter *iter) iter->css_pos = css_next_descendant_pre(iter->css_pos, &iter->cgrp->self); if (iter->css_pos) - css_task_iter_start(iter->css_pos, 0, &iter->css_iter); + css_task_iter_start(iter->css_pos, CSS_TASK_ITER_WITH_DEAD, + &iter->css_iter); } return NULL; } @@ -916,16 +928,27 @@ static struct task_struct *scx_task_iter_next_locked(struct scx_task_iter *iter) * * Test for idle_sched_class as only init_tasks are on it. */ - if (p->sched_class != &idle_sched_class) - break; - } - if (!p) - return NULL; + if (p->sched_class == &idle_sched_class) + continue; - iter->rq = task_rq_lock(p, &iter->rf); - iter->locked_task = p; + iter->rq = task_rq_lock(p, &iter->rf); + iter->locked_task = p; - return p; + /* + * cgroup_task_dead() removes the dead tasks from cset->tasks + * after sched_ext_dead() and cgroup iteration may see tasks + * which already finished sched_ext_dead(). %SCX_TASK_OFF_TASKS + * is set by sched_ext_dead() under @p's rq lock. Test it to + * avoid visiting tasks which are already dead from SCX POV. + */ + if (p->scx.flags & SCX_TASK_OFF_TASKS) { + __scx_task_iter_rq_unlock(iter); + continue; + } + + return p; + } + return NULL; } /** @@ -1388,18 +1411,55 @@ static void call_task_dequeue(struct scx_sched *sch, struct rq *rq, p->scx.flags &= ~SCX_TASK_IN_CUSTODY; } -static void local_dsq_post_enq(struct scx_dispatch_q *dsq, struct task_struct *p, - u64 enq_flags) +static void local_dsq_post_enq(struct scx_sched *sch, struct scx_dispatch_q *dsq, + struct task_struct *p, u64 enq_flags) { struct rq *rq = container_of(dsq, struct rq, scx.local_dsq); - bool preempt = false; - call_task_dequeue(scx_root, rq, p, 0); + call_task_dequeue(sch, rq, p, 0); + + /* + * Note that @rq's lock may be dropped between this enqueue and @p + * actually getting on CPU. This gives higher-class tasks (e.g. RT) + * an opportunity to wake up on @rq and prevent @p from running. + * Here are some concrete examples: + * + * Example 1: + * + * We dispatch two tasks from a single ops.dispatch(): + * - First, a local task to this CPU's local DSQ; + * - Second, a local/remote task to a remote CPU's local DSQ. + * We must drop the local rq lock in order to finish the second + * dispatch. In that time, an RT task can wake up on the local rq. + * + * Example 2: + * + * We dispatch a local/remote task to a remote CPU's local DSQ. + * We must drop the remote rq lock before the dispatched task can run, + * which gives an RT task an opportunity to wake up on the remote rq. + * + * Both examples work the same if we replace dispatching with moving + * the tasks from a user-created DSQ. 
+ * + * We must detect these wakeups so that we can re-enqueue IMMED tasks + * from @rq's local DSQ. scx_wakeup_preempt() serves exactly this + * purpose, but for it to be invoked, we must ensure that we bump + * @rq->next_class to &ext_sched_class if it's currently idle. + * + * wakeup_preempt() does the bumping, and since we only invoke it if + * @rq->next_class is below &ext_sched_class, it will also + * resched_curr(rq). + */ + if (sched_class_above(p->sched_class, rq->next_class)) + wakeup_preempt(rq, p, 0); /* * If @rq is in balance, the CPU is already vacant and looking for the * next task to run. No need to preempt or trigger resched after moving * @p into its local DSQ. + * Note that the wakeup_preempt() above may have already triggered + * a resched if @rq->next_class was idle. It's harmless, since + * need_resched is cleared immediately after task pick. */ if (rq->scx.flags & SCX_RQ_IN_BALANCE) return; @@ -1407,11 +1467,8 @@ static void local_dsq_post_enq(struct scx_dispatch_q *dsq, struct task_struct *p if ((enq_flags & SCX_ENQ_PREEMPT) && p != rq->curr && rq->curr->sched_class == &ext_sched_class) { rq->curr->scx.slice = 0; - preempt = true; - } - - if (preempt || sched_class_above(&ext_sched_class, rq->curr->sched_class)) resched_curr(rq); + } } static void dispatch_enqueue(struct scx_sched *sch, struct rq *rq, @@ -1494,11 +1551,13 @@ static void dispatch_enqueue(struct scx_sched *sch, struct rq *rq, if (!(dsq->id & SCX_DSQ_FLAG_BUILTIN)) rcu_assign_pointer(dsq->first_task, p); } else { - bool was_empty; - - was_empty = list_empty(&dsq->list); + /* + * dsq->list can contain parked BPF iterator cursors, so + * list_empty() here isn't a reliable proxy for "no real + * task in the DSQ". Test dsq->first_task directly. + */ list_add_tail(&p->scx.dsq_list.node, &dsq->list); - if (was_empty && !(dsq->id & SCX_DSQ_FLAG_BUILTIN)) + if (!dsq->first_task && !(dsq->id & SCX_DSQ_FLAG_BUILTIN)) rcu_assign_pointer(dsq->first_task, p); } } @@ -1518,7 +1577,7 @@ static void dispatch_enqueue(struct scx_sched *sch, struct rq *rq, * concurrently in a non-atomic way. 
*/ if (is_local) { - local_dsq_post_enq(dsq, p, enq_flags); + local_dsq_post_enq(sch, dsq, p, enq_flags); } else { /* * Task on global/bypass DSQ: leave custody, task on @@ -2129,7 +2188,8 @@ static void wakeup_preempt_scx(struct rq *rq, struct task_struct *p, int wake_fl schedule_reenq_local(rq, 0); } -static void move_local_task_to_local_dsq(struct task_struct *p, u64 enq_flags, +static void move_local_task_to_local_dsq(struct scx_sched *sch, + struct task_struct *p, u64 enq_flags, struct scx_dispatch_q *src_dsq, struct rq *dst_rq) { @@ -2149,7 +2209,7 @@ static void move_local_task_to_local_dsq(struct task_struct *p, u64 enq_flags, dsq_inc_nr(dst_dsq, p, enq_flags); p->scx.dsq = dst_dsq; - local_dsq_post_enq(dst_dsq, p, enq_flags); + local_dsq_post_enq(sch, dst_dsq, p, enq_flags); } /** @@ -2370,7 +2430,7 @@ static struct rq *move_task_between_dsqs(struct scx_sched *sch, /* @p is going from a non-local DSQ to a local DSQ */ if (src_rq == dst_rq) { task_unlink_from_dsq(p, src_dsq); - move_local_task_to_local_dsq(p, enq_flags, + move_local_task_to_local_dsq(sch, p, enq_flags, src_dsq, dst_rq); raw_spin_unlock(&src_dsq->lock); } else { @@ -2423,7 +2483,7 @@ retry: if (rq == task_rq) { task_unlink_from_dsq(p, dsq); - move_local_task_to_local_dsq(p, enq_flags, dsq, rq); + move_local_task_to_local_dsq(sch, p, enq_flags, dsq, rq); raw_spin_unlock(&dsq->lock); return true; } @@ -3183,7 +3243,7 @@ bool scx_prio_less(const struct task_struct *a, const struct task_struct *b, if (sch_a == sch_b && SCX_HAS_OP(sch_a, core_sched_before) && !scx_bypassing(sch_a, task_cpu(a))) return SCX_CALL_OP_2TASKS_RET(sch_a, core_sched_before, - NULL, + task_rq(a), (struct task_struct *)a, (struct task_struct *)b); else @@ -3631,6 +3691,22 @@ static void __scx_disable_and_exit_task(struct scx_sched *sch, SCX_CALL_OP_TASK(sch, exit_task, task_rq(p), p, &args); } +/* + * Undo a completed __scx_init_task(sch, p, false) when scx_enable_task() never + * ran. The task state has not been transitioned, so this mirrors the + * SCX_TASK_INIT branch in __scx_disable_and_exit_task(). + */ +static void scx_sub_init_cancel_task(struct scx_sched *sch, struct task_struct *p) +{ + struct scx_exit_task_args args = { .cancelled = true }; + + lockdep_assert_held(&p->pi_lock); + lockdep_assert_rq_held(task_rq(p)); + + if (SCX_HAS_OP(sch, exit_task)) + SCX_CALL_OP_TASK(sch, exit_task, task_rq(p), p, &args); +} + static void scx_disable_and_exit_task(struct scx_sched *sch, struct task_struct *p) { @@ -3639,11 +3715,12 @@ static void scx_disable_and_exit_task(struct scx_sched *sch, /* * If set, @p exited between __scx_init_task() and scx_enable_task() in * scx_sub_enable() and is initialized for both the associated sched and - * its parent. Disable and exit for the child too. + * its parent. Exit for the child too - scx_enable_task() never ran for + * it, so undo only init_task. */ - if ((p->scx.flags & SCX_TASK_SUB_INIT) && - !WARN_ON_ONCE(!scx_enabling_sub_sched)) { - __scx_disable_and_exit_task(scx_enabling_sub_sched, p); + if (p->scx.flags & SCX_TASK_SUB_INIT) { + if (!WARN_ON_ONCE(!scx_enabling_sub_sched)) + scx_sub_init_cancel_task(scx_enabling_sub_sched, p); p->scx.flags &= ~SCX_TASK_SUB_INIT; } @@ -3784,6 +3861,11 @@ void sched_ext_dead(struct task_struct *p) /* * @p is off scx_tasks and wholly ours. scx_root_enable()'s READY -> * ENABLED transitions can't race us. Disable ops for @p. + * + * %SCX_TASK_OFF_TASKS synchronizes against cgroup task iteration - see + * scx_task_iter_next_locked(). 
NONE tasks need no marking: cgroup + * iteration is only used from sub-sched paths, which require root + * enabled. Root enable transitions every live task to at least READY. */ if (scx_get_task_state(p) != SCX_TASK_NONE) { struct rq_flags rf; @@ -3791,6 +3873,7 @@ void sched_ext_dead(struct task_struct *p) rq = task_rq_lock(p, &rf); scx_disable_and_exit_task(scx_task_sched(p), p); + p->scx.flags |= SCX_TASK_OFF_TASKS; task_rq_unlock(rq, p, &rf); } } @@ -4324,9 +4407,10 @@ void scx_cgroup_cancel_attach(struct cgroup_taskset *tset) void scx_group_set_weight(struct task_group *tg, unsigned long weight) { - struct scx_sched *sch = scx_root; + struct scx_sched *sch; percpu_down_read(&scx_cgroup_ops_rwsem); + sch = scx_root; if (scx_cgroup_enabled && SCX_HAS_OP(sch, cgroup_set_weight) && tg->scx.weight != weight) @@ -4339,9 +4423,10 @@ void scx_group_set_weight(struct task_group *tg, unsigned long weight) void scx_group_set_idle(struct task_group *tg, bool idle) { - struct scx_sched *sch = scx_root; + struct scx_sched *sch; percpu_down_read(&scx_cgroup_ops_rwsem); + sch = scx_root; if (scx_cgroup_enabled && SCX_HAS_OP(sch, cgroup_set_idle)) SCX_CALL_OP(sch, cgroup_set_idle, NULL, tg_cgrp(tg), idle); @@ -4355,9 +4440,10 @@ void scx_group_set_idle(struct task_group *tg, bool idle) void scx_group_set_bandwidth(struct task_group *tg, u64 period_us, u64 quota_us, u64 burst_us) { - struct scx_sched *sch = scx_root; + struct scx_sched *sch; percpu_down_read(&scx_cgroup_ops_rwsem); + sch = scx_root; if (scx_cgroup_enabled && SCX_HAS_OP(sch, cgroup_set_bandwidth) && (tg->scx.bw_period_us != period_us || @@ -4380,21 +4466,6 @@ static struct cgroup *root_cgroup(void) return &cgrp_dfl_root.cgrp; } -static struct cgroup *sch_cgroup(struct scx_sched *sch) -{ - return sch->cgrp; -} - -/* for each descendant of @cgrp including self, set ->scx_sched to @sch */ -static void set_cgroup_sched(struct cgroup *cgrp, struct scx_sched *sch) -{ - struct cgroup *pos; - struct cgroup_subsys_state *css; - - cgroup_for_each_live_descendant_pre(pos, css, cgrp) - rcu_assign_pointer(pos->scx_sched, sch); -} - static void scx_cgroup_lock(void) { #ifdef CONFIG_EXT_GROUP_SCHED @@ -4412,12 +4483,30 @@ static void scx_cgroup_unlock(void) } #else /* CONFIG_EXT_GROUP_SCHED || CONFIG_EXT_SUB_SCHED */ static struct cgroup *root_cgroup(void) { return NULL; } -static struct cgroup *sch_cgroup(struct scx_sched *sch) { return NULL; } -static void set_cgroup_sched(struct cgroup *cgrp, struct scx_sched *sch) {} static void scx_cgroup_lock(void) {} static void scx_cgroup_unlock(void) {} #endif /* CONFIG_EXT_GROUP_SCHED || CONFIG_EXT_SUB_SCHED */ +#ifdef CONFIG_EXT_SUB_SCHED +static struct cgroup *sch_cgroup(struct scx_sched *sch) +{ + return sch->cgrp; +} + +/* for each descendant of @cgrp including self, set ->scx_sched to @sch */ +static void set_cgroup_sched(struct cgroup *cgrp, struct scx_sched *sch) +{ + struct cgroup *pos; + struct cgroup_subsys_state *css; + + cgroup_for_each_live_descendant_pre(pos, css, cgrp) + rcu_assign_pointer(pos->scx_sched, sch); +} +#else /* CONFIG_EXT_SUB_SCHED */ +static struct cgroup *sch_cgroup(struct scx_sched *sch) { return NULL; } +static void set_cgroup_sched(struct cgroup *cgrp, struct scx_sched *sch) {} +#endif /* CONFIG_EXT_SUB_SCHED */ + /* * Omitted operations: * @@ -4712,6 +4801,8 @@ static void scx_sched_free_rcu_work(struct work_struct *work) irq_work_sync(&sch->disable_irq_work); kthread_destroy_worker(sch->helper); timer_shutdown_sync(&sch->bypass_lb_timer); + 
free_cpumask_var(sch->bypass_lb_donee_cpumask); + free_cpumask_var(sch->bypass_lb_resched_cpumask); #ifdef CONFIG_EXT_SUB_SCHED kfree(sch->cgrp_path); @@ -4938,6 +5029,25 @@ void scx_softlockup(u32 dur_s) smp_processor_id(), dur_s); } +/* + * scx_hardlockup() runs from NMI and eventually calls scx_claim_exit(), + * which takes scx_sched_lock. scx_sched_lock isn't NMI-safe and grabbing + * it from NMI context can lead to deadlocks. Defer via irq_work; the + * disable path runs off irq_work anyway. + */ +static atomic_t scx_hardlockup_cpu = ATOMIC_INIT(-1); + +static void scx_hardlockup_irq_workfn(struct irq_work *work) +{ + int cpu = atomic_xchg(&scx_hardlockup_cpu, -1); + + if (cpu >= 0 && handle_lockup("hard lockup - CPU %d", cpu)) + printk_deferred(KERN_ERR "sched_ext: Hard lockup - CPU %d, disabling BPF scheduler\n", + cpu); +} + +static DEFINE_IRQ_WORK(scx_hardlockup_irq_work, scx_hardlockup_irq_workfn); + /** * scx_hardlockup - sched_ext hardlockup handler * @@ -4946,17 +5056,19 @@ void scx_softlockup(u32 dur_s) * Try kicking out the current scheduler in an attempt to recover the system to * a good state before taking more drastic actions. * - * Returns %true if sched_ext is enabled and abort was initiated, which may - * resolve the reported hardlockup. %false if sched_ext is not enabled or - * someone else already initiated abort. + * Queues an irq_work; the handle_lockup() call happens in IRQ context (see + * scx_hardlockup_irq_workfn). + * + * Returns %true if sched_ext is enabled and the work was queued, %false + * otherwise. */ bool scx_hardlockup(int cpu) { - if (!handle_lockup("hard lockup - CPU %d", cpu)) + if (!rcu_access_pointer(scx_root)) return false; - printk_deferred(KERN_ERR "sched_ext: Hard lockup - CPU %d, disabling BPF scheduler\n", - cpu); + atomic_cmpxchg(&scx_hardlockup_cpu, -1, cpu); + irq_work_queue(&scx_hardlockup_irq_work); return true; } @@ -5000,6 +5112,15 @@ resume: if (cpumask_empty(donee_mask)) break; + /* + * If an earlier pass placed @p on @donor_dsq from a different + * CPU and the donee hasn't consumed it yet, @p is still on the + * previous CPU and task_rq(@p) != @donor_rq. @p can't be moved + * without its rq locked. Skip. + */ + if (task_rq(p) != donor_rq) + continue; + donee = cpumask_any_and_distribute(donee_mask, p->cpus_ptr); if (donee >= nr_cpu_ids) continue; @@ -5058,8 +5179,8 @@ resume: static void bypass_lb_node(struct scx_sched *sch, int node) { const struct cpumask *node_mask = cpumask_of_node(node); - struct cpumask *donee_mask = scx_bypass_lb_donee_cpumask; - struct cpumask *resched_mask = scx_bypass_lb_resched_cpumask; + struct cpumask *donee_mask = sch->bypass_lb_donee_cpumask; + struct cpumask *resched_mask = sch->bypass_lb_resched_cpumask; u32 nr_tasks = 0, nr_cpus = 0, nr_balanced = 0; u32 nr_target, nr_donor_target; u32 before_min = U32_MAX, before_max = 0; @@ -5698,6 +5819,8 @@ static void scx_sub_disable(struct scx_sched *sch) if (sch->ops.exit) SCX_CALL_OP(sch, exit, NULL, sch->exit_info); + if (sch->sub_kset) + kset_unregister(sch->sub_kset); kobject_del(&sch->kobj); } #else /* CONFIG_EXT_SUB_SCHED */ @@ -5829,6 +5952,10 @@ static void scx_root_disable(struct scx_sched *sch) * could observe an object of the same name still in the hierarchy when * the next scheduler is loaded. 
*/ +#ifdef CONFIG_EXT_SUB_SCHED + if (sch->sub_kset) + kset_unregister(sch->sub_kset); +#endif kobject_del(&sch->kobj); free_kick_syncs(); @@ -5921,6 +6048,25 @@ static void scx_disable(struct scx_sched *sch, enum scx_exit_kind kind) irq_work_queue(&sch->disable_irq_work); } +/** + * scx_flush_disable_work - flush the disable work and wait for it to finish + * @sch: the scheduler + * + * sch->disable_work might still not queued, causing kthread_flush_work() + * as a noop. Syncing the irq_work first is required to guarantee the + * kthread work has been queued before waiting for it. + */ +static void scx_flush_disable_work(struct scx_sched *sch) +{ + int kind; + + do { + irq_work_sync(&sch->disable_irq_work); + kthread_flush_work(&sch->disable_work); + kind = atomic_read(&sch->exit_kind); + } while (kind != SCX_EXIT_NONE && kind != SCX_EXIT_DONE); +} + static void dump_newline(struct seq_buf *s) { trace_sched_ext_dump(""); @@ -6032,9 +6178,8 @@ static void ops_dump_exit(void) scx_dump_data.cpu = -1; } -static void scx_dump_task(struct scx_sched *sch, - struct seq_buf *s, struct scx_dump_ctx *dctx, - struct task_struct *p, char marker) +static void scx_dump_task(struct scx_sched *sch, struct seq_buf *s, struct scx_dump_ctx *dctx, + struct rq *rq, struct task_struct *p, char marker) { static unsigned long bt[SCX_EXIT_BT_LEN]; struct scx_sched *task_sch = scx_task_sched(p); @@ -6075,7 +6220,7 @@ static void scx_dump_task(struct scx_sched *sch, if (SCX_HAS_OP(sch, dump_task)) { ops_dump_init(s, " "); - SCX_CALL_OP(sch, dump_task, NULL, dctx, p); + SCX_CALL_OP(sch, dump_task, rq, dctx, p); ops_dump_exit(); } @@ -6199,8 +6344,7 @@ static void scx_dump_state(struct scx_sched *sch, struct scx_exit_info *ei, used = seq_buf_used(&ns); if (SCX_HAS_OP(sch, dump_cpu)) { ops_dump_init(&ns, " "); - SCX_CALL_OP(sch, dump_cpu, NULL, - &dctx, cpu, idle); + SCX_CALL_OP(sch, dump_cpu, rq, &dctx, cpu, idle); ops_dump_exit(); } @@ -6223,11 +6367,11 @@ static void scx_dump_state(struct scx_sched *sch, struct scx_exit_info *ei, if (rq->curr->sched_class == &ext_sched_class && (dump_all_tasks || scx_task_on_sched(sch, rq->curr))) - scx_dump_task(sch, &s, &dctx, rq->curr, '*'); + scx_dump_task(sch, &s, &dctx, rq, rq->curr, '*'); list_for_each_entry(p, &rq->scx.runnable_list, scx.runnable_node) if (dump_all_tasks || scx_task_on_sched(sch, p)) - scx_dump_task(sch, &s, &dctx, p, ' '); + scx_dump_task(sch, &s, &dctx, rq, p, ' '); next: rq_unlock_irqrestore(rq, &rf); } @@ -6437,6 +6581,15 @@ static struct scx_sched *scx_alloc_and_add_sched(struct sched_ext_ops *ops, init_irq_work(&sch->disable_irq_work, scx_disable_irq_workfn); kthread_init_work(&sch->disable_work, scx_disable_workfn); timer_setup(&sch->bypass_lb_timer, scx_bypass_lb_timerfn, 0); + + if (!alloc_cpumask_var(&sch->bypass_lb_donee_cpumask, GFP_KERNEL)) { + ret = -ENOMEM; + goto err_stop_helper; + } + if (!alloc_cpumask_var(&sch->bypass_lb_resched_cpumask, GFP_KERNEL)) { + ret = -ENOMEM; + goto err_free_lb_cpumask; + } sch->ops = *ops; rcu_assign_pointer(ops->priv, sch); @@ -6446,14 +6599,14 @@ static struct scx_sched *scx_alloc_and_add_sched(struct sched_ext_ops *ops, char *buf = kzalloc(PATH_MAX, GFP_KERNEL); if (!buf) { ret = -ENOMEM; - goto err_stop_helper; + goto err_free_lb_resched; } cgroup_path(cgrp, buf, PATH_MAX); sch->cgrp_path = kstrdup(buf, GFP_KERNEL); kfree(buf); if (!sch->cgrp_path) { ret = -ENOMEM; - goto err_stop_helper; + goto err_free_lb_resched; } sch->cgrp = cgrp; @@ -6488,10 +6641,12 @@ static struct scx_sched 
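
scx_flush_disable_work() above exists because the disable path is a two-stage handoff: an irq_work queues a kthread_work. Flushing only the kthread_work can return before anything was queued if the irq_work hasn't run yet, so the irq_work must be synced first. A minimal sketch of that ordering, using hypothetical names and omitting the exit_kind retry loop:

#include <linux/container_of.h>
#include <linux/irq_work.h>
#include <linux/kthread.h>

struct defer_ctx {
        struct irq_work trigger;
        struct kthread_work work;
        struct kthread_worker *worker;
};

static void trigger_fn(struct irq_work *iw)
{
        struct defer_ctx *ctx = container_of(iw, struct defer_ctx, trigger);

        kthread_queue_work(ctx->worker, &ctx->work);
}

static void defer_ctx_init(struct defer_ctx *ctx, struct kthread_worker *worker,
                           kthread_work_func_t fn)
{
        ctx->worker = worker;
        init_irq_work(&ctx->trigger, trigger_fn);
        kthread_init_work(&ctx->work, fn);
}

static void wait_for_deferred_work(struct defer_ctx *ctx)
{
        /* Make sure trigger_fn() has run and queued the kthread work ... */
        irq_work_sync(&ctx->trigger);
        /* ... only then is flushing it guaranteed to wait for the handler. */
        kthread_flush_work(&ctx->work);
}
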
*scx_alloc_and_add_sched(struct sched_ext_ops *ops, #endif /* CONFIG_EXT_SUB_SCHED */ return sch; -#ifdef CONFIG_EXT_SUB_SCHED +err_free_lb_resched: + free_cpumask_var(sch->bypass_lb_resched_cpumask); +err_free_lb_cpumask: + free_cpumask_var(sch->bypass_lb_donee_cpumask); err_stop_helper: kthread_destroy_worker(sch->helper); -#endif err_free_pcpu: for_each_possible_cpu(cpu) { if (cpu == bypass_fail_cpu) @@ -6510,7 +6665,7 @@ err_free_ei: err_free_sch: kfree(sch); err_put_cgrp: -#if defined(CONFIG_EXT_GROUP_SCHED) || defined(CONFIG_EXT_SUB_SCHED) +#ifdef CONFIG_EXT_SUB_SCHED cgroup_put(cgrp); #endif return ERR_PTR(ret); @@ -6601,7 +6756,7 @@ static void scx_root_enable_workfn(struct kthread_work *work) if (ret) goto err_unlock; -#if defined(CONFIG_EXT_GROUP_SCHED) || defined(CONFIG_EXT_SUB_SCHED) +#ifdef CONFIG_EXT_SUB_SCHED cgroup_get(cgrp); #endif sch = scx_alloc_and_add_sched(ops, cgrp, NULL); @@ -6639,8 +6794,10 @@ static void scx_root_enable_workfn(struct kthread_work *work) rcu_assign_pointer(scx_root, sch); ret = scx_link_sched(sch); - if (ret) + if (ret) { + cpus_read_unlock(); goto err_disable; + } scx_idle_enable(ops); @@ -6821,7 +6978,7 @@ err_disable: * completion. sch's base reference will be put by bpf_scx_unreg(). */ scx_error(sch, "scx_root_enable() failed (%d)", ret); - kthread_flush_work(&sch->disable_work); + scx_flush_disable_work(sch); cmd->ret = 0; } @@ -7072,23 +7229,30 @@ out_unlock: abort: put_task_struct(p); scx_task_iter_stop(&sti); - scx_enabling_sub_sched = NULL; + /* + * Undo __scx_init_task() for tasks we marked. scx_enable_task() never + * ran for @sch on them, so calling scx_disable_task() here would invoke + * ops.disable() without a matching ops.enable(). scx_enabling_sub_sched + * must stay set until SUB_INIT is cleared from every marked task - + * scx_disable_and_exit_task() reads it when a task exits concurrently. + */ scx_task_iter_start(&sti, sch->cgrp); while ((p = scx_task_iter_next_locked(&sti))) { if (p->scx.flags & SCX_TASK_SUB_INIT) { - __scx_disable_and_exit_task(sch, p); + scx_sub_init_cancel_task(sch, p); p->scx.flags &= ~SCX_TASK_SUB_INIT; } } scx_task_iter_stop(&sti); + scx_enabling_sub_sched = NULL; err_unlock_and_disable: /* we'll soon enter disable path, keep bypass on */ scx_cgroup_unlock(); percpu_up_write(&scx_fork_rwsem); err_disable: mutex_unlock(&scx_enable_mutex); - kthread_flush_work(&sch->disable_work); + scx_flush_disable_work(sch); cmd->ret = 0; } @@ -7349,7 +7513,7 @@ static void bpf_scx_unreg(void *kdata, struct bpf_link *link) struct scx_sched *sch = rcu_dereference_protected(ops->priv, true); scx_disable(sch, SCX_EXIT_UNREG); - kthread_flush_work(&sch->disable_work); + scx_flush_disable_work(sch); RCU_INIT_POINTER(ops->priv, NULL); kobject_put(&sch->kobj); } @@ -8033,12 +8197,22 @@ static bool scx_dsq_move(struct bpf_iter_scx_dsq_kern *kit, struct task_struct *p, u64 dsq_id, u64 enq_flags) { struct scx_dispatch_q *src_dsq = kit->dsq, *dst_dsq; - struct scx_sched *sch = src_dsq->sched; + struct scx_sched *sch; struct rq *this_rq, *src_rq, *locked_rq; bool dispatched = false; bool in_balance; unsigned long flags; + /* + * The verifier considers an iterator slot initialized on any + * KF_ITER_NEW return, so a BPF program may legally reach here after + * bpf_iter_scx_dsq_new() failed and left @kit->dsq NULL. 
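
The new err_free_lb_resched/err_free_lb_cpumask labels above extend the usual inverse-order unwind ladder: each allocation gains a label that frees everything allocated before it. A compact sketch of the same shape for a pair of cpumask_var_t fields (hypothetical demo_sched type, not the patch's struct scx_sched):

#include <linux/cpumask.h>
#include <linux/slab.h>

struct demo_sched {
        cpumask_var_t donee_mask;
        cpumask_var_t resched_mask;
};

static struct demo_sched *demo_sched_alloc(void)
{
        struct demo_sched *s = kzalloc(sizeof(*s), GFP_KERNEL);

        if (!s)
                return NULL;
        if (!alloc_cpumask_var(&s->donee_mask, GFP_KERNEL))
                goto err_free_s;
        if (!alloc_cpumask_var(&s->resched_mask, GFP_KERNEL))
                goto err_free_donee;
        return s;

err_free_donee:
        free_cpumask_var(s->donee_mask);
err_free_s:
        kfree(s);
        return NULL;
}

static void demo_sched_free(struct demo_sched *s)
{
        /* free in the reverse order of allocation */
        free_cpumask_var(s->resched_mask);
        free_cpumask_var(s->donee_mask);
        kfree(s);
}
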
+ */ + if (unlikely(!src_dsq)) + return false; + + sch = src_dsq->sched; + if (!scx_vet_enq_flags(sch, dsq_id, &enq_flags)) return false; @@ -8526,7 +8700,7 @@ __bpf_kfunc bool scx_bpf_task_set_slice(struct task_struct *p, u64 slice, guard(rcu)(); sch = scx_prog_sched(aux); - if (unlikely(!scx_task_on_sched(sch, p))) + if (unlikely(!sch || !scx_task_on_sched(sch, p))) return false; p->scx.slice = slice; @@ -8549,7 +8723,7 @@ __bpf_kfunc bool scx_bpf_task_set_dsq_vtime(struct task_struct *p, u64 vtime, guard(rcu)(); sch = scx_prog_sched(aux); - if (unlikely(!scx_task_on_sched(sch, p))) + if (unlikely(!sch || !scx_task_on_sched(sch, p))) return false; p->scx.dsq_vtime = vtime; @@ -8633,11 +8807,12 @@ __bpf_kfunc void scx_bpf_kick_cpu(s32 cpu, u64 flags, const struct bpf_prog_aux /** * scx_bpf_dsq_nr_queued - Return the number of queued tasks * @dsq_id: id of the DSQ + * @aux: implicit BPF argument to access bpf_prog_aux hidden from BPF progs * * Return the number of tasks in the DSQ matching @dsq_id. If not found, * -%ENOENT is returned. */ -__bpf_kfunc s32 scx_bpf_dsq_nr_queued(u64 dsq_id) +__bpf_kfunc s32 scx_bpf_dsq_nr_queued(u64 dsq_id, const struct bpf_prog_aux *aux) { struct scx_sched *sch; struct scx_dispatch_q *dsq; @@ -8645,7 +8820,7 @@ __bpf_kfunc s32 scx_bpf_dsq_nr_queued(u64 dsq_id) preempt_disable(); - sch = rcu_dereference_sched(scx_root); + sch = scx_prog_sched(aux); if (unlikely(!sch)) { ret = -ENODEV; goto out; @@ -8677,21 +8852,21 @@ out: /** * scx_bpf_destroy_dsq - Destroy a custom DSQ * @dsq_id: DSQ to destroy + * @aux: implicit BPF argument to access bpf_prog_aux hidden from BPF progs * * Destroy the custom DSQ identified by @dsq_id. Only DSQs created with * scx_bpf_create_dsq() can be destroyed. The caller must ensure that the DSQ is * empty and no further tasks are dispatched to it. Ignored if called on a DSQ * which doesn't exist. Can be called from any online scx_ops operations. */ -__bpf_kfunc void scx_bpf_destroy_dsq(u64 dsq_id) +__bpf_kfunc void scx_bpf_destroy_dsq(u64 dsq_id, const struct bpf_prog_aux *aux) { struct scx_sched *sch; - rcu_read_lock(); - sch = rcu_dereference(scx_root); + guard(rcu)(); + sch = scx_prog_sched(aux); if (sch) destroy_dsq(sch, dsq_id); - rcu_read_unlock(); } /** @@ -9445,8 +9620,8 @@ BTF_KFUNCS_START(scx_kfunc_ids_any) BTF_ID_FLAGS(func, scx_bpf_task_set_slice, KF_IMPLICIT_ARGS | KF_RCU); BTF_ID_FLAGS(func, scx_bpf_task_set_dsq_vtime, KF_IMPLICIT_ARGS | KF_RCU); BTF_ID_FLAGS(func, scx_bpf_kick_cpu, KF_IMPLICIT_ARGS) -BTF_ID_FLAGS(func, scx_bpf_dsq_nr_queued) -BTF_ID_FLAGS(func, scx_bpf_destroy_dsq) +BTF_ID_FLAGS(func, scx_bpf_dsq_nr_queued, KF_IMPLICIT_ARGS) +BTF_ID_FLAGS(func, scx_bpf_destroy_dsq, KF_IMPLICIT_ARGS) BTF_ID_FLAGS(func, scx_bpf_dsq_peek, KF_IMPLICIT_ARGS | KF_RCU_PROTECTED | KF_RET_NULL) BTF_ID_FLAGS(func, scx_bpf_dsq_reenq, KF_IMPLICIT_ARGS) BTF_ID_FLAGS(func, scx_bpf_reenqueue_local___v2, KF_IMPLICIT_ARGS) @@ -9479,6 +9654,7 @@ BTF_KFUNCS_END(scx_kfunc_ids_any) static const struct btf_kfunc_id_set scx_kfunc_set_any = { .owner = THIS_MODULE, .set = &scx_kfunc_ids_any, + .filter = scx_kfunc_context_filter, }; /* @@ -9526,13 +9702,12 @@ static const u32 scx_kf_allow_flags[] = { }; /* - * Verifier-time filter for context-sensitive SCX kfuncs. Registered via the - * .filter field on each per-group btf_kfunc_id_set. The BPF core invokes this - * for every kfunc call in the registered hook (BPF_PROG_TYPE_STRUCT_OPS or + * Verifier-time filter for SCX kfuncs. 
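
scx_kfunc_set_any gaining a .filter callback above uses the generic BTF kfunc facility: a btf_kfunc_id_set may carry a verifier-time filter that is consulted for every kfunc call site in the registered hook. A hedged sketch of how such a set is declared and gated, with invented kfunc and set names (the registration call appears only in the trailing comment):

#include <linux/bpf.h>
#include <linux/btf.h>
#include <linux/btf_ids.h>
#include <linux/errno.h>
#include <linux/ktime.h>
#include <linux/module.h>

__bpf_kfunc u64 demo_kfunc_now(void)
{
        return ktime_get_ns();
}

BTF_KFUNCS_START(demo_kfunc_ids)
BTF_ID_FLAGS(func, demo_kfunc_now)
BTF_KFUNCS_END(demo_kfunc_ids)

/* Called by the verifier for every kfunc call site in the hook. */
static int demo_kfunc_filter(const struct bpf_prog *prog, u32 kfunc_id)
{
        /* Not one of ours: fall through to "allow". */
        if (!btf_id_set8_contains(&demo_kfunc_ids, kfunc_id))
                return 0;

        /* Restrict our kfuncs to struct_ops programs. */
        return prog->type == BPF_PROG_TYPE_STRUCT_OPS ? 0 : -EACCES;
}

static const struct btf_kfunc_id_set demo_kfunc_set = {
        .owner  = THIS_MODULE,
        .set    = &demo_kfunc_ids,
        .filter = demo_kfunc_filter,
};

/* Registered once, e.g. from an initcall:
 *      register_btf_kfunc_id_set(BPF_PROG_TYPE_STRUCT_OPS, &demo_kfunc_set);
 */
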
Registered via the .filter field on + * each per-group btf_kfunc_id_set. The BPF core invokes this for every kfunc + * call in the registered hook (BPF_PROG_TYPE_STRUCT_OPS or * BPF_PROG_TYPE_SYSCALL), regardless of which set originally introduced the - * kfunc - so the filter must short-circuit on kfuncs it doesn't govern (e.g. - * scx_kfunc_ids_any) by falling through to "allow" when none of the - * context-sensitive sets contain the kfunc. + * kfunc - so the filter must short-circuit on kfuncs it doesn't govern by + * falling through to "allow" when none of the SCX sets contain the kfunc. */ int scx_kfunc_context_filter(const struct bpf_prog *prog, u32 kfunc_id) { @@ -9541,18 +9716,21 @@ int scx_kfunc_context_filter(const struct bpf_prog *prog, u32 kfunc_id) bool in_enqueue = btf_id_set8_contains(&scx_kfunc_ids_enqueue_dispatch, kfunc_id); bool in_dispatch = btf_id_set8_contains(&scx_kfunc_ids_dispatch, kfunc_id); bool in_cpu_release = btf_id_set8_contains(&scx_kfunc_ids_cpu_release, kfunc_id); + bool in_idle = btf_id_set8_contains(&scx_kfunc_ids_idle, kfunc_id); + bool in_any = btf_id_set8_contains(&scx_kfunc_ids_any, kfunc_id); u32 moff, flags; - /* Not a context-sensitive kfunc (e.g. from scx_kfunc_ids_any) - allow. */ - if (!(in_unlocked || in_select_cpu || in_enqueue || in_dispatch || in_cpu_release)) + /* Not an SCX kfunc - allow. */ + if (!(in_unlocked || in_select_cpu || in_enqueue || in_dispatch || + in_cpu_release || in_idle || in_any)) return 0; /* SYSCALL progs (e.g. BPF test_run()) may call unlocked and select_cpu kfuncs. */ if (prog->type == BPF_PROG_TYPE_SYSCALL) - return (in_unlocked || in_select_cpu) ? 0 : -EACCES; + return (in_unlocked || in_select_cpu || in_idle || in_any) ? 0 : -EACCES; if (prog->type != BPF_PROG_TYPE_STRUCT_OPS) - return -EACCES; + return (in_any || in_idle) ? 0 : -EACCES; /* * add_subprog_and_kfunc() collects all kfunc calls, including dead code @@ -9565,14 +9743,15 @@ int scx_kfunc_context_filter(const struct bpf_prog *prog, u32 kfunc_id) return 0; /* - * Non-SCX struct_ops: only unlocked kfuncs are safe. The other - * context-sensitive kfuncs assume the rq lock is held by the SCX - * dispatch path, which doesn't apply to other struct_ops users. + * Non-SCX struct_ops: SCX kfuncs are not permitted. */ if (prog->aux->st_ops != &bpf_sched_ext_ops) - return in_unlocked ? 0 : -EACCES; + return -EACCES; /* SCX struct_ops: check the per-op allow list. */ + if (in_any || in_idle) + return 0; + moff = prog->aux->attach_st_ops_member_off; flags = scx_kf_allow_flags[SCX_MOFF_IDX(moff)]; @@ -9656,12 +9835,6 @@ static int __init scx_init(void) return ret; } - if (!alloc_cpumask_var(&scx_bypass_lb_donee_cpumask, GFP_KERNEL) || - !alloc_cpumask_var(&scx_bypass_lb_resched_cpumask, GFP_KERNEL)) { - pr_err("sched_ext: Failed to allocate cpumasks\n"); - return -ENOMEM; - } - return 0; } __initcall(scx_init); diff --git a/kernel/sched/ext_idle.c b/kernel/sched/ext_idle.c index 443d12a3df67..6e1980763270 100644 --- a/kernel/sched/ext_idle.c +++ b/kernel/sched/ext_idle.c @@ -466,12 +466,6 @@ s32 scx_select_cpu_dfl(struct task_struct *p, s32 prev_cpu, u64 wake_flags, preempt_disable(); /* - * Check whether @prev_cpu is still within the allowed set. If not, - * we can still try selecting a nearby CPU. - */ - is_prev_allowed = cpumask_test_cpu(prev_cpu, allowed); - - /* * Determine the subset of CPUs usable by @p within @cpus_allowed. 
*/ if (allowed != p->cpus_ptr) { @@ -488,6 +482,12 @@ s32 scx_select_cpu_dfl(struct task_struct *p, s32 prev_cpu, u64 wake_flags, } /* + * Check whether @prev_cpu is still within the allowed set. If not, + * we can still try selecting a nearby CPU. + */ + is_prev_allowed = cpumask_test_cpu(prev_cpu, allowed); + + /* * This is necessary to protect llc_cpus. */ rcu_read_lock(); @@ -927,14 +927,24 @@ static s32 select_cpu_from_kfunc(struct scx_sched *sch, struct task_struct *p, * Accessing p->cpus_ptr / p->nr_cpus_allowed needs either @p's rq * lock or @p's pi_lock. Three cases: * - * - inside ops.select_cpu(): try_to_wake_up() holds @p's pi_lock. + * - inside ops.select_cpu(): try_to_wake_up() holds the wake-up + * task's pi_lock; the wake-up task is recorded in kf_tasks[0] + * by SCX_CALL_OP_TASK_RET(). * - other rq-locked SCX op: scx_locked_rq() points at the held rq. * - truly unlocked (UNLOCKED ops, SYSCALL, non-SCX struct_ops): * nothing held, take pi_lock ourselves. + * + * In the first two cases, BPF schedulers may pass an arbitrary task + * that the held lock doesn't cover. Refuse those. */ if (this_rq()->scx.in_select_cpu) { + if (!scx_kf_arg_task_ok(sch, p)) + return -EINVAL; lockdep_assert_held(&p->pi_lock); - } else if (!scx_locked_rq()) { + } else if (scx_locked_rq()) { + if (task_rq(p) != scx_locked_rq()) + goto cross_task; + } else { raw_spin_lock_irqsave(&p->pi_lock, irq_flags); we_locked = true; } @@ -960,6 +970,11 @@ static s32 select_cpu_from_kfunc(struct scx_sched *sch, struct task_struct *p, raw_spin_unlock_irqrestore(&p->pi_lock, irq_flags); return cpu; + +cross_task: + scx_error(sch, "select_cpu kfunc called cross-task on %s[%d]", + p->comm, p->pid); + return -EINVAL; } /** @@ -1467,6 +1482,7 @@ BTF_KFUNCS_END(scx_kfunc_ids_idle) static const struct btf_kfunc_id_set scx_kfunc_set_idle = { .owner = THIS_MODULE, .set = &scx_kfunc_ids_idle, + .filter = scx_kfunc_context_filter, }; /* diff --git a/kernel/sched/ext_idle.h b/kernel/sched/ext_idle.h index dc35f850481e..8d169d3bbdf9 100644 --- a/kernel/sched/ext_idle.h +++ b/kernel/sched/ext_idle.h @@ -12,6 +12,7 @@ struct sched_ext_ops; +extern struct btf_id_set8 scx_kfunc_ids_idle; extern struct btf_id_set8 scx_kfunc_ids_select_cpu; void scx_idle_update_selcpu_topology(struct sched_ext_ops *ops); diff --git a/kernel/sched/ext_internal.h b/kernel/sched/ext_internal.h index 62ce4eaf6a3f..a075732d4430 100644 --- a/kernel/sched/ext_internal.h +++ b/kernel/sched/ext_internal.h @@ -1075,6 +1075,8 @@ struct scx_sched { struct irq_work disable_irq_work; struct kthread_work disable_work; struct timer_list bypass_lb_timer; + cpumask_var_t bypass_lb_donee_cpumask; + cpumask_var_t bypass_lb_resched_cpumask; struct rcu_work rcu_work; /* all ancestors including self */ diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 69361c63353a..3ebec186f982 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -847,13 +847,19 @@ static s64 entity_lag(struct cfs_rq *cfs_rq, struct sched_entity *se, u64 avrunt * Similarly, check that the entity didn't gain positive lag when DELAY_ZERO * is set. * - * Return true if the lag has been adjusted. + * Return true if the vlag has been modified. Specifically: + * + * se->vlag != avg_vruntime() - se->vruntime + * + * This can be due to clamping in entity_lag() or clamping due to + * sched_delayed. Either way, when vlag is modified and the entity is + * retained, the tree needs to be adjusted. 
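
The reordered scx_select_cpu_dfl() hunk above computes the narrowed allowed set before testing @prev_cpu against it; testing first could accept a CPU that the subsequent intersection removes. A small sketch of the narrowed-first ordering (hypothetical helper, not the kernel function):

#include <linux/cpumask.h>

static bool prev_cpu_usable(int prev_cpu, const struct cpumask *affinity,
                            const struct cpumask *requested,
                            struct cpumask *scratch)
{
        const struct cpumask *allowed = requested;

        /* Narrow the requested set by the task's affinity first. */
        if (requested != affinity) {
                if (!cpumask_and(scratch, requested, affinity))
                        return false;   /* empty intersection */
                allowed = scratch;
        }

        /* Only now is membership of @prev_cpu meaningful. */
        return cpumask_test_cpu(prev_cpu, allowed);
}
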
*/ static __always_inline bool update_entity_lag(struct cfs_rq *cfs_rq, struct sched_entity *se) { - s64 vlag = entity_lag(cfs_rq, se, avg_vruntime(cfs_rq)); - bool ret; + u64 avruntime = avg_vruntime(cfs_rq); + s64 vlag = entity_lag(cfs_rq, se, avruntime); WARN_ON_ONCE(!se->on_rq); @@ -863,10 +869,9 @@ bool update_entity_lag(struct cfs_rq *cfs_rq, struct sched_entity *se) if (sched_feat(DELAY_ZERO)) vlag = min(vlag, 0); } - ret = (vlag == se->vlag); se->vlag = vlag; - return ret; + return avruntime - vlag != se->vruntime; } /* @@ -877,11 +882,11 @@ bool update_entity_lag(struct cfs_rq *cfs_rq, struct sched_entity *se) * * lag_i >= 0 -> V >= v_i * - * \Sum (v_i - v)*w_i - * V = ------------------ + v + * \Sum (v_i - v0)*w_i + * V = ------------------- + v0 * \Sum w_i * - * lag_i >= 0 -> \Sum (v_i - v)*w_i >= (v_i - v)*(\Sum w_i) + * lag_i >= 0 -> \Sum (v_i - v0)*w_i >= (v_i - v0)*(\Sum w_i) * * Note: using 'avg_vruntime() > se->vruntime' is inaccurate due * to the loss in precision caused by the division. @@ -889,7 +894,7 @@ bool update_entity_lag(struct cfs_rq *cfs_rq, struct sched_entity *se) static int vruntime_eligible(struct cfs_rq *cfs_rq, u64 vruntime) { struct sched_entity *curr = cfs_rq->curr; - s64 avg = cfs_rq->sum_w_vruntime; + s64 key, avg = cfs_rq->sum_w_vruntime; long load = cfs_rq->sum_weight; if (curr && curr->on_rq) { @@ -899,7 +904,36 @@ static int vruntime_eligible(struct cfs_rq *cfs_rq, u64 vruntime) load += weight; } - return avg >= vruntime_op(vruntime, "-", cfs_rq->zero_vruntime) * load; + key = vruntime_op(vruntime, "-", cfs_rq->zero_vruntime); + + /* + * The worst case term for @key includes 'NSEC_TICK * NICE_0_LOAD' + * and @load obviously includes NICE_0_LOAD. NSEC_TICK is around 24 + * bits, while NICE_0_LOAD is 20 on 64bit and 10 otherwise. + * + * This gives that on 64bit the product will be at least 64bit which + * overflows s64, while on 32bit it will only be 44bits and should fit + * comfortably. + */ +#ifdef CONFIG_64BIT +#ifdef CONFIG_ARCH_SUPPORTS_INT128 + /* This often results in simpler code than __builtin_mul_overflow(). */ + return avg >= (__int128)key * load; +#else + s64 rhs; + /* + * On overflow, the sign of key tells us the correct answer: a large + * positive key means vruntime >> V, so not eligible; a large negative + * key means vruntime << V, so eligible. + */ + if (check_mul_overflow(key, load, &rhs)) + return key <= 0; + + return avg >= rhs; +#endif +#else /* 32bit */ + return avg >= key * load; +#endif } int entity_eligible(struct cfs_rq *cfs_rq, struct sched_entity *se) @@ -1099,7 +1133,7 @@ static inline void cancel_protect_slice(struct sched_entity *se) * * Which allows tree pruning through eligibility. 
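
The vruntime_eligible() change above tests eligibility without the division in the weighted average. With V = v0 + \Sum w_i*(v_i - v0) / \Sum w_i, the condition V >= v becomes \Sum w_i*(v_i - v0) >= (v - v0) * \Sum w_i, and the right-hand product is what can overflow s64 on 64-bit. A userspace-compilable sketch of the two fallbacks shown in the hunk (128-bit multiply, or __builtin_mul_overflow() with the sign of the key deciding overflowed cases):

#include <stdbool.h>
#include <stdint.h>

/* avg = \Sum w_i*(v_i - v0), key = v - v0, load = \Sum w_i */
static bool eligible_int128(int64_t avg, int64_t key, long load)
{
        /* exact on hosts with __int128 (typical 64-bit targets) */
        return avg >= (__int128)key * load;
}

static bool eligible_checked(int64_t avg, int64_t key, long load)
{
        int64_t rhs;

        if (__builtin_mul_overflow(key, load, &rhs)) {
                /*
                 * Overflow: a huge positive key means the entity's vruntime
                 * is far right of V (not eligible); a huge negative key
                 * means it is far left (eligible).
                 */
                return key <= 0;
        }
        return avg >= rhs;
}
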
*/ -static struct sched_entity *__pick_eevdf(struct cfs_rq *cfs_rq, bool protect) +static struct sched_entity *pick_eevdf(struct cfs_rq *cfs_rq, bool protect) { struct rb_node *node = cfs_rq->tasks_timeline.rb_root.rb_node; struct sched_entity *se = __pick_first_entity(cfs_rq); @@ -1170,11 +1204,6 @@ found: return best; } -static struct sched_entity *pick_eevdf(struct cfs_rq *cfs_rq) -{ - return __pick_eevdf(cfs_rq, true); -} - struct sched_entity *__pick_last_entity(struct cfs_rq *cfs_rq) { struct rb_node *last = rb_last(&cfs_rq->tasks_timeline.rb_root); @@ -5749,11 +5778,11 @@ static int dequeue_entities(struct rq *rq, struct sched_entity *se, int flags); * 4) do not run the "skip" process, if something else is available */ static struct sched_entity * -pick_next_entity(struct rq *rq, struct cfs_rq *cfs_rq) +pick_next_entity(struct rq *rq, struct cfs_rq *cfs_rq, bool protect) { struct sched_entity *se; - se = pick_eevdf(cfs_rq); + se = pick_eevdf(cfs_rq, protect); if (se->sched_delayed) { dequeue_entities(rq, se, DEQUEUE_SLEEP | DEQUEUE_DELAYED); /* @@ -9027,7 +9056,7 @@ static void wakeup_preempt_fair(struct rq *rq, struct task_struct *p, int wake_f { enum preempt_wakeup_action preempt_action = PREEMPT_WAKEUP_PICK; struct task_struct *donor = rq->donor; - struct sched_entity *se = &donor->se, *pse = &p->se; + struct sched_entity *nse, *se = &donor->se, *pse = &p->se; struct cfs_rq *cfs_rq = task_cfs_rq(donor); int cse_is_idle, pse_is_idle; @@ -9138,11 +9167,18 @@ static void wakeup_preempt_fair(struct rq *rq, struct task_struct *p, int wake_f } pick: + nse = pick_next_entity(rq, cfs_rq, preempt_action != PREEMPT_WAKEUP_SHORT); + /* If @p has become the most eligible task, force preemption */ + if (nse == pse) + goto preempt; + /* - * If @p has become the most eligible task, force preemption. + * Because p is enqueued, nse being null can only mean that we + * dequeued a delayed task. If there are still entities queued in + * cfs, check if the next one will be p. */ - if (__pick_eevdf(cfs_rq, preempt_action != PREEMPT_WAKEUP_SHORT) == pse) - goto preempt; + if (!nse && cfs_rq->nr_queued) + goto pick; if (sched_feat(RUN_TO_PARITY)) update_protect_slice(cfs_rq, se); @@ -9179,7 +9215,7 @@ again: throttled |= check_cfs_rq_runtime(cfs_rq); - se = pick_next_entity(rq, cfs_rq); + se = pick_next_entity(rq, cfs_rq, true); if (!se) goto again; cfs_rq = group_cfs_rq(se); diff --git a/kernel/sched/membarrier.c b/kernel/sched/membarrier.c index 623445603725..226a6329f3e9 100644 --- a/kernel/sched/membarrier.c +++ b/kernel/sched/membarrier.c @@ -199,7 +199,16 @@ static void ipi_rseq(void *info) * is negligible. */ smp_mb(); - rseq_sched_switch_event(current); + /* + * Legacy mode requires that IDs are written and the critical section is + * evaluated. V2 optimized mode handles the critical section and IDs are + * only updated if they change as a consequence of preemption after + * return from this IPI. + */ + if (rseq_v2(current)) + rseq_sched_switch_event(current); + else + rseq_force_update(); } static void ipi_sync_rq_state(void *info) diff --git a/kernel/time/timer_migration.c b/kernel/time/timer_migration.c index 155eeaea4113..1d0d3a4058d5 100644 --- a/kernel/time/timer_migration.c +++ b/kernel/time/timer_migration.c @@ -1860,19 +1860,37 @@ static int tmigr_setup_groups(unsigned int cpu, unsigned int node, * child to the new parents. So tmigr_active_up() activates the * new parents while walking up from the old root to the new. 
* - * * It is ensured that @start is active, as this setup path is - * executed in hotplug prepare callback. This is executed by an - * already connected and !idle CPU. Even if all other CPUs go idle, - * the CPU executing the setup will be responsible up to current top - * level group. And the next time it goes inactive, it will release - * the new childmask and parent to subsequent walkers through this - * @child. Therefore propagate active state unconditionally. + * * It is ensured that @start is active, (or on the way to be activated + * by another CPU that woke up before the current one) as this setup path + * is executed in hotplug prepare callback. This is executed by an already + * connected and !idle CPU in the hierarchy. + * + * * The below RmW atomic operation ensures that: + * + * 1) If the old root has been completely activated, the latest state is + * acquired (the below implicit acquire pairs with the implicit release + * from cmpxchg() in tmigr_active_up()). + * + * 2) If the old root is still on the way to be activated, the lagging behind + * CPU performing the activation will acquire the links up to the new root. + * (The below implicit release pairs with the implicit acquire from cmpxchg() + * in tmigr_active_up()). + * + * 3) Every subsequent CPU below the old root will acquire the new links while + * walking through the old root (The below implicit release pairs with the + * implicit acquire from cmpxchg() in either tmigr_active_up()) or + * tmigr_inactive_up(). */ - state.state = atomic_read(&start->migr_state); - WARN_ON_ONCE(!state.active); + state.state = atomic_fetch_or(0, &start->migr_state); WARN_ON_ONCE(!start->parent); - data.childmask = start->groupmask; - __walk_groups_from(tmigr_active_up, &data, start, start->parent); + /* + * If the state of the old root is inactive, another CPU is on its way to activate + * it and propagate to the new root. 
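
The tmigr_setup_groups() hunk above replaces a plain atomic_read() with atomic_fetch_or(0, ...) because a value-returning RmW is fully ordered in the kernel memory model, letting the read pair with the cmpxchg()-based updates performed by concurrent walkers in both directions. A tiny sketch of that ordered-read idiom (hypothetical helper name):

#include <linux/atomic.h>

/*
 * OR with 0 leaves the value unchanged but performs the read as a fully
 * ordered RmW: unlike atomic_read(), it acquires stores released by a
 * cmpxchg() writer and orders the caller's earlier stores against later
 * acquirers of the same word.
 */
static inline int read_state_ordered(atomic_t *state)
{
        return atomic_fetch_or(0, state);
}
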
+ */ + if (state.active) { + data.childmask = start->groupmask; + __walk_groups_from(tmigr_active_up, &data, start, start->parent); + } } /* Root update */ diff --git a/kernel/trace/trace_branch.c b/kernel/trace/trace_branch.c index 6809b370e991..d1564db95a8f 100644 --- a/kernel/trace/trace_branch.c +++ b/kernel/trace/trace_branch.c @@ -373,10 +373,10 @@ __init static int init_annotated_branch_stats(void) int ret; ret = register_stat_tracer(&annotated_branch_stats); - if (!ret) { + if (ret) { printk(KERN_WARNING "Warning: could not register " "annotated branches stats\n"); - return 1; + return ret; } return 0; } @@ -438,10 +438,10 @@ __init static int all_annotated_branch_stats(void) int ret; ret = register_stat_tracer(&all_branch_stats); - if (!ret) { + if (ret) { printk(KERN_WARNING "Warning: could not register " "all branches stats\n"); - return 1; + return ret; } return 0; } diff --git a/kernel/trace/trace_probe.c b/kernel/trace/trace_probe.c index e1c73065dae5..e0d3a0da26af 100644 --- a/kernel/trace/trace_probe.c +++ b/kernel/trace/trace_probe.c @@ -1523,6 +1523,12 @@ static int traceprobe_parse_probe_arg_body(const char *argv, ssize_t *size, parg->offset = *size; *size += parg->type->size * (parg->count ?: 1); + if (*size > MAX_PROBE_EVENT_SIZE) { + ret = -E2BIG; + trace_probe_log_err(ctx->offset, EVENT_TOO_BIG); + goto fail; + } + if (parg->count) { len = strlen(parg->type->fmttype) + 6; parg->fmt = kmalloc(len, GFP_KERNEL); diff --git a/kernel/trace/trace_probe.h b/kernel/trace/trace_probe.h index 9fc56c937130..262d8707a3df 100644 --- a/kernel/trace/trace_probe.h +++ b/kernel/trace/trace_probe.h @@ -38,6 +38,7 @@ #define MAX_BTF_ARGS_LEN 128 #define MAX_DENTRY_ARGS_LEN 256 #define MAX_STRING_SIZE PATH_MAX +#define MAX_PROBE_EVENT_SIZE 3072 /* Reserved field names */ #define FIELD_STRING_IP "__probe_ip" @@ -561,7 +562,8 @@ extern int traceprobe_define_arg_fields(struct trace_event_call *event_call, C(BAD_TYPE4STR, "This type does not fit for string."),\ C(NEED_STRING_TYPE, "$comm and immediate-string only accepts string type"),\ C(TOO_MANY_ARGS, "Too many arguments are specified"), \ - C(TOO_MANY_EARGS, "Too many entry arguments specified"), + C(TOO_MANY_EARGS, "Too many entry arguments specified"), \ + C(EVENT_TOO_BIG, "Event too big (too many fields?)"), #undef C #define C(a, b) TP_ERR_##a diff --git a/kernel/workqueue.c b/kernel/workqueue.c index 5f747f241a5f..3d2e3b2ec528 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -5906,6 +5906,21 @@ err_destroy: return NULL; } +__printf(1, 0) +static struct workqueue_struct *alloc_workqueue_va(const char *fmt, + unsigned int flags, + int max_active, + va_list args) +{ + struct workqueue_struct *wq; + + wq = __alloc_workqueue(fmt, flags, max_active, args); + if (wq) + wq_init_lockdep(wq); + + return wq; +} + __printf(1, 4) struct workqueue_struct *alloc_workqueue_noprof(const char *fmt, unsigned int flags, @@ -5915,12 +5930,8 @@ struct workqueue_struct *alloc_workqueue_noprof(const char *fmt, va_list args; va_start(args, max_active); - wq = __alloc_workqueue(fmt, flags, max_active, args); + wq = alloc_workqueue_va(fmt, flags, max_active, args); va_end(args); - if (!wq) - return NULL; - - wq_init_lockdep(wq); return wq; } @@ -5932,15 +5943,15 @@ static void devm_workqueue_release(void *res) } __printf(2, 5) struct workqueue_struct * -devm_alloc_workqueue(struct device *dev, const char *fmt, unsigned int flags, - int max_active, ...) 
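
The trace_probe hunk above caps the accumulated per-event field size at the new MAX_PROBE_EVENT_SIZE (3072 bytes, added in trace_probe.h) and fails parsing with -E2BIG plus the new EVENT_TOO_BIG log message once the running total exceeds it. A userspace-compilable sketch of that accounting, with invented struct and helper names:

#include <errno.h>
#include <stddef.h>

#define MAX_PROBE_EVENT_SIZE 3072

struct probe_arg {
        size_t offset;
        size_t size;
        unsigned int count;
};

/* Reserve room for @a at *total; returns 0 on success or -E2BIG. */
static int reserve_arg(struct probe_arg *a, size_t *total)
{
        a->offset = *total;
        *total += a->size * (a->count ? a->count : 1);

        return *total > MAX_PROBE_EVENT_SIZE ? -E2BIG : 0;
}
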
+devm_alloc_workqueue_noprof(struct device *dev, const char *fmt, + unsigned int flags, int max_active, ...) { struct workqueue_struct *wq; va_list args; int ret; va_start(args, max_active); - wq = alloc_workqueue(fmt, flags, max_active, args); + wq = alloc_workqueue_va(fmt, flags, max_active, args); va_end(args); if (!wq) return NULL; @@ -5951,7 +5962,7 @@ devm_alloc_workqueue(struct device *dev, const char *fmt, unsigned int flags, return wq; } -EXPORT_SYMBOL_GPL(devm_alloc_workqueue); +EXPORT_SYMBOL_GPL(devm_alloc_workqueue_noprof); #ifdef CONFIG_LOCKDEP __printf(1, 5) |
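
The workqueue hunks above funnel both alloc_workqueue_noprof() and the devm variant through a single __printf(1, 0) helper that takes a va_list; previously devm_alloc_workqueue() handed its va_list to the variadic alloc_workqueue(), which cannot consume it correctly. A userspace-compilable sketch of that forwarding shape, with invented emit* names:

#include <stdarg.h>
#include <stdio.h>

/* The shared backend takes a va_list; format index 1, no variadic args. */
__attribute__((format(printf, 1, 0)))
static int emit_va(const char *fmt, va_list args)
{
        return vprintf(fmt, args);
}

__attribute__((format(printf, 1, 2)))
static int emit(const char *fmt, ...)
{
        va_list args;
        int ret;

        va_start(args, fmt);
        ret = emit_va(fmt, args);
        va_end(args);
        return ret;
}

__attribute__((format(printf, 2, 3)))
static int emit_prefixed(const char *prefix, const char *fmt, ...)
{
        va_list args;
        int ret;

        printf("%s: ", prefix);
        va_start(args, fmt);
        ret = emit_va(fmt, args);       /* never pass a va_list variadically */
        va_end(args);
        return ret;
}
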
