diff options
| author | Alexei Starovoitov <ast@kernel.org> | 2026-05-10 13:24:49 -0700 |
|---|---|---|
| committer | Alexei Starovoitov <ast@kernel.org> | 2026-05-10 13:24:49 -0700 |
| commit | 7e033543a2ab4c72319201298ed458e3bbddd82f (patch) | |
| tree | d6c2f6517b65fdf17dd647ed9df1666f28b0ba35 /kernel | |
| parent | 2ca6723a5f7b68c739dba47b2639e3eaa7884b09 (diff) | |
| parent | afaa0a477099cb7256e26fe11289c753a225ac97 (diff) | |
Merge git://git.kernel.org/pub/scm/linux/kernel/git/bpf/bpf 7.1-rc3
Cross-merge BPF and other fixes after downstream PR.
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
Diffstat (limited to 'kernel')
45 files changed, 1645 insertions, 952 deletions
diff --git a/kernel/bpf/arena.c b/kernel/bpf/arena.c index 802656c6fd3c..49a8f7b1beef 100644 --- a/kernel/bpf/arena.c +++ b/kernel/bpf/arena.c @@ -511,7 +511,7 @@ static int arena_map_direct_value_addr(const struct bpf_map *map, u64 *imm, u32 { struct bpf_arena *arena = container_of(map, struct bpf_arena, map); - if ((u64)off > arena->user_vm_end - arena->user_vm_start) + if ((u64)off >= arena->user_vm_end - arena->user_vm_start) return -ERANGE; *imm = (unsigned long)arena->user_vm_start; return 0; diff --git a/kernel/bpf/liveness.c b/kernel/bpf/liveness.c index 332e6e003f27..58197d73b120 100644 --- a/kernel/bpf/liveness.c +++ b/kernel/bpf/liveness.c @@ -1914,26 +1914,15 @@ int bpf_compute_subprog_arg_access(struct bpf_verifier_env *env) return -ENOMEM; } - instance = call_instance(env, NULL, 0, 0); - if (IS_ERR(instance)) { - err = PTR_ERR(instance); - goto out; - } - err = analyze_subprog(env, NULL, info, instance, callsites); - if (err) - goto out; - /* - * Subprogs and callbacks that don't receive FP-derived arguments - * cannot access ancestor stack frames, so they were skipped during - * the recursive walk above. Async callbacks (timer, workqueue) are - * also not reachable from the main program's call graph. Analyze - * all unvisited subprogs as independent roots at depth 0. + * Analyze every subprog in reverse topological order (callers + * before callees) so that each subprog is analyzed before its + * callees, allowing the recursive walk inside analyze_subprog() + * to naturally reach callees that receive FP-derived args. * - * Use reverse topological order (callers before callees) so that - * each subprog is analyzed before its callees, allowing the - * recursive walk inside analyze_subprog() to naturally - * reach nested callees that also lack FP-derived args. + * Subprogs and callbacks that don't receive FP-derived arguments + * cannot access ancestor stack frames are analyzed independently. + * Async callbacks (timer, workqueue) are handled the same way. */ for (k = env->subprog_cnt - 1; k >= 0; k--) { int sub = env->subprog_topo_order[k]; diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c index 1f084ee71443..6152add0c5eb 100644 --- a/kernel/cgroup/cgroup.c +++ b/kernel/cgroup/cgroup.c @@ -264,10 +264,12 @@ static void cgroup_finalize_control(struct cgroup *cgrp, int ret); static void css_task_iter_skip(struct css_task_iter *it, struct task_struct *task); static int cgroup_destroy_locked(struct cgroup *cgrp); +static void cgroup_finish_destroy(struct cgroup *cgrp); +static void kill_css_sync(struct cgroup_subsys_state *css); +static void kill_css_finish(struct cgroup_subsys_state *css); static struct cgroup_subsys_state *css_create(struct cgroup *cgrp, struct cgroup_subsys *ss); static void css_release(struct percpu_ref *ref); -static void kill_css(struct cgroup_subsys_state *css); static int cgroup_addrm_files(struct cgroup_subsys_state *css, struct cgroup *cgrp, struct cftype cfts[], bool is_add); @@ -797,6 +799,16 @@ static void cgroup_update_populated(struct cgroup *cgrp, bool populated) if (was_populated == cgroup_is_populated(cgrp)) break; + /* + * Subtree just emptied below an offlined cgrp. Fire deferred + * destroy. The transition is one-shot. + */ + if (was_populated && !css_is_online(&cgrp->self)) { + cgroup_get(cgrp); + WARN_ON_ONCE(!queue_work(cgroup_offline_wq, + &cgrp->finish_destroy_work)); + } + cgroup1_check_for_release(cgrp); TRACE_CGROUP_PATH(notify_populated, cgrp, cgroup_is_populated(cgrp)); @@ -2039,6 +2051,16 @@ static int cgroup_reconfigure(struct fs_context *fc) return 0; } +static void cgroup_finish_destroy_work_fn(struct work_struct *work) +{ + struct cgroup *cgrp = container_of(work, struct cgroup, finish_destroy_work); + + cgroup_lock(); + cgroup_finish_destroy(cgrp); + cgroup_unlock(); + cgroup_put(cgrp); +} + static void init_cgroup_housekeeping(struct cgroup *cgrp) { struct cgroup_subsys *ss; @@ -2065,7 +2087,7 @@ static void init_cgroup_housekeeping(struct cgroup *cgrp) #endif init_waitqueue_head(&cgrp->offline_waitq); - init_waitqueue_head(&cgrp->dying_populated_waitq); + INIT_WORK(&cgrp->finish_destroy_work, cgroup_finish_destroy_work_fn); INIT_WORK(&cgrp->release_agent_work, cgroup1_release_agent); } @@ -3375,7 +3397,8 @@ static void cgroup_apply_control_disable(struct cgroup *cgrp) if (css->parent && !(cgroup_ss_mask(dsct) & (1 << ss->id))) { - kill_css(css); + kill_css_sync(css); + kill_css_finish(css); } else if (!css_visible(css)) { css_clear_dir(css); if (ss->css_reset) @@ -3934,33 +3957,41 @@ static int cgroup_cpu_pressure_show(struct seq_file *seq, void *v) static ssize_t pressure_write(struct kernfs_open_file *of, char *buf, size_t nbytes, enum psi_res res) { - struct cgroup_file_ctx *ctx = of->priv; + struct cgroup_file_ctx *ctx; struct psi_trigger *new; struct cgroup *cgrp; struct psi_group *psi; + ssize_t ret = 0; cgrp = cgroup_kn_lock_live(of->kn, false); if (!cgrp) return -ENODEV; - cgroup_get(cgrp); - cgroup_kn_unlock(of->kn); + ctx = of->priv; + if (!ctx) { + ret = -ENODEV; + goto out_unlock; + } /* Allow only one trigger per file descriptor */ if (ctx->psi.trigger) { - cgroup_put(cgrp); - return -EBUSY; + ret = -EBUSY; + goto out_unlock; } psi = cgroup_psi(cgrp); new = psi_trigger_create(psi, buf, res, of->file, of); if (IS_ERR(new)) { - cgroup_put(cgrp); - return PTR_ERR(new); + ret = PTR_ERR(new); + goto out_unlock; } smp_store_release(&ctx->psi.trigger, new); - cgroup_put(cgrp); + +out_unlock: + cgroup_kn_unlock(of->kn); + if (ret) + return ret; return nbytes; } @@ -5059,10 +5090,12 @@ repeat: task = list_entry(it->task_pos, struct task_struct, cg_list); /* - * Hide tasks that are exiting but not yet removed. Keep zombie - * leaders with live threads visible. + * Hide tasks that are exiting but not yet removed by default. Keep + * zombie leaders with live threads visible. Usages that need to walk + * every existing task can opt out via CSS_TASK_ITER_WITH_DEAD. */ - if ((task->flags & PF_EXITING) && !atomic_read(&task->signal->live)) + if (!(it->flags & CSS_TASK_ITER_WITH_DEAD) && + (task->flags & PF_EXITING) && !atomic_read(&task->signal->live)) goto repeat; if (it->flags & CSS_TASK_ITER_PROCS) { @@ -5506,7 +5539,7 @@ static struct cftype cgroup_psi_files[] = { * css destruction is four-stage process. * * 1. Destruction starts. Killing of the percpu_ref is initiated. - * Implemented in kill_css(). + * Implemented in kill_css_finish(). * * 2. When the percpu_ref is confirmed to be visible as killed on all CPUs * and thus css_tryget_online() is guaranteed to fail, the css can be @@ -5716,16 +5749,6 @@ static void offline_css(struct cgroup_subsys_state *css) RCU_INIT_POINTER(css->cgroup->subsys[ss->id], NULL); wake_up_all(&css->cgroup->offline_waitq); - - css->cgroup->nr_dying_subsys[ss->id]++; - /* - * Parent css and cgroup cannot be freed until after the freeing - * of child css, see css_free_rwork_fn(). - */ - while ((css = css->parent)) { - css->nr_descendants--; - css->cgroup->nr_dying_subsys[ss->id]++; - } } /** @@ -5995,12 +6018,13 @@ out_unlock: /* * This is called when the refcnt of a css is confirmed to be killed. * css_tryget_online() is now guaranteed to fail. Tell the subsystem to - * initiate destruction and put the css ref from kill_css(). + * initiate destruction and put the css ref from kill_css_finish(). */ static void css_killed_work_fn(struct work_struct *work) { - struct cgroup_subsys_state *css = - container_of(work, struct cgroup_subsys_state, destroy_work); + struct cgroup_subsys_state *css; + + css = container_of(to_rcu_work(work), struct cgroup_subsys_state, destroy_rwork); cgroup_lock(); @@ -6021,22 +6045,21 @@ static void css_killed_ref_fn(struct percpu_ref *ref) container_of(ref, struct cgroup_subsys_state, refcnt); if (atomic_dec_and_test(&css->online_cnt)) { - INIT_WORK(&css->destroy_work, css_killed_work_fn); - queue_work(cgroup_offline_wq, &css->destroy_work); + INIT_RCU_WORK(&css->destroy_rwork, css_killed_work_fn); + queue_rcu_work(cgroup_offline_wq, &css->destroy_rwork); } } /** - * kill_css - destroy a css - * @css: css to destroy + * kill_css_sync - synchronous half of css teardown + * @css: css being killed * - * This function initiates destruction of @css by removing cgroup interface - * files and putting its base reference. ->css_offline() will be invoked - * asynchronously once css_tryget_online() is guaranteed to fail and when - * the reference count reaches zero, @css will be released. + * See cgroup_destroy_locked(). */ -static void kill_css(struct cgroup_subsys_state *css) +static void kill_css_sync(struct cgroup_subsys_state *css) { + struct cgroup_subsys *ss = css->ss; + lockdep_assert_held(&cgroup_mutex); if (css->flags & CSS_DYING) @@ -6056,64 +6079,100 @@ static void kill_css(struct cgroup_subsys_state *css) */ css_clear_dir(css); + css->cgroup->nr_dying_subsys[ss->id]++; + /* + * Parent css and cgroup cannot be freed until after the freeing + * of child css, see css_free_rwork_fn(). + */ + while ((css = css->parent)) { + css->nr_descendants--; + css->cgroup->nr_dying_subsys[ss->id]++; + } +} + +/** + * kill_css_finish - deferred half of css teardown + * @css: css being killed + * + * See cgroup_destroy_locked(). + */ +static void kill_css_finish(struct cgroup_subsys_state *css) +{ + lockdep_assert_held(&cgroup_mutex); + /* - * Killing would put the base ref, but we need to keep it alive - * until after ->css_offline(). + * Skip on re-entry: cgroup_apply_control_disable() may have killed @css + * earlier. cgroup_destroy_locked() can still walk it because + * offline_css() (which NULLs cgrp->subsys[ssid]) runs async. + */ + if (percpu_ref_is_dying(&css->refcnt)) + return; + + /* + * Killing would put the base ref, but we need to keep it alive until + * after ->css_offline(). */ css_get(css); /* - * cgroup core guarantees that, by the time ->css_offline() is - * invoked, no new css reference will be given out via - * css_tryget_online(). We can't simply call percpu_ref_kill() and - * proceed to offlining css's because percpu_ref_kill() doesn't - * guarantee that the ref is seen as killed on all CPUs on return. + * cgroup core guarantees that, by the time ->css_offline() is invoked, + * no new css reference will be given out via css_tryget_online(). We + * can't simply call percpu_ref_kill() and proceed to offlining css's + * because percpu_ref_kill() doesn't guarantee that the ref is seen as + * killed on all CPUs on return. * - * Use percpu_ref_kill_and_confirm() to get notifications as each - * css is confirmed to be seen as killed on all CPUs. + * Use percpu_ref_kill_and_confirm() to get notifications as each css is + * confirmed to be seen as killed on all CPUs. */ percpu_ref_kill_and_confirm(&css->refcnt, css_killed_ref_fn); } /** - * cgroup_destroy_locked - the first stage of cgroup destruction + * cgroup_destroy_locked - destroy @cgrp (called on rmdir) * @cgrp: cgroup to be destroyed * - * css's make use of percpu refcnts whose killing latency shouldn't be - * exposed to userland and are RCU protected. Also, cgroup core needs to - * guarantee that css_tryget_online() won't succeed by the time - * ->css_offline() is invoked. To satisfy all the requirements, - * destruction is implemented in the following two steps. - * - * s1. Verify @cgrp can be destroyed and mark it dying. Remove all - * userland visible parts and start killing the percpu refcnts of - * css's. Set up so that the next stage will be kicked off once all - * the percpu refcnts are confirmed to be killed. - * - * s2. Invoke ->css_offline(), mark the cgroup dead and proceed with the - * rest of destruction. Once all cgroup references are gone, the - * cgroup is RCU-freed. - * - * This function implements s1. After this step, @cgrp is gone as far as - * the userland is concerned and a new cgroup with the same name may be - * created. As cgroup doesn't care about the names internally, this - * doesn't cause any problem. + * Tear down @cgrp on behalf of rmdir. Constraints: + * + * - Userspace: rmdir must succeed when cgroup.procs and friends are empty. + * + * - Kernel: subsystem ->css_offline() must not run while any task in @cgrp's + * subtree is still doing kernel work. A task hidden from cgroup.procs (past + * exit_signals() with signal->live cleared) can still schedule, allocate, and + * consume resources until its final context switch. Dying descendants in the + * subtree can host such tasks too. + * + * - Kernel: css_tryget_online() must fail by the time ->css_offline() runs. + * + * The destruction runs in three parts: + * + * - This function: synchronous user-visible state teardown plus kill_css_sync() + * on each subsystem css. + * + * - cgroup_finish_destroy(): kicks the percpu_ref kill via kill_css_finish() on + * each subsystem css. Fires once @cgrp's subtree is fully drained, either + * inline here or from cgroup_update_populated(). + * + * - The percpu_ref kill chain: css_killed_ref_fn -> css_killed_work_fn -> + * ->css_offline() -> release/free. + * + * Return 0 on success, -EBUSY if a userspace-visible task or an online child + * remains. */ static int cgroup_destroy_locked(struct cgroup *cgrp) - __releases(&cgroup_mutex) __acquires(&cgroup_mutex) { struct cgroup *tcgrp, *parent = cgroup_parent(cgrp); struct cgroup_subsys_state *css; struct cgrp_cset_link *link; + struct css_task_iter it; + struct task_struct *task; int ssid, ret; lockdep_assert_held(&cgroup_mutex); - /* - * Only migration can raise populated from zero and we're already - * holding cgroup_mutex. - */ - if (cgroup_is_populated(cgrp)) + css_task_iter_start(&cgrp->self, 0, &it); + task = css_task_iter_next(&it); + css_task_iter_end(&it); + if (task) return -EBUSY; /* @@ -6137,9 +6196,8 @@ static int cgroup_destroy_locked(struct cgroup *cgrp) link->cset->dead = true; spin_unlock_irq(&css_set_lock); - /* initiate massacre of all css's */ for_each_css(css, ssid, cgrp) - kill_css(css); + kill_css_sync(css); /* clear and remove @cgrp dir, @cgrp has an extra ref on its kn */ css_clear_dir(&cgrp->self); @@ -6170,79 +6228,27 @@ static int cgroup_destroy_locked(struct cgroup *cgrp) /* put the base reference */ percpu_ref_kill(&cgrp->self.refcnt); + if (!cgroup_is_populated(cgrp)) + cgroup_finish_destroy(cgrp); + return 0; }; /** - * cgroup_drain_dying - wait for dying tasks to leave before rmdir - * @cgrp: the cgroup being removed - * - * cgroup.procs and cgroup.threads use css_task_iter which filters out - * PF_EXITING tasks so that userspace doesn't see tasks that have already been - * reaped via waitpid(). However, cgroup_has_tasks() - which tests whether the - * cgroup has non-empty css_sets - is only updated when dying tasks pass through - * cgroup_task_dead() in finish_task_switch(). This creates a window where - * cgroup.procs reads empty but cgroup_has_tasks() is still true, making rmdir - * fail with -EBUSY from cgroup_destroy_locked() even though userspace sees no - * tasks. - * - * This function aligns cgroup_has_tasks() with what userspace can observe. If - * cgroup_has_tasks() but the task iterator sees nothing (all remaining tasks are - * PF_EXITING), we wait for cgroup_task_dead() to finish processing them. As the - * window between PF_EXITING and cgroup_task_dead() is short, the wait is brief. - * - * This function only concerns itself with this cgroup's own dying tasks. - * Whether the cgroup has children is cgroup_destroy_locked()'s problem. + * cgroup_finish_destroy - deferred half of @cgrp destruction + * @cgrp: cgroup whose subtree just became empty * - * Each cgroup_task_dead() kicks the waitqueue via cset->cgrp_links, and we - * retry the full check from scratch. - * - * Must be called with cgroup_mutex held. + * See cgroup_destroy_locked() for the rationale. */ -static int cgroup_drain_dying(struct cgroup *cgrp) - __releases(&cgroup_mutex) __acquires(&cgroup_mutex) +static void cgroup_finish_destroy(struct cgroup *cgrp) { - struct css_task_iter it; - struct task_struct *task; - DEFINE_WAIT(wait); + struct cgroup_subsys_state *css; + int ssid; lockdep_assert_held(&cgroup_mutex); -retry: - if (!cgroup_has_tasks(cgrp)) - return 0; - /* Same iterator as cgroup.threads - if any task is visible, it's busy */ - css_task_iter_start(&cgrp->self, 0, &it); - task = css_task_iter_next(&it); - css_task_iter_end(&it); - - if (task) - return -EBUSY; - - /* - * All remaining tasks are PF_EXITING and will pass through - * cgroup_task_dead() shortly. Wait for a kick and retry. - * - * cgroup_has_tasks() can't transition from false to true while we're - * holding cgroup_mutex, but the true to false transition happens - * under css_set_lock (via cgroup_task_dead()). We must retest and - * prepare_to_wait() under css_set_lock. Otherwise, the transition - * can happen between our first test and prepare_to_wait(), and we - * sleep with no one to wake us. - */ - spin_lock_irq(&css_set_lock); - if (!cgroup_has_tasks(cgrp)) { - spin_unlock_irq(&css_set_lock); - return 0; - } - prepare_to_wait(&cgrp->dying_populated_waitq, &wait, - TASK_UNINTERRUPTIBLE); - spin_unlock_irq(&css_set_lock); - mutex_unlock(&cgroup_mutex); - schedule(); - finish_wait(&cgrp->dying_populated_waitq, &wait); - mutex_lock(&cgroup_mutex); - goto retry; + for_each_css(css, ssid, cgrp) + kill_css_finish(css); } int cgroup_rmdir(struct kernfs_node *kn) @@ -6254,12 +6260,9 @@ int cgroup_rmdir(struct kernfs_node *kn) if (!cgrp) return 0; - ret = cgroup_drain_dying(cgrp); - if (!ret) { - ret = cgroup_destroy_locked(cgrp); - if (!ret) - TRACE_CGROUP_PATH(rmdir, cgrp); - } + ret = cgroup_destroy_locked(cgrp); + if (!ret) + TRACE_CGROUP_PATH(rmdir, cgrp); cgroup_kn_unlock(kn); return ret; @@ -7019,7 +7022,6 @@ void cgroup_task_exit(struct task_struct *tsk) static void do_cgroup_task_dead(struct task_struct *tsk) { - struct cgrp_cset_link *link; struct css_set *cset; unsigned long flags; @@ -7033,11 +7035,6 @@ static void do_cgroup_task_dead(struct task_struct *tsk) if (thread_group_leader(tsk) && atomic_read(&tsk->signal->live)) list_add_tail(&tsk->cg_list, &cset->dying_tasks); - /* kick cgroup_drain_dying() waiters, see cgroup_rmdir() */ - list_for_each_entry(link, &cset->cgrp_links, cgrp_link) - if (waitqueue_active(&link->cgrp->dying_populated_waitq)) - wake_up(&link->cgrp->dying_populated_waitq); - if (dl_task(tsk)) dec_dl_tasks_cs(tsk); diff --git a/kernel/cgroup/cpuset-internal.h b/kernel/cgroup/cpuset-internal.h index fd7d19842ded..bb4e692bea30 100644 --- a/kernel/cgroup/cpuset-internal.h +++ b/kernel/cgroup/cpuset-internal.h @@ -168,6 +168,11 @@ struct cpuset { int nr_deadline_tasks; int nr_migrate_dl_tasks; u64 sum_migrate_dl_bw; + /* + * CPU used for temporary DL bandwidth allocation during attach; + * -1 if no DL bandwidth was allocated in the current attach. + */ + int dl_bw_cpu; /* Invalid partition error code, not lock protected */ enum prs_errcode prs_err; diff --git a/kernel/cgroup/cpuset.c b/kernel/cgroup/cpuset.c index 1335e437098e..e3a081a07c6d 100644 --- a/kernel/cgroup/cpuset.c +++ b/kernel/cgroup/cpuset.c @@ -288,6 +288,7 @@ struct cpuset top_cpuset = { .flags = BIT(CS_CPU_EXCLUSIVE) | BIT(CS_MEM_EXCLUSIVE) | BIT(CS_SCHED_LOAD_BALANCE), .partition_root_state = PRS_ROOT, + .dl_bw_cpu = -1, }; /** @@ -579,6 +580,8 @@ static struct cpuset *dup_or_alloc_cpuset(struct cpuset *cs) if (!trial) return NULL; + trial->dl_bw_cpu = -1; + /* Setup cpumask pointer array */ cpumask_var_t *pmask[4] = { &trial->cpus_allowed, @@ -2980,6 +2983,7 @@ static void reset_migrate_dl_data(struct cpuset *cs) { cs->nr_migrate_dl_tasks = 0; cs->sum_migrate_dl_bw = 0; + cs->dl_bw_cpu = -1; } /* Called by cgroups to determine if a cpuset is usable; cpuset_mutex held */ @@ -3056,6 +3060,8 @@ static int cpuset_can_attach(struct cgroup_taskset *tset) reset_migrate_dl_data(cs); goto out_unlock; } + + cs->dl_bw_cpu = cpu; } out_success: @@ -3080,12 +3086,11 @@ static void cpuset_cancel_attach(struct cgroup_taskset *tset) mutex_lock(&cpuset_mutex); dec_attach_in_progress_locked(cs); - if (cs->nr_migrate_dl_tasks) { - int cpu = cpumask_any(cs->effective_cpus); + if (cs->dl_bw_cpu >= 0) + dl_bw_free(cs->dl_bw_cpu, cs->sum_migrate_dl_bw); - dl_bw_free(cpu, cs->sum_migrate_dl_bw); + if (cs->nr_migrate_dl_tasks) reset_migrate_dl_data(cs); - } mutex_unlock(&cpuset_mutex); } diff --git a/kernel/cgroup/rdma.c b/kernel/cgroup/rdma.c index 9967fb25c563..4fdab4cf49e0 100644 --- a/kernel/cgroup/rdma.c +++ b/kernel/cgroup/rdma.c @@ -283,7 +283,7 @@ int rdmacg_try_charge(struct rdma_cgroup **rdmacg, ret = PTR_ERR(rpool); goto err; } else { - new = rpool->resources[index].usage + 1; + new = (s64)rpool->resources[index].usage + 1; if (new > rpool->resources[index].max) { ret = -EAGAIN; goto err; diff --git a/kernel/debug/debug_core.c b/kernel/debug/debug_core.c index 0b9495187fba..b276504c1c6b 100644 --- a/kernel/debug/debug_core.c +++ b/kernel/debug/debug_core.c @@ -704,7 +704,7 @@ return_normal: if (ks->send_ready) atomic_set(ks->send_ready, 1); - /* Signal the other CPUs to enter kgdb_wait() */ + /* Signal the other CPUs to enter the debug trap handler */ else if ((!kgdb_single_step) && kgdb_do_roundup) kgdb_roundup_cpus(); #endif diff --git a/kernel/debug/gdbstub.c b/kernel/debug/gdbstub.c index f586afd76c80..e271a436d60e 100644 --- a/kernel/debug/gdbstub.c +++ b/kernel/debug/gdbstub.c @@ -517,7 +517,7 @@ static void gdb_get_regs_helper(struct kgdb_state *ks) /* * All threads that don't have debuggerinfo should be * in schedule() sleeping, since all other CPUs - * are in kgdb_wait, and thus have debuggerinfo. + * are in kgdb_cpu_enter(), and thus have debuggerinfo. */ if (local_debuggerinfo) { pt_regs_to_gdb_regs(gdb_regs, local_debuggerinfo); diff --git a/kernel/events/core.c b/kernel/events/core.c index 0f9cacfa7cb8..bcaf175f3ded 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -7006,6 +7006,7 @@ static void perf_mmap_open(struct vm_area_struct *vma) } static void perf_pmu_output_stop(struct perf_event *event); +static void perf_mmap_unaccount(struct vm_area_struct *vma, struct perf_buffer *rb); /* * A buffer can be mmap()ed multiple times; either directly through the same @@ -7021,8 +7022,6 @@ static void perf_mmap_close(struct vm_area_struct *vma) mapped_f unmapped = get_mapped(event, event_unmapped); struct perf_buffer *rb = ring_buffer_get(event); struct user_struct *mmap_user = rb->mmap_user; - int mmap_locked = rb->mmap_locked; - unsigned long size = perf_data_size(rb); bool detach_rest = false; /* FIXIES vs perf_pmu_unregister() */ @@ -7117,11 +7116,7 @@ again: * Aside from that, this buffer is 'fully' detached and unmapped, * undo the VM accounting. */ - - atomic_long_sub((size >> PAGE_SHIFT) + 1 - mmap_locked, - &mmap_user->locked_vm); - atomic64_sub(mmap_locked, &vma->vm_mm->pinned_vm); - free_uid(mmap_user); + perf_mmap_unaccount(vma, rb); out_put: ring_buffer_put(rb); /* could be last */ @@ -7261,6 +7256,15 @@ static void perf_mmap_account(struct vm_area_struct *vma, long user_extra, long atomic64_add(extra, &vma->vm_mm->pinned_vm); } +static void perf_mmap_unaccount(struct vm_area_struct *vma, struct perf_buffer *rb) +{ + struct user_struct *user = rb->mmap_user; + + atomic_long_sub((perf_data_size(rb) >> PAGE_SHIFT) + 1 - rb->mmap_locked, + &user->locked_vm); + atomic64_sub(rb->mmap_locked, &vma->vm_mm->pinned_vm); +} + static int perf_mmap_rb(struct vm_area_struct *vma, struct perf_event *event, unsigned long nr_pages) { @@ -7323,8 +7327,6 @@ static int perf_mmap_rb(struct vm_area_struct *vma, struct perf_event *event, if (!rb) return -ENOMEM; - refcount_set(&rb->mmap_count, 1); - rb->mmap_user = get_current_user(); rb->mmap_locked = extra; ring_buffer_attach(event, rb); @@ -7474,16 +7476,54 @@ static int perf_mmap(struct file *file, struct vm_area_struct *vma) mapped(event, vma->vm_mm); /* - * Try to map it into the page table. On fail, invoke - * perf_mmap_close() to undo the above, as the callsite expects - * full cleanup in this case and therefore does not invoke - * vmops::close(). + * Try to map it into the page table. On fail undo the above, + * as the callsite expects full cleanup in this case and + * therefore does not invoke vmops::close(). */ ret = map_range(event->rb, vma); - if (ret) - perf_mmap_close(vma); + if (likely(!ret)) + return 0; + + /* Error path */ + + /* + * If this is the first mmap(), then event->mmap_count should + * be stable at 1. It is only modified by: + * perf_mmap_{open,close}() and perf_mmap(). + * + * The former are not possible because this mmap() hasn't been + * successful yet, and the latter is serialized by + * event->mmap_mutex which we still hold (note that mmap_lock + * is not strictly sufficient here, because the event fd can + * be passed to another process through trivial means like + * fork(), leading to concurrent mmap() from different mm). + * + * Make sure to remove event->rb before releasing + * event->mmap_mutex, such that any concurrent mmap() will not + * attempt use this failed buffer. + */ + if (refcount_read(&event->mmap_count) == 1) { + /* + * Minimal perf_mmap_close(); there can't be AUX or + * other events on account of this being the first. + */ + mapped = get_mapped(event, event_unmapped); + if (mapped) + mapped(event, vma->vm_mm); + perf_mmap_unaccount(vma, event->rb); + ring_buffer_attach(event, NULL); /* drops last rb->refcount */ + refcount_set(&event->mmap_count, 0); + return ret; + } + + /* + * Otherwise this is an already existing buffer, and there is + * no race vs first exposure, so fall-through and call + * perf_mmap_close(). + */ } + perf_mmap_close(vma); return ret; } diff --git a/kernel/events/internal.h b/kernel/events/internal.h index d9cc57083091..c03c4f2eea57 100644 --- a/kernel/events/internal.h +++ b/kernel/events/internal.h @@ -67,6 +67,7 @@ static inline void rb_free_rcu(struct rcu_head *rcu_head) struct perf_buffer *rb; rb = container_of(rcu_head, struct perf_buffer, rcu_head); + free_uid(rb->mmap_user); rb_free(rb); } diff --git a/kernel/events/ring_buffer.c b/kernel/events/ring_buffer.c index 3e7de2661417..9fe92161715e 100644 --- a/kernel/events/ring_buffer.c +++ b/kernel/events/ring_buffer.c @@ -340,6 +340,8 @@ ring_buffer_init(struct perf_buffer *rb, long watermark, int flags) rb->paused = 1; mutex_init(&rb->aux_mutex); + rb->mmap_user = get_current_user(); + refcount_set(&rb->mmap_count, 1); } void perf_aux_output_flag(struct perf_output_handle *handle, u64 flags) diff --git a/kernel/fork.c b/kernel/fork.c index f1ad69c6dc2d..5f3fdfdb14c7 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -1951,9 +1951,11 @@ static void rv_task_fork(struct task_struct *p) static bool need_futex_hash_allocate_default(u64 clone_flags) { - if ((clone_flags & (CLONE_THREAD | CLONE_VM)) != (CLONE_THREAD | CLONE_VM)) - return false; - return true; + /* + * Allocate a default futex hash for any sibling that will + * share the parent's mm, except vfork. + */ + return (clone_flags & (CLONE_VM | CLONE_VFORK)) == CLONE_VM; } /* @@ -2380,10 +2382,6 @@ __latent_entropy struct task_struct *copy_process( if (retval) goto bad_fork_cancel_cgroup; - /* - * Allocate a default futex hash for the user process once the first - * thread spawns. - */ if (need_futex_hash_allocate_default(clone_flags)) { retval = futex_hash_allocate_default(); if (retval) diff --git a/kernel/futex/requeue.c b/kernel/futex/requeue.c index d818b4d47f1b..b597cb3d17fc 100644 --- a/kernel/futex/requeue.c +++ b/kernel/futex/requeue.c @@ -319,8 +319,11 @@ futex_proxy_trylock_atomic(u32 __user *pifutex, struct futex_hash_bucket *hb1, return -EINVAL; /* Ensure that this does not race against an early wakeup */ - if (!futex_requeue_pi_prepare(top_waiter, NULL)) + if (!futex_requeue_pi_prepare(top_waiter, NULL)) { + plist_del(&top_waiter->list, &hb1->chain); + futex_hb_waiters_dec(hb1); return -EAGAIN; + } /* * Try to take the lock for top_waiter and set the FUTEX_WAITERS bit @@ -722,10 +725,12 @@ int handle_early_requeue_pi_wakeup(struct futex_hash_bucket *hb, /* * We were woken prior to requeue by a timeout or a signal. - * Unqueue the futex_q and determine which it was. + * Conditionally unqueue the futex_q and determine which it was. */ - plist_del(&q->list, &hb->chain); - futex_hb_waiters_dec(hb); + if (!plist_node_empty(&q->list)) { + plist_del(&q->list, &hb->chain); + futex_hb_waiters_dec(hb); + } /* Handle spurious wakeups gracefully */ ret = -EWOULDBLOCK; diff --git a/kernel/liveupdate/kexec_handover.c b/kernel/liveupdate/kexec_handover.c index 532f455c5d4f..18509d8082ea 100644 --- a/kernel/liveupdate/kexec_handover.c +++ b/kernel/liveupdate/kexec_handover.c @@ -18,7 +18,9 @@ #include <linux/kexec.h> #include <linux/kexec_handover.h> #include <linux/kho_radix_tree.h> +#include <linux/utsname.h> #include <linux/kho/abi/kexec_handover.h> +#include <linux/kho/abi/kexec_metadata.h> #include <linux/libfdt.h> #include <linux/list.h> #include <linux/memblock.h> @@ -724,12 +726,13 @@ err_disable_kho: } /** - * kho_add_subtree - record the physical address of a sub FDT in KHO root tree. + * kho_add_subtree - record the physical address of a sub blob in KHO root tree. * @name: name of the sub tree. - * @fdt: the sub tree blob. + * @blob: the sub tree blob. + * @size: size of the blob in bytes. * * Creates a new child node named @name in KHO root FDT and records - * the physical address of @fdt. The pages of @fdt must also be preserved + * the physical address of @blob. The pages of @blob must also be preserved * by KHO for the new kernel to retrieve it after kexec. * * A debugfs blob entry is also created at @@ -738,10 +741,11 @@ err_disable_kho: * * Return: 0 on success, error code on failure */ -int kho_add_subtree(const char *name, void *fdt) +int kho_add_subtree(const char *name, void *blob, size_t size) { - phys_addr_t phys = virt_to_phys(fdt); + phys_addr_t phys = virt_to_phys(blob); void *root_fdt = kho_out.fdt; + u64 size_u64 = size; int err = -ENOMEM; int off, fdt_err; @@ -758,13 +762,24 @@ int kho_add_subtree(const char *name, void *fdt) goto out_pack; } - err = fdt_setprop(root_fdt, off, KHO_FDT_SUB_TREE_PROP_NAME, - &phys, sizeof(phys)); - if (err < 0) - goto out_pack; + fdt_err = fdt_setprop(root_fdt, off, KHO_SUB_TREE_PROP_NAME, + &phys, sizeof(phys)); + if (fdt_err < 0) + goto out_del_node; - WARN_ON_ONCE(kho_debugfs_fdt_add(&kho_out.dbg, name, fdt, false)); + fdt_err = fdt_setprop(root_fdt, off, KHO_SUB_TREE_SIZE_PROP_NAME, + &size_u64, sizeof(size_u64)); + if (fdt_err < 0) + goto out_del_node; + WARN_ON_ONCE(kho_debugfs_blob_add(&kho_out.dbg, name, blob, + size, false)); + + err = 0; + goto out_pack; + +out_del_node: + fdt_del_node(root_fdt, off); out_pack: fdt_pack(root_fdt); @@ -772,9 +787,9 @@ out_pack: } EXPORT_SYMBOL_GPL(kho_add_subtree); -void kho_remove_subtree(void *fdt) +void kho_remove_subtree(void *blob) { - phys_addr_t target_phys = virt_to_phys(fdt); + phys_addr_t target_phys = virt_to_phys(blob); void *root_fdt = kho_out.fdt; int off; int err; @@ -790,13 +805,13 @@ void kho_remove_subtree(void *fdt) const u64 *val; int len; - val = fdt_getprop(root_fdt, off, KHO_FDT_SUB_TREE_PROP_NAME, &len); + val = fdt_getprop(root_fdt, off, KHO_SUB_TREE_PROP_NAME, &len); if (!val || len != sizeof(phys_addr_t)) continue; if ((phys_addr_t)*val == target_phys) { fdt_del_node(root_fdt, off); - kho_debugfs_fdt_remove(&kho_out.dbg, fdt); + kho_debugfs_blob_remove(&kho_out.dbg, blob); break; } } @@ -1260,6 +1275,8 @@ EXPORT_SYMBOL_GPL(kho_restore_free); struct kho_in { phys_addr_t fdt_phys; phys_addr_t scratch_phys; + char previous_release[__NEW_UTS_LEN + 1]; + u32 kexec_count; struct kho_debugfs dbg; }; @@ -1292,16 +1309,17 @@ bool is_kho_boot(void) EXPORT_SYMBOL_GPL(is_kho_boot); /** - * kho_retrieve_subtree - retrieve a preserved sub FDT by its name. - * @name: the name of the sub FDT passed to kho_add_subtree(). - * @phys: if found, the physical address of the sub FDT is stored in @phys. + * kho_retrieve_subtree - retrieve a preserved sub blob by its name. + * @name: the name of the sub blob passed to kho_add_subtree(). + * @phys: if found, the physical address of the sub blob is stored in @phys. + * @size: if not NULL and found, the size of the sub blob is stored in @size. * - * Retrieve a preserved sub FDT named @name and store its physical - * address in @phys. + * Retrieve a preserved sub blob named @name and store its physical + * address in @phys and optionally its size in @size. * * Return: 0 on success, error code on failure */ -int kho_retrieve_subtree(const char *name, phys_addr_t *phys) +int kho_retrieve_subtree(const char *name, phys_addr_t *phys, size_t *size) { const void *fdt = kho_get_fdt(); const u64 *val; @@ -1317,12 +1335,22 @@ int kho_retrieve_subtree(const char *name, phys_addr_t *phys) if (offset < 0) return -ENOENT; - val = fdt_getprop(fdt, offset, KHO_FDT_SUB_TREE_PROP_NAME, &len); + val = fdt_getprop(fdt, offset, KHO_SUB_TREE_PROP_NAME, &len); if (!val || len != sizeof(*val)) return -EINVAL; *phys = (phys_addr_t)*val; + val = fdt_getprop(fdt, offset, KHO_SUB_TREE_SIZE_PROP_NAME, &len); + if (!val || len != sizeof(*val)) { + pr_warn("broken KHO subnode '%s': missing or invalid blob-size property\n", + name); + return -EINVAL; + } + + if (size) + *size = (size_t)*val; + return 0; } EXPORT_SYMBOL_GPL(kho_retrieve_subtree); @@ -1373,6 +1401,96 @@ static __init int kho_out_fdt_setup(void) return err; } +static void __init kho_in_kexec_metadata(void) +{ + struct kho_kexec_metadata *metadata; + phys_addr_t metadata_phys; + size_t blob_size; + int err; + + err = kho_retrieve_subtree(KHO_METADATA_NODE_NAME, &metadata_phys, + &blob_size); + if (err) + /* This is fine, previous kernel didn't export metadata */ + return; + + /* Check that, at least, "version" is present */ + if (blob_size < sizeof(u32)) { + pr_warn("kexec-metadata blob too small (%zu bytes)\n", + blob_size); + return; + } + + metadata = phys_to_virt(metadata_phys); + + if (metadata->version != KHO_KEXEC_METADATA_VERSION) { + pr_warn("kexec-metadata version %u not supported (expected %u)\n", + metadata->version, KHO_KEXEC_METADATA_VERSION); + return; + } + + if (blob_size < sizeof(*metadata)) { + pr_warn("kexec-metadata blob too small for v%u (%zu < %zu)\n", + metadata->version, blob_size, sizeof(*metadata)); + return; + } + + /* + * Copy data to the kernel structure that will persist during + * kernel lifetime. + */ + kho_in.kexec_count = metadata->kexec_count; + strscpy(kho_in.previous_release, metadata->previous_release, + sizeof(kho_in.previous_release)); + + pr_info("exec from: %s (count %u)\n", + kho_in.previous_release, kho_in.kexec_count); +} + +/* + * Create kexec metadata to pass kernel version and boot count to the + * next kernel. This keeps the core KHO ABI minimal and allows the + * metadata format to evolve independently. + */ +static __init int kho_out_kexec_metadata(void) +{ + struct kho_kexec_metadata *metadata; + int err; + + metadata = kho_alloc_preserve(sizeof(*metadata)); + if (IS_ERR(metadata)) + return PTR_ERR(metadata); + + metadata->version = KHO_KEXEC_METADATA_VERSION; + strscpy(metadata->previous_release, init_uts_ns.name.release, + sizeof(metadata->previous_release)); + /* kho_in.kexec_count is set to 0 on cold boot */ + metadata->kexec_count = kho_in.kexec_count + 1; + + err = kho_add_subtree(KHO_METADATA_NODE_NAME, metadata, + sizeof(*metadata)); + if (err) + kho_unpreserve_free(metadata); + + return err; +} + +static int __init kho_kexec_metadata_init(const void *fdt) +{ + int err; + + if (fdt) + kho_in_kexec_metadata(); + + /* Populate kexec metadata for the possible next kexec */ + err = kho_out_kexec_metadata(); + if (err) + pr_warn("failed to initialize kexec-metadata subtree: %d\n", + err); + + return err; +} + static __init int kho_init(void) { struct kho_radix_tree *tree = &kho_out.radix_tree; @@ -1406,6 +1524,10 @@ static __init int kho_init(void) if (err) goto err_free_fdt; + err = kho_kexec_metadata_init(fdt); + if (err) + goto err_free_fdt; + if (fdt) { kho_in_debugfs_init(&kho_in.dbg, fdt); return 0; @@ -1430,8 +1552,9 @@ static __init int kho_init(void) init_cma_reserved_pageblock(pfn_to_page(pfn)); } - WARN_ON_ONCE(kho_debugfs_fdt_add(&kho_out.dbg, "fdt", - kho_out.fdt, true)); + WARN_ON_ONCE(kho_debugfs_blob_add(&kho_out.dbg, "fdt", + kho_out.fdt, + fdt_totalsize(kho_out.fdt), true)); return 0; diff --git a/kernel/liveupdate/kexec_handover_debugfs.c b/kernel/liveupdate/kexec_handover_debugfs.c index acf368222682..257ee8a52be6 100644 --- a/kernel/liveupdate/kexec_handover_debugfs.c +++ b/kernel/liveupdate/kexec_handover_debugfs.c @@ -24,8 +24,9 @@ struct fdt_debugfs { struct dentry *file; }; -static int __kho_debugfs_fdt_add(struct list_head *list, struct dentry *dir, - const char *name, const void *fdt) +static int __kho_debugfs_blob_add(struct list_head *list, struct dentry *dir, + const char *name, const void *blob, + size_t size) { struct fdt_debugfs *f; struct dentry *file; @@ -34,8 +35,8 @@ static int __kho_debugfs_fdt_add(struct list_head *list, struct dentry *dir, if (!f) return -ENOMEM; - f->wrapper.data = (void *)fdt; - f->wrapper.size = fdt_totalsize(fdt); + f->wrapper.data = (void *)blob; + f->wrapper.size = size; file = debugfs_create_blob(name, 0400, dir, &f->wrapper); if (IS_ERR(file)) { @@ -49,8 +50,8 @@ static int __kho_debugfs_fdt_add(struct list_head *list, struct dentry *dir, return 0; } -int kho_debugfs_fdt_add(struct kho_debugfs *dbg, const char *name, - const void *fdt, bool root) +int kho_debugfs_blob_add(struct kho_debugfs *dbg, const char *name, + const void *blob, size_t size, bool root) { struct dentry *dir; @@ -59,15 +60,15 @@ int kho_debugfs_fdt_add(struct kho_debugfs *dbg, const char *name, else dir = dbg->sub_fdt_dir; - return __kho_debugfs_fdt_add(&dbg->fdt_list, dir, name, fdt); + return __kho_debugfs_blob_add(&dbg->fdt_list, dir, name, blob, size); } -void kho_debugfs_fdt_remove(struct kho_debugfs *dbg, void *fdt) +void kho_debugfs_blob_remove(struct kho_debugfs *dbg, void *blob) { struct fdt_debugfs *ff; list_for_each_entry(ff, &dbg->fdt_list, list) { - if (ff->wrapper.data == fdt) { + if (ff->wrapper.data == blob) { debugfs_remove(ff->file); list_del(&ff->list); kfree(ff); @@ -113,28 +114,42 @@ __init void kho_in_debugfs_init(struct kho_debugfs *dbg, const void *fdt) goto err_rmdir; } - err = __kho_debugfs_fdt_add(&dbg->fdt_list, dir, "fdt", fdt); + err = __kho_debugfs_blob_add(&dbg->fdt_list, dir, "fdt", fdt, + fdt_totalsize(fdt)); if (err) goto err_rmdir; fdt_for_each_subnode(child, fdt, 0) { int len = 0; const char *name = fdt_get_name(fdt, child, NULL); - const u64 *fdt_phys; + const u64 *blob_phys; + const u64 *blob_size; + void *blob; - fdt_phys = fdt_getprop(fdt, child, KHO_FDT_SUB_TREE_PROP_NAME, &len); - if (!fdt_phys) + blob_phys = fdt_getprop(fdt, child, + KHO_SUB_TREE_PROP_NAME, &len); + if (!blob_phys) continue; - if (len != sizeof(*fdt_phys)) { - pr_warn("node %s prop fdt has invalid length: %d\n", - name, len); + if (len != sizeof(*blob_phys)) { + pr_warn("node %s prop %s has invalid length: %d\n", + name, KHO_SUB_TREE_PROP_NAME, len); continue; } - err = __kho_debugfs_fdt_add(&dbg->fdt_list, sub_fdt_dir, name, - phys_to_virt(*fdt_phys)); + + blob_size = fdt_getprop(fdt, child, + KHO_SUB_TREE_SIZE_PROP_NAME, &len); + if (!blob_size || len != sizeof(*blob_size)) { + pr_warn("node %s missing or invalid %s property\n", + name, KHO_SUB_TREE_SIZE_PROP_NAME); + continue; + } + + blob = phys_to_virt(*blob_phys); + err = __kho_debugfs_blob_add(&dbg->fdt_list, sub_fdt_dir, name, + blob, *blob_size); if (err) { - pr_warn("failed to add fdt %s to debugfs: %pe\n", name, - ERR_PTR(err)); + pr_warn("failed to add blob %s to debugfs: %pe\n", + name, ERR_PTR(err)); continue; } } diff --git a/kernel/liveupdate/kexec_handover_internal.h b/kernel/liveupdate/kexec_handover_internal.h index 9a832a35254c..0399ff107775 100644 --- a/kernel/liveupdate/kexec_handover_internal.h +++ b/kernel/liveupdate/kexec_handover_internal.h @@ -26,18 +26,19 @@ extern unsigned int kho_scratch_cnt; int kho_debugfs_init(void); void kho_in_debugfs_init(struct kho_debugfs *dbg, const void *fdt); int kho_out_debugfs_init(struct kho_debugfs *dbg); -int kho_debugfs_fdt_add(struct kho_debugfs *dbg, const char *name, - const void *fdt, bool root); -void kho_debugfs_fdt_remove(struct kho_debugfs *dbg, void *fdt); +int kho_debugfs_blob_add(struct kho_debugfs *dbg, const char *name, + const void *blob, size_t size, bool root); +void kho_debugfs_blob_remove(struct kho_debugfs *dbg, void *blob); #else static inline int kho_debugfs_init(void) { return 0; } static inline void kho_in_debugfs_init(struct kho_debugfs *dbg, const void *fdt) { } static inline int kho_out_debugfs_init(struct kho_debugfs *dbg) { return 0; } -static inline int kho_debugfs_fdt_add(struct kho_debugfs *dbg, const char *name, - const void *fdt, bool root) { return 0; } -static inline void kho_debugfs_fdt_remove(struct kho_debugfs *dbg, - void *fdt) { } +static inline int kho_debugfs_blob_add(struct kho_debugfs *dbg, + const char *name, const void *blob, + size_t size, bool root) { return 0; } +static inline void kho_debugfs_blob_remove(struct kho_debugfs *dbg, + void *blob) { } #endif /* CONFIG_KEXEC_HANDOVER_DEBUGFS */ #ifdef CONFIG_KEXEC_HANDOVER_DEBUG diff --git a/kernel/liveupdate/luo_core.c b/kernel/liveupdate/luo_core.c index 84ac728d63ba..803f51c84275 100644 --- a/kernel/liveupdate/luo_core.c +++ b/kernel/liveupdate/luo_core.c @@ -54,6 +54,7 @@ #include <linux/liveupdate.h> #include <linux/miscdevice.h> #include <linux/mm.h> +#include <linux/rwsem.h> #include <linux/sizes.h> #include <linux/string.h> #include <linux/unaligned.h> @@ -68,6 +69,11 @@ static struct { u64 liveupdate_num; } luo_global; +/* + * luo_register_rwlock - Protects registration of file handlers and FLBs. + */ +DECLARE_RWSEM(luo_register_rwlock); + static int __init early_liveupdate_param(char *buf) { return kstrtobool(buf, &luo_global.enabled); @@ -88,7 +94,7 @@ static int __init luo_early_startup(void) } /* Retrieve LUO subtree, and verify its format. */ - err = kho_retrieve_subtree(LUO_FDT_KHO_ENTRY_NAME, &fdt_phys); + err = kho_retrieve_subtree(LUO_FDT_KHO_ENTRY_NAME, &fdt_phys, NULL); if (err) { if (err != -ENOENT) { pr_err("failed to retrieve FDT '%s' from KHO: %pe\n", @@ -172,7 +178,8 @@ static int __init luo_fdt_setup(void) if (err) goto exit_free; - err = kho_add_subtree(LUO_FDT_KHO_ENTRY_NAME, fdt_out); + err = kho_add_subtree(LUO_FDT_KHO_ENTRY_NAME, fdt_out, + fdt_totalsize(fdt_out)); if (err) goto exit_free; luo_global.fdt_out = fdt_out; diff --git a/kernel/liveupdate/luo_file.c b/kernel/liveupdate/luo_file.c index 5acee4174bf0..a0a419085e28 100644 --- a/kernel/liveupdate/luo_file.c +++ b/kernel/liveupdate/luo_file.c @@ -108,12 +108,16 @@ #include <linux/liveupdate.h> #include <linux/module.h> #include <linux/sizes.h> +#include <linux/xarray.h> #include <linux/slab.h> #include <linux/string.h> #include "luo_internal.h" static LIST_HEAD(luo_file_handler_list); +/* Keep track of files being preserved by LUO */ +static DEFINE_XARRAY(luo_preserved_files); + /* 2 4K pages, give space for 128 files per file_set */ #define LUO_FILE_PGCNT 2ul #define LUO_FILE_MAX \ @@ -203,6 +207,12 @@ static void luo_free_files_mem(struct luo_file_set *file_set) file_set->files = NULL; } +static unsigned long luo_get_id(struct liveupdate_file_handler *fh, + struct file *file) +{ + return fh->ops->get_id ? fh->ops->get_id(file) : (unsigned long)file; +} + static bool luo_token_is_used(struct luo_file_set *file_set, u64 token) { struct luo_file *iter; @@ -248,6 +258,7 @@ static bool luo_token_is_used(struct luo_file_set *file_set, u64 token) * Context: Can be called from an ioctl handler during normal system operation. * Return: 0 on success. Returns a negative errno on failure: * -EEXIST if the token is already used. + * -EBUSY if the file descriptor is already preserved by another session. * -EBADF if the file descriptor is invalid. * -ENOSPC if the file_set is full. * -ENOENT if no compatible handler is found. @@ -277,20 +288,28 @@ int luo_preserve_file(struct luo_file_set *file_set, u64 token, int fd) goto err_fput; err = -ENOENT; + down_read(&luo_register_rwlock); list_private_for_each_entry(fh, &luo_file_handler_list, list) { if (fh->ops->can_preserve(fh, file)) { - err = 0; + if (try_module_get(fh->ops->owner)) + err = 0; break; } } + up_read(&luo_register_rwlock); /* err is still -ENOENT if no handler was found */ if (err) goto err_free_files_mem; + err = xa_insert(&luo_preserved_files, luo_get_id(fh, file), + file, GFP_KERNEL); + if (err) + goto err_module_put; + err = luo_flb_file_preserve(fh); if (err) - goto err_free_files_mem; + goto err_erase_xa; luo_file = kzalloc_obj(*luo_file); if (!luo_file) { @@ -320,6 +339,10 @@ err_kfree: kfree(luo_file); err_flb_unpreserve: luo_flb_file_unpreserve(fh); +err_erase_xa: + xa_erase(&luo_preserved_files, luo_get_id(fh, file)); +err_module_put: + module_put(fh->ops->owner); err_free_files_mem: luo_free_files_mem(file_set); err_fput: @@ -362,7 +385,10 @@ void luo_file_unpreserve_files(struct luo_file_set *file_set) args.private_data = luo_file->private_data; luo_file->fh->ops->unpreserve(&args); luo_flb_file_unpreserve(luo_file->fh); + module_put(luo_file->fh->ops->owner); + xa_erase(&luo_preserved_files, + luo_get_id(luo_file->fh, luo_file->file)); list_del(&luo_file->list); file_set->count--; @@ -606,6 +632,11 @@ int luo_retrieve_file(struct luo_file_set *file_set, u64 token, luo_file->file = args.file; /* Get reference so we can keep this file in LUO until finish */ get_file(luo_file->file); + + WARN_ON(xa_insert(&luo_preserved_files, + luo_get_id(luo_file->fh, luo_file->file), + luo_file->file, GFP_KERNEL)); + *filep = luo_file->file; luo_file->retrieve_status = 1; @@ -646,6 +677,7 @@ static void luo_file_finish_one(struct luo_file_set *file_set, luo_file->fh->ops->finish(&args); luo_flb_file_finish(luo_file->fh); + module_put(luo_file->fh->ops->owner); } /** @@ -701,8 +733,11 @@ int luo_file_finish(struct luo_file_set *file_set) luo_file_finish_one(file_set, luo_file); - if (luo_file->file) + if (luo_file->file) { + xa_erase(&luo_preserved_files, + luo_get_id(luo_file->fh, luo_file->file)); fput(luo_file->file); + } list_del(&luo_file->list); file_set->count--; mutex_destroy(&luo_file->mutex); @@ -777,22 +812,28 @@ int luo_file_deserialize(struct luo_file_set *file_set, bool handler_found = false; struct luo_file *luo_file; + down_read(&luo_register_rwlock); list_private_for_each_entry(fh, &luo_file_handler_list, list) { if (!strcmp(fh->compatible, file_ser[i].compatible)) { - handler_found = true; + if (try_module_get(fh->ops->owner)) + handler_found = true; break; } } + up_read(&luo_register_rwlock); if (!handler_found) { - pr_warn("No registered handler for compatible '%s'\n", + pr_warn("No registered handler for compatible '%.*s'\n", + (int)sizeof(file_ser[i].compatible), file_ser[i].compatible); return -ENOENT; } luo_file = kzalloc_obj(*luo_file); - if (!luo_file) + if (!luo_file) { + module_put(fh->ops->owner); return -ENOMEM; + } luo_file->fh = fh; luo_file->file = NULL; @@ -842,41 +883,28 @@ int liveupdate_register_file_handler(struct liveupdate_file_handler *fh) return -EINVAL; } - /* - * Ensure the system is quiescent (no active sessions). - * This prevents registering new handlers while sessions are active or - * while deserialization is in progress. - */ - if (!luo_session_quiesce()) - return -EBUSY; - + down_write(&luo_register_rwlock); /* Check for duplicate compatible strings */ list_private_for_each_entry(fh_iter, &luo_file_handler_list, list) { if (!strcmp(fh_iter->compatible, fh->compatible)) { pr_err("File handler registration failed: Compatible string '%s' already registered.\n", fh->compatible); err = -EEXIST; - goto err_resume; + goto err_unlock; } } - /* Pin the module implementing the handler */ - if (!try_module_get(fh->ops->owner)) { - err = -EAGAIN; - goto err_resume; - } - INIT_LIST_HEAD(&ACCESS_PRIVATE(fh, flb_list)); INIT_LIST_HEAD(&ACCESS_PRIVATE(fh, list)); list_add_tail(&ACCESS_PRIVATE(fh, list), &luo_file_handler_list); - luo_session_resume(); + up_write(&luo_register_rwlock); liveupdate_test_register(fh); return 0; -err_resume: - luo_session_resume(); +err_unlock: + up_write(&luo_register_rwlock); return err; } @@ -886,41 +914,13 @@ err_resume: * * Unregisters the file handler from the liveupdate core. This function * reverses the operations of liveupdate_register_file_handler(). - * - * It ensures safe removal by checking that: - * No live update session is currently in progress. - * No FLB registered with this file handler. - * - * If the unregistration fails, the internal test state is reverted. - * - * Return: 0 Success. -EOPNOTSUPP when live update is not enabled. -EBUSY A live - * update is in progress, can't quiesce live update or FLB is registred with - * this file handler. */ -int liveupdate_unregister_file_handler(struct liveupdate_file_handler *fh) +void liveupdate_unregister_file_handler(struct liveupdate_file_handler *fh) { - int err = -EBUSY; - if (!liveupdate_enabled()) - return -EOPNOTSUPP; - - liveupdate_test_unregister(fh); - - if (!luo_session_quiesce()) - goto err_register; - - if (!list_empty(&ACCESS_PRIVATE(fh, flb_list))) - goto err_resume; + return; + guard(rwsem_write)(&luo_register_rwlock); + luo_flb_unregister_all(fh); list_del(&ACCESS_PRIVATE(fh, list)); - module_put(fh->ops->owner); - luo_session_resume(); - - return 0; - -err_resume: - luo_session_resume(); -err_register: - liveupdate_test_register(fh); - return err; } diff --git a/kernel/liveupdate/luo_flb.c b/kernel/liveupdate/luo_flb.c index f52e8114837e..00f5494812c4 100644 --- a/kernel/liveupdate/luo_flb.c +++ b/kernel/liveupdate/luo_flb.c @@ -89,13 +89,18 @@ struct luo_flb_link { static struct luo_flb_private *luo_flb_get_private(struct liveupdate_flb *flb) { struct luo_flb_private *private = &ACCESS_PRIVATE(flb, private); + static DEFINE_SPINLOCK(luo_flb_init_lock); + if (smp_load_acquire(&private->initialized)) + return private; + + guard(spinlock)(&luo_flb_init_lock); if (!private->initialized) { mutex_init(&private->incoming.lock); mutex_init(&private->outgoing.lock); INIT_LIST_HEAD(&private->list); private->users = 0; - private->initialized = true; + smp_store_release(&private->initialized, true); } return private; @@ -110,10 +115,15 @@ static int luo_flb_file_preserve_one(struct liveupdate_flb *flb) struct liveupdate_flb_op_args args = {0}; int err; + if (!try_module_get(flb->ops->owner)) + return -ENODEV; + args.flb = flb; err = flb->ops->preserve(&args); - if (err) + if (err) { + module_put(flb->ops->owner); return err; + } private->outgoing.data = args.data; private->outgoing.obj = args.obj; } @@ -141,6 +151,7 @@ static void luo_flb_file_unpreserve_one(struct liveupdate_flb *flb) private->outgoing.data = 0; private->outgoing.obj = NULL; + module_put(flb->ops->owner); } } } @@ -176,12 +187,17 @@ static int luo_flb_retrieve_one(struct liveupdate_flb *flb) if (!found) return -ENOENT; + if (!try_module_get(flb->ops->owner)) + return -ENODEV; + args.flb = flb; args.data = private->incoming.data; err = flb->ops->retrieve(&args); - if (err) + if (err) { + module_put(flb->ops->owner); return err; + } private->incoming.obj = args.obj; private->incoming.retrieved = true; @@ -215,6 +231,7 @@ static void luo_flb_file_finish_one(struct liveupdate_flb *flb) private->incoming.data = 0; private->incoming.obj = NULL; private->incoming.finished = true; + module_put(flb->ops->owner); } } } @@ -240,17 +257,20 @@ int luo_flb_file_preserve(struct liveupdate_file_handler *fh) struct luo_flb_link *iter; int err = 0; + down_read(&luo_register_rwlock); list_for_each_entry(iter, flb_list, list) { err = luo_flb_file_preserve_one(iter->flb); if (err) goto exit_err; } + up_read(&luo_register_rwlock); return 0; exit_err: list_for_each_entry_continue_reverse(iter, flb_list, list) luo_flb_file_unpreserve_one(iter->flb); + up_read(&luo_register_rwlock); return err; } @@ -272,6 +292,7 @@ void luo_flb_file_unpreserve(struct liveupdate_file_handler *fh) struct list_head *flb_list = &ACCESS_PRIVATE(fh, flb_list); struct luo_flb_link *iter; + guard(rwsem_read)(&luo_register_rwlock); list_for_each_entry_reverse(iter, flb_list, list) luo_flb_file_unpreserve_one(iter->flb); } @@ -292,10 +313,67 @@ void luo_flb_file_finish(struct liveupdate_file_handler *fh) struct list_head *flb_list = &ACCESS_PRIVATE(fh, flb_list); struct luo_flb_link *iter; + guard(rwsem_read)(&luo_register_rwlock); list_for_each_entry_reverse(iter, flb_list, list) luo_flb_file_finish_one(iter->flb); } +static void luo_flb_unregister_one(struct liveupdate_file_handler *fh, + struct liveupdate_flb *flb) +{ + struct luo_flb_private *private = luo_flb_get_private(flb); + struct list_head *flb_list = &ACCESS_PRIVATE(fh, flb_list); + struct luo_flb_link *iter; + bool found = false; + + /* Find and remove the link from the file handler's list */ + list_for_each_entry(iter, flb_list, list) { + if (iter->flb == flb) { + list_del(&iter->list); + kfree(iter); + found = true; + break; + } + } + + if (!found) { + pr_warn("Failed to unregister FLB '%s': not found in file handler '%s'\n", + flb->compatible, fh->compatible); + return; + } + + private->users--; + + /* + * If this is the last file-handler with which we are registred, remove + * from the global list. + */ + if (!private->users) { + list_del_init(&private->list); + luo_flb_global.count--; + } +} + +/** + * luo_flb_unregister_all - Unregister all FLBs associated with a file handler. + * @fh: The file handler whose FLBs should be unregistered. + * + * This function iterates through the list of FLBs associated with the given + * file handler and unregisters them all one by one. + */ +void luo_flb_unregister_all(struct liveupdate_file_handler *fh) +{ + struct list_head *flb_list = &ACCESS_PRIVATE(fh, flb_list); + struct luo_flb_link *iter, *tmp; + + if (!liveupdate_enabled()) + return; + + lockdep_assert_held_write(&luo_register_rwlock); + list_for_each_entry_safe(iter, tmp, flb_list, list) + luo_flb_unregister_one(fh, iter->flb); +} + /** * liveupdate_register_flb - Associate an FLB with a file handler and register it globally. * @fh: The file handler that will now depend on the FLB. @@ -326,7 +404,6 @@ int liveupdate_register_flb(struct liveupdate_file_handler *fh, struct luo_flb_link *link __free(kfree) = NULL; struct liveupdate_flb *gflb; struct luo_flb_link *iter; - int err; if (!liveupdate_enabled()) return -EOPNOTSUPP; @@ -347,19 +424,12 @@ int liveupdate_register_flb(struct liveupdate_file_handler *fh, if (!link) return -ENOMEM; - /* - * Ensure the system is quiescent (no active sessions). - * This acts as a global lock for registration: no other thread can - * be in this section, and no sessions can be creating/using FDs. - */ - if (!luo_session_quiesce()) - return -EBUSY; + guard(rwsem_write)(&luo_register_rwlock); /* Check that this FLB is not already linked to this file handler */ - err = -EEXIST; list_for_each_entry(iter, flb_list, list) { if (iter->flb == flb) - goto err_resume; + return -EEXIST; } /* @@ -367,25 +437,16 @@ int liveupdate_register_flb(struct liveupdate_file_handler *fh, * is registered */ if (!private->users) { - if (WARN_ON(!list_empty(&private->list))) { - err = -EINVAL; - goto err_resume; - } + if (WARN_ON(!list_empty(&private->list))) + return -EINVAL; - if (luo_flb_global.count == LUO_FLB_MAX) { - err = -ENOSPC; - goto err_resume; - } + if (luo_flb_global.count == LUO_FLB_MAX) + return -ENOSPC; /* Check that compatible string is unique in global list */ list_private_for_each_entry(gflb, &luo_flb_global.list, private.list) { if (!strcmp(gflb->compatible, flb->compatible)) - goto err_resume; - } - - if (!try_module_get(flb->ops->owner)) { - err = -EAGAIN; - goto err_resume; + return -EEXIST; } list_add_tail(&private->list, &luo_flb_global.list); @@ -396,13 +457,8 @@ int liveupdate_register_flb(struct liveupdate_file_handler *fh, private->users++; link->flb = flb; list_add_tail(&no_free_ptr(link)->list, flb_list); - luo_session_resume(); return 0; - -err_resume: - luo_session_resume(); - return err; } /** @@ -418,63 +474,17 @@ err_resume: * the FLB is removed from the global registry and the reference to its * owner module (acquired during registration) is released. * - * Context: This function ensures the session is quiesced (no active FDs - * being created) during the update. It is typically called from a - * subsystem's module exit function. - * Return: 0 on success. - * -EOPNOTSUPP if live update is disabled. - * -EBUSY if the live update session is active and cannot be quiesced. - * -ENOENT if the FLB was not found in the file handler's list. + * Context: It is typically called from a subsystem's module exit function. */ -int liveupdate_unregister_flb(struct liveupdate_file_handler *fh, - struct liveupdate_flb *flb) +void liveupdate_unregister_flb(struct liveupdate_file_handler *fh, + struct liveupdate_flb *flb) { - struct luo_flb_private *private = luo_flb_get_private(flb); - struct list_head *flb_list = &ACCESS_PRIVATE(fh, flb_list); - struct luo_flb_link *iter; - int err = -ENOENT; - if (!liveupdate_enabled()) - return -EOPNOTSUPP; + return; - /* - * Ensure the system is quiescent (no active sessions). - * This acts as a global lock for unregistration. - */ - if (!luo_session_quiesce()) - return -EBUSY; + guard(rwsem_write)(&luo_register_rwlock); - /* Find and remove the link from the file handler's list */ - list_for_each_entry(iter, flb_list, list) { - if (iter->flb == flb) { - list_del(&iter->list); - kfree(iter); - err = 0; - break; - } - } - - if (err) - goto err_resume; - - private->users--; - /* - * If this is the last file-handler with which we are registred, remove - * from the global list, and relese module reference. - */ - if (!private->users) { - list_del_init(&private->list); - luo_flb_global.count--; - module_put(flb->ops->owner); - } - - luo_session_resume(); - - return 0; - -err_resume: - luo_session_resume(); - return err; + luo_flb_unregister_one(fh, flb); } /** @@ -492,7 +502,8 @@ err_resume: * * Return: 0 on success, or a negative errno on failure. -ENODATA means no * incoming FLB data, -ENOENT means specific flb not found in the incoming - * data, and -EOPNOTSUPP when live update is disabled or not configured. + * data, -ENODEV if the FLB's module is unloading, and -EOPNOTSUPP when + * live update is disabled or not configured. */ int liveupdate_flb_get_incoming(struct liveupdate_flb *flb, void **objp) { @@ -638,6 +649,7 @@ void luo_flb_serialize(void) struct liveupdate_flb *gflb; int i = 0; + guard(rwsem_read)(&luo_register_rwlock); list_private_for_each_entry(gflb, &luo_flb_global.list, private.list) { struct luo_flb_private *private = luo_flb_get_private(gflb); diff --git a/kernel/liveupdate/luo_internal.h b/kernel/liveupdate/luo_internal.h index 8083d8739b09..875844d7a41d 100644 --- a/kernel/liveupdate/luo_internal.h +++ b/kernel/liveupdate/luo_internal.h @@ -77,14 +77,14 @@ struct luo_session { struct mutex mutex; }; +extern struct rw_semaphore luo_register_rwlock; + int luo_session_create(const char *name, struct file **filep); int luo_session_retrieve(const char *name, struct file **filep); int __init luo_session_setup_outgoing(void *fdt); int __init luo_session_setup_incoming(void *fdt); int luo_session_serialize(void); int luo_session_deserialize(void); -bool luo_session_quiesce(void); -void luo_session_resume(void); int luo_preserve_file(struct luo_file_set *file_set, u64 token, int fd); void luo_file_unpreserve_files(struct luo_file_set *file_set); @@ -103,16 +103,15 @@ void luo_file_set_destroy(struct luo_file_set *file_set); int luo_flb_file_preserve(struct liveupdate_file_handler *fh); void luo_flb_file_unpreserve(struct liveupdate_file_handler *fh); void luo_flb_file_finish(struct liveupdate_file_handler *fh); +void luo_flb_unregister_all(struct liveupdate_file_handler *fh); int __init luo_flb_setup_outgoing(void *fdt); int __init luo_flb_setup_incoming(void *fdt); void luo_flb_serialize(void); #ifdef CONFIG_LIVEUPDATE_TEST void liveupdate_test_register(struct liveupdate_file_handler *fh); -void liveupdate_test_unregister(struct liveupdate_file_handler *fh); #else static inline void liveupdate_test_register(struct liveupdate_file_handler *fh) { } -static inline void liveupdate_test_unregister(struct liveupdate_file_handler *fh) { } #endif #endif /* _LINUX_LUO_INTERNAL_H */ diff --git a/kernel/liveupdate/luo_session.c b/kernel/liveupdate/luo_session.c index 25ae704d7787..7a42385dabe2 100644 --- a/kernel/liveupdate/luo_session.c +++ b/kernel/liveupdate/luo_session.c @@ -514,11 +514,12 @@ int luo_session_deserialize(void) { struct luo_session_header *sh = &luo_session_global.incoming; static bool is_deserialized; - static int err; + static int saved_err; + int err; /* If has been deserialized, always return the same error code */ if (is_deserialized) - return err; + return saved_err; is_deserialized = true; if (!sh->active) @@ -544,9 +545,11 @@ int luo_session_deserialize(void) session = luo_session_alloc(sh->ser[i].name); if (IS_ERR(session)) { - pr_warn("Failed to allocate session [%s] during deserialization %pe\n", + pr_warn("Failed to allocate session [%.*s] during deserialization %pe\n", + (int)sizeof(sh->ser[i].name), sh->ser[i].name, session); - return PTR_ERR(session); + err = PTR_ERR(session); + goto save_err; } err = luo_session_insert(sh, session); @@ -554,7 +557,7 @@ int luo_session_deserialize(void) pr_warn("Failed to insert session [%s] %pe\n", session->name, ERR_PTR(err)); luo_session_free(session); - return err; + goto save_err; } scoped_guard(mutex, &session->mutex) { @@ -564,7 +567,7 @@ int luo_session_deserialize(void) if (err) { pr_warn("Failed to deserialize files for session [%s] %pe\n", session->name, ERR_PTR(err)); - return err; + goto save_err; } } @@ -573,6 +576,9 @@ int luo_session_deserialize(void) sh->ser = NULL; return 0; +save_err: + saved_err = err; + return err; } int luo_session_serialize(void) @@ -606,46 +612,3 @@ err_undo: return err; } -/** - * luo_session_quiesce - Ensure no active sessions exist and lock session lists. - * - * Acquires exclusive write locks on both incoming and outgoing session lists. - * It then validates no sessions exist in either list. - * - * This mechanism is used during file handler un/registration to ensure that no - * sessions are currently using the handler, and no new sessions can be created - * while un/registration is in progress. - * - * This prevents registering new handlers while sessions are active or - * while deserialization is in progress. - * - * Return: - * true - System is quiescent (0 sessions) and locked. - * false - Active sessions exist. The locks are released internally. - */ -bool luo_session_quiesce(void) -{ - down_write(&luo_session_global.incoming.rwsem); - down_write(&luo_session_global.outgoing.rwsem); - - if (luo_session_global.incoming.count || - luo_session_global.outgoing.count) { - up_write(&luo_session_global.outgoing.rwsem); - up_write(&luo_session_global.incoming.rwsem); - return false; - } - - return true; -} - -/** - * luo_session_resume - Unlock session lists and resume normal activity. - * - * Releases the exclusive locks acquired by a successful call to - * luo_session_quiesce(). - */ -void luo_session_resume(void) -{ - up_write(&luo_session_global.outgoing.rwsem); - up_write(&luo_session_global.incoming.rwsem); -} diff --git a/kernel/locking/mutex.c b/kernel/locking/mutex.c index 186b463fe326..09534628dc01 100644 --- a/kernel/locking/mutex.c +++ b/kernel/locking/mutex.c @@ -198,27 +198,43 @@ static inline void __mutex_clear_flag(struct mutex *lock, unsigned long flag) } /* - * Add @waiter to a given location in the lock wait_list and set the - * FLAG_WAITERS flag if it's the first waiter. + * Add @waiter to the @lock wait_list and set the FLAG_WAITERS flag if it's + * the first waiter. + * + * When @pos, @waiter is added before the waiter indicated by @pos. Otherwise + * @waiter will be added to the tail of the list. */ static void __mutex_add_waiter(struct mutex *lock, struct mutex_waiter *waiter, - struct mutex_waiter *first) + struct mutex_waiter *pos) __must_hold(&lock->wait_lock) { + struct mutex_waiter *first = lock->first_waiter; + hung_task_set_blocker(lock, BLOCKER_TYPE_MUTEX); debug_mutex_add_waiter(lock, waiter, current); - if (!first) - first = lock->first_waiter; + if (pos) { + /* + * Insert @waiter before @pos. + */ + list_add_tail(&waiter->list, &pos->list); + /* + * If @pos == @first, then @waiter will be the new first. + */ + if (pos == first) + lock->first_waiter = waiter; + return; + } if (first) { list_add_tail(&waiter->list, &first->list); - } else { - INIT_LIST_HEAD(&waiter->list); - lock->first_waiter = waiter; - __mutex_set_flag(lock, MUTEX_FLAG_WAITERS); + return; } + + INIT_LIST_HEAD(&waiter->list); + lock->first_waiter = waiter; + __mutex_set_flag(lock, MUTEX_FLAG_WAITERS); } static void @@ -229,10 +245,8 @@ __mutex_remove_waiter(struct mutex *lock, struct mutex_waiter *waiter) __mutex_clear_flag(lock, MUTEX_FLAGS); lock->first_waiter = NULL; } else { - if (lock->first_waiter == waiter) { - lock->first_waiter = list_first_entry(&waiter->list, - struct mutex_waiter, list); - } + if (lock->first_waiter == waiter) + lock->first_waiter = list_next_entry(waiter, list); list_del(&waiter->list); } diff --git a/kernel/locking/rtmutex.c b/kernel/locking/rtmutex.c index ccaba6148b61..4f386ea6c792 100644 --- a/kernel/locking/rtmutex.c +++ b/kernel/locking/rtmutex.c @@ -1544,6 +1544,8 @@ static bool rtmutex_spin_on_owner(struct rt_mutex_base *lock, * * Must be called with lock->wait_lock held and interrupts disabled. It must * have just failed to try_to_take_rt_mutex(). + * + * When invoked from rt_mutex_start_proxy_lock() waiter::task != current ! */ static void __sched remove_waiter(struct rt_mutex_base *lock, struct rt_mutex_waiter *waiter) @@ -1551,14 +1553,15 @@ static void __sched remove_waiter(struct rt_mutex_base *lock, { bool is_top_waiter = (waiter == rt_mutex_top_waiter(lock)); struct task_struct *owner = rt_mutex_owner(lock); + struct task_struct *waiter_task = waiter->task; struct rt_mutex_base *next_lock; lockdep_assert_held(&lock->wait_lock); - raw_spin_lock(¤t->pi_lock); - rt_mutex_dequeue(lock, waiter); - current->pi_blocked_on = NULL; - raw_spin_unlock(¤t->pi_lock); + scoped_guard(raw_spinlock, &waiter_task->pi_lock) { + rt_mutex_dequeue(lock, waiter); + waiter_task->pi_blocked_on = NULL; + } /* * Only update priority if the waiter was the highest priority @@ -1594,7 +1597,7 @@ static void __sched remove_waiter(struct rt_mutex_base *lock, raw_spin_unlock_irq(&lock->wait_lock); rt_mutex_adjust_prio_chain(owner, RT_MUTEX_MIN_CHAINWALK, lock, - next_lock, NULL, current); + next_lock, NULL, waiter_task); raw_spin_lock_irq(&lock->wait_lock); } diff --git a/kernel/locking/ww_mutex.h b/kernel/locking/ww_mutex.h index 016f0db892a5..6c12452097e1 100644 --- a/kernel/locking/ww_mutex.h +++ b/kernel/locking/ww_mutex.h @@ -6,6 +6,19 @@ #define MUTEX_WAITER mutex_waiter #define WAIT_LOCK wait_lock +/* + * +--------+ + * | first | + * +--------+ + * | + * v + * +----+ +----+ +----+ + * | W3 | <-> | W1 | <-> | W2 | + * +----+ +----+ +----+ + * ^ ^ + * +---------------------+ + */ + static inline struct mutex_waiter * __ww_waiter_first(struct mutex *lock) __must_hold(&lock->wait_lock) @@ -13,26 +26,43 @@ __ww_waiter_first(struct mutex *lock) return lock->first_waiter; } +/* + * for (cur = __ww_waiter_first(); cur; cur = __ww_waiter_next()) + * + * Should iterate like: W1, W2, W3 + */ static inline struct mutex_waiter * __ww_waiter_next(struct mutex *lock, struct mutex_waiter *w) __must_hold(&lock->wait_lock) { w = list_next_entry(w, list); + /* + * Terminate if the next entry is the first again, that has already + * been observed. + */ if (lock->first_waiter == w) return NULL; return w; } +/* + * for (cur = __ww_waiter_last(); cur; cur = __ww_waiter_prev()) + * + * Should iterate like: W3, W2, W1 + */ static inline struct mutex_waiter * __ww_waiter_prev(struct mutex *lock, struct mutex_waiter *w) __must_hold(&lock->wait_lock) { - w = list_prev_entry(w, list); + /* + * Terminate at the first entry, the previous entry of first is the + * last and that has already been observed. + */ if (lock->first_waiter == w) return NULL; - return w; + return list_prev_entry(w, list); } static inline struct mutex_waiter * diff --git a/kernel/printk/printk_ringbuffer.c b/kernel/printk/printk_ringbuffer.c index 56c8e3d031f4..85c0c854b3ce 100644 --- a/kernel/printk/printk_ringbuffer.c +++ b/kernel/printk/printk_ringbuffer.c @@ -14,7 +14,7 @@ * * Data Structure * -------------- - * The printk_ringbuffer is made up of 3 internal ringbuffers: + * The printk_ringbuffer is made up of 2 internal ringbuffers: * * desc_ring * A ring of descriptors and their meta data (such as sequence number, @@ -224,7 +224,7 @@ * * prb_rec_init_rd(&r, &info, &text_buf[0], sizeof(text_buf)); * - * prb_for_each_record(0, &test_rb, &seq, &r) { + * prb_for_each_record(0, &test_rb, seq, &r) { * if (info.seq != seq) * pr_warn("lost %llu records\n", info.seq - seq); * @@ -1302,23 +1302,26 @@ static const char *get_data(struct prb_data_ring *data_ring, return NULL; } - /* Sanity check. Data-less blocks were handled earlier. */ - if (WARN_ON_ONCE(!data_check_size(data_ring, *data_size) || !*data_size)) - return NULL; - /* A valid data block will always be aligned to the ID size. */ if (WARN_ON_ONCE(blk_lpos->begin != ALIGN(blk_lpos->begin, sizeof(db->id))) || WARN_ON_ONCE(blk_lpos->next != ALIGN(blk_lpos->next, sizeof(db->id)))) { return NULL; } - /* A valid data block will always have at least an ID. */ - if (WARN_ON_ONCE(*data_size < sizeof(db->id))) + /* + * A regular data block will always have an ID and at least + * 1 byte of data. Data-less blocks were handled earlier. + */ + if (WARN_ON_ONCE(*data_size <= sizeof(db->id))) return NULL; /* Subtract block ID space from size to reflect data size. */ *data_size -= sizeof(db->id); + /* Sanity check the max size of the regular data block. */ + if (WARN_ON_ONCE(!data_check_size(data_ring, *data_size))) + return NULL; + return &db->data[0]; } @@ -1365,7 +1368,7 @@ static struct prb_desc *desc_reopen_last(struct prb_desc_ring *desc_ring, * * WMB from _prb_commit:A to _prb_commit:B * matching - * MB If desc_reopen_last:A to prb_reserve_in_last:A + * MB from desc_reopen_last:A to prb_reserve_in_last:A */ if (!atomic_long_try_cmpxchg(&d->state_var, &prev_state_val, DESC_SV(id, desc_reserved))) { /* LMM(desc_reopen_last:A) */ @@ -1770,9 +1773,9 @@ static void _prb_commit(struct prb_reserved_entry *e, unsigned long state_val) * * Relies on: * - * MB _prb_commit:B to prb_commit:A + * MB from _prb_commit:B to prb_commit:A * matching - * MB desc_reserve:D to desc_make_final:A + * MB from desc_reserve:D to desc_make_final:A */ if (!atomic_long_try_cmpxchg(&d->state_var, &prev_state_val, DESC_SV(e->id, state_val))) { /* LMM(_prb_commit:B) */ @@ -2035,7 +2038,7 @@ u64 prb_first_seq(struct printk_ringbuffer *rb) * * MB from desc_push_tail:B to desc_reserve:F * matching - * RMB prb_first_seq:B to prb_first_seq:A + * RMB from prb_first_seq:B to prb_first_seq:A */ smp_rmb(); /* LMM(prb_first_seq:C) */ } diff --git a/kernel/printk/printk_ringbuffer.h b/kernel/printk/printk_ringbuffer.h index 1651b53ece34..2648da0a68b2 100644 --- a/kernel/printk/printk_ringbuffer.h +++ b/kernel/printk/printk_ringbuffer.h @@ -127,7 +127,7 @@ enum desc_state { }; #define _DATA_SIZE(sz_bits) (1UL << (sz_bits)) -#define _DESCS_COUNT(ct_bits) (1U << (ct_bits)) +#define _DESCS_COUNT(ct_bits) (1UL << (ct_bits)) #define DESC_SV_BITS BITS_PER_LONG #define DESC_FLAGS_SHIFT (DESC_SV_BITS - 2) #define DESC_FLAGS_MASK (3UL << DESC_FLAGS_SHIFT) @@ -388,7 +388,7 @@ for ((s) = from; prb_read_valid(rb, s, r); (s) = (r)->info->seq + 1) * * This is a macro for conveniently iterating over a ringbuffer. * Note that @s may not be the sequence number of the record on each - * iteration. For the sequence number, @r->info->seq should be checked. + * iteration. For the sequence number, @i->seq should be checked. * * Context: Any context. */ diff --git a/kernel/rseq.c b/kernel/rseq.c index 38d3ef540760..e75e3a5e312c 100644 --- a/kernel/rseq.c +++ b/kernel/rseq.c @@ -236,11 +236,6 @@ static int __init rseq_debugfs_init(void) } __initcall(rseq_debugfs_init); -static bool rseq_set_ids(struct task_struct *t, struct rseq_ids *ids, u32 node_id) -{ - return rseq_set_ids_get_csaddr(t, ids, node_id, NULL); -} - static bool rseq_handle_cs(struct task_struct *t, struct pt_regs *regs) { struct rseq __user *urseq = t->rseq.usrptr; @@ -258,14 +253,16 @@ efault: static void rseq_slowpath_update_usr(struct pt_regs *regs) { /* - * Preserve rseq state and user_irq state. The generic entry code - * clears user_irq on the way out, the non-generic entry - * architectures are not having user_irq. + * Preserve has_rseq and user_irq state. The generic entry code clears + * user_irq on the way out, the non-generic entry architectures are not + * setting user_irq. */ - const struct rseq_event evt_mask = { .has_rseq = true, .user_irq = true, }; + const struct rseq_event evt_mask = { + .has_rseq = RSEQ_HAS_RSEQ_VERSION_MASK, + .user_irq = true, + }; struct task_struct *t = current; struct rseq_ids ids; - u32 node_id; bool event; if (unlikely(t->flags & PF_EXITING)) @@ -301,9 +298,9 @@ static void rseq_slowpath_update_usr(struct pt_regs *regs) if (!event) return; - node_id = cpu_to_node(ids.cpu_id); + ids.node_id = cpu_to_node(ids.cpu_id); - if (unlikely(!rseq_update_usr(t, regs, &ids, node_id))) { + if (unlikely(!rseq_update_usr(t, regs, &ids))) { /* * Clear the errors just in case this might survive magically, but * leave the rest intact. @@ -335,8 +332,9 @@ void __rseq_handle_slowpath(struct pt_regs *regs) void __rseq_signal_deliver(int sig, struct pt_regs *regs) { rseq_stat_inc(rseq_stats.signal); + /* - * Don't update IDs, they are handled on exit to user if + * Don't update IDs yet, they are handled on exit to user if * necessary. The important thing is to abort a critical section of * the interrupted context as after this point the instruction * pointer in @regs points to the signal handler. @@ -349,6 +347,13 @@ void __rseq_signal_deliver(int sig, struct pt_regs *regs) current->rseq.event.error = 0; force_sigsegv(sig); } + + /* + * In legacy mode, force the update of IDs before returning to user + * space to stay compatible. + */ + if (!rseq_v2(current)) + rseq_force_update(); } /* @@ -384,19 +389,22 @@ void rseq_syscall(struct pt_regs *regs) static bool rseq_reset_ids(void) { - struct rseq_ids ids = { - .cpu_id = RSEQ_CPU_ID_UNINITIALIZED, - .mm_cid = 0, - }; + struct rseq __user *rseq = current->rseq.usrptr; /* * If this fails, terminate it because this leaves the kernel in * stupid state as exit to user space will try to fixup the ids * again. */ - if (rseq_set_ids(current, &ids, 0)) - return true; + scoped_user_rw_access(rseq, efault) { + unsafe_put_user(0, &rseq->cpu_id_start, efault); + unsafe_put_user(RSEQ_CPU_ID_UNINITIALIZED, &rseq->cpu_id, efault); + unsafe_put_user(0, &rseq->node_id, efault); + unsafe_put_user(0, &rseq->mm_cid, efault); + } + return true; +efault: force_sig(SIGSEGV); return false; } @@ -404,70 +412,29 @@ static bool rseq_reset_ids(void) /* The original rseq structure size (including padding) is 32 bytes. */ #define ORIG_RSEQ_SIZE 32 -/* - * sys_rseq - setup restartable sequences for caller thread. - */ -SYSCALL_DEFINE4(rseq, struct rseq __user *, rseq, u32, rseq_len, int, flags, u32, sig) +static long rseq_register(struct rseq __user * rseq, u32 rseq_len, int flags, u32 sig) { u32 rseqfl = 0; + u8 version = 1; - if (flags & RSEQ_FLAG_UNREGISTER) { - if (flags & ~RSEQ_FLAG_UNREGISTER) - return -EINVAL; - /* Unregister rseq for current thread. */ - if (current->rseq.usrptr != rseq || !current->rseq.usrptr) - return -EINVAL; - if (rseq_len != current->rseq.len) - return -EINVAL; - if (current->rseq.sig != sig) - return -EPERM; - if (!rseq_reset_ids()) - return -EFAULT; - rseq_reset(current); - return 0; - } - - if (unlikely(flags & ~(RSEQ_FLAG_SLICE_EXT_DEFAULT_ON))) - return -EINVAL; - - if (current->rseq.usrptr) { - /* - * If rseq is already registered, check whether - * the provided address differs from the prior - * one. - */ - if (current->rseq.usrptr != rseq || rseq_len != current->rseq.len) - return -EINVAL; - if (current->rseq.sig != sig) - return -EPERM; - /* Already registered. */ - return -EBUSY; - } - - /* - * If there was no rseq previously registered, ensure the provided rseq - * is properly aligned, as communcated to user-space through the ELF - * auxiliary vector AT_RSEQ_ALIGN. If rseq_len is the original rseq - * size, the required alignment is the original struct rseq alignment. - * - * The rseq_len is required to be greater or equal to the original rseq - * size. In order to be valid, rseq_len is either the original rseq size, - * or large enough to contain all supported fields, as communicated to - * user-space through the ELF auxiliary vector AT_RSEQ_FEATURE_SIZE. - */ - if (rseq_len < ORIG_RSEQ_SIZE || - (rseq_len == ORIG_RSEQ_SIZE && !IS_ALIGNED((unsigned long)rseq, ORIG_RSEQ_SIZE)) || - (rseq_len != ORIG_RSEQ_SIZE && (!IS_ALIGNED((unsigned long)rseq, rseq_alloc_align()) || - rseq_len < offsetof(struct rseq, end)))) - return -EINVAL; if (!access_ok(rseq, rseq_len)) return -EFAULT; - if (IS_ENABLED(CONFIG_RSEQ_SLICE_EXTENSION)) { - rseqfl |= RSEQ_CS_FLAG_SLICE_EXT_AVAILABLE; - if (rseq_slice_extension_enabled() && - (flags & RSEQ_FLAG_SLICE_EXT_DEFAULT_ON)) - rseqfl |= RSEQ_CS_FLAG_SLICE_EXT_ENABLED; + /* + * Architectures, which use the generic IRQ entry code (at least) enable + * registrations with a size greater than the original v1 fixed sized + * @rseq_len, which has been validated already to utilize the optimized + * v2 ABI mode which also enables extended RSEQ features beyond MMCID. + */ + if (IS_ENABLED(CONFIG_GENERIC_IRQ_ENTRY) && rseq_len > ORIG_RSEQ_SIZE) + version = 2; + + if (IS_ENABLED(CONFIG_RSEQ_SLICE_EXTENSION) && version > 1) { + if (rseq_slice_extension_enabled()) { + rseqfl |= RSEQ_CS_FLAG_SLICE_EXT_AVAILABLE; + if (flags & RSEQ_FLAG_SLICE_EXT_DEFAULT_ON) + rseqfl |= RSEQ_CS_FLAG_SLICE_EXT_ENABLED; + } } scoped_user_write_access(rseq, efault) { @@ -485,7 +452,15 @@ SYSCALL_DEFINE4(rseq, struct rseq __user *, rseq, u32, rseq_len, int, flags, u32 unsafe_put_user(RSEQ_CPU_ID_UNINITIALIZED, &rseq->cpu_id, efault); unsafe_put_user(0U, &rseq->node_id, efault); unsafe_put_user(0U, &rseq->mm_cid, efault); - unsafe_put_user(0U, &rseq->slice_ctrl.all, efault); + + /* + * All fields past mm_cid are only valid for non-legacy v2 + * registrations. + */ + if (version > 1) { + if (IS_ENABLED(CONFIG_RSEQ_SLICE_EXTENSION)) + unsafe_put_user(0U, &rseq->slice_ctrl.all, efault); + } } /* @@ -501,11 +476,10 @@ SYSCALL_DEFINE4(rseq, struct rseq __user *, rseq, u32, rseq_len, int, flags, u32 #endif /* - * If rseq was previously inactive, and has just been - * registered, ensure the cpu_id_start and cpu_id fields - * are updated before returning to user-space. + * Ensure the cpu_id_start and cpu_id fields are updated before + * returning to user-space. */ - current->rseq.event.has_rseq = true; + current->rseq.event.has_rseq = version; rseq_force_update(); return 0; @@ -513,6 +487,80 @@ efault: return -EFAULT; } +static long rseq_unregister(struct rseq __user * rseq, u32 rseq_len, int flags, u32 sig) +{ + if (flags & ~RSEQ_FLAG_UNREGISTER) + return -EINVAL; + if (current->rseq.usrptr != rseq || !current->rseq.usrptr) + return -EINVAL; + if (rseq_len != current->rseq.len) + return -EINVAL; + if (current->rseq.sig != sig) + return -EPERM; + if (!rseq_reset_ids()) + return -EFAULT; + rseq_reset(current); + return 0; +} + +static long rseq_reregister(struct rseq __user * rseq, u32 rseq_len, u32 sig) +{ + /* + * If rseq is already registered, check whether the provided address + * differs from the prior one. + */ + if (current->rseq.usrptr != rseq || rseq_len != current->rseq.len) + return -EINVAL; + if (current->rseq.sig != sig) + return -EPERM; + /* Already registered. */ + return -EBUSY; +} + +static bool rseq_length_valid(struct rseq __user *rseq, unsigned int rseq_len) +{ + /* + * Ensure the provided rseq is properly aligned, as communicated to + * user-space through the ELF auxiliary vector AT_RSEQ_ALIGN. If + * rseq_len is the original rseq size, the required alignment is the + * original struct rseq alignment. + * + * In order to be valid, rseq_len is either the original rseq size, or + * large enough to contain all supported fields, as communicated to + * user-space through the ELF auxiliary vector AT_RSEQ_FEATURE_SIZE. + */ + if (rseq_len < ORIG_RSEQ_SIZE) + return false; + + if (rseq_len == ORIG_RSEQ_SIZE) + return IS_ALIGNED((unsigned long)rseq, ORIG_RSEQ_SIZE); + + return IS_ALIGNED((unsigned long)rseq, rseq_alloc_align()) && + rseq_len >= offsetof(struct rseq, end); +} + +#define RSEQ_FLAGS_SUPPORTED (RSEQ_FLAG_SLICE_EXT_DEFAULT_ON) + +/* + * sys_rseq - Register or unregister restartable sequences for the caller thread. + */ +SYSCALL_DEFINE4(rseq, struct rseq __user *, rseq, u32, rseq_len, int, flags, u32, sig) +{ + if (flags & RSEQ_FLAG_UNREGISTER) + return rseq_unregister(rseq, rseq_len, flags, sig); + + if (unlikely(flags & ~RSEQ_FLAGS_SUPPORTED)) + return -EINVAL; + + if (current->rseq.usrptr) + return rseq_reregister(rseq, rseq_len, sig); + + if (!rseq_length_valid(rseq, rseq_len)) + return -EINVAL; + + return rseq_register(rseq, rseq_len, flags, sig); +} + #ifdef CONFIG_RSEQ_SLICE_EXTENSION struct slice_timer { struct hrtimer timer; @@ -713,6 +761,8 @@ int rseq_slice_extension_prctl(unsigned long arg2, unsigned long arg3) return -ENOTSUPP; if (!current->rseq.usrptr) return -ENXIO; + if (!rseq_v2(current)) + return -ENOTSUPP; /* No change? */ if (enable == !!current->rseq.slice.state.enabled) diff --git a/kernel/sched/core.c b/kernel/sched/core.c index da20fb6ea25a..b8871449d3c6 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -4458,6 +4458,7 @@ static void __sched_fork(u64 clone_flags, struct task_struct *p) p->se.nr_migrations = 0; p->se.vruntime = 0; p->se.vlag = 0; + p->se.rel_deadline = 0; INIT_LIST_HEAD(&p->se.group_node); /* A delayed task cannot be in clone(). */ diff --git a/kernel/sched/ext.c b/kernel/sched/ext.c index e426e27b6794..38d90baf78cf 100644 --- a/kernel/sched/ext.c +++ b/kernel/sched/ext.c @@ -32,6 +32,7 @@ static const struct rhashtable_params scx_sched_hash_params = { .key_len = sizeof_field(struct scx_sched, ops.sub_cgroup_id), .key_offset = offsetof(struct scx_sched, ops.sub_cgroup_id), .head_offset = offsetof(struct scx_sched, hash_node), + .insecure_elasticity = true, /* inserted under scx_sched_lock */ }; static struct rhashtable scx_sched_hash; @@ -52,8 +53,6 @@ DEFINE_STATIC_KEY_FALSE(__scx_enabled); DEFINE_STATIC_PERCPU_RWSEM(scx_fork_rwsem); static atomic_t scx_enable_state_var = ATOMIC_INIT(SCX_DISABLED); static DEFINE_RAW_SPINLOCK(scx_bypass_lock); -static cpumask_var_t scx_bypass_lb_donee_cpumask; -static cpumask_var_t scx_bypass_lb_resched_cpumask; static bool scx_init_task_enabled; static bool scx_switching_all; DEFINE_STATIC_KEY_FALSE(__scx_switched_all); @@ -469,24 +468,35 @@ static inline void update_locked_rq(struct rq *rq) __this_cpu_write(scx_locked_rq_state, rq); } -#define SCX_CALL_OP(sch, op, rq, args...) \ +/* + * SCX ops can recurse via scx_bpf_sub_dispatch() - the inner call must not + * clobber the outer's scx_locked_rq_state. Save it on entry, restore on exit. + */ +#define SCX_CALL_OP(sch, op, locked_rq, args...) \ do { \ - if (rq) \ - update_locked_rq(rq); \ + struct rq *__prev_locked_rq; \ + \ + if (locked_rq) { \ + __prev_locked_rq = scx_locked_rq(); \ + update_locked_rq(locked_rq); \ + } \ (sch)->ops.op(args); \ - if (rq) \ - update_locked_rq(NULL); \ + if (locked_rq) \ + update_locked_rq(__prev_locked_rq); \ } while (0) -#define SCX_CALL_OP_RET(sch, op, rq, args...) \ +#define SCX_CALL_OP_RET(sch, op, locked_rq, args...) \ ({ \ + struct rq *__prev_locked_rq; \ __typeof__((sch)->ops.op(args)) __ret; \ \ - if (rq) \ - update_locked_rq(rq); \ + if (locked_rq) { \ + __prev_locked_rq = scx_locked_rq(); \ + update_locked_rq(locked_rq); \ + } \ __ret = (sch)->ops.op(args); \ - if (rq) \ - update_locked_rq(NULL); \ + if (locked_rq) \ + update_locked_rq(__prev_locked_rq); \ __ret; \ }) @@ -498,39 +508,39 @@ do { \ * those subject tasks. * * Every SCX_CALL_OP_TASK*() call site invokes its op with @p's rq lock held - - * either via the @rq argument here, or (for ops.select_cpu()) via @p's pi_lock - * held by try_to_wake_up() with rq tracking via scx_rq.in_select_cpu. So if - * kf_tasks[] is set, @p's scheduler-protected fields are stable. + * either via the @locked_rq argument here, or (for ops.select_cpu()) via @p's + * pi_lock held by try_to_wake_up() with rq tracking via scx_rq.in_select_cpu. + * So if kf_tasks[] is set, @p's scheduler-protected fields are stable. * * kf_tasks[] can not stack, so task-based SCX ops must not nest. The * WARN_ON_ONCE() in each macro catches a re-entry of any of the three variants * while a previous one is still in progress. */ -#define SCX_CALL_OP_TASK(sch, op, rq, task, args...) \ +#define SCX_CALL_OP_TASK(sch, op, locked_rq, task, args...) \ do { \ WARN_ON_ONCE(current->scx.kf_tasks[0]); \ current->scx.kf_tasks[0] = task; \ - SCX_CALL_OP((sch), op, rq, task, ##args); \ + SCX_CALL_OP((sch), op, locked_rq, task, ##args); \ current->scx.kf_tasks[0] = NULL; \ } while (0) -#define SCX_CALL_OP_TASK_RET(sch, op, rq, task, args...) \ +#define SCX_CALL_OP_TASK_RET(sch, op, locked_rq, task, args...) \ ({ \ __typeof__((sch)->ops.op(task, ##args)) __ret; \ WARN_ON_ONCE(current->scx.kf_tasks[0]); \ current->scx.kf_tasks[0] = task; \ - __ret = SCX_CALL_OP_RET((sch), op, rq, task, ##args); \ + __ret = SCX_CALL_OP_RET((sch), op, locked_rq, task, ##args); \ current->scx.kf_tasks[0] = NULL; \ __ret; \ }) -#define SCX_CALL_OP_2TASKS_RET(sch, op, rq, task0, task1, args...) \ +#define SCX_CALL_OP_2TASKS_RET(sch, op, locked_rq, task0, task1, args...) \ ({ \ __typeof__((sch)->ops.op(task0, task1, ##args)) __ret; \ WARN_ON_ONCE(current->scx.kf_tasks[0]); \ current->scx.kf_tasks[0] = task0; \ current->scx.kf_tasks[1] = task1; \ - __ret = SCX_CALL_OP_RET((sch), op, rq, task0, task1, ##args); \ + __ret = SCX_CALL_OP_RET((sch), op, locked_rq, task0, task1, ##args); \ current->scx.kf_tasks[0] = NULL; \ current->scx.kf_tasks[1] = NULL; \ __ret; \ @@ -756,7 +766,8 @@ static void scx_task_iter_start(struct scx_task_iter *iter, struct cgroup *cgrp) lockdep_assert_held(&cgroup_mutex); iter->cgrp = cgrp; iter->css_pos = css_next_descendant_pre(NULL, &iter->cgrp->self); - css_task_iter_start(iter->css_pos, 0, &iter->css_iter); + css_task_iter_start(iter->css_pos, CSS_TASK_ITER_WITH_DEAD, + &iter->css_iter); return; } #endif @@ -856,7 +867,8 @@ static struct task_struct *scx_task_iter_next(struct scx_task_iter *iter) iter->css_pos = css_next_descendant_pre(iter->css_pos, &iter->cgrp->self); if (iter->css_pos) - css_task_iter_start(iter->css_pos, 0, &iter->css_iter); + css_task_iter_start(iter->css_pos, CSS_TASK_ITER_WITH_DEAD, + &iter->css_iter); } return NULL; } @@ -916,16 +928,27 @@ static struct task_struct *scx_task_iter_next_locked(struct scx_task_iter *iter) * * Test for idle_sched_class as only init_tasks are on it. */ - if (p->sched_class != &idle_sched_class) - break; - } - if (!p) - return NULL; + if (p->sched_class == &idle_sched_class) + continue; - iter->rq = task_rq_lock(p, &iter->rf); - iter->locked_task = p; + iter->rq = task_rq_lock(p, &iter->rf); + iter->locked_task = p; - return p; + /* + * cgroup_task_dead() removes the dead tasks from cset->tasks + * after sched_ext_dead() and cgroup iteration may see tasks + * which already finished sched_ext_dead(). %SCX_TASK_OFF_TASKS + * is set by sched_ext_dead() under @p's rq lock. Test it to + * avoid visiting tasks which are already dead from SCX POV. + */ + if (p->scx.flags & SCX_TASK_OFF_TASKS) { + __scx_task_iter_rq_unlock(iter); + continue; + } + + return p; + } + return NULL; } /** @@ -1388,18 +1411,55 @@ static void call_task_dequeue(struct scx_sched *sch, struct rq *rq, p->scx.flags &= ~SCX_TASK_IN_CUSTODY; } -static void local_dsq_post_enq(struct scx_dispatch_q *dsq, struct task_struct *p, - u64 enq_flags) +static void local_dsq_post_enq(struct scx_sched *sch, struct scx_dispatch_q *dsq, + struct task_struct *p, u64 enq_flags) { struct rq *rq = container_of(dsq, struct rq, scx.local_dsq); - bool preempt = false; - call_task_dequeue(scx_root, rq, p, 0); + call_task_dequeue(sch, rq, p, 0); + + /* + * Note that @rq's lock may be dropped between this enqueue and @p + * actually getting on CPU. This gives higher-class tasks (e.g. RT) + * an opportunity to wake up on @rq and prevent @p from running. + * Here are some concrete examples: + * + * Example 1: + * + * We dispatch two tasks from a single ops.dispatch(): + * - First, a local task to this CPU's local DSQ; + * - Second, a local/remote task to a remote CPU's local DSQ. + * We must drop the local rq lock in order to finish the second + * dispatch. In that time, an RT task can wake up on the local rq. + * + * Example 2: + * + * We dispatch a local/remote task to a remote CPU's local DSQ. + * We must drop the remote rq lock before the dispatched task can run, + * which gives an RT task an opportunity to wake up on the remote rq. + * + * Both examples work the same if we replace dispatching with moving + * the tasks from a user-created DSQ. + * + * We must detect these wakeups so that we can re-enqueue IMMED tasks + * from @rq's local DSQ. scx_wakeup_preempt() serves exactly this + * purpose, but for it to be invoked, we must ensure that we bump + * @rq->next_class to &ext_sched_class if it's currently idle. + * + * wakeup_preempt() does the bumping, and since we only invoke it if + * @rq->next_class is below &ext_sched_class, it will also + * resched_curr(rq). + */ + if (sched_class_above(p->sched_class, rq->next_class)) + wakeup_preempt(rq, p, 0); /* * If @rq is in balance, the CPU is already vacant and looking for the * next task to run. No need to preempt or trigger resched after moving * @p into its local DSQ. + * Note that the wakeup_preempt() above may have already triggered + * a resched if @rq->next_class was idle. It's harmless, since + * need_resched is cleared immediately after task pick. */ if (rq->scx.flags & SCX_RQ_IN_BALANCE) return; @@ -1407,11 +1467,8 @@ static void local_dsq_post_enq(struct scx_dispatch_q *dsq, struct task_struct *p if ((enq_flags & SCX_ENQ_PREEMPT) && p != rq->curr && rq->curr->sched_class == &ext_sched_class) { rq->curr->scx.slice = 0; - preempt = true; - } - - if (preempt || sched_class_above(&ext_sched_class, rq->curr->sched_class)) resched_curr(rq); + } } static void dispatch_enqueue(struct scx_sched *sch, struct rq *rq, @@ -1494,11 +1551,13 @@ static void dispatch_enqueue(struct scx_sched *sch, struct rq *rq, if (!(dsq->id & SCX_DSQ_FLAG_BUILTIN)) rcu_assign_pointer(dsq->first_task, p); } else { - bool was_empty; - - was_empty = list_empty(&dsq->list); + /* + * dsq->list can contain parked BPF iterator cursors, so + * list_empty() here isn't a reliable proxy for "no real + * task in the DSQ". Test dsq->first_task directly. + */ list_add_tail(&p->scx.dsq_list.node, &dsq->list); - if (was_empty && !(dsq->id & SCX_DSQ_FLAG_BUILTIN)) + if (!dsq->first_task && !(dsq->id & SCX_DSQ_FLAG_BUILTIN)) rcu_assign_pointer(dsq->first_task, p); } } @@ -1518,7 +1577,7 @@ static void dispatch_enqueue(struct scx_sched *sch, struct rq *rq, * concurrently in a non-atomic way. */ if (is_local) { - local_dsq_post_enq(dsq, p, enq_flags); + local_dsq_post_enq(sch, dsq, p, enq_flags); } else { /* * Task on global/bypass DSQ: leave custody, task on @@ -2129,7 +2188,8 @@ static void wakeup_preempt_scx(struct rq *rq, struct task_struct *p, int wake_fl schedule_reenq_local(rq, 0); } -static void move_local_task_to_local_dsq(struct task_struct *p, u64 enq_flags, +static void move_local_task_to_local_dsq(struct scx_sched *sch, + struct task_struct *p, u64 enq_flags, struct scx_dispatch_q *src_dsq, struct rq *dst_rq) { @@ -2149,7 +2209,7 @@ static void move_local_task_to_local_dsq(struct task_struct *p, u64 enq_flags, dsq_inc_nr(dst_dsq, p, enq_flags); p->scx.dsq = dst_dsq; - local_dsq_post_enq(dst_dsq, p, enq_flags); + local_dsq_post_enq(sch, dst_dsq, p, enq_flags); } /** @@ -2370,7 +2430,7 @@ static struct rq *move_task_between_dsqs(struct scx_sched *sch, /* @p is going from a non-local DSQ to a local DSQ */ if (src_rq == dst_rq) { task_unlink_from_dsq(p, src_dsq); - move_local_task_to_local_dsq(p, enq_flags, + move_local_task_to_local_dsq(sch, p, enq_flags, src_dsq, dst_rq); raw_spin_unlock(&src_dsq->lock); } else { @@ -2423,7 +2483,7 @@ retry: if (rq == task_rq) { task_unlink_from_dsq(p, dsq); - move_local_task_to_local_dsq(p, enq_flags, dsq, rq); + move_local_task_to_local_dsq(sch, p, enq_flags, dsq, rq); raw_spin_unlock(&dsq->lock); return true; } @@ -3183,7 +3243,7 @@ bool scx_prio_less(const struct task_struct *a, const struct task_struct *b, if (sch_a == sch_b && SCX_HAS_OP(sch_a, core_sched_before) && !scx_bypassing(sch_a, task_cpu(a))) return SCX_CALL_OP_2TASKS_RET(sch_a, core_sched_before, - NULL, + task_rq(a), (struct task_struct *)a, (struct task_struct *)b); else @@ -3631,6 +3691,22 @@ static void __scx_disable_and_exit_task(struct scx_sched *sch, SCX_CALL_OP_TASK(sch, exit_task, task_rq(p), p, &args); } +/* + * Undo a completed __scx_init_task(sch, p, false) when scx_enable_task() never + * ran. The task state has not been transitioned, so this mirrors the + * SCX_TASK_INIT branch in __scx_disable_and_exit_task(). + */ +static void scx_sub_init_cancel_task(struct scx_sched *sch, struct task_struct *p) +{ + struct scx_exit_task_args args = { .cancelled = true }; + + lockdep_assert_held(&p->pi_lock); + lockdep_assert_rq_held(task_rq(p)); + + if (SCX_HAS_OP(sch, exit_task)) + SCX_CALL_OP_TASK(sch, exit_task, task_rq(p), p, &args); +} + static void scx_disable_and_exit_task(struct scx_sched *sch, struct task_struct *p) { @@ -3639,11 +3715,12 @@ static void scx_disable_and_exit_task(struct scx_sched *sch, /* * If set, @p exited between __scx_init_task() and scx_enable_task() in * scx_sub_enable() and is initialized for both the associated sched and - * its parent. Disable and exit for the child too. + * its parent. Exit for the child too - scx_enable_task() never ran for + * it, so undo only init_task. */ - if ((p->scx.flags & SCX_TASK_SUB_INIT) && - !WARN_ON_ONCE(!scx_enabling_sub_sched)) { - __scx_disable_and_exit_task(scx_enabling_sub_sched, p); + if (p->scx.flags & SCX_TASK_SUB_INIT) { + if (!WARN_ON_ONCE(!scx_enabling_sub_sched)) + scx_sub_init_cancel_task(scx_enabling_sub_sched, p); p->scx.flags &= ~SCX_TASK_SUB_INIT; } @@ -3784,6 +3861,11 @@ void sched_ext_dead(struct task_struct *p) /* * @p is off scx_tasks and wholly ours. scx_root_enable()'s READY -> * ENABLED transitions can't race us. Disable ops for @p. + * + * %SCX_TASK_OFF_TASKS synchronizes against cgroup task iteration - see + * scx_task_iter_next_locked(). NONE tasks need no marking: cgroup + * iteration is only used from sub-sched paths, which require root + * enabled. Root enable transitions every live task to at least READY. */ if (scx_get_task_state(p) != SCX_TASK_NONE) { struct rq_flags rf; @@ -3791,6 +3873,7 @@ void sched_ext_dead(struct task_struct *p) rq = task_rq_lock(p, &rf); scx_disable_and_exit_task(scx_task_sched(p), p); + p->scx.flags |= SCX_TASK_OFF_TASKS; task_rq_unlock(rq, p, &rf); } } @@ -4324,9 +4407,10 @@ void scx_cgroup_cancel_attach(struct cgroup_taskset *tset) void scx_group_set_weight(struct task_group *tg, unsigned long weight) { - struct scx_sched *sch = scx_root; + struct scx_sched *sch; percpu_down_read(&scx_cgroup_ops_rwsem); + sch = scx_root; if (scx_cgroup_enabled && SCX_HAS_OP(sch, cgroup_set_weight) && tg->scx.weight != weight) @@ -4339,9 +4423,10 @@ void scx_group_set_weight(struct task_group *tg, unsigned long weight) void scx_group_set_idle(struct task_group *tg, bool idle) { - struct scx_sched *sch = scx_root; + struct scx_sched *sch; percpu_down_read(&scx_cgroup_ops_rwsem); + sch = scx_root; if (scx_cgroup_enabled && SCX_HAS_OP(sch, cgroup_set_idle)) SCX_CALL_OP(sch, cgroup_set_idle, NULL, tg_cgrp(tg), idle); @@ -4355,9 +4440,10 @@ void scx_group_set_idle(struct task_group *tg, bool idle) void scx_group_set_bandwidth(struct task_group *tg, u64 period_us, u64 quota_us, u64 burst_us) { - struct scx_sched *sch = scx_root; + struct scx_sched *sch; percpu_down_read(&scx_cgroup_ops_rwsem); + sch = scx_root; if (scx_cgroup_enabled && SCX_HAS_OP(sch, cgroup_set_bandwidth) && (tg->scx.bw_period_us != period_us || @@ -4380,21 +4466,6 @@ static struct cgroup *root_cgroup(void) return &cgrp_dfl_root.cgrp; } -static struct cgroup *sch_cgroup(struct scx_sched *sch) -{ - return sch->cgrp; -} - -/* for each descendant of @cgrp including self, set ->scx_sched to @sch */ -static void set_cgroup_sched(struct cgroup *cgrp, struct scx_sched *sch) -{ - struct cgroup *pos; - struct cgroup_subsys_state *css; - - cgroup_for_each_live_descendant_pre(pos, css, cgrp) - rcu_assign_pointer(pos->scx_sched, sch); -} - static void scx_cgroup_lock(void) { #ifdef CONFIG_EXT_GROUP_SCHED @@ -4412,12 +4483,30 @@ static void scx_cgroup_unlock(void) } #else /* CONFIG_EXT_GROUP_SCHED || CONFIG_EXT_SUB_SCHED */ static struct cgroup *root_cgroup(void) { return NULL; } -static struct cgroup *sch_cgroup(struct scx_sched *sch) { return NULL; } -static void set_cgroup_sched(struct cgroup *cgrp, struct scx_sched *sch) {} static void scx_cgroup_lock(void) {} static void scx_cgroup_unlock(void) {} #endif /* CONFIG_EXT_GROUP_SCHED || CONFIG_EXT_SUB_SCHED */ +#ifdef CONFIG_EXT_SUB_SCHED +static struct cgroup *sch_cgroup(struct scx_sched *sch) +{ + return sch->cgrp; +} + +/* for each descendant of @cgrp including self, set ->scx_sched to @sch */ +static void set_cgroup_sched(struct cgroup *cgrp, struct scx_sched *sch) +{ + struct cgroup *pos; + struct cgroup_subsys_state *css; + + cgroup_for_each_live_descendant_pre(pos, css, cgrp) + rcu_assign_pointer(pos->scx_sched, sch); +} +#else /* CONFIG_EXT_SUB_SCHED */ +static struct cgroup *sch_cgroup(struct scx_sched *sch) { return NULL; } +static void set_cgroup_sched(struct cgroup *cgrp, struct scx_sched *sch) {} +#endif /* CONFIG_EXT_SUB_SCHED */ + /* * Omitted operations: * @@ -4712,6 +4801,8 @@ static void scx_sched_free_rcu_work(struct work_struct *work) irq_work_sync(&sch->disable_irq_work); kthread_destroy_worker(sch->helper); timer_shutdown_sync(&sch->bypass_lb_timer); + free_cpumask_var(sch->bypass_lb_donee_cpumask); + free_cpumask_var(sch->bypass_lb_resched_cpumask); #ifdef CONFIG_EXT_SUB_SCHED kfree(sch->cgrp_path); @@ -4938,6 +5029,25 @@ void scx_softlockup(u32 dur_s) smp_processor_id(), dur_s); } +/* + * scx_hardlockup() runs from NMI and eventually calls scx_claim_exit(), + * which takes scx_sched_lock. scx_sched_lock isn't NMI-safe and grabbing + * it from NMI context can lead to deadlocks. Defer via irq_work; the + * disable path runs off irq_work anyway. + */ +static atomic_t scx_hardlockup_cpu = ATOMIC_INIT(-1); + +static void scx_hardlockup_irq_workfn(struct irq_work *work) +{ + int cpu = atomic_xchg(&scx_hardlockup_cpu, -1); + + if (cpu >= 0 && handle_lockup("hard lockup - CPU %d", cpu)) + printk_deferred(KERN_ERR "sched_ext: Hard lockup - CPU %d, disabling BPF scheduler\n", + cpu); +} + +static DEFINE_IRQ_WORK(scx_hardlockup_irq_work, scx_hardlockup_irq_workfn); + /** * scx_hardlockup - sched_ext hardlockup handler * @@ -4946,17 +5056,19 @@ void scx_softlockup(u32 dur_s) * Try kicking out the current scheduler in an attempt to recover the system to * a good state before taking more drastic actions. * - * Returns %true if sched_ext is enabled and abort was initiated, which may - * resolve the reported hardlockup. %false if sched_ext is not enabled or - * someone else already initiated abort. + * Queues an irq_work; the handle_lockup() call happens in IRQ context (see + * scx_hardlockup_irq_workfn). + * + * Returns %true if sched_ext is enabled and the work was queued, %false + * otherwise. */ bool scx_hardlockup(int cpu) { - if (!handle_lockup("hard lockup - CPU %d", cpu)) + if (!rcu_access_pointer(scx_root)) return false; - printk_deferred(KERN_ERR "sched_ext: Hard lockup - CPU %d, disabling BPF scheduler\n", - cpu); + atomic_cmpxchg(&scx_hardlockup_cpu, -1, cpu); + irq_work_queue(&scx_hardlockup_irq_work); return true; } @@ -5000,6 +5112,15 @@ resume: if (cpumask_empty(donee_mask)) break; + /* + * If an earlier pass placed @p on @donor_dsq from a different + * CPU and the donee hasn't consumed it yet, @p is still on the + * previous CPU and task_rq(@p) != @donor_rq. @p can't be moved + * without its rq locked. Skip. + */ + if (task_rq(p) != donor_rq) + continue; + donee = cpumask_any_and_distribute(donee_mask, p->cpus_ptr); if (donee >= nr_cpu_ids) continue; @@ -5058,8 +5179,8 @@ resume: static void bypass_lb_node(struct scx_sched *sch, int node) { const struct cpumask *node_mask = cpumask_of_node(node); - struct cpumask *donee_mask = scx_bypass_lb_donee_cpumask; - struct cpumask *resched_mask = scx_bypass_lb_resched_cpumask; + struct cpumask *donee_mask = sch->bypass_lb_donee_cpumask; + struct cpumask *resched_mask = sch->bypass_lb_resched_cpumask; u32 nr_tasks = 0, nr_cpus = 0, nr_balanced = 0; u32 nr_target, nr_donor_target; u32 before_min = U32_MAX, before_max = 0; @@ -5698,6 +5819,8 @@ static void scx_sub_disable(struct scx_sched *sch) if (sch->ops.exit) SCX_CALL_OP(sch, exit, NULL, sch->exit_info); + if (sch->sub_kset) + kset_unregister(sch->sub_kset); kobject_del(&sch->kobj); } #else /* CONFIG_EXT_SUB_SCHED */ @@ -5829,6 +5952,10 @@ static void scx_root_disable(struct scx_sched *sch) * could observe an object of the same name still in the hierarchy when * the next scheduler is loaded. */ +#ifdef CONFIG_EXT_SUB_SCHED + if (sch->sub_kset) + kset_unregister(sch->sub_kset); +#endif kobject_del(&sch->kobj); free_kick_syncs(); @@ -5921,6 +6048,25 @@ static void scx_disable(struct scx_sched *sch, enum scx_exit_kind kind) irq_work_queue(&sch->disable_irq_work); } +/** + * scx_flush_disable_work - flush the disable work and wait for it to finish + * @sch: the scheduler + * + * sch->disable_work might still not queued, causing kthread_flush_work() + * as a noop. Syncing the irq_work first is required to guarantee the + * kthread work has been queued before waiting for it. + */ +static void scx_flush_disable_work(struct scx_sched *sch) +{ + int kind; + + do { + irq_work_sync(&sch->disable_irq_work); + kthread_flush_work(&sch->disable_work); + kind = atomic_read(&sch->exit_kind); + } while (kind != SCX_EXIT_NONE && kind != SCX_EXIT_DONE); +} + static void dump_newline(struct seq_buf *s) { trace_sched_ext_dump(""); @@ -6032,9 +6178,8 @@ static void ops_dump_exit(void) scx_dump_data.cpu = -1; } -static void scx_dump_task(struct scx_sched *sch, - struct seq_buf *s, struct scx_dump_ctx *dctx, - struct task_struct *p, char marker) +static void scx_dump_task(struct scx_sched *sch, struct seq_buf *s, struct scx_dump_ctx *dctx, + struct rq *rq, struct task_struct *p, char marker) { static unsigned long bt[SCX_EXIT_BT_LEN]; struct scx_sched *task_sch = scx_task_sched(p); @@ -6075,7 +6220,7 @@ static void scx_dump_task(struct scx_sched *sch, if (SCX_HAS_OP(sch, dump_task)) { ops_dump_init(s, " "); - SCX_CALL_OP(sch, dump_task, NULL, dctx, p); + SCX_CALL_OP(sch, dump_task, rq, dctx, p); ops_dump_exit(); } @@ -6199,8 +6344,7 @@ static void scx_dump_state(struct scx_sched *sch, struct scx_exit_info *ei, used = seq_buf_used(&ns); if (SCX_HAS_OP(sch, dump_cpu)) { ops_dump_init(&ns, " "); - SCX_CALL_OP(sch, dump_cpu, NULL, - &dctx, cpu, idle); + SCX_CALL_OP(sch, dump_cpu, rq, &dctx, cpu, idle); ops_dump_exit(); } @@ -6223,11 +6367,11 @@ static void scx_dump_state(struct scx_sched *sch, struct scx_exit_info *ei, if (rq->curr->sched_class == &ext_sched_class && (dump_all_tasks || scx_task_on_sched(sch, rq->curr))) - scx_dump_task(sch, &s, &dctx, rq->curr, '*'); + scx_dump_task(sch, &s, &dctx, rq, rq->curr, '*'); list_for_each_entry(p, &rq->scx.runnable_list, scx.runnable_node) if (dump_all_tasks || scx_task_on_sched(sch, p)) - scx_dump_task(sch, &s, &dctx, p, ' '); + scx_dump_task(sch, &s, &dctx, rq, p, ' '); next: rq_unlock_irqrestore(rq, &rf); } @@ -6437,6 +6581,15 @@ static struct scx_sched *scx_alloc_and_add_sched(struct sched_ext_ops *ops, init_irq_work(&sch->disable_irq_work, scx_disable_irq_workfn); kthread_init_work(&sch->disable_work, scx_disable_workfn); timer_setup(&sch->bypass_lb_timer, scx_bypass_lb_timerfn, 0); + + if (!alloc_cpumask_var(&sch->bypass_lb_donee_cpumask, GFP_KERNEL)) { + ret = -ENOMEM; + goto err_stop_helper; + } + if (!alloc_cpumask_var(&sch->bypass_lb_resched_cpumask, GFP_KERNEL)) { + ret = -ENOMEM; + goto err_free_lb_cpumask; + } sch->ops = *ops; rcu_assign_pointer(ops->priv, sch); @@ -6446,14 +6599,14 @@ static struct scx_sched *scx_alloc_and_add_sched(struct sched_ext_ops *ops, char *buf = kzalloc(PATH_MAX, GFP_KERNEL); if (!buf) { ret = -ENOMEM; - goto err_stop_helper; + goto err_free_lb_resched; } cgroup_path(cgrp, buf, PATH_MAX); sch->cgrp_path = kstrdup(buf, GFP_KERNEL); kfree(buf); if (!sch->cgrp_path) { ret = -ENOMEM; - goto err_stop_helper; + goto err_free_lb_resched; } sch->cgrp = cgrp; @@ -6488,10 +6641,12 @@ static struct scx_sched *scx_alloc_and_add_sched(struct sched_ext_ops *ops, #endif /* CONFIG_EXT_SUB_SCHED */ return sch; -#ifdef CONFIG_EXT_SUB_SCHED +err_free_lb_resched: + free_cpumask_var(sch->bypass_lb_resched_cpumask); +err_free_lb_cpumask: + free_cpumask_var(sch->bypass_lb_donee_cpumask); err_stop_helper: kthread_destroy_worker(sch->helper); -#endif err_free_pcpu: for_each_possible_cpu(cpu) { if (cpu == bypass_fail_cpu) @@ -6510,7 +6665,7 @@ err_free_ei: err_free_sch: kfree(sch); err_put_cgrp: -#if defined(CONFIG_EXT_GROUP_SCHED) || defined(CONFIG_EXT_SUB_SCHED) +#ifdef CONFIG_EXT_SUB_SCHED cgroup_put(cgrp); #endif return ERR_PTR(ret); @@ -6601,7 +6756,7 @@ static void scx_root_enable_workfn(struct kthread_work *work) if (ret) goto err_unlock; -#if defined(CONFIG_EXT_GROUP_SCHED) || defined(CONFIG_EXT_SUB_SCHED) +#ifdef CONFIG_EXT_SUB_SCHED cgroup_get(cgrp); #endif sch = scx_alloc_and_add_sched(ops, cgrp, NULL); @@ -6639,8 +6794,10 @@ static void scx_root_enable_workfn(struct kthread_work *work) rcu_assign_pointer(scx_root, sch); ret = scx_link_sched(sch); - if (ret) + if (ret) { + cpus_read_unlock(); goto err_disable; + } scx_idle_enable(ops); @@ -6821,7 +6978,7 @@ err_disable: * completion. sch's base reference will be put by bpf_scx_unreg(). */ scx_error(sch, "scx_root_enable() failed (%d)", ret); - kthread_flush_work(&sch->disable_work); + scx_flush_disable_work(sch); cmd->ret = 0; } @@ -7072,23 +7229,30 @@ out_unlock: abort: put_task_struct(p); scx_task_iter_stop(&sti); - scx_enabling_sub_sched = NULL; + /* + * Undo __scx_init_task() for tasks we marked. scx_enable_task() never + * ran for @sch on them, so calling scx_disable_task() here would invoke + * ops.disable() without a matching ops.enable(). scx_enabling_sub_sched + * must stay set until SUB_INIT is cleared from every marked task - + * scx_disable_and_exit_task() reads it when a task exits concurrently. + */ scx_task_iter_start(&sti, sch->cgrp); while ((p = scx_task_iter_next_locked(&sti))) { if (p->scx.flags & SCX_TASK_SUB_INIT) { - __scx_disable_and_exit_task(sch, p); + scx_sub_init_cancel_task(sch, p); p->scx.flags &= ~SCX_TASK_SUB_INIT; } } scx_task_iter_stop(&sti); + scx_enabling_sub_sched = NULL; err_unlock_and_disable: /* we'll soon enter disable path, keep bypass on */ scx_cgroup_unlock(); percpu_up_write(&scx_fork_rwsem); err_disable: mutex_unlock(&scx_enable_mutex); - kthread_flush_work(&sch->disable_work); + scx_flush_disable_work(sch); cmd->ret = 0; } @@ -7349,7 +7513,7 @@ static void bpf_scx_unreg(void *kdata, struct bpf_link *link) struct scx_sched *sch = rcu_dereference_protected(ops->priv, true); scx_disable(sch, SCX_EXIT_UNREG); - kthread_flush_work(&sch->disable_work); + scx_flush_disable_work(sch); RCU_INIT_POINTER(ops->priv, NULL); kobject_put(&sch->kobj); } @@ -8033,12 +8197,22 @@ static bool scx_dsq_move(struct bpf_iter_scx_dsq_kern *kit, struct task_struct *p, u64 dsq_id, u64 enq_flags) { struct scx_dispatch_q *src_dsq = kit->dsq, *dst_dsq; - struct scx_sched *sch = src_dsq->sched; + struct scx_sched *sch; struct rq *this_rq, *src_rq, *locked_rq; bool dispatched = false; bool in_balance; unsigned long flags; + /* + * The verifier considers an iterator slot initialized on any + * KF_ITER_NEW return, so a BPF program may legally reach here after + * bpf_iter_scx_dsq_new() failed and left @kit->dsq NULL. + */ + if (unlikely(!src_dsq)) + return false; + + sch = src_dsq->sched; + if (!scx_vet_enq_flags(sch, dsq_id, &enq_flags)) return false; @@ -8526,7 +8700,7 @@ __bpf_kfunc bool scx_bpf_task_set_slice(struct task_struct *p, u64 slice, guard(rcu)(); sch = scx_prog_sched(aux); - if (unlikely(!scx_task_on_sched(sch, p))) + if (unlikely(!sch || !scx_task_on_sched(sch, p))) return false; p->scx.slice = slice; @@ -8549,7 +8723,7 @@ __bpf_kfunc bool scx_bpf_task_set_dsq_vtime(struct task_struct *p, u64 vtime, guard(rcu)(); sch = scx_prog_sched(aux); - if (unlikely(!scx_task_on_sched(sch, p))) + if (unlikely(!sch || !scx_task_on_sched(sch, p))) return false; p->scx.dsq_vtime = vtime; @@ -8633,11 +8807,12 @@ __bpf_kfunc void scx_bpf_kick_cpu(s32 cpu, u64 flags, const struct bpf_prog_aux /** * scx_bpf_dsq_nr_queued - Return the number of queued tasks * @dsq_id: id of the DSQ + * @aux: implicit BPF argument to access bpf_prog_aux hidden from BPF progs * * Return the number of tasks in the DSQ matching @dsq_id. If not found, * -%ENOENT is returned. */ -__bpf_kfunc s32 scx_bpf_dsq_nr_queued(u64 dsq_id) +__bpf_kfunc s32 scx_bpf_dsq_nr_queued(u64 dsq_id, const struct bpf_prog_aux *aux) { struct scx_sched *sch; struct scx_dispatch_q *dsq; @@ -8645,7 +8820,7 @@ __bpf_kfunc s32 scx_bpf_dsq_nr_queued(u64 dsq_id) preempt_disable(); - sch = rcu_dereference_sched(scx_root); + sch = scx_prog_sched(aux); if (unlikely(!sch)) { ret = -ENODEV; goto out; @@ -8677,21 +8852,21 @@ out: /** * scx_bpf_destroy_dsq - Destroy a custom DSQ * @dsq_id: DSQ to destroy + * @aux: implicit BPF argument to access bpf_prog_aux hidden from BPF progs * * Destroy the custom DSQ identified by @dsq_id. Only DSQs created with * scx_bpf_create_dsq() can be destroyed. The caller must ensure that the DSQ is * empty and no further tasks are dispatched to it. Ignored if called on a DSQ * which doesn't exist. Can be called from any online scx_ops operations. */ -__bpf_kfunc void scx_bpf_destroy_dsq(u64 dsq_id) +__bpf_kfunc void scx_bpf_destroy_dsq(u64 dsq_id, const struct bpf_prog_aux *aux) { struct scx_sched *sch; - rcu_read_lock(); - sch = rcu_dereference(scx_root); + guard(rcu)(); + sch = scx_prog_sched(aux); if (sch) destroy_dsq(sch, dsq_id); - rcu_read_unlock(); } /** @@ -9445,8 +9620,8 @@ BTF_KFUNCS_START(scx_kfunc_ids_any) BTF_ID_FLAGS(func, scx_bpf_task_set_slice, KF_IMPLICIT_ARGS | KF_RCU); BTF_ID_FLAGS(func, scx_bpf_task_set_dsq_vtime, KF_IMPLICIT_ARGS | KF_RCU); BTF_ID_FLAGS(func, scx_bpf_kick_cpu, KF_IMPLICIT_ARGS) -BTF_ID_FLAGS(func, scx_bpf_dsq_nr_queued) -BTF_ID_FLAGS(func, scx_bpf_destroy_dsq) +BTF_ID_FLAGS(func, scx_bpf_dsq_nr_queued, KF_IMPLICIT_ARGS) +BTF_ID_FLAGS(func, scx_bpf_destroy_dsq, KF_IMPLICIT_ARGS) BTF_ID_FLAGS(func, scx_bpf_dsq_peek, KF_IMPLICIT_ARGS | KF_RCU_PROTECTED | KF_RET_NULL) BTF_ID_FLAGS(func, scx_bpf_dsq_reenq, KF_IMPLICIT_ARGS) BTF_ID_FLAGS(func, scx_bpf_reenqueue_local___v2, KF_IMPLICIT_ARGS) @@ -9479,6 +9654,7 @@ BTF_KFUNCS_END(scx_kfunc_ids_any) static const struct btf_kfunc_id_set scx_kfunc_set_any = { .owner = THIS_MODULE, .set = &scx_kfunc_ids_any, + .filter = scx_kfunc_context_filter, }; /* @@ -9526,13 +9702,12 @@ static const u32 scx_kf_allow_flags[] = { }; /* - * Verifier-time filter for context-sensitive SCX kfuncs. Registered via the - * .filter field on each per-group btf_kfunc_id_set. The BPF core invokes this - * for every kfunc call in the registered hook (BPF_PROG_TYPE_STRUCT_OPS or + * Verifier-time filter for SCX kfuncs. Registered via the .filter field on + * each per-group btf_kfunc_id_set. The BPF core invokes this for every kfunc + * call in the registered hook (BPF_PROG_TYPE_STRUCT_OPS or * BPF_PROG_TYPE_SYSCALL), regardless of which set originally introduced the - * kfunc - so the filter must short-circuit on kfuncs it doesn't govern (e.g. - * scx_kfunc_ids_any) by falling through to "allow" when none of the - * context-sensitive sets contain the kfunc. + * kfunc - so the filter must short-circuit on kfuncs it doesn't govern by + * falling through to "allow" when none of the SCX sets contain the kfunc. */ int scx_kfunc_context_filter(const struct bpf_prog *prog, u32 kfunc_id) { @@ -9541,18 +9716,21 @@ int scx_kfunc_context_filter(const struct bpf_prog *prog, u32 kfunc_id) bool in_enqueue = btf_id_set8_contains(&scx_kfunc_ids_enqueue_dispatch, kfunc_id); bool in_dispatch = btf_id_set8_contains(&scx_kfunc_ids_dispatch, kfunc_id); bool in_cpu_release = btf_id_set8_contains(&scx_kfunc_ids_cpu_release, kfunc_id); + bool in_idle = btf_id_set8_contains(&scx_kfunc_ids_idle, kfunc_id); + bool in_any = btf_id_set8_contains(&scx_kfunc_ids_any, kfunc_id); u32 moff, flags; - /* Not a context-sensitive kfunc (e.g. from scx_kfunc_ids_any) - allow. */ - if (!(in_unlocked || in_select_cpu || in_enqueue || in_dispatch || in_cpu_release)) + /* Not an SCX kfunc - allow. */ + if (!(in_unlocked || in_select_cpu || in_enqueue || in_dispatch || + in_cpu_release || in_idle || in_any)) return 0; /* SYSCALL progs (e.g. BPF test_run()) may call unlocked and select_cpu kfuncs. */ if (prog->type == BPF_PROG_TYPE_SYSCALL) - return (in_unlocked || in_select_cpu) ? 0 : -EACCES; + return (in_unlocked || in_select_cpu || in_idle || in_any) ? 0 : -EACCES; if (prog->type != BPF_PROG_TYPE_STRUCT_OPS) - return -EACCES; + return (in_any || in_idle) ? 0 : -EACCES; /* * add_subprog_and_kfunc() collects all kfunc calls, including dead code @@ -9565,14 +9743,15 @@ int scx_kfunc_context_filter(const struct bpf_prog *prog, u32 kfunc_id) return 0; /* - * Non-SCX struct_ops: only unlocked kfuncs are safe. The other - * context-sensitive kfuncs assume the rq lock is held by the SCX - * dispatch path, which doesn't apply to other struct_ops users. + * Non-SCX struct_ops: SCX kfuncs are not permitted. */ if (prog->aux->st_ops != &bpf_sched_ext_ops) - return in_unlocked ? 0 : -EACCES; + return -EACCES; /* SCX struct_ops: check the per-op allow list. */ + if (in_any || in_idle) + return 0; + moff = prog->aux->attach_st_ops_member_off; flags = scx_kf_allow_flags[SCX_MOFF_IDX(moff)]; @@ -9656,12 +9835,6 @@ static int __init scx_init(void) return ret; } - if (!alloc_cpumask_var(&scx_bypass_lb_donee_cpumask, GFP_KERNEL) || - !alloc_cpumask_var(&scx_bypass_lb_resched_cpumask, GFP_KERNEL)) { - pr_err("sched_ext: Failed to allocate cpumasks\n"); - return -ENOMEM; - } - return 0; } __initcall(scx_init); diff --git a/kernel/sched/ext_idle.c b/kernel/sched/ext_idle.c index 443d12a3df67..6e1980763270 100644 --- a/kernel/sched/ext_idle.c +++ b/kernel/sched/ext_idle.c @@ -466,12 +466,6 @@ s32 scx_select_cpu_dfl(struct task_struct *p, s32 prev_cpu, u64 wake_flags, preempt_disable(); /* - * Check whether @prev_cpu is still within the allowed set. If not, - * we can still try selecting a nearby CPU. - */ - is_prev_allowed = cpumask_test_cpu(prev_cpu, allowed); - - /* * Determine the subset of CPUs usable by @p within @cpus_allowed. */ if (allowed != p->cpus_ptr) { @@ -488,6 +482,12 @@ s32 scx_select_cpu_dfl(struct task_struct *p, s32 prev_cpu, u64 wake_flags, } /* + * Check whether @prev_cpu is still within the allowed set. If not, + * we can still try selecting a nearby CPU. + */ + is_prev_allowed = cpumask_test_cpu(prev_cpu, allowed); + + /* * This is necessary to protect llc_cpus. */ rcu_read_lock(); @@ -927,14 +927,24 @@ static s32 select_cpu_from_kfunc(struct scx_sched *sch, struct task_struct *p, * Accessing p->cpus_ptr / p->nr_cpus_allowed needs either @p's rq * lock or @p's pi_lock. Three cases: * - * - inside ops.select_cpu(): try_to_wake_up() holds @p's pi_lock. + * - inside ops.select_cpu(): try_to_wake_up() holds the wake-up + * task's pi_lock; the wake-up task is recorded in kf_tasks[0] + * by SCX_CALL_OP_TASK_RET(). * - other rq-locked SCX op: scx_locked_rq() points at the held rq. * - truly unlocked (UNLOCKED ops, SYSCALL, non-SCX struct_ops): * nothing held, take pi_lock ourselves. + * + * In the first two cases, BPF schedulers may pass an arbitrary task + * that the held lock doesn't cover. Refuse those. */ if (this_rq()->scx.in_select_cpu) { + if (!scx_kf_arg_task_ok(sch, p)) + return -EINVAL; lockdep_assert_held(&p->pi_lock); - } else if (!scx_locked_rq()) { + } else if (scx_locked_rq()) { + if (task_rq(p) != scx_locked_rq()) + goto cross_task; + } else { raw_spin_lock_irqsave(&p->pi_lock, irq_flags); we_locked = true; } @@ -960,6 +970,11 @@ static s32 select_cpu_from_kfunc(struct scx_sched *sch, struct task_struct *p, raw_spin_unlock_irqrestore(&p->pi_lock, irq_flags); return cpu; + +cross_task: + scx_error(sch, "select_cpu kfunc called cross-task on %s[%d]", + p->comm, p->pid); + return -EINVAL; } /** @@ -1467,6 +1482,7 @@ BTF_KFUNCS_END(scx_kfunc_ids_idle) static const struct btf_kfunc_id_set scx_kfunc_set_idle = { .owner = THIS_MODULE, .set = &scx_kfunc_ids_idle, + .filter = scx_kfunc_context_filter, }; /* diff --git a/kernel/sched/ext_idle.h b/kernel/sched/ext_idle.h index dc35f850481e..8d169d3bbdf9 100644 --- a/kernel/sched/ext_idle.h +++ b/kernel/sched/ext_idle.h @@ -12,6 +12,7 @@ struct sched_ext_ops; +extern struct btf_id_set8 scx_kfunc_ids_idle; extern struct btf_id_set8 scx_kfunc_ids_select_cpu; void scx_idle_update_selcpu_topology(struct sched_ext_ops *ops); diff --git a/kernel/sched/ext_internal.h b/kernel/sched/ext_internal.h index 62ce4eaf6a3f..a075732d4430 100644 --- a/kernel/sched/ext_internal.h +++ b/kernel/sched/ext_internal.h @@ -1075,6 +1075,8 @@ struct scx_sched { struct irq_work disable_irq_work; struct kthread_work disable_work; struct timer_list bypass_lb_timer; + cpumask_var_t bypass_lb_donee_cpumask; + cpumask_var_t bypass_lb_resched_cpumask; struct rcu_work rcu_work; /* all ancestors including self */ diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 69361c63353a..3ebec186f982 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -847,13 +847,19 @@ static s64 entity_lag(struct cfs_rq *cfs_rq, struct sched_entity *se, u64 avrunt * Similarly, check that the entity didn't gain positive lag when DELAY_ZERO * is set. * - * Return true if the lag has been adjusted. + * Return true if the vlag has been modified. Specifically: + * + * se->vlag != avg_vruntime() - se->vruntime + * + * This can be due to clamping in entity_lag() or clamping due to + * sched_delayed. Either way, when vlag is modified and the entity is + * retained, the tree needs to be adjusted. */ static __always_inline bool update_entity_lag(struct cfs_rq *cfs_rq, struct sched_entity *se) { - s64 vlag = entity_lag(cfs_rq, se, avg_vruntime(cfs_rq)); - bool ret; + u64 avruntime = avg_vruntime(cfs_rq); + s64 vlag = entity_lag(cfs_rq, se, avruntime); WARN_ON_ONCE(!se->on_rq); @@ -863,10 +869,9 @@ bool update_entity_lag(struct cfs_rq *cfs_rq, struct sched_entity *se) if (sched_feat(DELAY_ZERO)) vlag = min(vlag, 0); } - ret = (vlag == se->vlag); se->vlag = vlag; - return ret; + return avruntime - vlag != se->vruntime; } /* @@ -877,11 +882,11 @@ bool update_entity_lag(struct cfs_rq *cfs_rq, struct sched_entity *se) * * lag_i >= 0 -> V >= v_i * - * \Sum (v_i - v)*w_i - * V = ------------------ + v + * \Sum (v_i - v0)*w_i + * V = ------------------- + v0 * \Sum w_i * - * lag_i >= 0 -> \Sum (v_i - v)*w_i >= (v_i - v)*(\Sum w_i) + * lag_i >= 0 -> \Sum (v_i - v0)*w_i >= (v_i - v0)*(\Sum w_i) * * Note: using 'avg_vruntime() > se->vruntime' is inaccurate due * to the loss in precision caused by the division. @@ -889,7 +894,7 @@ bool update_entity_lag(struct cfs_rq *cfs_rq, struct sched_entity *se) static int vruntime_eligible(struct cfs_rq *cfs_rq, u64 vruntime) { struct sched_entity *curr = cfs_rq->curr; - s64 avg = cfs_rq->sum_w_vruntime; + s64 key, avg = cfs_rq->sum_w_vruntime; long load = cfs_rq->sum_weight; if (curr && curr->on_rq) { @@ -899,7 +904,36 @@ static int vruntime_eligible(struct cfs_rq *cfs_rq, u64 vruntime) load += weight; } - return avg >= vruntime_op(vruntime, "-", cfs_rq->zero_vruntime) * load; + key = vruntime_op(vruntime, "-", cfs_rq->zero_vruntime); + + /* + * The worst case term for @key includes 'NSEC_TICK * NICE_0_LOAD' + * and @load obviously includes NICE_0_LOAD. NSEC_TICK is around 24 + * bits, while NICE_0_LOAD is 20 on 64bit and 10 otherwise. + * + * This gives that on 64bit the product will be at least 64bit which + * overflows s64, while on 32bit it will only be 44bits and should fit + * comfortably. + */ +#ifdef CONFIG_64BIT +#ifdef CONFIG_ARCH_SUPPORTS_INT128 + /* This often results in simpler code than __builtin_mul_overflow(). */ + return avg >= (__int128)key * load; +#else + s64 rhs; + /* + * On overflow, the sign of key tells us the correct answer: a large + * positive key means vruntime >> V, so not eligible; a large negative + * key means vruntime << V, so eligible. + */ + if (check_mul_overflow(key, load, &rhs)) + return key <= 0; + + return avg >= rhs; +#endif +#else /* 32bit */ + return avg >= key * load; +#endif } int entity_eligible(struct cfs_rq *cfs_rq, struct sched_entity *se) @@ -1099,7 +1133,7 @@ static inline void cancel_protect_slice(struct sched_entity *se) * * Which allows tree pruning through eligibility. */ -static struct sched_entity *__pick_eevdf(struct cfs_rq *cfs_rq, bool protect) +static struct sched_entity *pick_eevdf(struct cfs_rq *cfs_rq, bool protect) { struct rb_node *node = cfs_rq->tasks_timeline.rb_root.rb_node; struct sched_entity *se = __pick_first_entity(cfs_rq); @@ -1170,11 +1204,6 @@ found: return best; } -static struct sched_entity *pick_eevdf(struct cfs_rq *cfs_rq) -{ - return __pick_eevdf(cfs_rq, true); -} - struct sched_entity *__pick_last_entity(struct cfs_rq *cfs_rq) { struct rb_node *last = rb_last(&cfs_rq->tasks_timeline.rb_root); @@ -5749,11 +5778,11 @@ static int dequeue_entities(struct rq *rq, struct sched_entity *se, int flags); * 4) do not run the "skip" process, if something else is available */ static struct sched_entity * -pick_next_entity(struct rq *rq, struct cfs_rq *cfs_rq) +pick_next_entity(struct rq *rq, struct cfs_rq *cfs_rq, bool protect) { struct sched_entity *se; - se = pick_eevdf(cfs_rq); + se = pick_eevdf(cfs_rq, protect); if (se->sched_delayed) { dequeue_entities(rq, se, DEQUEUE_SLEEP | DEQUEUE_DELAYED); /* @@ -9027,7 +9056,7 @@ static void wakeup_preempt_fair(struct rq *rq, struct task_struct *p, int wake_f { enum preempt_wakeup_action preempt_action = PREEMPT_WAKEUP_PICK; struct task_struct *donor = rq->donor; - struct sched_entity *se = &donor->se, *pse = &p->se; + struct sched_entity *nse, *se = &donor->se, *pse = &p->se; struct cfs_rq *cfs_rq = task_cfs_rq(donor); int cse_is_idle, pse_is_idle; @@ -9138,11 +9167,18 @@ static void wakeup_preempt_fair(struct rq *rq, struct task_struct *p, int wake_f } pick: + nse = pick_next_entity(rq, cfs_rq, preempt_action != PREEMPT_WAKEUP_SHORT); + /* If @p has become the most eligible task, force preemption */ + if (nse == pse) + goto preempt; + /* - * If @p has become the most eligible task, force preemption. + * Because p is enqueued, nse being null can only mean that we + * dequeued a delayed task. If there are still entities queued in + * cfs, check if the next one will be p. */ - if (__pick_eevdf(cfs_rq, preempt_action != PREEMPT_WAKEUP_SHORT) == pse) - goto preempt; + if (!nse && cfs_rq->nr_queued) + goto pick; if (sched_feat(RUN_TO_PARITY)) update_protect_slice(cfs_rq, se); @@ -9179,7 +9215,7 @@ again: throttled |= check_cfs_rq_runtime(cfs_rq); - se = pick_next_entity(rq, cfs_rq); + se = pick_next_entity(rq, cfs_rq, true); if (!se) goto again; cfs_rq = group_cfs_rq(se); diff --git a/kernel/sched/membarrier.c b/kernel/sched/membarrier.c index 623445603725..226a6329f3e9 100644 --- a/kernel/sched/membarrier.c +++ b/kernel/sched/membarrier.c @@ -199,7 +199,16 @@ static void ipi_rseq(void *info) * is negligible. */ smp_mb(); - rseq_sched_switch_event(current); + /* + * Legacy mode requires that IDs are written and the critical section is + * evaluated. V2 optimized mode handles the critical section and IDs are + * only updated if they change as a consequence of preemption after + * return from this IPI. + */ + if (rseq_v2(current)) + rseq_sched_switch_event(current); + else + rseq_force_update(); } static void ipi_sync_rq_state(void *info) diff --git a/kernel/time/clockevents.c b/kernel/time/clockevents.c index b4d730604972..5e22697b098d 100644 --- a/kernel/time/clockevents.c +++ b/kernel/time/clockevents.c @@ -94,6 +94,9 @@ static int __clockevents_switch_state(struct clock_event_device *dev, if (dev->features & CLOCK_EVT_FEAT_DUMMY) return 0; + /* On state transitions clear the forced flag unconditionally */ + dev->next_event_forced = 0; + /* Transition with new state-specific callbacks */ switch (state) { case CLOCK_EVT_STATE_DETACHED: @@ -366,8 +369,10 @@ int clockevents_program_event(struct clock_event_device *dev, ktime_t expires, b if (delta > (int64_t)dev->min_delta_ns) { delta = min(delta, (int64_t) dev->max_delta_ns); cycles = ((u64)delta * dev->mult) >> dev->shift; - if (!dev->set_next_event((unsigned long) cycles, dev)) + if (!dev->set_next_event((unsigned long) cycles, dev)) { + dev->next_event_forced = 0; return 0; + } } if (dev->next_event_forced) diff --git a/kernel/time/tick-broadcast.c b/kernel/time/tick-broadcast.c index 7e57fa31ee26..115e0bf01276 100644 --- a/kernel/time/tick-broadcast.c +++ b/kernel/time/tick-broadcast.c @@ -108,6 +108,7 @@ static struct clock_event_device *tick_get_oneshot_wakeup_device(int cpu) static void tick_oneshot_wakeup_handler(struct clock_event_device *wd) { + wd->next_event_forced = 0; /* * If we woke up early and the tick was reprogrammed in the * meantime then this may be spurious but harmless. diff --git a/kernel/time/timer_migration.c b/kernel/time/timer_migration.c index 155eeaea4113..1d0d3a4058d5 100644 --- a/kernel/time/timer_migration.c +++ b/kernel/time/timer_migration.c @@ -1860,19 +1860,37 @@ static int tmigr_setup_groups(unsigned int cpu, unsigned int node, * child to the new parents. So tmigr_active_up() activates the * new parents while walking up from the old root to the new. * - * * It is ensured that @start is active, as this setup path is - * executed in hotplug prepare callback. This is executed by an - * already connected and !idle CPU. Even if all other CPUs go idle, - * the CPU executing the setup will be responsible up to current top - * level group. And the next time it goes inactive, it will release - * the new childmask and parent to subsequent walkers through this - * @child. Therefore propagate active state unconditionally. + * * It is ensured that @start is active, (or on the way to be activated + * by another CPU that woke up before the current one) as this setup path + * is executed in hotplug prepare callback. This is executed by an already + * connected and !idle CPU in the hierarchy. + * + * * The below RmW atomic operation ensures that: + * + * 1) If the old root has been completely activated, the latest state is + * acquired (the below implicit acquire pairs with the implicit release + * from cmpxchg() in tmigr_active_up()). + * + * 2) If the old root is still on the way to be activated, the lagging behind + * CPU performing the activation will acquire the links up to the new root. + * (The below implicit release pairs with the implicit acquire from cmpxchg() + * in tmigr_active_up()). + * + * 3) Every subsequent CPU below the old root will acquire the new links while + * walking through the old root (The below implicit release pairs with the + * implicit acquire from cmpxchg() in either tmigr_active_up()) or + * tmigr_inactive_up(). */ - state.state = atomic_read(&start->migr_state); - WARN_ON_ONCE(!state.active); + state.state = atomic_fetch_or(0, &start->migr_state); WARN_ON_ONCE(!start->parent); - data.childmask = start->groupmask; - __walk_groups_from(tmigr_active_up, &data, start, start->parent); + /* + * If the state of the old root is inactive, another CPU is on its way to activate + * it and propagate to the new root. + */ + if (state.active) { + data.childmask = start->groupmask; + __walk_groups_from(tmigr_active_up, &data, start, start->parent); + } } /* Root update */ diff --git a/kernel/trace/Makefile b/kernel/trace/Makefile index 4d4229e5eec4..1decdce8cbef 100644 --- a/kernel/trace/Makefile +++ b/kernel/trace/Makefile @@ -133,41 +133,14 @@ obj-$(CONFIG_TRACE_REMOTE) += trace_remote.o obj-$(CONFIG_SIMPLE_RING_BUFFER) += simple_ring_buffer.o obj-$(CONFIG_TRACE_REMOTE_TEST) += remote_test.o -# # simple_ring_buffer is used by the pKVM hypervisor which does not have access # to all kernel symbols. Fail the build if forbidden symbols are found. -# -# undefsyms_base generates a set of compiler and tooling-generated symbols that can -# safely be ignored for simple_ring_buffer. -# -filechk_undefsyms_base = \ - echo '$(pound)include <linux/atomic.h>'; \ - echo '$(pound)include <linux/string.h>'; \ - echo '$(pound)include <asm/page.h>'; \ - echo 'static char page[PAGE_SIZE] __aligned(PAGE_SIZE);'; \ - echo 'void undefsyms_base(void *p, int n);'; \ - echo 'void undefsyms_base(void *p, int n) {'; \ - echo ' char buffer[256] = { 0 };'; \ - echo ' u32 u = 0;'; \ - echo ' memset((char * volatile)page, 8, PAGE_SIZE);'; \ - echo ' memset((char * volatile)buffer, 8, sizeof(buffer));'; \ - echo ' memcpy((void * volatile)p, buffer, sizeof(buffer));'; \ - echo ' cmpxchg((u32 * volatile)&u, 0, 8);'; \ - echo ' WARN_ON(n == 0xdeadbeef);'; \ - echo '}' - -$(obj)/undefsyms_base.c: FORCE - $(call filechk,undefsyms_base) - -clean-files += undefsyms_base.c - -$(obj)/undefsyms_base.o: $(obj)/undefsyms_base.c +# Basic compiler and tooling-generated symbols that can safely be left +# undefined. Ensure KASAN is enabled to avoid logic that may disable +# FORTIFY_SOURCE when KASAN is not enabled. undefsyms_base.o does not +# automatically get KASAN flags because it is not linked into vmlinux. targets += undefsyms_base.o - -# Ensure KASAN is enabled to avoid logic that may disable FORTIFY_SOURCE when -# KASAN is not enabled. undefsyms_base.o does not automatically get KASAN flags -# because it is not linked into vmlinux. KASAN_SANITIZE_undefsyms_base.o := y UNDEFINED_ALLOWLIST = __asan __gcov __kasan __kcsan __hwasan __sancov __sanitizer __tsan __ubsan __x86_indirect_thunk \ diff --git a/kernel/trace/fprobe.c b/kernel/trace/fprobe.c index 56d145017902..cc49ebd2a773 100644 --- a/kernel/trace/fprobe.c +++ b/kernel/trace/fprobe.c @@ -4,6 +4,7 @@ */ #define pr_fmt(fmt) "fprobe: " fmt +#include <linux/cleanup.h> #include <linux/err.h> #include <linux/fprobe.h> #include <linux/kallsyms.h> @@ -78,36 +79,33 @@ static const struct rhashtable_params fprobe_rht_params = { }; /* Node insertion and deletion requires the fprobe_mutex */ -static int insert_fprobe_node(struct fprobe_hlist_node *node) +static int __insert_fprobe_node(struct fprobe_hlist_node *node, struct fprobe *fp) { + int ret; + lockdep_assert_held(&fprobe_mutex); - return rhltable_insert(&fprobe_ip_table, &node->hlist, fprobe_rht_params); + ret = rhltable_insert(&fprobe_ip_table, &node->hlist, fprobe_rht_params); + /* Set the fprobe pointer if insertion was successful. */ + if (!ret) + WRITE_ONCE(node->fp, fp); + return ret; } -/* Return true if there are synonims */ -static bool delete_fprobe_node(struct fprobe_hlist_node *node) +static void __delete_fprobe_node(struct fprobe_hlist_node *node) { lockdep_assert_held(&fprobe_mutex); - bool ret; - /* Avoid double deleting */ + /* Avoid double deleting and non-inserted nodes */ if (READ_ONCE(node->fp) != NULL) { WRITE_ONCE(node->fp, NULL); rhltable_remove(&fprobe_ip_table, &node->hlist, fprobe_rht_params); } - - rcu_read_lock(); - ret = !!rhltable_lookup(&fprobe_ip_table, &node->addr, - fprobe_rht_params); - rcu_read_unlock(); - - return ret; } /* Check existence of the fprobe */ -static bool is_fprobe_still_exist(struct fprobe *fp) +static bool fprobe_registered(struct fprobe *fp) { struct hlist_head *head; struct fprobe_hlist *fph; @@ -120,7 +118,7 @@ static bool is_fprobe_still_exist(struct fprobe *fp) } return false; } -NOKPROBE_SYMBOL(is_fprobe_still_exist); +NOKPROBE_SYMBOL(fprobe_registered); static int add_fprobe_hash(struct fprobe *fp) { @@ -132,9 +130,6 @@ static int add_fprobe_hash(struct fprobe *fp) if (WARN_ON_ONCE(!fph)) return -EINVAL; - if (is_fprobe_still_exist(fp)) - return -EEXIST; - head = &fprobe_table[hash_ptr(fp, FPROBE_HASH_BITS)]; hlist_add_head_rcu(&fp->hlist_array->hlist, head); return 0; @@ -149,7 +144,7 @@ static int del_fprobe_hash(struct fprobe *fp) if (WARN_ON_ONCE(!fph)) return -EINVAL; - if (!is_fprobe_still_exist(fp)) + if (!fprobe_registered(fp)) return -ENOENT; fph->fp = NULL; @@ -255,7 +250,65 @@ static inline int __fprobe_kprobe_handler(unsigned long ip, unsigned long parent return ret; } +static int fprobe_fgraph_entry(struct ftrace_graph_ent *trace, struct fgraph_ops *gops, + struct ftrace_regs *fregs); +static void fprobe_return(struct ftrace_graph_ret *trace, + struct fgraph_ops *gops, + struct ftrace_regs *fregs); + +static struct fgraph_ops fprobe_graph_ops = { + .entryfunc = fprobe_fgraph_entry, + .retfunc = fprobe_return, +}; +/* Number of fgraph fprobe nodes */ +static int nr_fgraph_fprobes; +/* Is fprobe_graph_ops registered? */ +static bool fprobe_graph_registered; + +/* Add @addrs to the ftrace filter and register fgraph if needed. */ +static int fprobe_graph_add_ips(unsigned long *addrs, int num) +{ + int ret; + + lockdep_assert_held(&fprobe_mutex); + + ret = ftrace_set_filter_ips(&fprobe_graph_ops.ops, addrs, num, 0, 0); + if (ret) + return ret; + + if (!fprobe_graph_registered) { + ret = register_ftrace_graph(&fprobe_graph_ops); + if (WARN_ON_ONCE(ret)) { + ftrace_free_filter(&fprobe_graph_ops.ops); + return ret; + } + fprobe_graph_registered = true; + } + return 0; +} + +static void __fprobe_graph_unregister(void) +{ + if (fprobe_graph_registered) { + unregister_ftrace_graph(&fprobe_graph_ops); + ftrace_free_filter(&fprobe_graph_ops.ops); + fprobe_graph_registered = false; + } +} + +/* Remove @addrs from the ftrace filter and unregister fgraph if possible. */ +static void fprobe_graph_remove_ips(unsigned long *addrs, int num) +{ + lockdep_assert_held(&fprobe_mutex); + + if (!nr_fgraph_fprobes) + __fprobe_graph_unregister(); + else if (num) + ftrace_set_filter_ips(&fprobe_graph_ops.ops, addrs, num, 1, 0); +} + #if defined(CONFIG_DYNAMIC_FTRACE_WITH_ARGS) || defined(CONFIG_DYNAMIC_FTRACE_WITH_REGS) + /* ftrace_ops callback, this processes fprobes which have only entry_handler. */ static void fprobe_ftrace_entry(unsigned long ip, unsigned long parent_ip, struct ftrace_ops *ops, struct ftrace_regs *fregs) @@ -298,7 +351,10 @@ static struct ftrace_ops fprobe_ftrace_ops = { .func = fprobe_ftrace_entry, .flags = FTRACE_OPS_FL_SAVE_ARGS, }; -static int fprobe_ftrace_active; +/* Number of ftrace fprobe nodes */ +static int nr_ftrace_fprobes; +/* Is fprobe_ftrace_ops registered? */ +static bool fprobe_ftrace_registered; static int fprobe_ftrace_add_ips(unsigned long *addrs, int num) { @@ -310,25 +366,33 @@ static int fprobe_ftrace_add_ips(unsigned long *addrs, int num) if (ret) return ret; - if (!fprobe_ftrace_active) { + if (!fprobe_ftrace_registered) { ret = register_ftrace_function(&fprobe_ftrace_ops); if (ret) { ftrace_free_filter(&fprobe_ftrace_ops); return ret; } + fprobe_ftrace_registered = true; } - fprobe_ftrace_active++; return 0; } +static void __fprobe_ftrace_unregister(void) +{ + if (fprobe_ftrace_registered) { + unregister_ftrace_function(&fprobe_ftrace_ops); + ftrace_free_filter(&fprobe_ftrace_ops); + fprobe_ftrace_registered = false; + } +} + static void fprobe_ftrace_remove_ips(unsigned long *addrs, int num) { lockdep_assert_held(&fprobe_mutex); - fprobe_ftrace_active--; - if (!fprobe_ftrace_active) - unregister_ftrace_function(&fprobe_ftrace_ops); - if (num) + if (!nr_ftrace_fprobes) + __fprobe_ftrace_unregister(); + else if (num) ftrace_set_filter_ips(&fprobe_ftrace_ops, addrs, num, 1, 0); } @@ -337,12 +401,78 @@ static bool fprobe_is_ftrace(struct fprobe *fp) return !fp->exit_handler; } +/* Node insertion and deletion requires the fprobe_mutex */ +static int insert_fprobe_node(struct fprobe_hlist_node *node, struct fprobe *fp) +{ + int ret; + + lockdep_assert_held(&fprobe_mutex); + + ret = __insert_fprobe_node(node, fp); + if (!ret) { + if (fprobe_is_ftrace(fp)) + nr_ftrace_fprobes++; + else + nr_fgraph_fprobes++; + } + + return ret; +} + +static void delete_fprobe_node(struct fprobe_hlist_node *node) +{ + struct fprobe *fp; + + lockdep_assert_held(&fprobe_mutex); + + fp = READ_ONCE(node->fp); + if (fp) { + if (fprobe_is_ftrace(fp)) + nr_ftrace_fprobes--; + else + nr_fgraph_fprobes--; + } + __delete_fprobe_node(node); +} + +static bool fprobe_exists_on_hash(unsigned long ip, bool ftrace) +{ + struct rhlist_head *head, *pos; + struct fprobe_hlist_node *node; + struct fprobe *fp; + + guard(rcu)(); + head = rhltable_lookup(&fprobe_ip_table, &ip, + fprobe_rht_params); + if (!head) + return false; + /* We have to check the same type on the list. */ + rhl_for_each_entry_rcu(node, pos, head, hlist) { + if (node->addr != ip) + break; + fp = READ_ONCE(node->fp); + if (likely(fp)) { + if ((!ftrace && fp->exit_handler) || + (ftrace && !fp->exit_handler)) + return true; + } + } + + return false; +} + #ifdef CONFIG_MODULES -static void fprobe_set_ips(unsigned long *ips, unsigned int cnt, int remove, - int reset) +static void fprobe_remove_ips(unsigned long *ips, unsigned int cnt) { - ftrace_set_filter_ips(&fprobe_graph_ops.ops, ips, cnt, remove, reset); - ftrace_set_filter_ips(&fprobe_ftrace_ops, ips, cnt, remove, reset); + if (!nr_fgraph_fprobes) + __fprobe_graph_unregister(); + else if (cnt) + ftrace_set_filter_ips(&fprobe_graph_ops.ops, ips, cnt, 1, 0); + + if (!nr_ftrace_fprobes) + __fprobe_ftrace_unregister(); + else if (cnt) + ftrace_set_filter_ips(&fprobe_ftrace_ops, ips, cnt, 1, 0); } #endif #else @@ -360,11 +490,62 @@ static bool fprobe_is_ftrace(struct fprobe *fp) return false; } +/* Node insertion and deletion requires the fprobe_mutex */ +static int insert_fprobe_node(struct fprobe_hlist_node *node, struct fprobe *fp) +{ + int ret; + + lockdep_assert_held(&fprobe_mutex); + + ret = __insert_fprobe_node(node, fp); + if (!ret) + nr_fgraph_fprobes++; + + return ret; +} + +static void delete_fprobe_node(struct fprobe_hlist_node *node) +{ + struct fprobe *fp; + + lockdep_assert_held(&fprobe_mutex); + + fp = READ_ONCE(node->fp); + if (fp) + nr_fgraph_fprobes--; + __delete_fprobe_node(node); +} + +static bool fprobe_exists_on_hash(unsigned long ip, bool ftrace __maybe_unused) +{ + struct rhlist_head *head, *pos; + struct fprobe_hlist_node *node; + struct fprobe *fp; + + guard(rcu)(); + head = rhltable_lookup(&fprobe_ip_table, &ip, + fprobe_rht_params); + if (!head) + return false; + /* We only need to check fp is there. */ + rhl_for_each_entry_rcu(node, pos, head, hlist) { + if (node->addr != ip) + break; + fp = READ_ONCE(node->fp); + if (likely(fp)) + return true; + } + + return false; +} + #ifdef CONFIG_MODULES -static void fprobe_set_ips(unsigned long *ips, unsigned int cnt, int remove, - int reset) +static void fprobe_remove_ips(unsigned long *ips, unsigned int cnt) { - ftrace_set_filter_ips(&fprobe_graph_ops.ops, ips, cnt, remove, reset); + if (!nr_fgraph_fprobes) + __fprobe_graph_unregister(); + else if (cnt) + ftrace_set_filter_ips(&fprobe_graph_ops.ops, ips, cnt, 1, 0); } #endif #endif /* !CONFIG_DYNAMIC_FTRACE_WITH_ARGS && !CONFIG_DYNAMIC_FTRACE_WITH_REGS */ @@ -480,7 +661,7 @@ static void fprobe_return(struct ftrace_graph_ret *trace, if (!fp) break; curr += FPROBE_HEADER_SIZE_IN_LONG; - if (is_fprobe_still_exist(fp) && !fprobe_disabled(fp)) { + if (fprobe_registered(fp) && !fprobe_disabled(fp)) { if (WARN_ON_ONCE(curr + size > size_words)) break; fp->exit_handler(fp, trace->func, ret_ip, fregs, @@ -492,51 +673,9 @@ static void fprobe_return(struct ftrace_graph_ret *trace, } NOKPROBE_SYMBOL(fprobe_return); -static struct fgraph_ops fprobe_graph_ops = { - .entryfunc = fprobe_fgraph_entry, - .retfunc = fprobe_return, -}; -static int fprobe_graph_active; - -/* Add @addrs to the ftrace filter and register fgraph if needed. */ -static int fprobe_graph_add_ips(unsigned long *addrs, int num) -{ - int ret; - - lockdep_assert_held(&fprobe_mutex); - - ret = ftrace_set_filter_ips(&fprobe_graph_ops.ops, addrs, num, 0, 0); - if (ret) - return ret; - - if (!fprobe_graph_active) { - ret = register_ftrace_graph(&fprobe_graph_ops); - if (WARN_ON_ONCE(ret)) { - ftrace_free_filter(&fprobe_graph_ops.ops); - return ret; - } - } - fprobe_graph_active++; - return 0; -} - -/* Remove @addrs from the ftrace filter and unregister fgraph if possible. */ -static void fprobe_graph_remove_ips(unsigned long *addrs, int num) -{ - lockdep_assert_held(&fprobe_mutex); - - fprobe_graph_active--; - /* Q: should we unregister it ? */ - if (!fprobe_graph_active) - unregister_ftrace_graph(&fprobe_graph_ops); - - if (num) - ftrace_set_filter_ips(&fprobe_graph_ops.ops, addrs, num, 1, 0); -} - #ifdef CONFIG_MODULES -#define FPROBE_IPS_BATCH_INIT 8 +#define FPROBE_IPS_BATCH_INIT 128 /* instruction pointer address list */ struct fprobe_addr_list { int index; @@ -544,43 +683,29 @@ struct fprobe_addr_list { unsigned long *addrs; }; -static int fprobe_addr_list_add(struct fprobe_addr_list *alist, unsigned long addr) +static int fprobe_remove_node_in_module(struct module *mod, struct fprobe_hlist_node *node, + struct fprobe_addr_list *alist) { - unsigned long *addrs; + lockdep_assert_in_rcu_read_lock(); - /* Previously we failed to expand the list. */ - if (alist->index == alist->size) - return -ENOSPC; - - alist->addrs[alist->index++] = addr; - if (alist->index < alist->size) + if (!within_module(node->addr, mod)) return 0; - /* Expand the address list */ - addrs = kcalloc(alist->size * 2, sizeof(*addrs), GFP_KERNEL); - if (!addrs) - return -ENOMEM; - - memcpy(addrs, alist->addrs, alist->size * sizeof(*addrs)); - alist->size *= 2; - kfree(alist->addrs); - alist->addrs = addrs; - - return 0; -} - -static void fprobe_remove_node_in_module(struct module *mod, struct fprobe_hlist_node *node, - struct fprobe_addr_list *alist) -{ - if (!within_module(node->addr, mod)) - return; - if (delete_fprobe_node(node)) - return; + delete_fprobe_node(node); + /* If no address list is available, we can't track this address. */ + if (!alist->addrs) + return 0; /* - * If failed to update alist, just continue to update hlist. - * Therefore, at list user handler will not hit anymore. + * Don't care the type here, because all fprobes on the same + * address must be removed eventually. */ - fprobe_addr_list_add(alist, node->addr); + if (!rhltable_lookup(&fprobe_ip_table, &node->addr, fprobe_rht_params)) { + alist->addrs[alist->index++] = node->addr; + if (alist->index == alist->size) + return -ENOSPC; + } + + return 0; } /* Handle module unloading to manage fprobe_ip_table. */ @@ -591,29 +716,48 @@ static int fprobe_module_callback(struct notifier_block *nb, struct fprobe_hlist_node *node; struct rhashtable_iter iter; struct module *mod = data; + bool retry; if (val != MODULE_STATE_GOING) return NOTIFY_DONE; alist.addrs = kcalloc(alist.size, sizeof(*alist.addrs), GFP_KERNEL); - /* If failed to alloc memory, we can not remove ips from hash. */ - if (!alist.addrs) - return NOTIFY_DONE; + /* + * If failed to alloc memory, ftrace_ops will not be able to remove ips from + * hash, but we can still remove nodes from fprobe_ip_table, so we can avoid + * the potential wrong callback. So just print a warning here and try to + * continue without address list. + */ + WARN_ONCE(!alist.addrs, + "Failed to allocate memory for fprobe_addr_list, ftrace_ops will not be updated"); mutex_lock(&fprobe_mutex); +again: + retry = false; + alist.index = 0; rhltable_walk_enter(&fprobe_ip_table, &iter); do { rhashtable_walk_start(&iter); while ((node = rhashtable_walk_next(&iter)) && !IS_ERR(node)) - fprobe_remove_node_in_module(mod, node, &alist); + if (fprobe_remove_node_in_module(mod, node, &alist) < 0) { + retry = true; + break; + } rhashtable_walk_stop(&iter); - } while (node == ERR_PTR(-EAGAIN)); + } while (node == ERR_PTR(-EAGAIN) && !retry); rhashtable_walk_exit(&iter); + /* Remove any ips from hash table(s) */ + fprobe_remove_ips(alist.addrs, alist.index); + /* + * If we break rhashtable walk loop except for -EAGAIN, we need + * to restart looping from start for safety. Anyway, this is + * not a hotpath. + */ + if (retry) + goto again; - if (alist.index > 0) - fprobe_set_ips(alist.addrs, alist.index, 1, 0); mutex_unlock(&fprobe_mutex); kfree(alist.addrs); @@ -757,7 +901,6 @@ static int fprobe_init(struct fprobe *fp, unsigned long *addrs, int num) fp->hlist_array = hlist_array; hlist_array->fp = fp; for (i = 0; i < num; i++) { - hlist_array->array[i].fp = fp; addr = ftrace_location(addrs[i]); if (!addr) { fprobe_fail_cleanup(fp); @@ -821,6 +964,8 @@ int register_fprobe(struct fprobe *fp, const char *filter, const char *notfilter } EXPORT_SYMBOL_GPL(register_fprobe); +static int unregister_fprobe_nolock(struct fprobe *fp); + /** * register_fprobe_ips() - Register fprobe to ftrace by address. * @fp: A fprobe data structure to be registered. @@ -839,35 +984,33 @@ int register_fprobe_ips(struct fprobe *fp, unsigned long *addrs, int num) struct fprobe_hlist *hlist_array; int ret, i; + guard(mutex)(&fprobe_mutex); + if (fprobe_registered(fp)) + return -EEXIST; + ret = fprobe_init(fp, addrs, num); if (ret) return ret; - mutex_lock(&fprobe_mutex); - - hlist_array = fp->hlist_array; if (fprobe_is_ftrace(fp)) ret = fprobe_ftrace_add_ips(addrs, num); else ret = fprobe_graph_add_ips(addrs, num); - - if (!ret) { - add_fprobe_hash(fp); - for (i = 0; i < hlist_array->size; i++) { - ret = insert_fprobe_node(&hlist_array->array[i]); - if (ret) - break; - } - /* fallback on insert error */ - if (ret) { - for (i--; i >= 0; i--) - delete_fprobe_node(&hlist_array->array[i]); - } + if (ret) { + fprobe_fail_cleanup(fp); + return ret; } - mutex_unlock(&fprobe_mutex); - if (ret) - fprobe_fail_cleanup(fp); + hlist_array = fp->hlist_array; + ret = add_fprobe_hash(fp); + for (i = 0; i < hlist_array->size && !ret; i++) + ret = insert_fprobe_node(&hlist_array->array[i], fp); + + if (ret) { + unregister_fprobe_nolock(fp); + /* In error case, wait for clean up safely. */ + synchronize_rcu(); + } return ret; } @@ -911,37 +1054,28 @@ bool fprobe_is_registered(struct fprobe *fp) return true; } -/** - * unregister_fprobe() - Unregister fprobe. - * @fp: A fprobe data structure to be unregistered. - * - * Unregister fprobe (and remove ftrace hooks from the function entries). - * - * Return 0 if @fp is unregistered successfully, -errno if not. - */ -int unregister_fprobe(struct fprobe *fp) +static int unregister_fprobe_nolock(struct fprobe *fp) { - struct fprobe_hlist *hlist_array; + struct fprobe_hlist *hlist_array = fp->hlist_array; unsigned long *addrs = NULL; - int ret = 0, i, count; - - mutex_lock(&fprobe_mutex); - if (!fp || !is_fprobe_still_exist(fp)) { - ret = -EINVAL; - goto out; - } + int i, count; - hlist_array = fp->hlist_array; addrs = kcalloc(hlist_array->size, sizeof(unsigned long), GFP_KERNEL); - if (!addrs) { - ret = -ENOMEM; /* TODO: Fallback to one-by-one loop */ - goto out; - } + /* + * This will remove fprobe_hash_node from the hash table even if + * memory allocation fails. However, ftrace_ops will not be updated. + * Anyway, when the last fprobe is unregistered, ftrace_ops is also + * unregistered. + */ + if (!addrs) + pr_warn("Failed to allocate working array. ftrace_ops may not sync.\n"); /* Remove non-synonim ips from table and hash */ count = 0; for (i = 0; i < hlist_array->size; i++) { - if (!delete_fprobe_node(&hlist_array->array[i])) + delete_fprobe_node(&hlist_array->array[i]); + if (addrs && !fprobe_exists_on_hash(hlist_array->array[i].addr, + fprobe_is_ftrace(fp))) addrs[count++] = hlist_array->array[i].addr; } del_fprobe_hash(fp); @@ -953,12 +1087,26 @@ int unregister_fprobe(struct fprobe *fp) kfree_rcu(hlist_array, rcu); fp->hlist_array = NULL; + kfree(addrs); -out: - mutex_unlock(&fprobe_mutex); + return 0; +} - kfree(addrs); - return ret; +/** + * unregister_fprobe() - Unregister fprobe. + * @fp: A fprobe data structure to be unregistered. + * + * Unregister fprobe (and remove ftrace hooks from the function entries). + * + * Return 0 if @fp is unregistered successfully, -errno if not. + */ +int unregister_fprobe(struct fprobe *fp) +{ + guard(mutex)(&fprobe_mutex); + if (!fp || !fprobe_registered(fp)) + return -EINVAL; + + return unregister_fprobe_nolock(fp); } EXPORT_SYMBOL_GPL(unregister_fprobe); diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c index cef49f8871d2..5326924615a4 100644 --- a/kernel/trace/ring_buffer.c +++ b/kernel/trace/ring_buffer.c @@ -1884,7 +1884,7 @@ static int rb_validate_buffer(struct buffer_data_page *dpage, int cpu) static void rb_meta_validate_events(struct ring_buffer_per_cpu *cpu_buffer) { struct ring_buffer_cpu_meta *meta = cpu_buffer->ring_meta; - struct buffer_page *head_page, *orig_head; + struct buffer_page *head_page, *orig_head, *orig_reader; unsigned long entry_bytes = 0; unsigned long entries = 0; int ret; @@ -1895,16 +1895,17 @@ static void rb_meta_validate_events(struct ring_buffer_per_cpu *cpu_buffer) return; orig_head = head_page = cpu_buffer->head_page; + orig_reader = cpu_buffer->reader_page; /* Do the reader page first */ - ret = rb_validate_buffer(cpu_buffer->reader_page->page, cpu_buffer->cpu); + ret = rb_validate_buffer(orig_reader->page, cpu_buffer->cpu); if (ret < 0) { pr_info("Ring buffer reader page is invalid\n"); goto invalid; } entries += ret; - entry_bytes += local_read(&cpu_buffer->reader_page->page->commit); - local_set(&cpu_buffer->reader_page->entries, ret); + entry_bytes += local_read(&orig_reader->page->commit); + local_set(&orig_reader->entries, ret); ts = head_page->page->time_stamp; @@ -2007,8 +2008,8 @@ static void rb_meta_validate_events(struct ring_buffer_per_cpu *cpu_buffer) /* Iterate until finding the commit page */ for (i = 0; i < meta->nr_subbufs + 1; i++, rb_inc_page(&head_page)) { - /* Reader page has already been done */ - if (head_page == cpu_buffer->reader_page) + /* The original reader page has already been checked/counted. */ + if (head_page == orig_reader) continue; ret = rb_validate_buffer(head_page->page, cpu_buffer->cpu); diff --git a/kernel/trace/trace_branch.c b/kernel/trace/trace_branch.c index 6809b370e991..d1564db95a8f 100644 --- a/kernel/trace/trace_branch.c +++ b/kernel/trace/trace_branch.c @@ -373,10 +373,10 @@ __init static int init_annotated_branch_stats(void) int ret; ret = register_stat_tracer(&annotated_branch_stats); - if (!ret) { + if (ret) { printk(KERN_WARNING "Warning: could not register " "annotated branches stats\n"); - return 1; + return ret; } return 0; } @@ -438,10 +438,10 @@ __init static int all_annotated_branch_stats(void) int ret; ret = register_stat_tracer(&all_branch_stats); - if (!ret) { + if (ret) { printk(KERN_WARNING "Warning: could not register " "all branches stats\n"); - return 1; + return ret; } return 0; } diff --git a/kernel/trace/trace_probe.c b/kernel/trace/trace_probe.c index e1c73065dae5..e0d3a0da26af 100644 --- a/kernel/trace/trace_probe.c +++ b/kernel/trace/trace_probe.c @@ -1523,6 +1523,12 @@ static int traceprobe_parse_probe_arg_body(const char *argv, ssize_t *size, parg->offset = *size; *size += parg->type->size * (parg->count ?: 1); + if (*size > MAX_PROBE_EVENT_SIZE) { + ret = -E2BIG; + trace_probe_log_err(ctx->offset, EVENT_TOO_BIG); + goto fail; + } + if (parg->count) { len = strlen(parg->type->fmttype) + 6; parg->fmt = kmalloc(len, GFP_KERNEL); diff --git a/kernel/trace/trace_probe.h b/kernel/trace/trace_probe.h index 9fc56c937130..262d8707a3df 100644 --- a/kernel/trace/trace_probe.h +++ b/kernel/trace/trace_probe.h @@ -38,6 +38,7 @@ #define MAX_BTF_ARGS_LEN 128 #define MAX_DENTRY_ARGS_LEN 256 #define MAX_STRING_SIZE PATH_MAX +#define MAX_PROBE_EVENT_SIZE 3072 /* Reserved field names */ #define FIELD_STRING_IP "__probe_ip" @@ -561,7 +562,8 @@ extern int traceprobe_define_arg_fields(struct trace_event_call *event_call, C(BAD_TYPE4STR, "This type does not fit for string."),\ C(NEED_STRING_TYPE, "$comm and immediate-string only accepts string type"),\ C(TOO_MANY_ARGS, "Too many arguments are specified"), \ - C(TOO_MANY_EARGS, "Too many entry arguments specified"), + C(TOO_MANY_EARGS, "Too many entry arguments specified"), \ + C(EVENT_TOO_BIG, "Event too big (too many fields?)"), #undef C #define C(a, b) TP_ERR_##a diff --git a/kernel/trace/undefsyms_base.c b/kernel/trace/undefsyms_base.c new file mode 100644 index 000000000000..e65baf58e6ff --- /dev/null +++ b/kernel/trace/undefsyms_base.c @@ -0,0 +1,28 @@ +// SPDX-License-Identifier: GPL-2.0 + +/* + * simple_ring_buffer is used by the pKVM hypervisor which does not have access + * to all kernel symbols. Whatever is undefined when compiling this file is + * compiler and tooling-generated symbols that can safely be ignored for + * simple_ring_buffer. + */ + +#include <linux/atomic.h> +#include <linux/string.h> +#include <asm/page.h> + +void undefsyms_base(void *p, int n); + +static char page[PAGE_SIZE] __aligned(PAGE_SIZE); + +void undefsyms_base(void *p, int n) +{ + char buffer[256] = { 0 }; + + u32 u = 0; + memset((char * volatile)page, 8, PAGE_SIZE); + memset((char * volatile)buffer, 8, sizeof(buffer)); + memcpy((void * volatile)p, buffer, sizeof(buffer)); + cmpxchg((u32 * volatile)&u, 0, 8); + WARN_ON(n == 0xdeadbeef); +} diff --git a/kernel/workqueue.c b/kernel/workqueue.c index 5f747f241a5f..3d2e3b2ec528 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -5906,6 +5906,21 @@ err_destroy: return NULL; } +__printf(1, 0) +static struct workqueue_struct *alloc_workqueue_va(const char *fmt, + unsigned int flags, + int max_active, + va_list args) +{ + struct workqueue_struct *wq; + + wq = __alloc_workqueue(fmt, flags, max_active, args); + if (wq) + wq_init_lockdep(wq); + + return wq; +} + __printf(1, 4) struct workqueue_struct *alloc_workqueue_noprof(const char *fmt, unsigned int flags, @@ -5915,12 +5930,8 @@ struct workqueue_struct *alloc_workqueue_noprof(const char *fmt, va_list args; va_start(args, max_active); - wq = __alloc_workqueue(fmt, flags, max_active, args); + wq = alloc_workqueue_va(fmt, flags, max_active, args); va_end(args); - if (!wq) - return NULL; - - wq_init_lockdep(wq); return wq; } @@ -5932,15 +5943,15 @@ static void devm_workqueue_release(void *res) } __printf(2, 5) struct workqueue_struct * -devm_alloc_workqueue(struct device *dev, const char *fmt, unsigned int flags, - int max_active, ...) +devm_alloc_workqueue_noprof(struct device *dev, const char *fmt, + unsigned int flags, int max_active, ...) { struct workqueue_struct *wq; va_list args; int ret; va_start(args, max_active); - wq = alloc_workqueue(fmt, flags, max_active, args); + wq = alloc_workqueue_va(fmt, flags, max_active, args); va_end(args); if (!wq) return NULL; @@ -5951,7 +5962,7 @@ devm_alloc_workqueue(struct device *dev, const char *fmt, unsigned int flags, return wq; } -EXPORT_SYMBOL_GPL(devm_alloc_workqueue); +EXPORT_SYMBOL_GPL(devm_alloc_workqueue_noprof); #ifdef CONFIG_LOCKDEP __printf(1, 5) |
