diff options
Diffstat (limited to 'kernel')
| -rw-r--r-- | kernel/bpf/syscall.c | 25 | ||||
| -rw-r--r-- | kernel/bpf/verifier.c | 37 | ||||
| -rw-r--r-- | kernel/cgroup/cgroup.c | 88 | ||||
| -rw-r--r-- | kernel/cgroup/cpuset.c | 29 | ||||
| -rw-r--r-- | kernel/dma/debug.c | 1 | ||||
| -rw-r--r-- | kernel/liveupdate/luo_session.c | 9 | ||||
| -rw-r--r-- | kernel/power/em_netlink.c | 2 | ||||
| -rw-r--r-- | kernel/sched/debug.c | 4 | ||||
| -rw-r--r-- | kernel/sched/ext.c | 144 | ||||
| -rw-r--r-- | kernel/sched/ext_idle.c | 33 | ||||
| -rw-r--r-- | kernel/sched/fair.c | 10 | ||||
| -rw-r--r-- | kernel/sched/sched.h | 3 | ||||
| -rw-r--r-- | kernel/sys.c | 30 | ||||
| -rw-r--r-- | kernel/trace/bpf_trace.c | 4 | ||||
| -rw-r--r-- | kernel/workqueue.c | 25 |
15 files changed, 347 insertions, 97 deletions
diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index 274039e36465..700938782bed 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -3261,6 +3261,18 @@ static void bpf_link_defer_dealloc_rcu_gp(struct rcu_head *rcu) bpf_link_dealloc(link); } +static bool bpf_link_is_tracepoint(struct bpf_link *link) +{ + /* + * Only these combinations support a tracepoint bpf_link. + * BPF_LINK_TYPE_TRACING raw_tp progs are hardcoded to use + * bpf_raw_tp_link_lops and thus dealloc_deferred(), see + * bpf_raw_tp_link_attach(). + */ + return link->type == BPF_LINK_TYPE_RAW_TRACEPOINT || + (link->type == BPF_LINK_TYPE_TRACING && link->attach_type == BPF_TRACE_RAW_TP); +} + static void bpf_link_defer_dealloc_mult_rcu_gp(struct rcu_head *rcu) { if (rcu_trace_implies_rcu_gp()) @@ -3279,16 +3291,25 @@ static void bpf_link_free(struct bpf_link *link) if (link->prog) ops->release(link); if (ops->dealloc_deferred) { - /* Schedule BPF link deallocation, which will only then + /* + * Schedule BPF link deallocation, which will only then * trigger putting BPF program refcount. * If underlying BPF program is sleepable or BPF link's target * attach hookpoint is sleepable or otherwise requires RCU GPs * to ensure link and its underlying BPF program is not * reachable anymore, we need to first wait for RCU tasks - * trace sync, and then go through "classic" RCU grace period + * trace sync, and then go through "classic" RCU grace period. + * + * For tracepoint BPF links, we need to go through SRCU grace + * period wait instead when non-faultable tracepoint is used. We + * don't need to chain SRCU grace period waits, however, for the + * faultable case, since it exclusively uses RCU Tasks Trace. */ if (link->sleepable || (link->prog && link->prog->sleepable)) call_rcu_tasks_trace(&link->rcu, bpf_link_defer_dealloc_mult_rcu_gp); + /* We need to do a SRCU grace period wait for non-faultable tracepoint BPF links. */ + else if (bpf_link_is_tracepoint(link)) + call_tracepoint_unregister_atomic(&link->rcu, bpf_link_defer_dealloc_rcu_gp); else call_rcu(&link->rcu, bpf_link_defer_dealloc_rcu_gp); } else if (ops->dealloc) { diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index f108c01ff6d0..e3814152b52f 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -617,6 +617,13 @@ static bool is_atomic_load_insn(const struct bpf_insn *insn) insn->imm == BPF_LOAD_ACQ; } +static bool is_atomic_fetch_insn(const struct bpf_insn *insn) +{ + return BPF_CLASS(insn->code) == BPF_STX && + BPF_MODE(insn->code) == BPF_ATOMIC && + (insn->imm & BPF_FETCH); +} + static int __get_spi(s32 off) { return (-off - 1) / BPF_REG_SIZE; @@ -4447,10 +4454,24 @@ static int backtrack_insn(struct bpf_verifier_env *env, int idx, int subseq_idx, * dreg still needs precision before this insn */ } - } else if (class == BPF_LDX || is_atomic_load_insn(insn)) { - if (!bt_is_reg_set(bt, dreg)) + } else if (class == BPF_LDX || + is_atomic_load_insn(insn) || + is_atomic_fetch_insn(insn)) { + u32 load_reg = dreg; + + /* + * Atomic fetch operation writes the old value into + * a register (sreg or r0) and if it was tracked for + * precision, propagate to the stack slot like we do + * in regular ldx. + */ + if (is_atomic_fetch_insn(insn)) + load_reg = insn->imm == BPF_CMPXCHG ? + BPF_REG_0 : sreg; + + if (!bt_is_reg_set(bt, load_reg)) return 0; - bt_clear_reg(bt, dreg); + bt_clear_reg(bt, load_reg); /* scalars can only be spilled into stack w/o losing precision. * Load from any other memory can be zero extended. @@ -7905,7 +7926,8 @@ static int check_mem_access(struct bpf_verifier_env *env, int insn_idx, u32 regn } else if (reg->type == CONST_PTR_TO_MAP) { err = check_ptr_to_map_access(env, regs, regno, off, size, t, value_regno); - } else if (base_type(reg->type) == PTR_TO_BUF) { + } else if (base_type(reg->type) == PTR_TO_BUF && + !type_may_be_null(reg->type)) { bool rdonly_mem = type_is_rdonly_mem(reg->type); u32 *max_access; @@ -19915,8 +19937,13 @@ static bool regsafe(struct bpf_verifier_env *env, struct bpf_reg_state *rold, * since someone could have accessed through (ptr - k), or * even done ptr -= k in a register, to get a safe access. */ - if (rold->range > rcur->range) + if (rold->range < 0 || rcur->range < 0) { + /* special case for [BEYOND|AT]_PKT_END */ + if (rold->range != rcur->range) + return false; + } else if (rold->range > rcur->range) { return false; + } /* If the offsets don't match, we can't trust our alignment; * nor can we be sure that we won't fall out of range. */ diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c index 01fc2a93f3ef..4ca3cb993da2 100644 --- a/kernel/cgroup/cgroup.c +++ b/kernel/cgroup/cgroup.c @@ -2126,6 +2126,7 @@ static void init_cgroup_housekeeping(struct cgroup *cgrp) #endif init_waitqueue_head(&cgrp->offline_waitq); + init_waitqueue_head(&cgrp->dying_populated_waitq); INIT_WORK(&cgrp->release_agent_work, cgroup1_release_agent); } @@ -6224,6 +6225,78 @@ static int cgroup_destroy_locked(struct cgroup *cgrp) return 0; }; +/** + * cgroup_drain_dying - wait for dying tasks to leave before rmdir + * @cgrp: the cgroup being removed + * + * cgroup.procs and cgroup.threads use css_task_iter which filters out + * PF_EXITING tasks so that userspace doesn't see tasks that have already been + * reaped via waitpid(). However, cgroup_has_tasks() - which tests whether the + * cgroup has non-empty css_sets - is only updated when dying tasks pass through + * cgroup_task_dead() in finish_task_switch(). This creates a window where + * cgroup.procs reads empty but cgroup_has_tasks() is still true, making rmdir + * fail with -EBUSY from cgroup_destroy_locked() even though userspace sees no + * tasks. + * + * This function aligns cgroup_has_tasks() with what userspace can observe. If + * cgroup_has_tasks() but the task iterator sees nothing (all remaining tasks are + * PF_EXITING), we wait for cgroup_task_dead() to finish processing them. As the + * window between PF_EXITING and cgroup_task_dead() is short, the wait is brief. + * + * This function only concerns itself with this cgroup's own dying tasks. + * Whether the cgroup has children is cgroup_destroy_locked()'s problem. + * + * Each cgroup_task_dead() kicks the waitqueue via cset->cgrp_links, and we + * retry the full check from scratch. + * + * Must be called with cgroup_mutex held. + */ +static int cgroup_drain_dying(struct cgroup *cgrp) + __releases(&cgroup_mutex) __acquires(&cgroup_mutex) +{ + struct css_task_iter it; + struct task_struct *task; + DEFINE_WAIT(wait); + + lockdep_assert_held(&cgroup_mutex); +retry: + if (!cgroup_has_tasks(cgrp)) + return 0; + + /* Same iterator as cgroup.threads - if any task is visible, it's busy */ + css_task_iter_start(&cgrp->self, 0, &it); + task = css_task_iter_next(&it); + css_task_iter_end(&it); + + if (task) + return -EBUSY; + + /* + * All remaining tasks are PF_EXITING and will pass through + * cgroup_task_dead() shortly. Wait for a kick and retry. + * + * cgroup_has_tasks() can't transition from false to true while we're + * holding cgroup_mutex, but the true to false transition happens + * under css_set_lock (via cgroup_task_dead()). We must retest and + * prepare_to_wait() under css_set_lock. Otherwise, the transition + * can happen between our first test and prepare_to_wait(), and we + * sleep with no one to wake us. + */ + spin_lock_irq(&css_set_lock); + if (!cgroup_has_tasks(cgrp)) { + spin_unlock_irq(&css_set_lock); + return 0; + } + prepare_to_wait(&cgrp->dying_populated_waitq, &wait, + TASK_UNINTERRUPTIBLE); + spin_unlock_irq(&css_set_lock); + mutex_unlock(&cgroup_mutex); + schedule(); + finish_wait(&cgrp->dying_populated_waitq, &wait); + mutex_lock(&cgroup_mutex); + goto retry; +} + int cgroup_rmdir(struct kernfs_node *kn) { struct cgroup *cgrp; @@ -6233,9 +6306,12 @@ int cgroup_rmdir(struct kernfs_node *kn) if (!cgrp) return 0; - ret = cgroup_destroy_locked(cgrp); - if (!ret) - TRACE_CGROUP_PATH(rmdir, cgrp); + ret = cgroup_drain_dying(cgrp); + if (!ret) { + ret = cgroup_destroy_locked(cgrp); + if (!ret) + TRACE_CGROUP_PATH(rmdir, cgrp); + } cgroup_kn_unlock(kn); return ret; @@ -6995,6 +7071,7 @@ void cgroup_task_exit(struct task_struct *tsk) static void do_cgroup_task_dead(struct task_struct *tsk) { + struct cgrp_cset_link *link; struct css_set *cset; unsigned long flags; @@ -7008,6 +7085,11 @@ static void do_cgroup_task_dead(struct task_struct *tsk) if (thread_group_leader(tsk) && atomic_read(&tsk->signal->live)) list_add_tail(&tsk->cg_list, &cset->dying_tasks); + /* kick cgroup_drain_dying() waiters, see cgroup_rmdir() */ + list_for_each_entry(link, &cset->cgrp_links, cgrp_link) + if (waitqueue_active(&link->cgrp->dying_populated_waitq)) + wake_up(&link->cgrp->dying_populated_waitq); + if (dl_task(tsk)) dec_dl_tasks_cs(tsk); diff --git a/kernel/cgroup/cpuset.c b/kernel/cgroup/cpuset.c index d21868455341..1335e437098e 100644 --- a/kernel/cgroup/cpuset.c +++ b/kernel/cgroup/cpuset.c @@ -2988,7 +2988,7 @@ static int cpuset_can_attach(struct cgroup_taskset *tset) struct cgroup_subsys_state *css; struct cpuset *cs, *oldcs; struct task_struct *task; - bool cpus_updated, mems_updated; + bool setsched_check; int ret; /* used later by cpuset_attach() */ @@ -3003,20 +3003,31 @@ static int cpuset_can_attach(struct cgroup_taskset *tset) if (ret) goto out_unlock; - cpus_updated = !cpumask_equal(cs->effective_cpus, oldcs->effective_cpus); - mems_updated = !nodes_equal(cs->effective_mems, oldcs->effective_mems); + /* + * Skip rights over task setsched check in v2 when nothing changes, + * migration permission derives from hierarchy ownership in + * cgroup_procs_write_permission()). + */ + setsched_check = !cpuset_v2() || + !cpumask_equal(cs->effective_cpus, oldcs->effective_cpus) || + !nodes_equal(cs->effective_mems, oldcs->effective_mems); + + /* + * A v1 cpuset with tasks will have no CPU left only when CPU hotplug + * brings the last online CPU offline as users are not allowed to empty + * cpuset.cpus when there are active tasks inside. When that happens, + * we should allow tasks to migrate out without security check to make + * sure they will be able to run after migration. + */ + if (!is_in_v2_mode() && cpumask_empty(oldcs->effective_cpus)) + setsched_check = false; cgroup_taskset_for_each(task, css, tset) { ret = task_can_attach(task); if (ret) goto out_unlock; - /* - * Skip rights over task check in v2 when nothing changes, - * migration permission derives from hierarchy ownership in - * cgroup_procs_write_permission()). - */ - if (!cpuset_v2() || (cpus_updated || mems_updated)) { + if (setsched_check) { ret = security_task_setscheduler(task); if (ret) goto out_unlock; diff --git a/kernel/dma/debug.c b/kernel/dma/debug.c index 0677918f06a8..1a725edbbbf6 100644 --- a/kernel/dma/debug.c +++ b/kernel/dma/debug.c @@ -615,6 +615,7 @@ static void add_dma_entry(struct dma_debug_entry *entry, unsigned long attrs) } else if (rc == -EEXIST && !(attrs & DMA_ATTR_SKIP_CPU_SYNC) && !(entry->is_cache_clean && overlap_cache_clean) && + dma_get_cache_alignment() >= L1_CACHE_BYTES && !(IS_ENABLED(CONFIG_DMA_BOUNCE_UNALIGNED_KMALLOC) && is_swiotlb_active(entry->dev))) { err_printk(entry->dev, entry, diff --git a/kernel/liveupdate/luo_session.c b/kernel/liveupdate/luo_session.c index 783677295640..25ae704d7787 100644 --- a/kernel/liveupdate/luo_session.c +++ b/kernel/liveupdate/luo_session.c @@ -558,8 +558,13 @@ int luo_session_deserialize(void) } scoped_guard(mutex, &session->mutex) { - luo_file_deserialize(&session->file_set, - &sh->ser[i].file_set_ser); + err = luo_file_deserialize(&session->file_set, + &sh->ser[i].file_set_ser); + } + if (err) { + pr_warn("Failed to deserialize files for session [%s] %pe\n", + session->name, ERR_PTR(err)); + return err; } } diff --git a/kernel/power/em_netlink.c b/kernel/power/em_netlink.c index 5a611d3950fd..4d4fd29bd2be 100644 --- a/kernel/power/em_netlink.c +++ b/kernel/power/em_netlink.c @@ -109,6 +109,8 @@ int dev_energymodel_nl_get_perf_domains_doit(struct sk_buff *skb, id = nla_get_u32(info->attrs[DEV_ENERGYMODEL_A_PERF_DOMAIN_PERF_DOMAIN_ID]); pd = em_perf_domain_get_by_id(id); + if (!pd) + return -EINVAL; __em_nl_get_pd_size(pd, &msg_sz); msg = genlmsg_new(msg_sz, GFP_KERNEL); diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c index b24f40f05019..15bf45b6f912 100644 --- a/kernel/sched/debug.c +++ b/kernel/sched/debug.c @@ -902,6 +902,7 @@ static void print_rq(struct seq_file *m, struct rq *rq, int rq_cpu) void print_cfs_rq(struct seq_file *m, int cpu, struct cfs_rq *cfs_rq) { s64 left_vruntime = -1, zero_vruntime, right_vruntime = -1, left_deadline = -1, spread; + u64 avruntime; struct sched_entity *last, *first, *root; struct rq *rq = cpu_rq(cpu); unsigned long flags; @@ -925,6 +926,7 @@ void print_cfs_rq(struct seq_file *m, int cpu, struct cfs_rq *cfs_rq) if (last) right_vruntime = last->vruntime; zero_vruntime = cfs_rq->zero_vruntime; + avruntime = avg_vruntime(cfs_rq); raw_spin_rq_unlock_irqrestore(rq, flags); SEQ_printf(m, " .%-30s: %Ld.%06ld\n", "left_deadline", @@ -934,7 +936,7 @@ void print_cfs_rq(struct seq_file *m, int cpu, struct cfs_rq *cfs_rq) SEQ_printf(m, " .%-30s: %Ld.%06ld\n", "zero_vruntime", SPLIT_NS(zero_vruntime)); SEQ_printf(m, " .%-30s: %Ld.%06ld\n", "avg_vruntime", - SPLIT_NS(avg_vruntime(cfs_rq))); + SPLIT_NS(avruntime)); SEQ_printf(m, " .%-30s: %Ld.%06ld\n", "right_vruntime", SPLIT_NS(right_vruntime)); spread = right_vruntime - left_vruntime; diff --git a/kernel/sched/ext.c b/kernel/sched/ext.c index 26a6ac2f8826..064eaa76be4b 100644 --- a/kernel/sched/ext.c +++ b/kernel/sched/ext.c @@ -1110,15 +1110,6 @@ static void dispatch_enqueue(struct scx_sched *sch, struct scx_dispatch_q *dsq, p->scx.dsq = dsq; /* - * scx.ddsp_dsq_id and scx.ddsp_enq_flags are only relevant on the - * direct dispatch path, but we clear them here because the direct - * dispatch verdict may be overridden on the enqueue path during e.g. - * bypass. - */ - p->scx.ddsp_dsq_id = SCX_DSQ_INVALID; - p->scx.ddsp_enq_flags = 0; - - /* * We're transitioning out of QUEUEING or DISPATCHING. store_release to * match waiters' load_acquire. */ @@ -1283,12 +1274,34 @@ static void mark_direct_dispatch(struct scx_sched *sch, p->scx.ddsp_enq_flags = enq_flags; } +/* + * Clear @p direct dispatch state when leaving the scheduler. + * + * Direct dispatch state must be cleared in the following cases: + * - direct_dispatch(): cleared on the synchronous enqueue path, deferred + * dispatch keeps the state until consumed + * - process_ddsp_deferred_locals(): cleared after consuming deferred state, + * - do_enqueue_task(): cleared on enqueue fallbacks where the dispatch + * verdict is ignored (local/global/bypass) + * - dequeue_task_scx(): cleared after dispatch_dequeue(), covering deferred + * cancellation and holding_cpu races + * - scx_disable_task(): cleared for queued wakeup tasks, which are excluded by + * the scx_bypass() loop, so that stale state is not reused by a subsequent + * scheduler instance + */ +static inline void clear_direct_dispatch(struct task_struct *p) +{ + p->scx.ddsp_dsq_id = SCX_DSQ_INVALID; + p->scx.ddsp_enq_flags = 0; +} + static void direct_dispatch(struct scx_sched *sch, struct task_struct *p, u64 enq_flags) { struct rq *rq = task_rq(p); struct scx_dispatch_q *dsq = find_dsq_for_dispatch(sch, rq, p->scx.ddsp_dsq_id, p); + u64 ddsp_enq_flags; touch_core_sched_dispatch(rq, p); @@ -1329,8 +1342,10 @@ static void direct_dispatch(struct scx_sched *sch, struct task_struct *p, return; } - dispatch_enqueue(sch, dsq, p, - p->scx.ddsp_enq_flags | SCX_ENQ_CLEAR_OPSS); + ddsp_enq_flags = p->scx.ddsp_enq_flags; + clear_direct_dispatch(p); + + dispatch_enqueue(sch, dsq, p, ddsp_enq_flags | SCX_ENQ_CLEAR_OPSS); } static bool scx_rq_online(struct rq *rq) @@ -1439,6 +1454,7 @@ enqueue: */ touch_core_sched(rq, p); refill_task_slice_dfl(sch, p); + clear_direct_dispatch(p); dispatch_enqueue(sch, dsq, p, enq_flags); } @@ -1610,6 +1626,7 @@ static bool dequeue_task_scx(struct rq *rq, struct task_struct *p, int deq_flags sub_nr_running(rq, 1); dispatch_dequeue(rq, p); + clear_direct_dispatch(p); return true; } @@ -2293,13 +2310,15 @@ static void process_ddsp_deferred_locals(struct rq *rq) struct task_struct, scx.dsq_list.node))) { struct scx_sched *sch = scx_root; struct scx_dispatch_q *dsq; + u64 dsq_id = p->scx.ddsp_dsq_id; + u64 enq_flags = p->scx.ddsp_enq_flags; list_del_init(&p->scx.dsq_list.node); + clear_direct_dispatch(p); - dsq = find_dsq_for_dispatch(sch, rq, p->scx.ddsp_dsq_id, p); + dsq = find_dsq_for_dispatch(sch, rq, dsq_id, p); if (!WARN_ON_ONCE(dsq->id != SCX_DSQ_LOCAL)) - dispatch_to_local_dsq(sch, rq, dsq, p, - p->scx.ddsp_enq_flags); + dispatch_to_local_dsq(sch, rq, dsq, p, enq_flags); } } @@ -2404,7 +2423,7 @@ static void put_prev_task_scx(struct rq *rq, struct task_struct *p, { struct scx_sched *sch = scx_root; - /* see kick_cpus_irq_workfn() */ + /* see kick_sync_wait_bal_cb() */ smp_store_release(&rq->scx.kick_sync, rq->scx.kick_sync + 1); update_curr_scx(rq); @@ -2447,6 +2466,48 @@ switch_class: switch_class(rq, next); } +static void kick_sync_wait_bal_cb(struct rq *rq) +{ + struct scx_kick_syncs __rcu *ks = __this_cpu_read(scx_kick_syncs); + unsigned long *ksyncs = rcu_dereference_sched(ks)->syncs; + bool waited; + s32 cpu; + + /* + * Drop rq lock and enable IRQs while waiting. IRQs must be enabled + * — a target CPU may be waiting for us to process an IPI (e.g. TLB + * flush) while we wait for its kick_sync to advance. + * + * Also, keep advancing our own kick_sync so that new kick_sync waits + * targeting us, which can start after we drop the lock, cannot form + * cyclic dependencies. + */ +retry: + waited = false; + for_each_cpu(cpu, rq->scx.cpus_to_sync) { + /* + * smp_load_acquire() pairs with smp_store_release() on + * kick_sync updates on the target CPUs. + */ + if (cpu == cpu_of(rq) || + smp_load_acquire(&cpu_rq(cpu)->scx.kick_sync) != ksyncs[cpu]) { + cpumask_clear_cpu(cpu, rq->scx.cpus_to_sync); + continue; + } + + raw_spin_rq_unlock_irq(rq); + while (READ_ONCE(cpu_rq(cpu)->scx.kick_sync) == ksyncs[cpu]) { + smp_store_release(&rq->scx.kick_sync, rq->scx.kick_sync + 1); + cpu_relax(); + } + raw_spin_rq_lock_irq(rq); + waited = true; + } + + if (waited) + goto retry; +} + static struct task_struct *first_local_task(struct rq *rq) { return list_first_entry_or_null(&rq->scx.local_dsq.list, @@ -2460,7 +2521,7 @@ do_pick_task_scx(struct rq *rq, struct rq_flags *rf, bool force_scx) bool keep_prev; struct task_struct *p; - /* see kick_cpus_irq_workfn() */ + /* see kick_sync_wait_bal_cb() */ smp_store_release(&rq->scx.kick_sync, rq->scx.kick_sync + 1); rq_modified_begin(rq, &ext_sched_class); @@ -2471,6 +2532,17 @@ do_pick_task_scx(struct rq *rq, struct rq_flags *rf, bool force_scx) maybe_queue_balance_callback(rq); /* + * Defer to a balance callback which can drop rq lock and enable + * IRQs. Waiting directly in the pick path would deadlock against + * CPUs sending us IPIs (e.g. TLB flushes) while we wait for them. + */ + if (unlikely(rq->scx.kick_sync_pending)) { + rq->scx.kick_sync_pending = false; + queue_balance_callback(rq, &rq->scx.kick_sync_bal_cb, + kick_sync_wait_bal_cb); + } + + /* * If any higher-priority sched class enqueued a runnable task on * this rq during balance_one(), abort and return RETRY_TASK, so * that the scheduler loop can restart. @@ -2962,6 +3034,8 @@ static void scx_disable_task(struct task_struct *p) lockdep_assert_rq_held(rq); WARN_ON_ONCE(scx_get_task_state(p) != SCX_TASK_ENABLED); + clear_direct_dispatch(p); + if (SCX_HAS_OP(sch, disable)) SCX_CALL_OP_TASK(sch, SCX_KF_REST, disable, rq, p); scx_set_task_state(p, SCX_TASK_READY); @@ -4713,6 +4787,9 @@ static void scx_dump_state(struct scx_exit_info *ei, size_t dump_len) if (!cpumask_empty(rq->scx.cpus_to_wait)) dump_line(&ns, " cpus_to_wait : %*pb", cpumask_pr_args(rq->scx.cpus_to_wait)); + if (!cpumask_empty(rq->scx.cpus_to_sync)) + dump_line(&ns, " cpus_to_sync : %*pb", + cpumask_pr_args(rq->scx.cpus_to_sync)); used = seq_buf_used(&ns); if (SCX_HAS_OP(sch, dump_cpu)) { @@ -5610,11 +5687,11 @@ static bool kick_one_cpu(s32 cpu, struct rq *this_rq, unsigned long *ksyncs) if (cpumask_test_cpu(cpu, this_scx->cpus_to_wait)) { if (cur_class == &ext_sched_class) { + cpumask_set_cpu(cpu, this_scx->cpus_to_sync); ksyncs[cpu] = rq->scx.kick_sync; should_wait = true; - } else { - cpumask_clear_cpu(cpu, this_scx->cpus_to_wait); } + cpumask_clear_cpu(cpu, this_scx->cpus_to_wait); } resched_curr(rq); @@ -5669,27 +5746,15 @@ static void kick_cpus_irq_workfn(struct irq_work *irq_work) cpumask_clear_cpu(cpu, this_scx->cpus_to_kick_if_idle); } - if (!should_wait) - return; - - for_each_cpu(cpu, this_scx->cpus_to_wait) { - unsigned long *wait_kick_sync = &cpu_rq(cpu)->scx.kick_sync; - - /* - * Busy-wait until the task running at the time of kicking is no - * longer running. This can be used to implement e.g. core - * scheduling. - * - * smp_cond_load_acquire() pairs with store_releases in - * pick_task_scx() and put_prev_task_scx(). The former breaks - * the wait if SCX's scheduling path is entered even if the same - * task is picked subsequently. The latter is necessary to break - * the wait when $cpu is taken by a higher sched class. - */ - if (cpu != cpu_of(this_rq)) - smp_cond_load_acquire(wait_kick_sync, VAL != ksyncs[cpu]); - - cpumask_clear_cpu(cpu, this_scx->cpus_to_wait); + /* + * Can't wait in hardirq — kick_sync can't advance, deadlocking if + * CPUs wait for each other. Defer to kick_sync_wait_bal_cb(). + */ + if (should_wait) { + raw_spin_rq_lock(this_rq); + this_scx->kick_sync_pending = true; + resched_curr(this_rq); + raw_spin_rq_unlock(this_rq); } } @@ -5794,6 +5859,7 @@ void __init init_sched_ext_class(void) BUG_ON(!zalloc_cpumask_var_node(&rq->scx.cpus_to_kick_if_idle, GFP_KERNEL, n)); BUG_ON(!zalloc_cpumask_var_node(&rq->scx.cpus_to_preempt, GFP_KERNEL, n)); BUG_ON(!zalloc_cpumask_var_node(&rq->scx.cpus_to_wait, GFP_KERNEL, n)); + BUG_ON(!zalloc_cpumask_var_node(&rq->scx.cpus_to_sync, GFP_KERNEL, n)); rq->scx.deferred_irq_work = IRQ_WORK_INIT_HARD(deferred_irq_workfn); rq->scx.kick_cpus_irq_work = IRQ_WORK_INIT_HARD(kick_cpus_irq_workfn); diff --git a/kernel/sched/ext_idle.c b/kernel/sched/ext_idle.c index ba298ac3ce6c..44c3a50c542c 100644 --- a/kernel/sched/ext_idle.c +++ b/kernel/sched/ext_idle.c @@ -543,7 +543,7 @@ s32 scx_select_cpu_dfl(struct task_struct *p, s32 prev_cpu, u64 wake_flags, * piled up on it even if there is an idle core elsewhere on * the system. */ - waker_node = cpu_to_node(cpu); + waker_node = scx_cpu_node_if_enabled(cpu); if (!(current->flags & PF_EXITING) && cpu_rq(cpu)->scx.local_dsq.nr == 0 && (!(flags & SCX_PICK_IDLE_IN_NODE) || (waker_node == node)) && @@ -860,25 +860,32 @@ static bool check_builtin_idle_enabled(struct scx_sched *sch) * code. * * We can't simply check whether @p->migration_disabled is set in a - * sched_ext callback, because migration is always disabled for the current - * task while running BPF code. + * sched_ext callback, because the BPF prolog (__bpf_prog_enter) may disable + * migration for the current task while running BPF code. * - * The prolog (__bpf_prog_enter) and epilog (__bpf_prog_exit) respectively - * disable and re-enable migration. For this reason, the current task - * inside a sched_ext callback is always a migration-disabled task. + * Since the BPF prolog calls migrate_disable() only when CONFIG_PREEMPT_RCU + * is enabled (via rcu_read_lock_dont_migrate()), migration_disabled == 1 for + * the current task is ambiguous only in that case: it could be from the BPF + * prolog rather than a real migrate_disable() call. * - * Therefore, when @p->migration_disabled == 1, check whether @p is the - * current task or not: if it is, then migration was not disabled before - * entering the callback, otherwise migration was disabled. + * Without CONFIG_PREEMPT_RCU, the BPF prolog never calls migrate_disable(), + * so migration_disabled == 1 always means the task is truly + * migration-disabled. + * + * Therefore, when migration_disabled == 1 and CONFIG_PREEMPT_RCU is enabled, + * check whether @p is the current task or not: if it is, then migration was + * not disabled before entering the callback, otherwise migration was disabled. * * Returns true if @p is migration-disabled, false otherwise. */ static bool is_bpf_migration_disabled(const struct task_struct *p) { - if (p->migration_disabled == 1) - return p != current; - else - return p->migration_disabled; + if (p->migration_disabled == 1) { + if (IS_ENABLED(CONFIG_PREEMPT_RCU)) + return p != current; + return true; + } + return p->migration_disabled; } static s32 select_cpu_from_kfunc(struct scx_sched *sch, struct task_struct *p, diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index bf948db905ed..ab4114712be7 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -707,7 +707,7 @@ void update_zero_vruntime(struct cfs_rq *cfs_rq, s64 delta) * Called in: * - place_entity() -- before enqueue * - update_entity_lag() -- before dequeue - * - entity_tick() + * - update_deadline() -- slice expiration * * This means it is one entry 'behind' but that puts it close enough to where * the bound on entity_key() is at most two lag bounds. @@ -1131,6 +1131,7 @@ static bool update_deadline(struct cfs_rq *cfs_rq, struct sched_entity *se) * EEVDF: vd_i = ve_i + r_i / w_i */ se->deadline = se->vruntime + calc_delta_fair(se->slice, se); + avg_vruntime(cfs_rq); /* * The task has consumed its request, reschedule. @@ -5593,11 +5594,6 @@ entity_tick(struct cfs_rq *cfs_rq, struct sched_entity *curr, int queued) update_load_avg(cfs_rq, curr, UPDATE_TG); update_cfs_group(curr); - /* - * Pulls along cfs_rq::zero_vruntime. - */ - avg_vruntime(cfs_rq); - #ifdef CONFIG_SCHED_HRTICK /* * queued ticks are scheduled to match the slice, so don't bother @@ -9128,7 +9124,7 @@ static void yield_task_fair(struct rq *rq) */ if (entity_eligible(cfs_rq, se)) { se->vruntime = se->deadline; - se->deadline += calc_delta_fair(se->slice, se); + update_deadline(cfs_rq, se); } } diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index 43bbf0693cca..1ef9ba480f51 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -805,9 +805,12 @@ struct scx_rq { cpumask_var_t cpus_to_kick_if_idle; cpumask_var_t cpus_to_preempt; cpumask_var_t cpus_to_wait; + cpumask_var_t cpus_to_sync; + bool kick_sync_pending; unsigned long kick_sync; local_t reenq_local_deferred; struct balance_callback deferred_bal_cb; + struct balance_callback kick_sync_bal_cb; struct irq_work deferred_irq_work; struct irq_work kick_cpus_irq_work; struct scx_dispatch_q bypass_dsq; diff --git a/kernel/sys.c b/kernel/sys.c index c86eba9aa7e9..62e842055cc9 100644 --- a/kernel/sys.c +++ b/kernel/sys.c @@ -2388,17 +2388,18 @@ int __weak arch_lock_shadow_stack_status(struct task_struct *t, unsigned long st return -EINVAL; } -int __weak arch_get_indir_br_lp_status(struct task_struct *t, unsigned long __user *status) +int __weak arch_prctl_get_branch_landing_pad_state(struct task_struct *t, + unsigned long __user *state) { return -EINVAL; } -int __weak arch_set_indir_br_lp_status(struct task_struct *t, unsigned long status) +int __weak arch_prctl_set_branch_landing_pad_state(struct task_struct *t, unsigned long state) { return -EINVAL; } -int __weak arch_lock_indir_br_lp_status(struct task_struct *t, unsigned long status) +int __weak arch_prctl_lock_branch_landing_pad_state(struct task_struct *t) { return -EINVAL; } @@ -2888,20 +2889,23 @@ SYSCALL_DEFINE5(prctl, int, option, unsigned long, arg2, unsigned long, arg3, return -EINVAL; error = rseq_slice_extension_prctl(arg2, arg3); break; - case PR_GET_INDIR_BR_LP_STATUS: - if (arg3 || arg4 || arg5) + case PR_GET_CFI: + if (arg2 != PR_CFI_BRANCH_LANDING_PADS) return -EINVAL; - error = arch_get_indir_br_lp_status(me, (unsigned long __user *)arg2); - break; - case PR_SET_INDIR_BR_LP_STATUS: - if (arg3 || arg4 || arg5) + if (arg4 || arg5) return -EINVAL; - error = arch_set_indir_br_lp_status(me, arg2); + error = arch_prctl_get_branch_landing_pad_state(me, (unsigned long __user *)arg3); break; - case PR_LOCK_INDIR_BR_LP_STATUS: - if (arg3 || arg4 || arg5) + case PR_SET_CFI: + if (arg2 != PR_CFI_BRANCH_LANDING_PADS) return -EINVAL; - error = arch_lock_indir_br_lp_status(me, arg2); + if (arg4 || arg5) + return -EINVAL; + error = arch_prctl_set_branch_landing_pad_state(me, arg3); + if (error) + break; + if (arg3 & PR_CFI_LOCK && !(arg3 & PR_CFI_DISABLE)) + error = arch_prctl_lock_branch_landing_pad_state(me); break; default: trace_task_prctl_unknown(option, arg2, arg3, arg4, arg5); diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c index 0b040a417442..af7079aa0f36 100644 --- a/kernel/trace/bpf_trace.c +++ b/kernel/trace/bpf_trace.c @@ -2752,6 +2752,10 @@ int bpf_kprobe_multi_link_attach(const union bpf_attr *attr, struct bpf_prog *pr if (!is_kprobe_multi(prog)) return -EINVAL; + /* kprobe_multi is not allowed to be sleepable. */ + if (prog->sleepable) + return -EINVAL; + /* Writing to context is not allowed for kprobes. */ if (prog->aux->kprobe_write_ctx) return -EINVAL; diff --git a/kernel/workqueue.c b/kernel/workqueue.c index b77119d71641..eda756556341 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -7699,8 +7699,29 @@ static void wq_watchdog_timer_fn(struct timer_list *unused) else ts = touched; - /* did we stall? */ + /* + * Did we stall? + * + * Do a lockless check first to do not disturb the system. + * + * Prevent false positives by double checking the timestamp + * under pool->lock. The lock makes sure that the check reads + * an updated pool->last_progress_ts when this CPU saw + * an already updated pool->worklist above. It seems better + * than adding another barrier into __queue_work() which + * is a hotter path. + */ if (time_after(now, ts + thresh)) { + scoped_guard(raw_spinlock_irqsave, &pool->lock) { + pool_ts = pool->last_progress_ts; + if (time_after(pool_ts, touched)) + ts = pool_ts; + else + ts = touched; + } + if (!time_after(now, ts + thresh)) + continue; + lockup_detected = true; stall_time = jiffies_to_msecs(now - pool_ts) / 1000; max_stall_time = max(max_stall_time, stall_time); @@ -7712,8 +7733,6 @@ static void wq_watchdog_timer_fn(struct timer_list *unused) pr_cont_pool_info(pool); pr_cont(" stuck for %us!\n", stall_time); } - - } if (lockup_detected) |
