diff options
| author | Tejun Heo <tj@kernel.org> | 2026-05-22 07:06:01 -1000 |
|---|---|---|
| committer | Tejun Heo <tj@kernel.org> | 2026-05-25 09:44:07 -1000 |
| commit | abdc2516f100d8f9e637a49e4fdfd2d09a318680 (patch) | |
| tree | 65641f72ea7535bdabddb600bdea3b8fee8d7a79 /kernel/sched | |
| parent | 9eca087deb0b35f3170109a9630a6c5c06c2e222 (diff) | |
sched_ext: Convert ops.set_cmask() to arena-resident cmask
ops_cid.set_cmask() expects a cmask. The kernel couldn't write into the
arena, so it translated cpumask -> cmask in kernel memory and passed the
result as a trusted pointer. The BPF cmask helpers all operate on arena
cmasks though, so the BPF side had to word-by-word probe-read the kernel
cmask into an arena cmask via cmask_copy_from_kernel() before any helper
could touch it. It works, but is clumsy.
With direct kernel-side arena access now in place, build the cmask in the
arena. The kernel writes to it through the kern_va side of the dual mapping.
BPF directly dereferences it via an __arena pointer like any other arena
struct.
Signed-off-by: Tejun Heo <tj@kernel.org>
Reviewed-by: Emil Tsalapatis <emil@etsalapatis.com>
Diffstat (limited to 'kernel/sched')
| -rw-r--r-- | kernel/sched/ext.c | 68 | ||||
| -rw-r--r-- | kernel/sched/ext_cid.c | 20 | ||||
| -rw-r--r-- | kernel/sched/ext_internal.h | 10 |
3 files changed, 72 insertions, 26 deletions
diff --git a/kernel/sched/ext.c b/kernel/sched/ext.c index f5c67e3ff075..83272acf1763 100644 --- a/kernel/sched/ext.c +++ b/kernel/sched/ext.c @@ -621,11 +621,16 @@ static inline void scx_call_op_set_cpumask(struct scx_sched *sch, struct rq *rq, update_locked_rq(rq); if (scx_is_cid_type()) { - struct scx_cmask *cmask = this_cpu_ptr(scx_set_cmask_scratch); - - lockdep_assert_irqs_disabled(); - scx_cpumask_to_cmask(cpumask, cmask); - sch->ops_cid.set_cmask(task, cmask); + struct scx_cmask *kern_va = *this_cpu_ptr(sch->set_cmask_scratch); + unsigned long uaddr = (unsigned long)kern_va - + bpf_arena_map_kern_vm_start(sch->arena_map); + /* + * Build the per-CPU arena cmask and hand BPF the uaddr. Caller + * holds the rq lock with IRQs disabled, which makes us the sole + * user of the scratch area. + */ + scx_cpumask_to_cmask(cpumask, kern_va); + sch->ops_cid.set_cmask(task, (struct scx_cmask *)uaddr); } else { sch->ops.set_cpumask(task, cpumask); } @@ -4949,6 +4954,48 @@ static const struct attribute_group scx_global_attr_group = { static void free_pnode(struct scx_sched_pnode *pnode); static void free_exit_info(struct scx_exit_info *ei); +static s32 scx_set_cmask_scratch_alloc(struct scx_sched *sch) +{ + size_t size = struct_size_t(struct scx_cmask, bits, + SCX_CMASK_NR_WORDS(num_possible_cpus())); + int cpu; + + if (!sch->is_cid_type || !sch->arena_pool) + return 0; + + sch->set_cmask_scratch = alloc_percpu(struct scx_cmask *); + if (!sch->set_cmask_scratch) + return -ENOMEM; + + for_each_possible_cpu(cpu) { + struct scx_cmask **slot = per_cpu_ptr(sch->set_cmask_scratch, cpu); + + *slot = scx_arena_alloc(sch, size); + if (!*slot) + return -ENOMEM; + scx_cmask_init(*slot, 0, num_possible_cpus()); + } + return 0; +} + +static void scx_set_cmask_scratch_free(struct scx_sched *sch) +{ + size_t size = struct_size_t(struct scx_cmask, bits, + SCX_CMASK_NR_WORDS(num_possible_cpus())); + int cpu; + + if (!sch->set_cmask_scratch) + return; + + for_each_possible_cpu(cpu) { + struct scx_cmask **slot = per_cpu_ptr(sch->set_cmask_scratch, cpu); + + scx_arena_free(sch, *slot, size); + } + free_percpu(sch->set_cmask_scratch); + sch->set_cmask_scratch = NULL; +} + static void scx_sched_free_rcu_work(struct work_struct *work) { struct rcu_work *rcu_work = to_rcu_work(work); @@ -5003,6 +5050,7 @@ static void scx_sched_free_rcu_work(struct work_struct *work) rhashtable_free_and_destroy(&sch->dsq_hash, NULL, NULL); free_exit_info(sch->exit_info); + scx_set_cmask_scratch_free(sch); scx_arena_pool_destroy(sch); if (sch->arena_map) bpf_map_put(sch->arena_map); @@ -7162,6 +7210,12 @@ static void scx_root_enable_workfn(struct kthread_work *work) goto err_disable; } + ret = scx_set_cmask_scratch_alloc(sch); + if (ret) { + cpus_read_unlock(); + goto err_disable; + } + for (i = SCX_OPI_CPU_HOTPLUG_BEGIN; i < SCX_OPI_CPU_HOTPLUG_END; i++) if (((void (**)(void))ops)[i]) set_bit(i, sch->has_op); @@ -7484,6 +7538,10 @@ static void scx_sub_enable_workfn(struct kthread_work *work) if (ret) goto err_disable; + ret = scx_set_cmask_scratch_alloc(sch); + if (ret) + goto err_disable; + if (validate_ops(sch, ops)) goto err_disable; diff --git a/kernel/sched/ext_cid.c b/kernel/sched/ext_cid.c index 0c91b951fd33..808c6390da5a 100644 --- a/kernel/sched/ext_cid.c +++ b/kernel/sched/ext_cid.c @@ -8,14 +8,6 @@ #include <linux/cacheinfo.h> /* - * Per-cpu scratch cmask used by scx_call_op_set_cpumask() to synthesize a - * cmask from a cpumask. Allocated alongside the cid arrays on first enable - * and never freed. Sized to the full cid space. Caller holds rq lock so - * this_cpu_ptr is safe. - */ -struct scx_cmask __percpu *scx_set_cmask_scratch; - -/* * cid tables. * * Pointers are published once on first enable and never revoked. The default @@ -54,8 +46,6 @@ static s32 scx_cid_arrays_alloc(void) u32 npossible = num_possible_cpus(); s16 *cid_to_cpu, *cpu_to_cid; struct scx_cid_topo *cid_topo; - struct scx_cmask __percpu *set_cmask_scratch; - s32 cpu; if (scx_cid_to_cpu_tbl) return 0; @@ -63,25 +53,17 @@ static s32 scx_cid_arrays_alloc(void) cid_to_cpu = kzalloc_objs(*scx_cid_to_cpu_tbl, npossible, GFP_KERNEL); cpu_to_cid = kzalloc_objs(*scx_cpu_to_cid_tbl, nr_cpu_ids, GFP_KERNEL); cid_topo = kmalloc_objs(*scx_cid_topo, npossible, GFP_KERNEL); - set_cmask_scratch = __alloc_percpu(struct_size(set_cmask_scratch, bits, - SCX_CMASK_NR_WORDS(npossible)), - sizeof(u64)); - if (!cid_to_cpu || !cpu_to_cid || !cid_topo || !set_cmask_scratch) { + if (!cid_to_cpu || !cpu_to_cid || !cid_topo) { kfree(cid_to_cpu); kfree(cpu_to_cid); kfree(cid_topo); - free_percpu(set_cmask_scratch); return -ENOMEM; } WRITE_ONCE(scx_cid_to_cpu_tbl, cid_to_cpu); WRITE_ONCE(scx_cpu_to_cid_tbl, cpu_to_cid); WRITE_ONCE(scx_cid_topo, cid_topo); - for_each_possible_cpu(cpu) - scx_cmask_init(per_cpu_ptr(set_cmask_scratch, cpu), - 0, npossible); - WRITE_ONCE(scx_set_cmask_scratch, set_cmask_scratch); return 0; } diff --git a/kernel/sched/ext_internal.h b/kernel/sched/ext_internal.h index ff7e882bd67a..9bb65367f510 100644 --- a/kernel/sched/ext_internal.h +++ b/kernel/sched/ext_internal.h @@ -1124,6 +1124,14 @@ struct scx_sched { struct bpf_map *arena_map; struct gen_pool *arena_pool; + /* + * Per-CPU arena cmask used by scx_call_op_set_cpumask() to hand a cmask + * to ops_cid.set_cmask(). The kernel writes through the stored kern_va; + * the BPF-arena uaddr handed to BPF is recovered by subtracting the + * arena's kern_vm_start. + */ + struct scx_cmask * __percpu *set_cmask_scratch; + DECLARE_BITMAP(has_op, SCX_OPI_END); /* @@ -1480,8 +1488,6 @@ enum scx_ops_state { extern struct scx_sched __rcu *scx_root; DECLARE_PER_CPU(struct rq *, scx_locked_rq_state); -extern struct scx_cmask __percpu *scx_set_cmask_scratch; - /* * True when the currently loaded scheduler hierarchy is cid-form. All scheds * in a hierarchy share one form, so this single key tells callsites which |
