author     Simona Vetter <simona.vetter@ffwll.ch>   2025-01-23 14:39:49 +0100
committer  Simona Vetter <simona.vetter@ffwll.ch>   2025-01-23 14:42:21 +0100
commit     07c5b277208cf9a9e9cf31bf0143977d7f030aa1 (patch)
tree       8c30e64d4b9bbb433e50785fb40860ee5c46a4eb /kernel
parent     951a6bf30667307e7901aac5e74e50dadd5ccfc7 (diff)
parent     ffd294d346d185b70e28b1a28abe367bbfe53c04 (diff)
Merge v6.13 into drm-next
A regression was caused by commit e4b5ccd392b9 ("drm/v3d: Ensure job pointer
is set to NULL after job completion"), but this commit is not yet in
next-fixes, so fast-forward it.

Note that this recreates Linus' merge in 96c84703f1cf ("Merge tag
'drm-next-2025-01-17' of https://gitlab.freedesktop.org/drm/kernel") because
I didn't want to backmerge a random point in the merge window.

Signed-off-by: Simona Vetter <simona.vetter@ffwll.ch>
Diffstat (limited to 'kernel')
-rw-r--r--  kernel/cgroup/cpuset.c              44
-rw-r--r--  kernel/cpu.c                         2
-rw-r--r--  kernel/events/uprobes.c              2
-rwxr-xr-x  kernel/gen_kheaders.sh               1
-rw-r--r--  kernel/sched/ext.c                  87
-rw-r--r--  kernel/sched/ext.h                   8
-rw-r--r--  kernel/sched/fair.c                151
-rw-r--r--  kernel/sched/idle.c                  5
-rw-r--r--  kernel/signal.c                     37
-rw-r--r--  kernel/time/hrtimer.c               11
-rw-r--r--  kernel/time/timer_migration.c       64
-rw-r--r--  kernel/trace/trace.c                 1
-rw-r--r--  kernel/trace/trace_irqsoff.c        14
-rw-r--r--  kernel/trace/trace_kprobe.c          6
-rw-r--r--  kernel/trace/trace_sched_wakeup.c   14
-rw-r--r--  kernel/workqueue.c                   7
16 files changed, 244 insertions, 210 deletions
diff --git a/kernel/cgroup/cpuset.c b/kernel/cgroup/cpuset.c
index f321ed515f3a..0f910c828973 100644
--- a/kernel/cgroup/cpuset.c
+++ b/kernel/cgroup/cpuset.c
@@ -197,10 +197,8 @@ static struct cpuset top_cpuset = {
/*
* There are two global locks guarding cpuset structures - cpuset_mutex and
- * callback_lock. We also require taking task_lock() when dereferencing a
- * task's cpuset pointer. See "The task_lock() exception", at the end of this
- * comment. The cpuset code uses only cpuset_mutex. Other kernel subsystems
- * can use cpuset_lock()/cpuset_unlock() to prevent change to cpuset
+ * callback_lock. The cpuset code uses only cpuset_mutex. Other kernel
+ * subsystems can use cpuset_lock()/cpuset_unlock() to prevent change to cpuset
* structures. Note that cpuset_mutex needs to be a mutex as it is used in
* paths that rely on priority inheritance (e.g. scheduler - on RT) for
* correctness.
@@ -229,9 +227,6 @@ static struct cpuset top_cpuset = {
* The cpuset_common_seq_show() handlers only hold callback_lock across
* small pieces of code, such as when reading out possibly multi-word
* cpumasks and nodemasks.
- *
- * Accessing a task's cpuset should be done in accordance with the
- * guidelines for accessing subsystem state in kernel/cgroup.c
*/
static DEFINE_MUTEX(cpuset_mutex);
@@ -890,7 +885,15 @@ v2:
*/
if (cgrpv2) {
for (i = 0; i < ndoms; i++) {
- cpumask_copy(doms[i], csa[i]->effective_cpus);
+ /*
+ * The top cpuset may contain some boot time isolated
+ * CPUs that need to be excluded from the sched domain.
+ */
+ if (csa[i] == &top_cpuset)
+ cpumask_and(doms[i], csa[i]->effective_cpus,
+ housekeeping_cpumask(HK_TYPE_DOMAIN));
+ else
+ cpumask_copy(doms[i], csa[i]->effective_cpus);
if (dattr)
dattr[i] = SD_ATTR_INIT;
}
@@ -3121,29 +3124,6 @@ ssize_t cpuset_write_resmask(struct kernfs_open_file *of,
int retval = -ENODEV;
buf = strstrip(buf);
-
- /*
- * CPU or memory hotunplug may leave @cs w/o any execution
- * resources, in which case the hotplug code asynchronously updates
- * configuration and transfers all tasks to the nearest ancestor
- * which can execute.
- *
- * As writes to "cpus" or "mems" may restore @cs's execution
- * resources, wait for the previously scheduled operations before
- * proceeding, so that we don't end up keep removing tasks added
- * after execution capability is restored.
- *
- * cpuset_handle_hotplug may call back into cgroup core asynchronously
- * via cgroup_transfer_tasks() and waiting for it from a cgroupfs
- * operation like this one can lead to a deadlock through kernfs
- * active_ref protection. Let's break the protection. Losing the
- * protection is okay as we check whether @cs is online after
- * grabbing cpuset_mutex anyway. This only happens on the legacy
- * hierarchies.
- */
- css_get(&cs->css);
- kernfs_break_active_protection(of->kn);
-
cpus_read_lock();
mutex_lock(&cpuset_mutex);
if (!is_cpuset_online(cs))
@@ -3176,8 +3156,6 @@ ssize_t cpuset_write_resmask(struct kernfs_open_file *of,
out_unlock:
mutex_unlock(&cpuset_mutex);
cpus_read_unlock();
- kernfs_unbreak_active_protection(of->kn);
- css_put(&cs->css);
flush_workqueue(cpuset_migrate_mm_wq);
return retval ?: nbytes;
}
diff --git a/kernel/cpu.c b/kernel/cpu.c
index b605334f8ee6..0509a9733745 100644
--- a/kernel/cpu.c
+++ b/kernel/cpu.c
@@ -2179,7 +2179,7 @@ static struct cpuhp_step cpuhp_hp_states[] = {
},
[CPUHP_AP_HRTIMERS_DYING] = {
.name = "hrtimers:dying",
- .startup.single = NULL,
+ .startup.single = hrtimers_cpu_starting,
.teardown.single = hrtimers_cpu_dying,
},
[CPUHP_AP_TICK_DYING] = {
diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c
index fa04b14a7d72..5d71ef85420c 100644
--- a/kernel/events/uprobes.c
+++ b/kernel/events/uprobes.c
@@ -1915,6 +1915,7 @@ void uprobe_free_utask(struct task_struct *t)
if (!utask)
return;
+ t->utask = NULL;
WARN_ON_ONCE(utask->active_uprobe || utask->xol_vaddr);
timer_delete_sync(&utask->ri_timer);
@@ -1924,7 +1925,6 @@ void uprobe_free_utask(struct task_struct *t)
ri = free_ret_instance(ri, true /* cleanup_hprobe */);
kfree(utask);
- t->utask = NULL;
}
#define RI_TIMER_PERIOD (HZ / 10) /* 100 ms */
diff --git a/kernel/gen_kheaders.sh b/kernel/gen_kheaders.sh
index 383fd43ac612..7e1340da5aca 100755
--- a/kernel/gen_kheaders.sh
+++ b/kernel/gen_kheaders.sh
@@ -89,6 +89,7 @@ find $cpio_dir -type f -print0 |
# Create archive and try to normalize metadata for reproducibility.
tar "${KBUILD_BUILD_TIMESTAMP:+--mtime=$KBUILD_BUILD_TIMESTAMP}" \
+ --exclude=".__afs*" --exclude=".nfs*" \
--owner=0 --group=0 --sort=name --numeric-owner --mode=u=rw,go=r,a+X \
-I $XZ -cf $tarfile -C $cpio_dir/ . > /dev/null
diff --git a/kernel/sched/ext.c b/kernel/sched/ext.c
index 19d2699cf638..19813b387ef9 100644
--- a/kernel/sched/ext.c
+++ b/kernel/sched/ext.c
@@ -2747,6 +2747,7 @@ static int balance_one(struct rq *rq, struct task_struct *prev)
{
struct scx_dsp_ctx *dspc = this_cpu_ptr(scx_dsp_ctx);
bool prev_on_scx = prev->sched_class == &ext_sched_class;
+ bool prev_on_rq = prev->scx.flags & SCX_TASK_QUEUED;
int nr_loops = SCX_DSP_MAX_LOOPS;
lockdep_assert_rq_held(rq);
@@ -2779,8 +2780,7 @@ static int balance_one(struct rq *rq, struct task_struct *prev)
* See scx_ops_disable_workfn() for the explanation on the
* bypassing test.
*/
- if ((prev->scx.flags & SCX_TASK_QUEUED) &&
- prev->scx.slice && !scx_rq_bypassing(rq)) {
+ if (prev_on_rq && prev->scx.slice && !scx_rq_bypassing(rq)) {
rq->scx.flags |= SCX_RQ_BAL_KEEP;
goto has_tasks;
}
@@ -2813,6 +2813,10 @@ static int balance_one(struct rq *rq, struct task_struct *prev)
flush_dispatch_buf(rq);
+ if (prev_on_rq && prev->scx.slice) {
+ rq->scx.flags |= SCX_RQ_BAL_KEEP;
+ goto has_tasks;
+ }
if (rq->scx.local_dsq.nr)
goto has_tasks;
if (consume_global_dsq(rq))
@@ -2838,8 +2842,7 @@ no_tasks:
* Didn't find another task to run. Keep running @prev unless
* %SCX_OPS_ENQ_LAST is in effect.
*/
- if ((prev->scx.flags & SCX_TASK_QUEUED) &&
- (!static_branch_unlikely(&scx_ops_enq_last) ||
+ if (prev_on_rq && (!static_branch_unlikely(&scx_ops_enq_last) ||
scx_rq_bypassing(rq))) {
rq->scx.flags |= SCX_RQ_BAL_KEEP;
goto has_tasks;
@@ -3034,7 +3037,7 @@ static void put_prev_task_scx(struct rq *rq, struct task_struct *p,
*/
if (p->scx.slice && !scx_rq_bypassing(rq)) {
dispatch_enqueue(&rq->scx.local_dsq, p, SCX_ENQ_HEAD);
- return;
+ goto switch_class;
}
/*
@@ -3051,6 +3054,7 @@ static void put_prev_task_scx(struct rq *rq, struct task_struct *p,
}
}
+switch_class:
if (next && next->sched_class != &ext_sched_class)
switch_class(rq, next);
}
@@ -3586,16 +3590,8 @@ static void reset_idle_masks(void)
cpumask_copy(idle_masks.smt, cpu_online_mask);
}
-void __scx_update_idle(struct rq *rq, bool idle)
+static void update_builtin_idle(int cpu, bool idle)
{
- int cpu = cpu_of(rq);
-
- if (SCX_HAS_OP(update_idle) && !scx_rq_bypassing(rq)) {
- SCX_CALL_OP(SCX_KF_REST, update_idle, cpu_of(rq), idle);
- if (!static_branch_unlikely(&scx_builtin_idle_enabled))
- return;
- }
-
if (idle)
cpumask_set_cpu(cpu, idle_masks.cpu);
else
@@ -3622,6 +3618,57 @@ void __scx_update_idle(struct rq *rq, bool idle)
#endif
}
+/*
+ * Update the idle state of a CPU to @idle.
+ *
+ * If @do_notify is true, ops.update_idle() is invoked to notify the scx
+ * scheduler of an actual idle state transition (idle to busy or vice
+ * versa). If @do_notify is false, only the idle state in the idle masks is
+ * refreshed without invoking ops.update_idle().
+ *
+ * This distinction is necessary, because an idle CPU can be "reserved" and
+ * awakened via scx_bpf_pick_idle_cpu() + scx_bpf_kick_cpu(), marking it as
+ * busy even if no tasks are dispatched. In this case, the CPU may return
+ * to idle without a true state transition. Refreshing the idle masks
+ * without invoking ops.update_idle() ensures accurate idle state tracking
+ * while avoiding unnecessary updates and maintaining balanced state
+ * transitions.
+ */
+void __scx_update_idle(struct rq *rq, bool idle, bool do_notify)
+{
+ int cpu = cpu_of(rq);
+
+ lockdep_assert_rq_held(rq);
+
+ /*
+ * Trigger ops.update_idle() only when transitioning from a task to
+ * the idle thread and vice versa.
+ *
+ * Idle transitions are indicated by do_notify being set to true,
+ * managed by put_prev_task_idle()/set_next_task_idle().
+ */
+ if (SCX_HAS_OP(update_idle) && do_notify && !scx_rq_bypassing(rq))
+ SCX_CALL_OP(SCX_KF_REST, update_idle, cpu_of(rq), idle);
+
+ /*
+ * Update the idle masks:
+ * - for real idle transitions (do_notify == true)
+ * - for idle-to-idle transitions (indicated by the previous task
+ * being the idle thread, managed by pick_task_idle())
+ *
+ * Skip updating idle masks if the previous task is not the idle
+ * thread, since set_next_task_idle() has already handled it when
+ * transitioning from a task to the idle thread (calling this
+ * function with do_notify == true).
+ *
+ * In this way we can avoid updating the idle masks twice,
+ * unnecessarily.
+ */
+ if (static_branch_likely(&scx_builtin_idle_enabled))
+ if (do_notify || is_idle_task(rq->curr))
+ update_builtin_idle(cpu, idle);
+}
+
static void handle_hotplug(struct rq *rq, bool online)
{
int cpu = cpu_of(rq);
@@ -4744,10 +4791,9 @@ static void scx_ops_bypass(bool bypass)
*/
for_each_possible_cpu(cpu) {
struct rq *rq = cpu_rq(cpu);
- struct rq_flags rf;
struct task_struct *p, *n;
- rq_lock(rq, &rf);
+ raw_spin_rq_lock(rq);
if (bypass) {
WARN_ON_ONCE(rq->scx.flags & SCX_RQ_BYPASSING);
@@ -4763,7 +4809,7 @@ static void scx_ops_bypass(bool bypass)
* sees scx_rq_bypassing() before moving tasks to SCX.
*/
if (!scx_enabled()) {
- rq_unlock(rq, &rf);
+ raw_spin_rq_unlock(rq);
continue;
}
@@ -4783,10 +4829,11 @@ static void scx_ops_bypass(bool bypass)
sched_enq_and_set_task(&ctx);
}
- rq_unlock(rq, &rf);
-
/* resched to restore ticks and idle state */
- resched_cpu(cpu);
+ if (cpu_online(cpu) || cpu == smp_processor_id())
+ resched_curr(rq);
+
+ raw_spin_rq_unlock(rq);
}
atomic_dec(&scx_ops_breather_depth);
diff --git a/kernel/sched/ext.h b/kernel/sched/ext.h
index b1675bb59fc4..4d022d17ac7d 100644
--- a/kernel/sched/ext.h
+++ b/kernel/sched/ext.h
@@ -57,15 +57,15 @@ static inline void init_sched_ext_class(void) {}
#endif /* CONFIG_SCHED_CLASS_EXT */
#if defined(CONFIG_SCHED_CLASS_EXT) && defined(CONFIG_SMP)
-void __scx_update_idle(struct rq *rq, bool idle);
+void __scx_update_idle(struct rq *rq, bool idle, bool do_notify);
-static inline void scx_update_idle(struct rq *rq, bool idle)
+static inline void scx_update_idle(struct rq *rq, bool idle, bool do_notify)
{
if (scx_enabled())
- __scx_update_idle(rq, idle);
+ __scx_update_idle(rq, idle, do_notify);
}
#else
-static inline void scx_update_idle(struct rq *rq, bool idle) {}
+static inline void scx_update_idle(struct rq *rq, bool idle, bool do_notify) {}
#endif
#ifdef CONFIG_CGROUP_SCHED
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 3e9ca38512de..26958431deb7 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -689,21 +689,16 @@ u64 avg_vruntime(struct cfs_rq *cfs_rq)
*
* XXX could add max_slice to the augmented data to track this.
*/
-static s64 entity_lag(u64 avruntime, struct sched_entity *se)
+static void update_entity_lag(struct cfs_rq *cfs_rq, struct sched_entity *se)
{
s64 vlag, limit;
- vlag = avruntime - se->vruntime;
- limit = calc_delta_fair(max_t(u64, 2*se->slice, TICK_NSEC), se);
-
- return clamp(vlag, -limit, limit);
-}
-
-static void update_entity_lag(struct cfs_rq *cfs_rq, struct sched_entity *se)
-{
SCHED_WARN_ON(!se->on_rq);
- se->vlag = entity_lag(avg_vruntime(cfs_rq), se);
+ vlag = avg_vruntime(cfs_rq) - se->vruntime;
+ limit = calc_delta_fair(max_t(u64, 2*se->slice, TICK_NSEC), se);
+
+ se->vlag = clamp(vlag, -limit, limit);
}
/*
@@ -3774,137 +3769,32 @@ static inline void
dequeue_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se) { }
#endif
-static void reweight_eevdf(struct sched_entity *se, u64 avruntime,
- unsigned long weight)
-{
- unsigned long old_weight = se->load.weight;
- s64 vlag, vslice;
-
- /*
- * VRUNTIME
- * --------
- *
- * COROLLARY #1: The virtual runtime of the entity needs to be
- * adjusted if re-weight at !0-lag point.
- *
- * Proof: For contradiction assume this is not true, so we can
- * re-weight without changing vruntime at !0-lag point.
- *
- * Weight VRuntime Avg-VRuntime
- * before w v V
- * after w' v' V'
- *
- * Since lag needs to be preserved through re-weight:
- *
- * lag = (V - v)*w = (V'- v')*w', where v = v'
- * ==> V' = (V - v)*w/w' + v (1)
- *
- * Let W be the total weight of the entities before reweight,
- * since V' is the new weighted average of entities:
- *
- * V' = (WV + w'v - wv) / (W + w' - w) (2)
- *
- * by using (1) & (2) we obtain:
- *
- * (WV + w'v - wv) / (W + w' - w) = (V - v)*w/w' + v
- * ==> (WV-Wv+Wv+w'v-wv)/(W+w'-w) = (V - v)*w/w' + v
- * ==> (WV - Wv)/(W + w' - w) + v = (V - v)*w/w' + v
- * ==> (V - v)*W/(W + w' - w) = (V - v)*w/w' (3)
- *
- * Since we are doing at !0-lag point which means V != v, we
- * can simplify (3):
- *
- * ==> W / (W + w' - w) = w / w'
- * ==> Ww' = Ww + ww' - ww
- * ==> W * (w' - w) = w * (w' - w)
- * ==> W = w (re-weight indicates w' != w)
- *
- * So the cfs_rq contains only one entity, hence vruntime of
- * the entity @v should always equal to the cfs_rq's weighted
- * average vruntime @V, which means we will always re-weight
- * at 0-lag point, thus breach assumption. Proof completed.
- *
- *
- * COROLLARY #2: Re-weight does NOT affect weighted average
- * vruntime of all the entities.
- *
- * Proof: According to corollary #1, Eq. (1) should be:
- *
- * (V - v)*w = (V' - v')*w'
- * ==> v' = V' - (V - v)*w/w' (4)
- *
- * According to the weighted average formula, we have:
- *
- * V' = (WV - wv + w'v') / (W - w + w')
- * = (WV - wv + w'(V' - (V - v)w/w')) / (W - w + w')
- * = (WV - wv + w'V' - Vw + wv) / (W - w + w')
- * = (WV + w'V' - Vw) / (W - w + w')
- *
- * ==> V'*(W - w + w') = WV + w'V' - Vw
- * ==> V' * (W - w) = (W - w) * V (5)
- *
- * If the entity is the only one in the cfs_rq, then reweight
- * always occurs at 0-lag point, so V won't change. Or else
- * there are other entities, hence W != w, then Eq. (5) turns
- * into V' = V. So V won't change in either case, proof done.
- *
- *
- * So according to corollary #1 & #2, the effect of re-weight
- * on vruntime should be:
- *
- * v' = V' - (V - v) * w / w' (4)
- * = V - (V - v) * w / w'
- * = V - vl * w / w'
- * = V - vl'
- */
- if (avruntime != se->vruntime) {
- vlag = entity_lag(avruntime, se);
- vlag = div_s64(vlag * old_weight, weight);
- se->vruntime = avruntime - vlag;
- }
-
- /*
- * DEADLINE
- * --------
- *
- * When the weight changes, the virtual time slope changes and
- * we should adjust the relative virtual deadline accordingly.
- *
- * d' = v' + (d - v)*w/w'
- * = V' - (V - v)*w/w' + (d - v)*w/w'
- * = V - (V - v)*w/w' + (d - v)*w/w'
- * = V + (d - V)*w/w'
- */
- vslice = (s64)(se->deadline - avruntime);
- vslice = div_s64(vslice * old_weight, weight);
- se->deadline = avruntime + vslice;
-}
+static void place_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags);
static void reweight_entity(struct cfs_rq *cfs_rq, struct sched_entity *se,
unsigned long weight)
{
bool curr = cfs_rq->curr == se;
- u64 avruntime;
if (se->on_rq) {
/* commit outstanding execution time */
update_curr(cfs_rq);
- avruntime = avg_vruntime(cfs_rq);
+ update_entity_lag(cfs_rq, se);
+ se->deadline -= se->vruntime;
+ se->rel_deadline = 1;
if (!curr)
__dequeue_entity(cfs_rq, se);
update_load_sub(&cfs_rq->load, se->load.weight);
}
dequeue_load_avg(cfs_rq, se);
- if (se->on_rq) {
- reweight_eevdf(se, avruntime, weight);
- } else {
- /*
- * Because we keep se->vlag = V - v_i, while: lag_i = w_i*(V - v_i),
- * we need to scale se->vlag when w_i changes.
- */
- se->vlag = div_s64(se->vlag * se->load.weight, weight);
- }
+ /*
+ * Because we keep se->vlag = V - v_i, while: lag_i = w_i*(V - v_i),
+ * we need to scale se->vlag when w_i changes.
+ */
+ se->vlag = div_s64(se->vlag * se->load.weight, weight);
+ if (se->rel_deadline)
+ se->deadline = div_s64(se->deadline * se->load.weight, weight);
update_load_set(&se->load, weight);
@@ -3919,6 +3809,7 @@ static void reweight_entity(struct cfs_rq *cfs_rq, struct sched_entity *se,
enqueue_load_avg(cfs_rq, se);
if (se->on_rq) {
update_load_add(&cfs_rq->load, se->load.weight);
+ place_entity(cfs_rq, se, 0);
if (!curr)
__enqueue_entity(cfs_rq, se);
@@ -4065,7 +3956,11 @@ static void update_cfs_group(struct sched_entity *se)
struct cfs_rq *gcfs_rq = group_cfs_rq(se);
long shares;
- if (!gcfs_rq)
+ /*
+ * When a group becomes empty, preserve its weight. This matters for
+ * DELAY_DEQUEUE.
+ */
+ if (!gcfs_rq || !gcfs_rq->load.weight)
return;
if (throttled_hierarchy(gcfs_rq))
@@ -5359,7 +5254,7 @@ place_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
se->vruntime = vruntime - lag;
- if (sched_feat(PLACE_REL_DEADLINE) && se->rel_deadline) {
+ if (se->rel_deadline) {
se->deadline += se->vruntime;
se->rel_deadline = 0;
return;
diff --git a/kernel/sched/idle.c b/kernel/sched/idle.c
index 621696269584..2c85c86b455f 100644
--- a/kernel/sched/idle.c
+++ b/kernel/sched/idle.c
@@ -452,19 +452,20 @@ static void wakeup_preempt_idle(struct rq *rq, struct task_struct *p, int flags)
static void put_prev_task_idle(struct rq *rq, struct task_struct *prev, struct task_struct *next)
{
dl_server_update_idle_time(rq, prev);
- scx_update_idle(rq, false);
+ scx_update_idle(rq, false, true);
}
static void set_next_task_idle(struct rq *rq, struct task_struct *next, bool first)
{
update_idle_core(rq);
- scx_update_idle(rq, true);
+ scx_update_idle(rq, true, true);
schedstat_inc(rq->sched_goidle);
next->se.exec_start = rq_clock_task(rq);
}
struct task_struct *pick_task_idle(struct rq *rq)
{
+ scx_update_idle(rq, true, false);
return rq->idle;
}
diff --git a/kernel/signal.c b/kernel/signal.c
index 989b1cc9116a..a2afd54303f0 100644
--- a/kernel/signal.c
+++ b/kernel/signal.c
@@ -2007,11 +2007,22 @@ void posixtimer_send_sigqueue(struct k_itimer *tmr)
if (!list_empty(&q->list)) {
/*
- * If task group is exiting with the signal already pending,
- * wait for __exit_signal() to do its job. Otherwise if
- * ignored, it's not supposed to be queued. Try to survive.
+ * The signal was ignored and blocked. The timer
+ * expiry queued it because blocked signals are
+ * queued independent of the ignored state.
+ *
+ * The unblocking set SIGPENDING, but the signal
+ * was not yet dequeued from the pending list.
+ * So prepare_signal() sees unblocked and ignored,
+ * which ends up here. Leave it queued like a
+ * regular signal.
+ *
+ * The same happens when the task group is exiting
+ * and the signal is already queued.
+ * prepare_signal() treats SIGNAL_GROUP_EXIT as
+ * ignored independent of its queued state. This
+ * gets cleaned up in __exit_signal().
*/
- WARN_ON_ONCE(!(t->signal->flags & SIGNAL_GROUP_EXIT));
goto out;
}
@@ -2046,17 +2057,25 @@ void posixtimer_send_sigqueue(struct k_itimer *tmr)
goto out;
}
- /* This should never happen and leaks a reference count */
- if (WARN_ON_ONCE(!hlist_unhashed(&tmr->ignored_list)))
- hlist_del_init(&tmr->ignored_list);
-
if (unlikely(!list_empty(&q->list))) {
/* This holds a reference count already */
result = TRACE_SIGNAL_ALREADY_PENDING;
goto out;
}
- posixtimer_sigqueue_getref(q);
+ /*
+ * If the signal is on the ignore list, it got blocked after it was
+ * ignored earlier. But nothing lifted the ignore. Move it back to
+ * the pending list to be consistent with the regular signal
+ * handling. This already holds a reference count.
+ *
+ * If it's not on the ignore list acquire a reference count.
+ */
+ if (likely(hlist_unhashed(&tmr->ignored_list)))
+ posixtimer_sigqueue_getref(q);
+ else
+ hlist_del_init(&tmr->ignored_list);
+
posixtimer_queue_sigqueue(q, t, tmr->it_pid_type);
result = TRACE_SIGNAL_DELIVERED;
out:
diff --git a/kernel/time/hrtimer.c b/kernel/time/hrtimer.c
index 80fe3749d2db..030426c8c944 100644
--- a/kernel/time/hrtimer.c
+++ b/kernel/time/hrtimer.c
@@ -2202,6 +2202,15 @@ int hrtimers_prepare_cpu(unsigned int cpu)
}
cpu_base->cpu = cpu;
+ hrtimer_cpu_base_init_expiry_lock(cpu_base);
+ return 0;
+}
+
+int hrtimers_cpu_starting(unsigned int cpu)
+{
+ struct hrtimer_cpu_base *cpu_base = this_cpu_ptr(&hrtimer_bases);
+
+ /* Clear out any left over state from a CPU down operation */
cpu_base->active_bases = 0;
cpu_base->hres_active = 0;
cpu_base->hang_detected = 0;
@@ -2210,7 +2219,6 @@ int hrtimers_prepare_cpu(unsigned int cpu)
cpu_base->expires_next = KTIME_MAX;
cpu_base->softirq_expires_next = KTIME_MAX;
cpu_base->online = 1;
- hrtimer_cpu_base_init_expiry_lock(cpu_base);
return 0;
}
@@ -2286,5 +2294,6 @@ int hrtimers_cpu_dying(unsigned int dying_cpu)
void __init hrtimers_init(void)
{
hrtimers_prepare_cpu(smp_processor_id());
+ hrtimers_cpu_starting(smp_processor_id());
open_softirq(HRTIMER_SOFTIRQ, hrtimer_run_softirq);
}
diff --git a/kernel/time/timer_migration.c b/kernel/time/timer_migration.c
index 8d57f7686bb0..066c9ddca4ec 100644
--- a/kernel/time/timer_migration.c
+++ b/kernel/time/timer_migration.c
@@ -534,8 +534,13 @@ static void __walk_groups(up_f up, struct tmigr_walk *data,
break;
child = group;
- group = group->parent;
+ /*
+ * Pairs with the store release on group connection
+ * to make sure group initialization is visible.
+ */
+ group = READ_ONCE(group->parent);
data->childmask = child->groupmask;
+ WARN_ON_ONCE(!data->childmask);
} while (group);
}
@@ -564,7 +569,7 @@ static struct tmigr_event *tmigr_next_groupevt(struct tmigr_group *group)
while ((node = timerqueue_getnext(&group->events))) {
evt = container_of(node, struct tmigr_event, nextevt);
- if (!evt->ignore) {
+ if (!READ_ONCE(evt->ignore)) {
WRITE_ONCE(group->next_expiry, evt->nextevt.expires);
return evt;
}
@@ -660,7 +665,7 @@ static bool tmigr_active_up(struct tmigr_group *group,
* lock is held while updating the ignore flag in idle path. So this
* state change will not be lost.
*/
- group->groupevt.ignore = true;
+ WRITE_ONCE(group->groupevt.ignore, true);
return walk_done;
}
@@ -721,6 +726,7 @@ bool tmigr_update_events(struct tmigr_group *group, struct tmigr_group *child,
union tmigr_state childstate, groupstate;
bool remote = data->remote;
bool walk_done = false;
+ bool ignore;
u64 nextexp;
if (child) {
@@ -739,11 +745,19 @@ bool tmigr_update_events(struct tmigr_group *group, struct tmigr_group *child,
nextexp = child->next_expiry;
evt = &child->groupevt;
- evt->ignore = (nextexp == KTIME_MAX) ? true : false;
+ /*
+ * This can race with concurrent idle exit (activate).
+ * If the current writer wins, a useless remote expiration may
+ * be scheduled. If the activate wins, the event is properly
+ * ignored.
+ */
+ ignore = (nextexp == KTIME_MAX) ? true : false;
+ WRITE_ONCE(evt->ignore, ignore);
} else {
nextexp = data->nextexp;
first_childevt = evt = data->evt;
+ ignore = evt->ignore;
/*
* Walking the hierarchy is required in any case when a
@@ -769,7 +783,7 @@ bool tmigr_update_events(struct tmigr_group *group, struct tmigr_group *child,
* first event information of the group is updated properly and
* also handled properly, so skip this fast return path.
*/
- if (evt->ignore && !remote && group->parent)
+ if (ignore && !remote && group->parent)
return true;
raw_spin_lock(&group->lock);
@@ -783,7 +797,7 @@ bool tmigr_update_events(struct tmigr_group *group, struct tmigr_group *child,
* queue when the expiry time changed only or when it could be ignored.
*/
if (timerqueue_node_queued(&evt->nextevt)) {
- if ((evt->nextevt.expires == nextexp) && !evt->ignore) {
+ if ((evt->nextevt.expires == nextexp) && !ignore) {
/* Make sure not to miss a new CPU event with the same expiry */
evt->cpu = first_childevt->cpu;
goto check_toplvl;
@@ -793,7 +807,7 @@ bool tmigr_update_events(struct tmigr_group *group, struct tmigr_group *child,
WRITE_ONCE(group->next_expiry, KTIME_MAX);
}
- if (evt->ignore) {
+ if (ignore) {
/*
* When the next child event could be ignored (nextexp is
* KTIME_MAX) and there was no remote timer handling before or
@@ -1487,6 +1501,21 @@ static void tmigr_init_group(struct tmigr_group *group, unsigned int lvl,
s.seq = 0;
atomic_set(&group->migr_state, s.state);
+ /*
+ * If this is a new top-level, prepare its groupmask in advance.
+ * This avoids accidents where yet another new top-level is
+ * created in the future and made visible before the current groupmask.
+ */
+ if (list_empty(&tmigr_level_list[lvl])) {
+ group->groupmask = BIT(0);
+ /*
+ * The previous top level has prepared its groupmask already,
+ * simply account it as the first child.
+ */
+ if (lvl > 0)
+ group->num_children = 1;
+ }
+
timerqueue_init_head(&group->events);
timerqueue_init(&group->groupevt.nextevt);
group->groupevt.nextevt.expires = KTIME_MAX;
@@ -1550,8 +1579,25 @@ static void tmigr_connect_child_parent(struct tmigr_group *child,
raw_spin_lock_irq(&child->lock);
raw_spin_lock_nested(&parent->lock, SINGLE_DEPTH_NESTING);
- child->parent = parent;
- child->groupmask = BIT(parent->num_children++);
+ if (activate) {
+ /*
+ * @child is the old top and @parent the new one. In this
+ * case groupmask is pre-initialized and @child already
+ * accounted, along with its new sibling corresponding to the
+ * CPU going up.
+ */
+ WARN_ON_ONCE(child->groupmask != BIT(0) || parent->num_children != 2);
+ } else {
+ /* Adding @child for the CPU going up to @parent. */
+ child->groupmask = BIT(parent->num_children++);
+ }
+
+ /*
+ * Make sure parent initialization is visible before publishing it to a
+ * racing CPU entering/exiting idle. This RELEASE barrier enforces an
+ * address dependency that pairs with the READ_ONCE() in __walk_groups().
+ */
+ smp_store_release(&child->parent, parent);
raw_spin_unlock(&parent->lock);
raw_spin_unlock_irq(&child->lock);
diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index f8aebcb01e62..b6e40e8791fa 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -4122,6 +4122,7 @@ print_trace_header(struct seq_file *m, struct trace_iterator *iter)
preempt_model_none() ? "server" :
preempt_model_voluntary() ? "desktop" :
preempt_model_full() ? "preempt" :
+ preempt_model_lazy() ? "lazy" :
preempt_model_rt() ? "preempt_rt" :
"unknown",
/* These are reserved for later use */
diff --git a/kernel/trace/trace_irqsoff.c b/kernel/trace/trace_irqsoff.c
index fce064e20570..a4e799c1e767 100644
--- a/kernel/trace/trace_irqsoff.c
+++ b/kernel/trace/trace_irqsoff.c
@@ -182,6 +182,7 @@ static int irqsoff_graph_entry(struct ftrace_graph_ent *trace,
struct trace_array_cpu *data;
unsigned long flags;
unsigned int trace_ctx;
+ u64 *calltime;
int ret;
if (ftrace_graph_ignore_func(gops, trace))
@@ -199,6 +200,12 @@ static int irqsoff_graph_entry(struct ftrace_graph_ent *trace,
if (!func_prolog_dec(tr, &data, &flags))
return 0;
+ calltime = fgraph_reserve_data(gops->idx, sizeof(*calltime));
+ if (!calltime)
+ return 0;
+
+ *calltime = trace_clock_local();
+
trace_ctx = tracing_gen_ctx_flags(flags);
ret = __trace_graph_entry(tr, trace, trace_ctx);
atomic_dec(&data->disabled);
@@ -213,12 +220,19 @@ static void irqsoff_graph_return(struct ftrace_graph_ret *trace,
struct trace_array_cpu *data;
unsigned long flags;
unsigned int trace_ctx;
+ u64 *calltime;
+ int size;
ftrace_graph_addr_finish(gops, trace);
if (!func_prolog_dec(tr, &data, &flags))
return;
+ calltime = fgraph_retrieve_data(gops->idx, &size);
+ if (!calltime)
+ return;
+ trace->calltime = *calltime;
+
trace_ctx = tracing_gen_ctx_flags(flags);
__trace_graph_return(tr, trace, trace_ctx);
atomic_dec(&data->disabled);
diff --git a/kernel/trace/trace_kprobe.c b/kernel/trace/trace_kprobe.c
index 935a886af40c..0642ea174849 100644
--- a/kernel/trace/trace_kprobe.c
+++ b/kernel/trace/trace_kprobe.c
@@ -940,8 +940,10 @@ static int __trace_kprobe_create(int argc, const char *argv[])
}
/* a symbol specified */
symbol = kstrdup(argv[1], GFP_KERNEL);
- if (!symbol)
- return -ENOMEM;
+ if (!symbol) {
+ ret = -ENOMEM;
+ goto error;
+ }
tmp = strchr(symbol, '%');
if (tmp) {
diff --git a/kernel/trace/trace_sched_wakeup.c b/kernel/trace/trace_sched_wakeup.c
index d6c7f18daa15..c58292e424d5 100644
--- a/kernel/trace/trace_sched_wakeup.c
+++ b/kernel/trace/trace_sched_wakeup.c
@@ -118,6 +118,7 @@ static int wakeup_graph_entry(struct ftrace_graph_ent *trace,
struct trace_array *tr = wakeup_trace;
struct trace_array_cpu *data;
unsigned int trace_ctx;
+ u64 *calltime;
int ret = 0;
if (ftrace_graph_ignore_func(gops, trace))
@@ -135,6 +136,12 @@ static int wakeup_graph_entry(struct ftrace_graph_ent *trace,
if (!func_prolog_preempt_disable(tr, &data, &trace_ctx))
return 0;
+ calltime = fgraph_reserve_data(gops->idx, sizeof(*calltime));
+ if (!calltime)
+ return 0;
+
+ *calltime = trace_clock_local();
+
ret = __trace_graph_entry(tr, trace, trace_ctx);
atomic_dec(&data->disabled);
preempt_enable_notrace();
@@ -148,12 +155,19 @@ static void wakeup_graph_return(struct ftrace_graph_ret *trace,
struct trace_array *tr = wakeup_trace;
struct trace_array_cpu *data;
unsigned int trace_ctx;
+ u64 *calltime;
+ int size;
ftrace_graph_addr_finish(gops, trace);
if (!func_prolog_preempt_disable(tr, &data, &trace_ctx))
return;
+ calltime = fgraph_retrieve_data(gops->idx, &size);
+ if (!calltime)
+ return;
+ trace->calltime = *calltime;
+
__trace_graph_return(tr, trace, trace_ctx);
atomic_dec(&data->disabled);
diff --git a/kernel/workqueue.c b/kernel/workqueue.c
index f7d8fc204579..9362484a653c 100644
--- a/kernel/workqueue.c
+++ b/kernel/workqueue.c
@@ -2508,6 +2508,7 @@ static void __queue_delayed_work(int cpu, struct workqueue_struct *wq,
return;
}
+ WARN_ON_ONCE(cpu != WORK_CPU_UNBOUND && !cpu_online(cpu));
dwork->wq = wq;
dwork->cpu = cpu;
timer->expires = jiffies + delay;
@@ -2533,6 +2534,12 @@ static void __queue_delayed_work(int cpu, struct workqueue_struct *wq,
* @dwork: work to queue
* @delay: number of jiffies to wait before queueing
*
+ * We queue the delayed_work to a specific CPU, for non-zero delays the
+ * caller must ensure it is online and can't go away. Callers that fail
+ * to ensure this, may get @dwork->timer queued to an offlined CPU and
+ * this will prevent queueing of @dwork->work unless the offlined CPU
+ * becomes online again.
+ *
* Return: %false if @work was already on a queue, %true otherwise. If
* @delay is zero and @dwork is idle, it will be scheduled for immediate
* execution.