summaryrefslogtreecommitdiff
path: root/kernel
diff options
context:
space:
mode:
Diffstat (limited to 'kernel')
-rw-r--r--kernel/cgroup/cgroup.c6
-rw-r--r--kernel/cgroup/cpuset.c59
-rw-r--r--kernel/fork.c3
-rw-r--r--kernel/kprobes.c8
-rw-r--r--kernel/sched/core.c79
-rw-r--r--kernel/sched/ext.c22
-rw-r--r--kernel/sched/ext_internal.h114
-rw-r--r--kernel/sched/idle.c11
-rw-r--r--kernel/time/time.c2
-rw-r--r--kernel/workqueue.c55
-rw-r--r--kernel/workqueue_internal.h1
11 files changed, 221 insertions, 139 deletions
diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c
index be1d71dda317..01fc2a93f3ef 100644
--- a/kernel/cgroup/cgroup.c
+++ b/kernel/cgroup/cgroup.c
@@ -5109,6 +5109,12 @@ repeat:
return;
task = list_entry(it->task_pos, struct task_struct, cg_list);
+ /*
+ * Hide tasks that are exiting but not yet removed. Keep zombie
+ * leaders with live threads visible.
+ */
+ if ((task->flags & PF_EXITING) && !atomic_read(&task->signal->live))
+ goto repeat;
if (it->flags & CSS_TASK_ITER_PROCS) {
/* if PROCS, skip over tasks which aren't group leaders */
diff --git a/kernel/cgroup/cpuset.c b/kernel/cgroup/cpuset.c
index e200de7c60b6..d21868455341 100644
--- a/kernel/cgroup/cpuset.c
+++ b/kernel/cgroup/cpuset.c
@@ -879,7 +879,7 @@ generate_doms:
/*
* Cgroup v2 doesn't support domain attributes, just set all of them
* to SD_ATTR_INIT. Also non-isolating partition root CPUs are a
- * subset of HK_TYPE_DOMAIN housekeeping CPUs.
+ * subset of HK_TYPE_DOMAIN_BOOT housekeeping CPUs.
*/
for (i = 0; i < ndoms; i++) {
/*
@@ -888,7 +888,7 @@ generate_doms:
*/
if (!csa || csa[i] == &top_cpuset)
cpumask_and(doms[i], top_cpuset.effective_cpus,
- housekeeping_cpumask(HK_TYPE_DOMAIN));
+ housekeeping_cpumask(HK_TYPE_DOMAIN_BOOT));
else
cpumask_copy(doms[i], csa[i]->effective_cpus);
if (dattr)
@@ -1329,17 +1329,22 @@ static bool prstate_housekeeping_conflict(int prstate, struct cpumask *new_cpus)
}
/*
- * update_hk_sched_domains - Update HK cpumasks & rebuild sched domains
+ * cpuset_update_sd_hk_unlock - Rebuild sched domains, update HK & unlock
*
- * Update housekeeping cpumasks and rebuild sched domains if necessary.
- * This should be called at the end of cpuset or hotplug actions.
+ * Update housekeeping cpumasks and rebuild sched domains if necessary and
+ * then do a cpuset_full_unlock().
+ * This should be called at the end of cpuset operation.
*/
-static void update_hk_sched_domains(void)
+static void cpuset_update_sd_hk_unlock(void)
+ __releases(&cpuset_mutex)
+ __releases(&cpuset_top_mutex)
{
+ /* force_sd_rebuild will be cleared in rebuild_sched_domains_locked() */
+ if (force_sd_rebuild)
+ rebuild_sched_domains_locked();
+
if (update_housekeeping) {
- /* Updating HK cpumasks implies rebuild sched domains */
update_housekeeping = false;
- force_sd_rebuild = true;
cpumask_copy(isolated_hk_cpus, isolated_cpus);
/*
@@ -1350,22 +1355,19 @@ static void update_hk_sched_domains(void)
mutex_unlock(&cpuset_mutex);
cpus_read_unlock();
WARN_ON_ONCE(housekeeping_update(isolated_hk_cpus));
- cpus_read_lock();
- mutex_lock(&cpuset_mutex);
+ mutex_unlock(&cpuset_top_mutex);
+ } else {
+ cpuset_full_unlock();
}
- /* force_sd_rebuild will be cleared in rebuild_sched_domains_locked() */
- if (force_sd_rebuild)
- rebuild_sched_domains_locked();
}
/*
- * Work function to invoke update_hk_sched_domains()
+ * Work function to invoke cpuset_update_sd_hk_unlock()
*/
static void hk_sd_workfn(struct work_struct *work)
{
cpuset_full_lock();
- update_hk_sched_domains();
- cpuset_full_unlock();
+ cpuset_update_sd_hk_unlock();
}
/**
@@ -3230,8 +3232,7 @@ ssize_t cpuset_write_resmask(struct kernfs_open_file *of,
free_cpuset(trialcs);
out_unlock:
- update_hk_sched_domains();
- cpuset_full_unlock();
+ cpuset_update_sd_hk_unlock();
if (of_cft(of)->private == FILE_MEMLIST)
schedule_flush_migrate_mm();
return retval ?: nbytes;
@@ -3338,8 +3339,7 @@ static ssize_t cpuset_partition_write(struct kernfs_open_file *of, char *buf,
cpuset_full_lock();
if (is_cpuset_online(cs))
retval = update_prstate(cs, val);
- update_hk_sched_domains();
- cpuset_full_unlock();
+ cpuset_update_sd_hk_unlock();
return retval ?: nbytes;
}
@@ -3513,8 +3513,7 @@ static void cpuset_css_killed(struct cgroup_subsys_state *css)
/* Reset valid partition back to member */
if (is_partition_valid(cs))
update_prstate(cs, PRS_MEMBER);
- update_hk_sched_domains();
- cpuset_full_unlock();
+ cpuset_update_sd_hk_unlock();
}
static void cpuset_css_free(struct cgroup_subsys_state *css)
@@ -3923,11 +3922,13 @@ static void cpuset_handle_hotplug(void)
rcu_read_unlock();
}
-
/*
- * Queue a work to call housekeeping_update() & rebuild_sched_domains()
- * There will be a slight delay before the HK_TYPE_DOMAIN housekeeping
- * cpumask can correctly reflect what is in isolated_cpus.
+ * rebuild_sched_domains() will always be called directly if needed
+ * to make sure that newly added or removed CPU will be reflected in
+ * the sched domains. However, if isolated partition invalidation
+ * or recreation is being done (update_housekeeping set), a work item
+ * will be queued to call housekeeping_update() to update the
+ * corresponding housekeeping cpumasks after some slight delay.
*
* We rely on WORK_STRUCT_PENDING_BIT to not requeue a work item that
* is still pending. Before the pending bit is cleared, the work data
@@ -3936,8 +3937,10 @@ static void cpuset_handle_hotplug(void)
* previously queued work. Since hk_sd_workfn() doesn't use the work
* item at all, this is not a problem.
*/
- if (update_housekeeping || force_sd_rebuild)
- queue_work(system_unbound_wq, &hk_sd_work);
+ if (force_sd_rebuild)
+ rebuild_sched_domains_cpuslocked();
+ if (update_housekeeping)
+ queue_work(system_dfl_wq, &hk_sd_work);
free_tmpmasks(ptmp);
}
diff --git a/kernel/fork.c b/kernel/fork.c
index 65113a304518..bc2bf58b93b6 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -1000,6 +1000,7 @@ static struct task_struct *dup_task_struct(struct task_struct *orig, int node)
#ifdef CONFIG_SCHED_MM_CID
tsk->mm_cid.cid = MM_CID_UNSET;
tsk->mm_cid.active = 0;
+ INIT_HLIST_NODE(&tsk->mm_cid.node);
#endif
return tsk;
@@ -1586,7 +1587,6 @@ static int copy_mm(u64 clone_flags, struct task_struct *tsk)
tsk->mm = mm;
tsk->active_mm = mm;
- sched_mm_cid_fork(tsk);
return 0;
}
@@ -2498,7 +2498,6 @@ bad_fork_cleanup_namespaces:
exit_nsproxy_namespaces(p);
bad_fork_cleanup_mm:
if (p->mm) {
- sched_mm_cid_exit(p);
mm_clear_owner(p->mm, p);
mmput(p->mm);
}
diff --git a/kernel/kprobes.c b/kernel/kprobes.c
index ab25b4aa9095..bfc89083daa9 100644
--- a/kernel/kprobes.c
+++ b/kernel/kprobes.c
@@ -1144,12 +1144,12 @@ static int __arm_kprobe_ftrace(struct kprobe *p, struct ftrace_ops *ops,
lockdep_assert_held(&kprobe_mutex);
ret = ftrace_set_filter_ip(ops, (unsigned long)p->addr, 0, 0);
- if (WARN_ONCE(ret < 0, "Failed to arm kprobe-ftrace at %pS (error %d)\n", p->addr, ret))
+ if (ret < 0)
return ret;
if (*cnt == 0) {
ret = register_ftrace_function(ops);
- if (WARN(ret < 0, "Failed to register kprobe-ftrace (error %d)\n", ret)) {
+ if (ret < 0) {
/*
* At this point, sinec ops is not registered, we should be sefe from
* registering empty filter.
@@ -1178,6 +1178,10 @@ static int __disarm_kprobe_ftrace(struct kprobe *p, struct ftrace_ops *ops,
int ret;
lockdep_assert_held(&kprobe_mutex);
+ if (unlikely(kprobe_ftrace_disabled)) {
+ /* Now ftrace is disabled forever, disarm is already done. */
+ return 0;
+ }
if (*cnt == 1) {
ret = unregister_ftrace_function(ops);
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index b7f77c165a6e..496dff740dca 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -4729,8 +4729,11 @@ void sched_cancel_fork(struct task_struct *p)
scx_cancel_fork(p);
}
+static void sched_mm_cid_fork(struct task_struct *t);
+
void sched_post_fork(struct task_struct *p)
{
+ sched_mm_cid_fork(p);
uclamp_post_fork(p);
scx_post_fork(p);
}
@@ -10617,13 +10620,10 @@ static inline void mm_cid_transit_to_cpu(struct task_struct *t, struct mm_cid_pc
}
}
-static bool mm_cid_fixup_task_to_cpu(struct task_struct *t, struct mm_struct *mm)
+static void mm_cid_fixup_task_to_cpu(struct task_struct *t, struct mm_struct *mm)
{
/* Remote access to mm::mm_cid::pcpu requires rq_lock */
guard(task_rq_lock)(t);
- /* If the task is not active it is not in the users count */
- if (!t->mm_cid.active)
- return false;
if (cid_on_task(t->mm_cid.cid)) {
/* If running on the CPU, put the CID in transit mode, otherwise drop it */
if (task_rq(t)->curr == t)
@@ -10631,69 +10631,43 @@ static bool mm_cid_fixup_task_to_cpu(struct task_struct *t, struct mm_struct *mm
else
mm_unset_cid_on_task(t);
}
- return true;
}
-static void mm_cid_do_fixup_tasks_to_cpus(struct mm_struct *mm)
+static void mm_cid_fixup_tasks_to_cpus(void)
{
- struct task_struct *p, *t;
- unsigned int users;
+ struct mm_struct *mm = current->mm;
+ struct task_struct *t;
- /*
- * This can obviously race with a concurrent affinity change, which
- * increases the number of allowed CPUs for this mm, but that does
- * not affect the mode and only changes the CID constraints. A
- * possible switch back to per task mode happens either in the
- * deferred handler function or in the next fork()/exit().
- *
- * The caller has already transferred. The newly incoming task is
- * already accounted for, but not yet visible.
- */
- users = mm->mm_cid.users - 2;
- if (!users)
- return;
+ lockdep_assert_held(&mm->mm_cid.mutex);
- guard(rcu)();
- for_other_threads(current, t) {
- if (mm_cid_fixup_task_to_cpu(t, mm))
- users--;
+ hlist_for_each_entry(t, &mm->mm_cid.user_list, mm_cid.node) {
+ /* Current has already transferred before invoking the fixup. */
+ if (t != current)
+ mm_cid_fixup_task_to_cpu(t, mm);
}
- if (!users)
- return;
-
- /* Happens only for VM_CLONE processes. */
- for_each_process_thread(p, t) {
- if (t == current || t->mm != mm)
- continue;
- if (mm_cid_fixup_task_to_cpu(t, mm)) {
- if (--users == 0)
- return;
- }
- }
-}
-
-static void mm_cid_fixup_tasks_to_cpus(void)
-{
- struct mm_struct *mm = current->mm;
-
- mm_cid_do_fixup_tasks_to_cpus(mm);
mm_cid_complete_transit(mm, MM_CID_ONCPU);
}
static bool sched_mm_cid_add_user(struct task_struct *t, struct mm_struct *mm)
{
+ lockdep_assert_held(&mm->mm_cid.lock);
+
t->mm_cid.active = 1;
+ hlist_add_head(&t->mm_cid.node, &mm->mm_cid.user_list);
mm->mm_cid.users++;
return mm_update_max_cids(mm);
}
-void sched_mm_cid_fork(struct task_struct *t)
+static void sched_mm_cid_fork(struct task_struct *t)
{
struct mm_struct *mm = t->mm;
bool percpu;
- WARN_ON_ONCE(!mm || t->mm_cid.cid != MM_CID_UNSET);
+ if (!mm)
+ return;
+
+ WARN_ON_ONCE(t->mm_cid.cid != MM_CID_UNSET);
guard(mutex)(&mm->mm_cid.mutex);
scoped_guard(raw_spinlock_irq, &mm->mm_cid.lock) {
@@ -10732,12 +10706,13 @@ void sched_mm_cid_fork(struct task_struct *t)
static bool sched_mm_cid_remove_user(struct task_struct *t)
{
+ lockdep_assert_held(&t->mm->mm_cid.lock);
+
t->mm_cid.active = 0;
- scoped_guard(preempt) {
- /* Clear the transition bit */
- t->mm_cid.cid = cid_from_transit_cid(t->mm_cid.cid);
- mm_unset_cid_on_task(t);
- }
+ /* Clear the transition bit */
+ t->mm_cid.cid = cid_from_transit_cid(t->mm_cid.cid);
+ mm_unset_cid_on_task(t);
+ hlist_del_init(&t->mm_cid.node);
t->mm->mm_cid.users--;
return mm_update_max_cids(t->mm);
}
@@ -10880,11 +10855,13 @@ void mm_init_cid(struct mm_struct *mm, struct task_struct *p)
mutex_init(&mm->mm_cid.mutex);
mm->mm_cid.irq_work = IRQ_WORK_INIT_HARD(mm_cid_irq_work);
INIT_WORK(&mm->mm_cid.work, mm_cid_work_fn);
+ INIT_HLIST_HEAD(&mm->mm_cid.user_list);
cpumask_copy(mm_cpus_allowed(mm), &p->cpus_mask);
bitmap_zero(mm_cidmask(mm), num_possible_cpus());
}
#else /* CONFIG_SCHED_MM_CID */
static inline void mm_update_cpus_allowed(struct mm_struct *mm, const struct cpumask *affmsk) { }
+static inline void sched_mm_cid_fork(struct task_struct *t) { }
#endif /* !CONFIG_SCHED_MM_CID */
static DEFINE_PER_CPU(struct sched_change_ctx, sched_change_ctx);
diff --git a/kernel/sched/ext.c b/kernel/sched/ext.c
index 1594987d637b..26a6ac2f8826 100644
--- a/kernel/sched/ext.c
+++ b/kernel/sched/ext.c
@@ -1103,7 +1103,7 @@ static void dispatch_enqueue(struct scx_sched *sch, struct scx_dispatch_q *dsq,
}
/* seq records the order tasks are queued, used by BPF DSQ iterator */
- dsq->seq++;
+ WRITE_ONCE(dsq->seq, dsq->seq + 1);
p->scx.dsq_seq = dsq->seq;
dsq_mod_nr(dsq, 1);
@@ -1470,16 +1470,15 @@ static void clr_task_runnable(struct task_struct *p, bool reset_runnable_at)
p->scx.flags |= SCX_TASK_RESET_RUNNABLE_AT;
}
-static void enqueue_task_scx(struct rq *rq, struct task_struct *p, int enq_flags)
+static void enqueue_task_scx(struct rq *rq, struct task_struct *p, int core_enq_flags)
{
struct scx_sched *sch = scx_root;
int sticky_cpu = p->scx.sticky_cpu;
+ u64 enq_flags = core_enq_flags | rq->scx.extra_enq_flags;
if (enq_flags & ENQUEUE_WAKEUP)
rq->scx.flags |= SCX_RQ_IN_WAKEUP;
- enq_flags |= rq->scx.extra_enq_flags;
-
if (sticky_cpu >= 0)
p->scx.sticky_cpu = -1;
@@ -3908,8 +3907,8 @@ static u32 bypass_lb_cpu(struct scx_sched *sch, struct rq *rq,
* consider offloading iff the total queued duration is over the
* threshold.
*/
- min_delta_us = scx_bypass_lb_intv_us / SCX_BYPASS_LB_MIN_DELTA_DIV;
- if (delta < DIV_ROUND_UP(min_delta_us, scx_slice_bypass_us))
+ min_delta_us = READ_ONCE(scx_bypass_lb_intv_us) / SCX_BYPASS_LB_MIN_DELTA_DIV;
+ if (delta < DIV_ROUND_UP(min_delta_us, READ_ONCE(scx_slice_bypass_us)))
return 0;
raw_spin_rq_lock_irq(rq);
@@ -4137,7 +4136,7 @@ static void scx_bypass(bool bypass)
WARN_ON_ONCE(scx_bypass_depth <= 0);
if (scx_bypass_depth != 1)
goto unlock;
- WRITE_ONCE(scx_slice_dfl, scx_slice_bypass_us * NSEC_PER_USEC);
+ WRITE_ONCE(scx_slice_dfl, READ_ONCE(scx_slice_bypass_us) * NSEC_PER_USEC);
bypass_timestamp = ktime_get_ns();
if (sch)
scx_add_event(sch, SCX_EV_BYPASS_ACTIVATE, 1);
@@ -5259,13 +5258,14 @@ static int scx_enable(struct sched_ext_ops *ops, struct bpf_link *link)
if (!READ_ONCE(helper)) {
mutex_lock(&helper_mutex);
if (!helper) {
- helper = kthread_run_worker(0, "scx_enable_helper");
- if (IS_ERR_OR_NULL(helper)) {
- helper = NULL;
+ struct kthread_worker *w =
+ kthread_run_worker(0, "scx_enable_helper");
+ if (IS_ERR_OR_NULL(w)) {
mutex_unlock(&helper_mutex);
return -ENOMEM;
}
- sched_set_fifo(helper->task);
+ sched_set_fifo(w->task);
+ WRITE_ONCE(helper, w);
}
mutex_unlock(&helper_mutex);
}
diff --git a/kernel/sched/ext_internal.h b/kernel/sched/ext_internal.h
index 11ebb744d893..00b450597f3e 100644
--- a/kernel/sched/ext_internal.h
+++ b/kernel/sched/ext_internal.h
@@ -1035,26 +1035,108 @@ static const char *scx_enable_state_str[] = {
};
/*
- * sched_ext_entity->ops_state
+ * Task Ownership State Machine (sched_ext_entity->ops_state)
*
- * Used to track the task ownership between the SCX core and the BPF scheduler.
- * State transitions look as follows:
+ * The sched_ext core uses this state machine to track task ownership
+ * between the SCX core and the BPF scheduler. This allows the BPF
+ * scheduler to dispatch tasks without strict ordering requirements, while
+ * the SCX core safely rejects invalid dispatches.
*
- * NONE -> QUEUEING -> QUEUED -> DISPATCHING
- * ^ | |
- * | v v
- * \-------------------------------/
+ * State Transitions
*
- * QUEUEING and DISPATCHING states can be waited upon. See wait_ops_state() call
- * sites for explanations on the conditions being waited upon and why they are
- * safe. Transitions out of them into NONE or QUEUED must store_release and the
- * waiters should load_acquire.
+ * .------------> NONE (owned by SCX core)
+ * | | ^
+ * | enqueue | | direct dispatch
+ * | v |
+ * | QUEUEING -------'
+ * | |
+ * | enqueue |
+ * | completes |
+ * | v
+ * | QUEUED (owned by BPF scheduler)
+ * | |
+ * | dispatch |
+ * | |
+ * | v
+ * | DISPATCHING
+ * | |
+ * | dispatch |
+ * | completes |
+ * `---------------'
*
- * Tracking scx_ops_state enables sched_ext core to reliably determine whether
- * any given task can be dispatched by the BPF scheduler at all times and thus
- * relaxes the requirements on the BPF scheduler. This allows the BPF scheduler
- * to try to dispatch any task anytime regardless of its state as the SCX core
- * can safely reject invalid dispatches.
+ * State Descriptions
+ *
+ * - %SCX_OPSS_NONE:
+ * Task is owned by the SCX core. It's either on a run queue, running,
+ * or being manipulated by the core scheduler. The BPF scheduler has no
+ * claim on this task.
+ *
+ * - %SCX_OPSS_QUEUEING:
+ * Transitional state while transferring a task from the SCX core to
+ * the BPF scheduler. The task's rq lock is held during this state.
+ * Since QUEUEING is both entered and exited under the rq lock, dequeue
+ * can never observe this state (it would be a BUG). When finishing a
+ * dispatch, if the task is still in %SCX_OPSS_QUEUEING the completion
+ * path busy-waits for it to leave this state (via wait_ops_state())
+ * before retrying.
+ *
+ * - %SCX_OPSS_QUEUED:
+ * Task is owned by the BPF scheduler. It's on a DSQ (dispatch queue)
+ * and the BPF scheduler is responsible for dispatching it. A QSEQ
+ * (queue sequence number) is embedded in this state to detect
+ * dispatch/dequeue races: if a task is dequeued and re-enqueued, the
+ * QSEQ changes and any in-flight dispatch operations targeting the old
+ * QSEQ are safely ignored.
+ *
+ * - %SCX_OPSS_DISPATCHING:
+ * Transitional state while transferring a task from the BPF scheduler
+ * back to the SCX core. This state indicates the BPF scheduler has
+ * selected the task for execution. When dequeue needs to take the task
+ * off a DSQ and it is still in %SCX_OPSS_DISPATCHING, the dequeue path
+ * busy-waits for it to leave this state (via wait_ops_state()) before
+ * proceeding. Exits to %SCX_OPSS_NONE when dispatch completes.
+ *
+ * Memory Ordering
+ *
+ * Transitions out of %SCX_OPSS_QUEUEING and %SCX_OPSS_DISPATCHING into
+ * %SCX_OPSS_NONE or %SCX_OPSS_QUEUED must use atomic_long_set_release()
+ * and waiters must use atomic_long_read_acquire(). This ensures proper
+ * synchronization between concurrent operations.
+ *
+ * Cross-CPU Task Migration
+ *
+ * When moving a task in the %SCX_OPSS_DISPATCHING state, we can't simply
+ * grab the target CPU's rq lock because a concurrent dequeue might be
+ * waiting on %SCX_OPSS_DISPATCHING while holding the source rq lock
+ * (deadlock).
+ *
+ * The sched_ext core uses a "lock dancing" protocol coordinated by
+ * p->scx.holding_cpu. When moving a task to a different rq:
+ *
+ * 1. Verify task can be moved (CPU affinity, migration_disabled, etc.)
+ * 2. Set p->scx.holding_cpu to the current CPU
+ * 3. Set task state to %SCX_OPSS_NONE; dequeue waits while DISPATCHING
+ * is set, so clearing DISPATCHING first prevents the circular wait
+ * (safe to lock the rq we need)
+ * 4. Unlock the current CPU's rq
+ * 5. Lock src_rq (where the task currently lives)
+ * 6. Verify p->scx.holding_cpu == current CPU, if not, dequeue won the
+ * race (dequeue clears holding_cpu to -1 when it takes the task), in
+ * this case migration is aborted
+ * 7. If src_rq == dst_rq: clear holding_cpu and enqueue directly
+ * into dst_rq's local DSQ (no lock swap needed)
+ * 8. Otherwise: call move_remote_task_to_local_dsq(), which releases
+ * src_rq, locks dst_rq, and performs the deactivate/activate
+ * migration cycle (dst_rq is held on return)
+ * 9. Unlock dst_rq and re-lock the current CPU's rq to restore
+ * the lock state expected by the caller
+ *
+ * If any verification fails, abort the migration.
+ *
+ * This state tracking allows the BPF scheduler to try to dispatch any task
+ * at any time regardless of its state. The SCX core can safely
+ * reject/ignore invalid dispatches, simplifying the BPF scheduler
+ * implementation.
*/
enum scx_ops_state {
SCX_OPSS_NONE, /* owned by the SCX core */
diff --git a/kernel/sched/idle.c b/kernel/sched/idle.c
index 3681b6ad9276..b95449165122 100644
--- a/kernel/sched/idle.c
+++ b/kernel/sched/idle.c
@@ -221,7 +221,7 @@ static void cpuidle_idle_call(void)
next_state = cpuidle_find_deepest_state(drv, dev, max_latency_ns);
call_cpuidle(drv, dev, next_state);
- } else {
+ } else if (drv->state_count > 1) {
bool stop_tick = true;
/*
@@ -239,6 +239,15 @@ static void cpuidle_idle_call(void)
* Give the governor an opportunity to reflect on the outcome
*/
cpuidle_reflect(dev, entered_state);
+ } else {
+ tick_nohz_idle_retain_tick();
+
+ /*
+ * If there is only a single idle state (or none), there is
+ * nothing meaningful for the governor to choose. Skip the
+ * governor and always use state 0.
+ */
+ call_cpuidle(drv, dev, 0);
}
exit_idle:
diff --git a/kernel/time/time.c b/kernel/time/time.c
index 36fd2313ae7e..0d832317d576 100644
--- a/kernel/time/time.c
+++ b/kernel/time/time.c
@@ -697,7 +697,7 @@ EXPORT_SYMBOL(clock_t_to_jiffies);
*
* Return: jiffies_64 value converted to 64-bit "clock_t" (CLOCKS_PER_SEC)
*/
-u64 jiffies_64_to_clock_t(u64 x)
+notrace u64 jiffies_64_to_clock_t(u64 x)
{
#if (TICK_NSEC % (NSEC_PER_SEC / USER_HZ)) == 0
# if HZ < USER_HZ
diff --git a/kernel/workqueue.c b/kernel/workqueue.c
index aeaec79bc09c..b77119d71641 100644
--- a/kernel/workqueue.c
+++ b/kernel/workqueue.c
@@ -190,7 +190,7 @@ struct worker_pool {
int id; /* I: pool ID */
unsigned int flags; /* L: flags */
- unsigned long watchdog_ts; /* L: watchdog timestamp */
+ unsigned long last_progress_ts; /* L: last forward progress timestamp */
bool cpu_stall; /* WD: stalled cpu bound pool */
/*
@@ -1697,7 +1697,7 @@ static void __pwq_activate_work(struct pool_workqueue *pwq,
WARN_ON_ONCE(!(*wdb & WORK_STRUCT_INACTIVE));
trace_workqueue_activate_work(work);
if (list_empty(&pwq->pool->worklist))
- pwq->pool->watchdog_ts = jiffies;
+ pwq->pool->last_progress_ts = jiffies;
move_linked_works(work, &pwq->pool->worklist, NULL);
__clear_bit(WORK_STRUCT_INACTIVE_BIT, wdb);
}
@@ -2348,7 +2348,7 @@ retry:
*/
if (list_empty(&pwq->inactive_works) && pwq_tryinc_nr_active(pwq, false)) {
if (list_empty(&pool->worklist))
- pool->watchdog_ts = jiffies;
+ pool->last_progress_ts = jiffies;
trace_workqueue_activate_work(work);
insert_work(pwq, work, &pool->worklist, work_flags);
@@ -3204,6 +3204,7 @@ __acquires(&pool->lock)
worker->current_pwq = pwq;
if (worker->task)
worker->current_at = worker->task->se.sum_exec_runtime;
+ worker->current_start = jiffies;
work_data = *work_data_bits(work);
worker->current_color = get_work_color(work_data);
@@ -3352,7 +3353,7 @@ static void process_scheduled_works(struct worker *worker)
while ((work = list_first_entry_or_null(&worker->scheduled,
struct work_struct, entry))) {
if (first) {
- worker->pool->watchdog_ts = jiffies;
+ worker->pool->last_progress_ts = jiffies;
first = false;
}
process_one_work(worker, work);
@@ -4850,7 +4851,7 @@ static int init_worker_pool(struct worker_pool *pool)
pool->cpu = -1;
pool->node = NUMA_NO_NODE;
pool->flags |= POOL_DISASSOCIATED;
- pool->watchdog_ts = jiffies;
+ pool->last_progress_ts = jiffies;
INIT_LIST_HEAD(&pool->worklist);
INIT_LIST_HEAD(&pool->idle_list);
hash_init(pool->busy_hash);
@@ -6274,7 +6275,7 @@ static void pr_cont_worker_id(struct worker *worker)
{
struct worker_pool *pool = worker->pool;
- if (pool->flags & WQ_BH)
+ if (pool->flags & POOL_BH)
pr_cont("bh%s",
pool->attrs->nice == HIGHPRI_NICE_LEVEL ? "-hi" : "");
else
@@ -6359,6 +6360,8 @@ static void show_pwq(struct pool_workqueue *pwq)
pr_cont(" %s", comma ? "," : "");
pr_cont_worker_id(worker);
pr_cont(":%ps", worker->current_func);
+ pr_cont(" for %us",
+ jiffies_to_msecs(jiffies - worker->current_start) / 1000);
list_for_each_entry(work, &worker->scheduled, entry)
pr_cont_work(false, work, &pcws);
pr_cont_work_flush(comma, (work_func_t)-1L, &pcws);
@@ -6462,7 +6465,7 @@ static void show_one_worker_pool(struct worker_pool *pool)
/* How long the first pending work is waiting for a worker. */
if (!list_empty(&pool->worklist))
- hung = jiffies_to_msecs(jiffies - pool->watchdog_ts) / 1000;
+ hung = jiffies_to_msecs(jiffies - pool->last_progress_ts) / 1000;
/*
* Defer printing to avoid deadlocks in console drivers that
@@ -7580,11 +7583,11 @@ MODULE_PARM_DESC(panic_on_stall_time, "Panic if stall exceeds this many seconds
/*
* Show workers that might prevent the processing of pending work items.
- * The only candidates are CPU-bound workers in the running state.
- * Pending work items should be handled by another idle worker
- * in all other situations.
+ * A busy worker that is not running on the CPU (e.g. sleeping in
+ * wait_event_idle() with PF_WQ_WORKER cleared) can stall the pool just as
+ * effectively as a CPU-bound one, so dump every in-flight worker.
*/
-static void show_cpu_pool_hog(struct worker_pool *pool)
+static void show_cpu_pool_busy_workers(struct worker_pool *pool)
{
struct worker *worker;
unsigned long irq_flags;
@@ -7593,36 +7596,34 @@ static void show_cpu_pool_hog(struct worker_pool *pool)
raw_spin_lock_irqsave(&pool->lock, irq_flags);
hash_for_each(pool->busy_hash, bkt, worker, hentry) {
- if (task_is_running(worker->task)) {
- /*
- * Defer printing to avoid deadlocks in console
- * drivers that queue work while holding locks
- * also taken in their write paths.
- */
- printk_deferred_enter();
+ /*
+ * Defer printing to avoid deadlocks in console
+ * drivers that queue work while holding locks
+ * also taken in their write paths.
+ */
+ printk_deferred_enter();
- pr_info("pool %d:\n", pool->id);
- sched_show_task(worker->task);
+ pr_info("pool %d:\n", pool->id);
+ sched_show_task(worker->task);
- printk_deferred_exit();
- }
+ printk_deferred_exit();
}
raw_spin_unlock_irqrestore(&pool->lock, irq_flags);
}
-static void show_cpu_pools_hogs(void)
+static void show_cpu_pools_busy_workers(void)
{
struct worker_pool *pool;
int pi;
- pr_info("Showing backtraces of running workers in stalled CPU-bound worker pools:\n");
+ pr_info("Showing backtraces of busy workers in stalled worker pools:\n");
rcu_read_lock();
for_each_pool(pool, pi) {
if (pool->cpu_stall)
- show_cpu_pool_hog(pool);
+ show_cpu_pool_busy_workers(pool);
}
@@ -7691,7 +7692,7 @@ static void wq_watchdog_timer_fn(struct timer_list *unused)
touched = READ_ONCE(per_cpu(wq_watchdog_touched_cpu, pool->cpu));
else
touched = READ_ONCE(wq_watchdog_touched);
- pool_ts = READ_ONCE(pool->watchdog_ts);
+ pool_ts = READ_ONCE(pool->last_progress_ts);
if (time_after(pool_ts, touched))
ts = pool_ts;
@@ -7719,7 +7720,7 @@ static void wq_watchdog_timer_fn(struct timer_list *unused)
show_all_workqueues();
if (cpu_pool_stall)
- show_cpu_pools_hogs();
+ show_cpu_pools_busy_workers();
if (lockup_detected)
panic_on_wq_watchdog(max_stall_time);
diff --git a/kernel/workqueue_internal.h b/kernel/workqueue_internal.h
index f6275944ada7..8def1ddc5a1b 100644
--- a/kernel/workqueue_internal.h
+++ b/kernel/workqueue_internal.h
@@ -32,6 +32,7 @@ struct worker {
work_func_t current_func; /* K: function */
struct pool_workqueue *current_pwq; /* K: pwq */
u64 current_at; /* K: runtime at start or last wakeup */
+ unsigned long current_start; /* K: start time of current work item */
unsigned int current_color; /* K: color */
int sleeping; /* S: is worker sleeping? */