Diffstat (limited to 'kernel/workqueue.c')
-rw-r--r--  kernel/workqueue.c | 166
 1 file changed, 125 insertions(+), 41 deletions(-)
diff --git a/kernel/workqueue.c b/kernel/workqueue.c
index 253311af47c6..c515cff01828 100644
--- a/kernel/workqueue.c
+++ b/kernel/workqueue.c
@@ -117,6 +117,8 @@ enum wq_internal_consts {
MAYDAY_INTERVAL = HZ / 10, /* and then every 100ms */
CREATE_COOLDOWN = HZ, /* time to breath after fail */
+ RESCUER_BATCH = 16, /* max work items rescued per turn */
+
/*
* Rescue workers are used only on emergencies and shared by
* all cpus. Give MIN_NICE.
@@ -286,6 +288,7 @@ struct pool_workqueue {
struct list_head pending_node; /* LN: node on wq_node_nr_active->pending_pwqs */
struct list_head pwqs_node; /* WR: node on wq->pwqs */
struct list_head mayday_node; /* MD: node on wq->maydays */
+ struct work_struct mayday_cursor; /* L: cursor on pool->worklist */
u64 stats[PWQ_NR_STATS];
@@ -1120,6 +1123,12 @@ static struct worker *find_worker_executing_work(struct worker_pool *pool,
return NULL;
}
+static void mayday_cursor_func(struct work_struct *work)
+{
+ /* must never be executed; the cursor only marks a position */
+ BUG();
+}
+
/**
* move_linked_works - move linked works to a list
* @work: start of series of works to be scheduled
@@ -1182,6 +1191,16 @@ static bool assign_work(struct work_struct *work, struct worker *worker,
lockdep_assert_held(&pool->lock);
+ /* The cursor work should not be processed */
+ if (unlikely(work->func == mayday_cursor_func)) {
+ /* only worker_thread() can possibly take this branch */
+ WARN_ON_ONCE(worker->rescue_wq);
+ if (nextp)
+ *nextp = list_next_entry(work, entry);
+ list_del_init(&work->entry);
+ return false;
+ }
+
/*
* A single work shouldn't be executed concurrently by multiple workers.
* __queue_work() ensures that @work doesn't jump to a different pool
@@ -2976,9 +2995,8 @@ static void idle_cull_fn(struct work_struct *work)
reap_dying_workers(&cull_list);
}
-static void send_mayday(struct work_struct *work)
+static void send_mayday(struct pool_workqueue *pwq)
{
- struct pool_workqueue *pwq = get_work_pwq(work);
struct workqueue_struct *wq = pwq->wq;
lockdep_assert_held(&wq_mayday_lock);
@@ -3016,7 +3034,7 @@ static void pool_mayday_timeout(struct timer_list *t)
* rescuers.
*/
list_for_each_entry(work, &pool->worklist, entry)
- send_mayday(work);
+ send_mayday(get_work_pwq(work));
}
raw_spin_unlock(&wq_mayday_lock);
@@ -3440,22 +3458,57 @@ sleep:
static bool assign_rescuer_work(struct pool_workqueue *pwq, struct worker *rescuer)
{
struct worker_pool *pool = pwq->pool;
+ struct work_struct *cursor = &pwq->mayday_cursor;
struct work_struct *work, *n;
- /* need rescue? */
- if (!pwq->nr_active || !need_to_create_worker(pool))
+ /* have work items to rescue? */
+ if (!pwq->nr_active)
return false;
- /*
- * Slurp in all works issued via this workqueue and
- * process'em.
- */
- list_for_each_entry_safe(work, n, &pool->worklist, entry) {
- if (get_work_pwq(work) == pwq && assign_work(work, rescuer, &n))
+ /* need rescue? */
+ if (!need_to_create_worker(pool)) {
+ /*
+ * The pool has idle workers and doesn't need the rescuer, so we
+ * could simply return false here.
+ *
+ * However, the memory pressure might not be fully relieved.
+ * In a per-CPU pool with concurrency management enabled, having
+ * idle workers does not necessarily mean the pressure is gone; it
+ * may simply mean regular workers have woken up, completed their
+ * work, and gone idle again due to concurrency limits.
+ *
+ * In that case, those workers may later sleep again, the pool may
+ * run out of idle workers, and it will have to allocate new ones
+ * and wait for the mayday timer, causing unnecessary delay -
+ * especially if the memory pressure never went away.
+ *
+ * So keep rescuing this pwq while memory pressure appears to
+ * persist, using (pool->flags & POOL_MANAGER_ACTIVE) as an
+ * imprecise signal, unless other PWQs also need help.
+ */
+ if (!(pool->flags & POOL_MANAGER_ACTIVE) ||
+ !list_empty(&pwq->wq->maydays))
+ return false;
+ }
+
+ /* resume from the cursor if it is on the list, otherwise start at the head */
+ if (list_empty(&cursor->entry))
+ work = list_first_entry(&pool->worklist, struct work_struct, entry);
+ else
+ work = list_next_entry(cursor, entry);
+
+ /* find the next work item to rescue */
+ list_for_each_entry_safe_from(work, n, &pool->worklist, entry) {
+ if (get_work_pwq(work) == pwq && assign_work(work, rescuer, &n)) {
pwq->stats[PWQ_STAT_RESCUED]++;
+ /* park the cursor here so the next search resumes from this point */
+ list_move_tail(&cursor->entry, &n->entry);
+ return true;
+ }
}
- return !list_empty(&rescuer->scheduled);
+ return false;
}
/**
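The cursor mechanism in assign_rescuer_work() can be hard to picture from the diff alone. Below is a minimal, self-contained userspace C sketch of the same idea: a dummy node kept on a doubly-linked list marks where the previous scan stopped, so the next scan resumes from there instead of walking the list from the head again, and the scanner never treats the dummy node as a real item. The list helpers and names here (struct node, scan_one, and so on) are made up for the example and are not the kernel's list API.

#include <stdio.h>
#include <stddef.h>

struct node {
        struct node *prev, *next;
        int payload;
};

static void list_init(struct node *n)
{
        n->prev = n->next = n;
}

static void list_del_init(struct node *n)
{
        n->prev->next = n->next;
        n->next->prev = n->prev;
        list_init(n);
}

/* insert @n immediately before @pos */
static void list_add_before(struct node *n, struct node *pos)
{
        n->prev = pos->prev;
        n->next = pos;
        pos->prev->next = n;
        pos->prev = n;
}

/*
 * Hand out one real item per call, resuming right after the cursor left by
 * the previous call instead of rescanning from the head.
 */
static struct node *scan_one(struct node *head, struct node *cursor)
{
        struct node *start = (cursor->next == cursor) ? head->next
                                                      : cursor->next;

        for (struct node *n = start; n != head; n = n->next) {
                if (n == cursor)
                        continue;       /* guard: skip the cursor if reached */
                /* remember the position: park the cursor right after @n */
                list_del_init(cursor);
                list_add_before(cursor, n->next);
                return n;
        }
        return NULL;
}

int main(void)
{
        struct node head, cursor, items[4];

        list_init(&head);
        list_init(&cursor);

        for (int i = 0; i < 4; i++) {
                items[i].payload = i;
                list_add_before(&items[i], &head);      /* append at tail */
        }

        for (struct node *n; (n = scan_one(&head, &cursor)); )
                printf("processed item %d\n", n->payload);
        return 0;
}

In the actual patch, assign_work() performs the skip check for the cursor, and list_move_tail() parks pwq->mayday_cursor right behind the work item just handed to the rescuer.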
@@ -3512,6 +3565,7 @@ repeat:
struct pool_workqueue *pwq = list_first_entry(&wq->maydays,
struct pool_workqueue, mayday_node);
struct worker_pool *pool = pwq->pool;
+ unsigned int count = 0;
__set_current_state(TASK_RUNNING);
list_del_init(&pwq->mayday_node);
@@ -3524,31 +3578,27 @@ repeat:
WARN_ON_ONCE(!list_empty(&rescuer->scheduled));
- if (assign_rescuer_work(pwq, rescuer)) {
+ while (assign_rescuer_work(pwq, rescuer)) {
process_scheduled_works(rescuer);
/*
- * The above execution of rescued work items could
- * have created more to rescue through
- * pwq_activate_first_inactive() or chained
- * queueing. Let's put @pwq back on mayday list so
- * that such back-to-back work items, which may be
- * being used to relieve memory pressure, don't
- * incur MAYDAY_INTERVAL delay inbetween.
+ * If the per-turn work item limit is reached and other
+ * PWQs are in mayday, requeue mayday for this PWQ and
+ * let the rescuer handle the other PWQs first.
*/
- if (pwq->nr_active && need_to_create_worker(pool)) {
+ if (++count > RESCUER_BATCH && !list_empty(&pwq->wq->maydays) &&
+ pwq->nr_active && need_to_create_worker(pool)) {
raw_spin_lock(&wq_mayday_lock);
- /*
- * Queue iff somebody else hasn't queued it already.
- */
- if (list_empty(&pwq->mayday_node)) {
- get_pwq(pwq);
- list_add_tail(&pwq->mayday_node, &wq->maydays);
- }
+ send_mayday(pwq);
raw_spin_unlock(&wq_mayday_lock);
+ break;
}
}
+ /* The cursor must not be left behind once the rescuer stops watching this pwq. */
+ if (!list_empty(&pwq->mayday_cursor.entry) && list_empty(&pwq->mayday_node))
+ list_del_init(&pwq->mayday_cursor.entry);
+
/*
* Leave this pool. Notify regular workers; otherwise, we end up
* with 0 concurrency and stalling the execution.
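For illustration, here is a small, self-contained C sketch of the batching policy the rescuer loop above implements: each turn handles at most RESCUER_BATCH items from one queue, and if other queues are also waiting, the current queue is revisited later so a single busy queue cannot monopolize the rescuer. The queue representation (plain integer counters) and the helper names are invented for the example.

#include <stdio.h>
#include <stdbool.h>

#define RESCUER_BATCH 16
#define NQUEUES       3

static int pending[NQUEUES] = { 40, 5, 20 };    /* items left per queue */

static bool others_waiting(int self)
{
        for (int i = 0; i < NQUEUES; i++)
                if (i != self && pending[i] > 0)
                        return true;
        return false;
}

int main(void)
{
        bool work_left = true;

        while (work_left) {
                work_left = false;
                for (int q = 0; q < NQUEUES; q++) {
                        unsigned int count = 0;

                        while (pending[q] > 0) {
                                pending[q]--;           /* "process" one item */
                                /* limit reached and others waiting: yield */
                                if (++count > RESCUER_BATCH && others_waiting(q))
                                        break;
                        }
                        printf("queue %d: handled %u this turn, %d left\n",
                               q, count, pending[q]);
                        if (pending[q] > 0)
                                work_left = true;
                }
        }
        return 0;
}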
@@ -5167,6 +5217,19 @@ static void init_pwq(struct pool_workqueue *pwq, struct workqueue_struct *wq,
INIT_LIST_HEAD(&pwq->pwqs_node);
INIT_LIST_HEAD(&pwq->mayday_node);
kthread_init_work(&pwq->release_work, pwq_release_workfn);
+
+ /*
+ * Initialize the dummy cursor work with a valid function and work
+ * data so that get_work_pwq() works on it.
+ *
+ * The cursor work may only ever appear on pwq->pool->worklist and
+ * must not be treated as a processable work item.
+ *
+ * WORK_STRUCT_PENDING and WORK_STRUCT_INACTIVE merely make it less
+ * surprising to kernel debugging tools and reviewers.
+ */
+ INIT_WORK(&pwq->mayday_cursor, mayday_cursor_func);
+ atomic_long_set(&pwq->mayday_cursor.data, (unsigned long)pwq |
+ WORK_STRUCT_PENDING | WORK_STRUCT_PWQ | WORK_STRUCT_INACTIVE);
}
/* sync @pwq with the current state of its associated wq and link it */
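The atomic_long_set() above packs the pwq pointer together with WORK_STRUCT_* flag bits into the cursor's data word. The userspace sketch below shows the general pointer-plus-low-bits technique under the assumption that the pointed-to object is sufficiently aligned; the flag values and the struct pwq stand-in are placeholders for the example, not the kernel's actual bit layout.

#include <stdio.h>
#include <stdint.h>
#include <stdalign.h>

#define F_PENDING  (1UL << 0)
#define F_PWQ      (1UL << 1)
#define F_INACTIVE (1UL << 2)
#define FLAG_MASK  (F_PENDING | F_PWQ | F_INACTIVE)

struct pwq { alignas(8) int dummy; };   /* 8-byte alignment frees 3 low bits */

int main(void)
{
        static struct pwq my_pwq;
        unsigned long data = (unsigned long)&my_pwq |
                             F_PENDING | F_PWQ | F_INACTIVE;

        /* mask off the flag bits to recover the original pointer */
        struct pwq *back = (struct pwq *)(data & ~FLAG_MASK);

        printf("flags=%#lx, pointer round-trips: %s\n",
               data & FLAG_MASK, back == &my_pwq ? "yes" : "no");
        return 0;
}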
@@ -6959,13 +7022,16 @@ static int workqueue_apply_unbound_cpumask(const cpumask_var_t unbound_cpumask)
}
/**
- * workqueue_unbound_exclude_cpumask - Exclude given CPUs from unbound cpumask
- * @exclude_cpumask: the cpumask to be excluded from wq_unbound_cpumask
+ * workqueue_unbound_housekeeping_update - Propagate housekeeping cpumask update
+ * @hk: the new housekeeping cpumask
+ *
+ * Update the unbound workqueue cpumask on top of the new housekeeping cpumask such
+ * that the effective unbound affinity is the intersection of the new housekeeping
+ * with the requested affinity set via nohz_full=/isolcpus= or sysfs.
*
- * This function can be called from cpuset code to provide a set of isolated
- * CPUs that should be excluded from wq_unbound_cpumask.
+ * Return: 0 on success and -errno on failure.
*/
-int workqueue_unbound_exclude_cpumask(cpumask_var_t exclude_cpumask)
+int workqueue_unbound_housekeeping_update(const struct cpumask *hk)
{
cpumask_var_t cpumask;
int ret = 0;
@@ -6981,14 +7047,14 @@ int workqueue_unbound_exclude_cpumask(cpumask_var_t exclude_cpumask)
* (HK_TYPE_WQ ∩ HK_TYPE_DOMAIN) house keeping mask and rewritten
* by any subsequent write to workqueue/cpumask sysfs file.
*/
- if (!cpumask_andnot(cpumask, wq_requested_unbound_cpumask, exclude_cpumask))
+ if (!cpumask_and(cpumask, wq_requested_unbound_cpumask, hk))
cpumask_copy(cpumask, wq_requested_unbound_cpumask);
if (!cpumask_equal(cpumask, wq_unbound_cpumask))
ret = workqueue_apply_unbound_cpumask(cpumask);
/* Save the current isolated cpumask & export it via sysfs */
if (!ret)
- cpumask_copy(wq_isolated_cpumask, exclude_cpumask);
+ cpumask_andnot(wq_isolated_cpumask, cpu_possible_mask, hk);
mutex_unlock(&wq_pool_mutex);
free_cpumask_var(cpumask);
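A compact way to see the semantic change in this hunk: the caller now passes the housekeeping (non-isolated) mask itself, so the effective unbound mask becomes the intersection requested & hk rather than requested & ~exclude, and the exported isolated mask is derived as possible & ~hk. The sketch below reproduces that arithmetic with plain bitmasks standing in for cpumask_t; the example values are arbitrary.

#include <stdio.h>

int main(void)
{
        unsigned long possible  = 0xff;  /* CPUs 0-7 */
        unsigned long requested = 0x3f;  /* boot/sysfs requested unbound mask */
        unsigned long hk        = 0xf0;  /* new housekeeping (non-isolated) CPUs */

        unsigned long effective = requested & hk;
        if (!effective)                  /* fall back as the kernel code does */
                effective = requested;

        unsigned long isolated = possible & ~hk;

        printf("effective=%#lx isolated=%#lx\n", effective, isolated);
        return 0;
}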
@@ -7505,9 +7571,13 @@ static struct timer_list wq_watchdog_timer;
static unsigned long wq_watchdog_touched = INITIAL_JIFFIES;
static DEFINE_PER_CPU(unsigned long, wq_watchdog_touched_cpu) = INITIAL_JIFFIES;
-static unsigned int wq_panic_on_stall;
+static unsigned int wq_panic_on_stall = CONFIG_BOOTPARAM_WQ_STALL_PANIC;
module_param_named(panic_on_stall, wq_panic_on_stall, uint, 0644);
+static unsigned int wq_panic_on_stall_time;
+module_param_named(panic_on_stall_time, wq_panic_on_stall_time, uint, 0644);
+MODULE_PARM_DESC(panic_on_stall_time, "Panic if stall exceeds this many seconds (0=disabled)");
+
/*
* Show workers that might prevent the processing of pending work items.
* The only candidates are CPU-bound workers in the running state.
@@ -7559,14 +7629,25 @@ static void show_cpu_pools_hogs(void)
rcu_read_unlock();
}
-static void panic_on_wq_watchdog(void)
+/*
+ * Trigger a panic in either of two scenarios: when the cumulative number
+ * of stalls reaches wq_panic_on_stall, or when a single stall lasts at
+ * least wq_panic_on_stall_time seconds.
+ */
+static void panic_on_wq_watchdog(unsigned int stall_time_sec)
{
static unsigned int wq_stall;
if (wq_panic_on_stall) {
wq_stall++;
- BUG_ON(wq_stall >= wq_panic_on_stall);
+ if (wq_stall >= wq_panic_on_stall)
+ panic("workqueue: %u stall(s) exceeded threshold %u\n",
+ wq_stall, wq_panic_on_stall);
}
+
+ if (wq_panic_on_stall_time && stall_time_sec >= wq_panic_on_stall_time)
+ panic("workqueue: stall lasted %us, exceeding threshold %us\n",
+ stall_time_sec, wq_panic_on_stall_time);
}
static void wq_watchdog_reset_touched(void)
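The reworked panic_on_wq_watchdog() now has two independent triggers. The following userspace sketch mirrors that logic with abort() standing in for panic(); the variable names and threshold values are chosen for the example.

#include <stdio.h>
#include <stdlib.h>

static unsigned int panic_on_stall = 3;         /* count threshold, 0 = off */
static unsigned int panic_on_stall_time = 120;  /* seconds, 0 = off */

static void check_stall(unsigned int stall_time_sec)
{
        static unsigned int stalls;

        /* cumulative-count trigger */
        if (panic_on_stall && ++stalls >= panic_on_stall) {
                fprintf(stderr, "%u stall(s) exceeded threshold %u\n",
                        stalls, panic_on_stall);
                abort();
        }

        /* single-stall duration trigger */
        if (panic_on_stall_time && stall_time_sec >= panic_on_stall_time) {
                fprintf(stderr, "stall lasted %us, threshold %us\n",
                        stall_time_sec, panic_on_stall_time);
                abort();
        }
}

int main(void)
{
        check_stall(30);        /* first stall: below both thresholds */
        check_stall(150);       /* second stall: duration threshold trips */
        return 0;
}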
@@ -7581,10 +7662,12 @@ static void wq_watchdog_reset_touched(void)
static void wq_watchdog_timer_fn(struct timer_list *unused)
{
unsigned long thresh = READ_ONCE(wq_watchdog_thresh) * HZ;
+ unsigned int max_stall_time = 0;
bool lockup_detected = false;
bool cpu_pool_stall = false;
unsigned long now = jiffies;
struct worker_pool *pool;
+ unsigned int stall_time;
int pi;
if (!thresh)
@@ -7618,14 +7701,15 @@ static void wq_watchdog_timer_fn(struct timer_list *unused)
/* did we stall? */
if (time_after(now, ts + thresh)) {
lockup_detected = true;
+ stall_time = jiffies_to_msecs(now - pool_ts) / 1000;
+ max_stall_time = max(max_stall_time, stall_time);
if (pool->cpu >= 0 && !(pool->flags & POOL_BH)) {
pool->cpu_stall = true;
cpu_pool_stall = true;
}
pr_emerg("BUG: workqueue lockup - pool");
pr_cont_pool_info(pool);
- pr_cont(" stuck for %us!\n",
- jiffies_to_msecs(now - pool_ts) / 1000);
+ pr_cont(" stuck for %us!\n", stall_time);
}
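The stall_time/max_stall_time bookkeeping above converts the jiffies delta to whole seconds and remembers the largest value across stalled pools so it can be passed to panic_on_wq_watchdog(). A simplified, runnable illustration, with HZ and the msec conversion reduced to plain arithmetic and made-up timestamp values:

#include <stdio.h>

#define HZ 250

/* mirrors jiffies_to_msecs(delta) / 1000 */
static unsigned int jiffies_delta_to_secs(unsigned long delta)
{
        return (unsigned int)(delta * 1000UL / HZ / 1000);
}

int main(void)
{
        unsigned long now = 100000, pool_ts[] = { 92500, 70000, 99000 };
        unsigned int max_stall = 0;

        for (int i = 0; i < 3; i++) {
                unsigned int s = jiffies_delta_to_secs(now - pool_ts[i]);

                if (s > max_stall)
                        max_stall = s;
                printf("pool %d stuck for %us\n", i, s);
        }
        printf("max stall: %us\n", max_stall);
        return 0;
}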
@@ -7638,7 +7722,7 @@ static void wq_watchdog_timer_fn(struct timer_list *unused)
show_cpu_pools_hogs();
if (lockup_detected)
- panic_on_wq_watchdog();
+ panic_on_wq_watchdog(max_stall_time);
wq_watchdog_reset_touched();
mod_timer(&wq_watchdog_timer, jiffies + thresh);