From 5c0338c68706be53b3dc472e4308961c36e4ece1 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Tue, 18 Jul 2017 18:41:52 -0400 Subject: workqueue: restore WQ_UNBOUND/max_active==1 to be ordered
The combination of WQ_UNBOUND and max_active == 1 used to imply ordered execution. After NUMA affinity was introduced by 4c16bd327c74 ("workqueue: implement NUMA affinity for unbound workqueues"), this is no longer true due to per-node worker pools. While the right way to create an ordered workqueue is alloc_ordered_workqueue(), the documentation has been misleading for a long time and people do use WQ_UNBOUND and max_active == 1 for ordered workqueues, which can lead to subtle bugs that are very difficult to trigger. It's unlikely that we'd see a noticeable performance impact by enforcing ordering on WQ_UNBOUND / max_active == 1 workqueues. Let's automatically set __WQ_ORDERED for those workqueues.
Signed-off-by: Tejun Heo Reported-by: Christoph Hellwig Reported-by: Alexei Potashnik Fixes: 4c16bd327c74 ("workqueue: implement NUMA affinity for unbound workqueues") Cc: stable@vger.kernel.org # v3.10+ --- kernel/workqueue.c | 10 ++++++++++ 1 file changed, 10 insertions(+) (limited to 'kernel/workqueue.c') diff --git a/kernel/workqueue.c b/kernel/workqueue.c index a86688fabc55..abe4a4971c24 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -3929,6 +3929,16 @@ struct workqueue_struct *__alloc_workqueue_key(const char *fmt, struct workqueue_struct *wq; struct pool_workqueue *pwq; + /* + * Unbound && max_active == 1 used to imply ordered, which is no + * longer the case on NUMA machines due to per-node pools. While + * alloc_ordered_workqueue() is the right way to create an ordered + * workqueue, keep the previous behavior to avoid subtle breakages + * on NUMA. + */ + if ((flags & WQ_UNBOUND) && max_active == 1) + flags |= __WQ_ORDERED; + /* see the comment above the definition of WQ_POWER_EFFICIENT */ if ((flags & WQ_POWER_EFFICIENT) && wq_power_efficient) flags |= WQ_UNBOUND; -- cgit v1.2.3
From 0a94efb5acbb6980d7c9ab604372d93cd507e4d8 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Sun, 23 Jul 2017 08:36:15 -0400 Subject: workqueue: implicit ordered attribute should be overridable
5c0338c68706 ("workqueue: restore WQ_UNBOUND/max_active==1 to be ordered") automatically enabled the ordered attribute for unbound workqueues w/ max_active == 1. Because ordered workqueues reject max_active and some attribute changes, this implicit ordered mode broke cases where the user creates an unbound workqueue w/ max_active == 1 and later explicitly changes the related attributes. This patch distinguishes the explicit and implicit ordered settings and lets attribute changes override the ordered mode when it was set implicitly.
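To make the semantics concrete, here is a minimal, hypothetical sketch (queue name and work function invented for illustration) of the two spellings these patches are concerned with: the intended alloc_ordered_workqueue() API versus the legacy WQ_UNBOUND/max_active==1 form that is kept behaving as ordered.

#include <linux/workqueue.h>

static void frob_fn(struct work_struct *work)	/* hypothetical work item */
{
	/* items on an ordered workqueue run strictly one at a time, in order */
}
static DECLARE_WORK(frob_work, frob_fn);

static int frob_init(void)
{
	/* preferred: explicitly ordered, max_active is forced to 1 internally */
	struct workqueue_struct *wq = alloc_ordered_workqueue("frob", 0);

	/*
	 * legacy spelling many callers relied on for ordering; after
	 * 5c0338c68706 it implicitly gets __WQ_ORDERED as well:
	 * wq = alloc_workqueue("frob", WQ_UNBOUND, 1);
	 */
	if (!wq)
		return -ENOMEM;

	queue_work(wq, &frob_work);
	return 0;
}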
Signed-off-by: Tejun Heo Fixes: 5c0338c68706 ("workqueue: restore WQ_UNBOUND/max_active==1 to be ordered") --- kernel/workqueue.c | 13 +++++++++---- 1 file changed, 9 insertions(+), 4 deletions(-) (limited to 'kernel/workqueue.c') diff --git a/kernel/workqueue.c b/kernel/workqueue.c index abe4a4971c24..7146ea70a62d 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -3744,8 +3744,12 @@ static int apply_workqueue_attrs_locked(struct workqueue_struct *wq, return -EINVAL; /* creating multiple pwqs breaks ordering guarantee */ - if (WARN_ON((wq->flags & __WQ_ORDERED) && !list_empty(&wq->pwqs))) - return -EINVAL; + if (!list_empty(&wq->pwqs)) { + if (WARN_ON(wq->flags & __WQ_ORDERED_EXPLICIT)) + return -EINVAL; + + wq->flags &= ~__WQ_ORDERED; + } ctx = apply_wqattrs_prepare(wq, attrs); if (!ctx) @@ -4129,13 +4133,14 @@ void workqueue_set_max_active(struct workqueue_struct *wq, int max_active) struct pool_workqueue *pwq; /* disallow meddling with max_active for ordered workqueues */ - if (WARN_ON(wq->flags & __WQ_ORDERED)) + if (WARN_ON(wq->flags & __WQ_ORDERED_EXPLICIT)) return; max_active = wq_clamp_max_active(max_active, wq->flags, wq->name); mutex_lock(&wq->mutex); + wq->flags &= ~__WQ_ORDERED; wq->saved_max_active = max_active; for_each_pwq(pwq, wq) @@ -5263,7 +5268,7 @@ int workqueue_sysfs_register(struct workqueue_struct *wq) * attributes breaks ordering guarantee. Disallow exposing ordered * workqueues. */ - if (WARN_ON(wq->flags & __WQ_ORDERED)) + if (WARN_ON(wq->flags & __WQ_ORDERED_EXPLICIT)) return -EINVAL; wq->wq_dev = wq_dev = kzalloc(sizeof(*wq_dev), GFP_KERNEL); -- cgit v1.2.3 From 1ad0f0a7aa1bf3bd42dcd108a96713d255eacd9f Mon Sep 17 00:00:00 2001 From: Michael Bringmann Date: Thu, 27 Jul 2017 16:27:14 -0500 Subject: workqueue: Work around edge cases for calc of pool's cpumask There is an underlying assumption/trade-off in many layers of the Linux system that CPU <-> node mapping is static. This is despite the presence of features like NUMA and 'hotplug' that support the dynamic addition/ removal of fundamental system resources like CPUs and memory. PowerPC systems, however, do provide extensive features for the dynamic change of resources available to a system. Currently, there is little or no synchronization protection around the updating of the CPU <-> node mapping, and the export/update of this information for other layers / modules. In systems which can change this mapping during 'hotplug', like PowerPC, the information is changing underneath all layers that might reference it. This patch attempts to ensure that a valid, usable cpumask attribute is used by the workqueue infrastructure when setting up new resource pools. It prevents a crash that has been observed when an 'empty' cpumask is passed along to the worker/task scheduling code. It is intended as a temporary workaround until a more fundamental review and correction of the issue can be done. 
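The guard it adds boils down to the pattern sketched below (function and variable names here are illustrative, not the kernel's): compute the intersection, and if it comes out empty, warn once and let the caller fall back to the default cpumask rather than handing the scheduler an empty mask.

#include <linux/cpumask.h>
#include <linux/printk.h>

/*
 * Sketch of the defensive check: @requested is the attrs cpumask,
 * @node_possible the CPUs believed to belong to the node.  If a
 * hotplug-driven CPU<->node remap leaves no overlap, refuse the
 * per-node mask instead of using it.
 */
static bool node_cpumask_usable(const struct cpumask *requested,
				const struct cpumask *node_possible,
				struct cpumask *result)
{
	cpumask_and(result, requested, node_possible);
	if (cpumask_empty(result)) {
		pr_warn_once("workqueue: empty node cpumask, falling back to default\n");
		return false;	/* caller should use the default pwq */
	}
	return true;
}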
[With additions to the patch provided by Tejun Hao ] Signed-off-by: Michael Bringmann Signed-off-by: Tejun Heo --- kernel/workqueue.c | 7 +++++++ 1 file changed, 7 insertions(+) (limited to 'kernel/workqueue.c') diff --git a/kernel/workqueue.c b/kernel/workqueue.c index 7146ea70a62d..ca937b0c3a96 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -3577,6 +3577,13 @@ static bool wq_calc_node_cpumask(const struct workqueue_attrs *attrs, int node, /* yeap, return possible CPUs in @node that @attrs wants */ cpumask_and(cpumask, attrs->cpumask, wq_numa_possible_cpumask[node]); + + if (cpumask_empty(cpumask)) { + pr_warn_once("WARNING: workqueue cpumask: online intersect > " + "possible intersect\n"); + return false; + } + return !cpumask_equal(cpumask, attrs->cpumask); use_dfl: -- cgit v1.2.3 From 9a2614916ac564d6ea1d0a5cb986298bc508c3bf Mon Sep 17 00:00:00 2001 From: Benjamin Peterson Date: Sun, 6 Aug 2017 19:33:22 -0700 Subject: workqueue: fix path to documentation Signed-off-by: Benjamin Peterson Signed-off-by: Tejun Heo --- kernel/workqueue.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel/workqueue.c') diff --git a/kernel/workqueue.c b/kernel/workqueue.c index a86688fabc55..4fa6c7650f09 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -21,7 +21,7 @@ * pools for workqueues which are not bound to any specific CPU - the * number of these backing pools is dynamic. * - * Please read Documentation/workqueue.txt for details. + * Please read Documentation/core-api/workqueue.rst for details. */ #include -- cgit v1.2.3 From b09be676e0ff25bd6d2e7637e26d349f9109ad75 Mon Sep 17 00:00:00 2001 From: Byungchul Park Date: Mon, 7 Aug 2017 16:12:52 +0900 Subject: locking/lockdep: Implement the 'crossrelease' feature Lockdep is a runtime locking correctness validator that detects and reports a deadlock or its possibility by checking dependencies between locks. It's useful since it does not report just an actual deadlock but also the possibility of a deadlock that has not actually happened yet. That enables problems to be fixed before they affect real systems. However, this facility is only applicable to typical locks, such as spinlocks and mutexes, which are normally released within the context in which they were acquired. However, synchronization primitives like page locks or completions, which are allowed to be released in any context, also create dependencies and can cause a deadlock. So lockdep should track these locks to do a better job. The 'crossrelease' implementation makes these primitives also be tracked. Signed-off-by: Byungchul Park Signed-off-by: Peter Zijlstra (Intel) Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: akpm@linux-foundation.org Cc: boqun.feng@gmail.com Cc: kernel-team@lge.com Cc: kirill@shutemov.name Cc: npiggin@gmail.com Cc: walken@google.com Cc: willy@infradead.org Link: http://lkml.kernel.org/r/1502089981-21272-6-git-send-email-byungchul.park@lge.com Signed-off-by: Ingo Molnar --- kernel/workqueue.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'kernel/workqueue.c') diff --git a/kernel/workqueue.c b/kernel/workqueue.c index ca937b0c3a96..e86733a8b344 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -2093,6 +2093,7 @@ __acquires(&pool->lock) lock_map_acquire_read(&pwq->wq->lockdep_map); lock_map_acquire(&lockdep_map); + crossrelease_hist_start(XHLOCK_PROC); trace_workqueue_execute_start(work); worker->current_func(work); /* @@ -2100,6 +2101,7 @@ __acquires(&pool->lock) * point will only record its address. 
*/ trace_workqueue_execute_end(work); + crossrelease_hist_end(XHLOCK_PROC); lock_map_release(&lockdep_map); lock_map_release(&pwq->wq->lockdep_map); -- cgit v1.2.3
From 52fa5bc5cbba089f09bc2c372e3432f3f3e48051 Mon Sep 17 00:00:00 2001 From: Boqun Feng Date: Thu, 17 Aug 2017 17:46:12 +0800 Subject: locking/lockdep: Explicitly initialize wq_barrier::done::map
With the new lockdep crossrelease feature, which checks completions usage, a false positive is reported in the workqueue code: > Worker A : acquired of wfc.work -> wait for cpu_hotplug_lock to be released > Task B : acquired of cpu_hotplug_lock -> wait for lock#3 to be released > Task C : acquired of lock#3 -> wait for completion of barr->done > (Task C is in lru_add_drain_all_cpuslocked()) > Worker D : wait for wfc.work to be released -> will complete barr->done Such a deadlock cannot happen because Task C's barr->done and Worker D's barr->done cannot be the same instance. The reason for this false positive is that we initialize every wq_barrier::done in insert_wq_barrier() via init_completion(), which makes them all belong to the same lock class; therefore, impossible circles are reported. To fix this, explicitly initialize the lockdep map for wq_barrier::done in insert_wq_barrier(), so that the lock class key of wq_barrier::done is a subkey of the corresponding work_struct. As a result we won't build a dependency between a wq_barrier and an unrelated work, and we can distinguish wq barriers based on their related works, so the false positive above is avoided. Also define an empty lockdep_init_map_crosslock() for !CROSSRELEASE to keep the code simple and free of unnecessary #ifdefs.
Reported-by: Ingo Molnar Signed-off-by: Boqun Feng Cc: Byungchul Park Cc: Lai Jiangshan Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Tejun Heo Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/20170817094622.12915-1-boqun.feng@gmail.com Signed-off-by: Ingo Molnar --- kernel/workqueue.c | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) (limited to 'kernel/workqueue.c') diff --git a/kernel/workqueue.c b/kernel/workqueue.c index e86733a8b344..f128b3becfe1 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -2476,7 +2476,16 @@ static void insert_wq_barrier(struct pool_workqueue *pwq, */ INIT_WORK_ONSTACK(&barr->work, wq_barrier_func); __set_bit(WORK_STRUCT_PENDING_BIT, work_data_bits(&barr->work)); - init_completion(&barr->done); + + /* + * Explicitly init the crosslock for wq_barrier::done, make its lock + * key a subkey of the corresponding work. As a result we won't + * build a dependency between wq_barrier::done and unrelated work. + */ + lockdep_init_map_crosslock((struct lockdep_map *)&barr->done.map, + "(complete)wq_barr::done", + target->lockdep_map.key, 1); + __init_completion(&barr->done); barr->task = current; /* -- cgit v1.2.3
From c5a94a618e7ac86b20f53d947f68d7cee6a4c6bc Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Wed, 23 Aug 2017 13:58:44 +0200 Subject: workqueue: Use TASK_IDLE
Workqueues don't use signals; they (ab)use TASK_INTERRUPTIBLE to avoid increasing the loadavg numbers. We 'recently' introduced TASK_IDLE for exactly this case in 80ed87c8a9ca ("sched/wait: Introduce TASK_NOLOAD and TASK_IDLE"); use it. 
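The idiom it converts to, sketched on a hypothetical kthread loop (TASK_IDLE is TASK_UNINTERRUPTIBLE | TASK_NOLOAD, so the sleeper neither reacts to signals nor counts toward the load average):

#include <linux/kthread.h>
#include <linux/sched.h>

/* Hypothetical worker loop showing the TASK_IDLE sleep idiom. */
static int idle_waiter(void *unused)
{
	while (!kthread_should_stop()) {
		/*
		 * Unlike TASK_INTERRUPTIBLE this ignores signals, and unlike
		 * plain TASK_UNINTERRUPTIBLE it does not bump loadavg.
		 */
		set_current_state(TASK_IDLE);
		if (kthread_should_stop()) {
			__set_current_state(TASK_RUNNING);
			break;
		}
		schedule();	/* woken by wake_up_process() from elsewhere */
	}
	return 0;
}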
Signed-off-by: Peter Zijlstra (Intel) Signed-off-by: Tejun Heo --- kernel/workqueue.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel/workqueue.c') diff --git a/kernel/workqueue.c b/kernel/workqueue.c index 4fa6c7650f09..2d278b9a5469 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -2247,7 +2247,7 @@ sleep: * event. */ worker_enter_idle(worker); - __set_current_state(TASK_INTERRUPTIBLE); + __set_current_state(TASK_IDLE); spin_unlock_irq(&pool->lock); schedule(); goto woke_up; @@ -2289,7 +2289,7 @@ static int rescuer_thread(void *__rescuer) */ rescuer->task->flags |= PF_WQ_WORKER; repeat: - set_current_state(TASK_INTERRUPTIBLE); + set_current_state(TASK_IDLE); /* * By the time the rescuer is requested to stop, the workqueue -- cgit v1.2.3 From a1d14934ea4b9db816a8dbfeab1c3e7204a0d871 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Wed, 23 Aug 2017 12:52:32 +0200 Subject: workqueue/lockdep: 'Fix' flush_work() annotation The flush_work() annotation as introduced by commit: e159489baa71 ("workqueue: relax lockdep annotation on flush_work()") hits on the lockdep problem with recursive read locks. The situation as described is: Work W1: Work W2: Task: ARR(Q) ARR(Q) flush_workqueue(Q) A(W1) A(W2) A(Q) flush_work(W2) R(Q) A(W2) R(W2) if (special) A(Q) else ARR(Q) R(Q) where: A - acquire, ARR - acquire-read-recursive, R - release. Where under 'special' conditions we want to trigger a lock recursion deadlock, but otherwise allow the flush_work(). The allowing is done by using recursive read locks (ARR), but lockdep is broken for recursive stuff. However, there appears to be no need to acquire the lock if we're not 'special', so if we remove the 'else' clause things become much simpler and no longer need the recursion thing at all. Signed-off-by: Peter Zijlstra (Intel) Acked-by: Tejun Heo Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: boqun.feng@gmail.com Cc: byungchul.park@lge.com Cc: david@fromorbit.com Cc: johannes@sipsolutions.net Cc: oleg@redhat.com Signed-off-by: Ingo Molnar --- kernel/workqueue.c | 20 +++++++++++--------- 1 file changed, 11 insertions(+), 9 deletions(-) (limited to 'kernel/workqueue.c') diff --git a/kernel/workqueue.c b/kernel/workqueue.c index f128b3becfe1..8ad214dc15a9 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -2091,7 +2091,7 @@ __acquires(&pool->lock) spin_unlock_irq(&pool->lock); - lock_map_acquire_read(&pwq->wq->lockdep_map); + lock_map_acquire(&pwq->wq->lockdep_map); lock_map_acquire(&lockdep_map); crossrelease_hist_start(XHLOCK_PROC); trace_workqueue_execute_start(work); @@ -2826,16 +2826,18 @@ static bool start_flush_work(struct work_struct *work, struct wq_barrier *barr) spin_unlock_irq(&pool->lock); /* - * If @max_active is 1 or rescuer is in use, flushing another work - * item on the same workqueue may lead to deadlock. Make sure the - * flusher is not running on the same workqueue by verifying write - * access. + * Force a lock recursion deadlock when using flush_work() inside a + * single-threaded or rescuer equipped workqueue. + * + * For single threaded workqueues the deadlock happens when the work + * is after the work issuing the flush_work(). For rescuer equipped + * workqueues the deadlock happens when the rescuer stalls, blocking + * forward progress. 
*/ - if (pwq->wq->saved_max_active == 1 || pwq->wq->rescuer) + if (pwq->wq->saved_max_active == 1 || pwq->wq->rescuer) { lock_map_acquire(&pwq->wq->lockdep_map); - else - lock_map_acquire_read(&pwq->wq->lockdep_map); - lock_map_release(&pwq->wq->lockdep_map); + lock_map_release(&pwq->wq->lockdep_map); + } return true; already_gone: -- cgit v1.2.3 From e6f3faa734a00c606b7b06c6b9f15e5627d3245b Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Wed, 23 Aug 2017 13:23:30 +0200 Subject: locking/lockdep: Fix workqueue crossrelease annotation The new completion/crossrelease annotations interact unfavourable with the extant flush_work()/flush_workqueue() annotations. The problem is that when a single work class does: wait_for_completion(&C) and complete(&C) in different executions, we'll build dependencies like: lock_map_acquire(W) complete_acquire(C) and lock_map_acquire(W) complete_release(C) which results in the dependency chain: W->C->W, which lockdep thinks spells deadlock, even though there is no deadlock potential since works are ran concurrently. One possibility would be to change the work 'lock' to recursive-read, but that would mean hitting a lockdep limitation on recursive locks. Also, unconditinoally switching to recursive-read here would fail to detect the actual deadlock on single-threaded workqueues, which do have a problem with this. For now, forcefully disregard these locks for crossrelease. Signed-off-by: Peter Zijlstra (Intel) Acked-by: Tejun Heo Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: boqun.feng@gmail.com Cc: byungchul.park@lge.com Cc: david@fromorbit.com Cc: johannes@sipsolutions.net Cc: oleg@redhat.com Signed-off-by: Ingo Molnar --- kernel/workqueue.c | 23 ++++++++++++++++++++++- 1 file changed, 22 insertions(+), 1 deletion(-) (limited to 'kernel/workqueue.c') diff --git a/kernel/workqueue.c b/kernel/workqueue.c index 8ad214dc15a9..c0331891dec1 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -2093,7 +2093,28 @@ __acquires(&pool->lock) lock_map_acquire(&pwq->wq->lockdep_map); lock_map_acquire(&lockdep_map); - crossrelease_hist_start(XHLOCK_PROC); + /* + * Strictly speaking we should do start(PROC) without holding any + * locks, that is, before these two lock_map_acquire()'s. + * + * However, that would result in: + * + * A(W1) + * WFC(C) + * A(W1) + * C(C) + * + * Which would create W1->C->W1 dependencies, even though there is no + * actual deadlock possible. There are two solutions, using a + * read-recursive acquire on the work(queue) 'locks', but this will then + * hit the lockdep limitation on recursive locks, or simly discard + * these locks. + * + * AFAICT there is no possible deadlock scenario between the + * flush_work() and complete() primitives (except for single-threaded + * workqueues), so hiding them isn't a problem. + */ + crossrelease_hist_start(XHLOCK_PROC, true); trace_workqueue_execute_start(work); worker->current_func(work); /* -- cgit v1.2.3 From f52be5708076b75a045ac52c6fef3fffb8300525 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Tue, 29 Aug 2017 10:59:39 +0200 Subject: locking/lockdep: Untangle xhlock history save/restore from task independence Where XHLOCK_{SOFT,HARD} are save/restore points in the xhlocks[] to ensure the temporal IRQ events don't interact with task state, the XHLOCK_PROC is a fundament different beast that just happens to share the interface. The purpose of XHLOCK_PROC is to annotate independent execution inside one task. 
For example workqueues, each work should appear to run in its own 'pristine' 'task'. Remove XHLOCK_PROC in favour of its own interface to avoid confusion. Signed-off-by: Peter Zijlstra (Intel) Cc: Byungchul Park Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: boqun.feng@gmail.com Cc: david@fromorbit.com Cc: johannes@sipsolutions.net Cc: kernel-team@lge.com Cc: oleg@redhat.com Cc: tj@kernel.org Link: http://lkml.kernel.org/r/20170829085939.ggmb6xiohw67micb@hirez.programming.kicks-ass.net Signed-off-by: Ingo Molnar --- kernel/workqueue.c | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) (limited to 'kernel/workqueue.c') diff --git a/kernel/workqueue.c b/kernel/workqueue.c index c0331891dec1..ab3c0dc8c7ed 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -2094,8 +2094,8 @@ __acquires(&pool->lock) lock_map_acquire(&pwq->wq->lockdep_map); lock_map_acquire(&lockdep_map); /* - * Strictly speaking we should do start(PROC) without holding any - * locks, that is, before these two lock_map_acquire()'s. + * Strictly speaking we should mark the invariant state without holding + * any locks, that is, before these two lock_map_acquire()'s. * * However, that would result in: * @@ -2107,14 +2107,14 @@ __acquires(&pool->lock) * Which would create W1->C->W1 dependencies, even though there is no * actual deadlock possible. There are two solutions, using a * read-recursive acquire on the work(queue) 'locks', but this will then - * hit the lockdep limitation on recursive locks, or simly discard + * hit the lockdep limitation on recursive locks, or simply discard * these locks. * * AFAICT there is no possible deadlock scenario between the * flush_work() and complete() primitives (except for single-threaded * workqueues), so hiding them isn't a problem. */ - crossrelease_hist_start(XHLOCK_PROC, true); + lockdep_invariant_state(true); trace_workqueue_execute_start(work); worker->current_func(work); /* @@ -2122,7 +2122,6 @@ __acquires(&pool->lock) * point will only record its address. */ trace_workqueue_execute_end(work); - crossrelease_hist_end(XHLOCK_PROC); lock_map_release(&lockdep_map); lock_map_release(&pwq->wq->lockdep_map); -- cgit v1.2.3 From 5cd79d6abd2c142352dead0e3df04e86ee32f5d3 Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Wed, 4 Oct 2017 16:27:00 -0700 Subject: timer: Remove users of TIMER_DEFERRED_INITIALIZER This removes uses of TIMER_DEFERRED_INITIALIZER and chooses a location to call timer_setup() from before add_timer() or mod_timer() is called. Adjusts callbacks to use from_timer() as needed. Signed-off-by: Kees Cook Signed-off-by: Thomas Gleixner Cc: linux-mips@linux-mips.org Cc: Petr Mladek Cc: Benjamin Herrenschmidt Cc: Lai Jiangshan Cc: Sebastian Reichel Cc: Kalle Valo Cc: Paul Mackerras Cc: Pavel Machek Cc: linux1394-devel@lists.sourceforge.net Cc: Chris Metcalf Cc: linux-s390@vger.kernel.org Cc: linux-wireless@vger.kernel.org Cc: "James E.J. Bottomley" Cc: Wim Van Sebroeck Cc: Michael Ellerman Cc: Ursula Braun Cc: Geert Uytterhoeven Cc: Viresh Kumar Cc: Harish Patil Cc: Stephen Boyd Cc: Guenter Roeck Cc: Manish Chopra Cc: Len Brown Cc: Arnd Bergmann Cc: linux-pm@vger.kernel.org Cc: Heiko Carstens Cc: Martin Schwidefsky Cc: Julian Wiedmann Cc: John Stultz Cc: Mark Gross Cc: linux-watchdog@vger.kernel.org Cc: linux-scsi@vger.kernel.org Cc: "Martin K. Petersen" Cc: Greg Kroah-Hartman Cc: "Rafael J. 
Wysocki" Cc: Oleg Nesterov Cc: Ralf Baechle Cc: Stefan Richter Cc: Michael Reed Cc: netdev@vger.kernel.org Cc: Tejun Heo Cc: Andrew Morton Cc: linuxppc-dev@lists.ozlabs.org Cc: Sudip Mukherjee Link: https://lkml.kernel.org/r/1507159627-127660-7-git-send-email-keescook@chromium.org --- kernel/workqueue.c | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) (limited to 'kernel/workqueue.c') diff --git a/kernel/workqueue.c b/kernel/workqueue.c index 64d0edf428f8..a5361fc6215d 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -5390,11 +5390,8 @@ static void workqueue_sysfs_unregister(struct workqueue_struct *wq) { } */ #ifdef CONFIG_WQ_WATCHDOG -static void wq_watchdog_timer_fn(unsigned long data); - static unsigned long wq_watchdog_thresh = 30; -static struct timer_list wq_watchdog_timer = - TIMER_DEFERRED_INITIALIZER(wq_watchdog_timer_fn, 0, 0); +static struct timer_list wq_watchdog_timer; static unsigned long wq_watchdog_touched = INITIAL_JIFFIES; static DEFINE_PER_CPU(unsigned long, wq_watchdog_touched_cpu) = INITIAL_JIFFIES; @@ -5408,7 +5405,7 @@ static void wq_watchdog_reset_touched(void) per_cpu(wq_watchdog_touched_cpu, cpu) = jiffies; } -static void wq_watchdog_timer_fn(unsigned long data) +static void wq_watchdog_timer_fn(struct timer_list *unused) { unsigned long thresh = READ_ONCE(wq_watchdog_thresh) * HZ; bool lockup_detected = false; @@ -5510,6 +5507,7 @@ module_param_cb(watchdog_thresh, &wq_watchdog_thresh_ops, &wq_watchdog_thresh, static void wq_watchdog_init(void) { + timer_setup(&wq_watchdog_timer, wq_watchdog_timer_fn, TIMER_DEFERRABLE); wq_watchdog_set_thresh(wq_watchdog_thresh); } -- cgit v1.2.3 From 8c20feb60604d91a29cd7fef8ac758bd92d9fd2c Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Wed, 4 Oct 2017 16:27:07 -0700 Subject: workqueue: Convert callback to use from_timer() In preparation for unconditionally passing the struct timer_list pointer to all timer callbacks, switch workqueue to use from_timer() and pass the timer pointer explicitly. Signed-off-by: Kees Cook Signed-off-by: Thomas Gleixner Cc: linux-mips@linux-mips.org Cc: Petr Mladek Cc: Benjamin Herrenschmidt Cc: Lai Jiangshan Cc: Sebastian Reichel Cc: Kalle Valo Cc: Paul Mackerras Cc: Pavel Machek Cc: linux1394-devel@lists.sourceforge.net Cc: Chris Metcalf Cc: linux-s390@vger.kernel.org Cc: linux-wireless@vger.kernel.org Cc: "James E.J. Bottomley" Cc: Wim Van Sebroeck Cc: Michael Ellerman Cc: Ursula Braun Cc: Geert Uytterhoeven Cc: Viresh Kumar Cc: Harish Patil Cc: Stephen Boyd Cc: Guenter Roeck Cc: Manish Chopra Cc: Len Brown Cc: Arnd Bergmann Cc: linux-pm@vger.kernel.org Cc: Heiko Carstens Cc: Martin Schwidefsky Cc: Julian Wiedmann Cc: John Stultz Cc: Mark Gross Cc: linux-watchdog@vger.kernel.org Cc: linux-scsi@vger.kernel.org Cc: "Martin K. Petersen" Cc: Greg Kroah-Hartman Cc: "Rafael J. 
Wysocki" Cc: Oleg Nesterov Cc: Ralf Baechle Cc: Stefan Richter Cc: Michael Reed Cc: netdev@vger.kernel.org Cc: Tejun Heo Cc: Andrew Morton Cc: linuxppc-dev@lists.ozlabs.org Cc: Sudip Mukherjee Link: https://lkml.kernel.org/r/1507159627-127660-14-git-send-email-keescook@chromium.org --- kernel/workqueue.c | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) (limited to 'kernel/workqueue.c') diff --git a/kernel/workqueue.c b/kernel/workqueue.c index a5361fc6215d..c77fdf6bf24f 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -1492,9 +1492,9 @@ bool queue_work_on(int cpu, struct workqueue_struct *wq, } EXPORT_SYMBOL(queue_work_on); -void delayed_work_timer_fn(unsigned long __data) +void delayed_work_timer_fn(struct timer_list *t) { - struct delayed_work *dwork = (struct delayed_work *)__data; + struct delayed_work *dwork = from_timer(dwork, t, timer); /* should have been called from irqsafe timer with irq already off */ __queue_work(dwork->cpu, dwork->wq, &dwork->work); @@ -1508,8 +1508,7 @@ static void __queue_delayed_work(int cpu, struct workqueue_struct *wq, struct work_struct *work = &dwork->work; WARN_ON_ONCE(!wq); - WARN_ON_ONCE(timer->function != delayed_work_timer_fn || - timer->data != (unsigned long)dwork); + WARN_ON_ONCE(timer->function != (TIMER_FUNC_TYPE)delayed_work_timer_fn); WARN_ON_ONCE(timer_pending(timer)); WARN_ON_ONCE(!list_empty(&work->entry)); -- cgit v1.2.3 From 692b48258dda7c302e777d7d5f4217244478f1f6 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Mon, 9 Oct 2017 08:04:13 -0700 Subject: workqueue: replace pool->manager_arb mutex with a flag Josef reported a HARDIRQ-safe -> HARDIRQ-unsafe lock order detected by lockdep: [ 1270.472259] WARNING: HARDIRQ-safe -> HARDIRQ-unsafe lock order detected [ 1270.472783] 4.14.0-rc1-xfstests-12888-g76833e8 #110 Not tainted [ 1270.473240] ----------------------------------------------------- [ 1270.473710] kworker/u5:2/5157 [HC0[0]:SC0[0]:HE0:SE1] is trying to acquire: [ 1270.474239] (&(&lock->wait_lock)->rlock){+.+.}, at: [] __mutex_unlock_slowpath+0xa2/0x280 [ 1270.474994] [ 1270.474994] and this task is already holding: [ 1270.475440] (&pool->lock/1){-.-.}, at: [] worker_thread+0x366/0x3c0 [ 1270.476046] which would create a new lock dependency: [ 1270.476436] (&pool->lock/1){-.-.} -> (&(&lock->wait_lock)->rlock){+.+.} [ 1270.476949] [ 1270.476949] but this new dependency connects a HARDIRQ-irq-safe lock: [ 1270.477553] (&pool->lock/1){-.-.} ... [ 1270.488900] to a HARDIRQ-irq-unsafe lock: [ 1270.489327] (&(&lock->wait_lock)->rlock){+.+.} ... [ 1270.494735] Possible interrupt unsafe locking scenario: [ 1270.494735] [ 1270.495250] CPU0 CPU1 [ 1270.495600] ---- ---- [ 1270.495947] lock(&(&lock->wait_lock)->rlock); [ 1270.496295] local_irq_disable(); [ 1270.496753] lock(&pool->lock/1); [ 1270.497205] lock(&(&lock->wait_lock)->rlock); [ 1270.497744] [ 1270.497948] lock(&pool->lock/1); , which will cause a irq inversion deadlock if the above lock scenario happens. The root cause of this safe -> unsafe lock order is the mutex_unlock(pool->manager_arb) in manage_workers() with pool->lock held. Unlocking mutex while holding an irq spinlock was never safe and this problem has been around forever but it never got noticed because the only time the mutex is usually trylocked while holding irqlock making actual failures very unlikely and lockdep annotation missed the condition until the recent b9c16a0e1f73 ("locking/mutex: Fix lockdep_assert_held() fail"). Using mutex for pool->manager_arb has always been a bit of stretch. 
It primarily is an mechanism to arbitrate managership between workers which can easily be done with a pool flag. The only reason it became a mutex is that pool destruction path wants to exclude parallel managing operations. This patch replaces the mutex with a new pool flag POOL_MANAGER_ACTIVE and make the destruction path wait for the current manager on a wait queue. v2: Drop unnecessary flag clearing before pool destruction as suggested by Boqun. Signed-off-by: Tejun Heo Reported-by: Josef Bacik Reviewed-by: Lai Jiangshan Cc: Peter Zijlstra Cc: Boqun Feng Cc: stable@vger.kernel.org --- kernel/workqueue.c | 37 +++++++++++++++---------------------- 1 file changed, 15 insertions(+), 22 deletions(-) (limited to 'kernel/workqueue.c') diff --git a/kernel/workqueue.c b/kernel/workqueue.c index 64d0edf428f8..a2dccfe1acec 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -68,6 +68,7 @@ enum { * attach_mutex to avoid changing binding state while * worker_attach_to_pool() is in progress. */ + POOL_MANAGER_ACTIVE = 1 << 0, /* being managed */ POOL_DISASSOCIATED = 1 << 2, /* cpu can't serve workers */ /* worker flags */ @@ -165,7 +166,6 @@ struct worker_pool { /* L: hash of busy workers */ /* see manage_workers() for details on the two manager mutexes */ - struct mutex manager_arb; /* manager arbitration */ struct worker *manager; /* L: purely informational */ struct mutex attach_mutex; /* attach/detach exclusion */ struct list_head workers; /* A: attached workers */ @@ -299,6 +299,7 @@ static struct workqueue_attrs *wq_update_unbound_numa_attrs_buf; static DEFINE_MUTEX(wq_pool_mutex); /* protects pools and workqueues list */ static DEFINE_SPINLOCK(wq_mayday_lock); /* protects wq->maydays list */ +static DECLARE_WAIT_QUEUE_HEAD(wq_manager_wait); /* wait for manager to go away */ static LIST_HEAD(workqueues); /* PR: list of all workqueues */ static bool workqueue_freezing; /* PL: have wqs started freezing? */ @@ -801,7 +802,7 @@ static bool need_to_create_worker(struct worker_pool *pool) /* Do we have too many workers and should some go away? */ static bool too_many_workers(struct worker_pool *pool) { - bool managing = mutex_is_locked(&pool->manager_arb); + bool managing = pool->flags & POOL_MANAGER_ACTIVE; int nr_idle = pool->nr_idle + managing; /* manager is considered idle */ int nr_busy = pool->nr_workers - nr_idle; @@ -1980,24 +1981,17 @@ static bool manage_workers(struct worker *worker) { struct worker_pool *pool = worker->pool; - /* - * Anyone who successfully grabs manager_arb wins the arbitration - * and becomes the manager. mutex_trylock() on pool->manager_arb - * failure while holding pool->lock reliably indicates that someone - * else is managing the pool and the worker which failed trylock - * can proceed to executing work items. This means that anyone - * grabbing manager_arb is responsible for actually performing - * manager duties. If manager_arb is grabbed and released without - * actual management, the pool may stall indefinitely. 
- */ - if (!mutex_trylock(&pool->manager_arb)) + if (pool->flags & POOL_MANAGER_ACTIVE) return false; + + pool->flags |= POOL_MANAGER_ACTIVE; pool->manager = worker; maybe_create_worker(pool); pool->manager = NULL; - mutex_unlock(&pool->manager_arb); + pool->flags &= ~POOL_MANAGER_ACTIVE; + wake_up(&wq_manager_wait); return true; } @@ -3248,7 +3242,6 @@ static int init_worker_pool(struct worker_pool *pool) setup_timer(&pool->mayday_timer, pool_mayday_timeout, (unsigned long)pool); - mutex_init(&pool->manager_arb); mutex_init(&pool->attach_mutex); INIT_LIST_HEAD(&pool->workers); @@ -3318,13 +3311,15 @@ static void put_unbound_pool(struct worker_pool *pool) hash_del(&pool->hash_node); /* - * Become the manager and destroy all workers. Grabbing - * manager_arb prevents @pool's workers from blocking on - * attach_mutex. + * Become the manager and destroy all workers. This prevents + * @pool's workers from blocking on attach_mutex. We're the last + * manager and @pool gets freed with the flag set. */ - mutex_lock(&pool->manager_arb); - spin_lock_irq(&pool->lock); + wait_event_lock_irq(wq_manager_wait, + !(pool->flags & POOL_MANAGER_ACTIVE), pool->lock); + pool->flags |= POOL_MANAGER_ACTIVE; + while ((worker = first_idle_worker(pool))) destroy_worker(worker); WARN_ON(pool->nr_workers || pool->nr_idle); @@ -3338,8 +3333,6 @@ static void put_unbound_pool(struct worker_pool *pool) if (pool->detach_completion) wait_for_completion(pool->detach_completion); - mutex_unlock(&pool->manager_arb); - /* shut down the timers */ del_timer_sync(&pool->idle_timer); del_timer_sync(&pool->mayday_timer); -- cgit v1.2.3 From 32a6c7233c41216f5dd41fc7bf100eedb1063dfc Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Mon, 16 Oct 2017 15:58:25 -0700 Subject: workqueue: Convert timers to use timer_setup() (part 2) In preparation for unconditionally passing the struct timer_list pointer to all timer callbacks, switch to using the new timer_setup() and from_timer() to pass the timer pointer explicitly. (The prior workqueue patch missed a few timers.) 
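For reference, the conversion pattern looks roughly like the sketch below (the struct and names are illustrative, not the actual workqueue ones): the callback now receives the timer itself and recovers its container with from_timer() instead of casting an unsigned long cookie.

#include <linux/timer.h>
#include <linux/jiffies.h>

struct frob_pool {			/* illustrative container */
	struct timer_list idle_timer;
	int idle_hits;
};

/* New-style callback: takes the timer, derives the enclosing object. */
static void frob_idle_timeout(struct timer_list *t)
{
	struct frob_pool *pool = from_timer(pool, t, idle_timer);

	pool->idle_hits++;
}

static void frob_pool_init(struct frob_pool *pool)
{
	/* replaces setup_timer(&pool->idle_timer, fn, (unsigned long)pool) */
	timer_setup(&pool->idle_timer, frob_idle_timeout, TIMER_DEFERRABLE);
	mod_timer(&pool->idle_timer, jiffies + HZ);
}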
Signed-off-by: Kees Cook Acked-by: Tejun Heo Cc: Lai Jiangshan Link: https://lkml.kernel.org/r/20171016225825.GA99101@beast Signed-off-by: Thomas Gleixner --- kernel/workqueue.c | 14 ++++++-------- 1 file changed, 6 insertions(+), 8 deletions(-) (limited to 'kernel/workqueue.c') diff --git a/kernel/workqueue.c b/kernel/workqueue.c index c77fdf6bf24f..6e5eed58f215 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -1831,9 +1831,9 @@ static void destroy_worker(struct worker *worker) wake_up_process(worker->task); } -static void idle_worker_timeout(unsigned long __pool) +static void idle_worker_timeout(struct timer_list *t) { - struct worker_pool *pool = (void *)__pool; + struct worker_pool *pool = from_timer(pool, t, idle_timer); spin_lock_irq(&pool->lock); @@ -1879,9 +1879,9 @@ static void send_mayday(struct work_struct *work) } } -static void pool_mayday_timeout(unsigned long __pool) +static void pool_mayday_timeout(struct timer_list *t) { - struct worker_pool *pool = (void *)__pool; + struct worker_pool *pool = from_timer(pool, t, mayday_timer); struct work_struct *work; spin_lock_irq(&pool->lock); @@ -3241,11 +3241,9 @@ static int init_worker_pool(struct worker_pool *pool) INIT_LIST_HEAD(&pool->idle_list); hash_init(pool->busy_hash); - setup_deferrable_timer(&pool->idle_timer, idle_worker_timeout, - (unsigned long)pool); + timer_setup(&pool->idle_timer, idle_worker_timeout, TIMER_DEFERRABLE); - setup_timer(&pool->mayday_timer, pool_mayday_timeout, - (unsigned long)pool); + timer_setup(&pool->mayday_timer, pool_mayday_timeout, 0); mutex_init(&pool->manager_arb); mutex_init(&pool->attach_mutex); -- cgit v1.2.3 From b5149873a0c299195b5346fe4dc2c5b04ae2f995 Mon Sep 17 00:00:00 2001 From: Tal Shorer Date: Sat, 21 Oct 2017 19:29:24 +0300 Subject: workqueue: respect isolated cpus when queueing an unbound work Initialize wq_unbound_cpumask to exclude cpus that were isolated by the cmdline's isolcpus parameter. Signed-off-by: Tal Shorer Signed-off-by: Tejun Heo --- kernel/workqueue.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) (limited to 'kernel/workqueue.c') diff --git a/kernel/workqueue.c b/kernel/workqueue.c index 64d0edf428f8..bfa433b38a61 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -4980,6 +4980,10 @@ int workqueue_set_unbound_cpumask(cpumask_var_t cpumask) if (!zalloc_cpumask_var(&saved_cpumask, GFP_KERNEL)) return -ENOMEM; + /* + * Not excluding isolated cpus on purpose. + * If the user wishes to include them, we allow that. + */ cpumask_and(cpumask, cpumask, cpu_possible_mask); if (!cpumask_empty(cpumask)) { apply_wqattrs_lock(); @@ -5579,7 +5583,7 @@ int __init workqueue_init_early(void) WARN_ON(__alignof__(struct pool_workqueue) < __alignof__(long long)); BUG_ON(!alloc_cpumask_var(&wq_unbound_cpumask, GFP_KERNEL)); - cpumask_copy(wq_unbound_cpumask, cpu_possible_mask); + cpumask_andnot(wq_unbound_cpumask, cpu_possible_mask, cpu_isolated_map); pwq_cache = KMEM_CACHE(pool_workqueue, SLAB_PANIC); -- cgit v1.2.3 From c95491ed6d6a958743d82bea1d053819988da418 Mon Sep 17 00:00:00 2001 From: Mark Rutland Date: Mon, 23 Oct 2017 14:07:22 -0700 Subject: locking/atomics, workqueue: Convert ACCESS_ONCE() to READ_ONCE()/WRITE_ONCE() For several reasons, it is desirable to use {READ,WRITE}_ONCE() in preference to ACCESS_ONCE(), and new code is expected to use one of the former. So far, there's been no reason to change most existing uses of ACCESS_ONCE(), as these aren't currently harmful. 
However, for some features it is necessary to instrument reads and writes separately, which is not possible with ACCESS_ONCE(). This distinction is critical to correct operation. It's possible to transform the bulk of kernel code using the Coccinelle script below. However, this doesn't handle comments, leaving references to ACCESS_ONCE() instances which have been removed. As a preparatory step, this patch converts the workqueue code and comments to use {READ,WRITE}_ONCE() consistently. ---- virtual patch @ depends on patch @ expression E1, E2; @@ - ACCESS_ONCE(E1) = E2 + WRITE_ONCE(E1, E2) @ depends on patch @ expression E; @@ - ACCESS_ONCE(E) + READ_ONCE(E) ---- Signed-off-by: Mark Rutland Signed-off-by: Paul E. McKenney Acked-by: Tejun Heo Cc: Lai Jiangshan Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: davem@davemloft.net Cc: linux-arch@vger.kernel.org Cc: mpe@ellerman.id.au Cc: shuah@kernel.org Cc: snitzer@redhat.com Cc: thor.thayer@linux.intel.com Cc: viro@zeniv.linux.org.uk Cc: will.deacon@arm.com Link: http://lkml.kernel.org/r/1508792849-3115-12-git-send-email-paulmck@linux.vnet.ibm.com Signed-off-by: Ingo Molnar --- kernel/workqueue.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel/workqueue.c') diff --git a/kernel/workqueue.c b/kernel/workqueue.c index 64d0edf428f8..39831b2f3c5f 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -4647,7 +4647,7 @@ static void rebind_workers(struct worker_pool *pool) * concurrency management. Note that when or whether * @worker clears REBOUND doesn't affect correctness. * - * ACCESS_ONCE() is necessary because @worker->flags may be + * WRITE_ONCE() is necessary because @worker->flags may be * tested without holding any lock in * wq_worker_waking_up(). Without it, NOT_RUNNING test may * fail incorrectly leading to premature concurrency @@ -4656,7 +4656,7 @@ static void rebind_workers(struct worker_pool *pool) WARN_ON_ONCE(!(worker_flags & WORKER_UNBOUND)); worker_flags |= WORKER_REBOUND; worker_flags &= ~WORKER_UNBOUND; - ACCESS_ONCE(worker->flags) = worker_flags; + WRITE_ONCE(worker->flags, worker_flags); } spin_unlock_irq(&pool->lock); -- cgit v1.2.3 From fd1a5b04dfb899f84ddeb8acdaea6b98283df1e5 Mon Sep 17 00:00:00 2001 From: Byungchul Park Date: Wed, 25 Oct 2017 17:56:04 +0900 Subject: workqueue: Remove now redundant lock acquisitions wrt. workqueue flushes The workqueue code added manual lock acquisition annotations to catch deadlocks. After lockdepcrossrelease was introduced, some of those became redundant, since wait_for_completion() already does the acquisition and tracking. Remove the duplicate annotations. 
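The annotation idiom being dropped is roughly the one sketched below (a schematic with an illustrative helper; the real code operates on wq->lockdep_map and work->lockdep_map): an empty acquire/release from the waiting side tells lockdep "this context may block behind whatever runs under @map", a dependency that wait_for_completion() now records on its own once the completion is initialized with init_completion_map().

#include <linux/lockdep.h>
#include <linux/completion.h>

/*
 * Schematic of the manual annotation: acquire and immediately release the
 * lockdep map of the thing we are about to wait on, purely so lockdep links
 * this context to work executing under that map.  With crossrelease-aware
 * completions the wait itself creates the link, so the pair is redundant.
 */
static void demo_flush_wait(struct lockdep_map *map, struct completion *done)
{
	lock_map_acquire(map);
	lock_map_release(map);

	wait_for_completion(done);
}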
Signed-off-by: Byungchul Park Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: amir73il@gmail.com Cc: axboe@kernel.dk Cc: darrick.wong@oracle.com Cc: david@fromorbit.com Cc: hch@infradead.org Cc: idryomov@gmail.com Cc: johan@kernel.org Cc: johannes.berg@intel.com Cc: kernel-team@lge.com Cc: linux-block@vger.kernel.org Cc: linux-fsdevel@vger.kernel.org Cc: linux-mm@kvack.org Cc: linux-xfs@vger.kernel.org Cc: oleg@redhat.com Cc: tj@kernel.org Link: http://lkml.kernel.org/r/1508921765-15396-9-git-send-email-byungchul.park@lge.com Signed-off-by: Ingo Molnar --- kernel/workqueue.c | 19 +++---------------- 1 file changed, 3 insertions(+), 16 deletions(-) (limited to 'kernel/workqueue.c') diff --git a/kernel/workqueue.c b/kernel/workqueue.c index 39831b2f3c5f..160fdc6e839a 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -2497,15 +2497,8 @@ static void insert_wq_barrier(struct pool_workqueue *pwq, INIT_WORK_ONSTACK(&barr->work, wq_barrier_func); __set_bit(WORK_STRUCT_PENDING_BIT, work_data_bits(&barr->work)); - /* - * Explicitly init the crosslock for wq_barrier::done, make its lock - * key a subkey of the corresponding work. As a result we won't - * build a dependency between wq_barrier::done and unrelated work. - */ - lockdep_init_map_crosslock((struct lockdep_map *)&barr->done.map, - "(complete)wq_barr::done", - target->lockdep_map.key, 1); - __init_completion(&barr->done); + init_completion_map(&barr->done, &target->lockdep_map); + barr->task = current; /* @@ -2611,16 +2604,13 @@ void flush_workqueue(struct workqueue_struct *wq) struct wq_flusher this_flusher = { .list = LIST_HEAD_INIT(this_flusher.list), .flush_color = -1, - .done = COMPLETION_INITIALIZER_ONSTACK(this_flusher.done), + .done = COMPLETION_INITIALIZER_ONSTACK_MAP(this_flusher.done, wq->lockdep_map), }; int next_color; if (WARN_ON(!wq_online)) return; - lock_map_acquire(&wq->lockdep_map); - lock_map_release(&wq->lockdep_map); - mutex_lock(&wq->mutex); /* @@ -2883,9 +2873,6 @@ bool flush_work(struct work_struct *work) if (WARN_ON(!wq_online)) return false; - lock_map_acquire(&work->lockdep_map); - lock_map_release(&work->lockdep_map); - if (start_flush_work(work, &barr)) { wait_for_completion(&barr.done); destroy_work_on_stack(&barr.work); -- cgit v1.2.3 From edbfd9112f70c34b2965580a67dad5fb306fb6c9 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Fri, 3 Nov 2017 07:02:15 -0700 Subject: Revert "workqueue: respect isolated cpus when queueing an unbound work" This reverts commit b5149873a0c299195b5346fe4dc2c5b04ae2f995. It conflicts with the following isolcpus change from the sched branch. edb9382175c3 ("sched/isolation: Move isolcpus= handling to the housekeeping code") Let's revert for now. Signed-off-by: Tejun Heo --- kernel/workqueue.c | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) (limited to 'kernel/workqueue.c') diff --git a/kernel/workqueue.c b/kernel/workqueue.c index bfa433b38a61..64d0edf428f8 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -4980,10 +4980,6 @@ int workqueue_set_unbound_cpumask(cpumask_var_t cpumask) if (!zalloc_cpumask_var(&saved_cpumask, GFP_KERNEL)) return -ENOMEM; - /* - * Not excluding isolated cpus on purpose. - * If the user wishes to include them, we allow that. 
- */ cpumask_and(cpumask, cpumask, cpu_possible_mask); if (!cpumask_empty(cpumask)) { apply_wqattrs_lock(); @@ -5583,7 +5579,7 @@ int __init workqueue_init_early(void) WARN_ON(__alignof__(struct pool_workqueue) < __alignof__(long long)); BUG_ON(!alloc_cpumask_var(&wq_unbound_cpumask, GFP_KERNEL)); - cpumask_andnot(wq_unbound_cpumask, cpu_possible_mask, cpu_isolated_map); + cpumask_copy(wq_unbound_cpumask, cpu_possible_mask); pwq_cache = KMEM_CACHE(pool_workqueue, SLAB_PANIC); -- cgit v1.2.3 From 9a19b463863e757e649c37af245b6af101410c1e Mon Sep 17 00:00:00 2001 From: Wang Long Date: Thu, 2 Nov 2017 23:05:12 -0400 Subject: workqueue: Fix comment for unbound workqueue's attrbutes Signed-off-by: Wang Long Signed-off-by: Tejun Heo --- kernel/workqueue.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'kernel/workqueue.c') diff --git a/kernel/workqueue.c b/kernel/workqueue.c index 64d0edf428f8..5f99851bff09 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -5013,9 +5013,10 @@ int workqueue_set_unbound_cpumask(cpumask_var_t cpumask) * * Unbound workqueues have the following extra attributes. * - * id RO int : the associated pool ID + * pool_ids RO int : the associated pool IDs for each node * nice RW int : nice value of the workers * cpumask RW mask : bitmask of allowed CPUs for the workers + * numa RW bool : whether enable NUMA affinity */ struct wq_device { struct workqueue_struct *wq; -- cgit v1.2.3 From 8e8eb730759f9cfd7a761b0b4ee41d714e720993 Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Mon, 6 Nov 2017 16:01:19 +0100 Subject: workqueue: Use lockdep to assert IRQs are disabled/enabled Use lockdep to check that IRQs are enabled or disabled as expected. This way the sanity check only shows overhead when concurrency correctness debug code is enabled. Signed-off-by: Frederic Weisbecker Acked-by: Thomas Gleixner Acked-by: Tejun Heo Cc: David S . Miller Cc: Lai Jiangshan Cc: Linus Torvalds Cc: Paul E. McKenney Cc: Peter Zijlstra Link: http://lkml.kernel.org/r/1509980490-4285-4-git-send-email-frederic@kernel.org Signed-off-by: Ingo Molnar --- kernel/workqueue.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel/workqueue.c') diff --git a/kernel/workqueue.c b/kernel/workqueue.c index 1070b21ba4aa..13f67b5a0a0c 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -1376,7 +1376,7 @@ static void __queue_work(int cpu, struct workqueue_struct *wq, * queued or lose PENDING. Grabbing PENDING and queueing should * happen with IRQ disabled. */ - WARN_ON_ONCE(!irqs_disabled()); + lockdep_assert_irqs_disabled(); debug_work_activate(work); -- cgit v1.2.3 From 841b86f3289dbe858daeceec36423d4ea286fac2 Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Mon, 23 Oct 2017 09:40:42 +0200 Subject: treewide: Remove TIMER_FUNC_TYPE and TIMER_DATA_TYPE casts With all callbacks converted, and the timer callback prototype switched over, the TIMER_FUNC_TYPE cast is no longer needed, so remove it. Conversion was done with the following scripts: perl -pi -e 's|\(TIMER_FUNC_TYPE\)||g' \ $(git grep TIMER_FUNC_TYPE | cut -d: -f1 | sort -u) perl -pi -e 's|\(TIMER_DATA_TYPE\)||g' \ $(git grep TIMER_DATA_TYPE | cut -d: -f1 | sort -u) The now unused macros are also dropped from include/linux/timer.h. 
Signed-off-by: Kees Cook --- kernel/workqueue.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel/workqueue.c') diff --git a/kernel/workqueue.c b/kernel/workqueue.c index dde6298f6b22..8fdb710bfdd7 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -1509,7 +1509,7 @@ static void __queue_delayed_work(int cpu, struct workqueue_struct *wq, struct work_struct *work = &dwork->work; WARN_ON_ONCE(!wq); - WARN_ON_ONCE(timer->function != (TIMER_FUNC_TYPE)delayed_work_timer_fn); + WARN_ON_ONCE(timer->function != delayed_work_timer_fn); WARN_ON_ONCE(timer_pending(timer)); WARN_ON_ONCE(!list_empty(&work->entry)); -- cgit v1.2.3